From 98ba27985026f479c26bc4cbfbc51d645cfc82b3 Mon Sep 17 00:00:00 2001 From: Sunil Mohan Adapa Date: Tue, 25 Aug 2020 11:22:05 -0700 Subject: [PATCH] doc: wikiparser: Fix parsing URLs, simplify plain text parsing - Parse Wiki words like MoinMoin. - Handle . : etc. at the end of the links properly. - Use regular expressions to simplify parsing plain text. - Strip spaces in link targets and text. Signed-off-by: Sunil Mohan Adapa Reviewed-by: James Valleroy --- doc/scripts/wikiparser.py | 97 +++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/doc/scripts/wikiparser.py b/doc/scripts/wikiparser.py index 3edd818ab..e34430586 100644 --- a/doc/scripts/wikiparser.py +++ b/doc/scripts/wikiparser.py @@ -540,15 +540,11 @@ def split_formatted(text, delimiter, end_delimiter=None): return (content, text) -def parse_text(line, context=None): +def parse_text(line, context=None, parse_links=True): """ Parse a line of MoinMoin wiki text. Returns a list of objects representing text. """ - # Handle !WikiWords which suppress automatic links. - words = [word.lstrip('!') for word in line.split()] - line = ' '.join(words) - result = [] while line: # Icons @@ -587,7 +583,7 @@ def parse_text(line, context=None): if remaining: params, _, remaining = remaining.partition('|') - link = Link(target, [ItalicText(text)], params) + link = Link(target.strip(), [ItalicText(text.strip())], params) result.append(link) continue @@ -617,6 +613,7 @@ def parse_text(line, context=None): content, line = split_formatted(line, '[[', ']]') if content: target, _, remaining = content.partition('|') + target = target.strip() text = None if remaining: # Handle embedded attachments inside links @@ -629,7 +626,8 @@ def parse_text(line, context=None): else: text, _, remaining = remaining.partition('|') - text = parse_text(text) + text = text.strip() + text = parse_text(text, parse_links=False) params = None if remaining: @@ -655,7 +653,7 @@ def parse_text(line, context=None): else: text, _, remaining = remaining.partition('|') - text = parse_text(text) + text = parse_text(text, parse_links=False) params = None if remaining: @@ -674,37 +672,7 @@ def parse_text(line, context=None): if content: line = line.replace(content, '', 1) content = content.strip() - while content: - if '<' not in content and '>' not in content \ - and (content.startswith('http://') - or content.startswith('https://')): - contents = content.split(' ', 1) - result.append(Url(contents[0])) - if len(contents) > 1: - content = contents[1] - else: - break - else: - found_http = content.find('http://') - found_https = content.find('https://') - if found_http >= 0: - if found_https >= 0: - length = min(content.find('http://'), - content.find('https://')) - else: - length = found_http - else: - length = found_https - - if length > 0: - result.append(PlainText(content[:length])) - content = content[length:] - else: - result.append(PlainText(content)) - break - - continue - + result += parse_plain_text(content, parse_links=parse_links) continue break @@ -712,6 +680,49 @@ def parse_text(line, context=None): return result +def parse_plain_text(content, parse_links=True): + """Parse a line or plain text and generate plain text and URL objects.""" + result = [] + while content: + content = content.strip() + wiki_link_match = re.search( + r'(?: |^)([A-Z][a-z0-9]+([A-Z][a-z0-9]+)+)(?: |$)', content) + link_match = re.search(r'(https?://[^<> ]+[^<> .:\(\)])', content) + if parse_links and link_match and link_match.span(0)[0] == 0: + link = link_match.group(1) + result.append(Url(link)) + content = content[link_match.span(1)[1]:] + elif parse_links and wiki_link_match and wiki_link_match.span( + 0)[0] == 0: + link = wiki_link_match.group(1) + result.append(Link(link, [PlainText(link)])) + content = content[wiki_link_match.span(1)[1]:] + else: + end = None + if parse_links and link_match: + end = link_match.span(1)[0] + + if parse_links and wiki_link_match: + end = wiki_link_match.span(1)[0] + + text = content[:end] + + # Replace occurrences of !WikiText with WikiText + text = re.sub(r'([^A-Za-z]|^)!', r'\g<1>', text) + + # Gobble multiple spaces + text = re.sub(r' +', r' ', text) + + result.append(PlainText(text)) + + if end: + content = content[end:] + else: + break + + return result + + def parse_table_row(line, context=None): """Parse a line of MoinMoin wiki text. Returns a TableRow.""" row_cells = re.split(r'\|\|', line)[1:-1] @@ -1152,7 +1163,13 @@ the instructions on that site to install and run it.')])] >>> parse_wiki('After installation a web page becomes available on \ https:///_minidlna.') [Paragraph([PlainText('After installation a web page becomes available on \ -'), PlainText('https:///_minidlna.')])] +https:///_minidlna.')])] + + >>> parse_wiki('or http://10.42.0.1/.') + [Paragraph([PlainText('or '), Url('http://10.42.0.1/'), PlainText('.')])] + + >>> parse_wiki('or http://10.42.0.1/:') + [Paragraph([PlainText('or '), Url('http://10.42.0.1/'), PlainText(':')])] >>> parse_wiki('|| [[FreedomBox/Hardware/\ A20-OLinuXino-Lime2|{{attachment:a20-olinuxino-lime2_thumb.jpg|A20 OLinuXino \