# SPDX-License-Identifier: AGPL-3.0-or-later
"""
MoinMoin wiki parser
"""
import logging
import re
import urllib
from enum import Enum
from pathlib import Path
from xml.sax.saxutils import escape
# Directory prefix used when building paths to icon images.
ICONS_DIR = 'icons'

# Additional language codes, besides 'en'
LANGUAGES = ['es']

# Wiki icon markup mapped to the base name of the image that replaces it.
WIKI_ICONS = {
    '/!\\': 'alert',
    '(./)': 'checkmark',
    '{X}': 'icon-error',
    '{i}': 'icon-info',
    '{o}': 'star_off',
    '{*}': 'star_on',
}

# Prefix prepended to wiki page names when building absolute URLs.
BASE_URL = 'https://wiki.debian.org/'
class Element:
    """Represents an element of a MoinMoin wiki page."""

    def __repr__(self, *args):
        """Return ``ClassName(arg1, arg2, ...)`` built from the repr of *args*.

        Subclasses call this through ``super()`` with the attributes they
        want to show; called with no arguments it yields ``ClassName()``.
        """
        rendered_args = ', '.join(repr(arg) for arg in args)
        return f'{self.__class__.__name__}({rendered_args})'

    def to_docbook(self, context=None):
        """Return an empty XML element named after the concrete class."""
        return '<' + self.__class__.__name__ + '/>'
class Heading(Element):
    """A heading with a nesting level and a plain-text title."""

    def __init__(self, level, content):
        """Store the title; levels above 5 are clamped to 5."""
        self.level = min(level, 5)
        self.content = content

    def __repr__(self):
        return super().__repr__(self.level, self.content)

    def to_docbook(self, context=None):
        """Return the XML-escaped title preceded by a line break."""
        # The literal used to be split across two physical lines, which is
        # not valid inside a single-quoted string; the line break is now an
        # explicit escape.
        # NOTE(review): surrounding markup for the title may be missing from
        # this literal - confirm against the intended DocBook output.
        return f'\n{escape(self.content)}'
class TableOfContents(Element):
    """Marker for a table-of-contents macro; contributes no output."""

    def __init__(self, max_level=None):
        self.max_level = max_level

    def __repr__(self):
        # Only show max_level when it is set to a truthy value.
        shown = (self.max_level,) if self.max_level else ()
        return super().__repr__(*shown)

    def to_docbook(self, context=None):
        """Return an empty string."""
        return ''
class Text(Element):
    """Base class for elements that wrap a single ``content`` value."""

    def __init__(self, content):
        self.content = content

    def __repr__(self):
        return super().__repr__(self.content)

    def to_docbook(self, context=None):
        """Return the content with XML special characters escaped."""
        escaped = escape(self.content)
        return escaped
class PlainText(Text):
    """Unformatted text; inherits all behaviour from Text."""
class Url(Text):
    """A bare URL found in running text."""

    def to_docbook(self, context=None):
        """Return the DocBook output for the URL."""
        # NOTE(review): the literal is empty, so the URL itself is never
        # emitted - confirm whether markup is missing here.
        return f''
class ItalicText(Text):
    """Text marked up as italic in the wiki source."""

    def to_docbook(self, context=None):
        """Return the XML-escaped content."""
        return f'{escape(self.content)}'
class BoldText(Text):
    """Text marked up as bold in the wiki source."""

    def to_docbook(self, context=None):
        """Return the XML-escaped content."""
        return f'{escape(self.content)}'
class MonospaceText(Text):
    """Text marked up as monospace in the wiki source."""

    def to_docbook(self, context=None):
        """Return the XML-escaped content."""
        return f'{escape(self.content)}'
class CodeText(Text):
    """Text taken from a code span or block in the wiki source."""

    def to_docbook(self, context=None):
        """Return the escaped content when rendered inside a paragraph.

        The ``in_paragraph`` flag is read from *context*; when the flag is
        absent or falsy (or there is no context) the other literal is used.
        """
        if (not context or 'in_paragraph' not in context
                or not context['in_paragraph']):
            # NOTE(review): this literal is empty, so code outside a
            # paragraph produces no output - confirm this is intended.
            return f''
        return f'{escape(self.content)}'
class UnderlineText(Text):
    """Text marked up as underlined in the wiki source."""

    def to_docbook(self, context=None):
        """Return the XML-escaped content."""
        return f'{escape(self.content)}'
class SmallerTextWarning(Element):
    """Marker recording that smaller-text markup was encountered."""

    def to_docbook(self, context=None):
        """Return an empty string; the marker produces no output."""
        return ''
class Paragraph(Element):
    """A paragraph holding a list of inline elements and an indent level."""

    def __init__(self, content, indent=0):
        self.content = content
        self.indent = indent

    def __repr__(self):
        # The indent is only shown when it is non-zero.
        shown = [self.content]
        if self.indent:
            shown.append(self.indent)
        return super().__repr__(*shown)

    def add_content(self, content):
        """Extend the paragraph's content in place with *content*."""
        self.content += content

    def to_docbook(self, context=None):
        """Render every child and return the results concatenated.

        While the children are rendered, ``context['in_paragraph']`` is set
        to True (when a context is supplied) and reset to False afterwards.
        """
        has_context = context is not None
        if has_context:
            context['in_paragraph'] = True

        rendered = [child.to_docbook(context) for child in self.content]

        if has_context:
            context['in_paragraph'] = False

        # Start from the first rendered child (or '' when there is none) and
        # append the rest in order.
        xml = rendered[0] if rendered else ''
        for piece in rendered[1:]:
            xml += piece

        return f'{xml}'
class Link(Element):
    """A hyperlink with a target, optional text elements and parameters."""

    def __init__(self, target, text=None, params=None):
        self.target = target
        self.text = text
        self.params = params

    def __repr__(self):
        # Text is shown only when set; params only when text is also set.
        shown = [self.target]
        if self.text:
            shown.append(self.text)
            if self.params:
                shown.append(self.params)
        return super().__repr__(*shown)

    def to_docbook(self, context=None):
        """Return the rendered link text.

        The target is resolved and escaped first; targets beginning with
        '#' take a separate branch from all other targets.
        """
        target = escape(resolve_url(self.target, context))

        link_text = ''
        if self.text:
            link_text = ''.join(
                element.to_docbook(context) for element in self.text)

        # NOTE(review): both branches produce the same output here and the
        # resolved target is not interpolated - confirm the literals.
        if target.startswith('#'):
            return f'{link_text}'
        return f'{link_text}'
class EmbeddedLink(Link):
    """A link written with embed markup; inherits all behaviour from Link."""
class EmbeddedAttachment(EmbeddedLink):
    """An embedded attachment (for example an image) of a wiki page."""

    def __init__(self, target, text=None, params=None, context=None):
        """Remember the page title from *context* and default the text.

        When no text is given, the target itself is used as plain text.
        """
        self.page_title = context.get('title', None) if context else None
        text = text or [PlainText(target)]
        return super().__init__(target, text, params)

    def to_docbook(self, context=None):
        """Return the rendered text elements of the attachment."""
        # With a page title the target becomes an attachment download URL on
        # BASE_URL; otherwise it is just the escaped target.
        # NOTE(review): the computed target is not interpolated into any of
        # the literals below - confirm whether markup is missing.
        if self.page_title:
            target = BASE_URL + self.page_title \
                + '?action=AttachFile&do=get&target=' \
                + escape(self.target)
        else:
            target = escape(self.target)

        pieces = ['', f'', '']
        if self.text:
            pieces.append('')
            pieces.extend(element.to_docbook(context) for element in self.text)
            pieces.append('')
        pieces.append('')
        return ''.join(pieces)
class ListItem(Element):
    """One entry of a List, holding a list of child elements."""

    def __init__(self, content=None, override_marker=False):
        # A fresh list is created when no content (or empty content) is given.
        self.content = content or []
        self.override_marker = override_marker

    def __repr__(self):
        return super().__repr__(self.content)

    def add_content(self, content):
        """Append a single child element to the item."""
        self.content.append(content)

    def to_docbook(self, context=None):
        """Return the rendered children joined by single spaces."""
        # override_marker selects the opening literal; both are empty here.
        opening = '' if self.override_marker else ''
        body = ' '.join(child.to_docbook(context) for child in self.content)
        return opening + body + ''
class ListType(Enum):
    """The kinds of list that List can represent."""

    PLAIN = 1
    BULLETED = 2
    NUMBERED = 3
    SPACED = 4
class List(Element):
    """A list of ListItem objects with an associated ListType."""

    def __init__(self, list_type=ListType.PLAIN, items=None):
        """Create a list.

        *list_type* may be a ListType member or one of the strings 'plain',
        'bulleted' or 'numbered'; any other string selects ListType.SPACED.
        """
        if isinstance(list_type, str):
            by_name = {
                'plain': ListType.PLAIN,
                'bulleted': ListType.BULLETED,
                'numbered': ListType.NUMBERED,
            }
            list_type = by_name.get(list_type, ListType.SPACED)

        self.list_type = list_type
        self.items = items or []

    def __repr__(self):
        # Anything that is not plain, bulleted or numbered shows as 'spaced'.
        label = 'spaced'
        for kind, name in ((ListType.PLAIN, 'plain'),
                           (ListType.BULLETED, 'bulleted'),
                           (ListType.NUMBERED, 'numbered')):
            if self.list_type == kind:
                label = name
                break
        return super().__repr__(label, self.items)

    def add_item(self, item):
        """Append *item* to the list."""
        self.items.append(item)

    def to_docbook(self, context=None):
        """Return the rendered items wrapped in the literals for the type."""
        # (type, opening, closing); the fallback pair is for spaced lists.
        # NOTE(review): every literal is empty here - confirm the markup.
        wrappers = (
            (ListType.PLAIN, '', ''),
            (ListType.BULLETED, '', ''),
            (ListType.NUMBERED, '', ''),
        )
        opening, closing = '', ''
        for kind, start, end in wrappers:
            if self.list_type == kind:
                opening, closing = start, end
                break

        body = ''.join(item.to_docbook(context) for item in self.items)
        return opening + body + closing
class HorizontalRule(Element):
    """A horizontal rule; ``dashes`` is stored as given by the caller."""

    def __init__(self, dashes):
        self.dashes = dashes

    def __repr__(self):
        return super().__repr__(self.dashes)

    def to_docbook(self, context=None):
        """Return an empty string; the rule produces no output."""
        return ''
class TableItem(Element):
    """A table cell with optional content elements and alignment."""

    def __init__(self, content=None, align=None):
        self.content = content
        self.align = align

    def __repr__(self):
        # Alignment is only shown together with content; an empty cell shows
        # no arguments at all.
        shown = []
        if self.content:
            shown.append(self.content)
            if self.align:
                shown.append(self.align)
        return super().__repr__(*shown)

    def to_docbook(self, context=None):
        """Return the rendered cell content, or '' for an empty cell."""
        # NOTE(review): the alignment attribute is built but not interpolated
        # into any literal below - confirm whether markup is missing.
        align = f'align="{self.align}" ' if self.align else ''

        if not self.content:
            return ''

        xml = f''
        xml += ''.join(item.to_docbook(context) for item in self.content)
        xml += ''
        return xml
class TableRow(Element):
    """A table row holding a list of cells."""

    def __init__(self, items):
        self.items = items

    def __len__(self):
        """Return the number of cells in the row."""
        return len(self.items)

    def __repr__(self):
        return super().__repr__(self.items)

    def to_docbook(self, context=None):
        """Return the rendered cells followed by a line break."""
        xml = ''.join(item.to_docbook(context) for item in self.items)
        # The terminating literal used to be split across two physical
        # lines, which is not valid inside a single-quoted string; the line
        # break is now an explicit escape.
        # NOTE(review): row markup may be missing from this literal, and the
        # append is assumed to happen once after all cells - confirm.
        xml += '\n'
        return xml
class Table(Element):
    """A table made of rows, with an optional style value."""

    def __init__(self, rows, style=None):
        self.rows = rows
        self.style = style

    def __repr__(self):
        # The style is only shown when it is set.
        shown = (self.rows, self.style) if self.style else (self.rows,)
        return super().__repr__(*shown)

    def to_docbook(self, context=None):
        """Return the rendered rows concatenated in order."""
        # The column count is taken from the first row (0 without rows).
        cols = len(self.rows[0]) if self.rows else 0

        # NOTE(review): the header, per-column and footer literals are all
        # empty here - confirm whether markup is missing.
        pieces = [f'']
        pieces.extend(f'' for number in range(cols))
        pieces.append('')
        pieces.extend(row.to_docbook(context) for row in self.rows)
        pieces.append('')
        return ''.join(pieces)
class Include(Element):
    """An include macro referring to another page, optionally bounded by
    a from-marker and a to-marker."""

    def __init__(self, page, from_marker=None, to_marker=None):
        self.page = page
        self.from_marker = from_marker
        self.to_marker = to_marker

    def __repr__(self):
        # NOTE(review): a from_marker without a to_marker is not shown, and
        # a lone to_marker appears in second position - confirm intended.
        if not self.to_marker:
            shown = (self.page,)
        elif self.from_marker:
            shown = (self.page, self.from_marker, self.to_marker)
        else:
            shown = (self.page, self.to_marker)
        return super().__repr__(*shown)

    def to_docbook(self, context=None):
        """Parse the included page's file and return its inner DocBook.

        The file is looked up next to ``context['path']`` (or in the current
        directory without one) under the last component of the page name
        plus '.raw.wiki'. A missing file logs a warning and yields ''.
        """
        if context and 'path' in context:
            folder = context['path'].parent
        else:
            folder = Path('.')

        page_name = self.page.split('/')[-1]
        include_file = folder / Path(page_name + '.raw.wiki')

        if not include_file.exists():
            logging.warning('Included page not found:' + str(include_file))
            return ''

        with include_file.open() as wiki_file:
            wiki_text = wiki_file.read()

        # The included page is rendered with its own context.
        context = get_context(include_file)
        parsed_wiki = parse_wiki(wiki_text, context, self.from_marker,
                                 self.to_marker)
        return generate_inner_docbook(parsed_wiki, context)
class Admonition(Element):
    """A block of content wrapped in an XML element named by ``style``."""

    def __init__(self, style, content):
        self.style = style
        self.content = content

    def __repr__(self):
        return super().__repr__(self.style, self.content)

    def to_docbook(self, context=None):
        """Return ``<style>...</style>`` around the rendered children.

        Children are rendered in order and joined with single spaces.
        """
        body = ' '.join(item.to_docbook(context) for item in self.content)
        # The closing tag needs its '</' so it matches the opening tag;
        # without it the output was '<style>...style>'.
        return '<' + self.style + '>' + body + '</' + self.style + '>'
class Comment(Text):
    """A comment whose ``content`` is iterated as a sequence of elements."""

    def to_docbook(self, context=None):
        """Return the rendered children joined by single spaces."""
        rendered = ' '.join(item.to_docbook(context) for item in self.content)
        return f'{rendered}'
class BeginInclude(Element):
    """Marker element for the start of an included region."""

    def to_docbook(self, context=None):
        """Return an empty string; the marker produces no output."""
        return ''
class EndInclude(Element):
    """Marker element for the end of an included region."""

    def to_docbook(self, context=None):
        """Return an empty string; the marker produces no output."""
        return ''
class Category(Element):
    """A category reference identified by name."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return super().__repr__(self.name)

    def to_docbook(self, context=None):
        """Return an empty string; categories produce no output."""
        return ''
class Anchor(Element):
    """A named anchor within a page."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return super().__repr__(self.name)

    def to_docbook(self, context=None):
        """Return the DocBook output for the anchor."""
        # NOTE(review): the literal is empty, so the anchor name is never
        # emitted - confirm whether markup is missing here.
        return f''
def get_url_text(url):
    """Return the default link text for *url*, or None if there is none.

    URLs with a ``scheme://`` prefix and in-page anchors ('#...') get no
    default text. For ``prefix:rest`` forms the text is everything after
    the first colon. Anything else is returned unchanged.
    """
    is_anchor = url.startswith('#')
    if is_anchor or re.match(r'[A-Za-z]+://', url):
        return None

    if not re.match(r'[A-Za-z]+:', url):
        return url

    _prefix, _colon, remainder = url.partition(':')
    return remainder
def resolve_url(url, context):
    """Expand a URL into a full path.

    *context* may be None or a mapping; its 'title' entry (the current page
    title) is used for attachments and for relative page links.

    - http(s), mailto: and irc:// URLs and '#anchor' links are returned
      unchanged.
    - 'attachment:NAME' becomes a download URL on the current page, or just
      NAME when no page title is known.
    - 'DebianBug:', 'DebianPkg:' and 'AliothList:' prefixes map to their
      respective sites.
    - Everything else is treated as a wiki page name ('../' and '/' forms
      are relative to the current page) under BASE_URL.

    URLs built for the interwiki prefixes and for wiki pages get a trailing
    '#' when they contain no fragment.
    """
    # Imported here because a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule is loaded.
    from urllib.parse import urlencode

    if re.match(r'https?://', url) or url.startswith(('mailto:', 'irc://')):
        return url

    if url.startswith('#'):
        return url

    # Prefixes are removed by slicing: str.lstrip() takes a *set of
    # characters*, so it would also eat leading characters of the remainder
    # (e.g. 'DebianPkg:gnome' -> 'ome').
    attachment_prefix = 'attachment:'
    if url.startswith(attachment_prefix):
        target = url[len(attachment_prefix):]
        page_title = context.get('title') if context else None
        if page_title:
            target = f'{BASE_URL}{page_title}?action=AttachFile&do=get&' + \
                urlencode({'target': target})

        return target

    interwiki = (
        ('DebianBug:', 'https://bugs.debian.org/'),
        ('DebianPkg:', 'https://packages.debian.org/'),
        ('AliothList:', 'https://lists.alioth.debian.org/mailman/listinfo/'),
    )
    for prefix, base in interwiki:
        if url.startswith(prefix):
            return f'{base}{url[len(prefix):]}#'

    if url.startswith('../'):
        page_title = context.get('title', '') if context else ''
        # Each leading '../' drops one trailing component of the page title.
        while url.startswith('../'):
            url = url[3:]
            page_title = page_title.rpartition('/')[0]

        url = f'{BASE_URL}{page_title}/{url}'
    elif url.startswith('/'):
        page_title = context.get('title', '') if context else ''
        # Single-character strip: removes every leading slash.
        url = url.lstrip('/')
        url = f'{BASE_URL}{page_title}/{url}'
    else:
        url = f'{BASE_URL}{url}'

    if '#' not in url:
        url = url + '#'

    return url
def split_formatted(text, delimiter, end_delimiter=None):
    """
    Split off formatted text when *text* begins with *delimiter*.

    The closing marker is *end_delimiter*, or *delimiter* when none is
    given.

    Return (formatted_text, remaining_text) if the delimiter is found at
    the beginning. Return (None, text) otherwise.
    """
    closing = end_delimiter or delimiter

    if not text.startswith(delimiter):
        return (None, text)

    # NOTE(review): lstrip() removes any leading run of the delimiter's
    # characters, not just one occurrence of the delimiter. Other code in
    # this module may depend on that, so it is kept as is.
    body = text.lstrip(delimiter)

    # NOTE(review): when the closing marker is absent find() returns -1, so
    # the formatted part is everything except the last character.
    cut = body.find(closing)
    formatted = body[:cut]
    remainder = body[cut:].lstrip(closing)
    return (formatted, remainder)
def _split_link_label(remaining):
    """Split the label off the text that follows a link target's first '|'.

    Return ``(label, rest)``, where *rest* is whatever follows the label's
    terminating '|' (an empty string when there is none).
    """
    if '{{' in remaining and '}}' in remaining:
        # The label contains an embed, which may itself contain '|', so the
        # label runs through the embed's end and then up to the next '|'.
        # NOTE(review): only the first '}' of the closing pair is kept in the
        # label; left untouched because the later parse of the label appears
        # to cope with it - confirm before changing.
        index = remaining.find('}}')
        label = remaining[:index + 1]
        more_text, _, rest = remaining[index + 2:].partition('|')
        return label + more_text, rest

    label, _, rest = remaining.partition('|')
    return label, rest


def parse_text(line, context=None, parse_links=True):
    """
    Parse a line of MoinMoin wiki text.

    *context* is passed on to embedded attachments. *parse_links* controls
    whether bare URLs and WikiWords in plain text become links.

    Returns a list of objects representing text.
    """
    result = []
    while line:
        # Icons
        for icon_text, icon_name in WIKI_ICONS.items():
            if line.lstrip().startswith(icon_text):
                # Use the matched icon's name: indexing WIKI_ICONS with the
                # whole stripped line raised KeyError whenever any text
                # followed the icon.
                target = f'{ICONS_DIR}/{icon_name}.png'
                result.append(EmbeddedAttachment(target, None, 'height=20'))
                line = line.lstrip().replace(icon_text, '', 1)
                break

        # Smaller text
        content, line = split_formatted(line, '~-', '-~')
        if content:
            result.append(SmallerTextWarning())
            line = content + line
            # continue processing line

        # Bold text
        content, line = split_formatted(line, "'''")
        if content:
            result.append(BoldText(content))
            continue

        # Italic text
        content, line = split_formatted(line, "''")
        if content:
            if content.startswith('[[') and content.endswith(']]'):
                # Special handling for links within emphasis.
                # Slice the brackets off: lstrip/rstrip take a character set
                # and would also eat brackets belonging to the link itself.
                content = content[2:-2]
                target, _, remaining = content.partition('|')
                text = None
                if remaining:
                    text, _, remaining = remaining.partition('|')

                params = None
                if remaining:
                    params, _, remaining = remaining.partition('|')

                # get_url_text() returns None for full URLs and anchors;
                # fall back to the target so the emphasis is kept instead of
                # failing on None.strip().
                text = text or get_url_text(target) or target
                link = Link(target.strip(), [ItalicText(text.strip())], params)
                result.append(link)
            else:
                result.append(ItalicText(content))

            continue

        # Monospace text
        content, line = split_formatted(line, '`')
        if content:
            result.append(MonospaceText(content))
            continue

        # Code text
        content, line = split_formatted(line, '{{{', '}}}')
        if content:
            result.append(CodeText(content))
            continue

        # Underline text
        content, line = split_formatted(line, '__')
        if content:
            result.append(UnderlineText(content))
            continue

        # Links
        content, line = split_formatted(line, '[[', ']]')
        if content:
            target, _, remaining = content.partition('|')
            target = target.strip()
            text = get_url_text(target)
            if remaining:
                # Handle embedded attachments inside links
                text, remaining = _split_link_label(remaining)

            if text:
                text = parse_text(text.strip(), parse_links=False)

            params = None
            if remaining:
                params, _, remaining = remaining.partition('|')

            result.append(Link(target, text, params))
            continue

        # Embedded
        content, line = split_formatted(line, '{{', '}}')
        if content:
            target, _, remaining = content.partition('|')
            text = None
            if remaining:
                text, remaining = _split_link_label(remaining)
                text = parse_text(text.strip(), parse_links=False)

            params = None
            if remaining:
                params, _, remaining = remaining.partition('|')

            if target.startswith('attachment:'):
                # 11 == len('attachment:')
                link = EmbeddedAttachment(target[11:], text, params, context)
            else:
                link = EmbeddedLink(target, text, params)

            result.append(link)
            continue

        # Plain text and URLs: everything up to the next markup opener.
        content = re.split(r"''|`|{{|__|\[\[", line)[0]
        if content:
            line = line.replace(content, '', 1)
            result += parse_plain_text(content, parse_links=parse_links)
            continue

        # Nothing could be consumed; stop rather than loop forever.
        break

    return result
def parse_plain_text(content, parse_links=True):
    """Parse a line of plain text into plain text, URL and link objects.

    With *parse_links* false the whole input becomes a single PlainText.
    Otherwise bare http(s) URLs become Url objects and space-delimited
    CamelCase WikiWords become Link objects. In the plain-text pieces a '!'
    at the start or after a non-letter is removed (``!WikiText`` ->
    ``WikiText``).
    """
    result = []
    while content:
        wiki_link_match = re.search(
            r'(?: |^)([A-Z][a-z0-9]+([A-Z][a-z0-9]+)+)(?: |$)', content)
        link_match = re.search(r'(https?://[^<> ]+[^<> .:\(\)])', content)
        if parse_links and link_match and link_match.span(0)[0] == 0:
            link = link_match.group(1)
            result.append(Url(link))
            content = content[link_match.span(1)[1]:]
        elif parse_links and wiki_link_match and wiki_link_match.span(
                0)[0] == 0:
            link = wiki_link_match.group(1)
            result.append(Link(link, [PlainText(link)]))
            content = content[wiki_link_match.span(1)[1]:]
        else:
            # Plain text runs up to whichever link starts first. Always
            # preferring the WikiWord position (as before) swallowed a URL
            # that preceded a WikiWord into the plain text.
            starts = []
            if parse_links:
                starts = [
                    match.span(1)[0]
                    for match in (link_match, wiki_link_match)
                    if match
                ]

            end = min(starts) if starts else None

            text = content[:end]
            # Replace occurrences of !WikiText with WikiText
            text = re.sub(r'([^A-Za-z]|^)!', r'\g<1>', text)
            result.append(PlainText(text))
            if end:
                content = content[end:]
            else:
                break

    return result
def parse_table_row(line, context=None):
"""Parse a line of MoinMoin wiki text. Returns a TableRow."""
row_cells = re.split(r'\|\|', line)[1:-1]
row_items = []
for cell in row_cells:
content = cell
if content.strip():
# remove that was already processed
content = re.sub(']+>', '', content)
align = None
match = re.match('