#!/usr/bin/python3
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
MoinMoin wiki parser
"""
import logging
import re
import sys
import urllib
from enum import Enum
from pathlib import Path
from xml.sax.saxutils import escape
# Base URL that page titles and wiki paths are appended to for online links
BASE_URL = 'https://wiki.debian.org/'
# URL path template for locally available manual pages; {lang} is replaced
# with a language code
LOCAL_BASE = '/plinth/help/manual/{lang}/'
# Directory name prefixed to icon image file names
ICONS_DIR = 'icons'
# Language used when a link does not start with one of LANGUAGES
DEFAULT_LANGUAGE = 'en'
# List of language codes for provided translations
LANGUAGES = ('en', 'es')
# Wiki icon markup mapped to the icon image base name (without extension)
WIKI_ICONS = {
    '/!\\': 'alert',
    '(./)': 'checkmark',
    '{X}': 'icon-error',
    '{i}': 'icon-info',
    '{o}': 'star_off',
    '{*}': 'star_on',
}
class Element:
    """Base class for every node of a parsed MoinMoin wiki page."""

    def __repr__(self, *args):
        """Return ``ClassName(arg, ...)`` using the repr of each argument.

        Subclasses call this through ``super()`` with the attributes they
        want shown.
        """
        arguments = ', '.join(repr(arg) for arg in args)
        return f'{self.__class__.__name__}({arguments})'

    def to_docbook(self, context=None):
        """Return a self-closing XML tag named after the class."""
        return f'<{self.__class__.__name__}/>'
class Heading(Element):
    """A heading with a nesting level and heading text."""

    def __init__(self, level, content):
        # Levels above 5 are clamped to 5.
        self.level = min(level, 5)
        self.content = content

    def __repr__(self):
        return super().__repr__(self.level, self.content)

    def to_docbook(self, context=None):
        """Render the heading from its XML-escaped content."""
        return f'
{escape(self.content)}'
class TableOfContents(Element):
    """Table-of-contents marker with an optional maximum level."""

    def __init__(self, max_level=None):
        self.max_level = max_level

    def __repr__(self):
        # Only show max_level when it is set to a truthy value.
        shown = (self.max_level,) if self.max_level else ()
        return super().__repr__(*shown)

    def to_docbook(self, context=None):
        """Render the marker; the result is an empty string."""
        return ''
class Text(Element):
    """Base class for text elements; the text is kept in ``content``."""

    def __init__(self, content):
        self.content = content

    def __repr__(self):
        return super().__repr__(self.content)

    def to_docbook(self, context=None):
        """Return the content with XML special characters escaped."""
        return escape(self.content)
class PlainText(Text):
    """Text without any formatting; behaves exactly like Text."""
    pass
class Url(Text):
    """A bare URL found in plain text."""

    def to_docbook(self, context=None):
        """Render the URL as DocBook XML."""
        return f''
class ItalicText(Text):
    """Italic text; ``content`` is an iterable of child elements."""

    def to_docbook(self, context=None):
        """Render by concatenating the DocBook output of every child."""
        # Children are rendered without the context.
        xml = ''.join([item.to_docbook() for item in self.content])
        return f'{xml}'
class BoldText(Text):
    """Bold text; ``content`` is an iterable of child elements."""

    def to_docbook(self, context=None):
        """Render by concatenating the DocBook output of every child."""
        # Children are rendered without the context.
        xml = ''.join([item.to_docbook() for item in self.content])
        return f'{xml}'
class MonospaceText(Text):
    """Monospace text; ``content`` is a string."""

    def to_docbook(self, context=None):
        """Render using the XML-escaped content."""
        return f'{escape(self.content)}'
class CodeText(Text):
    """Code text; ``content`` is a string."""

    def to_docbook(self, context=None):
        """Render the code; output depends on context['in_paragraph']."""
        # The flag is set by Paragraph.to_docbook() while it renders its
        # children.
        if context and 'in_paragraph' in context and context['in_paragraph']:
            return f'{escape(self.content)}'
        else:
            return f''
class UnderlineText(Text):
    """Underlined text; ``content`` is a string."""

    def to_docbook(self, context=None):
        """Render using the XML-escaped content."""
        return f'{escape(self.content)}'
class SmallerTextWarning(Element):
    """Marker added by parse_text() where '~-...-~' markup was found."""

    def to_docbook(self, context=None):
        """Render the marker as DocBook XML."""
        return ''
class Paragraph(Element):
    """A paragraph holding child elements and an optional indent."""

    def __init__(self, content, indent=0):
        self.content = content
        self.indent = indent

    def __repr__(self):
        # The indent is only shown when it is non-zero.
        shown = [self.content]
        if self.indent:
            shown.append(self.indent)
        return super().__repr__(*shown)

    def add_content(self, content):
        """Extend the paragraph content in place with more elements."""
        self.content += content

    def to_docbook(self, context=None):
        """Render all children and concatenate their DocBook output.

        While the children are rendered, context['in_paragraph'] is True (when
        a context is given); it is set to False afterwards.
        """
        has_context = context is not None
        if has_context:
            context['in_paragraph'] = True

        xml = ''.join(item.to_docbook(context) for item in self.content)

        if has_context:
            context['in_paragraph'] = False

        return f'{xml}'
class Link(Element):
    """A link with a target, optional text elements and optional params."""

    def __init__(self, target, text=None, params=None):
        self.target = target
        self.text = text
        self.params = params

    def __repr__(self):
        # params are only shown together with text.
        if self.text and self.params:
            rep = super().__repr__(self.target, self.text, self.params)
        elif self.text:
            rep = super().__repr__(self.target, self.text)
        else:
            rep = super().__repr__(self.target)
        return rep

    def to_docbook(self, context=None):
        """Render the link as DocBook XML.

        The target is expanded with resolve_url() and XML-escaped. Targets
        starting with '#' are rendered through a separate branch.
        """
        target = escape(resolve_url(self.target, context))
        link_text = ''
        if self.text:
            for element in self.text:
                link_text += element.to_docbook(context)
        if target.startswith('#'):
            xml = f'{link_text}'
        else:
            xml = f'{link_text}'
        return xml
class EmbeddedLink(Link):
    """A link written with '{{...}}' markup; behaves exactly like Link."""
    pass
class EmbeddedAttachment(EmbeddedLink):
    """An embedded attachment of a wiki page."""

    def __init__(self, target, text=None, params=None, context=None):
        # Remember the title of the page the attachment belongs to, if known.
        self.page_title = context.get('title', None) if context else None
        # Fall back to the target itself as the text.
        if not text:
            text = [PlainText(target)]
        super().__init__(target, text, params)

    def to_docbook(self, context=None):
        """Render the attachment as DocBook XML.

        When the page title is known, the target is built as an attachment
        download URL under BASE_URL; otherwise the escaped target is used.
        """
        if self.page_title:
            target = BASE_URL + self.page_title \
                + '?action=AttachFile&do=get&target=' \
                + escape(self.target)
        else:
            target = escape(self.target)
        xml = ''
        xml += ''
        xml += ''
        if self.text:
            xml += ''
            for element in self.text:
                xml += element.to_docbook(context)
            xml += ''
        xml += ''
        return xml
class ListItem(Element):
    """One item of a List, holding a list of child elements."""

    def __init__(self, content=None, override_marker=False):
        # A fresh list is created when no content is given.
        self.content = content or []
        self.override_marker = override_marker

    def __repr__(self):
        return super().__repr__(self.content)

    def add_content(self, content):
        """Append one element to the item."""
        self.content.append(content)

    def to_docbook(self, context=None):
        """Render the item; children are joined with single spaces."""
        if self.override_marker:
            xml = ''
        else:
            xml = ''
        item_xml = [item.to_docbook(context) for item in self.content]
        xml += ' '.join(item_xml) + ''
        return xml
class ListType(Enum):
    """Kinds of list that List can represent."""
    PLAIN = 1
    BULLETED = 2
    NUMBERED = 3
    SPACED = 4
class List(Element):
    """A list of items with a ListType."""

    def __init__(self, list_type=ListType.PLAIN, items=None):
        # list_type may be a ListType or one of the strings 'plain',
        # 'bulleted', 'numbered'; any other string selects SPACED.
        if isinstance(list_type, str):
            if list_type == 'plain':
                self.list_type = ListType.PLAIN
            elif list_type == 'bulleted':
                self.list_type = ListType.BULLETED
            elif list_type == 'numbered':
                self.list_type = ListType.NUMBERED
            else:
                self.list_type = ListType.SPACED
        else:
            self.list_type = list_type
        # A fresh list is created when no items are given.
        self.items = items or []

    def __repr__(self):
        # The list type is shown using the string form accepted by __init__.
        if self.list_type == ListType.PLAIN:
            list_type = 'plain'
        elif self.list_type == ListType.BULLETED:
            list_type = 'bulleted'
        elif self.list_type == ListType.NUMBERED:
            list_type = 'numbered'
        else:
            list_type = 'spaced'
        return super().__repr__(list_type, self.items)

    def add_item(self, item):
        """Append one item to the list."""
        self.items.append(item)

    def to_docbook(self, context=None):
        """Render the list: an opening part chosen by list type, every item,
        then a closing part chosen by list type."""
        if self.list_type == ListType.PLAIN:
            xml = ''
        elif self.list_type == ListType.BULLETED:
            xml = ''
        elif self.list_type == ListType.NUMBERED:
            xml = ''
        else:
            xml = ''
        for item in self.items:
            xml += item.to_docbook(context)
        if self.list_type == ListType.PLAIN:
            xml += ''
        elif self.list_type == ListType.BULLETED:
            xml += ''
        elif self.list_type == ListType.NUMBERED:
            xml += ''
        else:
            xml += ''
        return xml
class HorizontalRule(Element):
    """A horizontal rule; ``dashes`` is stored as given by the caller."""

    def __init__(self, dashes):
        self.dashes = dashes

    def __repr__(self):
        return super().__repr__(self.dashes)

    def to_docbook(self, context=None):
        """Render the rule as DocBook XML."""
        return ''
class TableItem(Element):
    """A table cell with optional child elements and alignment."""

    def __init__(self, content=None, align=None):
        self.content = content
        self.align = align

    def __repr__(self):
        # The alignment is only shown together with content.
        if self.content and self.align:
            rep = super().__repr__(self.content, self.align)
        elif self.content:
            rep = super().__repr__(self.content)
        else:
            rep = super().__repr__()
        return rep

    def to_docbook(self, context=None):
        """Render the cell; a cell without content has its own branch."""
        # Attribute text for the alignment, empty when no alignment is set.
        if self.align:
            align = f'align="{self.align}" '
        else:
            align = ''
        if self.content:
            xml = f''
            for item in self.content:
                xml += item.to_docbook(context)
            xml += ''
        else:
            xml = f''
        return xml
class TableRow(Element):
    """A table row holding a sequence of cells in ``items``."""

    def __init__(self, items):
        self.items = items

    def __len__(self):
        # The length of a row is its number of cells.
        return len(self.items)

    def __repr__(self):
        return super().__repr__(self.items)

    def to_docbook(self, context=None):
        """Render the row by concatenating the output of every cell."""
        xml = ''
        for item in self.items:
            xml += item.to_docbook(context)
        xml += ''
        return xml
class Table(Element):
    """A table made of rows, with an optional style."""

    def __init__(self, rows, style=None):
        self.rows = rows
        self.style = style

    def __repr__(self):
        # The style is only shown when it is set.
        if self.style:
            rep = super().__repr__(self.rows, self.style)
        else:
            rep = super().__repr__(self.rows)
        return rep

    def to_docbook(self, context=None):
        """Render the table: one entry per column, then every row."""
        # The column count is taken from the first row; 0 for an empty table.
        cols = len(self.rows[0]) if self.rows else 0
        xml = f''
        for number in range(cols):
            xml += f''
        xml += ''
        for row in self.rows:
            xml += row.to_docbook(context)
        xml += ''
        return xml
class Include(Element):
    """Inclusion of another wiki page, optionally limited by markers."""

    def __init__(self, page, from_marker=None, to_marker=None):
        self.page = page
        self.from_marker = from_marker
        self.to_marker = to_marker

    def __repr__(self):
        if self.from_marker and self.to_marker:
            shown = (self.page, self.from_marker, self.to_marker)
        elif self.to_marker:
            shown = (self.page, self.to_marker)
        else:
            shown = (self.page, )
        return super().__repr__(*shown)

    def to_docbook(self, context=None):
        """Parse the included page file and return its inner DocBook XML.

        The file is named after the last path component of the page with a
        '.raw.wiki' suffix and is looked up next to context['path'], or in the
        current directory when no path is known. A missing file is logged and
        rendered as an empty string.
        """
        has_path = bool(context) and 'path' in context
        include_folder = context['path'].parent if has_path else Path('.')
        file_name = self.page.split('/')[-1] + '.raw.wiki'
        include_file = include_folder / Path(file_name)

        if not include_file.exists():
            logging.warning('Included page not found:' + str(include_file))
            return ''

        with include_file.open() as wiki_file:
            wiki_text = wiki_file.read()

        # The included page is parsed and rendered with its own context.
        include_context = get_context(include_file, self.page)
        parsed_wiki = parse_wiki(wiki_text, include_context, self.from_marker,
                                 self.to_marker)
        return generate_inner_docbook(parsed_wiki, include_context)
class Admonition(Element):
    """An admonition block; ``style`` doubles as the XML tag name."""

    def __init__(self, style, content):
        self.style = style
        self.content = content

    def __repr__(self):
        return super().__repr__(self.style, self.content)

    def to_docbook(self, context=None):
        """Render the children wrapped in a ``<style>...</style>`` element.

        Admonitions with the style 'comment' are rendered as an empty string.
        Children are joined with single spaces.
        """
        if self.style == 'comment':
            return ''

        item_xml = [item.to_docbook(context) for item in self.content]
        # The closing tag needs '</'; without it the output is not
        # well-formed XML.
        return f'<{self.style}>' + ' '.join(item_xml) + f'</{self.style}>'
class Comment(Text):
    """A comment; ``content`` is an iterable of child elements."""

    def to_docbook(self, context=None):
        """Render the children joined with single spaces."""
        item_xml = [item.to_docbook(context) for item in self.content]
        xml = ' '.join(item_xml)
        return f'{xml}'
class BeginInclude(Element):
    """Marker element for the beginning of an included section."""

    def to_docbook(self, context=None):
        """Render the marker as DocBook XML."""
        return ''
class EndInclude(Element):
    """Marker element for the end of an included section."""

    def to_docbook(self, context=None):
        """Render the marker as DocBook XML."""
        return ''
class Category(Element):
    """A category reference identified by ``name``."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return super().__repr__(self.name)

    def to_docbook(self, context=None):
        """Render the category as DocBook XML."""
        return ''
class Anchor(Element):
    """An anchor identified by ``name``."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return super().__repr__(self.name)

    def to_docbook(self, context=None):
        """Render the anchor as DocBook XML."""
        return f''
def get_url_text(url):
    """Return the default link text for a URL, or None if there is none.

    URLs with a ``scheme://`` prefix and in-page ``#anchor`` links get no
    default text. For ``keyword:rest`` URLs the text is the part after the
    first colon. Any other URL is used as its own text.
    """
    if url.startswith('#') or re.match(r'[A-Za-z]+://', url):
        return None

    has_keyword = re.match(r'[A-Za-z]+:', url)
    return url.partition(':')[2] if has_keyword else url
def convert_image_units(value):
    """Convert wiki image units to docbook image units.

    The value is converted to an integer and halved. Even values produce a
    whole number and odd values a decimal number, followed by 'pt'.
    """
    number = int(value)
    if number % 2:
        half = number / 2.0
    else:
        half = int(number / 2)
    return f'{half}pt'
def map_local_files(path):
    """Map a file reference to its location under the local images folder.

    Anything up to and including the first 'target=' is dropped. Paths
    starting with 'icons/' are kept as they are; other paths are reduced to
    their last component.
    """
    _, marker, after_marker = path.partition('target=')
    if marker:
        path = after_marker

    if not path.startswith('icons/') and '/' in path:
        path = path.rsplit('/', maxsplit=1)[1]

    return 'images/' + path
def resolve_url(url, context):
    """Expand a URL into a full path.

    ``url`` is a link target as written in the wiki. ``context`` is a dict
    (or None); its 'title' entry is the title of the current page and is used
    for attachment and relative links.

    XXX: Links inside the included pages are resolved properly. However,
    without the original path of a page, links in page can't always be resolved
    correctly. Preserve the original path information.

    Return these urls unmodified:
    -----------------------------
    >>> resolve_url('http://tst.me', {'language': '', 'title': ''})
    'http://tst.me'
    >>> resolve_url('https://tst.me', {'language': '', 'title': ''})
    'https://tst.me'
    >>> resolve_url('mailto:tst.me', {'language': '', 'title': ''})
    'mailto:tst.me'
    >>> resolve_url('irc://etc', {'language': '', 'title': ''})
    'irc://etc'
    >>> resolve_url('#tst', {'language': '', 'title': ''})
    '#tst'

    Detect and resolve Keyword-protocolled urls:
    --------------------------------------------
    >>> resolve_url('attachment:tst', {'language': '', 'title': ''})
    'tst'
    >>> resolve_url('attachment:tst', {'language': '', 'title': 'here'})
    'https://wiki.debian.org/here?action=AttachFile&do=get&target=tst'
    >>> resolve_url('DebianBug:tst', {'language': '', 'title': ''})
    'https://bugs.debian.org/tst#'
    >>> resolve_url('DebianPkg:tst', {'language': '', 'title': ''})
    'https://packages.debian.org/tst#'
    >>> resolve_url('AliothList:tst', {'language': '', 'title': ''})
    'https://lists.alioth.debian.org/mailman/listinfo/tst#'

    Relative links:
    ---------------
    >>> resolve_url('../../back', {'language': '', 'title': 'here/skip_me/A'})
    'https://wiki.debian.org/here/back#'
    >>> resolve_url('/sub', {'language': '', 'title': 'A'})
    'https://wiki.debian.org/A/sub#'

    FreedomBox urls:
    ----------------
    Locally unavailable => send to online help (wiki):
    >>> resolve_url('FreedomBox/unavailable', {'language': '', 'title': ''})
    'https://wiki.debian.org/FreedomBox/unavailable#'

    Locally available page in default language => shortcut to local copy:
    >>> resolve_url('FreedomBox/Contribute', {'language': '', 'title': ''})
    '/plinth/help/manual/en/Contribute#'

    Translated available page => shortcut to local copy:
    >>> resolve_url('es/FreedomBox/Contribute', {'language': '', 'title': ''})
    '/plinth/help/manual/es/Contribute#'

    Available page in default language referred as translated => shortcut to
    local copy:
    >>> resolve_url('en/FreedomBox/Contribute', {'language': '', 'title': ''})
    '/plinth/help/manual/en/Contribute#'

    Unrecognized language => handle considering it as default language:
    >>> resolve_url('missing/FreedomBox/Contribute', {'language': '', \
                                                      'title': ''})
    '/plinth/help/manual/en/Contribute#'

    """
    # Import the submodule explicitly: a plain `import urllib` does not load
    # `urllib.parse`, so relying on it only works when some other module has
    # imported it already.
    from urllib.parse import urlencode

    # Process first all easy, straight forward cases:
    if re.match(r'https?://', url) or \
       url.startswith(('mailto:', 'irc://', '#')):
        return url

    if url.startswith('attachment:'):
        target = url[len('attachment:'):]
        page_title = context.get('title') if context else None
        if page_title:
            target = f'{BASE_URL}{page_title}?action=AttachFile&do=get&' + \
                urlencode({'target': target})

        return target

    # Keyword prefixes that map directly onto an external site.
    keyword_bases = {
        'DebianBug:': 'https://bugs.debian.org/',
        'DebianPkg:': 'https://packages.debian.org/',
        'AliothList:': 'https://lists.alioth.debian.org/mailman/listinfo/',
    }
    for keyword, base in keyword_bases.items():
        if url.startswith(keyword):
            target = url[len(keyword):]
            return f'{base}{target}#'

    # Intermediate step(s) for relative links:
    if url.startswith('../'):
        page_title = context.get('title', '') if context else ''
        # Each leading '../' removes the last component of the page title.
        while url.startswith('../'):
            url = url[3:]
            page_title = page_title.rpartition('/')[0]

        url = f'{page_title}/{url}'
    elif url.startswith('/'):
        # A leading '/' addresses a sub-page of the current page.
        page_title = context.get('title', '') if context else ''
        url = url.lstrip('/')
        url = f'{page_title}/{url}'

    # Shortcut url to local copy if available:
    if re.match(r'(?:[a-zA-Z_-]+/)?FreedomBox/', url):
        # Digest URL
        link_parts = url.split('/')
        link_page = link_parts[-1]

        # Identify language of link target
        link_language = link_parts[0]
        if link_language not in LANGUAGES:
            link_language = DEFAULT_LANGUAGE

        # Check for local file and use local path
        file_ = Path(__file__).parent.parent
        file_ = file_ / f'manual/{link_language}' / (link_page + '.raw.wiki')
        if file_.exists():
            help_base = LOCAL_BASE.format(lang=link_language)
            url = f'{help_base}{link_page}'
        else:
            url = f'{BASE_URL}{url}'
    else:
        url = f'{BASE_URL}{url}'

    # Match the behavior of DocBook exporter that appends # at the end of a URL
    # that does not have it.
    if '#' not in url:
        url = url + '#'

    return url
def split_formatted(text, delimiter, end_delimiter=None):
    """
    Split formatted text marked by delimiter, if it is found at beginning.

    A distinct end delimiter can be specified, or it is same as delimiter.

    Return (formatted_text, remaining_text) if it is found.
    Return (None, text) otherwise.
    """
    closing = end_delimiter or delimiter
    if not text.startswith(delimiter):
        return (None, text)

    body = text[len(delimiter):]
    # str.find() gives -1 when the closing delimiter is missing. The slices
    # below then drop the final character of the body from the formatted text
    # and leave no remaining text.
    end = body.find(closing)
    return (body[:end], body[end:][len(closing):])
def _split_link_label(remaining):
    """Split ``label|rest`` into (label, rest) for link and embed markup.

    When the text contains both '{{' and '}}', everything up to and including
    the first '}}' belongs to the label even if it contains '|'.
    """
    if '{{' in remaining and '}}' in remaining:
        # Keep both closing braces with the label so that the nested
        # parse_text() call sees a properly terminated embed.
        index = remaining.find('}}') + len('}}')
        label, remaining = remaining[:index], remaining[index:]
        more_text, _, remaining = remaining.partition('|')
        return label + more_text, remaining

    label, _, remaining = remaining.partition('|')
    return label, remaining


def parse_text(line, context=None, parse_links=True):
    """
    Parse a line of MoinMoin wiki text.

    line is the wiki text to parse. context is passed on to nested bold and
    italic text and to embedded attachments. parse_links is passed on to
    parse_plain_text() for the plain text parts.

    Returns a list of objects representing text.
    """
    result = []
    while line:
        # Icons
        for icon_text, icon_name in WIKI_ICONS.items():
            if line.lstrip().startswith(icon_text):
                # Use the name of the matched icon; the rest of the line may
                # hold more text, so the line itself is not a valid key.
                target = f'{ICONS_DIR}/{icon_name}.png'
                result.append(
                    EmbeddedAttachment(target, [PlainText(icon_text)],
                                       'height=26'))
                line = line.lstrip().replace(icon_text, '', 1)
                break

        # Smaller text
        content, line = split_formatted(line, '~-', '-~')
        if content:
            result.append(SmallerTextWarning())
            line = content + line
            # continue processing line

        # Bold text
        content, line = split_formatted(line, "'''")
        if content:
            result.append(BoldText(parse_text(content, context)))
            continue

        # Italic text
        content, line = split_formatted(line, "''")
        if content:
            result.append(ItalicText(parse_text(content, context)))
            continue

        # Monospace text
        content, line = split_formatted(line, '`')
        if content:
            result.append(MonospaceText(content))
            continue

        # Code text
        content, line = split_formatted(line, '{{{', '}}}')
        if content:
            result.append(CodeText(content))
            continue

        # Underline text
        content, line = split_formatted(line, '__')
        if content:
            result.append(UnderlineText(content))
            continue

        # Links
        content, line = split_formatted(line, '[[', ']]')
        if content:
            target, _, remaining = content.partition('|')
            target = target.strip()
            # Default text derived from the target, replaced by an explicit
            # label when one follows the target.
            text = get_url_text(target)
            if remaining:
                # Handle embedded attachments inside links
                text, remaining = _split_link_label(remaining)

            if text:
                text = text.strip()
                text = parse_text(text, parse_links=False)

            params = None
            if remaining:
                params, _, remaining = remaining.partition('|')

            link = Link(target, text, params)
            result.append(link)
            continue

        # Embedded
        content, line = split_formatted(line, '{{', '}}')
        if content:
            target, _, remaining = content.partition('|')
            text = None
            if remaining:
                text, remaining = _split_link_label(remaining)
                text = parse_text(text.strip(), parse_links=False)

            params = None
            if remaining:
                params, _, remaining = remaining.partition('|')

            if target.startswith('attachment:'):
                link = EmbeddedAttachment(target[11:], text, params, context)
            else:
                link = EmbeddedLink(target, text, params)

            result.append(link)
            continue

        # Plain text and URLs
        content = re.split(r"''|`|{{|__|\[\[", line)[0]
        if content:
            line = line.replace(content, '', 1)
            result += parse_plain_text(content, parse_links=parse_links)
            continue

        # Nothing could be consumed from the line; stop instead of looping.
        break

    return result
def parse_plain_text(content, parse_links=True):
    """Parse a line or plain text and generate plain text and URL objects.

    content is text without wiki formatting markup. When parse_links is true,
    http(s) URLs become Url objects and CamelCase wiki words that are
    delimited by spaces or the ends of the text become Link objects.
    Everything else becomes PlainText objects.

    Returns a list of the generated objects in the order they were found.
    """
    wiki_link_pattern = r'(?: |^)([A-Z][a-z0-9]+([A-Z][a-z0-9]+)+)(?: |$)'
    url_pattern = r'(https?://[^<> ]+[^<> .:\(\)])'

    result = []
    while content:
        wiki_link_match = re.search(wiki_link_pattern, content)
        link_match = re.search(url_pattern, content)
        if parse_links and link_match and link_match.span(0)[0] == 0:
            link = link_match.group(1)
            result.append(Url(link))
            content = content[link_match.span(1)[1]:]
        elif parse_links and wiki_link_match and wiki_link_match.span(
                0)[0] == 0:
            link = wiki_link_match.group(1)
            result.append(Link(link, [PlainText(link)]))
            content = content[wiki_link_match.span(1)[1]:]
        else:
            # Plain text runs up to whichever link starts first. Taking the
            # earliest start keeps a URL that precedes a wiki word from being
            # swallowed into the plain text.
            end = None
            if parse_links:
                starts = [
                    match.span(1)[0]
                    for match in (link_match, wiki_link_match) if match
                ]
                if starts:
                    end = min(starts)

            text = content[:end]
            # Replace occurrences of !WikiText with WikiText
            text = re.sub(r'([^A-Za-z]|^)!', r'\g<1>', text)
            result.append(PlainText(text))
            if end:
                content = content[end:]
            else:
                break

    return result
def parse_table_row(line, context=None):
"""Parse a line of MoinMoin wiki text. Returns a TableRow."""
row_cells = re.split(r'\|\|', line)[1:-1]
row_items = []
for cell in row_cells:
content = cell
if content.strip():
# remove that was already processed
content = re.sub(']+>', '', content)
align = None
match = re.match('