# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file contains renderers from the supported input formats.
# As of [[2020-01-09]]:
# - markdown
# - orgmode

import re
import shutil
import smartypants
import subprocess
from . import config, regexes, util
from flask import current_app
from marko import Markdown, inline
from marko.helpers import MarkoExtension

# we add and then remove this from detected Tiddlylinks to opt out from default Marko rendering.
# pretty dirty but it works (tm) and prevents an excursion deeper into Marko.
TIDDLYHACK = " TIDDLYLINK"


# Markdown
class WikilinkElement(inline.InlineElement):
    """Inline element matched by ``regexes.WIKILINK``; keeps capture group 1 as ``target``."""

    # is this regexes pattern a good idea?
    pattern = regexes.WIKILINK.pattern
    parse_children = True

    def __init__(self, match):
        self.target = match.group(1)


class WikilinkRendererMixin(object):
    """Renderer mixin that turns a ``WikilinkElement`` into its output string."""

    # This name is magic; it must match render_<element class name in snake_case>.
    # (The part after "render_" was missing in the stored comment; inferred from
    # WikilinkElement -> render_wikilink_element.)
    def render_wikilink_element(self, element):
        """Render a wikilink, honouring the ``target|label`` form.

        With exactly one pipe the text before it is the target and the text after it
        is the label. With more than one pipe the whole text is used for both.
        Without a pipe the label is the element's rendered children.
        """
        if "|" in element.target:
            try:
                first, second = element.target.split("|")
            except ValueError:
                # probably more than one pipe; not supported for now
                first = element.target
                second = element.target
            target = util.canonical_wikilink(first.rstrip())
            label = second.lstrip()
        else:
            target = util.canonical_wikilink(element.target)
            label = self.render_children(element)
        href = f"/{target}"
        # NOTE(review): `href` is not used by the string below as stored; the anchor
        # markup that consumed it appears to have been stripped from this copy.
        # The literal is kept exactly as found -- restore it from version control.
        return f'[[{label}]]'


class TiddlylinkElement(inline.InlineElement):
    """Inline element matched by ``regexes.TIDDLYLINK``.

    Group 1 is stored as ``anchor``; group 2, with the ``TIDDLYHACK`` marker removed,
    is stored as ``target``.
    """

    # is this regexes pattern a good idea?
    pattern = regexes.TIDDLYLINK.pattern

    def __init__(self, match):
        self.anchor = match.group(1)
        self.target = match.group(2).replace(TIDDLYHACK, "")


class TiddlylinkRendererMixin(object):
    """Renderer mixin for ``TiddlylinkElement``."""

    # This name is magic; it must match render_<element class name in snake_case>.
    def render_tiddlylink_element(self, element):
        # NOTE(review): the template has one placeholder but two arguments are
        # passed, so `element.anchor` is silently dropped. The surrounding markup
        # appears to have been stripped from this copy; literal kept as found.
        return '[[{}]]'.format(
            util.canonical_wikilink(element.target), element.anchor
        )


class HashtagElement(inline.InlineElement):
    """Inline element matched by ``regexes.HASHTAG``; keeps capture group 1 as ``target``."""

    # is this regexes pattern a good idea?
    pattern = regexes.HASHTAG.pattern
    parse_children = True

    def __init__(self, match):
        self.target = match.group(1)


class HashtagRendererMixin(object):
    """Renderer mixin for ``HashtagElement``."""

    # This name is magic; it must match render_<element class name in snake_case>.
    def render_hashtag_element(self, element):
        # NOTE(review): as with the other renderers, the template has a single
        # placeholder while two arguments are passed (the rendered children are
        # dropped); markup appears to have been stripped. Literal kept as found.
        # return '[[{}]]'.format(
        return '#{}'.format(
            # util.canonical_wikilink(self.escape_url(element.target)), self.render_children(element)
            util.canonical_wikilink(element.target),
            self.render_children(element),
        )


# Marko extension bundling the three custom inline elements with their renderers.
Wikilinks = MarkoExtension(
    elements=[WikilinkElement, TiddlylinkElement, HashtagElement],
    renderer_mixins=[
        WikilinkRendererMixin,
        TiddlylinkRendererMixin,
        HashtagRendererMixin,
    ],
)

# Module-level Markdown converter; called as `markdown(src)` below.
markdown = Markdown(extensions=["footnote", "gfm"])
markdown.use(Wikilinks)

# This doesn't work here as we're out of app context -- I should probably move render to be a class? Hmm.
# if current_app.config["ENABLE_ORGORA"]:
#     # Org-mode, now much improved through orgora.
#     from orgorapython import parse_string
#     orgmode = parse_string


# Mycomarkup
# If we can, use mycomarkup parser; if not, fall back to markdown which gets us something half readable.
def mycomarkup(src):
    """Render *src* with the external ``mycomarkup`` binary when it is on PATH.

    *src* is sent to the process on stdin as UTF-8 and its stdout is decoded as
    UTF-8. When the binary is not found, a notice followed by the Markdown
    rendering of *src* is returned instead.

    Raises:
        subprocess.CalledProcessError: if the binary exits with a non-zero status.
    """
    if shutil.which("mycomarkup"):
        ret = subprocess.check_output("mycomarkup", input=bytes(src, "utf-8"))
        ret = ret.decode("utf-8")
    else:
        ret = "Mycomarkup binary not found, the following was rendered in Markdown compatibility mode."
        ret += markdown(src)
    return ret


# Embeds.
# The *application* of this pattern could perhaps be here instead of in... hmm, db.py? Yeah, that doesn't make sense.
# TODO: [[refactor]].

# Twitter embeds.
# Now disabled, we prefer to embed client side.
# NOTE(review): everything from here down to trim_front_matter has lost markup in
# this stored copy (text that sat between '<' and '>' is missing). The code text
# below is kept exactly as found and only laid out one statement per line; the
# damaged spans are marked and must be restored from version control.
def add_twitter_embeds(content, subnode):
    # Substitutes every TWITTER_REGEX match in `content` with TWITTER_EMBED; `subnode` is not used.
    TWITTER_REGEX = "(https://twitter.com/\w+/status/[0-9]+)"
    # NOTE(review): damaged literal -- only blank lines remain between the quotes.
    TWITTER_EMBED = '

'
    return re.sub(TWITTER_REGEX, TWITTER_EMBED, content)


def add_twitter_pull(content, subnode):
    # negative lookbehind tries to only match twitter links not preceded by a ", which would be there if the URL is being used as part of an tag (adding an embed in that case using regexes would break the link).
    # (NOTE(review): "an tag" presumably read "an <a> tag" before the markup was lost.)
    # https://www.regular-expressions.info/lookaround.html if you're wondering how this works.
    if subnode and "subnode/virtual" in subnode.url:
        # trouble at the mill.
        # virtual subnodes are prerendered by virtue of how they are generated (from the final html)
        # they should be "pre cooked".
        return content
    # NOTE(review): damaged -- the regex is cut off after "(?" and the TWITTER_EMBED
    # assignment is missing; only the trailing "pull'" of the embed literal survives.
    TWITTER_REGEX = r'(?pull'
    return re.sub(TWITTER_REGEX, TWITTER_EMBED, content)


def add_mastodon_pull(content, subnode):
    if subnode and "subnode/virtual" in subnode.url:
        # as per the above.
        return content
    if subnode and 'agora' not in subnode.url:
        # HACK:
        # These pulls break inline posts (fully dumped by opted-in users) as of 2024-03-16, so skip pulls for those for now.
        return content
    # hack: negative lookbehind tries to only match for anchors not preceded by a span... just because in the agora we have
    # spans just preceding every anchor that is a wikilink.
    # NOTE(review): damaged -- the guard condition, the MASTODON_REGEX_ALT and
    # MASTODON_EMBED definitions are missing; what remains is the start of the
    # guard's pattern fused with the tail of the embed literal.
    if re.search(r"(?)pull'
    ret = re.sub(MASTODON_REGEX_ALT, MASTODON_EMBED, content)
    # ret = re.sub(MASTODON_REGEX_ALT, MASTODON_EMBED, ret)
    return ret


def add_pleroma_pull(content, subnode):
    # hack: negative lookbehind tries to only match for anchors not preceded by a span... just because in the agora we have
    # spans just preceding every anchor that is a wikilink.
    # NOTE(review): damaged -- the rest of this function and the header of the
    # following function (it uses URL_REGEX / URL_EMBED, and preprocess() refers
    # to an `add_url_pull`) are missing; the text jumps straight into that
    # function's comments.
    if re.search(r"(?) https://regexr.com/3e6m0
    # 'http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*'
    # URL_REGEX='(\[\[go\]\]) (.+:\/\/.+)'
    # @(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS
    # URL_REGEX='(https?:\/\/([^s/?\<>]+wiki.+)'
    # URL_REGEX="^[a-z0-9!#$%&'-*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?"
    # URL_REGEX='http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*'
    #
    # some explanations are in order for the regex below :)
    # NOTE(review): damaged -- the explanation, the URL_REGEX assignment and the
    # URL_EMBED literal are missing; only the residue on the next line survives.
    # (?pull' )
    if re.search(r"(a href|=> http|\[\[http)", content):
        # don't apply filters when content has html links or rocket links to URLs, as we risk adding a button inside an anchor
        # and breaking the Agora in interesting ways, see https://anagora.org/2023-05-27 :)
        return content
    ret = re.sub(URL_REGEX, URL_EMBED, content)
    # hack hack -- "fixes" pulling for markdown style links, e.g. [text](anchor).
    # As of 2023-07-16, this hack is disabled as it was breaking Wikipedia articles that end with ) while not providing obvious value -- I tried reproducing the issue with Markdown links but they are rendering fine, it seems we're only missing pulling functionality for them but that seems preferrable to breaking a fraction of Wikipedia pulls!
    # ret = ret.replace(')"', '"')
    return ret


def add_go_button(content, subnode):
    # Rewrites "[[go]] <url>" occurrences using URL_EMBED; `subnode` is not used.
    # This function does not appear in the filter lists of preprocess()/postprocess() in this file.
    URL_REGEX = "(\[\[go\]\]) (.+:\/\/.+)"
    # NOTE(review): the replacement as stored is a space followed by group 2 only;
    # the markup that wrapped it appears to have been stripped from this copy.
    URL_EMBED = ' \\2'
    ret = re.sub(URL_REGEX, URL_EMBED, content)
    return ret


# Trim front matter until we do something useful with it.
def trim_front_matter(content, subnode):
    # Deletes text delimited by "---" markers from `content`; `subnode` is not used.
    # NOTE(review): the pattern is not anchored to the start of the content and the
    # repetition is greedy, so when a later "---" exists (e.g. a horizontal rule)
    # the match extends to the last one and the text in between is removed too.
    FRONT_MATTER_REGEX = "---(\n.*)*---"
    return re.sub(FRONT_MATTER_REGEX, "", content, flags=re.MULTILINE)


# Hack: trim

# <p> and </p> to try to work around / fix Mastodon wikilinks.
# Marko is ignoring wiki links in paragraphs for some reason?
def trim_p(content, subnode):
    """Remove every literal ``<p>`` and ``</p>`` tag from *content*.

    Hack to work around Mastodon wikilinks; *subnode* is unused (all filters
    share the ``(content, subnode)`` signature).
    """
    P_REGEX = r'<\/?p>'
    return re.sub(P_REGEX, '', content, flags=re.MULTILINE)


def add_hr(content, subnode):
    """Replace a run of two or more dashes at the start of a line with a rule tag."""
    HR_REGEX = r'^--+'
    # NOTE(review): in the stored copy this literal was a pair of quotes around
    # blank lines -- the tag had been stripped, exactly like the <p> tags in the
    # comment above trim_p. "<hr>" is restored from the function's name; confirm
    # the exact tag against version control.
    return re.sub(HR_REGEX, '<hr>', content, flags=re.MULTILINE)


# Trim obsidian block anchors until we do something useful with them.
def trim_block_anchors(content, subnode):
    """Remove ``^`` followed by digits/dashes when it ends a line (e.g. ``^12-34``)."""
    BLOCK_ANCHOR_REGEX = r"\^[0-9-]+$"
    return re.sub(BLOCK_ANCHOR_REGEX, "", content, flags=re.MULTILINE)


# Trim Logseq :LOGBOOK: .. :END: blocks until we do something useful with them
def trim_logbook(content, subnode):
    """Remove each ``:LOGBOOK:`` ... ``:END:`` span, including spans covering several lines."""
    LOGBOOK_REGEX = r":LOGBOOK:.*?:END:"
    # Flags are bit masks: combine them with `|` rather than `+`.
    return re.sub(LOGBOOK_REGEX, "", content, flags=re.MULTILINE | re.DOTALL)


# Trim liquid templates (Jekyll stuff) until we do something useful with them.
def trim_liquid(content, subnode):
    """Replace each single-line ``{% ... %}`` tag with a fixed notice."""
    LIQUID_REGEX = r"{%.*?%}"
    return re.sub(
        LIQUID_REGEX,
        "(Unsupported content elided by the Agora.)",
        content,
        flags=re.MULTILINE,
    )


# Trim margin notes (Jekyll stuff).
def trim_margin_notes(content, subnode):
    """Remove ``[[...::xxx]]`` spans, where ``xxx`` is any three characters."""
    MARGIN_NOTES_REGEX = r"\[\[[^\]]*?::...\]\]"
    return re.sub(MARGIN_NOTES_REGEX, "", content, flags=re.MULTILINE)


# Make it so that Tiddlylinks (links of the form [foo](#bar), with octothorpe) aren't handled by
# [foo](bar) standard Markdown link parsing in Marko.
def force_tiddlylink_parsing(content, subnode):
    """Append the ``TIDDLYHACK`` marker to the target of every Tiddlylink in *content*."""
    return re.sub(
        regexes.TIDDLYLINK.pattern,
        rf"[\1](#\2{TIDDLYHACK})",
        content,
        flags=re.MULTILINE,
    )


# def content_to_obsidian_embeds(content):
#     match = regexes.WIKILINKS.findall(content)
#     if match:
#         # Work around broken forward links due to org mode convention I didn't think of.
#         # TODO: make link parsing format-aware.
#         return [util.canonical_wikilink(m) for m in match if '][' not in m]
#     else:
#         return []


# Obsidian pasted images / attachments.
def add_obsidian_embeds(content, subnode):
    """Rewrite ``!`` + wikilink occurrences into the embed text below."""
    OBSIDIAN_REGEX = re.compile("!" + regexes.WIKILINK.pattern)
    # NOTE(review): the wrapping markup was stripped from this literal in the
    # stored copy; only the line breaks and the arrow/wikilink text survive.
    # The visible content is kept as found -- restore from version control.
    OBSIDIAN_EMBED = '\n\n⥅ [[\\1]]\n\n'
    # also include something like this to move to a lazily loaded div?
    # (the example that followed this comment is missing in the stored copy)
    return re.sub(OBSIDIAN_REGEX, OBSIDIAN_EMBED, content)


def add_silverbullet_embeds(content, subnode):
    """Rewrite Silverbullet-style embeds into the embed text below."""
    # NOTE(review): the two `$` look like corrupted `\(` / `\)` (Markdown image
    # syntax `![](...)`); as stored, the pattern cannot match such input.
    # Kept as found -- confirm against version control before changing.
    SILVERBULLET_REGEX = re.compile(r"!\[\]$ *(.+?) *$")
    # NOTE(review): wrapping markup stripped, see add_obsidian_embeds.
    SILVERBULLET_EMBED = '\n\n⥅ [[\\1]]\n\n'
    # also include something like this to move to a lazily loaded div?
    # (the example that followed this comment is missing in the stored copy)
    return re.sub(SILVERBULLET_REGEX, SILVERBULLET_EMBED, content)


def add_logseq_embeds(content, subnode):
    """Point ``./assets/...`` references at ``/raw/garden/<user>/./assets/...``.

    ``<user>`` is ``subnode.user``. When *subnode* has no ``user`` attribute
    (for instance the ``""`` default used by ``preprocess``) the content is
    returned unchanged instead of raising ``AttributeError``.
    """
    if not hasattr(subnode, "user"):
        return content
    LOGSEQ_REGEX = re.compile(r"(\./assets/.*)")
    LOGSEQ_FIX = f"/raw/garden/{subnode.user}/\\1"
    # also include something like this to move to a lazily loaded div?
    # (the example that followed this comment is missing in the stored copy)
    # NOTE(review): the substitution is treated as live code, mirroring the
    # layout of add_obsidian_embeds; preprocess() notes that this regex is
    # known to break links.
    return re.sub(LOGSEQ_REGEX, LOGSEQ_FIX, content)


def filter_smartypants(content, subnode):
    """Run *content* through ``smartypants.smartypants``; *subnode* is unused."""
    return smartypants.smartypants(content)


def preprocess(content, subnode=""):
    """Apply every preprocessing filter to *content*, in order, and return the result.

    Each filter is called as ``f(content, subnode)`` and its return value is fed
    to the next one.
    """
    # add_logseq_embeds breaks links everywhere, there's an issue with the regex :)
    # filters = [trim_front_matter, trim_block_anchors, trim_logbook, force_tiddlylink_parsing, trim_liquid, trim_margin_notes, add_logseq_embeds, add_obsidian_embeds, add_url_pull, add_twitter_pull]
    filters = [
        trim_p,
        trim_front_matter,
        trim_block_anchors,
        trim_logbook,
        force_tiddlylink_parsing,
        trim_liquid,
        trim_margin_notes,
        add_obsidian_embeds,
        add_logseq_embeds,
        add_silverbullet_embeds,
        add_url_pull,
        add_twitter_pull,
        add_mastodon_pull,
        add_pleroma_pull,
        add_hr,
    ]
    for f in filters:
        content = f(content, subnode)
    return content


def postprocess(content, subnode=""):
    """Apply every postprocessing filter to *content*, in order, and return the result."""
    # filters = [add_twitter_embeds]
    # these all ended up moving to preprocess() -- might mean there's not a need for postprocessing overall?
    filters = [filter_smartypants]
    for f in filters:
        content = f(content, subnode)
    return content