import io import re import uuid import zipfile import math import re from django.utils.html import strip_tags import bleach import markdown import pygments from bleach.css_sanitizer import CSSSanitizer from django.conf import settings from django.utils.text import slugify from pygments.formatters import HtmlFormatter from pygments.lexers import ClassNotFound, get_lexer_by_name, get_lexer_for_filename from l2m4m import LaTeX2MathMLExtension from main import denylist, models from markdown_it import MarkdownIt from mdit_py_plugins.footnote import footnote_plugin from mdit_py_plugins.tasklists import tasklists_plugin from graphviz import Source md = ( MarkdownIt("commonmark", {"html": True}) .enable("strikethrough") .enable("table") .use(footnote_plugin) .use(tasklists_plugin) ) # Define allowed CSS properties and SVG attributes ALLOWED_CSS_PROPERTIES = frozenset([ "azimuth", "background-color", "border-bottom-color", "border-collapse", "border-color", "border-left-color", "border-right-color", "border-top-color", "clear", "color", "cursor", "direction", "display", "elevation", "float", "font", "font-family", "font-size", "font-style", "font-variant", "font-weight", "height", "letter-spacing", "line-height", "overflow", "pause", "pause-after", "pause-before", "pitch", "pitch-range", "richness", "speak", "speak-header", "speak-numeral", "speak-punctuation", "speech-rate", "stress", "text-align", "text-decoration", "text-indent", "text-transform", "visibility", "white-space", "widows", "width", "word-spacing", "z-index" ]) SVG_TAGS = ["svg","g","path","line","polygon","polyline","circle","ellipse","rect","text","defs","title","desc"] SVG_ATTRS = ["width","height","viewBox","fill","stroke","stroke-width","d","x","y","cx","cy","r","points","transform","style","id","class"] # Allow MathML tags MATHML_TAGS = [ "math","mrow","mi","mo","mn","msup","msub","mfrac","msqrt","mstyle", "mtable","mtr","mtd","mfenced","ms","mspace","menclose","mover","munder" ] def is_disallowed(username): """Return true if username is not allowed to be registered.""" if username[0] == "_": # do not allow leading underscores return True if username == "docs" and settings.ALLOW_DOCS_USER: return False # check if subdomain is disallowed if username in denylist.DISALLOWED_USERNAMES and not settings.ALLOW_DOCS_USER: return username in denylist.DISALLOWED_USERNAMES def get_approx_number(number): """Get approximate number, eg. 1823 -> 2k""" if number > 999: approx = round(number / 1000) return f"{approx}k" return number def create_post_slug(post_title, owner, post=None): """ Generate slug given post title. Optional post arg for post that already exists. """ slug = slugify(post_title) # in case of post_title such as این متن است if not slug: generated_uuid = str(uuid.uuid4())[:8] slug = f"{generated_uuid[:3]}-{generated_uuid[3:5]}-{generated_uuid[5:]}" # if post is not None, then this is an update op if post is not None: post_with_same_slugs = models.Post.objects.filter(owner=owner, slug=slug) if post_with_same_slugs: if post_with_same_slugs.first().id == post.id: # if post being updating is the same one, then just return the same slug return slug else: # if post being updating is another one, then add a suffix to differentiate slug += "-" + str(uuid.uuid4())[:8] return slug # if post arg is None, then this is a new post # if slug already exists for another post, add a suffix to make it unique if models.Post.objects.filter(owner=owner, slug=slug).exists(): slug += "-" + str(uuid.uuid4())[:8] return slug def syntax_highlight(text): """Highlights markdown codeblocks within a markdown text.""" processed_text = "" within_code_block = False lexer = None code_block = "" for line in text.split("\n"): # code block backticks found, either begin or end if line[:3] == "```": if not within_code_block: # then this is the beginning of a block lang = line[3:].strip() if lang: # then this is a *code* block within_code_block = True lang_filename = "file." + lang try: lexer = get_lexer_for_filename(lang_filename) except ClassNotFound: try: lexer = get_lexer_by_name(lang) except ClassNotFound: # can't find lexer, just use C lang as default lexer = get_lexer_by_name("c") # continue because we don't want to add backticks in the processed text continue else: # no lang, so just a generic block (non-code) lexer = None else: # then this is the end of a code block # actual highlighting happens here within_code_block = False highlighted_block = pygments.highlight( code_block, lexer, HtmlFormatter(style="solarized-light", noclasses=True, cssclass=""), ) processed_text += highlighted_block code_block = "" # reset code_block variable # continue because we don't want to add backticks in the processed text continue if within_code_block: code_block += line + "\n" else: processed_text += line + "\n" return processed_text def clean_html(dirty_html, strip_tags=False): allowed_tags = list(bleach.sanitizer.ALLOWED_TAGS) + denylist.ALLOWED_HTML_ELEMENTS + SVG_TAGS + MATHML_TAGS denylist_attrs_dict = {"*": denylist.ALLOWED_HTML_ATTRS} allowed_attrs = {**bleach.sanitizer.ALLOWED_ATTRIBUTES, **{tag: SVG_ATTRS for tag in SVG_TAGS}, **denylist_attrs_dict } if strip_tags: return bleach.clean(dirty_html, strip=True) css_sanitizer = CSSSanitizer(allowed_css_properties=ALLOWED_CSS_PROPERTIES) return bleach.clean(dirty_html, tags=allowed_tags, attributes=allowed_attrs, css_sanitizer=css_sanitizer) default_fence = md.renderer.rules.get("fence") def fence_override(tokens, idx, options, env): token = tokens[idx] code = token.content lang = token.info.strip() if lang == "dot": try: svg_bytes = Source(code, format="svg").pipe() svg_str = svg_bytes.decode() # Remove XML declaration svg_str = svg_str.replace('', '').strip() return svg_str except Exception: return f"
{code}"
# fallback to default renderer
if default_fence:
return default_fence(tokens, idx, options, env)
return f"{code}"
md.renderer.rules["fence"] = fence_override
def replace_latex_with_mathml(md_text: str) -> str:
# Replace all inline ($...$) and display ($$...$$) LaTeX with MathML
def repl(match):
latex = match.group(0)
# Use l2m4m to convert LaTeX to MathML
mathml_html = markdown.markdown(latex, extensions=[LaTeX2MathMLExtension()])
return mathml_html
# Display math first
md_text = re.sub(r"\$\$.*?\$\$", repl, md_text, flags=re.S)
# Inline math
md_text = re.sub(r"\$.*?\$", repl, md_text, flags=re.S)
return md_text
def md_to_html(markdown_string: str, strip_tags=False) -> str:
if not markdown_string:
return ""
# Convert LaTeX to MathML first
mathml_html = replace_latex_with_mathml(markdown_string)
# Pass through Markdown-It for strikethrough, tables, footnotes, and Graphviz diagrams
intermediate_html = md.render(mathml_html)
# Clean the HTML but preserve SVG and MathML
return clean_html(intermediate_html, strip_tags=strip_tags)
def remove_control_chars(text):
"""Remove control characters from a string.
We remove all characters of the Cc category of unicode, except for
\t (tab), \n (new line), \r (carriage return).
See http://www.unicode.org/reports/tr44/#General_Category_Values
"""
control_char_string = "".join(denylist.DISALLOWED_CHARACTERS)
control_char_re = re.compile(f"[{re.escape(control_char_string)}]")
return control_char_re.sub(" ", text)
def get_protocol():
if settings.DEBUG:
return "http:"
else:
return "https:"
def generate_markdown_export(user_id):
"""
Generates a markdown export ZIP file in /tmp/.
Returns (export name, export filepath).
"""
# compile all posts into dictionary
user = models.User.objects.get(id=user_id)
user_posts = models.Post.objects.filter(owner=user)
export_posts = []
for p in user_posts:
pub_date = p.published_at or p.created_at
title = p.slug + ".md"
body = f"# {p.title}\n\n"
body += f"> Published on {pub_date.strftime('%b %-d, %Y')}\n\n"
body += f"{p.body}\n"
export_posts.append((title, io.BytesIO(body.encode())))
# write zip archive in /tmp/
export_name = "export-markdown-" + str(uuid.uuid4())[:8]
container_dir = f"{user.username}-bocpress-blog"
zip_outfile = f"/tmp/{export_name}.zip"
with zipfile.ZipFile(
zip_outfile, "a", zipfile.ZIP_DEFLATED, False
) as export_archive:
for file_name, data in export_posts:
export_archive.writestr(
export_name + f"/{container_dir}/" + file_name, data.getvalue()
)
return (export_name, zip_outfile)
def escape_quotes(input_string):
output_string = input_string.replace('"', '\\"')
return output_string
def reading_time(text):
"""Calculate reading time in minutes for a given bit of text."""
words = re.findall(r"\w+", strip_tags(text))
minutes = math.ceil(len(words) / 200) # 200 wpm
return minutes