From 560ca092709448ed89a047a14b91fb694f809667 Mon Sep 17 00:00:00 2001 From: Neil Williams Date: Thu, 17 Nov 2011 19:49:02 -0800 Subject: [PATCH] Remove markdown.py and replace scattered uses of it. It was used directly in two places in pages.py, I've replaced those uses with safemarkdown calls. In the case of the search fail page, I just removed the javascript try-again link since it wouldn't be doable through safemarkdown and that page isn't hit very frequently any more anyway. --- r2/example.ini | 2 +- r2/r2/lib/contrib/markdown.py | 687 -------------------------------- r2/r2/lib/filters.py | 3 - r2/r2/lib/pages/pages.py | 11 +- r2/r2/lib/py_markdown.py | 59 --- r2/r2/lib/strings.py | 2 +- r2/r2/templates/searchfail.html | 17 - 7 files changed, 5 insertions(+), 776 deletions(-) delete mode 100644 r2/r2/lib/contrib/markdown.py delete mode 100644 r2/r2/lib/py_markdown.py diff --git a/r2/example.ini b/r2/example.ini index 502673056..dd9f81d6a 100755 --- a/r2/example.ini +++ b/r2/example.ini @@ -312,7 +312,7 @@ takedown_sr = _takedowns png_optimizer = /usr/bin/env optipng # bad words that should be *'d out profanity_wordlist = -# which markdown backend to use (c = discount, py = markdown.py, snudown = snudown) +# which markdown backend to use (c = discount, snudown = snudown) markdown_backend = snudown # -- search -- diff --git a/r2/r2/lib/contrib/markdown.py b/r2/r2/lib/contrib/markdown.py deleted file mode 100644 index 854ff0128..000000000 --- a/r2/r2/lib/contrib/markdown.py +++ /dev/null @@ -1,687 +0,0 @@ -#!/usr/bin/python -import re, md5, sys, string - -"""markdown.py: A Markdown-styled-text to HTML converter in Python. - -Usage: - ./markdown.py textfile.markdown - -Calling: - import markdown - somehtml = markdown.markdown(sometext) -""" - -__version__ = '1.0.1-2' # port of 1.0.1 -__license__ = "GNU GPL 2" -__author__ = [ - 'John Gruber ', - 'Tollef Fog Heen ', - 'Aaron Swartz ' -] - -def htmlquote(text): - """Encodes `text` for raw use in HTML.""" - text = text.replace("&", "&") # Must be done first! - text = text.replace("<", "<") - text = text.replace(">", ">") - text = text.replace("'", "'") - text = text.replace('"', """) - return text - -def mangle_text(text): - from pylons import g - return md5.new(text + g.SECRET).hexdigest() - -def semirandom(seed): - from pylons import g - x = 0 - for c in md5.new(seed + g.SECRET).digest(): x += ord(c) - return x / (255*16.) - -class _Markdown: - emptyelt = " />" - tabwidth = 4 - - escapechars = '\\`*_{}[]()>#+-.!' - escapetable = {} - for char in escapechars: - escapetable[char] = mangle_text(char) - - r_multiline = re.compile("\n{2,}") - r_stripspace = re.compile(r"^[ \t]+$", re.MULTILINE) - def parse(self, text): - self.urls = {} - self.titles = {} - self.html_blocks = {} - self.list_level = 0 - - text = text.replace("\r\n", "\n") - text = text.replace("\r", "\n") - text += "\n\n" - text = self._Detab(text) - text = self.r_stripspace.sub("", text) - text = self._HashHTMLBlocks(text) - text = self._StripLinkDefinitions(text) - text = self._RunBlockGamut(text) - text = self._UnescapeSpecialChars(text) - return text - - r_StripLinkDefinitions = re.compile(r""" - ^[ ]{0,%d}\[(.+)\]: # id = $1 - [ \t]*\n?[ \t]* - ? # url = $2 - [ \t]*\n?[ \t]* - (?: - (?<=\s) # lookbehind for whitespace - [\"\(] # " is backlashed so it colorizes our code right - (.+?) # title = $3 - [\"\)] - [ \t]* - )? # title is optional - (?:\n+|\Z) - """ % (tabwidth-1), re.MULTILINE|re.VERBOSE) - def _StripLinkDefinitions(self, text): - def replacefunc(matchobj): - (t1, t2, t3) = matchobj.groups() - #@@ case sensitivity? - self.urls[t1.lower()] = self._EncodeAmpsAndAngles(t2) - if t3 is not None: - self.titles[t1.lower()] = t3.replace('"', '"') - return "" - - text = self.r_StripLinkDefinitions.sub(replacefunc, text) - return text - - blocktagsb = r"p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|math" - blocktagsa = blocktagsb + "|ins|del" - - r_HashHTMLBlocks1 = re.compile(r""" - ( # save in $1 - ^ # start of line (with /m) - <(%s) # start tag = $2 - \b # word break - (.*\n)*? # any number of lines, minimally matching - # the matching end tag - [ \t]* # trailing spaces/tabs - (?=\n+|$) # followed by a newline or end of document - ) - """ % blocktagsa, re.MULTILINE | re.VERBOSE) - - r_HashHTMLBlocks2 = re.compile(r""" - ( # save in $1 - ^ # start of line (with /m) - <(%s) # start tag = $2 - \b # word break - (.*\n)*? # any number of lines, minimally matching - .* # the matching end tag - [ \t]* # trailing spaces/tabs - (?=\n+|\Z) # followed by a newline or end of document - ) - """ % blocktagsb, re.MULTILINE | re.VERBOSE) - - r_HashHR = re.compile(r""" - (?: - (?<=\n\n) # Starting after a blank line - | # or - \A\n? # the beginning of the doc - ) - ( # save in $1 - [ ]{0,%d} - <(hr) # start tag = $2 - \b # word break - ([^<>])*? # - /?> # the matching end tag - [ \t]* - (?=\n{2,}|\Z)# followed by a blank line or end of document - ) - """ % (tabwidth-1), re.VERBOSE) - r_HashComment = re.compile(r""" - (?: - (?<=\n\n) # Starting after a blank line - | # or - \A\n? # the beginning of the doc - ) - ( # save in $1 - [ ]{0,%d} - (?: - - ) - [ \t]* - (?=\n{2,}|\Z)# followed by a blank line or end of document - ) - """ % (tabwidth-1), re.VERBOSE) - - def _HashHTMLBlocks(self, text): - def handler(m): - key = m.group(1) - try: - key = key.encode('utf8') - except UnicodeDecodeError: - key = ''.join(k for k in key if ord(k) < 128) - key = mangle_text(key) - self.html_blocks[key] = m.group(1) - return "\n\n%s\n\n" % key - - text = self.r_HashHTMLBlocks1.sub(handler, text) - text = self.r_HashHTMLBlocks2.sub(handler, text) - oldtext = text - text = self.r_HashHR.sub(handler, text) - text = self.r_HashComment.sub(handler, text) - return text - - #@@@ wrong! - r_hr1 = re.compile(r'^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$', re.M) - r_hr2 = re.compile(r'^[ ]{0,2}([ ]?-[ ]?){3,}[ \t]*$', re.M) - r_hr3 = re.compile(r'^[ ]{0,2}([ ]?_[ ]?){3,}[ \t]*$', re.M) - - def _RunBlockGamut(self, text): - text = self._DoHeaders(text) - for x in [self.r_hr1, self.r_hr2, self.r_hr3]: - text = x.sub("\ns. - text = self._HashHTMLBlocks(text) - text = self._FormParagraphs(text) - return text - - r_NewLine = re.compile(" {2,}\n") - def _RunSpanGamut(self, text): - text = self._DoCodeSpans(text) - text = self._EscapeSpecialChars(text) - text = self._DoImages(text) - text = self._DoAnchors(text) - text = self._DoAutoLinks(text) - text = self._EncodeAmpsAndAngles(text) - text = self._DoItalicsAndBold(text) - text = self.r_NewLine.sub(" ? # href = $3 - [ \t]* - ( # $4 - ([\'\"]) # quote char = $5 - (.*?) # Title = $6 - \5 # matching quote - )? # title is optional - \) - ) - """, re.S|re.VERBOSE) - def _DoAnchors(self, text): - # We here don't do the same as the perl version, as python's regex - # engine gives us no way to match brackets. - - def handler1(m): - whole_match = m.group(1) - link_text = m.group(2) - link_id = m.group(3).lower() - if not link_id: link_id = link_text.lower() - title = self.titles.get(link_id, None) - - - if self.urls.has_key(link_id): - url = self.urls[link_id] - url = url.replace("*", self.escapetable["*"]) - url = url.replace("_", self.escapetable["_"]) - url = url.replace("[", self.escapetable["["]) - res = '? # src url = $3 - [ \t]* - ( # $4 - ([\'\"]) # quote char = $5 - (.*?) # title = $6 - \5 # matching quote - [ \t]* - )? # title is optional - \) - ) - """, re.VERBOSE|re.S) - - def _DoImages(self, text): - def handler1(m): - whole_match = m.group(1) - alt_text = m.group(2) - link_id = m.group(3).lower() - - if not link_id: - link_id = alt_text.lower() - - alt_text = alt_text.replace('"', """) - if self.urls.has_key(link_id): - url = self.urls[link_id] - url = url.replace("*", self.escapetable["*"]) - url = url.replace("_", self.escapetable["_"]) - res = '''%s= len(textl): continue - count = textl[i].strip().count(c) - if count > 0 and count == len(textl[i].strip()) and textl[i+1].strip() == '' and textl[i-1].strip() != '': - textl = textl[:i] + textl[i+1:] - textl[i-1] = ''+self._RunSpanGamut(textl[i-1])+'' - textl = textl[:i] + textl[i+1:] - text = '\n'.join(textl) - return text - - def handler(m): - level = len(m.group(1)) - header = self._RunSpanGamut(m.group(2)) - return "%s\n\n" % (level, header, level) - - text = findheader(text, '=', '1') - text = findheader(text, '-', '2') - text = self.r_DoHeaders.sub(handler, text) - return text - - rt_l = r""" - ( - ( - [ ]{0,%d} - ([*+-]|\d+[.]) - [ \t]+ - ) - (?:.+?) - ( - \Z - | - \n{2,} - (?=\S) - (?![ \t]* ([*+-]|\d+[.])[ \t]+) - ) - ) - """ % (tabwidth - 1) - r_DoLists = re.compile('^'+rt_l, re.M | re.VERBOSE | re.S) - r_DoListsTop = re.compile( - r'(?:\A\n?|(?<=\n\n))'+rt_l, re.M | re.VERBOSE | re.S) - - def _DoLists(self, text): - def handler(m): - list_type = "ol" - if m.group(3) in [ "*", "-", "+" ]: - list_type = "ul" - listn = m.group(1) - listn = self.r_multiline.sub("\n\n\n", listn) - res = self._ProcessListItems(listn) - res = "<%s>\n%s\n" % (list_type, res, list_type) - return res - - if self.list_level: - text = self.r_DoLists.sub(handler, text) - else: - text = self.r_DoListsTop.sub(handler, text) - return text - - r_multiend = re.compile(r"\n{2,}\Z") - r_ProcessListItems = re.compile(r""" - (\n)? # leading line = $1 - (^[ \t]*) # leading whitespace = $2 - ([*+-]|\d+[.]) [ \t]+ # list marker = $3 - ((?:.+?) # list item text = $4 - (\n{1,2})) - (?= \n* (\Z | \2 ([*+-]|\d+[.]) [ \t]+)) - """, re.VERBOSE | re.M | re.S) - - def _ProcessListItems(self, text): - self.list_level += 1 - text = self.r_multiend.sub("\n", text) - - def handler(m): - item = m.group(4) - leading_line = m.group(1) - leading_space = m.group(2) - - if leading_line or self.r_multiline.search(item): - item = self._RunBlockGamut(self._Outdent(item)) - else: - item = self._DoLists(self._Outdent(item)) - if item[-1] == "\n": item = item[:-1] # chomp - item = self._RunSpanGamut(item) - return "
  • %s
  • \n" % item - - text = self.r_ProcessListItems.sub(handler, text) - self.list_level -= 1 - return text - - r_DoCodeBlocks = re.compile(r""" - (?:\n\n|\A) - ( # $1 = the code block - (?: - (?:[ ]{%d} | \t) # Lines must start with a tab or equiv - .*\n+ - )+ - ) - ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space/end of doc - """ % (tabwidth, tabwidth), re.M | re.VERBOSE) - def _DoCodeBlocks(self, text): - def handler(m): - codeblock = m.group(1) - codeblock = self._EncodeCode(self._Outdent(codeblock)) - codeblock = self._Detab(codeblock) - codeblock = codeblock.lstrip("\n") - codeblock = codeblock.rstrip() - res = "\n\n
    %s\n
    \n\n" % codeblock - return res - - text = self.r_DoCodeBlocks.sub(handler, text) - return text - r_DoCodeSpans = re.compile(r""" - (`+) # $1 = Opening run of ` - (.+?) # $2 = The code block - (?%s" % c - - text = self.r_DoCodeSpans.sub(handler, text) - return text - - def _EncodeCode(self, text): - text = text.replace("&","&") - text = text.replace("<","<") - text = text.replace(">",">") - for c in "*_{}[]\\": - text = text.replace(c, self.escapetable[c]) - return text - - - r_DoBold = re.compile(r"(\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1", re.VERBOSE | re.S) - r_DoItalics = re.compile(r"(\*|_) (?=\S) (.+?) (?<=\S) \1", re.VERBOSE | re.S) - def _DoItalicsAndBold(self, text): - text = self.r_DoBold.sub(r"\2", text) - text = self.r_DoItalics.sub(r"\2", text) - return text - - r_start = re.compile(r"^", re.M) - ####r_DoBlockQuotes1 = re.compile(r"^[ \t]*>[ \t]?", re.M) - r_DoBlockQuotes1 = re.compile(r"^[ \t]*>[ \t]?", re.M) - r_DoBlockQuotes2 = re.compile(r"^[ \t]+$", re.M) - r_DoBlockQuotes3 = re.compile(r""" - ( # Wrap whole match in $1 - ( - ^[ \t]*>[ \t]? # '>' at the start of a line - .+\n # rest of the first line - (.+\n)* # subsequent consecutive lines - \n* # blanks - )+ - )""", re.M | re.VERBOSE) - r_protectpre = re.compile(r'(\s*
    .+?
    )', re.S) - r_propre = re.compile(r'^ ', re.M) - - def _DoBlockQuotes(self, text): - def prehandler(m): - return self.r_propre.sub('', m.group(1)) - - def handler(m): - bq = m.group(1) - bq = self.r_DoBlockQuotes1.sub("", bq) - bq = self.r_DoBlockQuotes2.sub("", bq) - bq = self._RunBlockGamut(bq) - bq = self.r_start.sub(" ", bq) - bq = self.r_protectpre.sub(prehandler, bq) - return "
    \n%s\n
    \n\n" % bq - - text = self.r_DoBlockQuotes3.sub(handler, text) - return text - - r_tabbed = re.compile(r"^([ \t]*)") - def _FormParagraphs(self, text): - text = text.strip("\n") - grafs = self.r_multiline.split(text) - - for g in xrange(len(grafs)): - t = grafs[g].strip() #@@? - if not self.html_blocks.has_key(t): - t = self._RunSpanGamut(t) - t = self.r_tabbed.sub(r"

    ", t) - t += "

    " - grafs[g] = t - - for g in xrange(len(grafs)): - t = grafs[g].strip() - if self.html_blocks.has_key(t): - grafs[g] = self.html_blocks[t] - - return "\n\n".join(grafs) - - r_EncodeAmps = re.compile(r"&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)") - r_EncodeAngles = re.compile(r"<(?![a-z/?\$!])") - def _EncodeAmpsAndAngles(self, text): - text = self.r_EncodeAmps.sub("&", text) - text = self.r_EncodeAngles.sub("<", text) - return text - - def _EncodeBackslashEscapes(self, text): - for char in self.escapechars: - text = text.replace("\\" + char, self.escapetable[char]) - return text - - r_link = re.compile(r"<((https?|ftp):[^\'\">\s]+)>", re.I) - r_email = re.compile(r""" - < - (?:mailto:)? - ( - [-.\w]+ - \@ - [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+ - ) - >""", re.VERBOSE|re.I) - def _DoAutoLinks(self, text): - text = self.r_link.sub(r'
    \1', text) - - def handler(m): - l = m.group(1) - return self._EncodeEmailAddress(self._UnescapeSpecialChars(l)) - - text = self.r_email.sub(handler, text) - return text - - r_EncodeEmailAddress = re.compile(r">.+?:") - def _EncodeEmailAddress(self, text): - encode = [ - lambda x: "&#%s;" % ord(x), - lambda x: "&#x%X;" % ord(x), - lambda x: x - ] - - text = "mailto:" + text - addr = "" - for c in text: - if c == ':': addr += c; continue - - r = semirandom(addr) - if r < 0.45: - addr += encode[1](c) - elif r > 0.9 and c != '@': - addr += encode[2](c) - else: - addr += encode[0](c) - - text = '%s' % (addr, addr) - text = self.r_EncodeEmailAddress.sub('>', text) - return text - - def _UnescapeSpecialChars(self, text): - for key in self.escapetable.keys(): - text = text.replace(self.escapetable[key], key) - return text - - tokenize_depth = 6 - tokenize_nested_tags = '|'.join([r'(?:<[a-z/!$](?:[^<>]'] * tokenize_depth) + (')*>)' * tokenize_depth) - r_TokenizeHTML = re.compile( - r"""(?: ) | # comment - (?: <\? .*? \?> ) | # processing instruction - %s # nested tags - """ % tokenize_nested_tags, re.I|re.VERBOSE) - def _TokenizeHTML(self, text): - pos = 0 - tokens = [] - matchobj = self.r_TokenizeHTML.search(text, pos) - while matchobj: - whole_tag = matchobj.string[matchobj.start():matchobj.end()] - sec_start = matchobj.end() - tag_start = sec_start - len(whole_tag) - if pos < tag_start: - tokens.append(["text", matchobj.string[pos:tag_start]]) - - tokens.append(["tag", whole_tag]) - pos = sec_start - matchobj = self.r_TokenizeHTML.search(text, pos) - - if pos < len(text): - tokens.append(["text", text[pos:]]) - return tokens - - r_Outdent = re.compile(r"""^(\t|[ ]{1,%d})""" % tabwidth, re.M) - def _Outdent(self, text): - text = self.r_Outdent.sub("", text) - return text - - def _Detab(self, text): return text.expandtabs(self.tabwidth) - -def Markdown(*args, **kw): return _Markdown().parse(*args, **kw) -markdown = Markdown - -if __name__ == '__main__': - if len(sys.argv) > 1: - print Markdown(open(sys.argv[1]).read()) - else: - print Markdown(sys.stdin.read()) diff --git a/r2/r2/lib/filters.py b/r2/r2/lib/filters.py index ac0d3838d..c46b15250 100644 --- a/r2/r2/lib/filters.py +++ b/r2/r2/lib/filters.py @@ -209,7 +209,6 @@ def markdown_souptest(text, nofollow=False, target=None, lang=None): #@memoize('markdown') def safemarkdown(text, nofollow=False, target=None, lang=None, wrap=True): from r2.lib.c_markdown import c_markdown - from r2.lib.py_markdown import py_markdown if c.user.pref_no_profanity: text = profanity_filter(text) @@ -227,8 +226,6 @@ def safemarkdown(text, nofollow=False, target=None, lang=None, wrap=True): text = snudown.markdown(_force_utf8(text), nofollow, target) elif lang == "c": text = c_markdown(text, nofollow, target) - elif lang == "py": - text = py_markdown(text, nofollow, target) else: raise ValueError("weird lang [%s]" % lang) diff --git a/r2/r2/lib/pages/pages.py b/r2/r2/lib/pages/pages.py index 5f79b28d4..205f2bb53 100644 --- a/r2/r2/lib/pages/pages.py +++ b/r2/r2/lib/pages/pages.py @@ -37,7 +37,6 @@ from pylons.controllers.util import abort from r2.lib import promote from r2.lib.traffic import load_traffic, load_summary from r2.lib.captcha import get_iden -from r2.lib.contrib.markdown import markdown from r2.lib.filters import spaceCompress, _force_unicode, _force_utf8 from r2.lib.filters import unsafe, websafe, SC_ON, SC_OFF, websafe_json from r2.lib.menus import NavButton, NamedButton, NavMenu, PageNameNav, JsButton @@ -53,6 +52,7 @@ from r2.lib.scraper import get_media_embed from r2.lib.log import log_text from r2.lib.memoize import memoize from r2.lib.utils import trunc_string as _truncate +from r2.lib.filters import safemarkdown import sys, random, datetime, locale, calendar, simplejson, re, time import graph, pycountry, time @@ -1453,9 +1453,7 @@ class Thanks(Templated): if g.lounge_reddit: lounge_url = "/r/" + g.lounge_reddit - lounge_html = (SC_OFF + - markdown(strings.lounge_msg % dict(link=lounge_url)) - + SC_ON) + lounge_html = safemarkdown(strings.lounge_msg % dict(link=lounge_url)) else: lounge_html = None Templated.__init__(self, status=status, secret=secret, @@ -1670,10 +1668,7 @@ class SearchBar(Templated): class SearchFail(Templated): """Search failure page.""" def __init__(self, **kw): - md = SC_OFF + markdown(strings.search_failed % dict( - link="javascript:tryagain\(\)")) + SC_ON - - self.errmsg = md + self.errmsg = strings.search_failed Templated.__init__(self) diff --git a/r2/r2/lib/py_markdown.py b/r2/r2/lib/py_markdown.py deleted file mode 100644 index 312964736..000000000 --- a/r2/r2/lib/py_markdown.py +++ /dev/null @@ -1,59 +0,0 @@ -from contrib.markdown import markdown -import re - -r_url = re.compile('(?', re.I | re.S) -img = re.compile('', re.I | re.S) -href_re = re.compile('([^<]+)') -a_re = re.compile('>([^<]+)') -fix_url = re.compile('<(http://[^\s\'\"\]\)]+)>') - -def code_handler(m): - l = m.group(1) - return '%s' % l.replace('&','&') - -#unescape double escaping in links -def inner_a_handler(m): - l = m.group(1) - return '>%s' % l.replace('&','&') - -def py_markdown(text, nofollow=False, target=None): - # increase escaping of &, < and > once - text = text.replace("&", "&").replace("<", "<").replace(">", ">") - - #wrap urls in "<>" so that markdown will handle them as urls - text = r_url.sub(r'<\1>', text) - - text = markdown(text) - - text = img.sub('', text) #remove images - # remove the "&" escaping in urls - text = code_re.sub(code_handler, text) - text = a_re.sub(inner_a_handler, text) - - #remove images - text = img.sub('', text) - - #wipe malicious javascript - text = jscript_url.sub('', text) - - # remove the "&" escaping in urls - def href_handler(m): - url = m.group(1).replace('&', '&') - link = ' - var searchfail_timeout = new Date(); - - function tryagain() { - elapsed = new Date() - searchfail_timeout; - - seconds = elapsed / 1000; - - if (seconds < 10) { - alert("Please don't pound our servers! " + - "Give them a few minutes to cool off."); - } else { - window.location.reload(); - } - } - -
    ${unsafe(thing.errmsg)}