From 2734d5800cfdc47784814bb117eff33f0b3c5aa1 Mon Sep 17 00:00:00 2001 From: Logan Hanks Date: Thu, 16 Jun 2011 15:18:11 -0700 Subject: [PATCH] Add xhtml entity definitions via minimal DTD before parsing markdown output. Fixes issue #38. --- r2/r2/lib/contrib/dtds/README | 2 + r2/r2/lib/contrib/dtds/xhtml-lat1.ent | 196 +++++++++++++++++++ r2/r2/lib/contrib/dtds/xhtml-special.ent | 80 ++++++++ r2/r2/lib/contrib/dtds/xhtml-symbol.ent | 237 +++++++++++++++++++++++ r2/r2/lib/contrib/dtds/xhtml.dtd | 9 + r2/r2/lib/filters.py | 16 +- 6 files changed, 538 insertions(+), 2 deletions(-) create mode 100644 r2/r2/lib/contrib/dtds/README create mode 100644 r2/r2/lib/contrib/dtds/xhtml-lat1.ent create mode 100644 r2/r2/lib/contrib/dtds/xhtml-special.ent create mode 100644 r2/r2/lib/contrib/dtds/xhtml-symbol.ent create mode 100644 r2/r2/lib/contrib/dtds/xhtml.dtd diff --git a/r2/r2/lib/contrib/dtds/README b/r2/r2/lib/contrib/dtds/README new file mode 100644 index 000000000..232f12854 --- /dev/null +++ b/r2/r2/lib/contrib/dtds/README @@ -0,0 +1,2 @@ +This directory provides a minimal DTD defining XHTML entities, for parsing +HTML generated by markdown. 
diff --git a/r2/r2/lib/contrib/dtds/xhtml-lat1.ent b/r2/r2/lib/contrib/dtds/xhtml-lat1.ent new file mode 100644 index 000000000..ffee223eb --- /dev/null +++ b/r2/r2/lib/contrib/dtds/xhtml-lat1.ent @@ -0,0 +1,196 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/r2/r2/lib/contrib/dtds/xhtml-special.ent b/r2/r2/lib/contrib/dtds/xhtml-special.ent new file mode 100644 index 000000000..ca358b2fe --- /dev/null +++ b/r2/r2/lib/contrib/dtds/xhtml-special.ent @@ -0,0 +1,80 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/r2/r2/lib/contrib/dtds/xhtml-symbol.ent b/r2/r2/lib/contrib/dtds/xhtml-symbol.ent new file mode 100644 index 000000000..63c2abfa6 --- /dev/null +++ b/r2/r2/lib/contrib/dtds/xhtml-symbol.ent @@ -0,0 +1,237 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/r2/r2/lib/contrib/dtds/xhtml.dtd b/r2/r2/lib/contrib/dtds/xhtml.dtd new file mode 100644 index 000000000..b935f3214 --- /dev/null +++ b/r2/r2/lib/contrib/dtds/xhtml.dtd @@ -0,0 +1,9 @@ + +%HTMLlat1; + + +%HTMLspecial; + + +%HTMLsymbol; + diff --git a/r2/r2/lib/filters.py b/r2/r2/lib/filters.py index a54b92771..73920846b 100644 --- a/r2/r2/lib/filters.py +++ b/r2/r2/lib/filters.py @@ -20,6 +20,7 @@ # CondeNet, Inc. All Rights Reserved. 
################################################################################ import cgi +import os import urllib import re from cStringIO import StringIO @@ -166,14 +167,25 @@ markdown_boring_tags = ('p', 'em', 'strong', 'br', 'ol', 'ul', 'hr', 'li', for bt in markdown_boring_tags: markdown_ok_tags[bt] = () +markdown_xhtml_dtd_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'contrib/dtds/xhtml.dtd') + +markdown_dtd = '' % markdown_xhtml_dtd_path + def markdown_souptest(text, nofollow=False, target=None, lang=None): if not text: return text smd = safemarkdown(text, nofollow, target, lang) - s = StringIO(smd) - tree = lxml.etree.parse(s) + # Prepend a DTD reference so we can load up definitions of all the standard + # XHTML entities (&nbsp;, etc.). + smd_with_dtd = markdown_dtd + smd + + s = StringIO(smd_with_dtd) + parser = lxml.etree.XMLParser(load_dtd=True) + tree = lxml.etree.parse(s, parser) handler = SouptestSaxHandler(markdown_ok_tags) saxify(tree, handler)