Wiki: Replace TOC with a python generated one
- Replaces terrible anchor links with header-content-based ones
- Fixes quirks with oddly ordered headers
- Fixes HTML in headers appearing as text in TOC
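The new anchors are derived from the header text itself: entities are decoded, the text is lowercased with spaces turned into underscores, a prefix is added to avoid id collisions with the rest of the page, and the result is URL-quoted down to ASCII. A minimal sketch of that derivation, using the same calls as the diff below (Python 2, BeautifulSoup 3; the sample header text "Q &amp; A" is made up):

    import urllib
    from BeautifulSoup import BeautifulSoup

    contents = u"Q &amp; A"
    # Decode XML entities so "&amp;" does not end up in the id verbatim
    aid = unicode(BeautifulSoup(contents,
                                convertEntities=BeautifulSoup.XML_ENTITIES))
    # Prefix, lowercase, and replace spaces with underscores
    aid = u'%s_%s' % ("wiki", aid.replace(" ", "_").lower())
    # URL-encode down to ascii
    aid = urllib.quote(aid.encode('utf-8'))
    print aid  # wiki_q_%26_a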
@@ -24,13 +24,16 @@ import cgi
 import os
 import urllib
 import re
+
+from collections import Counter
+
 import snudown
 from cStringIO import StringIO
 
 from xml.sax.handler import ContentHandler
 from lxml.sax import saxify
 import lxml.etree
-from BeautifulSoup import BeautifulSoup
+from BeautifulSoup import BeautifulSoup, Tag
 
 from pylons import g, c
 
@@ -252,7 +255,7 @@ def wikimarkdown(text):
     target = None
 
     text = snudown.markdown(_force_utf8(text), nofollow, target,
-                            renderer=snudown.RENDERER_WIKI, enable_toc=True)
+                            renderer=snudown.RENDERER_WIKI)
 
     # TODO: We should test how much of a load this adds to the app
     soup = BeautifulSoup(text)
@@ -260,10 +263,69 @@ def wikimarkdown(text):
 
     if images:
         [img_swap(image) for image in images]
-        text = str(soup)
+
+    inject_table_of_contents(soup, prefix="wiki")
+
+    text = str(soup)
 
     return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
 
+title_re = re.compile('\w|-')
+header_re = re.compile('^h[1-6]$')
+def inject_table_of_contents(soup, prefix):
+    header_ids = Counter()
+    headers = soup.findAll(header_re)
+    if not headers:
+        return
+    tocdiv = Tag(soup, "div", [("class", "toc")])
+    parent = Tag(soup, "ul")
+    tocdiv.append(parent)
+    level = 0
+    previous = 0
+    for header in headers:
+        contents = u''.join(header.findAll(text=True))
+
+        # In the event of an empty header, skip
+        if not contents:
+            continue
+
+        # Convert html entities to avoid ugly header ids
+        aid = unicode(BeautifulSoup(contents, convertEntities=BeautifulSoup.XML_ENTITIES))
+        # Prefix with PREFIX_ to avoid ID conflict with the rest of the page
+        aid = u'%s_%s' % (prefix, aid.replace(" ", "_").lower())
+        # Convert down to ascii by url encoding
+        aid = urllib.quote(aid.encode('utf-8'))
+
+        # Check to see if a tag with the same ID exists
+        id_num = header_ids[aid] + 1
+        header_ids[aid] += 1
+        # Only start numbering ids with the second instance of an id
+        if id_num > 1:
+            aid = '%s%d' % (aid, id_num)
+
+        header['id'] = aid
+
+        li = Tag(soup, "li")
+        a = Tag(soup, "a", [("href", "#%s" % aid)])
+        a.string = contents
+        li.append(a)
+
+        thislevel = int(header.name[-1])
+
+        if previous and thislevel > previous:
+            newul = Tag(soup, "ul")
+            parent.append(newul)
+            parent = newul
+            level += 1
+        elif level and thislevel < previous:
+            parent = parent.findParent("ul")
+            level -= 1
+
+        previous = thislevel
+        parent.append(li)
+
+    soup.insert(0, tocdiv)
+
 def keep_space(text):
     text = websafe(text)
     for i in " \n\r\t":
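A rough usage sketch of the new helper in isolation (Python 2, BeautifulSoup 3), with inject_table_of_contents defined as in the diff above; the sample headers are invented. It shows the duplicate-id numbering and the nested list produced when a deeper header follows a shallower one:

    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup("<h1>Intro</h1><h2>Setup</h2><h2>Setup</h2><h1>FAQ</h1>")
    inject_table_of_contents(soup, prefix="wiki")
    print soup
    # Roughly (serialized without the indentation shown here):
    # <div class="toc"><ul>
    #   <li><a href="#wiki_intro">Intro</a></li>
    #   <ul>
    #     <li><a href="#wiki_setup">Setup</a></li>
    #     <li><a href="#wiki_setup2">Setup</a></li>
    #   </ul>
    #   <li><a href="#wiki_faq">FAQ</a></li>
    # </ul></div>
    # <h1 id="wiki_intro">Intro</h1><h2 id="wiki_setup">Setup</h2>...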