# Module-level patterns for the wiki table-of-contents generator.
# NOTE(review): title_re is defined but not referenced by any code in this
# change — confirm a follow-up actually uses it, or drop it.
title_re = re.compile(r'\w|-')
header_re = re.compile(r'^h[1-6]$')


def inject_table_of_contents(soup, prefix):
    """Prepend a linked table of contents to a BeautifulSoup document.

    Scans ``soup`` for h1-h6 tags, assigns each a unique, URL-safe ``id``
    attribute (prefixed with ``prefix`` to avoid clashing with ids elsewhere
    on the page), and inserts a ``<div class="toc">`` containing a nested
    ``<ul>`` of anchor links at the top of the document.

    The soup is modified in place; returns None.  Documents with no headers
    are left untouched.
    """
    header_ids = Counter()
    headers = soup.findAll(header_re)
    if not headers:
        return

    tocdiv = Tag(soup, "div", [("class", "toc")])
    parent = Tag(soup, "ul")
    tocdiv.append(parent)
    level = 0
    previous = 0

    for header in headers:
        contents = u''.join(header.findAll(text=True))

        # Headers with no text get no id and no TOC entry.
        if not contents:
            continue

        # Convert html entities to avoid ugly header ids
        aid = unicode(BeautifulSoup(contents,
                      convertEntities=BeautifulSoup.XML_ENTITIES))
        # Prefix with PREFIX_ to avoid ID conflict with the rest of the page
        aid = u'%s_%s' % (prefix, aid.replace(" ", "_").lower())
        # Convert down to ascii by url encoding
        aid = urllib.quote(aid.encode('utf-8'))

        # Ensure the id is unique.  Repeats are numbered starting with the
        # second instance ("foo", "foo2", "foo3", ...).  Every id actually
        # emitted is recorded in the counter, so a literal header such as
        # "foo2" cannot silently collide with a generated "foo2".
        base = aid
        while header_ids[aid]:
            header_ids[base] += 1
            aid = '%s%d' % (base, header_ids[base])
        header_ids[aid] += 1

        header['id'] = aid

        li = Tag(soup, "li")
        a = Tag(soup, "a", [("href", "#%s" % aid)])
        a.string = contents
        li.append(a)

        # Nest one <ul> deeper when this header outranks the last one, and
        # pop back up one level when it is larger.  Nesting is relative to
        # the previous header, not to absolute h1-h6 rank, and only one
        # level is popped per header even if the rank jumps by several.
        thislevel = int(header.name[-1])
        if previous and thislevel > previous:
            newul = Tag(soup, "ul")
            parent.append(newul)
            parent = newul
            level += 1
        elif level and thislevel < previous:
            parent = parent.findParent("ul")
            level -= 1

        previous = thislevel
        parent.append(li)

    soup.insert(0, tocdiv)