From 8cc3972b00f918213fff7eb6d524fedc0caab835 Mon Sep 17 00:00:00 2001
From: Neil Williams
Date: Thu, 7 Jun 2012 16:21:59 -0700
Subject: [PATCH] Add utility to extract linked URLs in Markdown.

---
 r2/r2/lib/utils/utils.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/r2/r2/lib/utils/utils.py b/r2/r2/lib/utils/utils.py
index 4f29e5e9f..b8d157577 100644
--- a/r2/r2/lib/utils/utils.py
+++ b/r2/r2/lib/utils/utils.py
@@ -28,15 +28,16 @@ from copy import deepcopy
 import cPickle as pickle
 import re, math, random
 
-from BeautifulSoup import BeautifulSoup
+from BeautifulSoup import BeautifulSoup, SoupStrainer
 
 from time import sleep
 from datetime import datetime, timedelta
 from functools import wraps, partial, WRAPPER_ASSIGNMENTS
 from pylons import g
 from pylons.i18n import ungettext, _
-from r2.lib.filters import _force_unicode
+from r2.lib.filters import _force_unicode, _force_utf8
 from mako.filters import url_escape
+import snudown
 
 from r2.lib.utils._utils import *
 
@@ -1391,3 +1392,15 @@ def wraps_api(f):
     if not hasattr(f, '_api_doc'):
         f._api_doc = {}
     return wraps(f, assigned=WRAPPER_ASSIGNMENTS+('_api_doc',))
+
+
+def extract_urls_from_markdown(md):
+    "Extract URLs that will be hot links from a piece of raw Markdown."
+
+    html = snudown.markdown(_force_utf8(md))
+    links = SoupStrainer("a")
+
+    for link in BeautifulSoup(html, parseOnlyThese=links):
+        url = link.get('href')
+        if url:
+            yield url