Image previews: prefer direct image links

As requested by @dforsyth, when extracting links from self-posts for image
previews, we'll now use an imgur or image-looking link if available, overriding
the default behavior of choosing the first link in the post text.  This helps
us deal with AMAs in particular, where sometimes posters will link to their
website, YouTube videos, etc. and have a "proof image" at the bottom of the
post.
This commit is contained in:
xiongchiamiov
2015-05-21 11:56:24 -07:00
parent b40da70feb
commit 970ed3bdad
4 changed files with 92 additions and 5 deletions

View File

@@ -56,6 +56,7 @@ from r2.lib.utils import (
domain,
extract_urls_from_markdown,
get_requests_resp_json,
is_subdomain,
)
from r2.models.link import Link
from r2.models.media_cache import (
@@ -335,12 +336,19 @@ def _get_scrape_url(link):
return link.url
urls = extract_urls_from_markdown(link.selftext)
second_choice = None
for url in urls:
p = UrlParser(url)
if not p.is_reddit_url():
if p.is_reddit_url():
continue
# If we don't find anything we like better, use the first non-reddit link.
if not second_choice:
second_choice = url
# This is an optimization for "proof images" in AMAs.
if is_subdomain(p.netloc, 'imgur.com') or p.has_image_extension():
return url
return None
return second_choice
def _set_media(link, force=False, **kwargs):

View File

@@ -557,6 +557,11 @@ class UrlParser(object):
return filename_parts[-1]
def has_image_extension(self):
    """Guess, from the path extension alone, if the url leads to an image.

    The extension reported by ``path_extension()`` is lower-cased and
    looked up in a fixed set of image extensions.  Returns True on a
    match and False otherwise.
    """
    image_extensions = {'gif', 'jpeg', 'jpg', 'png', 'tiff'}
    return self.path_extension().lower() in image_extensions
def set_extension(self, extension):
"""
Changes the extension of the path to the provided value (the

View File

@@ -785,11 +785,10 @@ class Link(Thing, Printable):
return 'link'
p = UrlParser(self.url)
extension = p.path_extension().lower()
if extension in {'gif', 'jpeg', 'jpg', 'png', 'tiff'}:
if p.has_image_extension():
return 'image'
if extension in {'mp4', 'webm'}:
if p.path_extension().lower() in {'mp4', 'webm'}:
return 'video'
return 'link'

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
# Inc. All Rights Reserved.
###############################################################################
from r2.tests import stage_for_paste
stage_for_paste()
import unittest
from r2.lib.media import _get_scrape_url
from r2.models import Link
class TestGetScrapeUrl(unittest.TestCase):
    """Check which URL ``_get_scrape_url`` picks for different posts."""

    def _scrape_url_for_self_post(self, body):
        """Build a self-post whose text is *body* and return its scrape URL."""
        return _get_scrape_url(Link(is_self=True, selftext=body))

    def test_link_post(self):
        """A link post yields its own URL."""
        link_post = Link(url='https://example.com')
        self.assertEqual(_get_scrape_url(link_post), 'https://example.com')

    def test_simple_self_post(self):
        """With no image-looking link present, the first link is chosen."""
        chosen = self._scrape_url_for_self_post('''
Some text here.
https://example.com
https://reddit.com''')
        self.assertEqual(chosen, 'https://example.com')

    def test_imgur_link(self):
        """An imgur link wins over a non-image link that precedes it."""
        chosen = self._scrape_url_for_self_post('''
Some text here.
https://example.com
https://imgur.com''')
        self.assertEqual(chosen, 'https://imgur.com')

    def test_image_link(self):
        """Links are preferred only when the path ends in an image extension."""
        # A lower-case image extension is preferred over the earlier link.
        chosen = self._scrape_url_for_self_post('''
Some text here.
https://example.com
https://reddit.com/a.jpg''')
        self.assertEqual(chosen, 'https://reddit.com/a.jpg')

        # The extension check is case-insensitive.
        chosen = self._scrape_url_for_self_post('''
Some text here.
https://example.com
https://reddit.com/a.PNG''')
        self.assertEqual(chosen, 'https://reddit.com/a.PNG')

        # An image-like name in a non-final path segment does not count,
        # so the first link is used instead.
        chosen = self._scrape_url_for_self_post('''
Some text here.
https://example.com
https://reddit.com/a.jpg/b''')
        self.assertEqual(chosen, 'https://example.com')