Add new Q&A-oriented sort type

None of our current sort types work terribly well for Q&A threads.  So, we're
adding a new one that takes into account not only the votes on top-level
comments, but the votes on any OP responses, too, as well as the length of both
comments.
This commit is contained in:
xiongchiamiov
2015-02-10 14:43:52 -08:00
parent 3620e08d47
commit 62db2373f2
7 changed files with 300 additions and 14 deletions

View File

@@ -43,9 +43,14 @@ def sort_comments_key(link_id, sort):
assert sort.startswith('_')
return '%s%s' % (to36(link_id), sort)
def _get_sort_value(comment, sort):
def _get_sort_value(comment, sort, link=None, children=None):
if sort == "_date":
return epoch_seconds(comment._date)
if sort == '_qa':
# Responder is usually the OP, but there could be support for adding
# other answerers in the future.
responder_ids = link.responder_ids
return comment._qa(children, responder_ids)
return getattr(comment, sort)
def add_comments(comments):
@@ -91,19 +96,28 @@ def update_comment_votes(comments, write_consistency_level = None):
link_map = {}
for com in comments:
link_map.setdefault(com.link_id, []).append(com)
all_links = Link._byID(link_map.keys(), data=True)
comment_trees = {}
for link in all_links.values():
comment_trees[link._id] = get_comment_tree(link)
for link_id, coms in link_map.iteritems():
for sort in ("_controversy", "_hot", "_confidence", "_score", "_date"):
link = all_links[link_id]
for sort in ("_controversy", "_hot", "_confidence", "_score", "_date",
"_qa"):
cid_tree = comment_trees[link_id].tree
sorter = _comment_sorter_from_cids(coms, sort, link, cid_tree,
by_36=True)
# Cassandra always uses the id36 instead of the integer
# ID, so we'll map that first before sending it
c_key = sort_comments_key(link_id, sort)
c_r = dict((cm._id36, _get_sort_value(cm, sort))
for cm in coms)
CommentSortsCache._set_values(c_key, c_r,
CommentSortsCache._set_values(c_key, sorter,
write_consistency_level = write_consistency_level)
def _comment_sorter_from_cids(cids, sort):
def _comment_sorter_from_cids(comments, sort, link, cid_tree, by_36=False):
"""Retrieve sort values for comments.
Useful to fill in any gaps in CommentSortsCache.
@@ -119,8 +133,45 @@ def _comment_sorter_from_cids(cids, sort):
Returns a dictionary from cid to a numeric sort value.
"""
comments = Comment._byID(cids, data = False, return_dict = False)
return dict((x._id, _get_sort_value(x, sort)) for x in comments)
# The Q&A sort requires extra information about surrounding comments. It's
# more efficient to gather it up here instead of in the guts of the comment
# sort, but we don't want to do that for sort types that don't need it.
if sort == '_qa':
# An OP response will change the sort value for its parent, so we need
# to process the parent, too.
parent_cids = []
responder_ids = link.responder_ids
for c in comments:
if c.author_id in responder_ids and c.parent_id:
parent_cids.append(c.parent_id)
parent_comments = Comment._byID(parent_cids, data=True,
return_dict=False)
comments.extend(parent_comments)
# Fetch the comments in batch to avoid a bunch of separate calls down
# the line.
all_child_cids = []
for c in comments:
child_cids = cid_tree.get(c._id, None)
if child_cids:
all_child_cids.extend(child_cids)
all_child_comments = Comment._byID(all_child_cids, data=True)
comment_sorter = {}
for comment in comments:
if sort == '_qa':
child_cids = cid_tree.get(comment._id, ())
child_comments = (all_child_comments[cid] for cid in child_cids)
sort_value = _get_sort_value(comment, sort, link, child_comments)
else:
sort_value = _get_sort_value(comment, sort)
if by_36:
id = comment._id36
else:
id = comment._id
comment_sorter[id] = sort_value
return comment_sorter
def _get_comment_sorter(link_id, sort):
"""Retrieve cached sort values for all comments on a post.
@@ -213,7 +264,8 @@ def link_comments_and_sort(link, sort):
if not g.disallow_db_writes:
update_comment_votes(Comment._byID(sorter_needed, data=True, return_dict=False))
sorter.update(_comment_sorter_from_cids(sorter_needed, sort))
comments = Comment._byID(sorter_needed, data = False, return_dict = False)
sorter.update(_comment_sorter_from_cids(comments, sort, link, tree))
timer.intermediate('sort')
if parents is None:

View File

@@ -20,6 +20,7 @@
# Inc. All Rights Reserved.
###############################################################################
import math
from datetime import datetime, timedelta
from pylons import g
@@ -95,3 +96,36 @@ def confidence(int ups, int downs):
return _confidences[downs + ups * down_range]
else:
return _confidence(ups, downs)
cpdef double qa(int question_ups, int question_downs, int question_length,
op_children):
"""The Q&A-type sort.
Similar to the "best" (confidence) sort, but specially designed for
Q&A-type threads to highlight good question/answer pairs.
"""
question_score = confidence(question_ups, question_downs)
if not op_children:
return _qa(question_score, question_length)
# Only take into account the "best" answer from OP.
best_score = None
for answer in op_children:
score = confidence(answer._ups, answer._downs)
if best_score is None or score > best_score:
best_score = score
answer_length = len(answer.body)
return _qa(question_score, question_length, best_score, answer_length)
cpdef double _qa(double question_score, int question_length,
double answer_score=0, int answer_length=1):
score_modifier = question_score + answer_score
# Give more weight to longer posts, but count longer text less and less to
# avoid artificially high rankings for long-spam posts.
length_modifier = log10(question_length + answer_length)
# Add together the weighting from the scores and lengths, but emphasize
# score more.
return score_modifier + (length_modifier / 5)

View File

@@ -21,4 +21,4 @@
###############################################################################
from r2.lib.db._sorts import epoch_seconds, score, hot, _hot
from r2.lib.db._sorts import controversy, confidence
from r2.lib.db._sorts import controversy, confidence, qa

View File

@@ -23,11 +23,12 @@
from pylons import c, request
from pylons.i18n import _, N_
from r2.config import feature
from r2.lib.db import operators
from r2.lib.filters import _force_unicode
from r2.lib.search import sorts as search_sorts
from r2.lib.strings import StringHandler, plurals
from r2.lib.utils import query_string, timeago
from r2.lib.utils import class_property, query_string, timeago
from r2.lib.wrapped import Styled
@@ -56,6 +57,7 @@ menu = MenuHandler(hot = _('hot'),
gilded = _('gilded'),
confidence = _('best'),
random = _('random'),
qa = _('q & a'),
saved = _('saved {toolbar}'),
recommended = _('recommended'),
rising = _('rising'),
@@ -549,6 +551,7 @@ class SortMenu(NavMenu):
"controversial": operators.desc('_controversy'),
"confidence": operators.desc('_confidence'),
"random": operators.shuffled('_confidence'),
"qa": operators.desc('_qa'),
}
_reverse_mapping = {v: k for k, v in _mapping.iteritems()}
@@ -570,10 +573,16 @@ class CommentSortMenu(SortMenu):
"""Sort menu for comments pages"""
_default = 'confidence'
_options = ('confidence', 'top', 'new', 'hot', 'controversial', 'old',
'random')
hidden_options = ('random',)
'random', 'qa',)
button_cls = PostButton
@class_property
def hidden_options(cls):
if feature.is_enabled('qa_sort'):
return ('random',)
else:
return ('random', 'qa',)
class SearchSortMenu(SortMenu):
"""Sort menu for search pages."""

View File

@@ -112,6 +112,17 @@ class Enum(Storage):
return Storage.__contains__(self, item)
class class_property(object):
"""A decorator that combines @classmethod and @property.
http://stackoverflow.com/a/8198300/120999
"""
def __init__(self, function):
self.function = function
def __get__(self, instance, cls):
return self.function(cls)
class Results():
def __init__(self, sa_ResultProxy, build_fn, do_batch=False):
self.rp = sa_ResultProxy

View File

@@ -42,7 +42,7 @@ from r2.lib import hooks, utils
from r2.lib.log import log_text
from mako.filters import url_escape
from r2.lib.strings import strings, Score
from r2.lib.db import tdb_cassandra
from r2.lib.db import tdb_cassandra, sorts
from r2.lib.db.tdb_cassandra import view_of
from r2.lib.utils import sanitize_url
from r2.models.gold import (
@@ -732,6 +732,15 @@ class Link(Thing, Printable):
# If available, that should be used instead of calling this
return Account._byID(self.author_id, data=True, return_dict=False)
@property
def responder_ids(self):
"""Returns an iterable of the OP and other official responders in a
thread.
Designed for Q&A-type threads (eg /r/iama).
"""
return (self.author_id,)
def can_flair_slow(self, user):
"""Returns whether the specified user can flair this link"""
site = self.subreddit_slow
@@ -1040,6 +1049,31 @@ class Comment(Thing, Printable):
return pids
def _qa(self, children, responder_ids):
"""Sort a comment according to the Q&A-type sort.
Arguments:
* children -- a list of the children of this comment.
* responder_ids -- a set of ids of users categorized as "answerers" for
this thread.
"""
# This sort type only makes sense for comments, unlike the other sorts
# that can be applied to any Things, which is why it's defined here
# instead of in Thing.
op_children = [c for c in children if c.author_id in responder_ids]
score = sorts.qa(self._ups, self._downs, len(self.body), op_children)
# When replies to a question, we want to rank OP replies higher than
# non-OP replies (generally). This is a rough way to do so.
# Don't add extra scoring when we've already added it due to replies,
# though (because an OP responds to themselves).
if self.author_id in responder_ids and not op_children:
score *= 2
return score
@classmethod
def add_props(cls, user, wrapped):
from r2.lib.template_helpers import add_attr, get_domain

View File

@@ -0,0 +1,146 @@
#!/usr/bin/env python
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
# Inc. All Rights Reserved.
###############################################################################
import unittest
from collections import namedtuple
import mock
from r2.lib.comment_tree import _comment_sorter_from_cids
from r2.models.link import Comment
TINY_COMMENT = 'rekt'
SHORT_COMMENT = 'What is your favorite car from a rival brand?'
MEDIUM_COMMENT = '''I'm humbled by how many of you were interested in talking
to me and hearing about what we're doing with autonomous driving. When I signed
off last night, I never thought there would be so many more questions than what
I was able to answer after I left. I wish I had had more time last night, but
I'm answering more questions today.'''
LONG_COMMENT = '''That was a really good question then and still is and gets
back to something I said earlier about the difference between autonomous drive
vehicles, in which the driver remains in control, and self-driving cars, in
which a driver is not even required.
The focus of our R&D right now is on autonomous drive vehicles that enhance the
driving experience, by giving the driver the option of letting the car handle
some functions and by improving the car's ability to avoid accidents. That
technology exists already and will be rolled out in phases over the next
several years.
We are not dismissive of self-driving vehicles, but the reality is the timeline
for them is much further out into the future.'''
class CommentMock(Comment):
"""This class exists to allow us to call the _qa() method on Comments
without having to dick around with everything else they support."""
_nodb = True
def __init__(self, ups=1, downs=0, body=TINY_COMMENT, author_id=None):
self._ups = ups
self._downs = downs
self.body = body
self.author_id = author_id
def __setattr__(self, attr, val):
self.__dict__[attr] = val
def __getattr__(self, attr):
return self.attr
class TestCommentQaSort(unittest.TestCase):
def test_simple_upvotes(self):
"""All else equal, do upvoted comments score better?"""
no_upvotes = CommentMock(ups=1)
one_upvote = CommentMock(ups=2)
many_upvotes = CommentMock(ups=50)
no_upvotes_score = no_upvotes._qa((), ())
one_upvote_score = one_upvote._qa((), ())
many_upvotes_score = many_upvotes._qa((), ())
self.assertLess(no_upvotes_score, one_upvote_score)
self.assertLess(one_upvote_score, many_upvotes_score)
def test_simple_downvotes(self):
"""All else equal, do downvoted comments score worse?"""
no_downvotes = CommentMock(downs=0)
one_downvote = CommentMock(downs=1)
many_downvotes = CommentMock(downs=50)
no_downvotes_score = no_downvotes._qa((), ())
one_downvote_score = one_downvote._qa((), ())
many_downvotes_score = many_downvotes._qa((), ())
self.assertGreater(no_downvotes_score, one_downvote_score)
self.assertGreater(one_downvote_score, many_downvotes_score)
def test_simple_length(self):
"""All else equal, do longer comments score better?"""
tiny = CommentMock(body=TINY_COMMENT)
short = CommentMock(body=SHORT_COMMENT)
medium = CommentMock(body=MEDIUM_COMMENT)
long = CommentMock(body=LONG_COMMENT)
tiny_score = tiny._qa((), ())
short_score = short._qa((), ())
medium_score = medium._qa((), ())
long_score = long._qa((), ())
self.assertLess(tiny_score, short_score)
self.assertLess(short_score, medium_score)
self.assertLess(medium_score, long_score)
def test_simple_op_responses(self):
"""All else equal, do OP answers bump up the score of comments?"""
question = CommentMock()
answer = CommentMock(author_id=1)
no_answer_score = question._qa((), ())
no_op_answer_score = question._qa((answer,), (2,))
with_op_answer_score = question._qa((answer,), (1,))
self.assertEqual(no_answer_score, no_op_answer_score)
self.assertLess(no_op_answer_score, with_op_answer_score)
def test_multiple_op_responses(self):
"""What effect do multiple OP responses have on a comment's score?"""
question = CommentMock()
op_answer = CommentMock(author_id=1)
another_op_answer = CommentMock(author_id=1)
one_answer_score = question._qa((op_answer,), (1,))
two_answers_score = question._qa((op_answer, another_op_answer), (1,))
self.assertEqual(one_answer_score, two_answers_score)
bad_op_answer = CommentMock(ups=0, author_id=1)
good_op_answer = CommentMock(ups=30, author_id=1)
good_op_answer_score = question._qa((good_op_answer,), (1,))
op_answers = (bad_op_answer, good_op_answer, op_answer)
three_op_answers_score = question._qa(op_answers, (1,))
# Are we basing the score on the highest-scoring OP answer?
self.assertEqual(good_op_answer_score, three_op_answers_score)
def test_simple_op_comments(self):
"""All else equal, do comments from OP score better?"""
comment = CommentMock(author_id=1)
op_score = comment._qa((), (1,))
non_op_score = comment._qa((), (2,))
self.assertLess(non_op_score, op_score)