Add new Q&A-oriented sort type

None of our current sort types work terribly well for Q&A threads. So, we're adding a new one that takes into account not only the votes on top-level comments, but the votes on any OP responses, too, as well as the length of both comments.
2026-01-15 01:48:18 -05:00 · 2015-02-10 14:43:52 -08:00
parent 3620e08d47
commit 62db2373f2
7 changed files with 300 additions and 14 deletions
--- a/r2/r2/lib/comment_tree.py
+++ b/r2/r2/lib/comment_tree.py
@@ -43,9 +43,14 @@ def sort_comments_key(link_id, sort):
    assert sort.startswith('_')
    return '%s%s' % (to36(link_id), sort)

-def _get_sort_value(comment, sort):
+def _get_sort_value(comment, sort, link=None, children=None):
    if sort == "_date":
        return epoch_seconds(comment._date)
+    if sort == '_qa':
+        # Responder is usually the OP, but there could be support for adding
+        # other answerers in the future.
+        responder_ids = link.responder_ids
+        return comment._qa(children, responder_ids)
    return getattr(comment, sort)

 def add_comments(comments):
@@ -91,19 +96,28 @@ def update_comment_votes(comments, write_consistency_level = None):
    link_map = {}
    for com in comments:
        link_map.setdefault(com.link_id, []).append(com)
+    all_links = Link._byID(link_map.keys(), data=True)
+
+    comment_trees = {}
+    for link in all_links.values():
+        comment_trees[link._id] = get_comment_tree(link)

    for link_id, coms in link_map.iteritems():
-        for sort in ("_controversy", "_hot", "_confidence", "_score", "_date"):
+        link = all_links[link_id]
+        for sort in ("_controversy", "_hot", "_confidence", "_score", "_date",
+                     "_qa"):
+            cid_tree = comment_trees[link_id].tree
+            sorter = _comment_sorter_from_cids(coms, sort, link, cid_tree,
+                                               by_36=True)
+
            # Cassandra always uses the id36 instead of the integer
            # ID, so we'll map that first before sending it
            c_key = sort_comments_key(link_id, sort)
-            c_r = dict((cm._id36, _get_sort_value(cm, sort))
-                       for cm in coms)
-            CommentSortsCache._set_values(c_key, c_r,
+            CommentSortsCache._set_values(c_key, sorter,
                                          write_consistency_level = write_consistency_level)


-def _comment_sorter_from_cids(cids, sort):
+def _comment_sorter_from_cids(comments, sort, link, cid_tree, by_36=False):
    """Retrieve sort values for comments.

    Useful to fill in any gaps in CommentSortsCache.
@@ -119,8 +133,45 @@ def _comment_sorter_from_cids(cids, sort):

    Returns a dictionary from cid to a numeric sort value.
    """
-    comments = Comment._byID(cids, data = False, return_dict = False)
-    return dict((x._id, _get_sort_value(x, sort)) for x in comments)
+    # The Q&A sort requires extra information about surrounding comments.  It's
+    # more efficient to gather it up here instead of in the guts of the comment
+    # sort, but we don't want to do that for sort types that don't need it.
+    if sort == '_qa':
+        # An OP response will change the sort value for its parent, so we need
+        # to process the parent, too.
+        parent_cids = []
+        responder_ids = link.responder_ids
+        for c in comments:
+            if c.author_id in responder_ids and c.parent_id:
+                parent_cids.append(c.parent_id)
+        parent_comments = Comment._byID(parent_cids, data=True,
+                return_dict=False)
+        comments.extend(parent_comments)
+
+        # Fetch the comments in batch to avoid a bunch of separate calls down
+        # the line.
+        all_child_cids = []
+        for c in comments:
+            child_cids = cid_tree.get(c._id, None)
+            if child_cids:
+                all_child_cids.extend(child_cids)
+        all_child_comments = Comment._byID(all_child_cids, data=True)
+
+    comment_sorter = {}
+    for comment in comments:
+        if sort == '_qa':
+            child_cids = cid_tree.get(comment._id, ())
+            child_comments = (all_child_comments[cid] for cid in child_cids)
+            sort_value = _get_sort_value(comment, sort, link, child_comments)
+        else:
+            sort_value = _get_sort_value(comment, sort)
+        if by_36:
+            id = comment._id36
+        else:
+            id = comment._id
+        comment_sorter[id] = sort_value
+
+    return comment_sorter

 def _get_comment_sorter(link_id, sort):
    """Retrieve cached sort values for all comments on a post.
@@ -213,7 +264,8 @@ def link_comments_and_sort(link, sort):
        if not g.disallow_db_writes:
            update_comment_votes(Comment._byID(sorter_needed, data=True, return_dict=False))

-        sorter.update(_comment_sorter_from_cids(sorter_needed, sort))
+        comments = Comment._byID(sorter_needed, data = False, return_dict = False)
+        sorter.update(_comment_sorter_from_cids(comments, sort, link, tree))
        timer.intermediate('sort')

    if parents is None:
--- a/r2/r2/lib/db/_sorts.pyx
+++ b/r2/r2/lib/db/_sorts.pyx
@@ -20,6 +20,7 @@
 # Inc. All Rights Reserved.
 ###############################################################################

+import math
 from datetime import datetime, timedelta
 from pylons import g

@@ -95,3 +96,36 @@ def confidence(int ups, int downs):
        return _confidences[downs + ups * down_range]
    else:
        return _confidence(ups, downs)
+
+cpdef double qa(int question_ups, int question_downs, int question_length,
+                op_children):
+    """The Q&A-type sort.
+
+    Similar to the "best" (confidence) sort, but specially designed for
+    Q&A-type threads to highlight good question/answer pairs.
+    """
+    question_score = confidence(question_ups, question_downs)
+
+    if not op_children:
+        return _qa(question_score, question_length)
+
+    # Only take into account the "best" answer from OP.
+    best_score = None
+    for answer in op_children:
+        score = confidence(answer._ups, answer._downs)
+        if best_score is None or score > best_score:
+            best_score = score
+            answer_length = len(answer.body)
+    return _qa(question_score, question_length, best_score, answer_length)
+
+cpdef double _qa(double question_score, int question_length,
+                 double answer_score=0, int answer_length=1):
+    score_modifier = question_score + answer_score
+
+    # Give more weight to longer posts, but count longer text less and less to
+    # avoid artificially high rankings for long-spam posts.
+    length_modifier = log10(question_length + answer_length)
+
+    # Add together the weighting from the scores and lengths, but emphasize
+    # score more.
+    return score_modifier + (length_modifier / 5)
--- a/r2/r2/lib/db/sorts.py
+++ b/r2/r2/lib/db/sorts.py
@@ -21,4 +21,4 @@
 ###############################################################################

 from r2.lib.db._sorts import epoch_seconds, score, hot, _hot
-from r2.lib.db._sorts import controversy, confidence
+from r2.lib.db._sorts import controversy, confidence, qa
--- a/r2/r2/lib/menus.py
+++ b/r2/r2/lib/menus.py
@@ -23,11 +23,12 @@
 from pylons import c, request
 from pylons.i18n import _, N_

+from r2.config import feature
 from r2.lib.db import operators
 from r2.lib.filters import _force_unicode
 from r2.lib.search import sorts as search_sorts
 from r2.lib.strings import StringHandler, plurals
-from r2.lib.utils import  query_string, timeago
+from r2.lib.utils import  class_property, query_string, timeago
 from r2.lib.wrapped import Styled


@@ -56,6 +57,7 @@ menu =   MenuHandler(hot          = _('hot'),
                     gilded       = _('gilded'),
                     confidence   = _('best'),
                     random       = _('random'),
+                     qa           = _('q & a'),
                     saved        = _('saved {toolbar}'),
                     recommended  = _('recommended'),
                     rising       = _('rising'), 
@@ -549,6 +551,7 @@ class SortMenu(NavMenu):
        "controversial": operators.desc('_controversy'),
        "confidence": operators.desc('_confidence'),
        "random": operators.shuffled('_confidence'),
+        "qa": operators.desc('_qa'),
    }
    _reverse_mapping = {v: k for k, v in _mapping.iteritems()}

@@ -570,10 +573,16 @@ class CommentSortMenu(SortMenu):
    """Sort menu for comments pages"""
    _default = 'confidence'
    _options = ('confidence', 'top', 'new', 'hot', 'controversial', 'old',
-                 'random')
-    hidden_options = ('random',)
+                 'random', 'qa',)
    button_cls = PostButton

+    @class_property
+    def hidden_options(cls):
+        if feature.is_enabled('qa_sort'):
+            return ('random',)
+        else:
+            return ('random', 'qa',)
+

 class SearchSortMenu(SortMenu):
    """Sort menu for search pages."""
--- a/r2/r2/lib/utils/utils.py
+++ b/r2/r2/lib/utils/utils.py
@@ -112,6 +112,17 @@ class Enum(Storage):
            return Storage.__contains__(self, item)


+class class_property(object):
+    """A decorator that combines @classmethod and @property.
+
+    http://stackoverflow.com/a/8198300/120999
+    """
+    def __init__(self, function):
+        self.function = function
+    def __get__(self, instance, cls):
+        return self.function(cls)
+
+
 class Results():
    def __init__(self, sa_ResultProxy, build_fn, do_batch=False):
        self.rp = sa_ResultProxy
--- a/r2/r2/models/link.py
+++ b/r2/r2/models/link.py
@@ -42,7 +42,7 @@ from r2.lib import hooks, utils
 from r2.lib.log import log_text
 from mako.filters import url_escape
 from r2.lib.strings import strings, Score
-from r2.lib.db import tdb_cassandra
+from r2.lib.db import tdb_cassandra, sorts
 from r2.lib.db.tdb_cassandra import view_of
 from r2.lib.utils import sanitize_url
 from r2.models.gold import (
@@ -732,6 +732,15 @@ class Link(Thing, Printable):
        # If available, that should be used instead of calling this
        return Account._byID(self.author_id, data=True, return_dict=False)

+    @property
+    def responder_ids(self):
+        """Returns an iterable of the OP and other official responders in a
+        thread.
+
+        Designed for Q&A-type threads (eg /r/iama).
+        """
+        return (self.author_id,)
+
    def can_flair_slow(self, user):
        """Returns whether the specified user can flair this link"""
        site = self.subreddit_slow
@@ -1040,6 +1049,31 @@ class Comment(Thing, Printable):

        return pids

+    def _qa(self, children, responder_ids):
+        """Sort a comment according to the Q&A-type sort.
+
+        Arguments:
+
+        * children -- a list of the children of this comment.
+        * responder_ids -- a set of ids of users categorized as "answerers" for
+          this thread.
+        """
+        # This sort type only makes sense for comments, unlike the other sorts
+        # that can be applied to any Things, which is why it's defined here
+        # instead of in Thing.
+
+        op_children = [c for c in children if c.author_id in responder_ids]
+        score = sorts.qa(self._ups, self._downs, len(self.body), op_children)
+
+        # When replies to a question, we want to rank OP replies higher than
+        # non-OP replies (generally).  This is a rough way to do so.
+        # Don't add extra scoring when we've already added it due to replies,
+        # though (because an OP responds to themselves).
+        if self.author_id in responder_ids and not op_children:
+            score *= 2
+
+        return score
+
    @classmethod
    def add_props(cls, user, wrapped):
        from r2.lib.template_helpers import add_attr, get_domain
--- a/r2/r2/tests/unit/models/link_test.py
+++ b/r2/r2/tests/unit/models/link_test.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+# The contents of this file are subject to the Common Public Attribution
+# License Version 1.0. (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
+# License Version 1.1, but Sections 14 and 15 have been added to cover use of
+# software over a computer network and provide for limited attribution for the
+# Original Developer. In addition, Exhibit A has been modified to be consistent
+# with Exhibit B.
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
+# the specific language governing rights and limitations under the License.
+#
+# The Original Code is reddit.
+#
+# The Original Developer is the Initial Developer.  The Initial Developer of
+# the Original Code is reddit Inc.
+#
+# All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
+# Inc. All Rights Reserved.
+###############################################################################
+
+import unittest
+from collections import namedtuple
+
+import mock
+
+from r2.lib.comment_tree import _comment_sorter_from_cids
+from r2.models.link import Comment
+
+TINY_COMMENT = 'rekt'
+SHORT_COMMENT = 'What is your favorite car from a rival brand?'
+MEDIUM_COMMENT = '''I'm humbled by how many of you were interested in talking
+to me and hearing about what we're doing with autonomous driving. When I signed
+off last night, I never thought there would be so many more questions than what
+I was able to answer after I left. I wish I had had more time last night, but
+I'm answering more questions today.'''
+LONG_COMMENT = '''That was a really good question then and still is and gets
+back to something I said earlier about the difference between autonomous drive
+vehicles, in which the driver remains in control, and self-driving cars, in
+which a driver is not even required.
+The focus of our R&D right now is on autonomous drive vehicles that enhance the
+driving experience, by giving the driver the option of letting the car handle
+some functions and by improving the car's ability to avoid accidents. That
+technology exists already and will be rolled out in phases over the next
+several years.
+We are not dismissive of self-driving vehicles, but the reality is the timeline
+for them is much further out into the future.'''
+
+class CommentMock(Comment):
+    """This class exists to allow us to call the _qa() method on Comments
+    without having to dick around with everything else they support."""
+    _nodb = True
+
+    def __init__(self, ups=1, downs=0, body=TINY_COMMENT, author_id=None):
+        self._ups = ups
+        self._downs = downs
+        self.body = body
+        self.author_id = author_id
+
+    def __setattr__(self, attr, val):
+        self.__dict__[attr] = val
+
+    def __getattr__(self, attr):
+        return self.attr
+
+
+class TestCommentQaSort(unittest.TestCase):
+    def test_simple_upvotes(self):
+        """All else equal, do upvoted comments score better?"""
+        no_upvotes = CommentMock(ups=1)
+        one_upvote = CommentMock(ups=2)
+        many_upvotes = CommentMock(ups=50)
+        no_upvotes_score = no_upvotes._qa((), ())
+        one_upvote_score = one_upvote._qa((), ())
+        many_upvotes_score = many_upvotes._qa((), ())
+
+        self.assertLess(no_upvotes_score, one_upvote_score)
+        self.assertLess(one_upvote_score, many_upvotes_score)
+
+    def test_simple_downvotes(self):
+        """All else equal, do downvoted comments score worse?"""
+        no_downvotes = CommentMock(downs=0)
+        one_downvote = CommentMock(downs=1)
+        many_downvotes = CommentMock(downs=50)
+        no_downvotes_score = no_downvotes._qa((), ())
+        one_downvote_score = one_downvote._qa((), ())
+        many_downvotes_score = many_downvotes._qa((), ())
+
+        self.assertGreater(no_downvotes_score, one_downvote_score)
+        self.assertGreater(one_downvote_score, many_downvotes_score)
+
+    def test_simple_length(self):
+        """All else equal, do longer comments score better?"""
+        tiny = CommentMock(body=TINY_COMMENT)
+        short = CommentMock(body=SHORT_COMMENT)
+        medium = CommentMock(body=MEDIUM_COMMENT)
+        long = CommentMock(body=LONG_COMMENT)
+        tiny_score = tiny._qa((), ())
+        short_score = short._qa((), ())
+        medium_score = medium._qa((), ())
+        long_score = long._qa((), ())
+
+        self.assertLess(tiny_score, short_score)
+        self.assertLess(short_score, medium_score)
+        self.assertLess(medium_score, long_score)
+
+    def test_simple_op_responses(self):
+        """All else equal, do OP answers bump up the score of comments?"""
+        question = CommentMock()
+        answer = CommentMock(author_id=1)
+        no_answer_score = question._qa((), ())
+        no_op_answer_score = question._qa((answer,), (2,))
+        with_op_answer_score = question._qa((answer,), (1,))
+
+        self.assertEqual(no_answer_score, no_op_answer_score)
+        self.assertLess(no_op_answer_score, with_op_answer_score)
+
+    def test_multiple_op_responses(self):
+        """What effect do multiple OP responses have on a comment's score?"""
+        question = CommentMock()
+        op_answer = CommentMock(author_id=1)
+        another_op_answer = CommentMock(author_id=1)
+        one_answer_score = question._qa((op_answer,), (1,))
+        two_answers_score = question._qa((op_answer, another_op_answer), (1,))
+
+        self.assertEqual(one_answer_score, two_answers_score)
+
+        bad_op_answer = CommentMock(ups=0, author_id=1)
+        good_op_answer = CommentMock(ups=30, author_id=1)
+        good_op_answer_score = question._qa((good_op_answer,), (1,))
+        op_answers = (bad_op_answer, good_op_answer, op_answer)
+        three_op_answers_score = question._qa(op_answers, (1,))
+
+        # Are we basing the score on the highest-scoring OP answer?
+        self.assertEqual(good_op_answer_score, three_op_answers_score)
+
+    def test_simple_op_comments(self):
+        """All else equal, do comments from OP score better?"""
+        comment = CommentMock(author_id=1)
+        op_score = comment._qa((), (1,))
+        non_op_score = comment._qa((), (2,))
+
+        self.assertLess(non_op_score, op_score)
+