query_cache: Add lots of documentation.

This commit is contained in:
Neil Williams
2012-10-09 20:12:34 -07:00
parent 2ef0a7f452
commit 8c62ace8bb

View File

@@ -19,6 +19,18 @@
# All portions of the code written by reddit are Copyright (c) 2006-2012 reddit
# Inc. All Rights Reserved.
###############################################################################
"""
This module provides a Cassandra-backed lockless query cache. Rather than
doing complicated queries on the fly to populate listings, a list of items that
would be in that listing are maintained in Cassandra for fast lookup. The
result can then be fed to IDBuilder to generate a final result.
Whenever an operation occurs that would modify the contents of the listing, the
listing should be updated somehow. In some cases, this can be done by directly
mutating the listing and in others it must be done offline in batch processing
jobs.
"""
import random
import datetime
@@ -43,6 +55,14 @@ LOG = g.log
class ThingTupleComparator(object):
"""A callable usable for comparing sort-data in a cached query.
The query cache stores minimal sort data on each thing to be able to order
the items in a cached query. This class provides the ordering for those
thing tuples.
"""
def __init__(self, sorts):
    # `sorts` describes which thing-tuple columns to compare and in what
    # direction; stored for use by the comparator when it is invoked.
    self.sorts = sorts
@@ -65,6 +85,12 @@ class _CachedQueryBase(object):
self._fetched = False
def fetch(self, force=False):
"""Fill the cached query's sorted item list from Cassandra.
If the query has already been fetched, this method is a no-op unless
force=True.
"""
if not force and self._fetched:
return
@@ -87,6 +113,22 @@ class _CachedQueryBase(object):
class CachedQuery(_CachedQueryBase):
"""A materialized view of a complex query.
Complicated queries can take way too long to sort in the databases. This
class provides a fast-access view of a given listing's items. The cache
stores each item's ID and a minimal subset of its data as required for
sorting.
Each time the listing is fetched, it is sorted. Because of this, we need to
ensure the listing does not grow too large. On each insert, a "pruning"
can occur (with a configurable probability) which will remove excess items
from the end of the listing.
Use CachedQueryMutator to make changes to the cached query's item list.
"""
def __init__(self, model, key, sort, filter_fn):
self.model = model
self.key = key
@@ -95,10 +137,12 @@ class CachedQuery(_CachedQueryBase):
super(CachedQuery, self).__init__(sort)
def _make_item_tuple(self, item):
"""Given a single 'item' from the result of a query build the tuple
that will be stored in the query cache. It is effectively the
fullname of the item after passing through the filter plus the
columns of the unfiltered item to sort by."""
"""Return an item tuple from the result of a query.
The item tuple is used to sort the items in a query without having to
look them up.
"""
filtered_item = self.filter(item)
lst = [filtered_item._fullname]
for col in self.sort_cols:
@@ -185,6 +229,13 @@ class CachedQuery(_CachedQueryBase):
class MergedCachedQuery(_CachedQueryBase):
"""A cached query built by merging multiple sub-queries.
Merged queries can be read, but cannot be modified as it is not easy to
determine from a given item which sub-query should get modified.
"""
def __init__(self, queries):
self.queries = queries
@@ -201,6 +252,13 @@ class MergedCachedQuery(_CachedQueryBase):
class CachedQueryMutator(object):
"""Utility to manipulate cached queries with batching.
This implements the context manager protocol so it can be used with the
with statement for clean batches.
"""
def __init__(self):
self.mutator = Mutator(CONNECTION_POOL)
self.to_prune = set()
@@ -212,6 +270,15 @@ class CachedQueryMutator(object):
self.send()
def insert(self, query, things):
"""Insert items into the given cached query.
If the items are already in the query, they will have their sorts
updated.
This will sometimes trigger pruning with a configurable probability
(see g.querycache_prune_chance).
"""
if not things:
return
@@ -223,6 +290,7 @@ class CachedQueryMutator(object):
self.to_prune.add(query)
def delete(self, query, things):
"""Remove things from the query."""
if not things:
return
@@ -231,6 +299,12 @@ class CachedQueryMutator(object):
query._delete(self.mutator, things)
def send(self):
"""Commit the mutations batched up so far and potentially do pruning.
This is automatically called by __exit__ when used as a context
manager.
"""
self.mutator.send()
if self.to_prune:
@@ -239,16 +313,41 @@ class CachedQueryMutator(object):
def filter_identity(x):
    """Pass the given thing through unchanged.

    Simple Thing-based cached queries should use this as their filter_fn so
    that the enumerated things themselves are what gets rendered.
    """
    return x
def filter_thing2(x):
    """Return the thing2 of the given relationship.

    Use this as the filter_fn of a cached Relation query so that the related
    things (rather than the relation objects themselves) are returned for
    rendering.
    """
    # The original body carried two docstrings -- the pre-rewrite one was
    # left behind as a dead string expression; it has been removed.
    return x._thing2
def cached_query(model, filter_fn=filter_identity, sort=None):
"""Decorate a function describing a cached query.
The decorated function is expected to follow the naming convention common
in queries.py -- "get_something". The cached query's key will be generated
from the combination of the function name and its arguments separated by
periods.
There are currently two types of cached queries: SQL-backed and
pure-Cassandra. In the prior case, sort should be None and the decorated
function should return a Things/Relations query that would be used in the
absence of the query cache. In the pure-Cassandra case, the sort used for
ranking the returned items should be provided in the sort parameter, and
the return value of the decorated function is ignored.
"""
def cached_query_decorator(fn):
def cached_query_wrapper(*args):
# build the row key from the function name and arguments
@@ -278,6 +377,12 @@ def cached_query(model, filter_fn=filter_identity, sort=None):
def merged_cached_query(fn):
"""Decorate a function describing a cached query made up of others.
The decorated function should return a sequence of cached queries whose
results will be merged together into a final listing.
"""
def merge_wrapper(*args):
queries = fn(*args)
return MergedCachedQuery(queries)
@@ -285,18 +390,35 @@ def merged_cached_query(fn):
class _BaseQueryCache(object):
"""The model through which cached queries to interact with Cassandra.
Each cached query is stored as a distinct row in Cassandra. The row key is
given by higher level code (see the cached_query decorator above). Each
item in the materialized result of the query is stored as a separate
column. Each column name is the fullname of the item, while each value is
the stuff CachedQuery needs to be able to sort the items (see
CachedQuery._make_item_tuple).
"""
__metaclass__ = tdb_cassandra.ThingMeta
_connection_pool = 'main'
_extra_schema_creation_args = dict(key_validation_class=ASCII_TYPE,
default_validation_class=UTF8_TYPE)
_compare_with = ASCII_TYPE
_use_db = False
_type_prefix = None
_cf_name = None
@classmethod
def get(cls, keys):
"""Retrieve the items in a set of cached queries.
For each cached query, this returns the thing tuples and the column
timestamps for them. The latter is useful for conditional removal
during pruning.
"""
rows = cls._cf.multiget(keys, include_timestamp=True,
column_count=tdb_cassandra.max_column_count)
@@ -317,6 +439,12 @@ class _BaseQueryCache(object):
@classmethod
@tdb_cassandra.will_write
def insert(cls, mutator, key, columns):
    """Insert things into the cached query's row.

    This works as an upsert; if the thing already exists, it is updated. If
    not, it is actually inserted.

    `columns` maps each thing's column name to its sort-data value; each
    value is JSON-serialized before being queued on `mutator`.
    """
    # Use distinct loop-variable names so the `key` row-key parameter is
    # not shadowed inside the comprehension.
    updates = dict((name, json.dumps(value))
                   for name, value in columns.iteritems())
    mutator.insert(cls._cf, key, updates)
@@ -324,19 +452,29 @@ class _BaseQueryCache(object):
@classmethod
@tdb_cassandra.will_write
def remove(cls, mutator, key, columns):
    """Unconditionally remove things from the cached query.

    `columns` holds the column names to delete from the query's row; the
    removal is queued on `mutator` for batched execution.
    """
    mutator.remove(cls._cf, key, columns=columns)
@classmethod
@tdb_cassandra.will_write
def remove_if_unchanged(cls, mutator, key, columns, timestamps):
    """Remove things from the cached query unless they have changed.

    Each column is removed using the timestamp it was last seen with
    (looked up in `timestamps`), so a column rewritten more recently than
    that survives. This is useful for avoiding race conditions while
    pruning.
    """
    for name in columns:
        seen_at = timestamps.get(name)
        mutator.remove(cls._cf, key, columns=[name], timestamp=seen_at)
class UserQueryCache(_BaseQueryCache):
    """A query cache column family for user-keyed queries."""
    # NOTE(review): _use_db appears to opt this model into a real Cassandra
    # column family via tdb_cassandra.ThingMeta -- confirm against that module.
    _use_db = True
class SubredditQueryCache(_BaseQueryCache):
    """A query cache column family for subreddit-keyed queries."""
    # NOTE(review): _use_db presumably enables creation/use of the backing
    # Cassandra column family (see tdb_cassandra.ThingMeta) -- verify.
    _use_db = True