query_cache: Add lots of documentation.

This commit is contained in:
Neil Williams
2012-10-09 20:12:34 -07:00
parent 2ef0a7f452
commit 8c62ace8bb

View File

@@ -19,6 +19,18 @@
# All portions of the code written by reddit are Copyright (c) 2006-2012 reddit
# Inc. All Rights Reserved.
###############################################################################
"""
This module provides a Cassandra-backed lockless query cache. Rather than
doing complicated queries on the fly to populate listings, a list of items that
would be in that listing are maintained in Cassandra for fast lookup. The
result can then be fed to IDBuilder to generate a final result.
Whenever an operation occurs that would modify the contents of the listing, the
listing should be updated somehow. In some cases, this can be done by directly
mutating the listing and in others it must be done offline in batch processing
jobs.
"""
import random
import datetime
@@ -43,6 +55,14 @@ LOG = g.log
class ThingTupleComparator(object):
"""A callable usable for comparing sort-data in a cached query.
The query cache stores minimal sort data on each thing to be able to order
the items in a cached query. This class provides the ordering for those
thing tuples.
"""
def __init__(self, sorts):
    # `sorts` describes which thing-tuple columns to compare and in what
    # direction; stored for use by the comparator when it is invoked.
    self.sorts = sorts
@@ -65,6 +85,12 @@ class _CachedQueryBase(object):
self._fetched = False
def fetch(self, force=False):
"""Fill the cached query's sorted item list from Cassandra.
If the query has already been fetched, this method is a no-op unless
force=True.
"""
if not force and self._fetched:
return
@@ -87,6 +113,22 @@ class _CachedQueryBase(object):
class CachedQuery(_CachedQueryBase):
"""A materialized view of a complex query.
Complicated queries can take way too long to sort in the databases. This
class provides a fast-access view of a given listing's items. The cache
stores each item's ID and a minimal subset of its data as required for
sorting.
Each time the listing is fetched, it is sorted. Because of this, we need to
ensure the listing does not grow too large. On each insert, a "pruning"
can occur (with a configurable probability) which will remove excess items
from the end of the listing.
Use CachedQueryMutator to make changes to the cached query's item list.
"""
def __init__(self, model, key, sort, filter_fn):
self.model = model
self.key = key
@@ -95,10 +137,12 @@ class CachedQuery(_CachedQueryBase):
super(CachedQuery, self).__init__(sort)
def _make_item_tuple(self, item):
"""Given a single 'item' from the result of a query build the tuple
that will be stored in the query cache. It is effectively the
fullname of the item after passing through the filter plus the
columns of the unfiltered item to sort by."""
"""Return an item tuple from the result of a query.
The item tuple is used to sort the items in a query without having to
look them up.
"""
filtered_item = self.filter(item)
lst = [filtered_item._fullname]
for col in self.sort_cols:
@@ -185,6 +229,13 @@ class CachedQuery(_CachedQueryBase):
class MergedCachedQuery(_CachedQueryBase):
"""A cached query built by merging multiple sub-queries.
Merged queries can be read, but cannot be modified as it is not easy to
determine from a given item which sub-query should get modified.
"""
def __init__(self, queries):
self.queries = queries
@@ -201,6 +252,13 @@ class MergedCachedQuery(_CachedQueryBase):
class CachedQueryMutator(object):
"""Utility to manipulate cached queries with batching.
This implements the context manager protocol so it can be used with the
with statement for clean batches.
"""
def __init__(self):
self.mutator = Mutator(CONNECTION_POOL)
self.to_prune = set()
@@ -212,6 +270,15 @@ class CachedQueryMutator(object):
self.send()
def insert(self, query, things):
"""Insert items into the given cached query.
If the items are already in the query, they will have their sorts
updated.
This will sometimes trigger pruning with a configurable probability
(see g.querycache_prune_chance).
"""
if not things:
return
@@ -223,6 +290,7 @@ class CachedQueryMutator(object):
self.to_prune.add(query)
def delete(self, query, things):
"""Remove things from the query."""
if not things:
return
@@ -231,6 +299,12 @@ class CachedQueryMutator(object):
query._delete(self.mutator, things)
def send(self):
"""Commit the mutations batched up so far and potentially do pruning.
This is automatically called by __exit__ when used as a context
manager.
"""
self.mutator.send()
if self.to_prune:
@@ -239,16 +313,41 @@ class CachedQueryMutator(object):
def filter_identity(x):
    """Pass the given thing through unchanged.

    Simple Thing-based cached queries should use this as their filter_fn so
    that the enumerated things themselves are what gets rendered.
    """
    return x
def filter_thing2(x):
    """Return the thing2 of the given relationship.

    Use this as the filter_fn of a cached Relation query so that the related
    things (rather than the relation objects themselves) are returned for
    rendering.
    """
    # The original body carried two docstrings -- the pre-rewrite one was
    # left behind as a dead string expression; it has been removed.
    return x._thing2
def cached_query(model, filter_fn=filter_identity, sort=None):
"""Decorate a function describing a cached query.
The decorated function is expected to follow the naming convention common
in queries.py -- "get_something". The cached query's key will be generated
from the combination of the function name and its arguments separated by
periods.
There are currently two types of cached queries: SQL-backed and
pure-Cassandra. In the prior case, sort should be None and the decorated
function should return a Things/Relations query that would be used in the
absence of the query cache. In the pure-Cassandra case, the sort used for
ranking the returned items should be provided in the sort parameter, and
the return value of the decorated function is ignored.
"""
def cached_query_decorator(fn):
def cached_query_wrapper(*args):
# build the row key from the function name and arguments
@@ -278,6 +377,12 @@ def cached_query(model, filter_fn=filter_identity, sort=None):
def merged_cached_query(fn):
"""Decorate a function describing a cached query made up of others.
The decorated function should return a sequence of cached queries whose
results will be merged together into a final listing.
"""
def merge_wrapper(*args):
queries = fn(*args)
return MergedCachedQuery(queries)
@@ -285,18 +390,35 @@ def merged_cached_query(fn):
class _BaseQueryCache(object):
"""The model through which cached queries to interact with Cassandra.
Each cached query is stored as a distinct row in Cassandra. The row key is
given by higher level code (see the cached_query decorator above). Each
item in the materialized result of the query is stored as a separate
column. Each column name is the fullname of the item, while each value is
the stuff CachedQuery needs to be able to sort the items (see
CachedQuery._make_item_tuple).
"""
__metaclass__ = tdb_cassandra.ThingMeta
_connection_pool = 'main'
_extra_schema_creation_args = dict(key_validation_class=ASCII_TYPE,
default_validation_class=UTF8_TYPE)
_compare_with = ASCII_TYPE
_use_db = False
_type_prefix = None
_cf_name = None
@classmethod
def get(cls, keys):
"""Retrieve the items in a set of cached queries.
For each cached query, this returns the thing tuples and the column
timestamps for them. The latter is useful for conditional removal
during pruning.
"""
rows = cls._cf.multiget(keys, include_timestamp=True,
column_count=tdb_cassandra.max_column_count)
@@ -317,6 +439,12 @@ class _BaseQueryCache(object):
@classmethod
@tdb_cassandra.will_write
def insert(cls, mutator, key, columns):
    """Insert things into the cached query's row.

    This works as an upsert; if the thing already exists, it is updated. If
    not, it is actually inserted.

    `columns` maps each thing's column name to its sort-data value; each
    value is JSON-serialized before being queued on `mutator`.
    """
    # Use distinct loop-variable names so the `key` row-key parameter is
    # not shadowed inside the comprehension.
    updates = dict((name, json.dumps(value))
                   for name, value in columns.iteritems())
    mutator.insert(cls._cf, key, updates)
@@ -324,19 +452,29 @@ class _BaseQueryCache(object):
@classmethod
@tdb_cassandra.will_write
def remove(cls, mutator, key, columns):
    """Unconditionally remove things from the cached query.

    `columns` holds the column names to delete from the query's row; the
    removal is queued on `mutator` for batched execution.
    """
    mutator.remove(cls._cf, key, columns=columns)
@classmethod
@tdb_cassandra.will_write
def remove_if_unchanged(cls, mutator, key, columns, timestamps):
    """Remove things from the cached query unless they have changed.

    Each column is removed using the timestamp it was last seen with
    (looked up in `timestamps`), so a column rewritten more recently than
    that survives. This is useful for avoiding race conditions while
    pruning.
    """
    for name in columns:
        seen_at = timestamps.get(name)
        mutator.remove(cls._cf, key, columns=[name], timestamp=seen_at)
class UserQueryCache(_BaseQueryCache):
    """A query cache column family for user-keyed queries."""
    # NOTE(review): _use_db appears to opt this model into a real Cassandra
    # column family via tdb_cassandra.ThingMeta -- confirm against that module.
    _use_db = True
class SubredditQueryCache(_BaseQueryCache):
    """A query cache column family for subreddit-keyed queries."""
    # NOTE(review): _use_db presumably enables creation/use of the backing
    # Cassandra column family (see tdb_cassandra.ThingMeta) -- verify.
    _use_db = True