mirror of
https://github.com/reddit-archive/reddit.git
synced 2026-04-27 03:00:12 -04:00
query_cache: Add lots of documentation.
This commit is contained in:
@@ -19,6 +19,18 @@
|
||||
# All portions of the code written by reddit are Copyright (c) 2006-2012 reddit
|
||||
# Inc. All Rights Reserved.
|
||||
###############################################################################
|
||||
"""
|
||||
This module provides a Cassandra-backed lockless query cache. Rather than
|
||||
doing complicated queries on the fly to populate listings, a list of items that
|
||||
would be in that listing are maintained in Cassandra for fast lookup. The
|
||||
result can then be fed to IDBuilder to generate a final result.
|
||||
|
||||
Whenever an operation occurs that would modify the contents of the listing, the
|
||||
listing should be updated somehow. In some cases, this can be done by directly
|
||||
mutating the listing and in others it must be done offline in batch processing
|
||||
jobs.
|
||||
|
||||
"""
|
||||
|
||||
import random
|
||||
import datetime
|
||||
@@ -43,6 +55,14 @@ LOG = g.log
|
||||
|
||||
|
||||
class ThingTupleComparator(object):
|
||||
"""A callable usable for comparing sort-data in a cached query.
|
||||
|
||||
The query cache stores minimal sort data on each thing to be able to order
|
||||
the items in a cached query. This class provides the ordering for those
|
||||
thing tuples.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, sorts):
|
||||
self.sorts = sorts
|
||||
|
||||
@@ -65,6 +85,12 @@ class _CachedQueryBase(object):
|
||||
self._fetched = False
|
||||
|
||||
def fetch(self, force=False):
|
||||
"""Fill the cached query's sorted item list from Cassandra.
|
||||
|
||||
If the query has already been fetched, this method is a no-op unless
|
||||
force=True.
|
||||
|
||||
"""
|
||||
if not force and self._fetched:
|
||||
return
|
||||
|
||||
@@ -87,6 +113,22 @@ class _CachedQueryBase(object):
|
||||
|
||||
|
||||
class CachedQuery(_CachedQueryBase):
|
||||
"""A materialized view of a complex query.
|
||||
|
||||
Complicated queries can take way too long to sort in the databases. This
|
||||
class provides a fast-access view of a given listing's items. The cache
|
||||
stores each item's ID and a minimal subset of its data as required for
|
||||
sorting.
|
||||
|
||||
Each time the listing is fetched, it is sorted. Because of this, we need to
|
||||
ensure the listing does not grow too large. On each insert, a "pruning"
|
||||
can occur (with a configurable probability) which will remove excess items
|
||||
from the end of the listing.
|
||||
|
||||
Use CachedQueryMutator to make changes to the cached query's item list.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, model, key, sort, filter_fn):
|
||||
self.model = model
|
||||
self.key = key
|
||||
@@ -95,10 +137,12 @@ class CachedQuery(_CachedQueryBase):
|
||||
super(CachedQuery, self).__init__(sort)
|
||||
|
||||
def _make_item_tuple(self, item):
|
||||
"""Given a single 'item' from the result of a query build the tuple
|
||||
that will be stored in the query cache. It is effectively the
|
||||
fullname of the item after passing through the filter plus the
|
||||
columns of the unfiltered item to sort by."""
|
||||
"""Return an item tuple from the result of a query.
|
||||
|
||||
The item tuple is used to sort the items in a query without having to
|
||||
look them up.
|
||||
|
||||
"""
|
||||
filtered_item = self.filter(item)
|
||||
lst = [filtered_item._fullname]
|
||||
for col in self.sort_cols:
|
||||
@@ -185,6 +229,13 @@ class CachedQuery(_CachedQueryBase):
|
||||
|
||||
|
||||
class MergedCachedQuery(_CachedQueryBase):
|
||||
"""A cached query built by merging multiple sub-queries.
|
||||
|
||||
Merged queries can be read, but cannot be modified as it is not easy to
|
||||
determine from a given item which sub-query should get modified.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, queries):
|
||||
self.queries = queries
|
||||
|
||||
@@ -201,6 +252,13 @@ class MergedCachedQuery(_CachedQueryBase):
|
||||
|
||||
|
||||
class CachedQueryMutator(object):
|
||||
"""Utility to manipulate cached queries with batching.
|
||||
|
||||
This implements the context manager protocol so it can be used with the
|
||||
with statement for clean batches.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.mutator = Mutator(CONNECTION_POOL)
|
||||
self.to_prune = set()
|
||||
@@ -212,6 +270,15 @@ class CachedQueryMutator(object):
|
||||
self.send()
|
||||
|
||||
def insert(self, query, things):
|
||||
"""Insert items into the given cached query.
|
||||
|
||||
If the items are already in the query, they will have their sorts
|
||||
updated.
|
||||
|
||||
This will sometimes trigger pruning with a configurable probability
|
||||
(see g.querycache_prune_chance).
|
||||
|
||||
"""
|
||||
if not things:
|
||||
return
|
||||
|
||||
@@ -223,6 +290,7 @@ class CachedQueryMutator(object):
|
||||
self.to_prune.add(query)
|
||||
|
||||
def delete(self, query, things):
|
||||
"""Remove things from the query."""
|
||||
if not things:
|
||||
return
|
||||
|
||||
@@ -231,6 +299,12 @@ class CachedQueryMutator(object):
|
||||
query._delete(self.mutator, things)
|
||||
|
||||
def send(self):
|
||||
"""Commit the mutations batched up so far and potentially do pruning.
|
||||
|
||||
This is automatically called by __exit__ when used as a context
|
||||
manager.
|
||||
|
||||
"""
|
||||
self.mutator.send()
|
||||
|
||||
if self.to_prune:
|
||||
@@ -239,16 +313,41 @@ class CachedQueryMutator(object):
|
||||
|
||||
|
||||
def filter_identity(x):
|
||||
"""Return the same thing given.
|
||||
|
||||
Use this as the filter_fn of simple Thing-based cached queries so that
|
||||
the enumerated things will be returned for rendering.
|
||||
|
||||
"""
|
||||
return x
|
||||
|
||||
|
||||
def filter_thing2(x):
|
||||
"""A filter to apply to the results of a relationship query returns
|
||||
the object of the relationship."""
|
||||
"""Return the thing2 of a given relationship.
|
||||
|
||||
Use this as the filter_fn of a cached Relation query so that the related
|
||||
things will be returned for rendering.
|
||||
|
||||
"""
|
||||
return x._thing2
|
||||
|
||||
|
||||
def cached_query(model, filter_fn=filter_identity, sort=None):
|
||||
"""Decorate a function describing a cached query.
|
||||
|
||||
The decorated function is expected to follow the naming convention common
|
||||
in queries.py -- "get_something". The cached query's key will be generated
|
||||
from the combination of the function name and its arguments separated by
|
||||
periods.
|
||||
|
||||
There are currently two types of cached queries: SQL-backed and
|
||||
pure-Cassandra. In the prior case, sort should be None and the decorated
|
||||
function should return a Things/Relations query that would be used in the
|
||||
absence of the query cache. In the pure-Cassandra case, the sort field is
|
||||
used for ranking the returned items should be provided in sort and the
|
||||
return value of the decorated function is ignored.
|
||||
|
||||
"""
|
||||
def cached_query_decorator(fn):
|
||||
def cached_query_wrapper(*args):
|
||||
# build the row key from the function name and arguments
|
||||
@@ -278,6 +377,12 @@ def cached_query(model, filter_fn=filter_identity, sort=None):
|
||||
|
||||
|
||||
def merged_cached_query(fn):
|
||||
"""Decorate a function describing a cached query made up of others.
|
||||
|
||||
The decorated function should return a sequence of cached queries whose
|
||||
results will be merged together into a final listing.
|
||||
|
||||
"""
|
||||
def merge_wrapper(*args):
|
||||
queries = fn(*args)
|
||||
return MergedCachedQuery(queries)
|
||||
@@ -285,18 +390,35 @@ def merged_cached_query(fn):
|
||||
|
||||
|
||||
class _BaseQueryCache(object):
|
||||
"""The model through which cached queries to interact with Cassandra.
|
||||
|
||||
Each cached query is stored as a distinct row in Cassandra. The row key is
|
||||
given by higher level code (see the cached_query decorator above). Each
|
||||
item in the materialized result of the query is stored as a separate
|
||||
column. Each column name is the fullname of the item, while each value is
|
||||
the stuff CachedQuery needs to be able to sort the items (see
|
||||
CachedQuery._make_item_tuple).
|
||||
|
||||
"""
|
||||
|
||||
__metaclass__ = tdb_cassandra.ThingMeta
|
||||
_connection_pool = 'main'
|
||||
_extra_schema_creation_args = dict(key_validation_class=ASCII_TYPE,
|
||||
default_validation_class=UTF8_TYPE)
|
||||
_compare_with = ASCII_TYPE
|
||||
_use_db = False
|
||||
|
||||
_type_prefix = None
|
||||
_cf_name = None
|
||||
|
||||
@classmethod
|
||||
def get(cls, keys):
|
||||
"""Retrieve the items in a set of cached queries.
|
||||
|
||||
For each cached query, this returns the thing tuples and the column
|
||||
timestamps for them. The latter is useful for conditional removal
|
||||
during pruning.
|
||||
|
||||
"""
|
||||
rows = cls._cf.multiget(keys, include_timestamp=True,
|
||||
column_count=tdb_cassandra.max_column_count)
|
||||
|
||||
@@ -317,6 +439,12 @@ class _BaseQueryCache(object):
|
||||
@classmethod
|
||||
@tdb_cassandra.will_write
|
||||
def insert(cls, mutator, key, columns):
|
||||
"""Insert things into the cached query.
|
||||
|
||||
This works as an upsert; if the thing already exists, it is updated. If
|
||||
not, it is actually inserted.
|
||||
|
||||
"""
|
||||
updates = dict((key, json.dumps(value))
|
||||
for key, value in columns.iteritems())
|
||||
mutator.insert(cls._cf, key, updates)
|
||||
@@ -324,19 +452,29 @@ class _BaseQueryCache(object):
|
||||
@classmethod
|
||||
@tdb_cassandra.will_write
|
||||
def remove(cls, mutator, key, columns):
|
||||
"""Unconditionally remove things from the cached query."""
|
||||
mutator.remove(cls._cf, key, columns=columns)
|
||||
|
||||
@classmethod
|
||||
@tdb_cassandra.will_write
|
||||
def remove_if_unchanged(cls, mutator, key, columns, timestamps):
|
||||
"""Remove things from the cached query if unchanged.
|
||||
|
||||
If the things have been changed since the specified timestamps, they
|
||||
will not be removed. This is useful for avoiding race conditions while
|
||||
pruning.
|
||||
|
||||
"""
|
||||
for col in columns:
|
||||
mutator.remove(cls._cf, key, columns=[col],
|
||||
timestamp=timestamps.get(col))
|
||||
|
||||
|
||||
class UserQueryCache(_BaseQueryCache):
|
||||
"""A query cache column family for user-keyed queries."""
|
||||
_use_db = True
|
||||
|
||||
|
||||
class SubredditQueryCache(_BaseQueryCache):
|
||||
"""A query cache column family for subreddit-keyed queries."""
|
||||
_use_db = True
|
||||
|
||||
Reference in New Issue
Block a user