tdb_cassandra: Optionally fetch very wide rows in small batches.

Thrift will materialize the entire result set in memory before returning it to the client, so requesting a very large number of columns in a single call is quite detrimental to the server's health. Instead of increasing max_column_count ad infinitum, we'll fetch the remaining columns of a clipped row in batches when the model opts in by setting _fetch_all_columns.
2026-04-27 03:00:12 -04:00 · 2012-08-30 16:54:28 -07:00
parent f93c685637
commit 24c3793b17
1 changed file with 19 additions and 5 deletions
--- a/r2/r2/lib/db/tdb_cassandra.py
+++ b/r2/r2/lib/db/tdb_cassandra.py
@@ -280,6 +280,13 @@ class ThingBase(object):
    # these attributes are to be removed on _commit()
    _deletes = set()

+    # thrift will materialize the entire result set for a slice range
+    # in memory, meaning that we need to limit the maximum number of columns
+    # we receive in a single get to avoid hurting the server. if this
+    # value is true, we will make sure to do extra gets to retrieve all of
+    # the columns in a row when there are more than the per-call maximum.
+    _fetch_all_columns = False
+
    def __init__(self, _id = None, _committed = False, _partial = None, **kw):
        # things that have changed
        self._dirties = kw.copy()
@@ -350,13 +357,20 @@ class ThingBase(object):
                still_need.add(k)

        def lookup(l_ids):
-            # TODO: if we get back max_column_count columns for a
-            # given row, check a flag on the class as to whether to
-            # refetch for more of them. This could be important with
-            # large Views, for instance
-
            if properties is None:
                rows = cls._cf.multiget(l_ids, column_count=max_column_count)
+
+                # if we got max_column_count columns back for a row, it was
+                # probably clipped. in this case, we should fetch the remaining
+                # columns for that row and add them to the result.
+                if cls._fetch_all_columns:
+                    for key, row in rows.iteritems():
+                        if len(row) == max_column_count:
+                            last_column_seen = next(reversed(row))
+                            cols = cls._cf.xget(key,
+                                                column_start=last_column_seen,
+                                                buffer_size=max_column_count)
+                            row.update(cols)
            else:
                rows = cls._cf.multiget(l_ids, columns = willask_properties)