tdb_cassandra: Use Cassandra-native serialization for Date columns.

This has a couple of advantages:
 * interoperability with Cassandra tools (cassandra-cli / cqlsh)
 * smaller by 5-6 bytes on every column

From here on out, date columns will be written in the Cassandra
standard 8-byte integer (number of milliseconds since epoch) format.
Old-style stringified epoch seconds will be read properly.

This relies on Pycassa's new UTC-based behaviour in 1.7.
This commit is contained in:
Neil Williams
2012-05-24 19:44:39 -07:00
parent 4e04740d62
commit ba54ed713a
2 changed files with 28 additions and 8 deletions

View File

@@ -20,6 +20,7 @@
# CondeNet, Inc. All Rights Reserved.
################################################################################
import inspect
import pytz
from datetime import datetime
from socket import gethostbyaddr
@@ -28,8 +29,8 @@ from pylons import g
from pycassa import ColumnFamily
from pycassa.cassandra.ttypes import ConsistencyLevel, NotFoundException
from pycassa.system_manager import SystemManager, UTF8_TYPE, COUNTER_COLUMN_TYPE, TIME_UUID_TYPE
from pycassa.types import DateType
from r2.lib.utils import tup, Storage
from r2.lib.db.sorts import epoch_seconds
from r2.lib import cache
from uuid import uuid1, UUID
from itertools import chain
@@ -64,6 +65,10 @@ CL = Storage(ANY = ConsistencyLevel.ANY,
# karma_ rows, or we should not do that)
max_column_count = 50000
# the pycassa date serializer, for use when we can't set the right metadata
# to get pycassa to serialize dates for us
date_serializer = DateType()
class CassandraException(Exception):
"""Base class for Exceptions in tdb_cassandra"""
pass
@@ -433,6 +438,11 @@ class ThingBase(object):
return default
return cls._read_consistency_level
@classmethod
def _get_column_validator(cls, colname):
return cls._cf.column_validators.get(colname,
cls._cf.default_validation_class)
@classmethod
def _deserialize_column(cls, attr, val):
if attr in cls._int_props or (cls._value_type and cls._value_type == 'int'):
@@ -448,8 +458,7 @@ class ThingBase(object):
elif attr in cls._pickle_props or (cls._value_type and cls._value_type == 'pickle'):
return pickle.loads(val)
elif attr in cls._date_props or attr == cls._timestamp_prop or (cls._value_type and cls._value_type == 'date'):
as_float = float(val)
return datetime.utcfromtimestamp(as_float).replace(tzinfo = tz)
return cls._deserialize_date(val)
elif attr in cls._bytes_props or (cls._value_type and cls._value_type == 'bytes'):
return val
@@ -470,7 +479,11 @@ class ThingBase(object):
elif (attr in cls._date_props or attr == cls._timestamp_prop or
(cls._value_type and cls._value_type == 'date')):
# the _timestamp_prop is handled in _commit(), not here
return cls._serialize_date(val)
if cls._get_column_validator(attr) == 'DateType':
# pycassa will take it from here
return val
else:
return cls._serialize_date(val)
elif attr in cls._bytes_props or (cls._value_type and cls._value_type == 'bytes'):
return val
@@ -478,12 +491,19 @@ class ThingBase(object):
@classmethod
def _serialize_date(cls, date):
return str(epoch_seconds(date))
return date_serializer.pack(date)
@classmethod
def _deserialize_date(cls, val):
as_float = float(val)
return datetime.utcfromtimestamp(as_float).replace(tzinfo = tz)
if isinstance(val, datetime):
date = val
elif len(val) == 8: # cassandra uses 8-byte integer format for this
date = date_serializer.unpack(val)
else: # it's probably the old-style stringified seconds since epoch
as_float = float(val)
date = datetime.utcfromtimestamp(as_float)
return date.replace(tzinfo=pytz.utc)
@classmethod
def _from_serialized_columns(cls, t_id, columns):

View File

@@ -75,7 +75,7 @@ setup(
"chardet",
"psycopg2",
"pycountry",
"pycassa",
"pycassa>=1.7.0",
"PIL",
"pycaptcha",
"amqplib",