Feature flags: add support for experiments

We've long wanted the ability to run A/B tests; they make it much easier to test out potential behavior, and reduce the self-selection bias you get through opt-in betas. This commit adds A/B-type experiment support to the feature flagging system. Currently, only server-side checks and tests on logged-in users are directly supported. This is purely to scope down the feature enough to make it easier to get a v1 out.
2026-04-27 03:00:12 -04:00 · 2015-10-30 11:26:38 -07:00
parent 5b4ccc9ed6
commit fcbff995b1
7 changed files with 463 additions and 20 deletions
--- a/r2/r2/config/feature/README.md
+++ b/r2/r2/config/feature/README.md
@@ -1,10 +1,9 @@
 # Feature

 `r2.config.feature` is reddit's feature flagging API. It lets us quickly
-switch on and off features for specific segments of users and requests. It may
-also be used in the future for ramping up big changes or A/B testing.
+switch on and off features for specific segments of users and requests.

-It's heavily simplified version of Etsy's feature framework, at
+It's inspired by Etsy's feature framework, at
 https://github.com/etsy/feature - if you're looking to add to this, you may
 want to check there first to see if there's learning to be had. There almost
 certainly is.
@@ -92,14 +91,86 @@ feature_some_flag = {"percent_loggedout": 25}

 # For both admin and a group of users
 feature_some_flag = {"admin": true, "users": ["user1", "user2"]}
-
-# Not yet available: rampups, variants for A/B, etc.
 ```

 Since we're currently overloading live_config, each feature flag should be
 prepended with `feature_` in the config. We may choose to make a live-updating
 features block in the future. 

+You can also use feature flags to define A/B-type experiments.  Logically,
+experiments are separated into two parts.  First, there is an *eligibility
+check* to determine if the user is allowed to be a part of the experiment;
+currently, this is statically defined as the set of all logged-in users.
+Secondly, eligible users are either *bucketed* into a variant or *excluded*
+(because the summed percentage of all variants is less than 100).  `is_enabled`
+will return False for users who are non-eligible, fall into a control group, or
+are excluded; for anyone for whom `is_enabled` returns True, you should call
+`variant` to find the specific variant they fall into.
+
+In code, this looks something like this:
+
+```python
+from r2.config import feature
+
+if feature.is_enabled('some_flag'):
+    variant = feature.variant('some_flag')
+    if variant == 'test_something':
+        do_new_thing()
+    elif variant == 'test_something_else':
+        do_other_new_thing()
+    else:
+        raise NotImplementedError('unknown variant %s for some_flag' % variant)
+else:
+    do_old_thing()
+```
+
+with a live_config option defining the experiment parameters:
+
+```ini
+feature_some_flag = {"experiment": {"experiment_id": 12345, "variants": {"test_something": 5.5, "test_something_else": 10}}}
+
+# Or with custom control group sizes:
+feature_some_flag = {"experiment": {"experiment_id": 12345, "variants": {"test_something": 5.5, "test_something_else": 10, "control_1": 20, "control_2": 20}}}
+```
+
+If only one non-control variant is defined (an A/A/B test), the code can be
+simplified a little bit:
+
+```python
+from r2.config import feature
+
+if feature.is_enabled('some_flag'):
+    do_new_thing()
+else:
+    do_old_thing()
+```
+
+The experiment dict has a few fields:
+
+* **experiment_id** -- an integer.  While the feature name needs to be unique
+  across all currently-defined feature flags, the experiment id should be
+  unique across all time.  This allows the data team to uniquely identify
+  experiments while looking at historical data.
+* **variants** -- a dictionary mapping variant names to percentages.  The
+  percentage indicates roughly what share of eligible users will be chosen to
+  be a part of that variant.  Percentages should not exceed 100/n, where n is
+  the number of variants (including the control groups).  The number of
+  variants should not change over the course of the experiment, but the
+  percentage allocated to each can.  Percentages can be specified to a tenth
+  of a percent.  If they are not explicitly defined, two control groups
+  ("control_1" and "control_2") at 10% each will be automatically added to the
+  variants.
+* **enabled** -- a boolean, defaulting to true.  Set to false to temporarily
+  disable an experiment while still keeping its definition around.
+
+Since it's useful to be able to force bucketing for testing purposes, you can
+specify a variant with a secondary syntax for a few flag conditions:
+
+```ini
+# ?feature=some_flag_something will force the "test_something" variant and
+# ?feature=some_flag_something_else will force "test_something_else"
+feature_some_flag = {"url": {"some_flag_something": "test_something", "some_flag_something_else": "test_something_else"}}
+```
+

 ## When should I use this?

--- a/r2/r2/config/feature/init.py
+++ b/r2/r2/config/feature/init.py
@@ -20,4 +20,4 @@
 # Inc. All Rights Reserved.
 ###############################################################################

-from r2.config.feature.feature import is_enabled
+from r2.config.feature.feature import is_enabled, variant
--- a/r2/r2/config/feature/feature.py
+++ b/r2/r2/config/feature/feature.py
@@ -57,6 +57,22 @@ def is_enabled(name, user=None, subreddit=None):
        oauth_client=oauth_client,
    )

+def variant(name, user=None):
+    """Look up the experiment variant a user has been bucketed into.
+
+    The lookup is delegated to the feature state registered under `name`.
+
+    :param name string - an experiment (feature) name
+    :param user - (optional) an Account.  Defaults to the currently signed in
+                  user.
+    :return string - the variant name, or None when the feature state reports
+                     no variant for this user
+    """
+    # Fall back to the current user when the caller didn't supply one.
+    account = user or _world.current_user()
+    return _get_featurestate(name).variant(account)
+

@feature_hooks.on('worker.live_config.update')
 def clear_featurestate_cache():
--- a/r2/r2/config/feature/state.py
+++ b/r2/r2/config/feature/state.py
@@ -23,6 +23,7 @@
 import json
 import hashlib

+from pylons import tmpl_context as c
 from pylons import app_globals as g


@@ -41,6 +42,14 @@ class FeatureState(object):
    DISABLED_CFG = {"enabled": GLOBALLY_OFF}
    ENABLED_CFG = {"enabled": GLOBALLY_ON}

+    # The number of buckets to use for any bucketing operations.  Should always
+    # be evenly divisible by 100.  Each factor of 10 over 100 gives us an
+    # additional digit of precision.
+    NUM_BUCKETS = 1000
+
+    # The variant definition for control groups that are added by default.
+    DEFAULT_CONTROL_GROUPS = {'control_1': 10, 'control_2': 10}
+
    def __init__(self, name, world):
        self.name = name
        self.world = world
@@ -76,6 +85,103 @@ class FeatureState(object):

        return config

+    def _calculate_bucket(self, seed):
+        """Deterministically map a seed onto one of self.NUM_BUCKETS buckets.
+
+        :param seed -- a string used for shifting the deterministic bucketing
+                       algorithm.  In most cases, this will be an Account's
+                       _fullname.
+        :return int -- a bucket, 0 <= bucket < self.NUM_BUCKETS
+        """
+        # Salt the hash input with the feature name so the same users don't
+        # get selected for ramp-ups for every feature.
+        digest = hashlib.sha1(self.name + seed).hexdigest()
+        return long(digest, 16) % self.NUM_BUCKETS
+
+    @classmethod
+    def _choose_variant(cls, bucket, variants):
+        """Deterministically choose a percentage-based variant.
+
+        The algorithm satisfies two conditions:
+
+        1. It's deterministic (that is, every call with the same bucket and
+           variants will result in the same answer).
+        2. An increase in any of the variant percentages will keep the same
+           buckets in the same variants as at the smaller percentage (that is,
+           all buckets previously put in variant A will still be in variant A,
+           all buckets previously put in variant B will still be in variant B,
+           etc. and the increased percentages will be made up of buckets
+           previously not assigned to any variant).
+
+        These attributes make it suitable for use in A/B experiments that may
+        see an increase in their variant percentages post-enabling.
+
+        :param bucket -- an integer bucket representation
+        :param variants -- a dictionary of
+                           <string:variant name>:<float:percentage> pairs.
+                           These variants are merged over
+                           DEFAULT_CONTROL_GROUPS to create the effective
+                           variant set.  If any percentage exceeds 100/n,
+                           where n is the number of effective variants, it is
+                           effectively capped at 100/n (each variant can only
+                           ever claim every n-th bucket).
+        :return string -- the variant name, or None if bucket doesn't fall into
+                          any of the variants
+        """
+        # We want to always include two control groups, but allow overriding of
+        # their percentages.
+        all_variants = dict(cls.DEFAULT_CONTROL_GROUPS)
+        all_variants.update(variants)
+
+        # Say we have an experiment with two new things we're trying out for 2%
+        # of users (A and B), a control group with 5% (C), and a pool of
+        # excluded users (x).  The buckets will be assigned like so:
+        #
+        #     A B C A B C x x C x x C x x C x x x x x x x x x...
+        #
+        # This scheme allows us to later increase the size of A and B to 7%
+        # while keeping the experience consistent for users in any group other
+        # than excluded users:
+        #
+        #     A B C A B C A B C A B C A B C A B x A B x x x x...
+        #
+        # Rather than building this entire structure out in memory, we can use
+        # a little bit of math to figure out just the one bucket's value.
+        num_variants = len(all_variants)
+        # Sort the names so the bucket -> candidate mapping doesn't depend on
+        # dict iteration order.
+        variant_names = sorted(all_variants.keys())
+        # If the variants took up the entire set of buckets, which bucket would
+        # we be in?
+        candidate_variant = variant_names[bucket % num_variants]
+        # Log a warning if this variant is capped, to help us prevent user (us)
+        # error.  Checking only the candidate variant isn't exhaustive, but
+        # it's easy and quick, and anything with that high a percentage should
+        # be selected quite often.
+        if (all_variants[candidate_variant] / 100.0) > 1.0/num_variants:
+            g.log.warning('Variant %s exceeds allowable percentage; truncating.',
+                          candidate_variant)
+        # Variant percentages are expressed as numeric percentages rather than
+        # a fraction of 1 (that is, 1.5 means 1.5%, not 150%); thus, at 100
+        # buckets, buckets and percents map 1:1 with each other.  Since we may
+        # have more than 100 buckets (causing each bucket to represent less
+        # than 1% each), we need to scale up how far "right" we move for each
+        # variant percent.
+        bucket_multiplier = cls.NUM_BUCKETS / 100
+        # Now check to see if we're far enough left to be included in the
+        # variant percentage.  The variant only owns every num_variants-th
+        # bucket, so its cutoff is stretched by num_variants.
+        if bucket < (all_variants[candidate_variant] * num_variants *
+                     bucket_multiplier):
+            return candidate_variant
+        else:
+            return None
+
+    @classmethod
+    def _is_variant_enabled(cls, variant):
+        """Determine if a variant is "enabled", as returned by is_enabled.
+
+        :param variant -- a variant name, or None if no variant was chosen
+        :return bool -- True only for a real (non-control) variant
+        """
+        # No variant at all means the bucket didn't fall into any variant
+        # (the user is excluded from the experiment), so the feature must
+        # stay off rather than be treated as an unnamed test variant.
+        if variant is None:
+            return False
+        # For users in control groups, the feature is considered "not
+        # enabled" because they should get the same behavior as ineligible
+        # users.
+        return variant not in cls.DEFAULT_CONTROL_GROUPS
+
    def is_enabled(self, user=None, subreddit=None, subdomain=None,
                   oauth_client=None):
        cfg = self.config
@@ -88,8 +194,13 @@ class FeatureState(object):
            return False

        url_flag = cfg.get('url')
-        if url_flag and url_flag in world.url_features():
-            return True
+        if url_flag:
+            if isinstance(url_flag, dict):
+                for feature in world.url_features():
+                    if feature in url_flag:
+                        return self._is_variant_enabled(url_flag[feature])
+            elif url_flag in world.url_features():
+                return True

        if cfg.get('admin') and world.is_admin(user):
            return True
@@ -103,7 +214,7 @@ class FeatureState(object):
        if cfg.get('gold') and world.has_gold(user):
            return True

-        loggedin = world.is_user_loggedin()
+        loggedin = world.is_user_loggedin(user)
        if cfg.get('loggedin') and loggedin:
            return True

@@ -127,13 +238,10 @@ class FeatureState(object):
            return True

        percent_loggedin = cfg.get('percent_loggedin', 0)
-        if percent_loggedin and user:
-            # Mix the feature name in with the user id so the same users
-            # don't get selected for ramp-ups for every feature
-            hashed = hashlib.sha1(self.name + user._fullname)
-            int_digest = long(hashed.hexdigest(), 16)
-
-            if int_digest % 100 < percent_loggedin:
+        if percent_loggedin and loggedin:
+            bucket = self._calculate_bucket(user._fullname)
+            scaled_percent = bucket / (self.NUM_BUCKETS / 100)
+            if scaled_percent < percent_loggedin:
                return True

        percent_loggedout = cfg.get('percent_loggedout', 0)
@@ -150,5 +258,50 @@ class FeatureState(object):
                except ValueError:
                    pass

+        experiment = cfg.get('experiment')
+        # Currently, all logged-in users are eligible for all experiments,
+        # as long as they're enabled.
+        if experiment and experiment.get('enabled', True) and loggedin:
+            bucket = self._calculate_bucket(user._fullname)
+            variant = self._choose_variant(bucket,
+                                           experiment.get('variants', {}))
+
+            # We only want to send this event once per request, because that's
+            # an easy way to get rid of extraneous events.
+            if not c.have_sent_bucketing_event:
+                c.have_sent_bucketing_event = {}
+            if (g.running_as_script or
+                    not c.have_sent_bucketing_event.get((self.name, user._id))):
+                g.events.bucketing_event(
+                        experiment_id=experiment.get('experiment_id'),
+                        experiment_name=self.name,
+                        variant=variant, user=user)
+                c.have_sent_bucketing_event[(self.name, user._id)] = True
+
+            return self._is_variant_enabled(variant)
+
        # Unknown value, default to off.
        return False
+
+    def variant(self, user):
+        """Return the name of the variant `user` falls into, if any.
+
+        A variant forced through a dict-style 'url' flag takes precedence over
+        experiment bucketing.
+
+        :param user -- an Account, or a false value for no user
+        :return string -- the variant name, or None if there is no user, no
+                          experiment is configured, or the user's bucket
+                          doesn't fall into any variant
+        """
+        forced_variants = self.config.get('url')
+        # Only the dict form of the 'url' flag maps a URL feature onto a
+        # variant; any other form can't specify one.
+        if isinstance(forced_variants, dict) and forced_variants:
+            for url_feature in self.world.url_features():
+                if url_feature in forced_variants:
+                    return forced_variants[url_feature]
+
+        if not user:
+            return None
+
+        experiment = self.config.get('experiment')
+        if not experiment:
+            return None
+
+        return self._choose_variant(
+            self._calculate_bucket(user._fullname),
+            experiment.get('variants', {}))
--- a/r2/r2/config/feature/world.py
+++ b/r2/r2/config/feature/world.py
@@ -99,10 +99,10 @@ class World(object):

        return user.gold

-    def is_user_loggedin(self):
-        if self.current_user():
-            return True
-        return False
+    def is_user_loggedin(self, user):
+        """Report whether there is a logged-in user.
+
+        True when either the given `user` or the world's current user is
+        truthy; False otherwise.
+        """
+        return bool(user or self.current_user())

    def url_features(self):
        return set(request.GET.getall('feature'))
--- a/r2/r2/lib/eventcollector.py
+++ b/r2/r2/lib/eventcollector.py
@@ -630,6 +630,25 @@ class EventQueue(object):

        self.save_event(event)

+    def bucketing_event(self, experiment_id, experiment_name, variant, user):
+        """Record that a user was bucketed into an experiment variant.
+
+        experiment_id: an integer representing the experiment
+        experiment_name: a human-readable name representing the experiment
+        variant: a string representing the variant name
+        user: the Account that has been put into the variant
+        """
+        bucketing = Event(topic='bucketing_events', event_type='bucket')
+
+        # What was decided...
+        bucketing.add('experiment_id', experiment_id)
+        bucketing.add('experiment_name', experiment_name)
+        bucketing.add('variant', variant)
+        # ...and who it was decided for.
+        bucketing.add('user_id', user._id)
+        bucketing.add('user_name', user.name)
+
+        self.save_event(bucketing)
+

 class Event(object):
    def __init__(self, topic, event_type,
--- a/r2/r2/tests/unit/config/feature_test.py
+++ b/r2/r2/tests/unit/config/feature_test.py
@@ -52,6 +52,11 @@ class MockWorld(World):
                return config
        return MockState('test_state', self)

+class TestFeature(unittest.TestCase):
+    # NOTE(review): a later hunk header in this diff references
+    # `class TestFeature(TestFeatureBase)`; confirm whether this definition is
+    # intentional or gets shadowed by that one.
+    # Presumably a placeholder for the World used by the tests; unset at class
+    # level -- TODO confirm.
+    _world = None
+    # Append user-supplied error messages to the default output, rather than
+    # overwriting it.
+    longMessage = True

 class TestFeatureBase(RedditTestCase):
    # Append user-supplied error messages to the default output, rather than
@@ -78,6 +83,127 @@ class TestFeature(TestFeatureBase):
        diff = abs((float(stats[True]) / total) - (percent / 100.0))
        self.assertTrue(diff < 0.1)

+    def test_calculate_bucket(self):
+        """Check _calculate_bucket is deterministic and evenly distributed."""
+        feature_state = self._make_state(config={})
+
+        # Use enough users that the per-bucket counts are large enough to
+        # compare with a reasonable amount of precision.
+        NUM_USERS = FeatureState.NUM_BUCKETS * 2000
+        fullnames = ["t2_%s" % str(i) for i in xrange(NUM_USERS)]
+
+        counter = collections.Counter()
+        for fullname in fullnames:
+            bucket = feature_state._calculate_bucket(fullname)
+            counter[bucket] += 1
+            # The same seed must always land in the same bucket.
+            self.assertEqual(bucket, feature_state._calculate_bucket(fullname))
+
+        # An even distribution gives every bucket the same share of users.
+        expected = NUM_USERS / FeatureState.NUM_BUCKETS
+        for bucket in xrange(FeatureState.NUM_BUCKETS):
+            # Compare as a ratio rather than a raw difference so the check
+            # scales as NUM_USERS changes.
+            ratio = float(counter[bucket]) / expected
+            self.assertAlmostEqual(ratio, 1.0, delta=.10,
+                                   msg='bucket: %s' % bucket)
+
+    def test_choose_variant(self):
+        """Check _choose_variant's determinism, stability and percentages."""
+        no_variants = {}
+        three_variants = {
+            'remove_vote_counters': 5,
+            'control_1': 10,
+            'control_2': 5,
+        }
+        three_variants_more = {
+            'remove_vote_counters': 15.6,
+            'control_1': 10,
+            'control_2': 20,
+        }
+
+        def choose_checked(bucket, variants):
+            # Choose a variant and ensure variant-choosing is deterministic.
+            chosen = FeatureState._choose_variant(bucket, variants)
+            self.assertEqual(
+                    chosen,
+                    FeatureState._choose_variant(bucket, variants))
+            return chosen
+
+        def as_percent(count):
+            # Variant percentages are expressed as a part of 100, so scale the
+            # bucket count accordingly.
+            return float(count) / (FeatureState.NUM_BUCKETS / 100)
+
+        counters = collections.defaultdict(collections.Counter)
+        for bucket in xrange(FeatureState.NUM_BUCKETS):
+            chosen = choose_checked(bucket, no_variants)
+            if chosen:
+                counters['no_variants'][chosen] += 1
+
+            smaller = choose_checked(bucket, three_variants)
+            if smaller:
+                counters['three_variants'][smaller] += 1
+
+            larger = choose_checked(bucket, three_variants_more)
+            if larger:
+                counters['three_variants_more'][larger] += 1
+            # A bucket that had a variant at the smaller percentages must keep
+            # that same variant once the percentages grow.
+            if smaller:
+                self.assertEqual(larger, smaller)
+
+        # Only controls chosen in the no-variant case.
+        for variant, percentage in FeatureState.DEFAULT_CONTROL_GROUPS.items():
+            self.assertEqual(
+                    as_percent(counters['no_variants'][variant]), percentage)
+        for variant, percentage in three_variants.items():
+            self.assertEqual(
+                    as_percent(counters['three_variants'][variant]),
+                    percentage)
+        for variant, percentage in three_variants_more.items():
+            self.assertEqual(
+                    as_percent(counters['three_variants_more'][variant]),
+                    percentage)
+
+        # Test boundary conditions around the maximum percentage allowed for
+        # variants.
+        fifty_fifty = {
+            'control_1': 50,
+            'control_2': 50,
+        }
+        almost_fifty_fifty = {
+            'control_1': 49,
+            'control_2': 51,
+        }
+        for bucket in xrange(FeatureState.NUM_BUCKETS):
+            chosen = FeatureState._choose_variant(bucket, fifty_fifty)
+            counters['fifty_fifty'][chosen] += 1
+            chosen = FeatureState._choose_variant(bucket, almost_fifty_fifty)
+            counters['almost_fifty_fifty'][chosen] += 1
+
+        # The 51% request in almost_fifty_fifty is expected to be capped at
+        # 50%, the most a single variant out of two can hold.
+        boundary_expectations = (
+            ('fifty_fifty', 'control_1', 50),
+            ('fifty_fifty', 'control_2', 50),
+            ('almost_fifty_fifty', 'control_1', 49),
+            ('almost_fifty_fifty', 'control_2', 50),
+        )
+        for config_name, variant, expected in boundary_expectations:
+            self.assertEqual(
+                    as_percent(counters[config_name][variant]), expected)
+
    def test_enabled(self):
        cfg = {'enabled': 'on'}
        feature_state = self.world._make_state(cfg)
@@ -200,6 +326,48 @@ class TestFeature(TestFeatureBase):
        self._assert_fuzzy_percent_true(simulate_percent_loggedout(50), 50)
        self._assert_fuzzy_percent_true(simulate_percent_loggedout(99), 99)

+    @mock.patch('r2.config.feature.state.g')
+    def test_experiment(self, g):
+        """Check experiment bucketing for logged-out and logged-in users."""
+        num_users = 2000
+        users = [MockAccount(name=str(i), _fullname="t2_%s" % str(i))
+                 for i in xrange(num_users)]
+
+        def make_state(cfg, loggedin):
+            # Build a feature state whose world reports a fixed login status.
+            mock_world = self.world()
+            mock_world.is_user_loggedin = mock.Mock(return_value=loggedin)
+            return self._make_state(cfg, mock_world)
+
+        def run_simulation(experiment):
+            cfg = {'experiment': experiment}
+
+            # Logged-out users are never part of the experiment.
+            self.assertFalse(make_state(cfg, False).is_enabled(None))
+
+            feature_state = make_state(cfg, True)
+            counter = collections.Counter()
+            for user in users:
+                if feature_state.is_enabled(user):
+                    counter[feature_state.variant(user)] += 1
+
+            for variant, percent in experiment['variants'].items():
+                # Our actual percentage should be within our expected percent
+                # (expressed as a part of 100 rather than a fraction of 1)
+                # +- 1%.
+                measured_percent = (float(counter[variant]) / num_users) * 100
+                self.assertAlmostEqual(measured_percent, percent, delta=1)
+
+        experiment = {'variants': {'larger': 5, 'smaller': 10}}
+        run_simulation(experiment)
+        experiment['enabled'] = True
+        run_simulation(experiment)
+
+        # A disabled experiment is off for everyone, logged in or not.
+        experiment['enabled'] = False
+        feature_state = make_state({'experiment': experiment}, True)
+        for user in users:
+            self.assertFalse(feature_state.is_enabled(user))

    def test_url_enabled(self):

@@ -215,6 +383,12 @@ class TestFeature(TestFeatureBase):
        self.assertTrue(feature_state.is_enabled())
        self.assertTrue(feature_state.is_enabled(user=gary))

+        cfg = {'url': {'test_state_a': 'a', 'test_state_b': 'b'}}
+        mock_world.url_features = mock.Mock(return_value={'x', 'test_state_b'})
+        feature_state = self._make_state(cfg, mock_world)
+        self.assertTrue(feature_state.is_enabled())
+        self.assertEqual(feature_state.variant(user=gary), 'b')
+
    def test_url_disabled(self):

        cfg = {'url': 'test_state'}
@@ -229,6 +403,16 @@ class TestFeature(TestFeatureBase):
        self.assertFalse(feature_state.is_enabled())
        self.assertFalse(feature_state.is_enabled(user=gary))

+        cfg = {'url': {'test_state_a': 'a', 'test_state_b': 'b'}}
+        mock_world.url_features = mock.Mock(return_value={'x'})
+        feature_state = self._make_state(cfg, mock_world)
+        self.assertFalse(feature_state.is_enabled())
+
+        cfg = {'url': {'test_state_c1': 'control_1', 'test_state_c2': 'control_2'}}
+        mock_world.url_features = mock.Mock(return_value={'x', 'test_state_c2'})
+        feature_state = self._make_state(cfg, mock_world)
+        self.assertFalse(feature_state.is_enabled())
+
    def test_user_in(self):
        cfg = {'users': ['Gary']}
        feature_state = self.world._make_state(cfg)