mirror of
https://github.com/reddit-archive/reddit.git
synced 2026-04-27 03:00:12 -04:00
If you go to a userpage and sort by top (in either the overview or comments tabs), and restrict the time range to anything other than "all time", no comments will be shown. The data in these listings is built from functions in `lib/db/queries.py` (specifically from `get_comments()` down). This ends up trying to pull the query results from permacache (in `CachedResults.fetch_multi()`), defaulting to an empty list if no cache entry is found. Now, the cache entry is supposed to be populated periodically by a cronjob that calls `scripts/compute_time_listings`. This script (and its Python helpers in `lib/mr_top.py` and `lib/mr_tools/`) generates a dump of data from PostgreSQL, then reads through that and builds up entries to insert into the cache. As with many scripts of this sort, it expects to get in some bad data, and so performs some basic sanity checks. The problem is that the sanity checks have been throwing out all comments. With no new comments, there's nothing new to put into the cache! The root of this was a refactoring in reddit/reddit@3511b08 that combined several different scripts that were doing similar things. Unfortunately, we ended up requiring the `url` field on comments, which doesn't exist because, well, comments aren't links. Now we have two sets of fields that we expect to get, one for comments and one for links, and all is good. We also now have a one-line summary of processed/skipped entries printed out, which will help to make a problem like this more obvious in the future.
113 lines
3.6 KiB
Bash
Executable File
#!/bin/bash
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
# Inc. All Rights Reserved.
###############################################################################
set -e
|
|
|
|
# expects two environment variables
|
|
# REDDIT_ROOT = path to the root of the reddit public code; the directory with the Makefile
|
|
# REDDIT_INI = path to the ini file to use
|
|
# which should be supplied via:
|
|
source /etc/default/reddit
|
|
# additionally, some configuration can be overridden in the environment
|
|
export TMPDIR=${TMPDIR:-/tmp}
|
|
export PGUSER=${PGUSER:-reddit}
|
|
export PGHOST=${PGHOST:-localhost}
|
|
|
|
## command line args
|
|
# one of "link" or "comment"
|
|
export THING_CLS="$1"
|
|
# period of data to extract from postgres: e.g. "hour", "week", "year", "all"
|
|
export INTERVAL="$2"
|
|
# which period listings to update.
|
|
# formatted as python tuple of strings: e.g. '("hour",)' or ("week", "all",) etc
|
|
export TIMES="$3"
|
|
|
|
echo "Starting $THING_CLS processing"
|
|
|
|
THING_DUMP=$TMPDIR/$THING_CLS-$INTERVAL-thing.dump
|
|
DATA_DUMP=$TMPDIR/$THING_CLS-$INTERVAL-data.dump
|
|
function clean_up {
|
|
rm -f $THING_DUMP $DATA_DUMP
|
|
}
|
|
trap clean_up EXIT
|
|
|
|
if [ -e $THING_DUMP ]; then
|
|
echo cannot start because $THING_DUMP exists
|
|
ls -l $THING_DUMP
|
|
exit 1
|
|
fi
|
|
touch $THING_DUMP
|
|
|
|
|
|
function run_query {
|
|
psql -F"\t" -A -t -c "$1"
|
|
}
|
|
|
|
function mrsort {
|
|
sort -S200m
|
|
}
|
|
|
|
function reddit {
|
|
paster --plugin=r2 run $REDDIT_INI $REDDIT_ROOT/r2/lib/mr_top.py -c "$1"
|
|
}
|
|
|
|
# Hack to let pg fetch all things with intervals
|
|
if [ $INTERVAL = "all" ]; then
|
|
export INTERVAL="century"
|
|
fi
|
|
|
|
MINID=$(run_query "SELECT thing_id
|
|
FROM reddit_thing_$THING_CLS
|
|
WHERE
|
|
date > now() - interval '1 $INTERVAL' AND
|
|
date < now()
|
|
ORDER BY date
|
|
LIMIT 1")
|
|
if [ -z $MINID ]; then
|
|
echo \$MINID is empty. Replication is likely behind.
|
|
exit 1
|
|
fi
|
|
|
|
run_query "\\copy (SELECT thing_id, 'thing', '$THING_CLS', ups, downs, deleted, spam, extract(epoch from date)
|
|
FROM reddit_thing_$THING_CLS
|
|
WHERE
|
|
not deleted AND
|
|
thing_id >= $MINID
|
|
) to $THING_DUMP"
|
|
|
|
run_query "\\copy (SELECT thing_id, 'data', '$THING_CLS', key, value
|
|
FROM reddit_data_$THING_CLS
|
|
WHERE
|
|
key IN ('url', 'sr_id', 'author_id') AND
|
|
thing_id >= $MINID
|
|
) to $DATA_DUMP"
|
|
|
|
cat $THING_DUMP $DATA_DUMP |
|
|
mrsort |
|
|
reddit "join_things('$THING_CLS')" |
|
|
reddit "time_listings($TIMES, '$THING_CLS')" |
|
|
mrsort |
|
|
reddit "write_permacache()"
|
|
|
|
echo 'Done.'
|