From efa3bb40091ee596d350de72d38a308c1de2005a Mon Sep 17 00:00:00 2001
From: Neil Williams
Date: Tue, 4 Dec 2012 11:33:01 -0800
Subject: [PATCH] Add migration script for regenerating CommentParticipationByAccount CF.

---
 scripts/migrate/comment-participation.pig | 63 +++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 scripts/migrate/comment-participation.pig

diff --git a/scripts/migrate/comment-participation.pig b/scripts/migrate/comment-participation.pig
new file mode 100644
index 000000000..b7dce1161
--- /dev/null
+++ b/scripts/migrate/comment-participation.pig
@@ -0,0 +1,63 @@
+-- Backfill data for CommentParticipationByAccount CF.
+-- Parameters (override on the command line): UDF jar dir, input dump dir, output dir.
+%default SCRIPT_ROOT 'udfs/dist/lib'
+%default INPUT 'input'
+%default OUTPUT 'output'
+
+-- Jar presumably supplies the com.reddit.pig UDFs called below (MAKE_MAP, TO_36).
+REGISTER '$SCRIPT_ROOT/reddit-pig-udfs.jar';
+
+-- Comment rows from INPUT/comment.dump; only id is referenced later in this script.
+items =
+LOAD '$INPUT/comment.dump'
+  USING PigStorage()
+  AS (id:long,
+      ups:int,
+      downs:int,
+      deleted:chararray,
+      spam:chararray,
+      timestamp:double);
+
+-- Per-comment (key, value) rows from INPUT/comment-data.dump; value has no declared type.
+data =
+LOAD '$INPUT/comment-data.dump'
+  USING PigStorage()
+  AS (id:long,
+      key:chararray,
+      value);
+
+-- Pair up, per id, the bag of items rows with the bag of data rows.
+grouped_with_data =
+COGROUP items BY id, data BY id;
+-- One output row per items row; the (key, value) bag is handed to MAKE_MAP,
+-- whose result is indexed like a map below. NOTE(review): ids with no items row drop out here.
+items_with_data =
+FOREACH grouped_with_data
+  GENERATE FLATTEN(items),
+           com.reddit.pig.MAKE_MAP(data.(key, value)) AS data;
+
+-- Look up link_id and author_id in the data map and cast both to long.
+comments_unfiltered =
+FOREACH items_with_data
+  GENERATE (long)data#'link_id' as link_id,
+           (long)data#'author_id' as author_id;
+
+-- Discard rows where either id is null.
+link_x_author_full =
+FILTER comments_unfiltered
+  BY link_id IS NOT NULL AND
+     author_id IS NOT NULL;
+
+-- Collapse duplicates so each (link_id, author_id) pair appears once.
+link_x_author =
+DISTINCT link_x_author_full;
+-- Three output fields: TO_36(author_id) as rowkey, TO_36(link_id) as name, empty string.
+-- NOTE(review): TO_36 presumably base-36 encodes the id -- confirm against the UDF.
+columns =
+FOREACH link_x_author
+  GENERATE com.reddit.pig.TO_36(author_id) AS rowkey,
+           com.reddit.pig.TO_36(link_id) AS name,
+           '';
+
+-- No USING clause, so the default storage function writes the result.
+STORE columns INTO '$OUTPUT/CommentParticipationByAccount/';