Files
reddit/scripts/traffic/mr_coalesce.pig
2013-01-22 13:58:19 -05:00

107 lines
3.4 KiB
Pig
Executable File

/* EMR Version
*
* Coalesce output from multiple processed logs within interval
* hours --> day
* days --> month
*
* Needs to be passed: INPUT, OUTPUT
*/
/****************************************************
* DEFINITIONS
****************************************************/
-- Cleanup
rmf $OUTPUT
/****************************************************
* COALESCE
****************************************************/
-- sitewide
sitewide = LOAD '$INPUT/sitewide' AS (unique_id, count:long); -- load all input files (multiple hours)
sitewide_grouped = GROUP sitewide BY unique_id; -- (unique_id, {(unique_id, count), ...}, ...)
sitewide_coalesced = FOREACH sitewide_grouped
GENERATE group, SUM(sitewide.count); -- ((unique_id, SUM(sitewide.count), ...)
STORE sitewide_coalesced INTO '$OUTPUT/sitewide';
-- subreddit
subreddit_counters = LOAD '$INPUT/subreddit' AS (subreddit, unique_id, count:long);
subreddits_grouped = GROUP subreddit_counters BY (subreddit, unique_id);
subreddits_coalesced = FOREACH subreddits_grouped
GENERATE group.subreddit, group.unique_id,
SUM(subreddit_counters.count) AS count;
STORE subreddits_coalesced INTO '$OUTPUT/subreddit';
-- subreddit path
srpath = LOAD '$INPUT/srpath' AS (srpath, unique_id, count:long);
srpath_grouped = GROUP srpath BY (srpath, unique_id);
srpath_coalesced = FOREACH srpath_grouped
GENERATE group.srpath, group.unique_id,
SUM(srpath.count) AS count;
STORE srpath_coalesced INTO '$OUTPUT/srpath';
-- language
lang = LOAD '$INPUT/lang' AS (lang, unique_id, count:long);
lang_grouped = GROUP lang BY (lang, unique_id);
lang_coalesced = FOREACH lang_grouped
GENERATE group.lang, group.unique_id,
SUM(lang.count) AS count;
STORE lang_coalesced INTO '$OUTPUT/lang';
-- click
click = LOAD '$INPUT/clicks' AS (fullname, unique_id, count:long);
click_grouped = GROUP click BY (fullname, unique_id);
click_coalesced = FOREACH click_grouped
GENERATE group.fullname, group.unique_id,
SUM(click.count) AS count;
STORE click_coalesced INTO '$OUTPUT/clicks';
-- clicktarget
clicktarget = LOAD '$INPUT/clicks_targeted' AS (fullname, sr, unique_id, count:long);
clicktarget_grouped = GROUP clicktarget BY (fullname, sr, unique_id);
clicktarget_coalesced = FOREACH clicktarget_grouped
GENERATE group.fullname, group.sr, group.unique_id,
SUM(clicktarget.count) AS count;
STORE clicktarget_coalesced INTO '$OUTPUT/clicks_targeted';
-- thing
thing = LOAD '$INPUT/thing' AS (fullname, unique_id, count:long);
thing_grouped = GROUP thing BY (fullname, unique_id);
thing_coalesced = FOREACH thing_grouped
GENERATE group.fullname, group.unique_id,
SUM(thing.count) AS count;
STORE thing_coalesced INTO '$OUTPUT/thing';
-- thingtarget
thingtarget = LOAD '$INPUT/thingtarget' AS (fullname, sr, unique_id, count:long);
thingtarget_grouped = GROUP thingtarget BY (fullname, sr, unique_id);
thingtarget_coalesced = FOREACH thingtarget_grouped
GENERATE group.fullname, group.sr, group.unique_id,
SUM(thingtarget.count) AS count;
STORE thingtarget_coalesced INTO '$OUTPUT/thingtarget';