下面这个版本对每个键只返回一个结果(如果有多个值出现次数并列最多,也只取其中之一):
-- Per-key mode: emits a single most frequent value for each key.
-- Input is read as comma-separated (key, value) records with no declared field types.
data = LOAD 'mode_data.dat'
       USING PigStorage(',')
       AS (key, value);

-- Number of occurrences of every distinct (key, value) pair.
byKeyValue = GROUP data BY (key, value);
cntKeyValue = FOREACH byKeyValue
              GENERATE FLATTEN(group) AS (key, value),
                       COUNT(data) AS cnt;

-- Gather the counted pairs of each key, rank them by count, keep the first.
byKey = GROUP cntKeyValue BY key;
mode = FOREACH byKey {
    freq = ORDER cntKeyValue BY cnt DESC;  -- highest count first
    topFreq = LIMIT freq 1;                -- one row only: when several values tie, just one of them is kept
    GENERATE FLATTEN(topFreq.(key, value));
};
下面这个版本会为每个键找出所有出现次数并列最多的值:
-- Per-key mode, all ties included: emits every value whose occurrence count
-- equals the highest count observed for its key.
-- Input is read as comma-separated (key, value) records with no declared field types.
data = LOAD 'mode_data.dat' USING PigStorage(',') AS (key, value);

-- Number of occurrences of every distinct (key, value) pair.
byKeyValue = GROUP data BY (key, value);
cntKeyValue = FOREACH byKeyValue GENERATE FLATTEN(group) AS (key, value), COUNT(data) AS cnt;

-- Highest pair count for each key. MAX yields the same (key, cnt) row that
-- sorting the bag and taking its first element would, without the per-key sort.
byKey = GROUP cntKeyValue BY key;
mostFreqCnt = FOREACH byKey GENERATE group AS key, MAX(cntKeyValue.cnt) AS cnt;

-- Line up every counted pair with its key's maximum on (key, cnt).
-- NOTE(review): the original author chose COGROUP because an alternative
-- formulation raised errors at execution time; the exact failing statement is
-- not recorded here, so COGROUP is kept as-is.
modeAll = COGROUP cntKeyValue BY (key, cnt), mostFreqCnt BY (key, cnt);

-- A group with a non-empty mostFreqCnt bag is one whose cnt equals the key's
-- maximum; its cntKeyValue bag holds all values sharing that count.
mode = FOREACH (FILTER modeAll BY NOT IsEmpty(mostFreqCnt)) GENERATE FLATTEN(cntKeyValue.(key, value)) AS (key, value);