最简单的解决方案可能是使用 @David 提到的 Pig。如果只是想快速测试一下,可以写出类似下面这样的脚本:
-- Three-way join: TABLE_A and TABLE_B are matched on id, then that result is
-- matched with TABLE_C on cid. All inputs are read, and the output is written,
-- through PigStorage with a single space as the field delimiter.

-- Inputs; every column is declared as chararray.
TABLE_A = LOAD 'hdfs://my_path/input/table_a.txt' USING PigStorage(' ')
    AS (id:chararray, name:chararray, place:chararray);
TABLE_B = LOAD 'hdfs://my_path/input/table_b.txt' USING PigStorage(' ')
    AS (id:chararray, cid:chararray, name:chararray);
TABLE_C = LOAD 'hdfs://my_path/input/table_c.txt' USING PigStorage(' ')
    AS (cid:chararray, cname:chararray);

-- Step 1: join A with B on id, keeping A's columns plus B's cid.
-- The AS clauses give the projected fields plain names so the next join
-- can refer to them as TMP::id, TMP::cid, and so on.
A_WITH_B = JOIN TABLE_A BY id, TABLE_B BY id;
TMP = FOREACH A_WITH_B GENERATE
    TABLE_A::id    AS id,
    TABLE_A::name  AS name,
    TABLE_A::place AS place,
    TABLE_B::cid   AS cid;

-- Step 2: join the intermediate relation with C on cid and project
-- the three columns carried over from A together with C's cname.
TMP_WITH_C = JOIN TMP BY cid, TABLE_C BY cid;
JOIN_ABC = FOREACH TMP_WITH_C GENERATE
    TMP::id,
    TMP::name,
    TMP::place,
    TABLE_C::cname;

-- Write the final relation back to HDFS, space-delimited.
STORE JOIN_ABC INTO 'hdfs://my_path/output' USING PigStorage(' ');