我实现了一个简单的 map/reduce 程序来创建一个倒排索引(我的小型搜索引擎的一部分)。每件事都工作得很好。但是在我将输入文件大小扩大到 20MB 以上(仍然是小文件进行测试)后,输出文件将为空。这意味着我可以找到文件“part-00000”,但它是 0 字节。
奇怪的是根本没有例外。这是我的日志:
(注意“映射输入记录=19405,组合输出记录=4111513,减少输入记录=0”)
SEInvertedIndex Program starts!...
12/10/28 00:58:05 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
12/10/28 00:58:05 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
12/10/28 00:58:05 WARN mapred.JobClient: No job jar file set. User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
12/10/28 00:58:05 WARN snappy.LoadSnappy: Snappy native library not loaded
12/10/28 00:58:05 INFO mapred.FileInputFormat: Total input paths to process : 1
12/10/28 00:58:05 INFO mapred.JobClient: Running job: job_local_0001
12/10/28 00:58:05 INFO util.ProcessTree: setsid exited with exit code 0
12/10/28 00:58:05 INFO mapred.Task: Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin@8c4a77
12/10/28 00:58:05 INFO mapred.MapTask: numReduceTasks: 1
12/10/28 00:58:05 INFO mapred.MapTask: io.sort.mb = 100
12/10/28 00:58:05 INFO mapred.MapTask: data buffer = 79691776/99614720
12/10/28 00:58:05 INFO mapred.MapTask: record buffer = 262144/327680
12/10/28 00:58:06 INFO mapred.JobClient: map 0% reduce 0%
12/10/28 00:58:08 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:08 INFO mapred.MapTask: bufstart = 0; bufend = 25657705; bufvoid = 99614720
12/10/28 00:58:08 INFO mapred.MapTask: kvstart = 0; kvend = 262144; length = 327680
12/10/28 00:58:10 INFO mapred.MapTask: Finished spill 0
12/10/28 00:58:11 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:11 INFO mapred.MapTask: bufstart = 25657705; bufend = 55385370; bufvoid = 99614720
12/10/28 00:58:11 INFO mapred.MapTask: kvstart = 262144; kvend = 196607; length = 327680
12/10/28 00:58:11 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:12 INFO mapred.JobClient: map 5% reduce 0%
12/10/28 00:58:12 INFO mapred.MapTask: Finished spill 1
12/10/28 00:58:13 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:13 INFO mapred.MapTask: bufstart = 55385370; bufend = 87701074; bufvoid = 99614720
12/10/28 00:58:13 INFO mapred.MapTask: kvstart = 196607; kvend = 131070; length = 327680
12/10/28 00:58:14 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:14 INFO mapred.MapTask: Finished spill 2
12/10/28 00:58:15 INFO mapred.JobClient: map 8% reduce 0%
12/10/28 00:58:15 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:15 INFO mapred.MapTask: bufstart = 87701074; bufend = 17642207; bufvoid = 99614622
12/10/28 00:58:15 INFO mapred.MapTask: kvstart = 131070; kvend = 65533; length = 327680
12/10/28 00:58:17 INFO mapred.MapTask: Finished spill 3
12/10/28 00:58:17 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:17 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:17 INFO mapred.MapTask: bufstart = 17642207; bufend = 47171716; bufvoid = 99614720
12/10/28 00:58:17 INFO mapred.MapTask: kvstart = 65533; kvend = 327677; length = 327680
12/10/28 00:58:18 INFO mapred.JobClient: map 12% reduce 0%
12/10/28 00:58:19 INFO mapred.MapTask: Finished spill 4
12/10/28 00:58:20 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:20 INFO mapred.MapTask: bufstart = 47171716; bufend = 75047441; bufvoid = 99614720
12/10/28 00:58:20 INFO mapred.MapTask: kvstart = 327677; kvend = 262140; length = 327680
12/10/28 00:58:20 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:21 INFO mapred.JobClient: map 16% reduce 0%
12/10/28 00:58:21 INFO mapred.MapTask: Finished spill 5
12/10/28 00:58:22 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:22 INFO mapred.MapTask: bufstart = 75047441; bufend = 7640048; bufvoid = 99614657
12/10/28 00:58:22 INFO mapred.MapTask: kvstart = 262140; kvend = 196603; length = 327680
12/10/28 00:58:23 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:24 INFO mapred.MapTask: Finished spill 6
12/10/28 00:58:24 INFO mapred.JobClient: map 19% reduce 0%
12/10/28 00:58:24 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:24 INFO mapred.MapTask: bufstart = 7640048; bufend = 39737149; bufvoid = 99614720
12/10/28 00:58:24 INFO mapred.MapTask: kvstart = 196603; kvend = 131066; length = 327680
12/10/28 00:58:26 INFO mapred.MapTask: Finished spill 7
12/10/28 00:58:26 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:27 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:27 INFO mapred.MapTask: bufstart = 39737149; bufend = 69183305; bufvoid = 99614720
12/10/28 00:58:27 INFO mapred.MapTask: kvstart = 131066; kvend = 65529; length = 327680
12/10/28 00:58:27 INFO mapred.JobClient: map 21% reduce 0%
12/10/28 00:58:28 INFO mapred.MapTask: Finished spill 8
12/10/28 00:58:29 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:29 INFO mapred.MapTask: bufstart = 69183305; bufend = 97875327; bufvoid = 99614720
12/10/28 00:58:29 INFO mapred.MapTask: kvstart = 65529; kvend = 327673; length = 327680
12/10/28 00:58:29 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:30 INFO mapred.JobClient: map 26% reduce 0%
12/10/28 00:58:31 INFO mapred.MapTask: Finished spill 9
12/10/28 00:58:32 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:32 INFO mapred.MapTask: bufstart = 97875327; bufend = 31087133; bufvoid = 99614698
12/10/28 00:58:32 INFO mapred.MapTask: kvstart = 327673; kvend = 262136; length = 327680
12/10/28 00:58:32 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:33 INFO mapred.JobClient: map 29% reduce 0%
12/10/28 00:58:33 INFO mapred.MapTask: Finished spill 10
12/10/28 00:58:34 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:34 INFO mapred.MapTask: bufstart = 31087133; bufend = 62067679; bufvoid = 99614720
12/10/28 00:58:34 INFO mapred.MapTask: kvstart = 262136; kvend = 196599; length = 327680
12/10/28 00:58:35 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:36 INFO mapred.MapTask: Finished spill 11
12/10/28 00:58:36 INFO mapred.JobClient: map 31% reduce 0%
12/10/28 00:58:36 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:36 INFO mapred.MapTask: bufstart = 62067679; bufend = 96839215; bufvoid = 99614720
12/10/28 00:58:36 INFO mapred.MapTask: kvstart = 196599; kvend = 131062; length = 327680
12/10/28 00:58:38 INFO mapred.MapTask: Finished spill 12
12/10/28 00:58:38 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:39 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:39 INFO mapred.MapTask: bufstart = 96839215; bufend = 29204841; bufvoid = 99614697
12/10/28 00:58:39 INFO mapred.MapTask: kvstart = 131062; kvend = 65525; length = 327680
12/10/28 00:58:39 INFO mapred.JobClient: map 36% reduce 0%
12/10/28 00:58:40 INFO mapred.MapTask: Finished spill 13
12/10/28 00:58:41 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:41 INFO mapred.MapTask: bufstart = 29204841; bufend = 59391566; bufvoid = 99614720
12/10/28 00:58:41 INFO mapred.MapTask: kvstart = 65525; kvend = 327669; length = 327680
12/10/28 00:58:41 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:42 INFO mapred.JobClient: map 39% reduce 0%
12/10/28 00:58:42 INFO mapred.MapTask: Finished spill 14
12/10/28 00:58:43 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:43 INFO mapred.MapTask: bufstart = 59391566; bufend = 88629854; bufvoid = 99614720
12/10/28 00:58:43 INFO mapred.MapTask: kvstart = 327669; kvend = 262132; length = 327680
12/10/28 00:58:44 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:45 INFO mapred.MapTask: Finished spill 15
12/10/28 00:58:45 INFO mapred.JobClient: map 42% reduce 0%
12/10/28 00:58:46 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:46 INFO mapred.MapTask: bufstart = 88629854; bufend = 19479345; bufvoid = 99614663
12/10/28 00:58:46 INFO mapred.MapTask: kvstart = 262132; kvend = 196595; length = 327680
12/10/28 00:58:47 INFO mapred.MapTask: Finished spill 16
12/10/28 00:58:47 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:48 INFO mapred.JobClient: map 45% reduce 0%
12/10/28 00:58:48 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:48 INFO mapred.MapTask: bufstart = 19479345; bufend = 50425350; bufvoid = 99614720
12/10/28 00:58:48 INFO mapred.MapTask: kvstart = 196595; kvend = 131058; length = 327680
12/10/28 00:58:50 INFO mapred.MapTask: Finished spill 17
12/10/28 00:58:50 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:50 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:50 INFO mapred.MapTask: bufstart = 50425350; bufend = 81295514; bufvoid = 99614720
12/10/28 00:58:50 INFO mapred.MapTask: kvstart = 131058; kvend = 65521; length = 327680
12/10/28 00:58:51 INFO mapred.JobClient: map 49% reduce 0%
12/10/28 00:58:52 INFO mapred.MapTask: Finished spill 18
12/10/28 00:58:53 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:53 INFO mapred.MapTask: bufstart = 81295514; bufend = 10297619; bufvoid = 99614693
12/10/28 00:58:53 INFO mapred.MapTask: kvstart = 65521; kvend = 327665; length = 327680
12/10/28 00:58:53 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:54 INFO mapred.JobClient: map 53% reduce 0%
12/10/28 00:58:54 INFO mapred.MapTask: Finished spill 19
12/10/28 00:58:55 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:55 INFO mapred.MapTask: bufstart = 10297619; bufend = 40501081; bufvoid = 99614720
12/10/28 00:58:55 INFO mapred.MapTask: kvstart = 327665; kvend = 262128; length = 327680
12/10/28 00:58:56 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:58:57 INFO mapred.MapTask: Finished spill 20
12/10/28 00:58:57 INFO mapred.JobClient: map 55% reduce 0%
12/10/28 00:58:57 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:58:57 INFO mapred.MapTask: bufstart = 40501081; bufend = 70713611; bufvoid = 99614720
12/10/28 00:58:57 INFO mapred.MapTask: kvstart = 262128; kvend = 196591; length = 327680
12/10/28 00:58:59 INFO mapred.MapTask: Finished spill 21
12/10/28 00:58:59 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:59:00 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:59:00 INFO mapred.MapTask: bufstart = 70713611; bufend = 3303900; bufvoid = 99614652
12/10/28 00:59:00 INFO mapred.MapTask: kvstart = 196591; kvend = 131054; length = 327680
12/10/28 00:59:00 INFO mapred.JobClient: map 59% reduce 0%
12/10/28 00:59:02 INFO mapred.MapTask: Finished spill 22
12/10/28 00:59:02 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:59:02 INFO mapred.MapTask: bufstart = 3303900; bufend = 34088069; bufvoid = 99614720
12/10/28 00:59:02 INFO mapred.MapTask: kvstart = 131054; kvend = 65517; length = 327680
12/10/28 00:59:02 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:59:03 INFO mapred.JobClient: map 63% reduce 0%
12/10/28 00:59:04 INFO mapred.MapTask: Finished spill 23
12/10/28 00:59:05 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:59:05 INFO mapred.MapTask: bufstart = 34088069; bufend = 63394277; bufvoid = 99614720
12/10/28 00:59:05 INFO mapred.MapTask: kvstart = 65517; kvend = 327661; length = 327680
12/10/28 00:59:05 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:59:06 INFO mapred.JobClient: map 66% reduce 0%
12/10/28 00:59:06 INFO mapred.MapTask: Finished spill 24
12/10/28 00:59:07 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:59:07 INFO mapred.MapTask: bufstart = 63394277; bufend = 94566907; bufvoid = 99614720
12/10/28 00:59:07 INFO mapred.MapTask: kvstart = 327661; kvend = 262124; length = 327680
12/10/28 00:59:08 INFO mapred.MapTask: Finished spill 25
12/10/28 00:59:08 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:59:09 INFO mapred.JobClient: map 70% reduce 0%
12/10/28 00:59:09 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:59:09 INFO mapred.MapTask: bufstart = 94566907; bufend = 29503862; bufvoid = 99614698
12/10/28 00:59:09 INFO mapred.MapTask: kvstart = 262124; kvend = 196587; length = 327680
12/10/28 00:59:11 INFO mapred.MapTask: Finished spill 26
12/10/28 00:59:12 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:59:12 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:59:12 INFO mapred.MapTask: bufstart = 29503862; bufend = 63726597; bufvoid = 99614720
12/10/28 00:59:12 INFO mapred.MapTask: kvstart = 196587; kvend = 131050; length = 327680
12/10/28 00:59:12 INFO mapred.JobClient: map 73% reduce 0%
12/10/28 00:59:14 INFO mapred.MapTask: Finished spill 27
12/10/28 00:59:14 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:59:14 INFO mapred.MapTask: bufstart = 63726597; bufend = 99242470; bufvoid = 99614720
12/10/28 00:59:14 INFO mapred.MapTask: kvstart = 131050; kvend = 65513; length = 327680
12/10/28 00:59:15 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:59:15 INFO mapred.JobClient: map 77% reduce 0%
12/10/28 00:59:16 INFO mapred.MapTask: Finished spill 28
12/10/28 00:59:17 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:59:17 INFO mapred.MapTask: bufstart = 99242470; bufend = 35558411; bufvoid = 99614660
12/10/28 00:59:17 INFO mapred.MapTask: kvstart = 65513; kvend = 327657; length = 327680
12/10/28 00:59:18 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:59:18 INFO mapred.JobClient: map 80% reduce 0%
12/10/28 00:59:18 INFO mapred.MapTask: Finished spill 29
12/10/28 00:59:19 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:59:19 INFO mapred.MapTask: bufstart = 35558411; bufend = 70034163; bufvoid = 99614720
12/10/28 00:59:19 INFO mapred.MapTask: kvstart = 327657; kvend = 262120; length = 327680
12/10/28 00:59:21 INFO mapred.MapTask: Finished spill 30
12/10/28 00:59:21 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:59:21 INFO mapred.JobClient: map 83% reduce 0%
12/10/28 00:59:21 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:59:21 INFO mapred.MapTask: bufstart = 70034163; bufend = 4759655; bufvoid = 99614713
12/10/28 00:59:21 INFO mapred.MapTask: kvstart = 262120; kvend = 196583; length = 327680
12/10/28 00:59:24 INFO mapred.MapTask: Finished spill 31
12/10/28 00:59:24 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:59:24 INFO mapred.JobClient: map 85% reduce 0%
12/10/28 00:59:24 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:59:24 INFO mapred.MapTask: bufstart = 4759655; bufend = 39709563; bufvoid = 99614720
12/10/28 00:59:24 INFO mapred.MapTask: kvstart = 196583; kvend = 131046; length = 327680
12/10/28 00:59:26 INFO mapred.MapTask: Finished spill 32
12/10/28 00:59:27 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:59:27 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:59:27 INFO mapred.MapTask: bufstart = 39709563; bufend = 74269540; bufvoid = 99614720
12/10/28 00:59:27 INFO mapred.MapTask: kvstart = 131046; kvend = 65509; length = 327680
12/10/28 00:59:27 INFO mapred.JobClient: map 89% reduce 0%
12/10/28 00:59:29 INFO mapred.MapTask: Finished spill 33
12/10/28 00:59:29 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:59:29 INFO mapred.MapTask: bufstart = 74269540; bufend = 9768351; bufvoid = 99614650
12/10/28 00:59:29 INFO mapred.MapTask: kvstart = 65509; kvend = 327653; length = 327680
12/10/28 00:59:30 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:59:30 INFO mapred.JobClient: map 93% reduce 0%
12/10/28 00:59:31 INFO mapred.MapTask: Finished spill 34
12/10/28 00:59:32 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:59:32 INFO mapred.MapTask: bufstart = 9768351; bufend = 43640794; bufvoid = 99614720
12/10/28 00:59:32 INFO mapred.MapTask: kvstart = 327653; kvend = 262116; length = 327680
12/10/28 00:59:33 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:59:33 INFO mapred.JobClient: map 96% reduce 0%
12/10/28 00:59:33 INFO mapred.MapTask: Finished spill 35
12/10/28 00:59:34 INFO mapred.MapTask: Spilling map output: record full = true
12/10/28 00:59:34 INFO mapred.MapTask: bufstart = 43640794; bufend = 77787072; bufvoid = 99614720
12/10/28 00:59:34 INFO mapred.MapTask: kvstart = 262116; kvend = 196579; length = 327680
12/10/28 00:59:36 INFO mapred.MapTask: Finished spill 36
12/10/28 00:59:36 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:59:36 INFO mapred.MapTask: Starting flush of map output
12/10/28 00:59:36 INFO mapred.JobClient: map 99% reduce 0%
12/10/28 00:59:37 INFO mapred.MapTask: Finished spill 37
12/10/28 00:59:37 INFO mapred.Merger: Merging 38 sorted segments
12/10/28 00:59:37 INFO mapred.Merger: Merging 2 intermediate segments out of a total of 38
12/10/28 00:59:38 INFO mapred.Merger: Merging 10 intermediate segments out of a total of 37
12/10/28 00:59:39 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:59:39 INFO mapred.JobClient: map 100% reduce 0%
12/10/28 00:59:42 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 00:59:45 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 01:00:21 INFO mapred.Merger: Merging 10 intermediate segments out of a total of 28
12/10/28 01:00:24 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 01:00:27 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 01:00:28 INFO mapred.Merger: Merging 10 intermediate segments out of a total of 19
12/10/28 01:00:30 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 01:00:33 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 01:00:35 INFO mapred.Merger: Down to the last merge-pass, with 10 segments left of total size: 512806627 bytes
12/10/28 01:00:36 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 01:00:39 INFO mapred.Task: Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting
12/10/28 01:00:39 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 01:00:39 INFO mapred.LocalJobRunner: hdfs://localhost:9000/user/hadoop/webData/data12.out:0+66123962
12/10/28 01:00:39 INFO mapred.Task: Task 'attempt_local_0001_m_000000_0' done.
12/10/28 01:00:39 INFO mapred.Task: Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin@61f533
12/10/28 01:00:39 INFO mapred.LocalJobRunner:
12/10/28 01:00:39 INFO mapred.Merger: Merging 1 sorted segments
12/10/28 01:00:39 INFO mapred.Merger: Down to the last merge-pass, with 0 segments left of total size: 0 bytes
12/10/28 01:00:39 INFO mapred.LocalJobRunner:
12/10/28 01:00:39 INFO mapred.Task: Task:attempt_local_0001_r_000000_0 is done. And is in the process of commiting
12/10/28 01:00:39 INFO mapred.LocalJobRunner:
12/10/28 01:00:39 INFO mapred.Task: Task attempt_local_0001_r_000000_0 is allowed to commit now
12/10/28 01:00:39 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0001_r_000000_0' to hdfs://localhost:9000/user/hadoop/webDataOut
12/10/28 01:00:42 INFO mapred.LocalJobRunner: reduce > reduce
12/10/28 01:00:42 INFO mapred.Task: Task 'attempt_local_0001_r_000000_0' done.
12/10/28 01:00:42 INFO mapred.JobClient: map 100% reduce 100%
12/10/28 01:00:42 INFO mapred.JobClient: Job complete: job_local_0001
12/10/28 01:00:42 INFO mapred.JobClient: Counters: 22
12/10/28 01:00:42 INFO mapred.JobClient: File Input Format Counters
12/10/28 01:00:42 INFO mapred.JobClient: Bytes Read=66123962
12/10/28 01:00:42 INFO mapred.JobClient: File Output Format Counters
12/10/28 01:00:42 INFO mapred.JobClient: Bytes Written=0
12/10/28 01:00:42 INFO mapred.JobClient: FileSystemCounters
12/10/28 01:00:42 INFO mapred.JobClient: FILE_BYTES_READ=1866062016
12/10/28 01:00:42 INFO mapred.JobClient: HDFS_BYTES_READ=132247924
12/10/28 01:00:42 INFO mapred.JobClient: FILE_BYTES_WRITTEN=1866142216
12/10/28 01:00:42 INFO mapred.JobClient: Map-Reduce Framework
12/10/28 01:00:42 INFO mapred.JobClient: Map output materialized bytes=6
12/10/28 01:00:42 INFO mapred.JobClient: Map input records=19405
12/10/28 01:00:42 INFO mapred.JobClient: Reduce shuffle bytes=0
12/10/28 01:00:42 INFO mapred.JobClient: Spilled Records=7484422
12/10/28 01:00:42 INFO mapred.JobClient: Map output bytes=1192717378
12/10/28 01:00:42 INFO mapred.JobClient: Total committed heap usage (bytes)=845938688
12/10/28 01:00:42 INFO mapred.JobClient: CPU time spent (ms)=0
12/10/28 01:00:42 INFO mapred.JobClient: Map input bytes=66123962
12/10/28 01:00:42 INFO mapred.JobClient: SPLIT_RAW_BYTES=105
12/10/28 01:00:42 INFO mapred.JobClient: Combine input records=13954115
12/10/28 01:00:42 INFO mapred.JobClient: Reduce input records=0
12/10/28 01:00:42 INFO mapred.JobClient: Reduce input groups=0
12/10/28 01:00:42 INFO mapred.JobClient: Combine output records=4111513
12/10/28 01:00:42 INFO mapred.JobClient: Physical memory (bytes) snapshot=0
12/10/28 01:00:42 INFO mapred.JobClient: Reduce output records=0
12/10/28 01:00:42 INFO mapred.JobClient: Virtual memory (bytes) snapshot=0
12/10/28 01:00:42 INFO mapred.JobClient: Map output records=9842602
SEInvertedIndex Program ends!...
我的实现非常简单,当输入文件很小时它工作得很好。多谢!
这是代码:
package com.zhoutall.SEInvertedIndex;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
public class SEInvertedIndex {
public static class InvertedIndexMapper extends MapReduceBase implements
Mapper<LongWritable, Text, Text, Text> {
private final static Text One = new Text("1");
public void map(LongWritable key, Text val,
OutputCollector<Text, Text> output, Reporter reporter)
throws IOException {
String strContent_all=new String(val.getBytes(), 0, val.getLength(), "utf-8");
if(strContent_all==null || strContent_all=="\n")
return;
// System.out.println(strContent_all);
String strUrl=null;
String strContent = null;
String strPart[] = strContent_all.split(" ");
// if(strPart.length!=2)
// return;
strUrl = strPart[0].replace("http", "https");
strContent = strPart[1];
//System.out.println(strUrl+"~~"+strContent);
String webHead = new String(strContent.substring(0, 20) + "...").replace(" ", " ");
StringReader strReader = new StringReader(strContent);
IKSegmenter ik = new IKSegmenter(strReader, true);
Lexeme lexeme = null;
while ((lexeme = ik.next()) != null) {
// System.out.println(lexeme.getLexemeText() + "##" + strUrl + "##" + webHead);
output.collect(new Text(lexeme.getLexemeText() + "##" + strUrl + "##" + webHead), One);
}
}
}
public static class InvertedIndexCombiner extends MapReduceBase implements
Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterator<Text> values,
OutputCollector<Text, Text> output, Reporter reporter)
throws IOException {
int sum = 0;
while (values.hasNext()) {
values.next();
sum += 1;
}
// System.out.println(key.toString());
String fields[] = key.toString().split("##");
if(fields.length!=3)
return;
// System.out.println(fields[1]+"##"+fields[2] + "##" + sum);
output.collect(new Text(fields[0]), new Text(fields[1] + "##" + fields[2] + "##" + sum));
}
}
public static class InvertedIndexReducer extends MapReduceBase implements
Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterator<Text> values,
OutputCollector<Text, Text> output, Reporter reporter)
throws IOException {
boolean first = true;
StringBuilder count_all = new StringBuilder();
while (values.hasNext()) {
if (!first)
count_all.append(";;");
first = false;
count_all.append(values.next().toString());
}
// System.out.println(key+"::"+count_all.toString());
output.collect(key, new Text(count_all.toString()));
}
}
public static void main(String[] args) {
System.out.println("SEInvertedIndex Program starts!...");
JobClient client = new JobClient();
JobConf conf = new JobConf(SEInvertedIndex.class);
conf.setJobName("SEInvertedIndex");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
//conf.setNumReduceTasks(1);
FileInputFormat.addInputPath(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
// ////////////////
Path dstFilePath = new Path(args[1]);
FileSystem fs = null;
try {
fs = dstFilePath.getFileSystem(conf);
if (fs.exists(dstFilePath))
fs.delete(dstFilePath, true);
} catch (IOException e1) {
e1.printStackTrace();
}
// /////////////////
conf.setMapperClass(InvertedIndexMapper.class);
conf.setCombinerClass(InvertedIndexCombiner.class);
conf.setReducerClass(InvertedIndexReducer.class);
client.setConf(conf);
try {
JobClient.runJob(conf);
} catch (Exception e) {
e.printStackTrace();
}
System.out.println("SEInvertedIndex Program ends!...");
}
}