0

我试图找出为什么当我尝试在 hadoop 上运行下面的 Java 时它不起作用。

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

public class PageStat implements Tool {
private Configuration conf;

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Page visit statistics MR";
    job.setJobName(jobName);

    job.setJarByClass(PageStat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(PageStat.PageStatMapper.class);
    job.setReducerClass(PageStat.PageStatReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

    int status =  job.waitForCompletion(true) ? 0 : 1;
    return status;
}

public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new PageStat(), args);
    System.exit(exitCode);
}

public void setConf(Configuration conf) {
   this.conf = conf;
}

public Configuration getConf() {
    return conf;
}

public static class PageStatMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private Text keyHolder = new Text();
    private IntWritable valueHolder = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
        String[] items  =  value.toString().split(",");
        if (items.length == 3) {
            String url = items[1];
            keyHolder.set(url);
            Integer duration = Integer.parseInt(items[2]);
            valueHolder.set(duration);
            context.write(keyHolder, valueHolder);
        } else {
            context.getCounter("Error", "invalidData").increment(1);
        }
    }        
}   

public static class PageStatReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private Text keyHolder = new Text();
    private IntWritable valueHolder = new IntWritable();
    private String statType;
    private int count;
    private int totalTime;
    private int avTime;

    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        statType = conf.get("page.stat");
    }

    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
    throws IOException, InterruptedException {
        count = 0;
        totalTime = 0;
        for (IntWritable value : values){
            ++count;
            totalTime += value.get();
        } 
        avTime = totalTime / count;

        keyHolder.set(key);
        if (statType.equals("average")){
            valueHolder.set(avTime);
        } else {
            valueHolder.set(totalTime);
        }
        context.write(keyHolder, valueHolder);
    }
}    

}

错误是:

c:\hadoop-training\tutorial02-jobtracker>hadoop jar PageStat.jar PageStat jobtracker/input/visit_5000000.txt jobtracker/output 13/07/29 11:24:50 INFO input.FileInputFormat:要处理的总输入路径:1 log4j:ERROR 无法将 [c:\Hadoop\hadoop-1.1.0-SNAPSHOT\logs/hadoop.log] 重命名为 [c:\Hadoop\hadoop-1.1.0-SNAPSHOT\logs/hadoop.log.2013-07 -26]。29 年 7 月 13 日 11:24:51 信息 util.NativeCodeLoader:加载了本机 hadoop 库 13/07/29 11:24:51 警告 snappy.LoadSnappy:未加载 Snappy 本机库 29 年 7 月 11:24: 54 信息 mapred.JobClient:正在运行的作业:job_201307261340_0001 13/07/29 11:24:55 信息 mapred.JobClient:地图 0% 减少 0% 13/07/29 11:25:24 信息 mapred.JobClient:地图 1% 减少0% 13/07/29 11:25:27 信息 mapred.JobClient:映射 6% 减少 0% 13/07/29 11:25:30 信息 mapred.JobClient:

13/07/29 11:26:56 INFO mapred.JobClient:地图 100% 减少 0% 13/07/29 11:27:05 INFO mapred.JobClient:地图 100% 减少 8% 13/07/29 11:27 :08 INFO mapred.JobClient:映射 100% 减少 33% 13/07/29 11:27:10 INFO mapred.JobClient:任务 ID:尝试_201307261340_0001_r_0 00000_1,状态:在 PageStat$PageStatReducer.reduce(PageStat) 处失败 java.lang.NullPointerException .java:120) 在 PageStat$PageStatReducer.reduce(PageStat.java:96) 在 org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:177) 在 org.apache.hadoop.mapred.ReduceTask.runNewReducer( ReduceTask.java:651 ) 在 org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:418) 在 org.apache.hadoop.mapred.Child$4.run(Child.java:271) 在 java.security。 javax.security 中的 AccessController.doPrivileged(Native Method)。auth.Subject.doAs(Subject.java:396) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1135) at org.apache.hadoop.mapred.Child.main(Child.java:265 )

13/07/29 11:27:11 INFO mapred.JobClient:地图 100% 减少 0% 13/07/29 11:27:20 INFO mapred.JobClient:地图 100% 减少 8% 13/07/29 11:27 :23 INFO mapred.JobClient:映射 100% 减少 25% 13/07/29 11:27:25 INFO mapred.JobClient:任务 ID:尝试_201307261340_0001_r_0 00000_2,状态:在 PageStat$PageStatReducer.reduce(PageStat) 处失败 java.lang.NullPointerException .java:120) 在 PageStat$PageStatReducer.reduce(PageStat.java:96) 在 org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:177) 在 org.apache.hadoop.mapred.ReduceTask.runNewReducer( ReduceTask.java:651 ) 在 org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:418) 在 org.apache.hadoop.mapred.Child$4.run(Child.java:271) 在 java.security。 javax.security 中的 AccessController.doPrivileged(Native Method)。auth.Subject.doAs(Subject.java:396) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1135) at org.apache.hadoop.mapred.Child.main(Child.java:265 )

13/07/29 11:27:26 INFO mapred.JobClient: 地图 100% 减少 0% 13/07/29 11:27:38 INFO mapred.JobClient: 地图 100% 减少 25% 13/07/29 11:27 :41 INFO mapred.JobClient:映射 100% 减少 0% 13/07/29 11:27:43 INFO mapred.JobClient:作业完成:job_201307261340_0001 13/07/29 11:27:43 INFO mapred.JobClient:计数器:24 29 年 7 月 13 日 11:27:43 信息 mapred.JobClient:作业计数器 13/07/29 11:27:43 信息 mapred.JobClient:启动减少任务 = 4 13/07/29 11:27:43 信息映射。 JobClient:SLOTS_MILLIS_MAPS=179086 13/07/29 11:27:43 INFO mapred.JobClient:在保留槽 (ms)=0 13/07/29 11:27:43 INFO mapred.JobClient 后所有减少等待所花费的总时间:
保留槽后等待的所有地图花费的总时间 (ms)=0 13/07/29 11:27:43 INFO mapred.JobClient: Launched map tasks=4 13/07/29 11:27:43 INFO mapred.JobClient :数据本地映射任务=4 13/07/29 11:27:43 信息 mapred.JobClient:失败的减少任务=1 13/07/29 11:27:43 信息 mapred.JobClient:
SLOTS_MILLIS_REDUCES=106513 13/07/ 29 11:27:43 信息 mapred.JobClient:
FileSystemCounters 13/07/29 11:27:43 信息 mapred.JobClient:
FILE_BYTES_READ=179504086 13/07/29 11:27:43 信息 mapred.JobClient:
HDFS_BYTES_READ=254931072 13/ 07/29 11:27:43 INFO mapred.JobClient:
FILE_BYTES_WRITTEN=359099432 13/07/29 11:27:43 INFO mapred.JobClient:文件输入格式计数器 13/07/29 11:27:43 INFO mapred.JobClient:
字节读取 = 254930544 13/07/29 11:27:43 信息 mapred.JobClient:
Map-Reduce 框架 13/07/29 11:27:43 INFO mapred.JobClient:映射输出物化字节=17949 9502 13/07/29 11:27:43 INFO mapred.JobClient:组合输出记录=0 13/07/ 29 11:27:43 INFO mapred.JobClient:映射输入记录=5000000 13/07/29 11:27:43 INFO mapred.JobClient:物理内存(字节)快照=85 1607552 13/07/29 11:27:43 INFO mapred.JobClient:溢出记录 = 10000000 13/07/29 11:27:43 INFO mapred.JobClient:映射输出字节 = 169499478 13/07/29 11:27:43 INFO mapred.JobClient:CPU 时间花费(毫秒) =81308 13/07/29 11:27:43 INFO mapred.JobClient:总提交堆使用量(字节)= 746323968 13/07/29 11:27:43 INFO mapred.JobClient:虚拟内存(字节)快照=988 401664 13/07/29 11:27:43 INFO mapred.JobClient:组合输入记录 = 0 13/07/29 11:27:43 INFO mapred。JobClient: 映射输出记录=5000000 13/07/29 11:27:43 INFO mapred.JobClient:
SPLIT_RAW_BYTES=528

谢谢!!!

4

2 回答 2

1

我也有类似的问题,需要使用 -D 标志来执行:

-Dpage.stat=total

您可能会看到一个错误:

log4j:WARN No appenders could be found for logger (org.apache.hadoop.hdfs.DFSClient).
log4j:WARN Please initialize the log4j system properly.

这不是完整的答案,我自己仍在深究。

于 2013-11-01T16:26:40.797 回答
0

堆栈跟踪中的行号似乎与发布的源代码不一致。自运行以来代码是否发生了变化?

NullPointerException 可能发生在 if (statType...) 行上。我在配置中看不到任何设置“page.stat”的内容,无论是在运行方法中硬编码还是在作业提交中作为参数传递。这将导致 statType 成员被初始化为 null。

于 2013-07-30T19:57:17.480 回答