
I am trying to compile and run the Hadoop PutMerge program from Eclipse, but it does not work: instead of creating the file on HDFS, it creates the file on my local machine (where Eclipse is installed).

It looks like my conf is not picking up the correct configuration from the XML files.

EDIT:

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PlayWithHadoop {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        //conf.set("fs.default.name", "hdfs://localhost:54310/user/hduser");
        //conf.set("dfs.data.dir", "/user/hduser");
        conf.addResource(new Path("/home/hduser/hadoop/conf/core-site.xml"));
        conf.addResource(new Path("/home/hduser/hadoop/conf/hdfs-site.xml"));

        FileSystem hdfs = FileSystem.get(conf);
        FileSystem local = FileSystem.getLocal(conf);
        Path inputDir = new Path(args[0]);
        Path hdfsFile = new Path(args[1]);

        try {
            //hdfs.setWorkingDirectory(new Path("/user/hduser/hadoop"));
            FileStatus[] inputFiles = local.listStatus(inputDir);
            FSDataOutputStream out = hdfs.create(hdfsFile);

            for (int i = 0; i < inputFiles.length; i++) {
                System.out.println(inputFiles[i].getPath().getName());
                FSDataInputStream in = local.open(inputFiles[i].getPath());
                System.out.println();
                System.out.println(hdfs.getWorkingDirectory().toString());

                byte buffer[] = new byte[256];
                int bytesRead = 0;
                while ((bytesRead = in.read(buffer)) > 0) {
                    out.write(buffer, 0, bytesRead);
                }
                in.close();
            }
            out.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
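
For reference, a quick way to check whether the XML resources are actually being loaded is to print the default filesystem URI that the Configuration resolves to; when the cluster configuration is not picked up, FileSystem.get(conf) falls back to the local file:/// filesystem. This is only a minimal diagnostic sketch; the hdfs://localhost:54310 address is taken from the commented-out line above and may differ on your setup.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CheckFs {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/home/hduser/hadoop/conf/core-site.xml"));

        // Prints null (or a file:/// URI) when core-site.xml was not loaded,
        // and hdfs://localhost:54310 when the cluster configuration is picked up.
        System.out.println("fs.default.name = " + conf.get("fs.default.name"));
        System.out.println("FileSystem URI  = " + FileSystem.get(conf).getUri());
    }
}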

2 Answers


Add these lines to your code and see whether it works:

Configuration conf = new Configuration();
conf.addResource(new Path("/HADOOP_HOME/conf/core-site.xml"));
conf.addResource(new Path("/HADOOP_HOME/conf/hdfs-site.xml"));
Answered 2013-05-13T14:26:32.457

// Try this code, and check whether you are providing the correct input and output paths to the jar.

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Progressable;

/**
 *
 * @author hadoop1
 */
public class PutMerge {

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException {

        //Configuration reads resources that contain sets of name/value pairs as XML.
        //Configuration automatically loads core-site.xml from the classpath in its constructor, and core-site.xml should contain fs.defaultFS.
        Configuration conf = new Configuration();

        //The FileSystem abstract class uses the configuration key to connect to HDFS.
        //FileSystem needs only one configuration key to successfully connect to HDFS. Previously it was fs.default.name.
        //From Hadoop 2 (YARN) onward it is fs.defaultFS. So the following snippet is sufficient for the connection.
        FileSystem hdfs = FileSystem.get(conf);
        //For the local filesystem
        FileSystem local = FileSystem.getLocal(conf);

        Path inputDirectory = new Path(args[0]);
        Path hdfsFiles = new Path(args[1]);

        try {

            FileStatus[] inputFiles = local.listStatus(inputDirectory);
            FSDataOutputStream out = hdfs.create(hdfsFiles);

            for (int i = 0; i < inputFiles.length; i++) {
                //System.out.println(inputFiles[i].getPath().getName());

                FSDataInputStream in = local.open(inputFiles[i].getPath());
                byte buffer[] = new byte[256];
                int byteRead = 0;

                while ((byteRead = in.read(buffer)) > 0) {
                    out.write(buffer, 0, byteRead);
                }
                in.close();
            }
            out.close();
        } catch (Exception e) {
            System.out.println("Error while copying local files to HDFS: " + e);
        }

        Job job = Job.getInstance(conf, "PutMerge");
        job.setJarByClass(PutMerge.class);
        // PutMergeMapper and PutMergeReducer are assumed to be defined elsewhere in the project
        job.setMapperClass(PutMergeMapper.class);
        job.setCombinerClass(PutMergeReducer.class);
        job.setReducerClass(PutMergeReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setOutputValueClass(FloatWritable.class);


        // HDFS path will contain the merged files
        FileInputFormat.addInputPath(job, hdfsFiles);
        FileOutputFormat.setOutputPath(job, new Path(args[2]));

        try {

            // Block until the job finishes (the input path may contain multiple CSV files), then exit with its status
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (InterruptedException ex) {
            System.out.print("Error 1 in main file %^#%$^&*(&&$%^&*(*^&%");
        }
    }
}
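
Note that, unlike the program in the question, this version expects three arguments: the local input directory (args[0]), the HDFS path for the merged file (args[1]), and an HDFS output directory for the MapReduce job (args[2]).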
Answered 2017-02-06T23:28:00.480