hadoop - Mapreduce 与 HCATALOG 与 MAPR 中的 oozie 集成

Question

我编写了一个 MapReduce 程序，它通过 HCatalog 从 Hive 表中读取数据并写入 HBase。这是一个只有 map 阶段的作业，没有 reducer。我已经从命令行运行过该程序，它按预期工作（为了避免 jar 依赖问题，我打了一个 fat jar）。我想借助 HUE 把它集成到 Oozie 中。我有两种运行方式可选：

使用 MapReduce 动作（action）
使用 Java 动作（action）

由于我的 Mapreduce 程序有一个包含以下代码的驱动程序方法

import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat;

/**
 * Driver for a map-only job: configures {@link HCatInputFormat} to read the
 * {@code Test.emp} table, runs {@link HBaseValdiateInsert} as the mapper with
 * zero reducers, and exits with a status that reflects the job outcome.
 */
public class HBaseValdiateInsertDriver {

    /**
     * Configures and submits the job, then blocks until it finishes.
     *
     * @param args command-line arguments; generic Hadoop options are consumed
     *             by {@link GenericOptionsParser} and applied to the configuration
     * @throws Exception if job setup or submission fails
     */
    public static void main(String[] args) throws Exception {

        String dbName = "Test";
        String tableName = "emp";
        Configuration conf = new Configuration();

        // Apply generic options to conf; the remaining arguments are currently unused.
        args = new GenericOptionsParser(conf, args).getRemainingArgs();

        // Job.getInstance replaces the deprecated Job(Configuration, String) constructor.
        Job job = Job.getInstance(conf, "HBase Get Put Demo");
        job.setInputFormatClass(HCatInputFormat.class);
        HCatInputFormat.setInput(job, dbName, tableName, null);

        job.setJarByClass(HBaseValdiateInsertDriver.class);

        job.setMapperClass(HBaseValdiateInsert.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setNumReduceTasks(0);
        // NOTE(review): the input format is HCatInputFormat, so this file input
        // path is presumably not what drives the input; kept as in the original.
        FileInputFormat.addInputPath(job, new Path("maprfs:///user/input"));
        FileOutputFormat.setOutputPath(job, new Path("maprfs:///user/output"));

        // Propagate the job result: previously a failed job still ended with
        // exit status 0, so a caller (shell or workflow engine) saw success.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}

如何在 Oozie 的 MapReduce 动作中指定驱动程序方法？我所看到的只有指定 mapper 类和 reducer 类的方式。有人可以指导我应该如何设置这些属性吗？

使用 Java 动作时，我可以将驱动程序类指定为主类并执行它，但我遇到了诸如找不到表、找不到 HCatalog jar 等错误。我已经（通过 Hue）在工作流中包含了 hive-site.xml，但我感觉系统没有读取到其中的属性。有人可以告诉我需要注意什么吗？还有其他需要包含的配置属性吗？

另外，我参考的 Cloudera 网站上的示例程序使用的是：

// Form used by the referenced sample program: the database and table are
// wrapped in an InputJobInfo object that is passed to setInput.
HCatInputFormat.setInput(job, InputJobInfo.create(dbName,
                inputTableName, null));

而我使用的是下面的写法（因为在我的版本中没有找到接受上述参数的方法）：

// Form used by the driver in this question: database name and table name are
// passed directly, with null as the last argument.
HCatInputFormat.setInput(job, dbName, tableName, null);

下面是我的映射器代码

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hive.hcatalog.data.HCatRecord;



/**
 * Mapper that reads two-field records (name, id) from HCatalog and, for each
 * record whose {@code Info:name} cell in the HBase table is absent or differs
 * from the record's id, buffers a {@link Put} keyed by the name. Buffered Puts
 * are sent in batches and once more in {@link #cleanup}.
 *
 * <p>The mapper emits no output records; its only effect is the HBase writes.
 */
public class HBaseValdiateInsert extends Mapper<WritableComparable, HCatRecord, Text, Text> {

    /** Number of buffered Puts above which the buffer is flushed to HBase. */
    private static final int FLUSH_THRESHOLD = 25000;

    /** Column family and qualifier that are both read and written. */
    private static final byte[] COLUMN_FAMILY = Bytes.toBytes("Info");
    private static final byte[] COLUMN_QUALIFIER = Bytes.toBytes("name");

    static HTableInterface table;
    static HTableInterface inserted;
    String existigValue = null;
    List<Put> putList = new ArrayList<Put>();

    /**
     * Opens the HBase connection and the target table, and turns auto-flush off
     * so that writes are sent when {@code flushCommits()} is called.
     *
     * @throws IOException if the table cannot be obtained
     */
    @Override
    public void setup(Context context) throws IOException {
        String tablename = "dev_arch186";
        Utils.getHBConnection();
        table = Utils.getTable(tablename);
        table.setAutoFlushTo(false);
    }

    /**
     * Flushes any remaining buffered Puts, then closes the table and connection.
     *
     * @throws IOException if the final flush or the table close fails; the
     *         failure is propagated instead of being printed and ignored, so
     *         the task does not report success after losing writes
     */
    @Override
    public void cleanup(Context context) throws IOException {
        try {
            flushPuts();
        } finally {
            // Release resources even when the final flush fails.
            try {
                table.close();
            } finally {
                Utils.closeConnection();
            }
        }
    }

    /**
     * Compares one HCatalog record with the stored HBase value and buffers a
     * Put when the stored value is missing ("NA") or differs from the record.
     *
     * @param key   input key (unused)
     * @param value record whose field 0 is used as the row key and field 1 as
     *              the value to store
     */
    @Override
    public void map(WritableComparable key, HCatRecord value, Context context) throws IOException, InterruptedException {

        String name_hive = (String) value.get(0);
        String id_hive = (String) value.get(1);

        // A null row key or value cannot be encoded; count and skip the record
        // rather than failing the whole task.
        if (name_hive == null || id_hive == null) {
            context.getCounter("HBaseValdiateInsert", "SKIPPED_NULL_FIELD").increment(1);
            return;
        }

        existigValue = getOneRecord(COLUMN_FAMILY, COLUMN_QUALIFIER, name_hive);
        if (existigValue.equalsIgnoreCase("NA") || !existigValue.equalsIgnoreCase(id_hive)) {
            // The original split an undefined variable ("test") into rec[0] and
            // rec[1]; the row key and value are the two fields read above.
            Put put = new Put(Bytes.toBytes(name_hive));
            put.add(COLUMN_FAMILY, COLUMN_QUALIFIER, Bytes.toBytes(id_hive));
            put.setDurability(Durability.SKIP_WAL);
            putList.add(put);
            if (putList.size() > FLUSH_THRESHOLD) {
                flushPuts();
            }
        }
    }

    /**
     * Sends all buffered Puts to the table and empties the buffer. Clearing
     * the buffer is essential: without it every later flush would re-send all
     * earlier Puts and the list would grow without bound.
     */
    private void flushPuts() throws IOException {
        table.put(putList);
        table.flushCommits();
        putList.clear();
    }

    /**
     * Fetches the latest value of one column for a row.
     *
     * @param columnFamily    column family to read
     * @param columnQualifier column qualifier to read
     * @param rowKey          row key; encoded with {@link Bytes#toBytes(String)}
     *                        so lookups use the same encoding as the Puts
     * @return the cell value as a string, or {@code "NA"} if the row has no
     *         such column
     * @throws IOException if the HBase read fails
     */
    public String getOneRecord(byte[] columnFamily, byte[] columnQualifier, String rowKey)
            throws IOException {
        Get get = new Get(Bytes.toBytes(rowKey));
        get.setMaxVersions(1);
        Result rs = table.get(get);

        if (!rs.containsColumn(columnFamily, columnQualifier)) {
            return "NA";
        }
        KeyValue result = rs.getColumnLatest(columnFamily, columnQualifier);
        return Bytes.toString(result.getValue());
    }

    /**
     * Reports whether a row contains the given column.
     *
     * @param tableName       unused; the lookup always goes to the table opened in setup
     * @param ColumnFamily    column family name
     * @param ColumnQualifier column qualifier name
     * @param rowKey          row key
     * @return {@code true} if the row has the column
     * @throws IOException if the HBase read fails
     */
    public boolean columnQualifierExists(String tableName, String ColumnFamily,
            String ColumnQualifier, String rowKey) throws IOException {
        Get get = new Get(Bytes.toBytes(rowKey));
        Result rs = table.get(get);
        return rs.containsColumn(Bytes.toBytes(ColumnFamily), Bytes.toBytes(ColumnQualifier));
    }

}

注意：我使用带有 HUE 的 MapR (M3) Cluster 作为 oozie 的接口。Hive 版本：1-0 HCAT 版本：1-0

score 0 · Accepted Answer

我找不到从 Oozie mapreduce 操作初始化 HCatInputFormat 的任何方法。但我有一个解决方法如下。

我通过继承 HCatInputFormat 创建了一个 LazyHCatInputFormat，并重写了 getJobInfo 方法，由它来完成初始化。该方法会在 getSplits(..) 的调用过程中被调用。

    /**
     * Initialises the HCatalog input on {@code conf} from configuration
     * properties, for the case where {@code setInput} was not called by a driver.
     *
     * <p>Reads {@code LazyHCatInputFormat.databaseName},
     * {@code LazyHCatInputFormat.tableName} and the optional
     * {@code LazyHCatInputFormat.partitionFilter}, then calls {@code setInput}.
     *
     * @param conf configuration to initialise; when {@code null} nothing is done,
     *             because a configuration created here would not be visible to
     *             the caller
     * @throws IOException if a required property is missing or initialisation
     *         fails; the underlying cause is attached instead of being discarded
     */
    private static void lazyInit(Configuration conf) throws IOException {
        if (conf == null) {
            return;
        }
        try {
            // The property is not set in every JVM; only add the resource when
            // it is present instead of failing on a null path.
            String actionConfPath = System.getProperty("oozie.action.conf.xml");
            if (actionConfPath != null) {
                conf.addResource(new org.apache.hadoop.fs.Path(actionConfPath));
            }
            conf.addResource(new org.apache.hadoop.fs.Path("hive-config.xml"));

            String databaseName = conf.get("LazyHCatInputFormat.databaseName");
            String tableName = conf.get("LazyHCatInputFormat.tableName");
            String partitionFilter = conf.get("LazyHCatInputFormat.partitionFilter");

            if (databaseName == null || tableName == null) {
                throw new IOException("LazyHCatInputFormat.databaseName and "
                        + "LazyHCatInputFormat.tableName must both be set");
            }

            // A null filter selects the same input as the three-argument form.
            setInput(conf, databaseName, tableName, partitionFilter);
        } catch (IOException e) {
            throw e;
        } catch (Exception e) {
            throw new IOException("*** LAZY INIT FAILED ***", e);
        }
    }

/**
 * Returns the job information stored in the configuration under
 * {@code mapreduce.lib.hcat.job.info}, running {@code lazyInit} first when the
 * entry is not there yet.
 *
 * @param conf configuration to read from (and, if needed, initialise)
 * @return the deserialized job information
 * @throws IOException if the entry is still absent after lazy initialisation
 */
public static InputJobInfo getJobInfo(Configuration conf)
        throws IOException {
    final String jobInfoKey = "mapreduce.lib.hcat.job.info";

    String serialized = conf.get(jobInfoKey);
    if (serialized == null) {
        // Not initialised by a driver: try once from configuration properties.
        lazyInit(conf);
        serialized = conf.get(jobInfoKey);
    }
    if (serialized == null) {
        throw new IOException("job information not found in JobContext. HCatInputFormat.setInput() not called?");
    }
    return (InputJobInfo) HCatUtil.deserialize(serialized);
}

在 Oozie 的 map-reduce 动作中，配置如下。

              <!-- Input format class for the job: the lazily-initialising subclass -->
              <property>
                <name>mapreduce.job.inputformat.class</name>
                <value>com.xyz.LazyHCatInputFormat</value>
          </property>
          <!-- Database name read by lazyInit; the value shown is a placeholder -->
          <property>
                <name>LazyHCatInputFormat.databaseName</name>
                <value>HCAT DatabaseNameHere</value>
          </property>
           <!-- Table name read by lazyInit; the value shown is a placeholder -->
           <property>
                <name>LazyHCatInputFormat.tableName</name>
                <value>HCAT TableNameHere</value>
          </property>

这可能不是最好的实现，只是一个能让它先跑起来的快速变通办法。

hadoop - Mapreduce 与 HCATALOG 与 MAPR 中的 oozie 集成

1 回答 1

Related

Reference