43

有没有办法将重复项保存在 Hive 的收集集中,或者使用其他方法模拟 Hive 提供的那种聚合集合?我想将列中具有相同键的所有项目聚合到一个数组中,并带有重复项。

IE:

hash_id | num_of_cats
=====================
ad3jkfk            4
ad3jkfk            4
ad3jkfk            2
fkjh43f            1
fkjh43f            8
fkjh43f            8
rjkhd93            7
rjkhd93            4
rjkhd93            7

应该返回:

hash_agg | cats_aggregate
===========================
ad3jkfk   Array<int>(4,4,2)
fkjh43f   Array<int>(1,8,8)
rjkhd93   Array<int>(7,4,7)
4

9 回答 9

37

尝试在 Hive 0.13.0 之后使用 COLLECT_LIST(col)

SELECT
    hash_id, COLLECT_LIST(num_of_cats) AS aggr_set
FROM
    tablename
WHERE
    blablabla
GROUP BY
    hash_id
;
于 2014-11-11T02:28:04.517 回答
22

没有内置任何东西,但是创建用户定义的函数,包括聚合,并没有那么糟糕。唯一粗糙的部分是试图使它们类型通用,但这里是一个收集示例。

package com.example;

import java.util.ArrayList;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

public class CollectAll extends AbstractGenericUDAFResolver
{
    @Override
    public GenericUDAFEvaluator getEvaluator(TypeInfo[] tis)
            throws SemanticException
    {
        if (tis.length != 1)
        {
            throw new UDFArgumentTypeException(tis.length - 1, "Exactly one argument is expected.");
        }
        if (tis[0].getCategory() != ObjectInspector.Category.PRIMITIVE)
        {
            throw new UDFArgumentTypeException(0, "Only primitive type arguments are accepted but " + tis[0].getTypeName() + " was passed as parameter 1.");
        }
        return new CollectAllEvaluator();
    }

    public static class CollectAllEvaluator extends GenericUDAFEvaluator
    {
        private PrimitiveObjectInspector inputOI;
        private StandardListObjectInspector loi;
        private StandardListObjectInspector internalMergeOI;

        @Override
        public ObjectInspector init(Mode m, ObjectInspector[] parameters)
                throws HiveException
        {
            super.init(m, parameters);
            if (m == Mode.PARTIAL1)
            {
                inputOI = (PrimitiveObjectInspector) parameters[0];
                return ObjectInspectorFactory
                        .getStandardListObjectInspector((PrimitiveObjectInspector) ObjectInspectorUtils
                        .getStandardObjectInspector(inputOI));
            }
            else
            {
                if (!(parameters[0] instanceof StandardListObjectInspector))
                {
                    inputOI = (PrimitiveObjectInspector)  ObjectInspectorUtils
                            .getStandardObjectInspector(parameters[0]);
                    return (StandardListObjectInspector) ObjectInspectorFactory
                            .getStandardListObjectInspector(inputOI);
                }
                else
                {
                    internalMergeOI = (StandardListObjectInspector) parameters[0];
                    inputOI = (PrimitiveObjectInspector) internalMergeOI.getListElementObjectInspector();
                    loi = (StandardListObjectInspector) ObjectInspectorUtils.getStandardObjectInspector(internalMergeOI);
                    return loi;
                }
            }
        }

        static class ArrayAggregationBuffer implements AggregationBuffer
        {
            ArrayList<Object> container;
        }

        @Override
        public void reset(AggregationBuffer ab)
                throws HiveException
        {
            ((ArrayAggregationBuffer) ab).container = new ArrayList<Object>();
        }

        @Override
        public AggregationBuffer getNewAggregationBuffer()
                throws HiveException
        {
            ArrayAggregationBuffer ret = new ArrayAggregationBuffer();
            reset(ret);
            return ret;
        }

        @Override
        public void iterate(AggregationBuffer ab, Object[] parameters)
                throws HiveException
        {
            assert (parameters.length == 1);
            Object p = parameters[0];
            if (p != null)
            {
                ArrayAggregationBuffer agg = (ArrayAggregationBuffer) ab;
                agg.container.add(ObjectInspectorUtils.copyToStandardObject(p, this.inputOI));
            }
        }

        @Override
        public Object terminatePartial(AggregationBuffer ab)
                throws HiveException
        {
            ArrayAggregationBuffer agg = (ArrayAggregationBuffer) ab;
            ArrayList<Object> ret = new ArrayList<Object>(agg.container.size());
            ret.addAll(agg.container);
            return ret;
        }

        @Override
        public void merge(AggregationBuffer ab, Object o)
                throws HiveException
        {
            ArrayAggregationBuffer agg = (ArrayAggregationBuffer) ab;
            ArrayList<Object> partial = (ArrayList<Object>)internalMergeOI.getList(o);
            for(Object i : partial)
            {
                agg.container.add(ObjectInspectorUtils.copyToStandardObject(i, this.inputOI));
            }
        }

        @Override
        public Object terminate(AggregationBuffer ab)
                throws HiveException
        {
            ArrayAggregationBuffer agg = (ArrayAggregationBuffer) ab;
            ArrayList<Object> ret = new ArrayList<Object>(agg.container.size());
            ret.addAll(agg.container);
            return ret;
        }
    }
}

然后在蜂巢中,只需发出add jar Whatever.jar;CREATE TEMPORARY FUNCTION collect_all AS 'com.example.CollectAll'; 您应该能够按预期使用它。

hive> SELECT hash_id, collect_all(num_of_cats) FROM test GROUP BY hash_id;
OK
ad3jkfk [4,4,2]
fkjh43f [1,8,8]
rjkhd93 [7,4,7]

值得注意的是,元素的顺序应该被认为是未定义的,因此如果您打算使用它来将信息输入 n_grams,您可能需要稍微扩展它以根据需要对数据进行排序。

于 2011-07-02T09:38:56.610 回答
12

修改了 Jeff Mc 的代码以删除输入必须是原始类型的限制(可能继承自 collect_set)。这个版本可以收集结构、映射和数组以及原语。

package com.example;

import java.util.ArrayList;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

public class CollectAll extends AbstractGenericUDAFResolver
{
    @Override
    public GenericUDAFEvaluator getEvaluator(TypeInfo[] tis)
            throws SemanticException
    {
        if (tis.length != 1)
        {
            throw new UDFArgumentTypeException(tis.length - 1, "Exactly one argument is expected.");
        }
        return new CollectAllEvaluator();
    }

    public static class CollectAllEvaluator extends GenericUDAFEvaluator
    {
        private ObjectInspector inputOI;
        private StandardListObjectInspector loi;
        private StandardListObjectInspector internalMergeOI;

        @Override
        public ObjectInspector init(Mode m, ObjectInspector[] parameters)
                throws HiveException
        {
            super.init(m, parameters);
            if (m == Mode.PARTIAL1)
            {
                inputOI = parameters[0];
                return ObjectInspectorFactory
                        .getStandardListObjectInspector(ObjectInspectorUtils
                        .getStandardObjectInspector(inputOI));
            }
            else
            {
                if (!(parameters[0] instanceof StandardListObjectInspector))
                {
                    inputOI = ObjectInspectorUtils
                            .getStandardObjectInspector(parameters[0]);
                    return (StandardListObjectInspector) ObjectInspectorFactory
                            .getStandardListObjectInspector(inputOI);
                }
                else
                {
                    internalMergeOI = (StandardListObjectInspector) parameters[0];
                    inputOI = internalMergeOI.getListElementObjectInspector();
                    loi = (StandardListObjectInspector) ObjectInspectorUtils.getStandardObjectInspector(internalMergeOI);
                    return loi;
                }
            }
        }

        static class ArrayAggregationBuffer implements AggregationBuffer
        {
            ArrayList<Object> container;
        }

        @Override
        public void reset(AggregationBuffer ab)
                throws HiveException
        {
            ((ArrayAggregationBuffer) ab).container = new ArrayList<Object>();
        }

        @Override
        public AggregationBuffer getNewAggregationBuffer()
                throws HiveException
        {
            ArrayAggregationBuffer ret = new ArrayAggregationBuffer();
            reset(ret);
            return ret;
        }

        @Override
        public void iterate(AggregationBuffer ab, Object[] parameters)
                throws HiveException
        {
            assert (parameters.length == 1);
            Object p = parameters[0];
            if (p != null)
            {
                ArrayAggregationBuffer agg = (ArrayAggregationBuffer) ab;
                agg.container.add(ObjectInspectorUtils.copyToStandardObject(p, this.inputOI));
            }
        }

        @Override
        public Object terminatePartial(AggregationBuffer ab)
                throws HiveException
        {
            ArrayAggregationBuffer agg = (ArrayAggregationBuffer) ab;
            ArrayList<Object> ret = new ArrayList<Object>(agg.container.size());
            ret.addAll(agg.container);
            return ret;
        }

        @Override
        public void merge(AggregationBuffer ab, Object o)
                throws HiveException
        {
            ArrayAggregationBuffer agg = (ArrayAggregationBuffer) ab;
            ArrayList<Object> partial = (ArrayList<Object>)internalMergeOI.getList(o);
            for(Object i : partial)
            {
                agg.container.add(ObjectInspectorUtils.copyToStandardObject(i, this.inputOI));
            }
        }

        @Override
        public Object terminate(AggregationBuffer ab)
                throws HiveException
        {
            ArrayAggregationBuffer agg = (ArrayAggregationBuffer) ab;
            ArrayList<Object> ret = new ArrayList<Object>(agg.container.size());
            ret.addAll(agg.container);
            return ret;
        }
    }
}
于 2012-12-14T18:00:26.637 回答
12

从 hive 0.13 开始,有一个名为的内置 UDAFcollect_list()可以实现此目的。见这里

于 2013-12-27T20:24:08.213 回答
2

查看 Brickhouse 收集 UDAF ( http://github.com/klout/brickhouse/blob/master/src/main/java/brickhouse/udf/collect/CollectUDAF.java )

它还支持收集成地图。Brickhouse 还包含许多有用的 UDF,它们不在标准 Hive 发行版中。

于 2014-07-10T17:41:19.427 回答
1

这是执行此工作的确切 hive 查询(仅在 hive > 0.13 中有效):

SELECT hash_id, collect_set(num_of_cats) FROM GROUP BY hash_id;

于 2014-06-10T07:25:23.127 回答
1

对于它的价值(尽管我知道这是一篇较旧的帖子),Hive 0.13.0具有一个新的collect_list()函数,它不会重复数据删除。

于 2014-07-08T22:40:16.350 回答
0

收集结构的解决方法

假设你有一张桌子

tableWithStruct(
id string,
obj struct <a:string,b:string>)

现在创建另一个表

CREATE EXTERNAL TABLE tablename (
id string,
temp array<string>
)
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ',' MAP KEYS TERMINATED BY '|'

插入查询

insert into table tablename select id,collect(concat_ws('|',cast(obj.a as string),cast(obj.b as string)) from tableWithStruct group by id;

现在在与表名相同的位置创建另一个

CREATE EXTERNAL TABLE tablename_final (
id string,
array_list array<struct<a:string,b:string>>
)
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ',' MAP KEYS TERMINATED BY '|'

当您从tablename_final中选择时,您将获得所需的输出

于 2015-06-05T08:49:28.647 回答
-1

只是想知道 - 如果 n 声明 -

SELECT
    hash_id, COLLECT_LIST(num_of_cats) AS aggr_set
FROM
    tablename
WHERE
    blablabla
GROUP BY
    hash_id
;

我们想要对 num_of_cats 的元素进行排序和限制 - 怎么做?COZ 在大数据中,我们处理 PB 的数据。在这种情况下,我们可能不需要所有这些,但前 10 名或限制它。

于 2015-12-17T11:32:52.693 回答