我在 S3 中有包含多行 JSON 的文件(由换行符分隔)。我想将这些文件转换为列格式供AWS Athena使用
我正在按照转换为列格式的指南来执行此操作,但是当转换为 ORC 时,S3 中的分区约定会丢失。
在此示例中,您如何保留dt
已转换为 parquet s3 文件夹结构中的分区?当我运行示例时,它只是输出s3://myBucket/pq/000000_0
而不是s3://myBucket/pq/dt=2009-04-14-04-05/000000_0
这是设置接口以将 JSON 带入 Hive 表的 HQL:
CREATE EXTERNAL TABLE impressions (
requestBeginTime string,
adId string,
impressionId string,
referrer string,
userAgent string,
userCookie string,
ip string,
number string,
processId string,
browserCookie string,
requestEndTime string,
timers struct<modelLookup:string, requestTime:string>,
threadId string,
hostname string,
sessionId string)
PARTITIONED BY (dt string)
ROW FORMAT serde 'org.apache.hive.hcatalog.data.JsonSerDe'
with serdeproperties ( 'paths'='requestBeginTime, adId, impressionId, referrer, userAgent, userCookie, ip' )
LOCATION 's3://us-east-1.elasticmapreduce/samples/hive-ads/tables/impressions' ;
msck repair table impressions;
这是转换为 Parquet 的 HQL
CREATE EXTERNAL TABLE parquet_hive (
requestBeginTime string,
adId string,
impressionId string,
referrer string,
userAgent string,
userCookie string,
ip string)
STORED AS PARQUET
LOCATION 's3://mybucket/pq/';
INSERT OVERWRITE TABLE parquet_hive SELECT requestbegintime,adid,impressionid,referrer,useragent,usercookie,ip FROM impressions where dt='2009-04-14-04-05';