0

我想使用 AWS Glue 和 Apache Hudi 删除 S3 中的一些行。当我执行以下作业时,遇到“调用 o128.save 时发生错误。非法字符:{database_name}_record”错误。我还尝试了插入和更新作业,它们都正确更新了 S3 中的数据。

有谁知道这个错误的原因,以及如何避免这种情况?

import sys
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from awsglue.context import GlueContext
from awsglue.job import Job

## @params: [JOB_NAME]
# Resolve the job name passed in by the Glue job runner.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Hudi's Spark integration requires the Kryo serializer.
spark = (
    SparkSession.builder
    .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    .getOrCreate()
)
sc = spark.sparkContext
glueContext = GlueContext(sc)
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Location of the Hudi copy-on-write table on S3.
table_name = 'hudi_sample_cow'
bucket_name = 'cm-moriwaki-hudi-sample--datalake'
base_path = f's3://{bucket_name}/{table_name}'

# Records identifying the rows to delete (matched by record key / partition).
columns = ["time", "transaction_id", "option"]
rows = [
    ("2020/12/20", "00007", "B"),
    ("2020/12/21", "00008", "B"),
]
delete_df = spark.createDataFrame(rows, columns)

# Hudi write configuration: 'delete' operation removes rows whose
# record key ('transaction_id') and partition path ('time') match.
hudi_options = {
  'hoodie.table.name': table_name,
  'hoodie.datasource.write.recordkey.field': 'transaction_id',
  'hoodie.datasource.write.partitionpath.field': 'time',
  'hoodie.datasource.write.table.name': table_name,
  'hoodie.datasource.write.operation': 'delete',
  'hoodie.datasource.write.precombine.field': 'option',
  'hoodie.upsert.shuffle.parallelism': 2,
  'hoodie.insert.shuffle.parallelism': 2,
}

# Append-mode write: deletes are applied as a new commit on the table.
delete_df.write.format("hudi").options(**hudi_options).mode("append").save(base_path)

job.commit()
4

0 回答 0