我想使用 AWS Glue 和 Apache Hudi 删除 S3 中的一些行。当我执行以下作业时,遇到“调用 o128.save 时发生错误。非法字符:{database_name}_record”错误。我还尝试了插入和更新作业,并正确更新了 S3 中的数据。
有谁知道这个错误的原因,以及如何避免这种情况?
import sys
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from awsglue.context import GlueContext
from awsglue.job import Job
## @params: [JOB_NAME]
# Bootstrap the Glue job: resolve runtime arguments and build the Spark
# session. Hudi requires the Kryo serializer to be configured up front.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
spark = (
    SparkSession.builder
    .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    .getOrCreate()
)
glue_context = GlueContext(spark.sparkContext)
job = Job(glue_context)
job.init(args['JOB_NAME'], args)

# Location of the copy-on-write Hudi table on S3.
table_name = 'hudi_sample_cow'
bucket_name = 'cm-moriwaki-hudi-sample--datalake'
base_path = f's3://{bucket_name}/{table_name}'

# Rows identifying the records to delete — only the record key
# ('transaction_id') and partition path ('time') have to match.
columns = ["time", "transaction_id", "option"]
rows = [
    ("2020/12/20", "00007", "B"),
    ("2020/12/21", "00008", "B"),
]
delete_df = spark.createDataFrame(rows, columns)

# Hudi writer settings; 'delete' removes the records whose keys match the
# DataFrame above.
#
# NOTE(review): the reported "Illegal character: {database_name}_record"
# error implies a literal '{database_name}' string (an f-string missing its
# 'f' prefix) reached 'hoodie.table.name' in the job that actually failed:
# Hudi derives the Avro record name '<table>_record' from the table name,
# and Avro's name validation rejects '{'. The names used below are valid —
# confirm against the failing job's configuration.
writer_opts = {
    'hoodie.table.name': table_name,
    'hoodie.datasource.write.recordkey.field': 'transaction_id',
    'hoodie.datasource.write.partitionpath.field': 'time',
    'hoodie.datasource.write.table.name': table_name,
    'hoodie.datasource.write.operation': 'delete',
    'hoodie.datasource.write.precombine.field': 'option',
    'hoodie.upsert.shuffle.parallelism': 2,
    'hoodie.insert.shuffle.parallelism': 2,
}

# Deletes are issued as an append-mode write with operation=delete.
delete_df.write.format("hudi").options(**writer_opts).mode("append").save(base_path)
job.commit()