目前,日志存储在 DynamoDB 中。我们希望从该表中过滤掉不需要的行,并将结果存储到另一张表中(例如,排除 “value” 字段包含 “bot”、“python”、“requests” 等内容的行)。
目前我写出了如下配置(AWS Data Pipeline 模板):
{
  "objects": [
    {
      "id": "Default",
      "name": "Default",
      "scheduleType": "cron",
      "schedule": {
        "ref": "DefaultSchedule"
      },
      "failureAndRerunMode": "CASCADE",
      "role": "DataPipelineDefaultRole",
      "resourceRole": "DataPipelineDefaultResourceRole",
      "pipelineLogUri": "s3://ti-labs-ml-data/logs/"
    },
    {
      "id": "DefaultSchedule",
      "name": "RunOnce",
      "type": "Schedule",
      "startAt": "FIRST_ACTIVATION_DATE_TIME",
      "period": "1 Day",
      "occurrences": "1"
    },
    {
      "id": "EmrClusterForBackup",
      "name": "EmrClusterForBackup",
      "type": "EmrCluster",
      "amiVersion": "3.3.2",
      "masterInstanceType": "m1.medium",
      "coreInstanceType": "m1.medium",
      "coreInstanceCount": "1",
      "terminateAfter": "2 Hours"
    },
    {
      "id": "DDBSourceFormat",
      "name": "DDBSourceFormat",
      "type": "DynamoDBExportDataFormat",
      "column": "value STRING"
    },
    {
      "id": "DDBExportFormat",
      "name": "DDBExportFormat",
      "type": "DynamoDBExportDataFormat"
    },
    {
      "id": "DDBSourceTable",
      "name": "DDBSourceTable",
      "type": "DynamoDBDataNode",
      "tableName": "#{myDDBTableName}",
      "readThroughputPercent": "#{myDDBReadThroughputRatio}",
      "dataFormat": {
        "ref": "DDBSourceFormat"
      }
    },
    {
      "id": "S3BackupLocation",
      "name": "S3BackupLocation",
      "type": "S3DataNode",
      "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}",
      "dataFormat": {
        "ref": "DDBExportFormat"
      }
    },
    {
      "id": "TableBackupActivity",
      "name": "TableBackupActivity",
      "type": "HiveCopyActivity",
      "input": {
        "ref": "DDBSourceTable"
      },
      "output": {
        "ref": "S3BackupLocation"
      },
      "filterSql": "lower(`value`) NOT LIKE '%bot%' AND lower(`value`) NOT LIKE '%python%' AND lower(`value`) NOT LIKE '%requests%'",
      "runsOn": {
        "ref": "EmrClusterForBackup"
      },
      "resizeClusterBeforeRunning": "true"
    }
  ],
  "parameters": [
    {
      "id": "myOutputS3Loc",
      "type": "AWS::S3::ObjectKey",
      "description": "Output S3 folder"
    },
    {
      "id": "myDDBReadThroughputRatio",
      "type": "Double",
      "description": "DynamoDB read throughput ratio",
      "default": "0.2",
      "watermark": "Enter value between 0.1-1.0"
    },
    {
      "id": "myDDBTableName",
      "type": "String",
      "description": "DynamoDB table name"
    }
  ],
  "values": {
    "myDDBTableName": "TI-LABS-DDB-A",
    "myDDBReadThroughputRatio": "0.2",
    "myOutputS3Loc": "s3://ti-labs-ml-data/"
  }
}
但是我不明白过滤查询(filterSql)应该怎么写(我试过一个,它报错说该行只有 “item” 这一列,而我的表有 2 个字段:id 和 value)。