1

我正在自己的 Google Cloud 环境中设置 CDAP，但在执行以下管道时遇到了问题：该管道在 BigQuery 上运行查询，并将结果以 CSV 文件的形式保存到 Google Cloud Storage。

我的过程是:

  1. 使用 Google Marketplace 上的 CDAP OSS 映像安装 CDAP。

  2. 构建以下管道:

{
    "artifact": {
        "name": "cdap-data-pipeline",
        "version": "6.0.0",
        "scope": "SYSTEM"
    },
    "description": "Data Pipeline Application",
    "name": "cdap_dsc_test",
    "config": {
        "resources": {
            "memoryMB": 2048,
            "virtualCores": 1
        },
        "driverResources": {
            "memoryMB": 2048,
            "virtualCores": 1
        },
        "connections": [
            {
                "from": "BigQuery",
                "to": "Google Cloud Storage"
            }
        ],
        "comments": [],
        "postActions": [],
        "properties": {},
        "processTimingEnabled": true,
        "stageLoggingEnabled": true,
        "stages": [
            {
                "name": "BigQuery",
                "plugin": {
                    "name": "BigQueryTable",
                    "type": "batchsource",
                    "label": "BigQuery",
                    "artifact": {
                        "name": "google-cloud",
                        "version": "0.12.2",
                        "scope": "SYSTEM"
                    },
                    "properties": {
                        "project": "bi-data-science",
                        "serviceFilePath": "/home/ubuntu/bi-data-science-cdap-4cbf526de374.json",
                        "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"destination_name\",\"type\":[\"string\",\"null\"]},{\"name\":\"destination_country\",\"type\":[\"string\",\"null\"]},{\"name\":\"timestamp\",\"type\":[\"double\",\"null\"]},{\"name\":\"desktop\",\"type\":[\"double\",\"null\"]},{\"name\":\"tablet\",\"type\":[\"double\",\"null\"]},{\"name\":\"mobile\",\"type\":[\"double\",\"null\"]}]}",
                        "referenceName": "test_tables",
                        "dataset": "google_trends",
                        "table": "devices"
                    }
                },
                "outputSchema": [
                    {
                        "name": "etlSchemaBody",
                        "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"destination_name\",\"type\":[\"string\",\"null\"]},{\"name\":\"destination_country\",\"type\":[\"string\",\"null\"]},{\"name\":\"timestamp\",\"type\":[\"double\",\"null\"]},{\"name\":\"desktop\",\"type\":[\"double\",\"null\"]},{\"name\":\"tablet\",\"type\":[\"double\",\"null\"]},{\"name\":\"mobile\",\"type\":[\"double\",\"null\"]}]}"
                    }
                ]
            },
            {
                "name": "Google Cloud Storage",
                "plugin": {
                    "name": "GCS",
                    "type": "batchsink",
                    "label": "Google Cloud Storage",
                    "artifact": {
                        "name": "google-cloud",
                        "version": "0.12.2",
                        "scope": "SYSTEM"
                    },
                    "properties": {
                        "project": "bi-data-science",
                        "suffix": "yyyy-MM-dd",
                        "format": "json",
                        "serviceFilePath": "/home/ubuntu/bi-data-science-cdap-4cbf526de374.json",
                        "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"destination_name\",\"type\":[\"string\",\"null\"]},{\"name\":\"destination_country\",\"type\":[\"string\",\"null\"]},{\"name\":\"timestamp\",\"type\":[\"double\",\"null\"]},{\"name\":\"desktop\",\"type\":[\"double\",\"null\"]},{\"name\":\"tablet\",\"type\":[\"double\",\"null\"]},{\"name\":\"mobile\",\"type\":[\"double\",\"null\"]}]}",
                        "delimiter": ",",
                        "referenceName": "gcs_cdap",
                        "path": "gs://hurb_sandbox/cdap_experiments/"
                    }
                },
                "outputSchema": [
                    {
                        "name": "etlSchemaBody",
                        "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"destination_name\",\"type\":[\"string\",\"null\"]},{\"name\":\"destination_country\",\"type\":[\"string\",\"null\"]},{\"name\":\"timestamp\",\"type\":[\"double\",\"null\"]},{\"name\":\"desktop\",\"type\":[\"double\",\"null\"]},{\"name\":\"tablet\",\"type\":[\"double\",\"null\"]},{\"name\":\"mobile\",\"type\":[\"double\",\"null\"]}]}"
                    }
                ],
                "inputSchema": [
                    {
                        "name": "BigQuery",
                        "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"destination_name\",\"type\":[\"string\",\"null\"]},{\"name\":\"destination_country\",\"type\":[\"string\",\"null\"]},{\"name\":\"timestamp\",\"type\":[\"double\",\"null\"]},{\"name\":\"desktop\",\"type\":[\"double\",\"null\"]},{\"name\":\"tablet\",\"type\":[\"double\",\"null\"]},{\"name\":\"mobile\",\"type\":[\"double\",\"null\"]}]}"
                    }
                ]
            }
        ],
        "schedule": "0 * * * *",
        "engine": "mapreduce",
        "numOfRecordsPreview": 100,
        "description": "Data Pipeline Application",
        "maxConcurrentRuns": 1
    }
}

凭据密钥具有所有者权限,我可以使用“预览”选项访问查询结果。

管道结果:

文件:

  • _SUCCESS(空)
  • part-r-00000(查询结果)

没有生成 CSV 文件，而且我在 CDAP 中也找不到可以设置输出文件名的地方。我是否遗漏了某个配置步骤？

更新:

我们最终放弃了 CDAP,而使用了 Google DataFlow。

4

1 回答 1

1

在管道中配置 GCS 接收器（sink）时，有一个“格式”字段（即管道 JSON 中 GCS 插件的 "format" 属性），您已将其设置为 json。将其改为 csv，即可得到您想要的输出格式。

于 2019-11-24T17:27:05.060 回答