1

我有一个json文件如下:

{
  "123": [
    {
      "id": "123",
      "info": {
        "op": {
          "m": 1,
          "q": 2
        },
        "li": [
          "a",
          "b"
        ],
        "ad": [
          {
            "m": 1,
            "q": 2,
            "t": "text"
          },
          {
            "m": 1,
            "q": 2,
            "t": "abc"
          }
        ]
      },
      "dt": 1532494800000,
      "et": 1532494800000
    },
    {
      "id": "123",
      "info": {
        "op": {
          "m": 2,
          "q": 1
        },
        "li": [
          "a",
          "b"
        ],
        "ad": [
          {
            "m": 2,
            "q": 1,
            "t": "atext"
          },
          {
            "m": 10,
            "q": 2,
            "t": "abc"
          }
        ]
      },
      "dt": 1532494800000,
      "et": 1532494800000
    }
  ]
}

由于 json 对象以变量开头,我该如何为此编写模式?对于文件中的每个 json,spark 都会创建新的模式对象。不是性能瓶颈吗?

json 以非结构化形式存在于文件中,例如

{"123":[{"id":"123","info":{"op":{"m":1,"q":2},"li":["a","b"],"ad":[{"m":1,"q":2,"t":"text"},{"m":1,"q":2,"t":"abc"}]},"dt":1532494800000,"et":1532494800000},{"id":"123","info":{"op":{"m":2,"q":1},"li":["a","b"],"ad":[{"m":2,"q":1,"t":"atext"},{"m":10,"q":2,"t":"abc"}]},"dt":1532494800000,"et":1532494800000}]}

每个新行都包含一个 json 对象。这是我到目前为止所拥有的:

public JavaRDD<MyObject> parseRecordFile(String path) {
    // Load every JSON line of the file as a Spark Row, then map each Row
    // into a MyObject via the static parser. Returns one MyObject per record.
    return getJsonRdd(path).map(JsonReader::parseJsonStructure);
  }

  public void jsonSchemaSpark() {
    // Placeholder: intended to build an explicit StructType schema so Spark
    // does not have to infer one per file. Not yet implemented — the question
    // is how to declare a schema when the top-level JSON key is variable.
  }

  private JavaRDD<Row> getJsonRdd(String path) {
    // Use Spark's built-in JSON data source (one JSON object per line) and
    // hand the result back as an RDD of generic Rows.
    // NOTE(review): without an explicit schema, Spark infers one by scanning
    // the data — confirm whether that inference cost matters here.
    Dataset<Row> dataset = sparkSession.read().format("json").load(path);
    JavaRDD<Row> rows = dataset.toJavaRDD();
    return rows;
  }

  private static MyObject parseJsonStructure(Row row) {
    // Debug-only: dump the incoming Row so the inferred structure can be
    // inspected in the logs. The actual field extraction is not written yet.
    log.info("Row starting");
    log.info("One row {}", row);
    log.info("Row end");
    MyObject parsed = new MyObject();
    return parsed;
  }

这里的一个 Row 是否就对应文件中每一行的那个 json 对象?

4

0 回答 0