2

我有一个 data.table,其中有 2 个以上的列是 list 类型。我想展开这些列,使列表中的每个元素成为一个新列。我希望有一种比“手动”逐列展开、再把各个表连接起来更优雅的方式。

设置

编辑:(补充我用来生成 data.table 的 json)

所以我有一个json这样的文件:

[
    {
        "origins": [
            {
                "orig_lon": "14.36784",
                "orig_lat": "49.985982",
                "local_id": "AD.22045279",
                "full_address": "Věštínská 36/9, Radotín, 15300 Praha 5"
            },
            {
                "orig_lon": "14.352792",
                "orig_lat": "49.983317",
                "local_id": "AD.22055428",
                "full_address": "Otínská 1102/37, Radotín, 15300 Praha 5"
            }
        ],
        "destinations": [
            {
                "dest_lon": "14.352245",
                "dest_lat": "49.981314",
                "local_id": "AD.22045848",
                "full_address": "Zderazská 98/3, Radotín, 15300 Praha 5"
            },
            {
                "dest_lon": "14.226975",
                "dest_lat": "50.051702",
                "local_id": "AD.27261433",
                "full_address": "Západní 458, 25303 Chýně"
            }
        ],
        "destination_addresses": [
            "Zderazská 98/3, 153 00 Praha-Radotín, Czechia",
            "Západní 458, 253 01 Chýně, Czechia"
        ],
        "origin_addresses": [
            "U Jankovky 455/18, 153 00 Praha-Radotín, Czechia",
            "Otínská 1102/37, 153 00 Praha-Radotín, Czechia"
        ],
        "rows": [
            {
                "elements": [
                    {
                        "distance": {
                            "text": "1.6 km",
                            "value": 1620
                        },
                        "duration": {
                            "text": "5 mins",
                            "value": 272
                        },
                        "duration_in_traffic": {
                            "text": "5 mins",
                            "value": 277
                        },
                        "status": "OK"
                    },
                    {
                        "distance": {
                            "text": "19.3 km",
                            "value": 19313
                        },
                        "duration": {
                            "text": "22 mins",
                            "value": 1343
                        },
                        "duration_in_traffic": {
                            "text": "24 mins",
                            "value": 1424
                        },
                        "status": "OK"
                    }
                ]
            },
            {
                "elements": [
                    {
                        "distance": {
                            "text": "0.7 km",
                            "value": 691
                        },
                        "duration": {
                            "text": "2 mins",
                            "value": 101
                        },
                        "duration_in_traffic": {
                            "text": "2 mins",
                            "value": 99
                        },
                        "status": "OK"
                    },
                    {
                        "distance": {
                            "text": "18.7 km",
                            "value": 18655
                        },
                        "duration": {
                            "text": "21 mins",
                            "value": 1246
                        },
                        "duration_in_traffic": {
                            "text": "22 mins",
                            "value": 1336
                        },
                        "status": "OK"
                    }
                ]
            }               
        ],
        "status": "OK"
    },
    {
        "origins": [
            {
                "orig_lon": "14.36784",
                "orig_lat": "49.985982",
                "local_id": "AD.22045279",
                "full_address": "Věštínská 36/9, Radotín, 15300 Praha 5"
            },
            {
                "orig_lon": "14.352792",
                "orig_lat": "49.983317",
                "local_id": "AD.22055428",
                "full_address": "Otínská 1102/37, Radotín, 15300 Praha 5"
            }
        ],
        "destinations": [
            {
                "dest_lon": "14.36053",
                "dest_lat": "49.981687",
                "local_id": "AD.22047131",
                "full_address": "Zítkova 235/7, Radotín, 15300 Praha 5"
            },
            {
                "dest_lon": "14.361052",
                "dest_lat": "49.988529",
                "local_id": "AD.22054952",
                "full_address": "Strážovská 1053/33, Radotín, 15300 Praha 5"
            }
        ],
        "destination_addresses": [
            "Zítkova 235/7, 153 00 Praha-Radotín, Czechia",
            "Strážovská 1053/33, 153 00 Praha-Radotín, Czechia"
        ],
        "origin_addresses": [
            "U Jankovky 455/18, 153 00 Praha-Radotín, Czechia",
            "Otínská 1102/37, 153 00 Praha-Radotín, Czechia"
        ],
        "rows": [
            {
                "elements": [
                    {
                        "distance": {
                            "text": "1.4 km",
                            "value": 1445
                        },
                        "duration": {
                            "text": "4 mins",
                            "value": 248
                        },
                        "duration_in_traffic": {
                            "text": "4 mins",
                            "value": 247
                        },
                        "status": "OK"
                    },
                    {
                        "distance": {
                            "text": "1.9 km",
                            "value": 1933
                        },
                        "duration": {
                            "text": "4 mins",
                            "value": 264
                        },
                        "duration_in_traffic": {
                            "text": "4 mins",
                            "value": 267
                        },
                        "status": "OK"
                    }
                ]
            },
            {
                "elements": [
                    {
                        "distance": {
                            "text": "1.4 km",
                            "value": 1374
                        },
                        "duration": {
                            "text": "4 mins",
                            "value": 232
                        },
                        "duration_in_traffic": {
                            "text": "4 mins",
                            "value": 241
                        },
                        "status": "OK"
                    },
                    {
                        "distance": {
                            "text": "1.3 km",
                            "value": 1274
                        },
                        "duration": {
                            "text": "3 mins",
                            "value": 167
                        },
                        "duration_in_traffic": {
                            "text": "3 mins",
                            "value": 174
                        },
                        "status": "OK"
                    }
                ]
            }
        ],
        "status": "OK"
    }
]

我是这样读取它的:

library(jsonlite)
library(data.table)
data <- read_json('./path_to_that_json/that_json.json')

这会得到一个长度为 2 的 list。

我可以将其转换为data.table

# Convert each top-level element of `data` with as.data.table(), then stack
# the resulting tables row-wise into a single data.table.
dt <- rbindlist(lapply(data, as.data.table))

得到的 data.table 类似于:

   origins destinations                             destination_addresses                                 origin_addresses
1:  <list>       <list>     Zderazská 98/3, 153 00 Praha-Radotín, Czechia U Jankovky 455/18, 153 00 Praha-Radotín, Czechia
2:  <list>       <list>                Západní 458, 253 01 Chýne, Czechia   Otínská 1102/37, 153 00 Praha-Radotín, Czechia
3:  <list>       <list>      Zítkova 235/7, 153 00 Praha-Radotín, Czechia U Jankovky 455/18, 153 00 Praha-Radotín, Czechia
4:  <list>       <list> Strážovská 1053/33, 153 00 Praha-Radotín, Czechia   Otínská 1102/37, 153 00 Praha-Radotín, Czechia
     rows status
1: <list>     OK
2: <list>     OK
3: <list>     OK
4: <list>     OK

这意味着我有几个包含列表的列,我想扩展它们。

勉强可行的做法

我知道要扩展一列,我可以这样做:

# Add a character row id `r`. `:=` is required here: the original `r = ...`
# form is not a valid column assignment inside `[.data.table`.
dt[, r := as.character(.I)]
# Name every nested element by its row id so that rbindlist() writes that id
# into the "r" column of the stacked result (`idcol` spelled out in full
# instead of relying on partial argument matching of `id`).
res1 <- dt[, rbindlist(setNames(origins, r), idcol = "r")]

(我是在这里找到这个方法的:Expand list column of data.tables)

现在,我可以重复此调用,并通过列 r 把各个结果连接起来,从而展开多个列。这可能看起来像:

# Expand `origins` and `destinations` one after the other and join each
# expansion on the row id `r`. The id generated by rbindlist() is cast to
# character before each join; the list columns are dropped after the first one.
# `idcol` is spelled out in full instead of relying on partial matching of `id`.
res1 <- dt[dt[, rbindlist(origins, idcol = "r")][
  , `:=`(r = as.character(r))], on = "r"][
  , `:=`(origins = NULL, destinations = NULL)][dt[
    , rbindlist(destinations, idcol = "r")][
      , `:=`(r = as.character(r))], on = "r"]

这会给我所需的输出:

                               destination_addresses                                 origin_addresses   rows status r
1:     Zderazská 98/3, 153 00 Praha-Radotín, Czechia U Jankovky 455/18, 153 00 Praha-Radotín, Czechia <list>     OK 1
2:                Západní 458, 253 01 Chýne, Czechia   Otínská 1102/37, 153 00 Praha-Radotín, Czechia <list>     OK 2
3:      Zítkova 235/7, 153 00 Praha-Radotín, Czechia U Jankovky 455/18, 153 00 Praha-Radotín, Czechia <list>     OK 3
4: Strážovská 1053/33, 153 00 Praha-Radotín, Czechia   Otínská 1102/37, 153 00 Praha-Radotín, Czechia <list>     OK 4
    orig_lon  orig_lat    local_id                            full_address  dest_lon  dest_lat  i.local_id
1:  14.36784 49.985982 AD.22045279  Veštínská 36/9, Radotín, 15300 Praha 5 14.352245 49.981314 AD.22045848
2: 14.352792 49.983317 AD.22055428 Otínská 1102/37, Radotín, 15300 Praha 5 14.226975 50.051702 AD.27261433
3:  14.36784 49.985982 AD.22045279  Veštínská 36/9, Radotín, 15300 Praha 5  14.36053 49.981687 AD.22047131
4: 14.352792 49.983317 AD.22055428 Otínská 1102/37, Radotín, 15300 Praha 5 14.361052 49.988529 AD.22054952
                               i.full_address
1:     Zderazská 98/3, Radotín, 15300 Praha 5
2:                   Západní 458, 25303 Chýne
3:      Zítkova 235/7, Radotín, 15300 Praha 5
4: Strážovská 1053/33, Radotín, 15300 Praha 5

我的问题是:

有没有更优雅、更有效的方式来扩展几列?理论上,我想要一个要扩展的列列表,然后进行一次调用,这将扩展所有列并返回上述结果。

此外,对于 rows 列,展开要复杂一些:目前我的做法是先创建一个 list 类型的新列,其中不包含 status 记录。类似这样:

# Build `rows2`: for every entry of `rows`, take its first element's first
# item and keep only the distance/duration fields (no `status`).
dt[, rows2 := lapply(rows, function(x) {
  first_item <- x[[1]][[1]]
  list(
    distance = first_item["distance"],
    duration = first_item["duration"],
    duration_in_traffic = first_item["duration_in_traffic"]
  )
})]

然后可以用上面的过程把 rows2 展开为三个 list 类型的列,这些列随后又可以用同样的过程继续展开。显然,这种方法很糟糕:对于以后阅读这段代码的人来说并不直观,而且需要敲很多代码。我认为一定有更优雅的方式来解决这个问题。

4

2 回答 2

1

因此,考虑该问题的一种方法是使用 lapply 处理列表列,以分别扩展每个列并存储到 data.tables 列表中,然后一次合并列表中的所有列。

要创建扩展变量列表,您只需执行以下操作:

    # Names of the list columns that should be expanded.
    expandcols <- c("origins", "destinations")

    # One stacked data.table per list column; `idcol = "r"` adds a column "r"
    # identifying which element of the list column each output row came from.
    # (The original line had one closing parenthesis too many.)
    lapply(expandcols, function(i) rbindlist(dt[[i]], idcol = "r"))

另请注意,您的原始 r 列是一个字符向量,而 rbindlist 创建的 idcol 是一个整数,因此您需要在这里保持一致性。在我的代码中,我刚刚将您的原件转换为数字。

要合并 data.tables 列表,我喜欢像这样使用 Reduce 函数:

     Reduce(function(...) merge(...,by="keys"), list())

输出将是一个 data.table,其中您的键列是“r”,列表将是上面 lapply 调用的结果。然后,您可以使用 data.table 方式将结果与原始数据框合并。总而言之,调用看起来像这样:

    # Merge the per-column expansions on "r", then join the result with the
    # remaining (non-expanded) columns of `dt`, again on "r".
    dtfinal <- Reduce(
      function(...) merge(..., by = "r"),
      lapply(expandcols, function(i) rbindlist(dt[[i]], idcol = "r"))
    )[dt[, -expandcols, with = FALSE], on = "r"]

这是我制作的函数的代码:

    # Repeatedly expand the list columns of a table until none is left.
    #
    # X: a data.table with a join-key column `r`; it has to be comparable to
    #    the ids that rbindlist(..., idcol = "r") generates for the expansions.
    # Returns a table in which list columns that rbindlist() can stack are
    # replaced by their expanded columns, and the other list columns are
    # flattened with unlist(). A table without list columns is returned as is.
    # The caller's table is not modified.
    list_expander_fn <- function(X) {
      # One expansion pass over the list columns of Y.
      expandcols_fun <- function(Y) {
        # vapply() always returns a logical vector (sapply() returns list()
        # for empty input, which breaks which()).
        is_list_col <- vapply(Y, is.list, logical(1))
        if (!any(is_list_col)) {
          return(Y) # nothing to expand
        }
        listcols <- colnames(Y)[is_list_col]

        # Try to stack every list column; NULL marks a column that
        # rbindlist() rejects.
        listdt <- lapply(listcols, function(i) {
          tryCatch(rbindlist(Y[[i]], idcol = "r"), error = function(e) NULL)
        })
        invalid <- vapply(listdt, is.null, logical(1))

        if (any(invalid)) {
          # Columns that cannot be stacked are flattened with unlist().
          # Copy first: `:=` writes by reference and would otherwise alter
          # the table passed in by the caller.
          flatten_cols <- listcols[invalid]
          Y <- copy(Y)
          Y[, (flatten_cols) := lapply(.SD, unlist), .SDcols = flatten_cols]

          listcols <- listcols[!invalid]
          listdt <- listdt[!invalid]
        }

        if (length(listdt) == 0L) {
          # Nothing left to merge. If a column is still a list, a further
          # pass could not make progress either, so fail instead of looping.
          still_list <- vapply(Y, is.list, logical(1))
          if (any(still_list)) {
            stop(
              "Cannot expand list column(s): ",
              paste(colnames(Y)[still_list], collapse = ", "),
              call. = FALSE
            )
          }
          return(Y)
        }

        # Non-expanded columns that are joined back onto the expansions.
        origcols <- colnames(Y)[!colnames(Y) %in% listcols]
        currentdt <- Reduce(function(...) merge(..., by = "r"), listdt)
        currentdt[Y[, origcols, with = FALSE], on = "r"]
      }

      repeat {
        currentexpand <- expandcols_fun(X)
        # Stop as soon as no list column remains.
        if (!any(vapply(currentexpand, is.list, logical(1)))) {
          break
        }
        X <- currentexpand # expand the newly exposed list columns next
      }

      currentexpand
    }

它可以工作,但是由于最终字段名称(文本和值),变量名称存在问题。如果你喜欢它的发展方向,我可能会稍微修改一下。它适用于“rows2”,但不适用于“rows”。调用它的代码当然很简单:

    finaldt<-list_expander_fn(dt)

这有助于回答你的问题吗?如果您希望我在解释中添加任何内容,请告诉我。祝你好运!

于 2018-12-29T19:23:04.390 回答
1

与其在 data.table 里折腾,不如考虑直接从 json 数据对象构建数据表。json 通常会被导入为由数据框或其他列表组成的高度嵌套的列表,因此您需要根据各层级条目的路径逐一提取:

library(jsonlite)
library(data.table)

json_data <- read_json('/path/to/posted.json')

# Build one data frame per top-level JSON record.
df_list <- lapply(json_data, function(item) {
  # Nested level: one data frame row per origin / destination entry.
  origins_df <- do.call(rbind, lapply(item$origins, data.frame))
  destinations_df <- do.call(rbind, lapply(item$destinations, data.frame))

  data.frame(
    # Top level: the address vectors.
    origin_address = unlist(item$origin_addresses),
    destination_address = unlist(item$destination_addresses),
    origins_df,
    destinations_df
  )
})

# Stack the per-record frames: once as a data frame, once as a data.table.
final_df <- do.call(rbind, df_list)
final_dt <- rbindlist(df_list)

输出 (确保将 full_address 和 local_id 字段重命名为 origin_ 或 destination_)

final_dt

#                                      origin_address                               destination_address  orig_lon  orig_lat    local_id
# 1: U Jankovky 455/18, 153 00 Praha-Radotín, Czechia     Zderazská 98/3, 153 00 Praha-Radotín, Czechia  14.36784 49.985982 AD.22045279
# 2:   Otínská 1102/37, 153 00 Praha-Radotín, Czechia                Západní 458, 253 01 Chýně, Czechia 14.352792 49.983317 AD.22055428
# 3: U Jankovky 455/18, 153 00 Praha-Radotín, Czechia      Zítkova 235/7, 153 00 Praha-Radotín, Czechia  14.36784 49.985982 AD.22045279
# 4:   Otínská 1102/37, 153 00 Praha-Radotín, Czechia Strážovská 1053/33, 153 00 Praha-Radotín, Czechia 14.352792 49.983317 AD.22055428
#                               full_address  dest_lon  dest_lat  local_id.1                             full_address.1
# 1:  Věštínská 36/9, Radotín, 15300 Praha 5 14.352245 49.981314 AD.22045848     Zderazská 98/3, Radotín, 15300 Praha 5
# 2: Otínská 1102/37, Radotín, 15300 Praha 5 14.226975 50.051702 AD.27261433                   Západní 458, 25303 Chýně
# 3:  Věštínská 36/9, Radotín, 15300 Praha 5  14.36053 49.981687 AD.22047131      Zítkova 235/7, Radotín, 15300 Praha 5
# 4: Otínská 1102/37, Radotín, 15300 Praha 5 14.361052 49.988529 AD.22054952 Strážovská 1053/33, Radotín, 15300 Praha 5
于 2018-12-30T02:44:27.767 回答