0

我正在使用 R 中的 h2o 包并尝试进行一些数据操作,但在使用 sub/gsub 函数时遇到了一些问题。

这是我的代码:

library(h2o)

# Start cluster: initialise H2O with nthreads = 2 and keep the returned
# handle in `localH2O`, which is passed to as.h2o() further down.
localH2O <- h2o.init(nthreads = 2)

# Create data set: a 10-row data.frame with 24 columns, written out as a
# literal `structure()` call (this looks like dput() output) so that the
# example is self-contained.
# Every column, including numeric-looking ones such as `click` and `hour`,
# is stored as character strings.
# NOTE(review): the sixth `id` value is "1.1362181921561e+19" (scientific
# notation), unlike the other ids -- presumably precision was lost in an
# earlier numeric conversion; verify against the original data.
dat1.mini <- structure(list(id = c("7927751403363142656", "18236986451472797696", 
"5654946373641778176", "14195690822403907584", "1693303484298446848", 
"1.1362181921561e+19", "11694645532962195456", "1221431312630614784", 
"1987127670789791488", "379819848497418688"), click = c("0", 
"0", "0", "0", "0", "0", "0", "1", "0", "0"), hour = c("14102118", 
"14102217", "14102812", "14102912", "14102820", "14102401", "14102117", 
"14102312", "14102301", "14102414"), C1 = c("1005", "1005", "1005", 
"1002", "1005", "1005", "1005", "1005", "1005", "1005"), banner_pos = c("1", 
"1", "0", "0", "0", "0", "1", "1", "0", "0"), site_id = c("b7e9786d", 
"e151e245", "85f751fd", "ee4c822c", "85f751fd", "85f751fd", "e5c60a05", 
"e151e245", "1fbe01fe", "1fbe01fe"), site_domain = c("b12b9f85", 
"7e091613", "c4e18dd6", "c4e18dd6", "c4e18dd6", "c4e18dd6", "7256c623", 
"7e091613", "f3845767", "f3845767"), site_category = c("f028772b", 
"f028772b", "50e219e0", "50e219e0", "50e219e0", "50e219e0", "f028772b", 
"f028772b", "28905ebd", "28905ebd"), app_id = c("ecad2386", "ecad2386", 
"685d1c4c", "ecad2386", "92f5800b", "f02cb7ab", "ecad2386", "ecad2386", 
"ecad2386", "ecad2386"), app_domain = c("7801e8d9", "7801e8d9", 
"2347f47a", "7801e8d9", "ae637522", "2347f47a", "7801e8d9", "7801e8d9", 
"7801e8d9", "7801e8d9"), app_category = c("07d7df22", "07d7df22", 
"8ded1f7a", "07d7df22", "0f2161f8", "f95efa07", "07d7df22", "07d7df22", 
"07d7df22", "07d7df22"), device_id = c("a99f214a", "a99f214a", 
"a99f214a", "8374cacf", "a99f214a", "8a5908a5", "a99f214a", "a99f214a", 
"a99f214a", "a99f214a"), device_ip = c("3214d61e", "d5623936", 
"419e166e", "698846d6", "c2d9c2f2", "40817190", "edd10fc1", "e4c6e857", 
"05d3adbe", "6929d972"), device_model = c("a0f5f879", "69f9dd0e", 
"46a414f4", "12edfe21", "4ffd3a7e", "04f5b394", "779d90c2", "1f0bc64f", 
"293291c1", "d787e91b"), device_type = c("1", "1", "1", "0", 
"1", "1", "1", "1", "1", "1"), device_conn_type = c("0", "0", 
"3", "0", "3", "0", "0", "0", "0", "0"), C14 = c("16208", "20277", 
"23224", "17566", "21189", "20633", "19771", "17264", "15703", 
"20108"), C15 = c("320", "320", "320", "320", "320", "320", "320", 
"320", "320", "320"), C16 = c("50", "50", "50", "50", "50", "50", 
"50", "50", "50", "50"), C17 = c("1800", "2281", "2676", "479", 
"2424", "2374", "2227", "1872", "1722", "2299"), C18 = c("3", 
"3", "0", "3", "1", "3", "0", "3", "0", "2"), C19 = c("167", 
"47", "35", "39", "161", "39", "679", "39", "35", "1327"), C20 = c("100077", 
"100181", "100176", "100074", "100189", "-1", "100074", "-1", 
"-1", "-1"), C21 = c("23", "42", "221", "23", "71", "23", "48", 
"23", "79", "52")), .Names = c("id", "click", "hour", "C1", "banner_pos", 
"site_id", "site_domain", "site_category", "app_id", "app_domain", 
"app_category", "device_id", "device_ip", "device_model", "device_type", 
"device_conn_type", "C14", "C15", "C16", "C17", "C18", "C19", 
"C20", "C21"), row.names = c(NA, 10L), class = "data.frame")

# Load data to cluster: pass the local data.frame to as.h2o() together with
# the `localH2O` handle obtained from h2o.init(); the result is used below
# as the H2O-side data set.
dat.mini.hex <- as.h2o(localH2O, dat1.mini)

# Attempt to grab substring of first 6 characters from hour column.
# Attempt 1: capture the first 6 characters and replace the whole string
# with a backreference to that capture group.
dat.mini.hex$hr <- h2o.sub('^(.{6}).*$','\\1', dat.mini.hex$hour)
# Attempt 2: capture everything except the trailing two characters and
# replace the match with that capture group.
# Both attempts fail with the NullPointerException reported below.
dat.mini.hex$hr <- h2o.gsub('(.+)..','\\1', dat.mini.hex$hour)

所有这些尝试都会导致以下错误:

Error in .h2o.__remoteSend(client, .h2o.__PAGE_EXEC2, str = expr) : 
  http://127.0.0.1:54321/2/Exec2.json  returned the following error:
   class java.lang.NullPointerException
4

1 回答 1

2

发生错误是因为 hour 是数值列。函数 h2o.sub 和 h2o.gsub 不适用于数值数据。

命令 str(dat.mini.hex$hour) 会显示 hour 是一个数值列。

# Inspect the structure of the `hour` column to check its type.
str(dat.mini.hex$hour)

您可以将 hour 转换为因子,并将结果保存在新列 hour2 中。

# Convert `hour` to a factor and store it in a new column, `hour2`.
dat.mini.hex$hour2 <- as.factor(dat.mini.hex$hour)

现在,您可以使用 h2o.sub 了。但是,我想您不会喜欢这个结果……

# Same substitution as in the question, now applied to the factor column.
# The printed output below shows the literal replacement text rather than
# the contents of the first capture group.
h2o.sub('^(.{6}).*$','\\1', dat.mini.hex$hour2)
#   hour2
# 1   \\1
# 2   \\1
# 3   \\1
# 4   \\1
# 5   \\1
# 6   \\1

如您所见,h2o.sub 按字面意思使用 \\1,而不是将其作为对第一个匹配组的引用。这种行为与 base R 的 sub 不同。

您可以更改您的正则表达式并将前六个字符之后的字符替换为空字符串。

# Workaround that avoids a backreference: the lookbehind (?<=^.{6}) anchors
# the match right after the first 6 characters, and the remainder (.*$) is
# replaced with an empty string.
h2o.sub('(?<=^.{6}).*$','', dat.mini.hex$hour2)
#    hour2
# 1 141021
# 2 141022
# 3 141028
# 4 141029
# 5 141028
# 6 141024

在这里,(?<=^.{6}) 是一个正向后行断言(positive lookbehind)。它匹配前面紧挨着字符串开头和前 6 位数字的位置。

于 2015-01-29T21:37:34.857 回答