我正在尝试使用以下 R 脚本在位于“hdfs://:/somnath/merged_train/part-m-00000”的 HDFS 数据文件上使用 RHadoop(rmr2、rhdfs 包)构建逻辑回归模型,然后使用测试模型位于“hdfs://:/somnath/merged_test/part-m-00000”的测试 HDFS 数据文件。
我们使用 CDH4 发行版,Yarn/MR2 与 Hadoop-0.20 支持的 MR1 并行运行。并使用 hadoop-0.20 mapreduce 和 hdfs 版本将以下 RHadoop 脚本作为 Sys.setenv 命令运行,如下所示。
但是,每当我运行脚本时,我都会遇到以下错误,几乎没有运气绕过它。如果有人指出这个错误的可能原因,我将不胜感激,这似乎是由于 R 中 lapply 调用的错误方式而不处理 NA 参数。
[root@kkws029 logreg_template]# Rscript logreg_test.R
Loading required package: methods
Loading required package: rJava
HADOOP_CMD=/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce/bin/hadoop
Be sure to run hdfs.init()
14/08/11 11:59:30 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
NULL
NULL
[1] "Starting to build logistic regression model..."
Error in FUN(X[[2L]], ...) :
Sorry, parameter type `NA' is ambiguous or not supported.
Calls: logistic.regression ... .jrcall -> ._java_valid_objects_list -> lapply -> FUN
Execution halted
下面是我的 R 脚本:
#!/usr/bin/env Rscript
Sys.setenv(HADOOP_HOME="/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce")
Sys.setenv(HADOOP_CMD="/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce/bin/hadoop")
Sys.setenv(HADOOP_BIN="/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce/bin");
Sys.setenv(HADOOP_CONF_DIR="/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce/conf");
Sys.setenv(HADOOP_STREAMING="/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.3.0.jar")
Sys.setenv(LD_LIBRARY_PATH="/usr/lib64/R/library/rJava/jri")
library(rmr2)
library(rhdfs)
.jinit()
.jaddClassPath("/opt/cloudera/parcels/CDH/lib/hadoop/hadoop-auth-2.0.0-cdh4.3.0.jar")
.jaddClassPath("/opt/cloudera/parcels/CDH/lib/hadoop-hdfs/hadoop-hdfs-2.0.0-cdh4.3.0.jar")
.jaddClassPath("/opt/cloudera/parcels/CDH/lib/hadoop/hadoop-common-2.0.0-cdh4.3.0.jar")
hdfs.init()
rmr.options( backend = "hadoop", hdfs.tempdir = "/tmp" )
logistic.regression =
function(hdfsFilePath, iterations, dims, alpha) {
r.file <- hdfs.file(hdfsFilePath,"r")
#hdfsFilePath <- to.dfs(hdfsFilePath)
lr.map =
function(.,M) {
Y = M[,1]
X = M[,-1]
keyval(
1,
Y * X *
g(-Y * as.numeric(X %*% t(plane))))}
lr.reduce =
function(k, Z)
keyval(k, t(as.matrix(apply(Z,2,sum))))
plane = t(rep(0, dims))
g = function(z) 1/(1 + exp(-z))
for (i in 1:iterations) {
gradient =
values(
from.dfs(
mapreduce(
input = as.matrix(hdfs.read.text.file(r.file)),
#input = from.dfs(hdfsFilePath),
map = function(.,M) {
Y = M[,1]
X = M[,-1]
keyval(
1,
Y * X *
g(-Y * as.numeric(X %*% t(plane))))},
reduce = lr.reduce,
combine = T)))
plane = plane + alpha * gradient
#trace(print(plane),quote(browser()))
}
return(plane) }
#validate logistic regression
logistic.regression.test =
function(hdfsFilePath, weight) {
r.file <- hdfs.file(hdfsFilePath,"r")
lr.test.map =
function(.,M) {
keyval(
1,
lapply(as.numeric(M[,-1] %*% t(weight)),function(z) 1/(1 + exp(-z))))}
probabilities =
values(
from.dfs(
mapreduce(
input = as.matrix(hdfs.read.text.file(r.file)),
map = function(.,M) {
keyval(
1,
lapply(as.numeric(M[,-1] %*% t(weight)), function(z) 1/(1 + exp(-z))))}
)))
return(probabilities) }
out = list()
prob = list()
rmr.options( backend = "hadoop", hdfs.tempdir = "/tmp" )
print("Starting to build logistic regression model...")
out[['hadoop']] =
## @knitr logistic.regression-run
logistic.regression(
"hdfs://XX.XX.XX.XX:NNNN/somnath/merged_train/part-m-00000", 5, 5, 0.05)
write.csv(as.vector(out[['hadoop']]), "/root/somnath/logreg_data/weights.csv")
print("Building logistic regression model completed.")
prob[['hadoop']] =
logistic.regression.test(
"hdfs://XX.XX.XX.XX:NNNN/somnath/merged_test/part-m-00000", out[['hadoop']])
write.csv(as.vector(prob[['hadoop']]), "/root/somnath/logreg_data/probabilities.csv")
stopifnot(
isTRUE(all.equal(out[['local']], out[['hadoop']], tolerance = 1E-7)))
注意:我在 root ~/.bash_profile 中为 HADOOP 设置了以下环境变量,如下所示
# Hadoop-specific environment and commands
export HADOOP_HOME=/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce
export HADOOP2_HOME=/opt/cloudera/parcels/CDH/lib/hadoop
#export HADOOP_CMD=${HADOOP_HOME}/bin/hadoop
#export HADOOP_STREAMING=/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar
#export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export LD_LIBRARY_PATH=${R_HOME}/library/rJava/jri #:${HADOOP_HOME}/../hadoop-0.20-mapreduce/lib/native/Linux-amd64-64
# Add hadoop-common jar to classpath for PlatformName and FsShell classes; Add hadoop-auth and hadoop-hdfs jars
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:${HADOOP2_HOME}/client-0.20/* #:${HADOOP_HOME}/*.jar:${HADOOP_HOME}/lib/*.jar:${HADOOP2_HOME}/hadoop-common-2.0.0-cdh4.3.0.jar:${HADOOP_HOME}/../hadoop-hdfs/hadoop-hdfs-2.0.0-cdh4.3.0.jar:${HADOOP_HOME}/hadoop-auth-2.0.0-cdh4.3.0.jar:$HADOOP_STREAMING
PATH=$PATH:$R_HOME/bin:$JAVA_HOME/bin:$LD_LIBRARY_PATH:/opt/cloudera/parcels/CDH/lib/mahout:/opt/cloudera/parcels/CDH/lib/hadoop:/opt/cloudera/parcels/CDH/lib/hadoop-hdfs:/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce:/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce:/var/lib/storm-0.9.0-rc2/lib #:$HADOOP_CMD:$HADOOP_STREAMING:$HADOOP_CONF_DIR
export PATH
样本训练数据集
0,-4.418,-2.0658,1.2193,-0.68097,0.90894
0,-2.7466,-2.9374,-0.87562,-0.65177,0.53182
0,-0.98846,0.66962,-0.20736,-0.2895,0.002313
0,-2.277,2.492,0.47936,0.4673,-1.5075
0,-5.4391,1.8447,-1.6843,1.465,-0.71099
0,-0.12843,0.066968,0.02678,-0.040851,0.0075902
0,-2.0796,2.4739,0.23472,0.86423,0.45094
0,-3.1796,-0.15429,1.4814,-0.94316,-0.52754
0,-1.9429,1.3111,0.31921,-1.202,0.8552
0,-2.3768,1.9301,0.096005,-0.51971,-0.17544
0,-2.0336,1.991,0.82029,0.018232,-0.33222
0,-3.6388,-3.2903,-2.1076,0.73341,0.75986
0,-2.9146,0.53163,0.49182,-0.38562,-0.76436
0,-3.3816,1.0954,0.25552,-0.11564,-0.01912
0,-1.7374,-0.63031,-0.6122,0.022664,0.23399
0,-1.312,-0.54935,-0.68508,-0.072985,0.036481
0,-3.991,0.55278,0.38666,-0.56128,-0.6748
....
样本测试数据集
0,-0.66666,0.21439,0.041861,-0.12996,-0.36305
0,-1.3412,-1.1629,-0.029398,-0.13513,0.49758
0,-2.6776,-0.40194,-0.97336,-1.3355,0.73202
0,-6.0203,-0.61477,1.5248,1.9967,2.697
0,-4.5663,-1.6632,-1.2893,-1.7972,1.4367
0,-7.2339,2.4589,0.61349,0.39094,2.19
0,-4.5683,-1.3066,1.1006,-2.8084,0.3172
0,-4.1223,-1.5059,1.3063,-0.18935,1.177
0,-3.7135,-0.26283,1.6961,-1.3499,-0.18553
0,-2.7993,1.2308,-0.42244,-0.50713,-0.3522
0,-3.0541,1.8173,0.96789,-0.25138,-0.36246
0,-1.1798,1.0478,-0.29168,-0.26261,-0.21527
0,-2.6459,2.9387,0.14833,0.24159,-2.4811
0,-3.1672,2.479,-1.2103,-0.48726,0.30974
1,-0.90706,1.0157,0.32953,-0.11648,-0.47386
...