我有一个函数,需要在大约 300 万个数据点上运行。我正在尝试在一台 8 核的 Ubuntu 机器上使用 mcmapply 将该函数并行化。
该函数接受一个长度为 300 万的 list、另外 3 个长度为 300 万的向量,以及 1 个常量值 cutoffyearmon。
该代码在单核上处理 100000 行数据只需 2 分钟,运行良好,也没有抛出错误。但是,当我尝试用 mcmapply 在机器的 6 个内核上并行运行代码时,它持续运行了 5 个多小时仍未结束。
更新:这是我的函数调用的简化版本。在实际代码中,我还为 1 个月、2 个月和 3 个月的时间窗口另外创建了 9 个变量;这里只保留了 6 个月和 1 年的时间变量。
我正在使用以下函数调用:
# Summary returned for a record that cannot be scored (missing day difference,
# or a history string flagged as NA / empty): every field is NA.
abc_missing_stats <- list(
  worst_abc_ever = NA_real_,
  times_abc = NA_real_,
  times_abc_last_180_days = NA_real_,
  times_abc_last_365_days = NA_real_,
  times_abc30_last_365_days = NA_real_,
  times_abc30_last_180_days = NA_real_,
  times_abc60_last_365_days = NA_real_,
  times_abc60_last_180_days = NA_real_,
  abc_last_180_days = NA_real_,
  abc_last_365_days = NA_real_
)

# Window statistics for a record that lies entirely outside the window.
abc_zero_window <- list(worst = 0, times = 0, times30 = 0, times60 = 0)

# Statistics over the first `window` entries of `codes` (capped at the number
# of entries available).
#   codes  : numeric vector of parsed 3-digit codes.
#   window : number of leading entries to inspect.
# Returns a list with the maximum code (`worst`) and the number of codes
# > 0 (`times`), >= 30 (`times30`) and >= 60 (`times60`), all as doubles.
abc_window_stats <- function(codes, window) {
  n_recent <- max(0, min(length(codes), window))
  # A window of zero (or fewer) entries contains nothing to score. The former
  # `codes[1:window]` expanded 1:0 to c(1, 0) and silently scored codes[1].
  if (n_recent < 1) {
    return(abc_zero_window)
  }
  recent <- codes[seq_len(n_recent)]
  list(
    worst = max(recent),
    # sum(cond, na.rm = TRUE) counts exactly what length(which(cond)) counted.
    times = as.numeric(sum(recent > 0, na.rm = TRUE)),
    times30 = as.numeric(sum(recent >= 30, na.rm = TRUE)),
    times60 = as.numeric(sum(recent >= 60, na.rm = TRUE))
  )
}

# Summarise one record.
#   abcstrnew     : history string made of consecutive 3-character numeric codes.
#   sd            : date of the record; converted with as.yearmon().
#   naflag        : TRUE when the history string is missing.
#   empflag       : TRUE when the history string is empty.
#   daysdiff      : days between the cut-off date and `sd`; may be NA.
#   cutoffyearmon : cut-off month (constant across records).
# Returns a named list of ten numeric scalars (see `abc_missing_stats` for the
# names and their order).
abc_summarise_one <- function(abcstrnew, sd, naflag, empflag, daysdiff,
                              cutoffyearmon) {
  # Scalar conditions: && instead of the elementwise &.
  has_history <- !naflag && !empflag
  if (is.na(daysdiff) || !has_history) {
    # Nothing to score, so skip parsing the string altogether.
    return(abc_missing_stats)
  }

  # Cut the string into 3-character chunks and convert them to numbers.
  starts <- seq(from = 1, to = nchar(abcstrnew) - 2, by = 3)
  codes <- as.numeric(substring(abcstrnew, first = starts, last = starts + 2))

  if (daysdiff > 365) {
    # Older than a year: outside both the 365-day and the 180-day window.
    last_365 <- abc_zero_window
    last_180 <- abc_zero_window
  } else {
    # Whole months between the record month and the cut-off month, computed
    # once and shared by both windows (as.yearmon() is the costly call here).
    months_elapsed <- as.numeric(
      round(round(difftime(time1 = cutoffyearmon, time2 = as.yearmon(sd))) / 30)
    )
    last_365 <- abc_window_stats(codes, 12 - months_elapsed)
    last_180 <- if (daysdiff > 180) {
      abc_zero_window
    } else {
      abc_window_stats(codes, 6 - months_elapsed)
    }
  }

  list(
    worst_abc_ever = max(codes),
    times_abc = as.numeric(sum(codes > 0, na.rm = TRUE)),
    times_abc_last_180_days = last_180$times,
    times_abc_last_365_days = last_365$times,
    times_abc30_last_365_days = last_365$times30,
    times_abc30_last_180_days = last_180$times30,
    times_abc60_last_365_days = last_365$times60,
    times_abc60_last_180_days = last_180$times60,
    abc_last_180_days = last_180$worst,
    abc_last_365_days = last_365$worst
  )
}

# Apply the per-record summary in parallel. The constant cut-off month goes
# through MoreArgs so it is handed to every call unchanged instead of being
# treated as one more vector to iterate over.
abc_xx_last_xxx_days <- mcmapply(
  abc_summarise_one,
  lst, sd, naflag, empflag, daysdiff,
  MoreArgs = list(cutoffyearmon = cutoffyearmon),
  mc.cores = 6, mc.preschedule = TRUE, mc.cleanup = TRUE
)
您可以使用以下一组输入来运行该函数并检查其输出。
# Two sample records: history strings made of 3-character codes.
lst <- list("000050000032", "000000340000000000000")
# Date of each record.
sd <- as.Date(c("2017-05-22", "2017-04-23"))
# Neither record is flagged as empty or as missing.
empflag <- c(FALSE, FALSE)
naflag <- c(FALSE, FALSE)
# Time from each record date to the 2017-06-30 cut-off date.
daysdiff <- difftime(time1 = as.Date("2017-06-30"), time2 = sd)
# Cut-off month shared by all records.
# NOTE(review): as.yearmon() is not base R; presumably it comes from zoo.
# Confirm that package is attached before running this line.
cutoffyearmon <- as.yearmon("2017-06-30")
我原以为设置 mc.preschedule=TRUE 就能让代码并行执行,但处理速度并没有任何明显提升。在机器的 6 个内核上运行时,我预计处理将在大约 1.5 小时内完成。
如果我遗漏了什么,请给我一些建议。
当使用 pbmcmapply 并设置 mc.cores=6 时,我得到的 ETA 为 06:01:32:57。