1

I have a Data Frame that contains, between other things, the following fields: userX, Time1, Time2, Time3. The number of observations is 2000.

I have a function that has as inputs userX, Time1, Time2, Time3 and return a data frame with 1 observation and 19 variables.

I want to apply that function to all the observations of the first data frame to make a new data frame with 2000 observations and 19 variables.

I thought about using lapply, but if I understand correctly, it only takes one variable.

Can somebody point me in the right direction?

This is the code I have right now:

            # Make Data Frame for video actions between given times for user X 
    DataVideoActionT <- function (userX, Time1, Time2, Time3){
      #Get data for user X
      videoActionsX<-subset(videoLectureActions, username==userX)
      #Time1 = before first attempt
      videoActionsX_T1<-subset(videoActionsX, eventTimestamp<Time1)
      #Time2 = before best attemp
      videoActionsX_T2<-subset(videoActionsX, eventTimestamp<Time2 & eventTimestamp>Time1)
      #Time3= before last attemp
      videoActionsX_T3<-subset(videoActionsX, eventTimestamp<Time3 & eventTimestamp>Time1)

      error1 = sum(videoActionsX_T1$type==" error ")
      pause1 = sum(videoActionsX_T1$type==" pause ")
      play1 = sum(videoActionsX_T1$type==" play ")
      ratechange1 = sum(videoActionsX_T1$type==" ratechange ")
      seeked1 = sum(videoActionsX_T1$type==" seeked ")
      stalled1 = sum(videoActionsX_T1$type==" stalled ")

      error2 = sum(videoActionsX_T2$type==" error ")
      pause2 = sum(videoActionsX_T2$type==" pause ")
      play2 = sum(videoActionsX_T2$type==" play ")
      ratechange2 = sum(videoActionsX_T2$type==" ratechange ")
      seeked2 = sum(videoActionsX_T2$type==" seeked ")
      stalled2 = sum(videoActionsX_T2$type==" stalled ")

      error3 = sum(videoActionsX_T3$type==" error ")
      pause3 = sum(videoActionsX_T3$type==" pause ")
      play3 = sum(videoActionsX_T3$type==" play ")
      ratechange3 = sum(videoActionsX_T3$type==" ratechange ")
      seeked3 = sum(videoActionsX_T3$type==" seeked ")
      stalled3 = sum(videoActionsX_T3$type==" stalled ")

      data<-data.frame(anon_ID=userX,
                       error1 = error1,
                       pause1 = pause1,
                       play1 = play1,
                       ratechange1 = ratechange1,
                       seeked1=seeked1,
                       stalled1=stalled1,
                       error2 = error2,
                       pause2 = pause2,
                       play2 = play2,
                       ratechange2 = ratechange2,
                       seeked2 =seeked2,
                       stalled2 = stalled2,
                       error3 = error3,
                       pause3 = pause3,
                       play3 = play3,
                       ratechange3 = ratechange3,
                       seeked3 = seeked3,
                       stalled3 = stalled3)
      return(data)
    }

    videoLectureActions<-structure(list(username = c("exampleID1", "exampleID1", "exampleID1", 
                                                     "exampleID2", "exampleID2", "exampleID2", "exampleID3", "exampleID3", 
                                                     "exampleID3", "exampleID3"), currentTime = c("103.701247", "103.701247", 
                                                                                                  "107.543877", "107.543877", "116.456507", "116.456507", "119.987188", 
                                                                                                  "177.816693", "183.417124", "183.417124"), playbackRate = c("null", 
                                                                                                                                                              "null", "null", "null", "null", "null", "null", "null", "null", 
                                                                                                                                                              "null"), pause = c("true", "false", "true", "false", "true", 
                                                                                                                                                                                 "false", "true", "false", "true", "false"), error = c("null", 
                                                                                                                                                                                                                                       "null", "null", "null", "null", "null", "null", "null", "null", 
                                                                                                                                                                                                                                       "null"), networkState = c("1", "1", "1", "1", "1", "1", "1", 
                                                                                                                                                                                                                                                                 "1", "1", "1"), readyState = c("4", "4", "4", "4", "4", "4", 
                                                                                                                                                                                                                                                                                                "4", "4", "4", "4"), lectureID = c("exampleLectureID1", "exampleLectureID1", 
                                                                                                                                                                                                                                                                                                                                   "exampleLectureID1", "exampleLectureID1", "exampleLectureID1", 
                                                                                                                                                                                                                                                                                                                                   "exampleLectureID1", "exampleLectureID1", "exampleLectureID1", 
                                                                                                                                                                                                                                                                                                                                   "exampleLectureID1", "exampleLectureID1"), eventTimestamp = c("2013-03-04 18:51:49", 
                                                                                                                                                                                                                                                                                                                                                                                                 "2013-03-04 18:51:50", "2013-03-04 18:51:54", "2013-03-04 18:51:56", 
                                                                                                                                                                                                                                                                                                                                                                                                 "2013-03-04 18:52:05", "2013-03-04 18:52:07", "2013-03-04 18:52:11", 
                                                                                                                                                                                                                                                                                                                                                                                                 "2013-03-04 18:59:17", "2013-03-04 18:59:23", "2013-03-04 18:59:31"
                                                                                                                                                                                                                                                                                                                                   ), initTimestamp = c("2013-03-04 18:44:15", "2013-03-04 18:44:15", 
                                                                                                                                                                                                                                                                                                                                                        "2013-03-04 18:44:15", "2013-03-04 18:44:15", "2013-03-04 18:44:15", 
                                                                                                                                                                                                                                                                                                                                                        "2013-03-04 18:44:15", "2013-03-04 18:44:15", "2013-03-04 18:44:15", 
                                                                                                                                                                                                                                                                                                                                                        "2013-03-04 18:44:15", "2013-03-04 18:44:15"), type = c(" pause ", 
                                                                                                                                                                                                                                                                                                                                                                                                                " play ", " pause ", " play ", " pause ", " play ", " pause ", 
                                                                                                                                                                                                                                                                                                                                                                                                                " play ", " pause ", " play "), prevTime = c("103.701247 ", "103.701247 ", 
                                                                                                                                                                                                                                                                                                                                                                                                                                                             "107.543877 ", "107.543877 ", "116.456507 ", "116.456507 ", "119.987188 ", 
                                                                                                                                                                                                                                                                                                                                                                                                                                                             "177.816693 ", "183.417124 ", "183.417124 ")), .Names = c("username", 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       "currentTime", "playbackRate", "pause", "error", "networkState", 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       "readyState", "lectureID", "eventTimestamp", "initTimestamp", 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       "type", "prevTime"), row.names = c(1L, 2L, 5L, 6L, 17L, 21L, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          28L, 936L, 957L, 988L), class = "data.frame")
    data<-structure(list(anon_ID = c("exampleID1", "exampleID2", "exampleID3" ), maxGrade = c(10, 5, 10), firstGrade = c(10, 5, 8), lastGrade = c(10, 5, 10), total_submissions = c(1L, 1L, 3L), Time1 = structure(c(1361993741, 1362356090, 1362357401), class = c("POSIXct", "POSIXt"), tzone = ""), TimeM = structure(c(1361993741, 1362356090, 1362492744), class = c("POSIXct", "POSIXt"), tzone = ""), TimeL = structure(c(1361993741, 1362356090, 1362492744), class = c("POSIXct", "POSIXt"), tzone = "")), .Names = c("anon_ID", "maxGrade", "firstGrade", "lastGrade", "total_submissions", "Time1", "TimeM", "TimeL"), row.names = c(NA, 3L), class = "data.frame")

    library(foreach)
    library(doMC)
    registerDoMC(2)  #change the 2 to your number of CPU cores  

    n <- nrow(data)
    res <- list("vector", n)
    foreach(i=1:n, .verbose=FALSE, .combine=rbind) %do% {  
      res[[i]] <- with(data, DataVideoActionT(anon_ID[i], Time1[i], TimeM[i], TimeL[i]))
    }
    test<-do.call(rbind, res)

I have 3 questions.

  1. How can I make foreach not print to the console? This is how it looks when I run it

    foreach(i=1:n, .verbose=FALSE, .combine=rbind) %do% {  
    +   res[[i]] <- with(data, DataVideoActionT(anon_ID[i], Time1[i], TimeM[i], TimeL[i]))
    + }
         anon_ID error1 pause1 play1 ratechange1 seeked1 stalled1
    1 exampleID1      0      0     0           0       0        0
    2 exampleID2      0      0     0           0       0        0
    3 exampleID3      0      0     0           0       0        0
      error2 pause2 play2 ratechange2 seeked2 stalled2 error3 pause3
    1      0      0     0           0       0        0      0      0
    2      0      0     0           0       0        0      0      0
    3      0      2     2           0       0        0      0      2
      play3 ratechange3 seeked3 stalled3
    1     0           0       0        0
    2     0           0       0        0
    3     2           0       0        0
    

I don't want that in the console with thousands of observations.

  1. I want to run this in parallel, I i change the %do% for %dopar% the code stop working. Instead of getting test with 3 observations and 19 variables I get a 2x1 character matrix

  2. Is there a better way of doing this? If so, could you explain why is better?

Thanks!

4

3 回答 3

3

mapply专为您的需求而设计,因为它允许您组合每个案例的值,进行计算并返回更大的矩阵。

请注意,我只是将参数“user”、“time1”和“time2”作为一个小例子。

# This is a matrix of 3 columns
data <- replicate(3, 1:5)

# Your function takes some args, and returns extra info 

your_function <- function(user, time1, time2) {
  c(user, time1, time2, time1*time2, time1+time2, time1/time2)
}

# Here it comes together:
t(mapply(your_function, data[,1], data[,2], data[,3]))

# Output:
#      [,1] [,2] [,3] [,4] [,5] [,6]
#   [1,]    1    1    1    1    2    1
#   [2,]    2    2    2    4    4    1
#   [3,]    3    3    3    9    6    1
#   [4,]    4    4    4   16    8    1
#   [5,]    5    5    5   25   10    1

确认它有效:)

作为奖励,我为您的输入和输出做了基准测试,基于 100 次评估所需的中位时间为24 毫秒。当然,这取决于your_function实际操作。

使用的代码:

data <- cbind(1:2000, replicate(18, rnorm(2000)))
f <- function(user, time1, time2, time3) {
  c(user, time1, time2, time3,
    time1+time2, time2+time3, time1+time3, time1+time2+time3,
    time1+time2, time2+time3, time1+time3, time1+time2+time3,
    time1+time2, time2+time3, time1+time3, time1+time2+time3,
    time1+time2, time2+time3, time1+time3)
}
result <- t(mapply(f, data[,1], data[,2], data[,3], data[,4]))  
result       # dim(result)   2000 by 19

编辑以解决您的具体情况(澄清评论):

改变

data<-data.frame(anon_ID=userX,
                 ...
                 ...
                 ...)

从您的功能:

data<-c(error1, pause1, play1, ratechange1, seeked1, stalled1, error2, 
        pause2, play2, ratechange2, seeked2, stalled2, error3, pause3, 
        play3, ratechange3, seeked3, stalled3)

然后执行以下操作:

test<-t(mapply(DataVideoActionT, userX=data$anon_ID, 
               Time1=data$Time1, Time2=data$TimeM, Time3=data$TimeL))

colnames(test) <- c("error1", "pause1", "play1", "ratechange1", "seeked1", 
                    "stalled1", "error2", "pause2", "play2", "ratechange2", 
                    "seeked2", "stalled2", "error3", "pause3", "play3", 
                    "ratechange3", "seeked3", "stalled3")

test
于 2013-09-01T14:40:29.733 回答
0

您可以使用do.callVectorize

t(do.call(Vectorize(f), DF[,c("userX", "Time1", "Time2", "Time3")]))

f您的功能和数据框在哪里DF。您必须转置以保留列数。

于 2013-09-01T14:13:01.230 回答
0

您可以使用 APPLY 在下面找到代码示例,这应该对您有所帮助!

dane_evaluations<-data.frame(dane_evaluations,time_spent=apply(dane_evaluations[,c('documentevaluation_start','documentevaluation_end')],1,function(x) time_spent(x[1], x[2])))

正在应用的函数的名称:time_spent 并且该函数有两个参数:documentevaluation_startdocumentevaluation_end作为应用数据框的结果的数据框的列,由具有由函数为每一行计算的适当值dane_evaluations 的列扩展。time_spent


示例数据:

子集data_frame:

head(dane_evaluations[,c('documentevaluation_start','documentevaluation_end')])
    documentevaluation_start     documentevaluation_end
1 2013-02-07 13:53:57.073760 2013-02-07 14:10:29.445997
2 2013-02-07 14:28:29.463068 2013-02-07 14:34:56.867517

应用功能:

time_spent <- function(from,to) {
op <- options(digits.secs = 3)  
 as.numeric((strptime(to, "%Y-%m-%d %H:%M:%OS")-strptime(from, "%Y-%m-%d %H:%M:%OS")),units="secs")
}

应用函数后的样子:

head(dane_evaluations[,c('documentevaluation_start','documentevaluation_end','time_spent')])
    documentevaluation_start     documentevaluation_end time_spent
1 2013-02-07 13:53:57.073760 2013-02-07 14:10:29.445997   992.3722
2 2013-02-07 14:28:29.463068 2013-02-07 14:34:56.867517   387.4044
于 2013-09-01T14:02:43.443 回答