很确定这对你有用:
> library(plyr)
> library(doMC)
> library(reshape2)
>
> set.seed(2)
> # Build a small synthetic click log: 20 rows, with the five user ids
> # recycled across time stamps 1..20 and a random thread id (1-5) per row.
> dat <- data.frame(user = 1:5,
+                   time = 1:20,
+                   url = paste0("https://domain.com/forum/thread?thread_id=",
+                                sample(5, 20, replace = TRUE)))
> head(dat)
user time url
1 1 1 https://domain.com/forum/thread?thread_id=1
2 2 2 https://domain.com/forum/thread?thread_id=4
3 3 3 https://domain.com/forum/thread?thread_id=3
4 4 4 https://domain.com/forum/thread?thread_id=1
5 5 5 https://domain.com/forum/thread?thread_id=5
6 1 6 https://domain.com/forum/thread?thread_id=5
> # Subset to the rows whose time falls inside the range of interest (inclusive).
> dat <- dat[dat$time >= 1 & dat$time <= 20, ]
>
> # Make the thread id variable: strip everything up to and including
> # "thread_id=" from the url, leaving only the id itself.
> dat$threadid <- gsub("^.*thread_id=", "", dat$url)
>
>
> # Register 4 parallel cores for the .parallel = TRUE call below.
> registerDoMC(4)
> # Count the number of thread occurrences for each user (in parallel).
> dat.new <- ddply(dat, .(user, threadid), summarize,
+                  threadcount = length(threadid), .parallel = TRUE)
> # Reshape to one row per user and one column per thread id;
> # user/thread combinations that never occur are filled with 0.
> dat.new <- dcast(dat.new, user ~ threadid, value.var = "threadcount", fill = 0)
> # Add total views per user. drop = FALSE keeps the selection a data frame
> # even when only a single thread column exists; without it the selection
> # collapses to a vector and rowSums() fails.
> dat.new$totalview <- rowSums(dat.new[, -1, drop = FALSE])
> dat.new
user 1 2 3 4 5 totalview
1 1 1 0 1 0 2 4
2 2 1 1 0 1 1 4
3 3 0 1 1 1 1 4
4 4 2 0 2 0 0 4
5 5 1 0 2 0 1 4