1

我想弄清楚每条 profile.views 记录发生在用户的哪一次访问(visits)期间。用户由 (uid, state) 这一对值唯一标识。数据存储在两个数据框中。

# Sample data: both tables carry an id, the (uid, state) pair that identifies
# a user, and a timestamp ts.
visits <- data.frame(
  id    = 2001:2004,
  uid   = c(1001, 1002, 1001, 1001),
  state = c("CA", "CA", "CA", "MA"),
  ts    = c(51, 52, 53, 54)
)
profile.views <- data.frame(
  id    = 3001:3004,
  uid   = c(1001, 1003, 1002, 1001),
  state = c("CA", "CA", "CA", "CA"),
  ts    = c(51, 57, 59, 59)
)

> visits
    id  uid state ts
1 2001 1001    CA 51
2 2002 1002    CA 52
3 2003 1001    CA 53
4 2004 1001    MA 54

> profile.views
    id  uid state ts
1 3001 1001    CA 51
2 3002 1003    CA 57
3 3003 1002    CA 59
4 3004 1001    CA 59

对于每条 profile.views 记录,我想弄清楚它来自哪一次访问。做法是:在 uid 和 state 都匹配的访问中,取 ts 小于或等于该 profile.views 行的 ts 的最近一次访问。

这是我想要的结果(以某种形式):

profile.views[1,] 来自 visits[1,]

profile.views[2,] 不来自任何访问(这可能是由数据记录错误引起的)

profile.views[3,] 来自 visits[2,]

profile.views[4,] 来自 visits[3,]

有谁知道这样做的好方法?

4

4 回答 4

2

使用 sqldf 的 SQL 风格语法:

library(sqldf)
# For every profile view, find the latest visit with the same uid and state
# whose ts is not after the view's ts. The LEFT OUTER JOIN keeps views that
# have no qualifying visit; their visit_id / visit_ts come back as NA.
#
# visit_id assumes sqldf is running on its default SQLite backend: in an
# aggregate query with a single MAX(), SQLite takes bare columns such as b.id
# from the row that holds the maximum. Other backends may reject this query.
sqldf('
SELECT a.id, a.uid, a.state, a.ts, b.id AS visit_id, MAX(b.ts) AS visit_ts
FROM "profile.views" AS a
LEFT OUTER JOIN visits AS b
ON a.uid = b.uid
AND a.state = b.state
AND a.ts >= b.ts
GROUP BY a.id, a.uid, a.state, a.ts
ORDER BY a.id
')
于 2012-12-22T02:20:37.410 回答
2

一种更快的 data.table 方法,它把每条 profile.views 记录与对应访问的 id 匹配起来:

## data.table must be attached before data.table(), setkey(), setcolorder(),
## setnames() and the rolling join below can be used
library(data.table)

visits <- data.frame(id = 2001:2004, uid = c(1001, 1002, 1001, 1001),
                     state = c("CA", "CA", "CA", "MA"),
                     ts = c(51, 52, 53, 54))
profile.views <- data.frame(id = 3001:3004, uid = c(1001, 1003, 1002, 1001),
                            state = c("CA", "CA", "CA", "CA"),
                            ts = c(51, 57, 59, 59))
visits <- data.table(visits)
profile.views <- data.table(profile.views)
## key on the join columns; ts goes last because the roll is applied to the
## last join column
setkey(visits, uid, state, ts)
## orders columns so that joins are on first three columns
setcolorder(profile.views, c("uid", "state", "ts", "id"))
## renames by name (not by position) to avoid colliding with visits' id/ts
setnames(profile.views, c("ts", "id"), c("view.ts", "view.id"))
## rolling join: for each view, exact match on uid and state, then the latest
## visit whose ts is <= the view's ts; id is NA when no such visit exists.
## In the printed result the ts column shows the view timestamps.
visits[profile.views, roll = TRUE]
#     uid state ts   id view.id
# 1: 1001    CA 51 2001    3001
# 2: 1003    CA 57   NA    3002
# 3: 1002    CA 59 2002    3003
# 4: 1001    CA 59 2003    3004
于 2012-12-22T16:07:39.247 回答
1

这是一个 data.table 解决方案。有些地方也许还可以做得更好,但这是第一版。

library(data.table)
visits <- data.table(visits)
profile.views <- data.table(profile.views)
## Give the view-side id/ts columns their own names so they do not clash
## with the id/ts columns of visits after the join.
setnames(profile.views, old = c("id", "ts"), new = c("view.id", "view.ts"))
setkeyv(visits, c("uid", "state"))
setkeyv(profile.views, c("uid", "state"))
## Step 1 - keyed join on (uid, state); a view without any matching visit
## is kept with NA in the visit columns:
#visits[profile.views]
## Step 2 - drop the pairs where the visit is later than the view, keeping
## the NA rows from step 1:
#visits[profile.views][view.ts >= ts | is.na(ts)]
## Step 3 - for each view, report the latest remaining visit timestamp.
visits[profile.views][
  view.ts >= ts | is.na(ts),
  .(visit.ts = max(ts)),
  by = .(view.id, uid, state, view.ts)
][order(view.id)]
#    view.id  uid state view.ts visit.ts
# 1:    3001 1001    CA      51       51
# 2:    3002 1003    CA      57       NA
# 3:    3003 1002    CA      59       52
# 4:    3004 1001    CA      59       53
于 2012-12-21T23:09:04.603 回答
0

使用基础 R 的 merge 和 aggregate:

visits <- data.frame(id = 2001:2004, uid = c(1001, 1002, 1001, 1001),
                     state = c("CA", "CA", "CA", "MA"),
                     ts = c(51, 52, 53, 54))
profile.views <- data.frame(id = 3001:3004, uid = c(1001, 1003, 1002, 1001),
                            state = c("CA", "CA", "CA", "CA"),
                            ts = c(51, 57, 59, 59))
## merges data frames based on uid and state; suffix .x marks the visits
## columns and .y the profile.views columns; all.y = TRUE keeps views that
## have no matching visit (their .x columns are NA)
newdf.merged <- merge(visits, profile.views, by = c("uid", "state"),
                      all.y = TRUE)
## keeps visits that happened at or before the view (like WHERE); the
## is.na() guard stops NA comparisons from injecting all-NA rows
is.candidate <- !is.na(newdf.merged$ts.x) &
  newdf.merged$ts.y >= newdf.merged$ts.x
newdf.filter <- newdf.merged[is.candidate, ]
## per view (id.y), keeps the row of the latest qualifying visit
newdf.filter <- newdf.filter[order(newdf.filter$id.y, newdf.filter$ts.x), ]
newdf.agg <- newdf.filter[!duplicated(newdf.filter$id.y, fromLast = TRUE), ]
## views without any qualifying visit: one row each, visit columns set to NA
newdf.na <- newdf.merged[!(newdf.merged$id.y %in% newdf.agg$id.y), ]
newdf.na <- newdf.na[!duplicated(newdf.na$id.y), ]
newdf.na$id.x <- rep(NA_integer_, nrow(newdf.na))
newdf.na$ts.x <- rep(NA_real_, nrow(newdf.na))
## merges matched and unmatched views
newdf.final <- rbind(newdf.agg, newdf.na)
## optional ordering step (by view id)
newdf.final <- newdf.final[order(newdf.final$id.y), ]
于 2012-12-27T04:59:46.870 回答