0

我有一个包含员工横截面工资数据的数据集,希望把它转换成一个连续不间断的时间序列,并在此过程中对缺失值进行插补。假设我有:

# Toy salary records: two for carl, one for bob, four for rick.
name <- rep(c("carl", "bob", "rick"), times = c(2, 1, 4))
sex <- rep("M", times = 7)
salary <- c(18000, 14000, 34000, 11000, 23000, 23000, 25000)
# Observation dates, listed in the same order as `name`.
date <- as.Date(c(
  "2007-04-30", "2007-07-30",
  "2009-12-09",
  "2006-01-01", "2008-01-01", "2009-12-09", "2010-01-01"
))

# One row per observed (employee, date) salary record.
salaries <- data.frame(name = name, sex = sex, salary = salary, date = date)
salaries
  name sex salary       date
  carl   M  18000 2007-04-30
  carl   M  14000 2007-07-30
   bob   M  34000 2009-12-09
  rick   M  11000 2006-01-01
  rick   M  23000 2008-01-01
  rick   M  23000 2009-12-09
  rick   M  25000 2010-01-01

可以看到,可怜的 carl 在 7 月份被减薪 4k。在此之前,他的工资是 18k,并且在减薪前的 3 个月里一直如此,但我的数据并没有体现这一点。我想画一张漂亮的图来展示这种趋势,但首先需要把数据转换成如下形式(其中 * 表示插补值):

head(salaries)
  name sex salary       date change
  carl   M  18000 2007-04-30    0
  carl   M  18000 2007-05-30*   0 
  carl   M  18000 2007-06-30*   0 
  carl   M  14000 2007-07-30    1
   bob   M  34000 2009-12-09    0 
  rick   M  11000 2006-01-01    0
  rick   M  11000 2006-02-01*   0
  ...   .. ....... ...... ....
  rick   M  11000 2007-12-01*   0
  rick   M  23000 2008-01-01    1
  rick   M  23000 2008-02-01*   1
  ....   ...... ...... ........
  rick   M  23000 2009-12-09    1     
  rick   M  25000 2010-01-01    2 

所以我想插补中间的值,并在薪水发生变动时加以标记。像 bob 这样从未有过薪水变动的人,change 会一直保持为 0;而经历过多次薪水变动的 rick,每次变动都会被计数,这样我们就能知道变动发生的时间和次数。我只关心以月为分析单位的情况,但如果能知道如何按天进行插补也会很有用。

4

2 回答 2

2

如果您有一个时间序列,可以使用 `na.locf` 以最后一个可用值替换缺失值;如果只想在已有值之间进行插值,可以使用 `approx`。要创建这些单独的时间序列,可以使用 `dcast` 和 `melt` 在“长”(规范化)格式与“宽”格式之间转换数据。要计算变动次数,可以使用 `ddply` 和 `cumsum`。

library(reshape2)
library(plyr)
library(zoo)

# Build a month-by-month salary series per employee from the sparse
# `salaries` records, then number each employee's salary changes.
# Uses reshape2 (dcast/melt) and plyr (ddply); zoo is only needed for the
# commented-out na.locf alternative.

# Convert to wide format: one row per observed date, one column per employee
d <- dcast(salaries, date ~ name, value.var = "salary")

# Add all the dates you want: a monthly grid over the observed range.
# The full outer merge keeps the original observation dates as well.
dates <- seq.Date(from = min(d$date), to = max(d$date), by = "month")
d <- merge(d, data.frame(date = dates), all = TRUE)

# Fill in the missing values in the salary columns only. Running approx()
# over `date` too would turn that column from Date into plain numbers.
salary_cols <- setdiff(names(d), "date")
# If you want the last non-missing value:
# d[salary_cols] <- lapply(d[salary_cols], na.locf, na.rm = FALSE)
# If you only want to interpolate between values (with approx()'s default
# rule, rows outside an employee's observed range stay NA):
d[salary_cols] <- lapply(
  d[salary_cols],
  function(x) approx(seq_along(x), x, seq_along(x), method = "constant")$y
)

# Convert back to the tall format, dropping the rows that are still NA
d <- melt(
  d,
  id.vars = "date", value.name = "salary", variable.name = "name",
  na.rm = TRUE
)

# Add the number of changes: a per-employee running count that goes up
# each time the salary differs from the previous row
d <- ddply(
  d, "name", transform,
  change = cumsum(c(0, diff(salary) != 0))
)
于 2013-04-04T12:14:31.283 回答
0

详细说明@Vincent 的建议:

        # Toy salary records as in the question, extended with an `office`
        # column so a character variable can be filled in as well.
        name <- rep(c("carl", "bob", "rick"), times = c(2, 1, 4))
        sex <- rep("M", times = 7)
        salary <- c(18000, 14000, 34000, 11000, 23000, 23000, 25000)
        office <- c(
          "melbourne", "sydney", "adelaide",
          "perth", "perth", "melbourne", "melbourne"
        )
        # Observation dates, listed in the same order as `name`.
        date <- as.Date(c(
          "2007-04-30", "2007-07-30",
          "2009-12-09",
          "2006-01-01", "2008-01-01", "2009-12-09", "2010-01-01"
        ))

        # One row per observed (employee, date) record.
        salaries <- data.frame(
          name = name, sex = sex, salary = salary, date = date,
          office = office
        )
        salaries


        library(reshape2)
        library(plyr)
        library(zoo)

使用 approx 处理数字向量

        # Build a month-by-month salary series per employee from `salaries`,
        # then number each employee's salary changes.

        # Convert to wide format: one row per observed date, one column per
        # employee
        d <- dcast(salaries, date ~ name, value.var = "salary")

        # Add all the dates you want: a monthly grid over the observed range.
        # The full outer merge keeps the original observation dates as well.
        dates <- seq.Date(from = min(d$date), to = max(d$date), by = "month")
        d <- merge(d, data.frame(date = dates), all = TRUE)

        # Fill in the missing values in the salary columns only. Running
        # approx() over `date` too would turn that column from Date into
        # plain numbers.
        salary_cols <- setdiff(names(d), "date")
        # If you want the last non-missing value (carried forward):
        # d[salary_cols] <- lapply(d[salary_cols], na.locf, na.rm = FALSE)
        # If you only want to interpolate between values (with approx()'s
        # default rule, rows outside an employee's observed range stay NA):
        d[salary_cols] <- lapply(
          d[salary_cols],
          function(x) {
            approx(seq_along(x), x, seq_along(x), method = "constant")$y
          }
        )

        # Convert back to the tall format, dropping rows that are still NA
        d <- melt(
          d,
          id.vars = "date", value.name = "salary", variable.name = "name",
          na.rm = TRUE
        )

        # Add the number of changes: a per-employee running count that goes
        # up each time the salary differs from the previous row
        d <- ddply(
          d, "name", transform,
          change = cumsum(c(0, diff(salary) != 0))
        )

使用 na.locf 转换字符向量

        # Build the matching month-by-month series for the character
        # `office` column of `salaries`.

        # Convert to wide format: one column of office names per employee
        a <- dcast(salaries, date ~ name, value.var = "office")

        # Add all the dates you want: a monthly grid over the observed range
        dates <- seq.Date(from = min(a$date), to = max(a$date), by = "month")
        a <- merge(a, data.frame(date = dates), all = TRUE)

        # Fill in the missing values using na.locf, carrying the last known
        # office forward so every filled month pairs with the salary that was
        # carried forward for it. (fromLast = TRUE would instead pull the
        # *next* office backwards, e.g. pairing carl's 18000 with "sydney".)
        # Only the office columns are filled; `date` has no gaps.
        office_cols <- setdiff(names(a), "date")
        a[office_cols] <- lapply(a[office_cols], na.locf, na.rm = FALSE)

        # Convert back to the tall format, dropping rows that are still NA
        # (the months before an employee's first observation)
        a <- melt(
          a,
          id.vars = "date", value.name = "office", variable.name = "name",
          na.rm = TRUE
        )

合并结果

        # Combine the office series `a` with the salary series `d`.
        # `d$date` may be plain day counts since 1970-01-01 after approx();
        # passing the origin explicitly keeps the conversion working on R
        # versions whose as.Date() requires one for numeric input. A column
        # that is already a Date is returned unchanged.
        d$date <- as.Date(d$date, origin = "1970-01-01")
        # merge() defaults to an inner join, so only (name, date) pairs
        # present in both tables are kept
        out <- merge(a, d, by = c("name", "date"))
于 2013-04-05T12:20:31.177 回答