0

我需要对循环中的数据子集执行 QCC 测试。绘图并不重要,但 LCL、UCL 的计算以及超出限制和违反 Shewhart 规则的数据点的标记很重要。

输入数据组织在一个 DF 中,如下所示:

    TS 类别关键字频道数量
    2013_Q1 ABC WIDGET1 零售 55
    2013_Q2 ABC WIDGET1 零售 57
    2013_Q3 ABC WIDGET1 零售 18
    2013_Q4 ABC WIDGET1 零售 20
    2014_Q1 ABC WIDGET1 零售 7
    2014_Q2 ABC WIDGET1 零售 15
    2014_Q3 ABC WIDGET1 零售 24
    2014_Q4 ABC WIDGET1 零售 21
    2015_Q1 ABC WIDGET1 零售 43
    2015_Q2 ABC WIDGET1 零售 70
    2015_Q3 ABC WIDGET1 零售 51
    2015_Q4 ABC WIDGET1 零售 83
    2013_Q1 ABC WIDGET1 在线 31
    2013_Q2 ABC WIDGET1 在线 37
    2013_Q3 ABC WIDGET1 在线 31
    2013_Q4 ABC WIDGET1 在线 56
    2014_Q1 ABC WIDGET1 在线 56
    2014_Q2 ABC WIDGET1 在线 62
    2014_Q3 ABC WIDGET1 在线 55
    2014_Q4 ABC WIDGET1 在线 86
    2015_Q1 ABC WIDGET1 在线 79
    2015_Q2 ABC WIDGET1 在线 79
    2015_Q3 ABC WIDGET1 在线 62
    2015_Q4 ABC WIDGET1 在线 83
    2013_Q1 ABC WIDGET1 拍卖 2
    2013_Q2 ABC WIDGET1 拍卖 0
    2013_Q3 ABC WIDGET1 拍卖 2
    2013_Q4 ABC WIDGET1 拍卖 1
    2014_Q1 ABC WIDGET1 拍卖 3
    2014_Q2 ABC WIDGET1 拍卖 4
    2014_Q3 ABC WIDGET1 拍卖 3
    2014_Q4 ABC WIDGET1 拍卖 2
    2015_Q1 ABC WIDGET1 拍卖 6
    2015_Q2 ABC WIDGET1 拍卖 2
    2015_Q3 ABC WIDGET1 拍卖 1
    2015_Q4 ABC WIDGET1 拍卖 2

我已经能够使用循环使代码工作,如下所示:

  • 根据类别、关键字和频道确定数据集中的唯一组(键)
  • 通过增加 TS 对数据进行排序(用于控制图)
  • 循环遍历键
  • 选择一个子集
  • 执行 qcc 计算
  • 使用结果更新 DF - 即 oos(超出规范)、vlt(违规点)、lcl 和 ucl

小型数据集的性能很好,但随着数据集变大(> 100,000 行),性能就很差了。

任何改变逻辑的想法将不胜感激。

下面是R代码:

library(qcc)

# read data into DF
DF <- read.csv("SPCQty1.csv",header=TRUE,na.strings = "null")

# create ID row to use for later updates
DF$ID <- 1:nrow(DF)

# Create additional columns for later use
# these will be populated after calling qcc function for each group
DF$oos <- NA
DF$vlt <- NA
DF$ucl <- NA
DF$lcl <- NA

# determine unique groups in data set
keys <- unique(DF[,c('PL','KEYWORD','CHANNEL')])
len <- nrow(keys)

# perform stats on each set
for (i in 1:len)
{
  g1 <- as.data.frame.array(keys[i,]["PL"])[,"PL"]
  g2 <- as.data.frame.array(keys[i,]["KEYWORD"])[,"KEYWORD"]
  g3 <- as.data.frame.array(keys[i,]["CHANNEL"])[,"CHANNEL"]

  # select the subset  
  tmp <- subset(DF, PL == g1 & KEYWORD == g2 & CHANNEL == g3)
  # sort by TS for control chart
  spcdata <- tmp[order(tmp$TS),]

  # generate control chart stats

  spc <- qcc(spcdata$QTY, type="xbar.one", plot = FALSE)

  # get statistics object generated by qcc
  stats <- spc$statistics
  indices <- 1:length(stats)

  # get UCL and LCL   
  limits <- spc$limits
  lcl <- limits[,1]
  ucl <- limits[,2]

  # violating runs  
  violations <- spc$violations

  # create a data frame of the qcc stats  
  qc.data <- data.frame(df.indices <- indices, df.statistics <-   as.vector(stats), ID = spcdata$ID)

  # detect violating runs
  index.r <- rep(NA, length(violations$violating.runs))
  if(length(violations$violating.runs > 0)) { 
   index.r <- violations$violating.runs
   # Create a data frame for violating run points.
   df.runs <- data.frame(x.r = qc.data$ID[index.r], vlt = "Y")
   idx <- df.runs$x.r
   DF$vlt[DF$ID %in% idx]<- "Y"
   }

   # detect beyond limits points
   index.b <- rep(NA, length(violations$beyond.limits))
   if(length(violations$beyond.limits > 0)) { 
     index.b <- violations$beyond.limits
     # Create a data frame to tag beyond limit points.
     df.beyond <- data.frame(x.b = qc.data$ID[index.b], oos = "Y")
     idx <- df.beyond$x.b
     DF$oos[DF$ID %in% idx]<- "Y"
   }

   idx <- qc.data$ID
   DF$ucl[DF$ID %in% idx] <- ucl
   DF$lcl[DF$ID %in% idx] <- lcl
} 

DF[is.na(DF)] <- ""
# DF will now have 5 additional columns - ID, oos, vlt, ucl and lcl
4

1 回答 1

0

我注意到您的代码创建了大量临时变量(eq index.r、index.b 等)。如果数组长度相同,则无需跟踪索引。

library(qcc)
# read data into DF
DF <- read.csv("sample.csv",header=TRUE,na.strings = "null")

# Create additional columns for later use
# these will be populated after calling qcc function for each group
DF$oos <- NA
DF$vlt <- NA
DF$ucl <- NA
DF$lcl <- NA

# determine unique groups in data set
keys <- unique(DF[,c('PL','KEYWORD','CHANNEL')])
len <- nrow(keys)
dfnew<-data.frame()

# perform stats on each set
for (i in 1:len)
{
   # select the subset  
   tmp <- subset(DF, PL == keys$PL[i] & KEYWORD == keys$KEYWORD[i] & CHANNEL == keys$CHANNEL[i])
   # generate control chart stats
   spc <- qcc(tmp$QTY, type="xbar.one", plot = FALSE)

    # get UCL and LCL   
    tmp$lcl <- spc$limits[,1]
    tmp$ucl <- spc$limits[,2]
    #get violations
    tmp$vlt[spc$violations$violating.runs]<- "Y"
    tmp$oos[spc$violations$beyond.limits]<- "Y"
    #add onto data frame
    dfnew<-rbind(dfnew,tmp)
} 
dfnew[is.na(dfnew)] <- ""
#Sort as needed
print(dfnew)

一个新的数据框“dfnew”保存了最终结果。这个简化版本更容易阅读并且应该有一些性能改进,不能用有限的数据来量化。此版本还假设数据在循环之前进行了预排序。下一个改进将一起消除循环并替换为 _apply 命令。还要查看 Data.Table,这可以提高子集的性能。

于 2016-02-17T21:44:48.083 回答