r - 将计数转换为百分比和指数分数的效率

Question

我目前有以下代码可以产生我想要的结果（Data_Index和Data_Percentages）

# Converts a table of counts into (a) percentages of each row's total and
# (b) index scores rescaled to the range -1..1.
# Read the estimates; the first CSV column supplies the row names.
Input_Data <- read.csv("http://dl.dropbox.com/u/881843/RPubsData/gd/2010_pop_estimates.csv", row.names=1, stringsAsFactors = FALSE)
# Keep only the leading rows returned by head() so the example stays small.
Input_Data <- data.frame(head(Input_Data))

# Number of rows, and number of value columns (every column except the first,
# which holds the row total).
Rows <-nrow(Input_Data)
Vars <-ncol(Input_Data) - 1

# Total population column, kept as a one-column data frame
TotalCount <- Input_Data[1]

# Total population sum across all rows
TotalCountSum  <- sum(TotalCount)
# Drop the total column so only the value columns remain
Input_Data[1]  <- NULL
VarNames       <- colnames(Input_Data)
# Accumulators for the finished rows; they start empty and are grown by
# rbind() once per row below.
Data_Per_Row   <- c()
Data_Index_Row <- c()

for (i in 1:Rows) {

    # Percentage of the all-rows total (TotalCountSum) found in this row
    OAPer <- TotalCount[i, ] / TotalCountSum * 100

    # Per-row accumulators, grown by cbind() once per column
    Data_Per_Col   <- c()
    Data_Index_Col <- c()

    for(u in 1:Vars) {
        # For every column value in the selected row
        # the percentage of that value compared to the
        # total population (TotalCount) for that row is calculated
        VarPer <- Input_Data[i, u] / TotalCount[i, ] * 100

        # Once the percentage is calculated the index
        # score is calculated by dividing this percentage
        # by the percentage of the total population in that
        # area compared to all areas (OAPer), times 100
        VarIndex <- VarPer / OAPer * 100

        # Binds results for all columns in the row
        Data_Per_Col   <- cbind(Data_Per_Col, VarPer)
        Data_Index_Col <- cbind(Data_Index_Col, VarIndex)
    }

    # Binds results for completed row with previously completed rows
    Data_Per_Row   <- rbind(Data_Per_Row, Data_Per_Col) 
    Data_Index_Row <- rbind(Data_Index_Row, Data_Index_Col) 
}
# Restore the value-column names on both result tables
colnames(Data_Per_Row)   <- VarNames
colnames(Data_Index_Row) <- VarNames

# Linearly rescales the index scores so that the smallest score maps to -1
# and the largest maps to 1
OldRange   <- (max(Data_Index_Row) - min(Data_Index_Row))  
NewRange   <- (1 - -1)  
Data_Index <- (((Data_Index_Row - min(Data_Index_Row)) * NewRange) / OldRange) + -1
Data_Percentages <- Data_Per_Row

# Final outputs
Data_Index
Data_Percentages

我遇到的问题是代码非常慢。我希望能够在具有 200,000 行和 200 列的数据集上使用它（目前使用代码大约需要 4 天）。我确信必须有一种方法可以加快这个过程，但我不确定具体如何。

代码所做的是（在此示例中）获取一张按年龄段和不同区域划分的人口计数表，并将其中的计数转换为百分比和指数分数。目前代码中有 2 个循环，用来逐个选取所有行和列中的每个值并对其进行计算。我认为正是这些循环导致运行缓慢。是否有替代方案可以产生相同的结果，但速度更快？感谢您的任何帮助。

score 0 · Accepted Answer

去掉“i”循环，改用 apply 来计算 OAPer：

 # Percentage of the overall total (TotalCountSum) held by each row of
 # TotalCount, computed row-wise with apply() instead of the outer "i" loop.
 OAPer <- apply(
   TotalCount,
   MARGIN = 1,
   FUN = function(row_total) row_total / TotalCountSum * 100
 )

同样，您也可以对“u”循环内的工作进行矢量化。（要是您的代码中能有一些注释就好了。）

score 0 · Accepted Answer

下面是对您整段代码的改写。for 循环不是必需的，apply 也同样不需要。除法可以直接对整个矩阵一次性完成。

# Vectorised replacement for the nested for-loops: all percentages and index
# scores are computed with whole-table arithmetic rather than cell by cell.
# Expects Input_Data to have the row total in its first column and the
# per-category counts in the remaining columns.
df <- Input_Data

# Per-row total and the total over all rows.
total_count <- df[, 1]
total_sum   <- sum(total_count)

# Drop the total column; drop = FALSE keeps a data frame even when only a
# single value column is left.
df <- df[, -1, drop = FALSE]

# equivalent of your for-loop
# Each row's share of the all-rows total, in percent.
oa_per <- total_count / total_sum * 100

# Arithmetic between a data frame and a vector of length nrow(df) recycles the
# vector down every column, so row i is divided by total_count[i]. This works
# for any number of columns, with no helper matrix and no hard-coded width.
Data_Per_Row   <- df / total_count * 100
Data_Index_Row <- Data_Per_Row / oa_per * 100
names(Data_Per_Row) <- names(Data_Index_Row) <- names(df)

# rest of your code: same rescaling of the index scores onto [-1, 1],
# with the minimum and maximum computed once via range().
index_limits <- range(Data_Index_Row)
OldRange <- index_limits[2] - index_limits[1]
NewRange <- (1 - -1)
Data_Index <- (((Data_Index_Row - index_limits[1]) * NewRange) / OldRange) + -1
Data_Percentages <- Data_Per_Row

r - 将计数转换为百分比和指数分数的效率

2 回答 2

Related

Reference