0

我遇到了一个非常奇怪的问题,即双精度矩阵变成字符串矩阵。我将代码缩减为以下问题:

num.samples <- nrow(expr.matrix)
num.genes <- ncol(expr.matrix)
gene.names <- colnames(expr.matrix)

# Define a function which returns a vector in order to... 
execute.per.gene <- function(target.gene, ...) {
    # Uninteresting code
    x <- expr.matrix[,setdiff(1:num.genes, target.gene)]
    y <- expr.matrix[,target.gene]
    rf <- randomForest(x, y, mtry=10, ntree=100, importance=TRUE)

    # Calculate importance measure
    im <- importance(rf)[,"IncNodePurity"]

    # Divide by number of samples
    im / num.samples
}

# ... execute mclapply!
all.output <- mclapply(1:num.genes, execute.per.gene, mc.cores=mc.cores)

# Initialise matrix
weight.matrix <- matrix(0.0, nrow=num.genes, ncol=num.genes)
rownames(weight.matrix) <- gene.names
colnames(weight.matrix) <- gene.names

# And now I merge the results from 'all.output' into the weight.matrix
for (target.gene in 1:num.genes) {
    # Get result
    im <- all.output[[target.gene]]

    # Find which rows to change for this column
    cand.tf.idx <- match(names(im), gene.names)

    # Merge results into output matrix
    weight.matrix[cand.tf.idx, target.gene] <- im
}

# And suddenly, the matrix consists of a bunch of strings!
if (!is.numeric(weight.matrix[[1,1]])) { # dafuq
    cat("\nEncountered strings! :/\n")
    print(weight.matrix)
    # recently added this for debugging purposes:
    print(sapply(all.output, class))
    browser()
}

输出是:

Encountered strings! :/
      G1  G7  G9  G23 G26 G28 G29 G33 G44 G48 G50 G52 G55 G59 G63 G64 G69 G70
G1    "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
G7    "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
G9    "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
G23   "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
G26   "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
...

而且我知道为什么矩阵会变成一堆包含零的字符串。毕竟,在该execute.per.gene方法中,我做的最后一件事是将向量除以一个整数,此时问题不会引发异常,因此此时im必须仍然是双精度向量。

有人看到问题出在哪里吗?我究竟做错了什么?


更新,im应该总是看起来像下面这样。如果im已经由字符串组成,我im / num.samples认为会失败。

    > dput(im)
    structure(c(3.86421872658217, 0.0600404651226161, 0.0729843866848986, 
    0.0556398483535666, 0.0488815568218319, 0.0526059937835038, 0.170688282373908, 
    0.129655447072086, 0.174050696716209, 0.244770969072866, 0.170282014024477, 
    0.100440545265572, 0.0634773436494396, 0.0696835665372604, 0.118303002740336, 
    0.0493612110879677, 0.103414668075989, 0.0149516634700066, 0.0377397612656266, 
    0.0462366818296757, 0.0534595079995701, 0.0418429987271517, 0.0521335103883387, 
    0.0454590053400778, 0.0620792864477719, 0.0528642019860386, 0.0440233200010488, 
    ....
    2.4293680818691, 0.0455845647048088, 0.0480473721971548, 0.0493418345253576, 
    0.0468391879447859, 1.53509517636789, 0.0639471582428624, 0.155340800410008, 
    0.0668494853135931, 0.0436381864919185, 1.09024170028797, 0.0649503734307499, 
    0.0490042073829033, 0.0304435411561372, 0.034892331733943, 0.0759421587532521, 
    0.0666974014679768, 0.913196971375135, 0.0550660353121449, 1.36191204205922, 
    3.63194611493454, 0.177078251458191, 0.17856008667256, 0.0499985787306069, 
    0.0465138307009715, 0.071656156183379, 0.0441178391009568, 0.239933902772204, 
    0.0719828575374175, 0.0654148345872996, 0.920668929212975, 0.0454979263784418, 
    2.92899170564573, 0.0208273505572265, 0.0397416566013167, 0.197310579354446, 
    0.0313568556466712), .Names = c("sample_2", "sample_3", "sample_4", 
    "sample_5", "sample_6", "sample_7", "sample_8", "sample_9", "sample_10", 
    "sample_11", "sample_12", "sample_13", "sample_14", "sample_15", 
    "sample_16", "sample_17", "sample_18", "sample_19", "sample_20", 
    "sample_21", "sample_22", "sample_23", "sample_24", "sample_25", 
    "sample_26", "sample_27", "sample_28", "sample_29", "sample_30", 
    "sample_31", "sample_32", "sample_33", "sample_34", "sample_35", 
    "sample_36", "sample_37", "sample_38", "sample_39", "sample_40", 
    "sample_41", "sample_42", "sample_43", "sample_44", "sample_45", 
    "sample_46", "sample_47", "sample_48", "sample_49", "sample_50", 
    "sample_51", "sample_52", "sample_53", "sample_54", "sample_55", 
    "sample_56", "sample_57", "sample_58", "sample_59", "sample_60", 
    "sample_61", "sample_62", "sample_63", "sample_64", "sample_65", 
    "sample_66", "sample_67", "sample_68", "sample_69", "sample_70", 
    "sample_71", "sample_72", "sample_73", "sample_74", "sample_75", 
    ....
    "sample_801", "sample_802", "sample_803", "sample_804", "sample_805"
    ))

以及有关gene.names和的更多信息:num.samplesnum.genes

> dput(num.samples)
1643L
> dput(num.genes)
805L
> dput(gene.names)
c("sample_1", "sample_2", "sample_3", "sample_4", "sample_5", 
"sample_6", "sample_7", "sample_8", "sample_9", "sample_10", 
"sample_11", "sample_12", "sample_13", "sample_14", "sample_15", 
"sample_16", "sample_17", "sample_18", "sample_19", "sample_20", 
"sample_21", "sample_22", "sample_23", "sample_24", "sample_25", 
"sample_26", "sample_27", "sample_28", "sample_29", "sample_30", 
"sample_31", "sample_32", "sample_33", "sample_34", "sample_35", 
"sample_36", "sample_37", "sample_38", "sample_39", "sample_40", 
...
"sample_741", "sample_742", "sample_743", "sample_744", "sample_745", 
"sample_746", "sample_747", "sample_748", "sample_749", "sample_750", 
"sample_751", "sample_752", "sample_753", "sample_754", "sample_755", 
"sample_756", "sample_757", "sample_758", "sample_759", "sample_760", 
"sample_761", "sample_762", "sample_763", "sample_764", "sample_765", 
"sample_766", "sample_767", "sample_768", "sample_769", "sample_770", 
"sample_771", "sample_772", "sample_773", "sample_774", "sample_775", 
"sample_776", "sample_777", "sample_778", "sample_779", "sample_780", 
"sample_781", "sample_782", "sample_783", "sample_784", "sample_785", 
"sample_786", "sample_787", "sample_788", "sample_789", "sample_790", 
"sample_791", "sample_792", "sample_793", "sample_794", "sample_795", 
"sample_796", "sample_797", "sample_798", "sample_799", "sample_800", 
"sample_801", "sample_802", "sample_803", "sample_804", "sample_805"
)
4

1 回答 1

0

感谢大家的帮助,我找到了问题:)

在一些非常不可能的条件下expr.matrix包含一些 NaN 值,randomForests 算法会翻转。

提示browser()非常有用!

于 2013-05-22T22:16:37.207 回答