我正在使用 R 中的 lsa 包生成一个矩阵。创建矩阵后,我想计算矩阵中特定文档对(列)之间的余弦相似度。
目前,我正在使用嵌套的 for 循环来执行此操作,而且速度非常慢。在下面的代码中,有 150个sourceID和 6413个targetID,总共进行了 961.950 次比较。在我的数字运算机上运行了一个半小时后,它只通过了大约 300k 的数字。
有关更多信息,sourceIDs和targetIDs是列名称的向量,从包含这些名称的两个文件中加载。我想在所有源->目标对之间应用余弦函数。这些列由文档名称索引,该名称是一个字符串。
我确信使用apply 可以更快地做到这一点,但我无法理解它。
library(lsa)
# tf function
real_tf <- function(m)
{
return (sweep(m, MARGIN=2, apply(m, 2, max), "/"))
}
#idf function
real_idf <- function(m)
{
df = rowSums(lw_bintf(m), na.rm=TRUE)
return (log(ncol(m)/df))
}
#load corpus
lsa.documents <- textmatrix(args[1], minWordLength=1, minDocFreq=0)
# compute tf-idf
lsa.weighted_documents <- real_tf(lsa.documents) * real_idf(lsa.documents)
# compute svd
lsa.nspace <- lsa(lsa.weighted_documents, dims = as.integer(args[5]))
lsa.matrix <- diag(lsa.nspace$sk) %*% t(lsa.nspace$dk)
# compute similarities
lsa.sourceIDs <- scan(args[2], what = character())
lsa.targetIDs <- scan(args[3], what = character())
lsa.similarities <- data.frame(SourceID=character(), TargetID=character(), Score=numeric(), stringsAsFactors=FALSE)
k <- 1
for (i in lsa.sourceIDs)
{
for (j in lsa.targetIDs)
{
lsa.similarities[k,] <- c(i, j, cosine(lsa.matrix[,i], lsa.matrix[,j]))
k <- k + 1
}
}
lsa.ranklist <- lsa.similarities[order(lsa.similarities$Score, decreasing=TRUE),]
# save ranklist
write.table(lsa.ranklist, args[4], sep="\t", quote=FALSE, col.names=FALSE, row.names=FALSE)
编辑:可重现的例子
# cosine function from lsa package
cosine <- function( x, y )
{
return ( crossprod(x,y) / sqrt( crossprod(x)*crossprod(y) ) )
}
theMatrix <- structure(c(-0.0264639232505822, -0.0141165039351167, -0.0280459775632757,
-0.041211247161448, -0.00331565717239375, -0.0291161345945683,
-0.0451167802746869, -0.0116214407383401, -0.0381080747718958,
-1.36693644389599, 0.274747343110076, 0.128100677705483, -0.401760905661056,
-1.24876927957167, 0.368479552862631, -0.459711112157286, -0.544344448332346,
-0.765378939625159, -1.28612431910459, 0.293455499695499, 0.025167452173962
), .Dim = c(3L, 7L), .Dimnames = list(NULL, c("doc1", "doc2", "doc3",
"doc4", "doc5", "doc6", "doc7")))
sources <- c("doc1", "doc2", "doc3")
targets <- c("doc4", "doc5", "doc6", "doc7")
similarities <- data.frame(SourceID=character(), TargetID=character(), Score=numeric(), stringsAsFactors=FALSE)
k <- 1
for (i in sources)
{
for (j in targets)
{
similarities[k,] <- c(i, j, cosine(theMatrix[,i], theMatrix[,j]))
k <- k + 1
}
}
ranklist <- similarities[order(similarities$Score, decreasing=TRUE),]
write.table(ranklist, "C:\\Temp\\outputfile.txt", sep="\t", quote=FALSE, col.names=FALSE, row.names=FALSE)
产生(outputfile.txt):
doc1 doc6 0.962195242094352
doc3 doc6 0.893461576046585
doc2 doc6 0.813856201398669
doc2 doc7 0.768837903803964
doc2 doc4 0.730093288388069
doc3 doc7 0.675640649189972
doc3 doc4 0.635982900340315
doc1 doc7 0.53871688669971
doc1 doc4 0.499235059782688
doc1 doc5 0.320383772495164
doc3 doc5 0.226751624753921
doc2 doc5 0.144680489733846