1

假设我有一个 Wikipedia 文章标题列表,我想测量这些文章两两之间的词汇继承(lexical inheritance)程度。

# Wikipedia article titles whose texts are compared pairwise.
title <- c(
  "virus",
  "coronavirus",
  "Coronaviridae",
  "pandemic",
  "2019–20_coronavirus_pandemic",
  "Coronavirus_disease_2019",
  "Severe_acute_respiratory_syndrome_coronavirus_2",
  "Severe_acute_respiratory_syndrome_coronavirus",
  "Severe_acute_respiratory_syndrome-related_coronavirus",
  "syndrome",
  "disease",
  "infection"
)

下面是我目前的解决方案,但我认为对于很长的标题列表,这并不是完成此任务的最快方法。首先是使用 Wikipedia API 获取文章正文的函数:

# Fetch the plain-text extract of one English Wikipedia article.
#
# Args:
#   title: Article title as a single string,
#     e.g. "Coronavirus_disease_2019".
#
# Returns:
#   Character vector holding the whitespace-trimmed text of every
#   <extract> node in the API response. It is zero-length when the
#   response contains no such node.
#
# Raises:
#   An error (from httr::stop_for_status) when the HTTP request fails.
GetArticle <- function(title) {
  # Parameters go through `query` so that httr percent-encodes them;
  # pasting the raw title into the URL corrupts the request for titles
  # containing characters such as "&", "#", "=" or spaces.
  # Packages are referenced with `::` rather than attached with library(),
  # so calling this function does not alter the caller's search path.
  answer <- httr::GET(
    "https://en.wikipedia.org/w/api.php",
    query = list(
      action = "query",
      format = "xml",
      redirects = "1",
      prop = "extracts",
      explaintext = "1",
      titles = title
    )
  )
  # Fail loudly on HTTP errors instead of trying to parse an error page.
  httr::stop_for_status(answer)

  page.xml <- xml2::read_xml(answer)
  page <- xml2::xml_find_all(page.xml, "//extract")
  trimws(xml2::xml_text(page))
}

计算词汇继承的函数:

  # Score how much of the vocabulary of `text2` also occurs in `text1`.
  #
  # Both texts are lower-cased and split on runs of whitespace; the score
  # is |vocab(text1) n vocab(text2)| / |vocab(text2)|, where vocab() is the
  # set of distinct tokens. Using distinct tokens on both sides keeps the
  # score in [0, 1] and makes a text score exactly 1 against itself.
  #
  # Args:
  #   text1: Character vector with the candidate source text.
  #   text2: Character vector with the text whose vocabulary is examined.
  #
  # Returns:
  #   A single number in [0, 1], or NA_real_ when `text2` has no tokens
  #   (the ratio is undefined in that case).
  lexicalInheritance <- function(text1, text2) {
    # Split on any whitespace run (spaces, tabs, newlines) and drop empty
    # tokens; splitting on a single " " leaves newlines glued to words and
    # yields "" tokens for consecutive spaces.
    tokenize <- function(text) {
      tokens <- unlist(strsplit(tolower(text), "[[:space:]]+"))
      unique(tokens[!is.na(tokens) & nzchar(tokens)])
    }

    vocabulary1 <- tokenize(text1)
    vocabulary2 <- tokenize(text2)

    if (length(vocabulary2) == 0L) {
      return(NA_real_)
    }
    length(intersect(vocabulary1, vocabulary2)) / length(vocabulary2)
  }

将其应用于 Wikipedia 文章的函数:

# Compute the lexical inheritance score for every ordered pair of articles.
#
# Each title is downloaded once with GetArticle(), punctuation and digits
# are stripped, and lexicalInheritance() is evaluated for all
# length(title)^2 ordered (title1, title2) pairs. The elapsed time is
# printed before returning.
#
# Args:
#   title: Character vector of Wikipedia article titles.
#
# Returns:
#   A data frame with one row per ordered pair and the columns `title1`,
#   `title2` and `score`, where `score` is
#   lexicalInheritance(text of title1, text of title2).
MeasureLexicalInheritanceOfWikipediaArticlesFromtitle <- function(title) {
  start_time <- Sys.time()

  # Keep exactly one text per title. unlist(lapply(...)) would silently
  # drop titles for which no extract comes back and shift every later
  # text onto the wrong title.
  article <- vapply(as.character(title), function(one_title) {
    text <- GetArticle(one_title)
    if (length(text) == 0L) {
      warning("No extract returned for title: ", one_title, call. = FALSE)
      return("")
    }
    paste(text, collapse = " ")
  }, character(1), USE.NAMES = FALSE)

  cleanArticle <- gsub("[[:punct:]]", "", article)
  cleanArticle <- gsub("[0-9]", "", cleanArticle)

  titles.df <- expand.grid(title, title)
  names(titles.df) <- c("title1", "title2")

  # Pair up article *indices* (same ordering as titles.df, first column
  # varying fastest) instead of building factors out of the full texts.
  pairs <- expand.grid(
    first = seq_along(cleanArticle),
    second = seq_along(cleanArticle)
  )

  # Score every pair. Looping only over 1:length(article) computes just
  # the first n of the n^2 scores, and cbind() then recycles them.
  score <- vapply(seq_len(nrow(pairs)), function(k) {
    lexicalInheritance(
      cleanArticle[pairs$first[k]],
      cleanArticle[pairs$second[k]]
    )
  }, numeric(1))

  results.df <- cbind(titles.df, score)

  end_time <- Sys.time()
  time <- end_time - start_time
  print(time)
  results.df
}

结果及运行耗时:

# Run the pairwise measurement over all titles; the function also prints
# the elapsed time before returning the result table.
results.df <- MeasureLexicalInheritanceOfWikipediaArticlesFromtitle(title)
# Open the result table in the interactive data viewer.
View(results.df)

Time difference of 1.89983 secs

# Time one complete run of the measurement with system.time().
system.time(MeasureLexicalInheritanceOfWikipediaArticlesFromtitle(title))
utilisateur     système      écoulé 
       0.98        0.00        3.67

非常感谢您的帮助!

4

0 回答 0