1

这是我可以在 Excel 中轻松完成的事情。但我被 R 弄糊涂了。

我想将国家名称分配给一长串字符串(“隶属关系”)。

c("Department of Psychiatry and Behavioural Sciences, University College London Medical School, UK.", 
"", "Ty Dewi Sant School of Nursing, University Hospital of Wales, College of Medicine, Cardiff.", 
"University of Massachusetts Medical Center.", "Older Women's League.", 
"Kimberly Quality Care, Boston, MA.", "Michaux Manor Living Center, Fayetteville, PA.", 
"Florida Diagnostic and Learning Resources System, University of South Florida, Tampa 33613.", 
"", "Bigel Institute for Health Policy, Brandeis University, Waltham, MA.", 
"", "York Health Authority.", "Southern Illinois University, Edwardsville.", 
"St. Joseph's Hospital, Memphis, TN.", "Long Term Home Care of the Frail Elderly Foundation, New York City.", 
"Catholic University of America, Washington, DC.", "Mercy Health Center, Oklahoma City, OK.", 
"", "Visiting Nurse Service of New York.", "RespiteCare Center, Evanston, IL.", 
"Camden and Islington HA.", "National Advisory Council on Aging.", 
"Visiting Nurse Service of New York.", "American Health Care Association, Washington, DC.", 
"HealthCare Partners Medical Group, Los Angeles, CA 90015, USA.", 
"Tad Publishing Company, Peoria, IL, USA.", "Child Health Investment Partnership, Roanoke, VA, USA.", 
"School of Public Health, State University of New York, Albany 12237, USA.", 
"Bundoora Extended Care Centre.", "", "", "Family Respite Center, Falls Church, VA, USA.", 
"", "University of Victoria.", "", "Homemaker Health Aide Service of the National Capital Area.", 
"West Lambeth Health Authority, London SE1 7EH.", "Bon Secours Hospital/Villa Maria Nursing Center, North Miami, FL 33161.", 
"Alzheimer's Disease and Related Disorders Association, Syracuse, NY.", 
"Alzheimer's Association, Washington DC.", "South Carolina Commission on Aging, Columbia.", 
"University of New Mexico College of Nursing.", "Department of Human Development and Family Studies, University of Alabama, Tuscaloosa.", 
"Ballard Health Care Residence, Des Plaines, IL.", "Bowman Gray School of Medicine of Wake Forest University, Winston-Salem, NC.", 
"Case Western Reserve University.", "School of Public and Environmental Administration, Indiana University, Indianapolis 46202.", 
"Manor HealthCare Corp, Silver Spring, MD.", "Relationship Builders, Napa, CA.", 
"", "", "Medical University of South Carolina, USA.", "Tokyo Metropolitan Institute of Gerontology, Itabashi, Japan. tatsuro@tmig.or.jp", 
"Medical University of South Carolina, USA.", "Royal Hospital for Sick Children, Bristol.", 
"Barefield, Ennis, Co. Clare., Ireland.", "North Georgia College, Dahlonega 30597, USA.", 
"Institute for Psychology (I), University of Wurzburg, Germany.", 
"Camborne Redruth Community Hospital, Cornwall, United Kingdom.", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "Institute of Child Health and Great Ormond Street Hospital for Children NHS Trust, London, UK.", 
"Department of Psychiatry, University of Toronto, Toronto, Ontario, Canada. carol.cohen@sunnybrook.on.ca", 
"Boston University School of Social Work, MA 02215, USA.", "", 
"Neurosciences Unit, General Infirmary at Leeds.", "", "", "School of Kang-Ning Junior College of Nursing, Nei-Hu, Taiwan, ROC.", 
"College of Nursing, South Dakota State University, USA.", "Department of Geriatric Medicine, University of Manchester, UK.", 
"Southern Illinois University, Department of Social Work, Edwardsville 62026-1450, USA.", 
"Redlands Community College, El Reno, Oklahoma, USA.", "", "", 
"Department of Geriatric Medicine, Alexandra Hospital, Singapore.", 
"School of Nursing and Midwifery, Department of Gerontological and Continuing Care Nursing, University of Sheffield, Sheffield, England. Liz.hanson@act.shef.ac.uk", 
"", "State University of New York, Health Science Center at Syracuse, 13210, USA. HAMR@mailbox.hscsyr.edu", 
"Div. of Active Palliative Care, Todachuo General Hospital.", 
"Children and Young People's Kidney Unit, Nottingham City Hospital, NHS Trust, UK.", 
"School of Nursing & Midwifery, Department of Gerontological & Continuing Care Nursing, University of Sheffield. liz.hanson@act.shef.ac.uk", 
"Harrington Memorial Hospital, Southbridge, MA, USA.", "", "Department of Curriculum and Instruction, Iowa State University, Ames, 50011. USA.", 
"Children & Young People's Kidney Unit, Nottingham City Hospital, U.K.", 
"School of Social Work, Boston University, MA 02215, USA. freedman@bu.edu", 
"Royal Free Hospital, London, UK.", "Humboldt State University, Department of Nursing, Arcata, CA, USA.", 
"Department of Psychiatry, The University of Queensland, Mental Health Centre, Royal Brisbane Hospital, Herston, Australia. davidk@psychiatry.uq.edu.au", 
"Centre for Evidence Based Nursing, University of York, Heslington, York, Nth Yorkshire, UK, YO1 5DG. cat4@york.ac.uk", 
"School of Nursing, University of British Columbia, Vancouver. magenta@bc.sympatico.ca", 
"Medisinsk avdeling, Lovisenberg Diakonale Sykehus, Oslo.", "School of Nursing, Yale University, USA.", 
"Centre de la Mémoire, Hôpital Roger Salengro, Centre Hospitalier Universitaire, Lille.", 
"University of Ulster and Eastern Health and Social Services Board, Ulster, Northern Ireland. r.mcconkey@ulst.ac.uk", 
"Thames Valley Family Practice Research Unit, Department of Family Medicine's Centre for Studies in Family Medicine, University of Western Ontario (UWO), London. jbbrown@julian.uwo.ca", 
"", "", "Department of Special Education, University of Nijmegen, The Netherlands. A.Hendriks@ped.kun.nl", 
"European Institute of Health and Medical Sciences, University of Surrey, Guildford, England.", 
"California State University School of Nursing, Chico, USA.")

每个字符串中可能包含(也可能不包含)表示地点的子字符串,而该地点本身又可能对应某个国家/地区。预期的输出是一个数据框,如下所示:

Affiliation[1], matchedCountry
Affiliation[2], matchedCountry
...
Affiliation[n], matchedCountry

“matchedCountry”旨在根据多个列表(大学、英国城市、美国各州等)进行评估,并且允许使用 NA。有些列表只返回 ISO 代码。

根据迄今为止的反馈(感谢@rbm),我已经得到了一个不错的解决方案(见答案部分)。不过,我确信性能仍有提升空间。谢谢。

参考:

  1. 同时合并列表中的多个data.frames
  2. R grepl:快速将多个字符串与多个子字符串进行匹配,返回所有匹配项
  3. R grep:将一个字符串与多个模式匹配
  4. 对 R 数据框进行快速测试以查看一列中的行值是否在数据框中的另一列内
  5. 使用列表中包含的一些但不是所有字符串中的多个模式提取和组合多个子字符串并返回到 R 中的列表
  6. 如何从R中的字符串中的多个列表中检测子字符串
4

1 回答 1

0

这是一个解决方案:它针对主列表中的每个项目检查各个子字符串列表,然后根据匹配到的列表返回:a)原始子字符串,b)相邻子字符串,或 c)固定/预定义值。结果是附加了“国家”列的原始表。

这些条件在提供的示例代码中表示。

编辑:似乎“域”查找没有按预期工作。我不太确定如何对其进行故障排除/修复,但我猜这超出了这个问题的范围......

######### GENERATE COUNTRY ID  #############

  library("stringr")
  library(RCurl)

  ## Download the reference lists and parse them into data frames

  # The two tables that carry ISO country codes are read with na.strings = "".
  # With read.csv's default (na.strings = "NA") the literal code "NA" -- the
  # ISO 3166-1 alpha-2 code of Namibia -- is turned into a missing value, and a
  # single NA among the codes makes the later str_detect()/sum() tests evaluate
  # to NA instead of TRUE/FALSE. Side effect: empty fields in these two tables
  # are read as NA rather than "".
  countryList <- read.csv(
    text = getURL("https://raw.githubusercontent.com/umpirsky/country-list/master/country/icu/en_US/country.csv"),
    stringsAsFactors = FALSE,
    na.strings = ""
  )
  usstatesList <- read.csv(
    text = getURL("https://raw.githubusercontent.com/jasonong/List-of-US-States/master/states.csv"),
    stringsAsFactors = FALSE
  )
  ukcitiesList <- read.csv(
    text = getURL("https://raw.githubusercontent.com/encyclopediaio/list-of-cities-in-the-uk/master/src/uk_cities.csv"),
    stringsAsFactors = FALSE
  )
  ukcountryList <- read.csv(
    text = getURL("https://raw.githubusercontent.com/Gibbs/UK-Postcodes/master/postcodes.csv"),
    stringsAsFactors = FALSE
  )
  # This file is read without a header row, so its columns are named V1, V2, ...
  universitiesList <- read.csv(
    text = getURL("https://raw.githubusercontent.com/endSly/world-universities-csv/master/world-universities.csv"),
    header = FALSE,
    stringsAsFactors = FALSE,
    na.strings = ""
  )

  ## Generate affiliation list from ronbun data
  # These are the strings that get classified by country further down; they are
  # taken from the Affiliation1 column of pub.data.
  # NOTE(review): pub.data is not created anywhere in this snippet, so it must
  # already exist in the session -- confirm where it is loaded.
  affiliationList <- pub.data$Affiliation1

  ## Generate email domains column and add to countryList
  # Turn codes into lower-case, dot-prefixed suffixes, e.g. "US" -> ".us".
  # Vectorised: returns one suffix per element of x.
  domains <- function(x) paste0(".", tolower(x))

  # Keep only the name and iso columns, then append the dot-prefixed,
  # lower-cased form of each ISO code as a third column named "domain".
  countryList <- countryList[c("name", "iso")]
  countryList$domain <- domains(countryList$iso)


  ## Add country names to universitiesList as V4

  # V4 starts out empty and is filled with the country name whose ISO code
  # equals the row's V1 value; rows without a matching code keep "".
  #
  # The lookup is an exact, vectorised match(). The previous row-by-row
  # str_detect() treated every ISO code as a regular expression and sat inside
  # a tryCatch() that discarded errors: as soon as one code was NA (read.csv
  # reads the text "NA" as missing by default), sum() returned NA, if() failed
  # on every row, and V4 silently stayed empty throughout.
  universitiesList <- data.frame(universitiesList, V4 = "", stringsAsFactors = FALSE)

  isoRow <- match(universitiesList$V1, countryList$iso)
  resolved <- !is.na(isoRow)
  universitiesList$V4[resolved] <- countryList$name[isoRow[resolved]]

  ### on to the main show

  # Index of the first entry of `patterns` that occurs literally in `text`, or
  # NA_integer_ when none does. Entries are matched as fixed strings rather
  # than regular expressions, so names containing ".", "(" etc. are safe, and
  # NA or empty entries are skipped instead of poisoning the result.
  firstHit <- function(text, patterns) {
    patterns <- as.character(patterns)
    usable <- which(!is.na(patterns) & nzchar(patterns))
    if (length(usable) == 0L) {
      return(NA_integer_)
    }
    found <- usable[str_detect(text, fixed(patterns[usable]))]
    if (length(found) == 0L) NA_integer_ else found[1L]
  }

  # Resolve one affiliation string to c(iso, name); both are "" when nothing
  # matches. The checks run in the same order as before and a later match
  # overrides an earlier one.
  lookupCountry <- function(v) {
    iso <- ""
    name <- ""
    if (is.na(v) || !nzchar(v)) {
      return(c(iso, name))
    }

    # 1. A country name appears in the text. When several names match, the
    #    first one in countryList is used.
    hit <- firstHit(v, countryList$name)
    if (!is.na(hit)) {
      iso <- countryList$iso[hit]
      name <- countryList$name[hit]
    }

    # 2. An entry of the UK city list or of the UK postcode list's
    #    country_string column appears in the text.
    if (!is.na(firstHit(v, ukcitiesList$name)) ||
        !is.na(firstHit(v, ukcountryList$country_string))) {
      iso <- "GB"
      name <- "United Kingdom"
    }

    # 3. A US state name appears in the text, or a state abbreviation appears
    #    as a whole token. Substring matching of abbreviations gave false
    #    positives such as "NH" inside "NHS".
    tokens <- strsplit(v, "[^[:alnum:]]+")[[1]]
    if (!is.na(firstHit(v, usstatesList$State)) ||
        any(tokens %in% usstatesList$Abbreviation)) {
      iso <- "US"
      name <- "United States"
    }

    # 4. Top-level domain of an e-mail address in the text. It is compared for
    #    equality with countryList$domain; used as a regex, ".us" would match
    #    any character followed by "us".
    email <- str_extract(v, "[[:alnum:]._%+-]+@[[:alnum:].-]+")
    if (!is.na(email)) {
      # Drop a sentence-ending dot, then keep what follows the last dot.
      tld <- tolower(sub("^.*\\.", "", sub("\\.+$", "", email)))
      # The United Kingdom's ccTLD is .uk although its ISO code is GB.
      if (tld == "uk") {
        tld <- "gb"
      }
      hit <- match(paste0(".", tld), countryList$domain)
      if (!is.na(hit)) {
        iso <- countryList$iso[hit]
        name <- countryList$name[hit]
      }
    }

    # 5. A university name (V2) appears in the text. The ISO code comes from
    #    V1 of that row and the name from countryList, falling back to V4 of
    #    the same row. The old code indexed V1 by a match against V4 here.
    hit <- firstHit(v, universitiesList$V2)
    if (!is.na(hit)) {
      iso <- universitiesList$V1[hit]
      nameRow <- match(iso, countryList$iso)
      name <- if (!is.na(nameRow)) countryList$name[nameRow] else universitiesList$V4[hit]
    }

    c(iso, name)
  }

  affiliations <- as.character(affiliationList)
  df <- data.frame(affiliationList, CountryISO = "", CountryNAME = "", stringsAsFactors = FALSE)

  for (i in seq_along(affiliations)) {
    # Stay best-effort per row, but report failures instead of hiding them.
    resolved <- tryCatch(
      lookupCountry(affiliations[i]),
      error = function(e) {
        warning("Country lookup failed for row ", i, ": ", conditionMessage(e),
                call. = FALSE)
        c("", "")
      }
    )
    df$CountryISO[i] <- resolved[1]
    df$CountryNAME[i] <- resolved[2]
  }

# return() is an error at top level (there is no function to return from);
# evaluating df yields the same value and still works as the last expression
# if this script is wrapped in a function.
df

感谢您提供的所有帮助!

于 2015-09-24T08:14:50.717 回答