它有点手动,我不知道你的数据是如何格式化的,但是通过一些修补应该可以完成工作:
编辑:它没有使用 qdap,但我认为这不是问题的关键部分。
第二次编辑:我忘记了替换,更正了下面的代码。
# Packages: data.table for tabular work, tm for text cleaning,
# magrittr for the %>% pipe.
library(data.table)
library(tm) # Functions with tm:: below
library(magrittr)

# Toy data: one space-separated keyword string per ID.
dt <- data.table(
  ID = 1L:4L,
  Keywords = c(
    paste('112 mills', 'open heart', 'surgery', 'great', 'great job', sep = ' '),
    paste('Ausie', 'open', 'heart out', sep = ' '),
    paste('opened', 'heartily', '56mg)_job', 'orders12', sep = ' '),
    paste('order', 'macD', sep = ' ')))

# One row per (ID, token): split each keyword string on single spaces.
# dt_2 <- data.table(Tokens = tm::scan_tokenizer(dt[, Keywords]))
dt_2 <- dt[, .(Tokens = strsplit(Keywords, split = ' ') %>% unlist()), by = ID]

# Clean each token: tokenize, then strip punctuation and digits.
# Tokens that are all digits (e.g. '112') end up as empty strings.
dt_2[, Words := Tokens %>%
  tm::scan_tokenizer() %>%
  tm::removePunctuation() %>%
  tm::removeNumbers()]

# Stem the cleaned words with tm.
dt_2[, Stems := tm::stemDocument(Words)]
dt_2
# ID Tokens Words Stems
# 1: 1 112
# 2: 1 mills mills mill
# 3: 1 open open open
# 4: 1 heart heart heart
# 5: 1 surgery surgery surgeri
# 6: 1 great great great
# 7: 1 great great great
# 8: 1 job job job
# 9: 2 Ausie Ausie Ausi
# 10: 2 open open open
# 11: 2 heart heart heart
# 12: 2 out out out
# 13: 3 opened opened open
# 14: 3 heartily heartily heartili
# 15: 3 56mg)_job mgjob mgjob
# 16: 3 orders12 orders order
# 17: 4 order order order
# 18: 4 macD macD macD
# Frequencies: how often each cleaned word appears across all IDs.
dt_2[, .(N = .N), by = Words]
# Words N
# 1: 1
# 2: mills 1
# 3: open 2
# 4: heart 2
# 5: surgery 1
# 6: great 2
# 7: job 1
# 8: Ausie 1
# 9: out 1
# 10: opened 1
# 11: heartily 1
# 12: mgjob 1
# 13: orders 1
# 14: order 1
# 15: macD 1
第二次编辑(续):把清洗后的词重新拼接回每个 ID 的关键字字符串:
# Glue the cleaned words back into a single keyword string per ID.
res <- dt_2[, .(Keywords = Words %>% paste(collapse = ' ')), by = ID]
res
# ID Keywords
# 1: 1 mills open heart surgery great great job
# 2: 2 Ausie open heart out
# 3: 3 opened heartily mgjob orders
# 4: 4 order macD
第三次编辑:以防您的关键字以列表形式出现,并且您希望保持这种状态。
# Packages: data.table for tabular work, tm for text cleaning,
# magrittr for the %>% and %T>% pipes.
library(data.table)
library(tm) # Functions with tm:: below
library(magrittr)
# Same toy data, but Keywords is a list column: each element is a
# character vector of (possibly multi-word) keyword phrases.
dt <- data.table(
  ID = 1L:4L,
  Keywords = list(
    c('112 mills', 'open heart', 'surgery', 'great', 'great job'),
    c('Ausie', 'open', 'heart out'),
    c('opened', 'heartily', '56mg)_job', 'orders12'),
    c('order', 'macD')))
# Unnest the list column: one row per (ID, keyword phrase).
dt_2 <- dt[, .(Keywords = unlist(Keywords)), by = ID]
# Row index remembered so each original phrase can be reassembled later.
dt_2[, ID_temp := .I]
# Split every phrase into single-word tokens, grouped by ID_temp so
# each token stays tied to the phrase it came from.
dt_3 <- dt_2[, .(ID, Tokens = unlist(strsplit(unlist(Keywords), split = ' '))), by = ID_temp]
# Clean each token: tokenize, strip punctuation and digits, lower-case.
# All-digit tokens (e.g. '112') become empty strings here.
dt_3[, Words := tm::scan_tokenizer(Tokens) %>%
  tm::removePunctuation() %>%
  tm::removeNumbers() %>%
  stringr::str_to_lower()
]
# Stem the cleaned words with tm.
dt_3[, Stems := tm::stemDocument(Words)]
dt_3
# Rebuild each original phrase: re-join its cleaned words; str_trim
# removes the stray space left where a token cleaned down to "".
res <- dt_3[, .(
  ID = first(ID),
  Keywords = paste(Words, collapse = ' ') %>% stringr::str_trim()),
  by = ID_temp]
# Collapse back to one row per ID with Keywords as a list column,
# matching the input format.
res <- res[, .(Keywords = list(Keywords)), by = ID]
# Confirm format (a list of keywords in every element)
dt[1, Keywords] %T>% {print(class(.))} %T>% {print(length(.[[1]]))}
res[1, Keywords] %T>% {print(class(.))} %T>% {print(length(.[[1]]))}