我正在处理一个数据集,其维度如下:
dim(data)
[1] 419612 2
第二列看起来或多或少像这样:
> unique(data[1:50,"topics"])
[1] {"dom":2.0,"moda":3.0,"rodzina":1.55,"praca":1.42,"finanse":1.96,"edukacja":1.67,"sport":1.96,"muzyka":1.52,"kuchnia":1.8,"plotka":1.8,"zdrowie":1.12,"kibic":1.8,"uroda":2.32,"gra":2.94,"motoryzacja":1.33,"kultura":1.42,"film":3.14,"podróż":1.9,"technologia":1.31}
[2] {"rodzina":2.99,"kultura":4.46,"muzyka":4.5}
[3] {"dom":1.93,"rodzina":5.37,"zwierzęta":3.0,"praca":4.3,"finanse":2.11,"sport":2.1,"muzyka":2.99,"nieruchomość":2.8,"kuchnia":6.4,"plotka":2.1,"zdrowie":3.79,"gra":4.25,"motoryzacja":2.57,"kultura":3.13,"film":4.4,"podróż":3.21}
[4] {"plotka":9.5,"uroda":10.06,"kultura":15.67,"muzyka":29.97}
[5] {"dom":2.99,"rodzina":2.5,"edukacja":3.85,"sport":1.17,"muzyka":1.23,"nieruchomość":2.95,"kuchnia":1.42,"wnętrze":1.33,"kibic":1.17,"ogród":1.33,"motoryzacja":1.17,"film":1.17,"podróż":1.57}
[6] {"kuchnia":4.38,"plotka":1.33,"rodzina":1.61,"film":1.33}
37530 Levels: {"biznes":1.0} ... {"zwierzęta":9.96,"podróż":9.97}
对于每一行,我想从 `topics` 列中选出 `:` 符号后数值(权重)最高的那个词。我尝试使用 `dplyr` 包中的 `mutate` 函数,但它似乎不起作用。字符串操作使用的是 `stringi` 包,它是 `stringr` 的更快版本。我的代码和运行结果如下。有谁知道为什么执行此操作后每一行都得到相同的值,以及如何在不使用 `for` 循环的情况下得到预期结果?
# Console transcript of the first attempt: add a column `xx` that should hold
# the top-weighted topic of each row.
# `data` is not grouped here, so the expression inside mutate() is evaluated
# once on the entire `topics` column: the regex matches of all rows are
# flattened by unlist() into a single vector, one topic/weight table is built
# from it, and `.[1,1]` yields a single value that is then assigned to every
# row of `xx`.
> data2 <- data %>%
+ mutate( xx = topics %>%
+ stri_extract_all_regex(pattern = "[a-zA-Z0-9óśćłźżęą\\.\\s]+") %>%
+ unlist %>%
+ data.frame( topic = .[seq(1,length(.), by=2)],
+ waga = .[seq(2,length(.), by=2)] ) %>%
+ select( topic, waga) %>% arrange( desc( waga)) %>%
+ unique() %>%
+ .[1,1]
+ )
> table(data2$xx)[ which(table(data2$xx) > 1) ]
kuchnia
419612
我添加了一个额外的列 `nr` 作为行号,然后很笨地按该列 `group_by`,并用 `summarise` 代替 `mutate`,这样就得到了我想要的结果……但我并不为这段代码感到自豪。还有其他想法吗?
# Per-row variant: group by `nr` (described in the surrounding text as a row
# number) so that the extraction pipeline runs separately for each group, and
# keep one value `bc1` per group.
daneBC1 <- data %>%
group_by( nr) %>%
summarise( bc1 = topics %>%
# Extract runs of letters, digits, dots and whitespace; the JSON punctuation
# ({ } " : ,) is not in the character class, so the matches alternate
# topic, weight, topic, weight, ...
stri_extract_all_regex(pattern = "[a-zA-Z0-9óśćłźżęą\\.\\s]+") %>%
unlist %>%
# Odd positions become `topic`, even positions become `waga` (the weight).
data.frame( topic = .[seq(1,length(.), by=2)],
waga = .[seq(2,length(.), by=2)] ) %>%
# Keep only the two named columns and order by `waga`, descending.
# NOTE(review): `waga` comes straight from the regex matches and is never
# converted to numeric in this block, so this ordering appears to be textual
# (e.g. "9.5" would sort above "29.97") rather than numeric - confirm;
# wrapping the weights in as.numeric() would make the sort numeric.
select( topic, waga) %>% arrange( desc( waga)) %>%
unique() %>%
# First row, first column: the topic that sorted to the top.
.[1,1] )
# Tabulate how often each topic was selected.
daneBC1$bc1 %>% table
dom edukacja film finanse gra kibic kuchnia kultura
119802 79487 55569 38134 30425 21757 16371 12356
moda motoryzacja muzyka plotka podróż praca rodzina sport
11103 7264 6357 4855 3520 3005 2317 2183
technologia uroda zdrowie
1441 1055 740
样本数据
library(archivist)
# Fetch the sample data set from the archivist repository hosted on GitHub
# (user MarcinKosinski, repo Museum), identified by the hash string below,
# and bind the result to `data`.
data <- loadFromGithubRepo(
  "97f74c5a10f510cce39eafb0d9a1a9e8",
  repo = "Museum",
  user = "MarcinKosinski",
  value = TRUE
)