你也可以这样做(使用 @rengis 回答中的 `a`),
适用于 URL 中含有 `http` 或 `https` 的情况:
# Turn the key/value separator colon into ";". Colons that directly follow a
# digit (as in timestamps) or "http"/"https" (as in URLs) are skipped.
text1 <- gsub("(?<=[0-9]|http|https):(*SKIP)(*F)|:", ";", a, perl = TRUE)

# Drop lines that contain "*" (the record delimiters) and empty lines.
is_noise <- grepl("\\*|^$", text1)
text2 <- text1[!is_noise]

# Field name: everything before the first ";".
# Field value: everything after the last ";" (plus one optional space).
field_names <- gsub(";.*", "", text2)
field_values <- gsub(".*; ?", "", text2)

# Group the values by field name and bind the groups as character columns.
columns <- split(field_values, field_names)
res <- do.call(data.frame, c(columns, stringsAsFactors = FALSE))
res
# Favorite Hashtags ID MentionedEntities Origin RetCount Text
#1 false 123 abc 0 abc
#2 false rty 456 cde 0 rty
# Time Type URL
#1 Fri Jul 22 15:07:37 CDT 2011 status
#2 Thu Jul 21 14:09:47 CDT 2011 status http://ocs
或者使用 `cSplit`:
# Alternative: split each "key;value" string with cSplit() and reshape to wide.
library(data.table)
library(devtools)
# NOTE(review): source_gist() downloads and runs a script from a GitHub gist, so
# this line needs network access and executes remote code; presumably the gist
# is what defines cSplit() -- confirm before running.
source_gist(11380733)
# Split the "text2" column on ";" in "wide" form (the code below expects the
# pieces as columns text2_1 and text2_2), then add n: a running row number
# within each text2_1 group.
DT <- cSplit(as.data.frame(text2), "text2",";", "wide")[,
n:= seq_len(.N), by=text2_1]
# Reshape to one row per n and one column per text2_1 value, filled from text2_2.
dcast.data.table(DT, n~text2_1, value.var="text2_2")
# n Favorite Hashtags ID MentionedEntities Origin RetCount Text
# 1: 1 false 123 abc 0 abc
# 2: 2 false rty 456 cde 0 rty
# Time Type URL
#1: Fri Jul 22 15:07:37 CDT 2011 status
#2: Thu Jul 21 14:09:47 CDT 2011 status http://ocs
更新
根据新信息,即字段值中也会出现冒号(colons):
# Sample input: records are wrapped in "***" delimiter lines and hold one
# "Key: value" pair per line. Values may themselves contain colons.
con <- textConnection("
***
Type:status
Origin: abc
Text: abc
URL:
ID: 123
Time: Fri Jul 22 15:07:37 CDT 2011
RetCount: 0
Favorite: false
MentionedEntities:
Hashtags:
***
***
Type:status
Origin: cde: andgg
Text: rty: asndf
URL: http://ocs
ID: 456
Time: Thu Jul 21 14:09:47 CDT 2011
RetCount: 0
Favorite: false
MentionedEntities:
Hashtags: rty
***
***")
a <- readLines(con)
# readLines() leaves an already-open connection open, so release it here
# instead of leaking it until garbage collection.
close(con)

# Replace only the first colon of each line (the key/value separator) with
# ";". Later colons (timestamps, URLs, colons inside values) are untouched.
text1 <- sub("^([^:]+):(.*)", "\\1;\\2", a, perl = TRUE)

# Drop blank lines and lines made up solely of "*" delimiters. The pattern is
# anchored so that a field whose value contains "*" is kept.
is_delimiter <- grepl("^\\s*\\**\\s*$", text1)
text2 <- text1[!is_delimiter]

# Key: text before the first ";". Value: text after the first ";" (plus one
# optional space), so a ";" inside the value does not truncate it.
splitGroup <- sub(";.*", "", text2)
values <- sub("^[^;]*; ?", "", text2)

# One column per key, in order of first appearance; rows follow record order.
# Every record must provide the same set of keys.
res <- do.call(
  data.frame,
  c(
    split(values, factor(splitGroup, levels = unique(splitGroup))),
    stringsAsFactors = FALSE
  )
)
res
# Type Origin Text URL ID Time
#1 status abc abc 123 Fri Jul 22 15:07:37 CDT 2011
#2 status cde: andgg rty: asndf http://ocs 456 Thu Jul 21 14:09:47 CDT 2011
# RetCount Favorite MentionedEntities Hashtags
#1 0 false
#2 0 false rty