3个解决方案:
一个包装器dput
(处理标准data.frames
,tibbles
和lists
)
解决read.table
方案(对于data.frames
)
解决tibble::tribble
方案(对于data.frames
,返回 a tibble
)
所有包含n
和random
参数都允许仅输入数据的头部或动态对其进行采样。
dput_small1(Df)
# Df <- data.frame(
# A = c(2, 2, 2, 6, 7, 8),
# B = structure(c(1L, 2L, 4L, NA, 3L, 3L), .Label = c("A", "G", "L",
# "N"), class = "factor"),
# C = c(1L, 3L, 5L, NA, NA, NA) ,
# stringsAsFactors=FALSE)
dput_small2(Df,stringsAsFactors=TRUE)
# Df <- read.table(sep="\t", text="
# A B C
# 2 A 1
# 2 G 3
# 2 N 5
# 6 NA NA
# 7 L NA
# 8 L NA", header=TRUE, stringsAsFactors=TRUE)
dput_small3(Df)
# Df <- tibble::tribble(
# ~A, ~B, ~C,
# 2, "A", 1L,
# 2, "G", 3L,
# 2, "N", 5L,
# 6, NA_character_, NA_integer_,
# 7, "L", NA_integer_,
# 8, "L", NA_integer_
# )
# Df$B <- factor(Df$B)
包装dput
此选项提供的输出非常接近问题中提出的输出。它很笼统,因为它实际上是包裹在 周围的dput
,但在列上单独应用。
multiline
表示“保持 dput 的默认输出布局成多行”。
dput_small1<- function(x,
name=as.character(substitute(x)),
multiline = TRUE,
n=if ('list' %in% class(x)) length(x) else nrow(x),
random=FALSE,
seed = 1){
name
if('tbl_df' %in% class(x)) create_fun <- "tibble::tibble" else
if('list' %in% class(x)) create_fun <- "list" else
if('data.table' %in% class(x)) create_fun <- "data.table::data.table" else
create_fun <- "data.frame"
if(random) {
set.seed(seed)
if(create_fun == "list") x <- x[sample(1:length(x),n)] else
x <- x[sample(1:nrow(x),n),]
} else {
x <- head(x,n)
}
line_sep <- if (multiline) "\n " else ""
cat(sep='',name," <- ",create_fun,"(\n ",
paste0(unlist(
Map(function(item,nm) paste0(nm,if(nm=="") "" else " = ",paste(capture.output(dput(item)),collapse=line_sep)),
x,if(is.null(names(x))) rep("",length(x)) else names(x))),
collapse=",\n "),
if(create_fun == "data.frame") ",\n stringsAsFactors = FALSE)" else "\n)")
}
dput_small1(list(1,2,c=3,d=4),"my_list",random=TRUE,n=3)
# my_list <- list(
# 2,
# d = 4,
# c = 3
# )
read.table
解决方案
因为data.frames
我觉得以更明确/表格格式输入很舒服。
这可以使用 来实现,然后自动重新格式化不正确read.table
的列类型。read.table
不像第一个解决方案那样通用,但对于在SO
.
dput_small2 <- function(df,
name=as.character(substitute(df)),
sep='\t',
header=TRUE,
stringsAsFactors = FALSE,
n= nrow(df),
random=FALSE,
seed = 1){
name
if(random) {
set.seed(seed)
df <- df[sample(1:nrow(df),n),]
} else {
df <- head(df,n)
}
cat(sep='',name,' <- read.table(sep="',sub('\t','\\\\t',sep),'", text="\n ',
paste(colnames(df),collapse=sep))
df <- head(df,n)
apply(df,1,function(x) cat(sep='','\n ',paste(x,collapse=sep)))
cat(sep='','", header=',header,', stringsAsFactors=',stringsAsFactors,')')
sapply(names(df), function(x){
if(is.character(df[[x]]) & suppressWarnings(identical(as.character(as.numeric(df[[x]])),df[[x]]))){ # if it's a character column containing numbers
cat(sep='','\n',name,'$',x,' <- as.character(', name,'$',x,')')
} else if(is.factor(df[[x]]) & !stringsAsFactors) { # if it's a factor and conversion is not automated
cat(sep='','\n',name,'$',x,' <- factor(', name,'$',x,')')
} else if(inherits(df[[x]], "POSIXct")){
cat(sep='','\n',name,'$',x,' <- as.POSIXct(', name,'$',x,')')
} else if(inherits(df[[x]], "Date")){
cat(sep='','\n',name,'$',x,' <- as.Date(', name,'$',x,')')
}})
invisible(NULL)
}
最简单的情况
dput_small2(iris,n=6)
将打印:
iris <- read.table(sep="\t", text="
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
5.1 3.5 1.4 0.2 setosa
4.9 3.0 1.4 0.2 setosa
4.7 3.2 1.3 0.2 setosa
4.6 3.1 1.5 0.2 setosa
5.0 3.6 1.4 0.2 setosa
5.4 3.9 1.7 0.4 setosa", header=TRUE, stringsAsFactors=FALSE)
依次执行时将返回:
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# 1 5.1 3.5 1.4 0.2 setosa
# 2 4.9 3.0 1.4 0.2 setosa
# 3 4.7 3.2 1.3 0.2 setosa
# 4 4.6 3.1 1.5 0.2 setosa
# 5 5.0 3.6 1.4 0.2 setosa
# 6 5.4 3.9 1.7 0.4 setosa
str(iris)
# 'data.frame': 6 obs. of 5 variables:
# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4
# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9
# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7
# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4
# $ Species : chr " setosa" " setosa" " setosa" " setosa" ...
更复杂
虚拟数据:
test <- data.frame(a=1:5,
b=as.character(6:10),
c=letters[1:5],
d=factor(letters[6:10]),
e=Sys.time()+(1:5),
stringsAsFactors = FALSE)
这个:
dput_small2(test,'df2')
将打印:
df2 <- read.table(sep="\t", text="
a b c d e
1 6 a f 2018-02-15 11:53:17
2 7 b g 2018-02-15 11:53:18
3 8 c h 2018-02-15 11:53:19
4 9 d i 2018-02-15 11:53:20
5 10 e j 2018-02-15 11:53:21", header=TRUE, stringsAsFactors=FALSE)
df2$b <- as.character(df2$b)
df2$d <- factor(df2$d)
df2$e <- as.POSIXct(df2$e)
依次执行时将返回:
# a b c d e
# 1 1 6 a f 2018-02-15 11:53:17
# 2 2 7 b g 2018-02-15 11:53:18
# 3 3 8 c h 2018-02-15 11:53:19
# 4 4 9 d i 2018-02-15 11:53:20
# 5 5 10 e j 2018-02-15 11:53:21
str(df2)
# 'data.frame': 5 obs. of 5 variables:
# $ a: int 1 2 3 4 5
# $ b: chr "6" "7" "8" "9" ...
# $ c: chr "a" "b" "c" "d" ...
# $ d: Factor w/ 5 levels "f","g","h","i",..: 1 2 3 4 5
# $ e: POSIXct, format: "2018-02-15 11:53:17" "2018-02-15 11:53:18" "2018-02-15 11:53:19" "2018-02-15 11:53:20" ...
all.equal(df2,test)
# [1] "Component “e”: Mean absolute difference: 0.4574251" # only some rounding error
tribble
解决方案
该read.table
选项非常易读,但不是很通用。tribble
几乎可以处理任何数据类型(尽管因素需要临时修复)。
此解决方案对于 OP 的示例不是很有用,但对于列表列非常有用(请参见下面的示例)。要使用输出,tibble
需要库。
就像我的第一个解决方案一样,它是一个包装器dput
,但不是“dputting”列,而是“dputting”元素。
dput_small3 <- function(df,
name=as.character(substitute(df)),
n= nrow(df),
random=FALSE,
seed = 1){
name
if(random) {
set.seed(seed)
df <- df[sample(1:nrow(df),n),]
} else {
df <- head(df,n)
}
df1 <- lapply(df,function(col) if(is.factor(col)) as.character(col) else col)
dputs <- sapply(df1,function(col){
col_dputs <- sapply(col,function(elt) paste(capture.output(dput(elt)),collapse=""))
max_char <- max(nchar(unlist(col_dputs)))
sapply(col_dputs,function(elt) paste(c(rep(" ",max_char-nchar(elt)),elt),collapse=""))
})
lines <- paste(apply(dputs,1,paste,collapse=", "),collapse=",\n ")
output <- paste0(name," <- tibble::tribble(\n ",
paste0("~",names(df),collapse=", "),
",\n ",lines,"\n)")
cat(output)
sapply(names(df), function(x) if(is.factor(df[[x]])) cat(sep='','\n',name,'$',x,' <- factor(', name,'$',x,')'))
invisible(NULL)
}
dput_small3(dplyr::starwars[c(1:3,11)],"sw",n=6,random=TRUE)
# sw <- tibble::tribble(
# ~name, ~height, ~mass, ~films,
# "Lando Calrissian", 177L, 79, c("Return of the Jedi", "The Empire Strikes Back"),
# "Finis Valorum", 170L, NA_real_, "The Phantom Menace",
# "Ki-Adi-Mundi", 198L, 82, c("Attack of the Clones", "The Phantom Menace", "Revenge of the Sith"),
# "Grievous", 216L, 159, "Revenge of the Sith",
# "Wedge Antilles", 170L, 77, c("Return of the Jedi", "The Empire Strikes Back", "A New Hope"),
# "Wat Tambor", 193L, 48, "Attack of the Clones"
# )