好的,我按照@DirkEddelbuettel 的建议写了一些包装器。评论赞赏。
#' Write a table via RSQLite with factors stored in another table
#' Handles data.tables efficiently for large datasets
#' @param conn The connection object (created with e.g. dbConnect)
#' @param name The name of the table to write
#' @param value The data.frame to write to the database
#' @param factorName The base name of the tables to store the factor labels in in the SQLite database (e.g. if factorName is "_factor_" and the data.frame in value contains a factor column called "color" and the name is "mytable" then dbWriteFactorTable will create a table called mytable_factor_color which will store the levels information)
#' @param \dots Options to pass along to dbWriteTable (e.g. append=TRUE)
#' @return A boolean indicating whether the table write was successful
dbWriteFactorTable <- function( conn, name, value, factorName="_factor_", ... ) {
# Test inputs
stopifnot(class(conn)=="SQLiteConnection")
stopifnot(class(name)=="character")
stopifnot("data.frame" %in% class(value))
stopifnot(class(factorName)=="character")
if( grepl("[.]",factorName) ) stop("factorName must use valid characters for SQLite")
if( "data.table" %in% class(value) ) dt <- TRUE # Is value a data.table, if so use more efficient methods
# Convert factors to character
factorCols <- names( Filter( function(x) x=="factor", vapply( value, class, "" ) ) )
if(length(factorCols>0)) {
for( cl in which( colnames(value) %in% factorCols ) ) {
cn <- colnames(value)[cl]
factorTable <- data.frame( levels=levels(value[[ cn ]]) )
factorTable$levelKey <- seq(nrow(factorTable))
fctNm <- paste0(name,factorName,cn)
dbWriteTable( conn = conn, name = fctNm, value = factorTable, row.names=FALSE, overwrite=TRUE )
if( dt ) set( x=value, j=cl, value=as.character(value[[ cn ]]) )
}
if( !dt ) value <- japply( value, which( colnames(value) %in% factorCols ), as.character )
} else {
warning("No factor columns detected.")
}
dbWriteTable( conn = conn, name = name, value = value, ... )
}
#' Read a table via RSQLite with factors stored in another table
#' @param conn The connection object (created with e.g. dbConnect)
#' @param name The name of the table to read
#' @param query A character string containing sequel statements to be appended onto the query (e.g. "WHERE x==3")
#' @param dt Whether to return a data.table vs. a plain-old data.frame
#' @param factorName The base name of the tables to store the factor labels in in the SQLite database (e.g. if factorName is "_factor_" and the data.frame in value contains a factor column called "color" and the name is "mytable" then dbWriteFactorTable will expect there to be a table called mytable_factor_color which holds the levels information)
#' @param \dots Options to pass along to dbGetQuery
#' @return A data.table or data.frame
dbReadFactorTable <- function( conn, name, query="", dt=TRUE, factorName="_factor_", ... ) {
# Test inputs
stopifnot(class(conn)=="SQLiteConnection")
stopifnot(class(name)=="character")
stopifnot(class(factorName)=="character")
if( grepl("[.]",factorName) ) stop("factorName must use valid characters for SQLite")
# Read main table
if( dt ) {
value <- as.data.table( dbGetQuery( conn, paste("SELECT * FROM",name,query), ... ) )
} else {
value <- dbGetQuery( conn, paste("SELECT * FROM",name,query), ... )
}
# Convert factors to character
factorCols <- sub( paste0("^.*",name,factorName,"(.+)$"), "\\1",
Filter( Negate(is.na),
str_extract( dbListTables( conn ), paste0(".*",name,factorName,".*") )
)
)
if(length(factorCols>0)) {
for( cn in factorCols ) {
fctNm <- paste0(name,factorName,cn)
factorTable <- dbGetQuery( conn, paste0("SELECT * FROM ",fctNm) )
if( dt ) {
cl <- which( colnames(value) %in% cn )
set( x=value, j=cl, value=factor( value[[ cn ]], levels=factorTable$levels ) )
} else {
value[[ cn ]] <- factor( value[[ cn ]], levels=factorTable$levels )
}
}
} else {
warning("No factor columns detected.")
}
value
}
一个简单的例子:
db <- dbConnect( SQLite(), dbname="~/temp/test.sqlite" )
set.seed(1)
n <- 1000
testDat <- data.frame(key=seq(n), x=runif(n),y=runif(n),g1=sample(letters[1:10],n,replace=TRUE),g2=rep(letters[1:10],each=n/10),g3=factor( sample(letters[1:10],n,replace=TRUE) ))
if(dbExistsTable(db,"test")) dbRemoveTable(db,"test")
dbWriteFactorTable( conn = db, name = "test", value = as.data.table(testDat), row.names=FALSE )
dbReadFactorTable( conn = db, name = "test" )
dbReadFactorTable( conn = db, name = "test", query="WHERE g3=='a'" )