I have a large table with a bunch of data in it, but the only relevant columns are `serialNumber` and `date`.
My goal is to create a new table that gives the start date and end date of each continuous run of days for each serial number, like this:
serialNumber, minDate, maxDate
1111, 2009-02-15, 2011-07-01
1111, 2014-09-01, 2015-04-12
1111, 2017-12-11, NA
2222, 2016-07-11, 2018-07-01
I have been able to get the data I need one serial number at a time by running snippets of the code below, but I've been stumped trying to get my script to output the data in the above format.
Here is my script:
library(RMySQL)
library(dplyr)
# Open the MySQL connection that every query in this script goes through.
# `username` and `password` must already be defined before this line runs.
db <- dbConnect(MySQL(), user = username, password = password,
                dbname = "database", host = "host")

# Zero-row data frame with the column names and types of the desired output.
results <- data.frame(
  serialNumber = numeric(),
  minDate = as.Date(numeric(), origin = "1970-01-01"),
  maxDate = as.Date(numeric(), origin = "1970-01-01")
)

# Fetch the distinct serial numbers; the result is a data frame with a single
# `serialNumber` column.
queryUniqueSerialNumbers <- "SELECT DISTINCT(serialNumber) FROM myTable"
# Fixed: the query was previously passed under the undefined name
# `queryUniqueSerialNumbersIds`, which made this call fail.
uniqueSerialNumberIds <- dbGetQuery(db, queryUniqueSerialNumbers)
# Build the table of unbroken date runs for a single serial number.
#
# Args:
#   serialNumber: value compared against myTable.serialNumber. It is pasted
#     into the SQL text verbatim, so it must come from a trusted source
#     (e.g. the DISTINCT query on the same table), never from user input.
#
# Returns:
#   A data frame whose first column, `serialNumber`, repeats the given serial
#   number once per run, followed by the columns produced by
#   createRecordOfTimeSpan() (intended: minDate, maxDate). Zero rows are
#   returned when the serial number has no usable dates.
#
# Relies on the global connection `db`.
geTimeDataForGivenSerialNumber <- function(serialNumber) {
  queryTimeData <- paste0("SELECT * FROM myTable WHERE serialNumber = ", serialNumber)
  timeData <- dbGetQuery(db, queryTimeData)

  # The query has no ORDER BY, so sort explicitly; unique() drops repeated
  # days and sort() drops NA dates. Run detection below needs strictly
  # increasing dates.
  dates <- sort(unique(as.Date(timeData$date)))

  if (length(dates) == 0L) {
    # Nothing to summarise: return an empty, correctly-typed table instead of
    # letting min()/max() run on empty input.
    return(data.frame(
      serialNumber = serialNumber[0L],
      minDate = dates,
      maxDate = dates
    ))
  }

  # A new run starts at the first date and wherever the gap to the previous
  # date is not exactly one day; cumsum() turns those starts into run ids.
  runIds <- cumsum(c(TRUE, diff(dates) != 1L))
  unbrokenRuns <- split(dates, runIds)
  record <- createRecordOfTimeSpan(unbrokenRuns)

  # One serial-number entry per run (i.e. per row of `record`).
  data.frame(serialNumber = rep(serialNumber, nrow(record)), record)
}
# Summarise each run of dates by its first and last day.
#
# Args:
#   unbrokenRuns: list of Date vectors, one element per run (for example the
#     output of split() on a Date vector). May be empty.
#
# Returns:
#   A data frame with one row per run and two Date columns, `minDate` and
#   `maxDate`. An empty list yields a zero-row data frame with both columns.
createRecordOfTimeSpan <- function(unbrokenRuns) {
  # Reduce each run to a single day count and rebuild the Date vector after.
  # Passing the lists from lapply() straight to data.frame() would spread
  # every run into its own column instead of its own row.
  toDate <- function(days) as.Date(days, origin = "1970-01-01")
  firstDays <- vapply(
    unbrokenRuns,
    function(run) as.numeric(min(run)),
    numeric(1),
    USE.NAMES = FALSE
  )
  lastDays <- vapply(
    unbrokenRuns,
    function(run) as.numeric(max(run)),
    numeric(1),
    USE.NAMES = FALSE
  )
  data.frame(minDate = toDate(firstDays), maxDate = toDate(lastDays))
}
# Compute the runs for every serial number and stack them into one table.
# Fixes relative to the previous one-liner:
#   * iterate over the `serialNumber` column of `uniqueSerialNumberIds`
#     (lapply() over the data frame itself would iterate over its columns);
#   * call the function under the name it is actually defined with,
#     `geTimeDataForGivenSerialNumber` (sic);
#   * stack the per-serial-number tables row-wise with bind_rows();
#     as.data.frame() on a list of data frames binds them column-wise.
results <- bind_rows(
  lapply(uniqueSerialNumberIds$serialNumber, geTimeDataForGivenSerialNumber)
)