0

I have a large table with a bunch of data in it, but the relevant columns are serialNumber and date.

My goal is to create a new table that gives me the start date and end date for each continuous run of days for each serial number. Like This:

serialNumber,    minDate,      maxDate
1111,            2009-02-15,   2011-07-01
1111,            2014-09-01,   2015-04-12
1111,            2017-12-11,   NA
2222,            2016-07-11,   2018-07-01

I have been able to get the data I need one serial number at a time by running snippets of the code below, but I've been stumped trying to get my script to output the data in the above format.

Here is my script:

library(RMySQL)
library(dplyr)

db <- dbConnect(MySQL(), user=username, password=password, 
            dbname='database', host='host')
results = data.frame(serialNumber = numeric(), minDate = as.Date(numeric(), origin="1970-01-01"), maxDate = as.Date(numeric(), origin="1970-01-01"))

queryUniqueSerialNumbers <- "SELECT DISTINCT(serialNumber) FROM myTable"
uniqueSerialNumberIds <- dbGetQuery(db, queryUniqueSerialNumbersIds)

geTimeDataForGivenSerialNumber <- function(serialNumber) {
  queryTimeData <- paste0("SELECT * FROM myTable WHERE serialNumber = ", serialNumber) 
  timeData <- dbGetQuery(db, queryTimeData)
  dateRanges <- as.vector(rle(timeData$date)$values)
  unbrokenRuns <- split(as.Date(dateRanges), cumsum(c(TRUE, diff(as.Date(dateRanges)) != 1L)))
  record <- createRecordOfTimeSpan(unbrokenRuns)
  serialNumbers <- as.list(rep(serialNumberNumber, length(results)))
  results <- cbind(serialNumbers, record)
  return(results)
}

createRecordOfTimeSpan <- function(unbrokenRuns) {
  mins <- lapply(unbrokenRuns, min)
  maxs <- lapply(unbrokenRuns, max)
  record <- data.frame(minDate = mins, maxDate = maxs)
  return(record)
}

results <- as.data.frame(lapply(uniqueSerialNumbers, getTimeDataForGivenserialNumber))
4

0 回答 0