1

我正在研究阅读每月时间序列数据的方法,这些数据没有以“日期”列和“数据”列的宽格式整齐排列。例如,SEMI的这个电子表格有按月份和地区组织的数据块,但年份是分开的并且在不连续的块中,以 YYYY 形式作为每个块前面的标题的年份。

非连续数据

我的目标是将此数据转换为连续块,其中第 1 列中的月度日期和第 2:6 列中的区域数据。将此电子表格导出为制表符分隔文件后(我发现两者gdata都有XLConnect问题,您可以在屏幕截图中看到的那种合并单元格存在问题)我读了它并获取了一个子集,这是dput下面的源代码。

我采取了首先使用以下方法去除空行的方法:

mydf <- mydf[which(grepl("^$", mydf$January) == FALSE),]

然后在 Region 列中为具有年份的行添加一个标签 - 方便地,它总是出现在第二个('January')列中。

mydf[which(nchar(mydf$January) == 4) ,'Region'] <- 'mydate'

下一步是用每月日期填充这些“年”行中从 1 月到 12 月的列。我想一旦我每个月都有一个唯一的日期,我就可以使用ddply或其他东西来处理它。

mydf[which(mydf$Region == 'mydate'), 2:13] <- apply(mydf[which(mydf$Region == 'mydate'), 2:13], 1, function(x) as.character(seq(as.Date(paste(x['January'],"-01-01", sep = "")), as.Date(paste(x['January'],"-12-01", sep = "")), by = 'month')))

这并没有像我预期的那样工作,因为该apply函数没有按照我希望的方式生成日期 - 它们不是按顺序排列的。我将非常感谢(a)apply步骤的特定修复或(b)可能更简单或更容易的替代方法。

数据和代码如下:

mydf <- structure(list(Region = c("", "Americas", "Europe", "Japan",
"Asia Pacific", "Worldwide", "", "", "Americas", "Europe", "Japan",
"Asia Pacific", "Worldwide", "", "", "Americas", "Europe", "Japan",
"Asia Pacific", "Worldwide", "", "", "", "Americas", "Europe",
"Japan", "Asia Pacific", "Worldwide", "", "", "Americas", "Europe",
"Japan", "Asia Pacific", "Worldwide"), January = c("1980", "413136",
"189577", "34033", "39868", "676614", "", "1981", "445504", "277290",
"33970", "44642", "801406", "", "1982", "445300", "226274", "34404",
"44989", "750967", "", "January", "1983", "457604", "232443",
"34326", "46247", "770621", "", "1984", "731009", "285740", "205644",
"85426", "1307820"), February = c("", "423748", "234818", "35104",
"42398", "736069", "", "", "440225", "274526", "33795", "44005",
"792550", "", "", "438332", "226806", "33359", "44020", "742517",
"", "February", "", "457899", "233560", "32604", "46184", "770247",
"", "", "790963", "307735", "381282", "102791", "1582770"), March = c("",
"436152", "281353", "34456", "46555", "798516", "", "", "434628",
"267259", "33709", "45206", "780802", "", "", "441313", "235612",
"32380", "43600", "752905", "", "March", "", "459498", "234986",
"31544", "48178", "774206", "", "", "856970", "339674", "574527",
"118091", "1889262"), April = c("", "455673", "288710", "34451",
"48585", "827419", "", "", "443285", "264405", "34823", "47192",
"789705", "", "", "465613", "246425", "33618", "46274", "791930",
"", "April", "", "484299", "243867", "32719", "52333", "813218",
"", "", "909873", "364465", "627400", "126954", "2028693"), May = c("",
"474441", "297343", "35092", "51102", "857977", "", "", "451221",
"255887", "35499", "48459", "791065", "", "", "487738", "249522",
"34339", "47727", "819325", "", "May", "", "507807", "246136",
"34708", "59300", "847950", "", "", "969553", "382706", "655862",
"133455", "2141576"), June = c("", "475552", "299427", "35743",
"51440", "862162", "", "", "453152", "242889", "35798", "48147",
"779986", "", "", "488564", "241273", "34360", "48871", "813068",
"", "June", "", "528620", "246710", "37345", "62910", "875586",
"", "", "991274", "388697", "672773", "135550", "2188294"), July = c("",
"473007", "302075", "37771", "51027", "863880", "", "", "454387",
"231097", "35402", "47468", "768353", "", "", "480702", "229555",
"33915", "49112", "793284", "", "July", "", "543063", "241211",
"40403", "66658", "891335", "", "", "1005742", "395852", "683854",
"138853", "2224302"), August = c("", "462125", "294497", "37628",
"49773", "844023", "", "", "450648", "213017", "34363", "46614",
"744642", "", "", "472486", "215763", "32866", "48620", "769734",
"", "August", "", "565034", "236353", "42524", "66853", "910763",
"", "", "1010739", "393337", "691731", "141101", "2236908"),
    September = c("", "461968", "295501", "37310", "50280", "845059",
    "", "", "459276", "215403", "33801", "47297", "755777", "",
    "", "475729", "219643", "33083", "47540", "775994", "", "September",
    "", "593019", "244979", "44108", "70242", "952348", "", "",
    "1035725", "408658", "698992", "141944", "2285320"), October = c("",
    "459862", "296522", "36399", "51220", "844003", "", "", "465096",
    "218792", "34168", "47369", "765424", "", "", "467151", "225828",
    "33667", "47890", "774536", "", "October", "", "618854",
    "259807", "47622", "71345", "997628", "", "", "1033560",
    "421043", "710563", "140154", "2305320"), November = c("",
    "456832", "296283", "35769", "50531", "839415", "", "", "467288",
    "232593", "35039", "47415", "782335", "", "", "461950", "237117",
    "35672", "47285", "782024", "", "November", "", "641864",
    "275099", "50371", "72095", "1039428", "", "", "1008836",
    "441652", "732948", "133861", "2317297"), December = c("",
    "460343", "291348", "35781", "48298", "835771", "", "", "460574",
    "231461", "35971", "47173", "775179", "", "", "462919", "235861",
    "36251", "47974", "783006", "", "December", "", "672533",
    "276525", "54603", "74717", "1078379", "", "", "982210",
    "442448", "731546", "132982", "2289187")), .Names = c("Region",
"January", "February", "March", "April", "May", "June", "July",
"August", "September", "October", "November", "December"), row.names = 29:63, class = "data.frame")

mydf <- mydf[which(grepl("^$", mydf$January) == FALSE),] # remove rows with nothing in the January column
mydf[which(nchar(mydf$January) == 4) ,'Region'] <- 'mydate' # add a row label for 'year' rows

mydf[which(mydf$Region == 'mydate'), 2:13] <- apply(mydf[which(mydf$Region == 'mydate'), 2:13], 1, function(x) as.character(seq(as.Date(paste(x['January'],"-01-01", sep = "")), as.Date(paste(x['January'],"-12-01", sep = "")), by = 'month')))
4

3 回答 3

3

您可以使用xlsReadWritereshape2

 library(xlsReadWrite)
 tdata<-read.xls('GSR1976-June 2012.xls',stringsAsFactors=F)
 tdata[85,2]<-1987 # fix for missing year
 tdata[228,2]<-2007 # fix for missing year
 year.marker<-c(grep('^[[:digit:]]{4}$',tdata[,2]),270)

 temp.df<-NULL

 for(i in seq_along(year.marker)[-length(year.marker)]){
   dum.df<-cbind(tdata[year.marker[i],2],tdata[(year.marker[i]+1):(year.marker[i+1]-2),])
   temp.df<-rbind(temp.df,dum.df)
 }

 names(temp.df)<-c('year','region',month.name)

 df1<-temp.df[!temp.df[,'region']=='',]
 library(reshape2)
 df2<-melt(df1, id.vars=c("region", "year"))
于 2012-08-18T12:35:27.640 回答
2

我采取了以下方法:

首先,我将您的文件转换为 CSV,然后读取其中的行。我曾经grep()找到“Americas”,这是每组的第一行。我手动输入了开始和结束年份,但其中一些grep可能也可以使用。

temp = readLines("GSR1976-June 2012.csv")
START = grep("Americas", temp)
YEARS = 1976:2012

之后,我创建了一个data.frames 列表,每年一个。

temp1 = lapply(1:length(YEARS), 
               function(x) read.csv("GSR1976-June 2012.csv",
                                    header=FALSE, skip=START[x]-1,
                                    nrows=5))
names(temp1) = YEARS

然后,我将它们合二为一data.frame并进行了一些清理。

temp2 = do.call(rbind, temp1)
names(temp2) = c("region", "jan", "feb", "mar", "apr", "may", "jun",
                 "jul", "aug", "sep", "oct", "nov", "dec")
temp2$year = rep(YEARS, each=5)

您没有指定要进行哪种类型的重塑,但如果您想从宽变长,最简单的方法是使用reshape2包:

library(reshape2)
temp3 = melt(temp2, id.vars=c("region", "year"))
list(head(temp3), tail(temp3))
# [[1]]
#         region year variable  value
# 1     Americas 1976      jan     NA
# 2       Europe 1976      jan     NA
# 3        Japan 1976      jan     NA
# 4 Asia Pacific 1976      jan     NA
# 5    Worldwide 1976      jan     NA
# 6     Americas 1977      jan 195638
# 
# [[2]]
#            region year variable    value
# 2215    Worldwide 2011      dec 23832532
# 2216     Americas 2012      dec       NA
# 2217       Europe 2012      dec       NA
# 2218        Japan 2012      dec       NA
# 2219 Asia Pacific 2012      dec       NA
# 2220    Worldwide 2012      dec       NA

然后,对于听起来您正在寻找的输出,请使用dcast()

temp4 = dcast(temp3, year + variable ~ region)
head(temp4)
#   year variable Americas Asia Pacific Europe Japan Worldwide
# 1 1976      jan       NA           NA     NA    NA        NA
# 2 1976      feb       NA           NA     NA    NA        NA
# 3 1976      mar   178295        16761  55602 10805    261463
# 4 1976      apr   178961        16513  60959 11589    268022
# 5 1976      may   187076        17396  62329 12435    279235
# 6 1976      jun   193675        17712  61676 14411    287475
于 2012-08-18T12:28:32.217 回答
1

使用XLConnect可以轻松地直接从 Excel 文件处理上述数据集,如下所示:

require(XLConnect)
require(reshape2)

# Load Excel workbook
wb = loadWorkbook("~/Downloads/GSR1976-June 2012.xls")

# Read data from 1st worksheet, starting at row 7 with predefined column types
data = readWorksheet(wb, sheet = 1, startRow = 7, 
    colTypes = c("character", rep("numeric", 12)))
# Rename first column and keep month names
colnames(data)[1] = "Region"
months = names(data)[-1]

# The data of merged cells (years) is in the first cell of the merged region
years = ifelse(is.na(data$Region), data$January, NA)
idx = !is.na(years)

# Replicate year information to form a new column 'Year'
data$Year = rep(years[idx], times = diff(c(which(idx), length(years) + 1)))

# Remove any rows where 'Region' is missing (^= non-data rows)
data = data[!is.na(data$Region), ]

# Reshape (wide --> long)
data = melt(data, measure.vars = months, variable.name = "Month")
于 2012-08-19T19:18:04.173 回答