arrays - 在 R 中将大量数据读入数组（scan、read.csv）

Question

有人能帮我了解如何在 R 中读入一个很大的数据集吗（33 行、38 列，并且总是每 39 列有一个以 # 开头的标题，内容为 1951-2015 年的逐日天气数据）？我把以下数据保存在一个 .txt 文件中，文件名为 test2.txt：

# 1950-01-01 00:59:00
  1 5 5 5 9
  2 3 4 5 2
# 1950-01-02 00:59:00
  4 5 4 4 3
  9 4 3 3 3
# 1950-01-03 00:59:00
  4 2 3 3 3
  2 2 2 3 9

我正在尝试将其读入 R，然后创建一个数组或一个合理的矩阵来进行计算。我尝试过使用 read.csv 和 scan，但我想我完全走错了路。有谁知道应该使用哪个命令？

 # Attempt from the question: read.csv() defaults to comment.char = "", so the
 # "#" header lines are not skipped and are parsed as rows alongside the data.
 read.csv("test2.txt", header=FALSE, sep="")

此外，我想在之后为行和列添加名称，但这也可以放在第二步进行。行的名称应为 A、B，列的名称应为 C、D、E、F、G，这样最终的数组就带有这些名称；我估计标题（例如 # 1950-01-03 00:59:00）可能会丢失。

score 2 · Accepted Answer

已编辑

我提供了两种独立的单行解决方案。

将文件视为固定宽度格式

# Read each 3-line record (one "#" header line plus two data lines) as a single
# row: `widths` is a list holding one vector of field widths per physical line.
#   21               -> the whole header line "# 1950-01-01 00:59:00"
#   c(1, rep(2, 4))  -> first data line: five single-digit values
#   rep(2, 5)        -> second data line: five single-digit values
# NOTE(review): these widths assume the data lines start at column 1. The
# sample in the question shows two leading spaces on each data line; if the
# real file has them, the widths must be widened to match (e.g.
# c(3, rep(2, 4)) for both data lines) -- confirm against the actual file.
# comment.char = "" stops "#" from being treated as a comment, so the header
# line is kept as a field instead of being dropped.
read.fwf("test2.txt", 
         widths = list(21, c(1, rep(2, 4)), rep(2, 5)), 
         comment.char = "")

我举例说明：

# Inline copy of the sample data: every record is one "#" header line followed
# by two lines of five single-digit values (no leading whitespace).
file <- paste(
  c(
    "# 1950-01-01 00:59:00",
    "1 5 5 5 9",
    "2 3 4 5 2",
    "# 1950-01-02 00:59:00",
    "4 5 4 4 3",
    "9 4 3 3 3",
    "# 1950-01-03 00:59:00",
    "4 2 3 3 3",
    "2 2 2 3 9"
  ),
  collapse = "\n"
)

# One width vector per physical line of a record: the 21-character header,
# then the two data lines. comment.char = "" keeps the "#" header as a field.
record_widths <- list(21, c(1, rep(2, 4)), rep(2, 5))
read.fwf(textConnection(file), widths = record_widths, comment.char = "")

                     V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
1 # 1950-01-01 00:59:00  1  5  5  5  9  2  3  4   5   2
2 # 1950-01-02 00:59:00  4  5  4  4  3  9  4  3   3   3
3 # 1950-01-03 00:59:00  4  2  3  3  3  2  2  2   3   9

将文件视为以空白分隔的表

你只需要一行 R 代码就可以做到这一点：

# "#" is set as the comment character, so each "# <timestamp>" header line is
# skipped entirely and only the whitespace-separated data rows are read.
read.table("test2.txt", comment.char = "#", header = FALSE)

这样做之所以可行，是因为 comment.char 允许您指定要忽略的文本。在您的情况下，由于标题行以 # 开头，因此 read.table() 会忽略整行。

我举例说明：

# Inline copy of the sample data. The first data line deliberately keeps its
# two leading spaces to show that read.table() ignores leading whitespace.
file <- paste(
  c(
    "# 1950-01-01 00:59:00",
    "  1 5 5 5 9",
    "2 3 4 5 2",
    "# 1950-01-02 00:59:00",
    "4 5 4 4 3",
    "9 4 3 3 3",
    "# 1950-01-03 00:59:00",
    "4 2 3 3 3",
    "2 2 2 3 9"
  ),
  collapse = "\n"
)

# Header lines start with the comment character and are therefore dropped.
read.table(header = FALSE, comment.char = "#", text = file)

  V1 V2 V3 V4 V5
1  1  5  5  5  9
2  2  3  4  5  2
3  4  5  4  4  3
4  9  4  3  3  3
5  4  2  3  3  3
6  2  2  2  3  9

score 1 · Accepted Answer

对于示例文本，我使用了以下代码：

library(stringr) # str_trim() and str_split() are stringr functions

nrrep <- 3 # lines per record: 1 header + data rows (39 in your case)
ncols <- 5 # number of values on each data line

dump <- readLines("test2.txt")

# Header positions are derived from the record length, so the file must consist
# of complete records only (no blank or truncated lines).
stopifnot(length(dump) > 0, length(dump) %% nrrep == 0)

# The first line of every record is its "# <timestamp>" header.
header_idx <- seq(1, length(dump), by = nrrep)
namelines <- str_trim(dump[header_idx])
goodlines <- str_trim(dump[-header_idx])

# byrow = TRUE keeps every file line as one matrix row; the default column-wise
# fill would scatter the values of a line across several rows.
# as.numeric() makes the matrix usable for calculations.
mymat <- matrix(
  as.numeric(unlist(str_split(goodlines, "\\s+"))),
  ncol = ncols,
  byrow = TRUE
)

# Each data row is labelled with the header of the record it belongs to.
rownames(mymat) <- rep(namelines, each = nrrep - 1)
colnames(mymat) <- paste0("Col", LETTERS[seq_len(ncols)])
mymat

##                       ColA ColB ColC ColD ColE
## # 1950-01-01 00:59:00    1    5    5    5    9
## # 1950-01-01 00:59:00    2    3    4    5    2
## # 1950-01-02 00:59:00    4    5    4    4    3
## # 1950-01-02 00:59:00    9    4    3    3    3
## # 1950-01-03 00:59:00    4    2    3    3    3
## # 1950-01-03 00:59:00    2    2    2    3    9

score 0 · Accepted Answer

我不确定您设想的最终在 R 中使用的格式是否有助于您分析数据。在不了解这些读数更多信息的情况下，下面是一种使用 base R 读入数据的方法，随后演示如何使用 tidyr 将生成的数据框从宽格式重新整理为长格式：

# Sample input: each record is a "# <timestamp>" header line followed by two
# lines of five readings.
readings_con <- textConnection("# 1950-01-01 00:59:00
  1 5 5 5 9
  2 3 4 5 2
# 1950-01-02 00:59:00
  4 5 4 4 3
  9 4 3 3 3
# 1950-01-03 00:59:00
  4 2 3 3 3
  2 2 2 3 9")
readings_raw <- readLines(readings_con)
# The connection is created already open, so readLines() leaves it open.
close(readings_con)


# Build one wide row per 3-line record: the ten readings of the two data lines
# become columns A..J and the header becomes a POSIXct `date` column.
readings_wide <- do.call(rbind, lapply(seq(1, length(readings_raw), 3), function(i) {
  # Join the two data lines with a space so that the last value of the first
  # line cannot fuse with the first value of the second line.
  tmp <- read.table(text = paste(readings_raw[(i + 1):(i + 2)], collapse = " "),
                    col.names = LETTERS[1:10])
  # Strip the leading "# " marker and any trailing spaces before parsing.
  tmp$date <- as.POSIXct(gsub("^# | *$", "", readings_raw[i]))
  tmp
}))

# Print the wide result; the "##" lines show the console output.
readings_wide
##   A B C D E F G H I J                date
## 1 1 5 5 5 9 2 3 4 5 2 1950-01-01 00:59:00
## 2 4 5 4 4 3 9 4 3 3 3 1950-01-02 00:59:00
## 3 4 2 3 3 3 2 2 2 3 9 1950-01-03 00:59:00
# Reshape wide -> long: one row per (date, reading) pair.
# NOTE(review): gather() is superseded in current tidyr; the maintained
# equivalent should be tidyr::pivot_longer(readings_wide, -date,
# names_to = "reading", values_to = "value"). It is expected to order rows by
# date rather than by reading, so the output shown below would change --
# confirm before swapping.
tidyr::gather(readings_wide, reading, value, -date)
##                   date reading value
## 1  1950-01-01 00:59:00       A     1
## 2  1950-01-02 00:59:00       A     4
## 3  1950-01-03 00:59:00       A     4
## 4  1950-01-01 00:59:00       B     5
## 5  1950-01-02 00:59:00       B     5
## 6  1950-01-03 00:59:00       B     2
## 7  1950-01-01 00:59:00       C     5
## 8  1950-01-02 00:59:00       C     4
## 9  1950-01-03 00:59:00       C     3
## 10 1950-01-01 00:59:00       D     5
## 11 1950-01-02 00:59:00       D     4
## 12 1950-01-03 00:59:00       D     3
## 13 1950-01-01 00:59:00       E     9
## 14 1950-01-02 00:59:00       E     3
## 15 1950-01-03 00:59:00       E     3
## 16 1950-01-01 00:59:00       F     2
## 17 1950-01-02 00:59:00       F     9
## 18 1950-01-03 00:59:00       F     2
## 19 1950-01-01 00:59:00       G     3
## 20 1950-01-02 00:59:00       G     4
## 21 1950-01-03 00:59:00       G     2
## 22 1950-01-01 00:59:00       H     4
## 23 1950-01-02 00:59:00       H     3
## 24 1950-01-03 00:59:00       H     2
## 25 1950-01-01 00:59:00       I     5
## 26 1950-01-02 00:59:00       I     3
## 27 1950-01-03 00:59:00       I     3
## 28 1950-01-01 00:59:00       J     2
## 29 1950-01-02 00:59:00       J     3
## 30 1950-01-03 00:59:00       J     9

score 0 · Accepted Answer

# Sample input: each record is a "# <timestamp>" header line followed by two
# lines of five readings.
file <- "# 1950-01-01 00:59:00
  1 5 5 5 9
  2 3 4 5 2
# 1950-01-02 00:59:00
  4 5 4 4 3
  9 4 3 3 3
# 1950-01-03 00:59:00
  4 2 3 3 3
  2 2 2 3 9"

library(dplyr)
library(stringr)

# Tag every line with the running count of header lines seen so far, so a
# header and the data lines that follow it share the same `index`.
con <- textConnection(file)
Imported <- data.frame(raw = readLines(con)) %>%
  mutate(index = cumsum(grepl("^\\s*#", raw)))
close(con) # the connection is created open, so readLines() leaves it open

# Header lines only: one row per record.
Dates <- filter(Imported, grepl("^\\s*#", raw))

# Data lines only, numbered within their record.
ColumnsData <- filter(Imported, !grepl("^\\s*#", raw)) %>%
  group_by(index) %>%
  mutate(sub_index = row_number()) %>%
  ungroup()

# Parse each data line into a numeric vector. Splitting on runs of whitespace
# (rather than into single characters) keeps multi-digit readings intact.
Columns <-
  do.call("rbind",
          lapply(seq_len(nrow(ColumnsData)),
                 function(i) {
                   cols <- str_split(str_trim(ColumnsData$raw[i]), "\\s+")[[1]]
                   as.numeric(cols)
                 }
          ))
Columns <- cbind(ColumnsData, as.data.frame(Columns))

# Attach the header text of each record to its data rows.
Columns <- merge(Dates, Columns,
                 by = "index")

> Columns
  index                 raw.x       raw.y sub_index V1 V2 V3 V4 V5
1     1 # 1950-01-01 00:59:00   1 5 5 5 9         1  1  5  5  5  9
2     1 # 1950-01-01 00:59:00   2 3 4 5 2         2  2  3  4  5  2
3     2 # 1950-01-02 00:59:00   4 5 4 4 3         1  4  5  4  4  3
4     2 # 1950-01-02 00:59:00   9 4 3 3 3         2  9  4  3  3  3
5     3 # 1950-01-03 00:59:00   4 2 3 3 3         1  4  2  3  3  3
6     3 # 1950-01-03 00:59:00   2 2 2 3 9         2  2  2  2  3  9

这不是一个特别优雅的解决方案，但它还具有索引每个日期内的行号的优点。

arrays - 在 R 中将大量数据读入数组（scan、read.csv）

4 回答 4

Related

Reference