0

有人可以解释一下,为什么下面标记的那一行返回的是数字,而不是把列名设置为该行中的字符串吗?另外,如果在读取表格时加上被注释掉的 colClasses 那一行,我该如何得到正确的列名?

# Search-results page to scrape; the query string selects county ga_clarke,
# neighborhood 4025R, and sales starting 01-1998.
url<-'http://qpublic7.qpublic.net/ga_subdivison.php?county=ga_clarke&searchType=nbhd&numberValue=4025R&nameValue=&sectionValue=&townshipValue=&rangeValue=&startDate=01-1998&endDate=&startPrice=&endPrice=&startArea=&endArea=&startAcreage=&endAcreage=&saleQualification=All&saleVacant=All&propertyType=All&reasonType=All&start=0'
library(XML)
# Disabled argument for the readHTMLTable() call below (per-column classes):
#colClasses = c("character","character","character","character","integer","integer","integer","character","character","integer","character","character"),
# Parse the HTML tables found at `url`; header=F keeps every table row as data.
data<-readHTMLTable(url,header=F,as.data.frame=T)
View(data)
# NOTE(review): `data` presumably holds several tables (one per <table> on the
# page); as.data.frame() would then bind them side by side -- confirm with str(data).
csv<-as.data.frame(data)
# Use the values found in row 4 as the column names.
# NOTE(review): if the cells were read as factors (stringsAsFactors), row 4
# yields the integer level codes rather than the label text -- likely the cause
# of the numeric names asked about below.
colnames(csv)<-csv[4,] #why does this line return numbers?
# Remove spaces from the column names.
colnames(csv)<-gsub(" ","",colnames(csv))
View(head(csv))
# Drop the first four rows (those up to and including the row used for names).
csv<-csv[-c(1:4),]
#####
View(csv)
4

1 回答 1

1

你被 `stringsAsFactors` 坑了。另外,您对 `as.data.frame` 的调用并不是正确的做法(查看数据结构时请使用 `str`,而不是 `View`)。

# Scrape the neighborhood sales listing with the XML package.
library(XML)

URL <- 'http://qpublic7.qpublic.net/ga_subdivison.php?county=ga_clarke&searchType=nbhd&numberValue=4025R&nameValue=&sectionValue=&townshipValue=&rangeValue=&startDate=01-1998&endDate=&startPrice=&endPrice=&startArea=&endArea=&startAcreage=&endAcreage=&saleQualification=All&saleVacant=All&propertyType=All&reasonType=All&start=0'

# readHTMLTable() returns one element per table on the page; keep the second.
# stringsAsFactors = FALSE leaves every cell as character, so the header row
# can be reused as names without turning into factor codes.
csv <- readHTMLTable(
  URL,
  header = FALSE,
  as.data.frame = TRUE,
  stringsAsFactors = FALSE
)[[2]]

# Promote row 4 to column names, then strip the spaces from those names.
names(csv) <- csv[4, ]
names(csv) <- gsub(" ", "", names(csv))

# Discard the four leading rows (everything up to and including the header row).
csv <- csv[-seq_len(4), ]

dplyr::glimpse(csv)

## Observations: 52
## Variables: 11
## $ \/ParcelNumber\/ (chr) "173C2 F023", "173C2 G009", "173C2 G007", "17...
## $ PropertyType       (chr) "R", "R", "R", "R", "R", "R", "R", "R", "R"...
## $ SaleDate           (chr) "12-2015", "08-2015", "08-2015", "07-2015",...
## $ SalePrice          (chr) "200,000", "265,000", "210,000", "188,000",...
## $ HeatedSqFt         (chr) "1,538", "1,756", "1,415", "1,125", "1,559"...
## $ Acres              (chr) "0.30", "0.37", "0.37", "0.38", "0.32", "0....
## $ SaleQual           (chr) "Q", "Q", "Q", "Q", "Q", "Q", "U", "Q", "Q"...
## $ Reason             (chr) "FM", "FM", "FM", "FM", "FM", "FM", "B", "F...
## $ YearBuilt          (chr) "1952", "1954", "1963", "1963", "1998", "19...
## $ LocationAddress    (chr) "155 HARDIN DR", "140 HARDIN DR", "150 HARD...
## $ Neighborhood       (chr) "4025R-RIVERDALE", "4025R-RIVERDALE", "4025...


# or use the more modern xml2 ---------------------------------------------

library(xml2)
library(rvest)

# Download and parse the page; URL is the query address defined earlier.
pg <- read_html(URL)

# The sales listing is the second <table>; fill = TRUE pads short rows with NA.
csv2 <- html_table(html_nodes(pg, "table")[[2]], fill = TRUE)

# Drop the two extra columns (12 and 13) first, so that the header row and the
# table have the same width when the names are assigned.
csv2 <- csv2[, -c(12:13)]

# Take the column titles from row 4 of csv2 itself. The previous version read
# them from `csv`, which only worked when the XML snippet had been run first
# (and by then row 4 of `csv` was already a data row, not the header).
# NOTE(review): the cell text appears to carry trailing whitespace, possibly
# non-breaking spaces, so all whitespace is stripped -- confirm on live data.
colnames(csv2) <- gsub(
  "[[:space:]\u00a0]",
  "",
  unlist(csv2[4, ], use.names = FALSE)
)

# Remove the four leading rows (everything up to and including the header row).
csv2 <- csv2[-c(1:4), ]

dplyr::glimpse(csv2)

## Observations: 52
## Variables: 11
## $ \/ParcelNumber\/ (chr) "173C2 F023 ", "173C2 G009 ", "173C2 G007 ", ...
## $ PropertyType       (chr) "R ", "R ", "R ", "R ", "R ", "R ", "R ", "...
## $ SaleDate           (chr) "12-2015 ", "08-2015 ", "08-2015 ", "07-201...
## $ SalePrice          (chr) "200,000 ", "265,000 ", "210,000 ", "188,00...
## $ HeatedSqFt         (chr) "1,538 ", "1,756 ", "1,415 ", "1,125 ", "1,...
## $ Acres              (chr) "0.30 ", "0.37 ", "0.37 ", "0.38 ", "0.32 "...
## $ SaleQual           (chr) "Q ", "Q ", "Q ", "Q ", "Q ", "Q ", "U ", "...
## $ Reason             (chr) "FM ", "FM ", "FM ", "FM ", "FM ", "FM ", "...
## $ YearBuilt          (chr) "1952 ", "1954 ", "1963 ", "1963 ", "1998 "...
## $ LocationAddress    (chr) "155 HARDIN DR ", "140 HARDIN DR ", "150 HA...
## $ Neighborhood       (chr) "4025R-RIVERDALE ", "4025R-RIVERDALE ", "40...
于 2016-03-13T23:24:29.853 回答