1

我一直在从事一个项目,其目标是获取街道地址和邮政编码的两列 CSV,将其读入 R,然后对每一个执行 Zillow 查询(特别是 GetSearchResults),解析输出,然后将解析后的输出存储在要写入 CSV 的数据框中(并放置在现有数据旁边)。

注意:Zillow API 每次调用只能查询一个地址/邮编组合,因此任何不满足这一限制的方案都不可行。

到目前为止,我已经完成了大约 85% 的工作。我有:i) 一段代码,可以从数据框中逐个查询这些地址/邮编组合,以及 ii) 一种把查询结果放回数据框的暂定做法:

library(ZillowR)
library(rvest)
library(dplyr)
library(DT)

# this commented section is what I would use instead of creating the dataframe manually below, just for clarity
# data1 = read.csv('Addresses.csv', header = F, colClasses = 'character')$V1
# data2 = read.csv('Addresses.csv', header = F, colClasses = 'character')$V2
# data = data.frame(street = data1, city.state = as.character(data2))
# per comments, should add a "stringsAsFactors = FALSE" in the dataframe part

# Example input: one row per property to look up. `city.state` holds the ZIP
# code that is later passed to Zillow as `citystatezip`.
# stringsAsFactors = FALSE keeps both columns as character vectors on every R
# version (before R 4.0 data.frame() converted strings to factors by default).
data <- data.frame(
  street = c("77 Huntington Ave", "85 Prospect St", "219 Lincoln St"),
  city.state = rep("01752", 3),
  stringsAsFactors = FALSE
)

# Query Zillow once per row of `df` and bind the raw responses to the input.
#
# Args:
#   df:         data frame with columns `street` and `city.state`; the latter
#               is passed to Zillow as `citystatezip`.
#   address:    unused; kept so existing calls keep working.
#   city.state: unused; kept so existing calls keep working.
#
# Returns:
#   `df` with the per-row query results column-bound to it. A row whose query
#   raised an error or a warning carries NA instead of a response. A zero-row
#   `df` is returned unchanged without contacting the API. If the number of
#   collected results does not match nrow(df), the error text is printed and
#   returned instead.
get.zillowdata <- function(df, address, city.state) {
  if (!requireNamespace("ZillowR", quietly = TRUE)) {
    stop("Package 'ZillowR' is required but is not installed.", call. = FALSE)
  }
  # 'API KEY' is a placeholder; replace it with a real Zillow Web Service ID.
  ZillowR::set_zillow_web_service_id('API KEY')

  n_rows <- nrow(df)
  if (n_rows == 0L) {
    # 1:nrow(df) would iterate over c(1, 0) here and fire two bogus queries.
    message("No addresses to process.")
    return(df)
  }

  query_row <- function(i) {
    tryCatch(
      ZillowR::GetDeepSearchResults(
        # as.character() guards against factor columns (default before R 4.0).
        address = as.character(df$street[i]),
        citystatezip = as.character(df$city.state[i]),
        zws_id = getOption("ZillowR-zws_id"),
        url = "http://www.zillow.com/webservice/GetDeepSearchResults.htm"
      ),
      error = function(cond) {
        message(paste("No Data Available:", df$street[i], df$city.state[i]))
        NA # value used for this row when the query fails
      },
      warning = function(cond) {
        message(paste("Zdata caused a warning:", df$street[i], df$city.state[i]))
        NA # value used for this row when the query warns
      },
      # progress output, printed whether or not the query succeeded
      finally = {
        message(paste("Processed Address:", df$street[i], df$city.state[i]))
        message(paste(i, "of", n_rows, 'processed'))
      }
    )
  }

  responses <- lapply(seq_len(n_rows), query_row)

  # A row counts as missing when the query failed (NA) or when the returned
  # list has no `response` element. The previous count looked at
  # `results$amount`, a column that is never created, so it always reported 0.
  is_missing <- vapply(
    responses,
    function(z) {
      if (is.list(z)) is.null(z[["response"]]) else all(is.na(z))
    },
    logical(1)
  )

  results <- do.call(rbind, responses)

  if (nrow(results) == n_rows) {
    results <- cbind(df, results)

    print(paste('Original data had', n_rows, 'rows. Returning a dataframe with', nrow(results),
                'rows. Returned dataframe has', sum(is_missing), 'missing zdata values.'))

    return(results)
  } else {
    print("Error: nrows(df) do not match nrows(zdata)")
  }
}

# Run the lookup for the example addresses held in `data`.
get.zillowdata(data)
` 

以及 iii) 一个解析器,用于解析通过 Zillow API 查询时返回的 XML 节点响应,并从中提取指定的子值(zestimate、建筑面积、地块面积等,可按需指定):

library(ZillowR)
library(XML)
library(RCurl)

# 'API KEY' is a placeholder; replace it with a real Zillow Web Service ID.
set_zillow_web_service_id('API KEY')

# Single example query. The original called the GetDeepSearchResults wrapper
# while pointing it at the GetSearchResults endpoint; the wrapper now matches
# the URL that is actually requested.
# 'STREET ADDRESS' and '0ZIP CODE' are placeholders for a real address and ZIP.
output123 <- GetSearchResults(
  address = 'STREET ADDRESS',
  citystatezip = '0ZIP CODE',
  zws_id = getOption("ZillowR-zws_id"),
  url = "http://www.zillow.com/webservice/GetSearchResults.htm"
)

# Fail with a clear message when the reply has no <results> node instead of
# letting xmlToList() fail on NULL.
if (is.null(output123$response[["results"]])) {
  stop("Zillow returned no results for the requested address.", call. = FALSE)
}

results <- xmlToList(output123$response[["results"]])

# Look up the "text" entry for one bound of a valuation range.
#
# Args:
#   x:    object indexed as x["text", hilo]; may be NULL.
#   hilo: name of the bound to fetch, e.g. "low" or "high".
#
# Returns:
#   The first element of x["text", hilo] when `hilo` occurs among the
#   dimnames of `x`, otherwise NA.
getValRange <- function(x, hilo) {
  known_names <- unlist(dimnames(x))
  has_bound <- hilo %in% known_names
  # ifelse() only evaluates the lookup when `has_bound` is TRUE.
  ifelse(has_bound, x["text", hilo][[1]], NA)
}

# Walk over the columns of `results` (each column is handed to the callback as
# `property`) and collect the id, links, address and selected zestimate fields.
out <- apply(results, MARGIN = 2, function(property) {
  zest <- property$zestimate
  zestimate_fields <- list(
    amount = ifelse("text" %in% names(zest$amount), zest$amount$text, NA),
    lastupdated = zest$"last-updated",
    valueChange = ifelse(length(zest$valueChange) == 0, NA, zest$valueChange),
    valueLow = getValRange(zest$valuationRange, "low"),
    valueHigh = getValRange(zest$valuationRange, "high"),
    percentile = zest$percentile
  )
  list(
    id = property$zpid,
    unlist(property$links),
    unlist(property$address),
    zestimate_fields
  )
})

# Flatten every per-property list into a vector, stack the vectors as rows and
# number the rows 1..length(out).
data <- as.data.frame(
  do.call(rbind, lapply(out, unlist)),
  row.names = seq_along(out)
)

但我在这一点上有点卡住了。我应该如何把这些部分组合起来,以便在 API 调用部分的末尾加入解析步骤,并确保它们都能遍历完整的地址/邮编列表?我现在的代码没有特定的顺序,所以如果您打算解决这个问题,可以随意调整;如果需要更多信息,我很乐意说明!

先谢谢大家。

4

1 回答 1

0

这是对您问题的一个半简化的回答(也就是说,我去掉了 API 返回的部分信息,例如估值的上限和下限,但您可以参照下面代码中处理其他字段的方式把它们加回来)。首先加载所需的包:

# Install any packages from `pkg` that are not yet installed, then attach all
# of them.
#
# Args:
#   pkg: character vector of package names.
#
# Returns:
#   A logical vector named by package, TRUE where the package was attached
#   successfully. An empty `pkg` yields an empty logical vector.
ipak <- function(pkg) {
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg) > 0) {
    install.packages(new.pkg, dependencies = TRUE)
  }
  # require() is used on purpose: its TRUE/FALSE result is the return value.
  # vapply() guarantees a logical vector even for empty input, where sapply()
  # would return a list.
  vapply(pkg, require, logical(1), character.only = TRUE, USE.NAMES = TRUE)
}
# Install any missing packages from this list and attach all of them via ipak().
packages <- c("ZillowR", "rjson", "httr", "XML")
ipak(packages)

接下来设置您的 Zillow API 密钥:

# 'API KEY' is a placeholder; substitute your own Zillow Web Service ID.
set_zillow_web_service_id('API KEY')

接下来定义一个只有地址的数据集,然后定义并添加街道地址和邮政编码的列,以便我们稍后可以在 Zillow api 中使用它们:

# Example input: one full postal address per row.
df <- data.frame(
  "Address" = c(
    "5435 Andrea Boulevard, Sacramento, CA 95842",
    "8434 Walerga Road Apt 421, Antelope, CA 95843",
    "7152 Flanders Way, Sacramento, CA 95842",
    "1076 Edmonton Drive, Sacramento, CA 95833",
    "9906 Burline Street, Sacramento, CA 95827",
    "3634 Sapphire Drive Apt 1, Auburn, CA 95602"
  ),
  stringsAsFactors = FALSE
)

# Each pass drops everything from the last comma onwards:
# "street, city, ST zip" -> "street, city" -> "street".
df[["StreetCity"]] <- gsub("(.*),.*", "\\1", df[["Address"]])
df[["Street"]] <- gsub("(.*),.*", "\\1", df[["StreetCity"]])

# The ZIP code is taken as the last five characters of the address.
df[["zip"]] <- substr(
  df[["Address"]],
  nchar(df[["Address"]]) - 4,
  nchar(df[["Address"]])
)
df
#                                         Address                          StreetCity                    Street   zip
# 1   5435 Andrea Boulevard, Sacramento, CA 95842   5435 Andrea Boulevard, Sacramento     5435 Andrea Boulevard 95842
# 2 8434 Walerga Road Apt 421, Antelope, CA 95843 8434 Walerga Road Apt 421, Antelope 8434 Walerga Road Apt 421 95843
# 3       7152 Flanders Way, Sacramento, CA 95842       7152 Flanders Way, Sacramento         7152 Flanders Way 95842
# 4     1076 Edmonton Drive, Sacramento, CA 95833     1076 Edmonton Drive, Sacramento       1076 Edmonton Drive 95833
# 5     9906 Burline Street, Sacramento, CA 95827     9906 Burline Street, Sacramento       9906 Burline Street 95827
# 6   3634 Sapphire Drive Apt 1, Auburn, CA 95602   3634 Sapphire Drive Apt 1, Auburn 3634 Sapphire Drive Apt 1 95602

现在让我们调用 API:

# Look up the "text" entry for one bound of a valuation range.
#
# Args:
#   x:    object indexed as x["text", hilo]; may be NULL.
#   hilo: name of the bound to fetch, e.g. "low" or "high".
#
# Returns:
#   The first element of x["text", hilo] when `hilo` occurs among the
#   dimnames of `x`, otherwise NA.
getValRange <- function(x, hilo) {
  known_names <- unlist(dimnames(x))
  has_bound <- hilo %in% known_names
  # ifelse() only evaluates the lookup when `has_bound` is TRUE.
  ifelse(has_bound, x["text", hilo][[1]], NA)
}

# Query Zillow once per row of `df` and collect one output row per address.
# Per-row results are stored in a list and bound together once after the
# loop, so `Out_df_All` follows the row order of `df` (the previous
# rbind(Out_df, Out_df_All) inside the loop produced the reverse order).
Out_df_All <- NULL
Out_df <- NULL
row_results <- vector("list", nrow(df))

for (i in seq_len(nrow(df))) {
  print(i)

  # A failed request is reported and skipped so that one bad address does not
  # abort the whole batch.
  xml <- tryCatch(
    GetSearchResults(
      address = as.character(df[i, ]$Street),
      citystatezip = as.character(df[i, ]$zip),
      rentzestimate = TRUE
    ),
    error = function(cond) {
      message("Zillow query failed for row ", i, ": ", conditionMessage(cond))
      NULL
    }
  )
  if (is.null(xml) || is.null(xml$response[["results"]])) {
    next
  }

  results <- xmlToList(xml$response[["results"]])

  if (is.matrix(results)) {
    # xmlToList() produced a matrix; each column is handed to the callback as
    # `property`.
    out <- apply(results, MARGIN = 2, function(property) {
      zpid <- property$zpid
      links <- unlist(property$links)["mapthishome"]
      address <- unlist(property$address)
      z <- property$zestimate
      zestdf <- list(
        zestimate = ifelse("text" %in% names(z$amount), z$amount$text, NA)
      )

      rz <- property$rentzestimate
      rentalzestdf <- list(
        rentzestimate = ifelse("text" %in% names(rz$amount), rz$amount$text, NA)
      )
      list(zpid = zpid, links, address, zestdf, rentalzestdf)
    })
    # Only the first parsed property is kept for each address.
    data <- as.data.frame(
      do.call(rbind, lapply(out, unlist)[1]),
      row.names = seq_len(length(out))
    )
  } else {
    # Fallback when xmlToList() does not yield a matrix: flatten the first
    # child of the raw <results> node and keep only the text values.
    data <- as.data.frame(
      do.call(rbind, lapply(xml$response[["results"]][1], unlist)[1]),
      row.names = seq_len(length(xml$response[["results"]][1]))
    )
    data <- data[which(grepl("text.value", colnames(data)))]
    colnames(data) <- gsub("children.", "", colnames(data))
    colnames(data) <- gsub(".text.value", "", colnames(data))
    colnames(data) <- gsub("address.", "", colnames(data))
    colnames(data) <- gsub("links.", "", colnames(data))
    colnames(data) <- gsub(".amount", "", colnames(data))

    # Add the estimate columns BEFORE narrowing the column set. Previously the
    # selection ran first and dropped them, so the presence check always
    # failed and both estimates ended up as 0.
    for (est_col in c("zestimate", "rentzestimate")) {
      if (!(est_col %in% colnames(data))) {
        data[[est_col]] <- NA
      }
    }
    data <- data[, c("zpid", "mapthishome", "street", "zipcode", "city",
                     "state", "latitude", "longitude",
                     "zestimate", "rentzestimate")]
  }

  # Missing values are reported as 0 in the output.
  data[is.na(data)] <- 0
  Out_df <- data.frame("Address" = df[i, "Address"], data)
  row_results[[i]] <- Out_df
}

# Bind once, in input order; rows without a result were left as NULL.
Out_df_All <- do.call(rbind, Filter(Negate(is.null), row_results))
View(Out_df_All)

在此处输入图像描述

于 2019-09-19T19:01:57.593 回答