0

我正在尝试在 R 中借助 SelectorGadget(选择器小工具),使用 rvest 和 dplyr 包来抓取 Zillow 房源的纬度和经度。

我想获取每个房源(listing)的纬度和经度,并将其存入用以下代码创建的数据框中。下面是我目前的代码。有人能帮忙吗?

# Scrape one page of Zillow search results for Arlington, VA and pull the
# per-listing text fields out of the listing cards.
library(rvest)
library(dplyr)

# Search-results URL (page 2); the query string is Zillow's URL-encoded
# searchQueryState JSON.
link <- "https://www.zillow.com/arlington-va/2_p/?searchQueryState=%7B%22pagination%22%3A%7B%22currentPage%22%3A2%7D%2C%22usersSearchTerm%22%3A%22arlington%2C%20virginia%22%2C%22mapBounds%22%3A%7B%22west%22%3A-77.46492611914063%2C%22east%22%3A-76.73708188085938%2C%22south%22%3A38.64364888623124%2C%22north%22%3A39.117234332841704%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A30258%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%7D%2C%22isListVisible%22%3Atrue%7D"

page <- read_html(link)

# One character vector per field, one element per matched node.
# NOTE(review): `address` was never created although the data frame built
# later uses it; the ".list-card-addr" selector is assumed from the other
# ".list-card-*" class names -- confirm it against the live page.
address <- page %>%
  html_nodes(".list-card-addr") %>%
  html_text()
bed <- page %>%
  html_nodes(".list-card-details li:nth-child(1)") %>%
  html_text()
bath <- page %>%
  html_nodes(".list-card-details li:nth-child(2)") %>%
  html_text()
sqfoot <- page %>%
  html_nodes(".list-card-details li:nth-child(3)") %>%
  html_text()
price <- page %>%
  html_nodes(".list-card-price") %>%
  html_text()
marketime <- page %>%
  html_nodes(".list-card-variable-text") %>%
  html_text()

# Assemble one row per listing card and turn the scraped text into usable
# values. Uses the vectors address, bed, bath, sqfoot, price and marketime,
# which must already exist with lengths data.frame() can combine.
houses <- data.frame(address, bed, bath, sqfoot, price, marketime) %>%
  mutate(
    # Keep the whole leading token ("10 bds" -> "10"); taking only the first
    # character would silently turn 10 bedrooms into 1.
    bed = as.numeric(sub("\\s.*$", "", trimws(bed))),
    # bath stays character, as before, but is no longer cut to one character.
    bath = sub("\\s.*$", "", trimws(bath)),
    # "1,234 sqft" -> 1234: drop thousands separators, then the unit suffix.
    sqfoot = as.numeric(gsub(" sqft", "", gsub(",", "", sqfoot))),
    # "$650,000" -> 650000: drop thousands separators, then the leading "$".
    price = as.numeric(substring(gsub(",", "", price), 2))
  )
4

1 回答 1

0

您可以从页面上的 script 标签中提取所有房源(listing)信息(不过我认为 Zillow 提供的 API 是更好的途径)。

library(rvest)
library(purrr)

# Download the Zillow search-results page (page 2 of the Arlington, VA search).
page <- read_html('https://www.zillow.com/arlington-va/2_p/?searchQueryState=%7B%22pagination%22%3A%7B%22currentPage%22%3A2%7D%2C%22usersSearchTerm%22%3A%22arlington%2C%20virginia%22%2C%22mapBounds%22%3A%7B%22west%22%3A-77.64070736914063%2C%22east%22%3A-76.56130063085938%2C%22south%22%3A38.56616517053261%2C%22north%22%3A39.19411978197601%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A30258%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Afalse%2C%22filterState%22%3A%7B%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%7D%2C%22isListVisible%22%3Atrue%7D')

# Take the text of every <script> node under ".photo-cards" and parse each
# one as JSON.
data <- page %>%
  html_nodes(".photo-cards script") %>%
  html_text()
# TRUE rather than T: T is an ordinary variable that can be reassigned.
info <- map(data, ~ jsonlite::parse_json(., simplifyVector = TRUE))

# Keep only the parsed entries that have a "geo" element, i.e. the ones that
# carry latitude/longitude. map_lgl() always yields a logical vector, unlike
# map() followed by unlist().
mask <- map_lgl(info, ~ "geo" %in% names(.))
info <- info[mask]

# Build one row per parsed listing in `info` (name plus coordinates) and
# row-bind them into a single data frame.
df <- map_df(info, ~ {
  data.frame(
    Name = .$name,
    # Must be `=`: with `<-` this is an assignment expression, so the column
    # is not named "Latitude" and a stray `Latitude` variable is created.
    Latitude = .$geo$latitude,
    Longitude = .$geo$longitude,
    stringsAsFactors = FALSE
  )
})
于 2021-02-10T20:30:20.903 回答