我正在尝试从 Zillow 抓取出租物业数据,以确定最适合居住的社区。到目前为止编写的代码能够输出:URL、详细信息、地址、图像、价格和坐标。
问题是“坐标”始终返回“N/A”。我相信这可能是由于我无法确定的 Zillow 的变化。看页面源码的时候,经纬度都有,但是无法调用。
import requests
from bs4 import BeautifulSoup
import csv
import json
class ZillowScraper:
results = []
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'cache-control': 'no-cache',
'cookie': 'zguid=23|%2403435e76-0699-4a32-b86d-77d033c907ef; _ga=GA1.2.1271511001.1575011821; zjs_user_id=null; zjs_anonymous_id=%2203435e76-0699-4a32-b86d-77d033c907ef%22; _gcl_au=1.1.1333357279.1575011822; _pxvid=3cfcc163-1278-11ea-bff8-0242ac12000b; ki_r=; __gads=ID=84d8013cfac6df96:T=1575012041:S=ALNI_MaSvVNZsir2JXJ17pv54bjsPuyfcw; ki_s=199442%3A0.0.0.0.0%3B199444%3A0.0.0.0.2; zgsession=1|c0999376-b167-4a47-a1cd-0e456d882d4e; _gid=GA1.2.55965867.1578668946; JSESSIONID=87D0662A6BC141A73F0D12620788519C; KruxPixel=true; DoubleClickSession=true; KruxAddition=true; ki_t=1575011869563%3B1578669044158%3B1578669044158%3B2%3B10; _pxff_tm=1; _px3=2e6809e35ce7e076934ff998c2bdb8140e8b793b53e08a27c5da11f1b4760755:DFItCmrETuS2OQcztcFmt0FYPUn00ihAAue2ynQgbfSq6H+p2yP3Rl3aeyls3Unr1VRJSgcNue8Rr1SUq4P1jA==:1000:9ueZvAJ6v5y4ny7psGF25dK+d3GlytY2Bh+Xj9UUhC4DaioIZ+FMXPU0mOX+Qnghqut0jIT61gLecN4fyu6qXaPDlBX6YsZVbIry1YyBN/37l0Ri3JP+E0h+m+QEBB+bqb6MbE2HtgGBJRJAry8dgOKGM5JtBGdX+X/nuQX1xaw=; AWSALB=E6JYC43gXQRlE2jPT9e2vAQOYPvdHnccBlqi0mcXevYExTaHro0M+uo/Qxahi6JyLz9LpotY9eLtEbYrAOeQXcCm6UhjWnTopQHernmjlR/ibE6JmE8F6tReiBn4; search=6|1581261153229%7Crect%3D40.96202658306895%252C-73.55498286718745%252C40.4487909557045%252C-74.40093013281245%26rid%3D6181%26disp%3Dmap%26mdm%3Dauto%26p%3D3%26z%3D0%26lt%3Dfsbo%26pt%3Dpmf%252Cpf%26fs%3D1%26fr%3D0%26mmm%3D1%26rs%3D0%26ah%3D0%26singlestory%3D0%26housing-connector%3D0%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%09%01%096181%09%09%09%090%09US_%09',
'pragma': 'no-cache',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/75.0.3770.142 Chrome/75.0.3770.142 Safari/537.36'
}
def fetch(self, url, params):
print('HTTP GET request to URL: %s' % url, end='')
res = requests.get(url, params=params, headers=self.headers)
print(' | Status code: %s' % res.status_code)
return res
def save_response(self, res):
with open('res.html', 'w') as html_file:
html_file.write(res)
def load_response(self):
html = ''
with open('res.html', 'r') as html_file:
for line in html_file:
html += line
return html
def parse(self, html):
# parse response
content = BeautifulSoup(html, 'lxml')
# extract property cards
cards = content.findAll('article', {'class': 'list-card'})
# extract coordinates script {key: value(not a script tag...div...unique identifier for script looking for)}
script = content.find('script', {'data-zrr-shared-data-key': 'mobileSearchPageStore'}).text
# loop over property cards
for card in cards:
# try to extract image
try:
image = card.find('div', {'class': 'list-card-top'}).find('img')['src']
except:
image = 'N/A'
# extract items
items = {
'url': card.find('a', {'class': 'list-card-link'})['href'],
'details': [
price.text for price in
card.find('ul', {'class': 'list-card-details'}).find_all('li')
],
'address': card.find('address', {'class': 'list-card-addr'}).text,
'image': image
}
# try to extract price if not extracted yet
try:
items['price'] = card.find('div', {'class': 'list-card-price'}).text
except:
pass
# try to extract coordinates from script
try:
splitter = '"detailUrl":"' + items['url'] + '","latLong":'
coords = json.loads(script.split(splitter)[-1].split('},')[0] + '}')
items['coordinates'] = coords
except:
coords = script.split(splitter)[-1].split('},')[0] + '}'
splitter = '<!--{"queryState":{"mapBounds":'
try:
map_bounds = json.loads(coords.split(splitter)[-1])
items['coordinates'] = map_bounds
except:
items['coordinates'] = 'N/A'
# append scraped items to results list
self.results.append(items)
print(json.dumps(items, indent=2))
def to_json(self):
with open('zillow_rent.json', 'w') as f:
f.write(json.dumps(self.results, indent=2))
def run(self):
for page in range(1, 5):
params = {
'searchQueryState': '{"pagination":{"currentPage":%s},"usersSearchTerm":"Chicago, IL","mapBounds":{"west":-88.09607811914063,"east":-87.36823388085938,"south":41.606969127843165,"north":42.060251786677156},"regionSelection":[{"regionId":17426,"regionType":6}],"isMapVisible":false,"filterState":{"pmf":{"value":false},"fore":{"value":false},"auc":{"value":false},"nc":{"value":false},"fr":{"value":true},"fsbo":{"value":false},"cmsn":{"value":false},"pf":{"value":false},"fsba":{"value":false}},"isListVisible":true}' % str(
page)
}
res = self.fetch('https://www.zillow.com/homes/Chicago,-IL_rb/?', params)
self.parse(res.text)
self.to_json()
# html = self.load_response()
# self.parse(html)
if __name__ == '__main__':
scraper = ZillowScraper()
scraper.run()
结果:
HTTP GET request to URL: https://www.zillow.com/homes/Chicago,-IL_rb/? | Status code: 200
{
"url": "https://www.zillow.com/homedetails/1926-W-Montrose-Ave-2S-Chicago-IL-60640/2078561693_zpid/",
"details": [
"2 bds",
"1 ba",
"1,500 sqft"
],
"address": "1926 W Montrose Ave #2S, Chicago, IL 60640",
"image": "https://photos.zillowstatic.com/p_e/ISfse6dgk5f2jr1000000000.jpg",
"price": "$1,849/mo",
"coordinates": "N/A"
}
{
"url": "https://www.zillow.com/b/polo-towers-4180-n.-marine-dr.-chicago-il-Bxxx/",
"details": [
"$1,350 Studio",
"$1,175+ 1 bd"
],
"address": "Polo Towers - 4180 N. Marine Dr. | 4180 N Marine Dr, Chicago, IL",
"image": "https://photos.zillowstatic.com/p_e/ISnuh8p2jxbbue0000000000.jpg",
"coordinates": "N/A"
}
{
"url": "https://www.zillow.com/b/535-n-michigan-avenue-chicago-il-5XfC4d/",
"details": [
"$1,295+ Studio",
"$1,550+ 1 bd",
"$2,450+ 2 bds"
],
"address": "535 N Michigan Avenue | 535 N Michigan Ave, Chicago, IL",
"image": "https://photos.zillowstatic.com/p_e/ISvct3il8gk68q1000000000.jpg",
"coordinates": "N/A"
}
{
"url": "https://www.zillow.com/homedetails/1445-W-Huron-St-APT-1-Chicago-IL-60642/70465653_zpid/",
"details": [
"3 bds",
"2.5 ba",
"2,500 sqft"
],
"address": "1445 W Huron St APT 1, Chicago, IL 60642",
"image": "https://photos.zillowstatic.com/p_e/ISj3ekp8qw0p771000000000.jpg",
"price": "$3,150/mo",
"coordinates": "N/A"
}
{