python - Craigslist 多城市搜索 python 脚本。添加 GUI 更多城市

Question

所以我对python编程很陌生。只是想找出一个好的项目让我开始。想尝试在多个城市搜索 craigslist。我在网上找到了一个过时的示例并将其用作起点。下面的脚本目前只有俄亥俄州的城市，但我计划添加我们所有的城市。“家乡”当前设置为代顿。它要求搜索半径、搜索词、最低价格和最高价格。根据城市的纬度，它只搜索半径内的城市。如果结果超过 1 页，我也会搜索所有页面。最后，它会创建一个结果的 html 文件并在浏览器中打开它。它似乎工作正常，但希望得到关于我是否有效地做所有事情的反馈。我还想添加一个 GUI 来捕获用户输入，但甚至不确定从哪里开始。那里有什么建议吗？谢谢！

#Craigslist Search
"""
Created on Thu Mar 27 11:56:54 2014
used http://cal.freeshell.org/2010/05/python-craigslist-search-script-version-2/ as 
starting point.
"""

import re
import os
import os.path
import time
import urllib2
import webbrowser
from math import *

results = re.compile('<p.+</p>', re.DOTALL) #Find pattern for search results.
prices = re.compile('<span class="price".*?</span>', re.DOTALL) #Find pattern for 
pages = re.compile('button pagenum">.*?</span>')
delay = 10


def search_all():
    for city in list(set(searchcities)):#add another for loop for all pages

        #Setup headers to spoof Mozilla
        dat = None
        ua = "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.9.1.4) Gecko/20091007 Firefox/3.5.4"  
        head = {'User-agent': ua}
        errorcount=0

        #Do a quick search to see how many pages of results
        url = "http://" + city + ".craigslist.org/search/" + "sss?s=" + "0" + "&catAbb=sss&query=" + query.replace(' ', '+') + "&minAsk=" + pricemin + "&maxAsk=" + pricemax
        req = urllib2.Request(url, dat, head)
        try:
            response = urllib2.urlopen(req)
        except urllib2.HTTPError:
            if errorcount < 1:
                errorcount = 1
                print "Request failed, retrying in " + str(delay) + " seconds"
                time.sleep(int(delay))
                response = urllib2.urlopen(req)

        msg = response.read()
        errorcount = 0
        pglist = pages.findall(msg)
        pg = pglist.pop(0)
        if pg.find('of') == -1:
            pg=100
        else:
            pg =pg[int((pg.find('of'))+3) : int((pg.find('</span>'))) ]

        if int(pg)/100 == 0:
            pg = 100
        numpages = range(int(pg)/100)


        for page in numpages:
            print "searching...."
            page = page*100
            url = "http://" + city + ".craigslist.org/search/" + "sss?s=" + str(page) + "&catAbb=sss&query=" + query.replace(' ', '+')  + "&minAsk=" + pricemin + "&maxAsk=" + pricemax
            cityurl = "http://" + city + ".craigslist.org"

            errorcount = 0

            #Get page
            req = urllib2.Request(url, dat, head)
            try:
                response = urllib2.urlopen(req)
            except urllib2.HTTPError:
                if errorcount < 1:
                    errorcount = 1
                    print "Request failed, retrying in " + str(delay) + " seconds"
                    time.sleep(int(delay))
                    response = urllib2.urlopen(req)

            msg = response.read()
            errorcount = 0
            res = results.findall(msg)
            res = str(res)
            res = res.replace('[', '')
            res = res.replace(']', '')
            res = res.replace('<a href="' , '<a href="' + cityurl )
            #res = re.sub(prices,'',res)
            res = "<BLOCKQUOTE>"*6 + res + "</BLOCKQUOTE>"*6

            outp = open("craigresults.html", "a")
            outp.write(city)
            outp.write(str(res))
            outp.close()

def calcDist(lat_A, long_A, lat_B, long_B):#This was found at zip code database project
  distance = (sin(radians(lat_A)) *
              sin(radians(lat_B)) +
              cos(radians(lat_A)) *
              cos(radians(lat_B)) *
              cos(radians(long_A - long_B)))

  distance = (degrees(acos(distance))) * 69.09

  return distance

cities = """akroncanton:41.043955,-81.51919
            ashtabula:41.871212,-80.79178
            athensohio:39.322847,-82.09728
            cincinnati:39.104410,-84.50774
            cleveland:41.473451,-81.73580
            columbus:39.990764,-83.00117
            dayton:39.757758,-84.18848
            limaohio:40.759451,-84.08458
            mansfield:40.759156,-82.51118
            sandusky:41.426460,-82.71083
            toledo:41.646649,-83.54935
            tuscarawas:40.397916,-81.40527
            youngstown:41.086279,-80.64563
            zanesville:39.9461,-82.0122  
         """

if os.path.exists("craigresults.html")==True:   
    os.remove("craigresults.html")

homecity = "dayton"
radius = raw_input("Search Distance from Home in Miles: ")
query = raw_input("Search Term: ")
pricemin = raw_input("Min Price: ")
pricemax = raw_input("Max Price: ")

citylist = cities.split()

#create dictionary
citdict = {}
for city in citylist:
    items=city.split(":")
    citdict[items[0]] = items[1]

homecord = str(citdict.get(homecity)).split(",")
homelat = float(homecord[0])
homelong = float(homecord[1])

searchcities = []
for key,value in citdict.items():
    distcity=key
    distcord=str(value).split(",")
    distlat = float(distcord[0])
    distlong = float(distcord[1])
    dist = calcDist(homelat,homelong,distlat,distlong)
    if dist < int(radius):
        searchcities.append(key)
print searchcities        

search_all()
webbrowser.open_new('craigresults.html')

python - Craigslist 多城市搜索 python 脚本。添加 GUI 更多城市

0 回答 0

Related

Reference