I'm trying to speed up my web scraper by using the threading library to make it multithreaded, but the runtime is no faster than with a single thread. I set the scraper up so that you can specify how many pages to scrape and/or how many hops away from the original seeds to go.
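For reference, the script takes four command-line arguments (the file and directory names below are just hypothetical placeholders, not my actual setup):

# Hypothetical invocation, e.g.:  python crawler.py seeds.txt 20 3 pages
# seeds.txt -> sys.argv[1], file with one seed URL per line
# 20        -> sys.argv[2], req_pages, number of pages to scrape
# 3         -> sys.argv[3], req_hops, hops away from the original seeds
# pages     -> sys.argv[4], output_directory for the saved pages
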
import sys
import queue
import requests
import time
import os
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser
import matplotlib.pyplot as plt
import threading
import statistics
import concurrent.futures
runtimes = []
def crawler(frontier, req_pages, req_hops, output_directory):
    f = open("report.txt", "w")
    f.write("")
    f.close()
    #multithreaded#
    with concurrent.futures.ThreadPoolExecutor(max_workers = 10) as executor:
        executor.submit(crawler_thread, frontier, req_pages, req_hops, output_directory)
    #single threaded#
    # crawler_thread(frontier, req_pages, req_hops, output_directory)

def crawler_thread(frontier, req_pages, req_hops, output_directory):
    while not frontier.empty() and req_pages > 0 and req_hops > 0:
        start_time0 = time.perf_counter()
        try:
            url = frontier.get()
            #print("Trying to Crawl: ", url)
            if url == '*': # denotes end of current hop has been hit
                req_hops = req_hops - 1
                frontier.put('*') # mark end of next hop
            elif crawl_permitted(url):
                parsed_url = urlparse(url)
                filename = parsed_url.netloc[4:] if parsed_url.netloc.find('www') == 0 else parsed_url.netloc
                filename = filename + parsed_url.path.replace('/', '-')
                if not already_saved(filename, output_directory):
                    response = requests.get(url, verify = False)
                    time.sleep(3) # sleep for a few seconds for politeness. To do: implement politeness windows based on the crawl_delay specified in the host's robots.txt
                    open(output_directory + '/' + filename, 'w', encoding='utf-8').write(response.text)
                    req_pages = req_pages - 1
                    soup = BeautifulSoup(response.text, 'html.parser')
                    for link in soup.find_all('a'):
                        cleaned_link = clean_link(hyperlink = link.get('href'), website = url)
                        if cleaned_link:
                            frontier.put(cleaned_link)
        except:
            f = open("report.txt", "a")
            f.write("crawler failed on url: " + url + "\n")
            f.close()
        end_time0 = time.perf_counter()
        runtimes.append(end_time0 - start_time0)
# refresh pages, robots.txt, politeness window - to do
#create url queue (frontier) parse argv to pass into crawler
frontier = seeds_to_frontier(seeds_filename = sys.argv[1])
req_pages = int(sys.argv[2])
req_hops = int(sys.argv[3])
output_directory = sys.argv[4]
start_time1 = time.perf_counter()
#crawl
crawler(frontier, req_pages, req_hops, output_directory)
end_time1 = time.perf_counter()
f = open("report.txt", "a")
f.write("Total Scrape Time: " + str(end_time1-start_time1) + "\n")
f.write("Average Scrape Time for Each Url: " + str(statistics.mean(runtimes)) + "\n")
f.close()
plt.plot(runtimes)
plt.ylabel("execution time (s)")
plt.xlabel("url queue iteration")
plt.title("scrapetime for each url")
plt.savefig("execution_time_of_scraped_urls")
Here are the extra functions used in the code above, in case you want to look at them (I don't think they're the cause of my errors, though):
def seeds_to_frontier(seeds_filename):
    seeds_file = open(seeds_filename, 'r')
    frontier = queue.Queue()
    for page in seeds_file:
        frontier.put(page.strip('\n'))
    frontier.put('*') # marks end of hop 0
    seeds_file.close()
    return frontier

def crawl_permitted(url):
    rp = RobotFileParser()
    parsed_url = urlparse(url)
    filename = parsed_url.netloc.rsplit('.', 2)[-2] + '.' + parsed_url.netloc.rsplit('.', 2)[-1]
    robots_url = parsed_url.scheme + '://' + filename + '/robots.txt'
    if not already_saved(filename, 'robots'):
        response = requests.get(robots_url)
        open('robots/' + filename, 'w', encoding='utf-8').write(response.text)
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch('*', url)

def url_delay(url):
    rp = RobotFileParser()
    # set up url for RobotFileParser
    parsed_url = urlparse(url)
    filename = parsed_url.netloc.rsplit('.', 2)[-2] + '.' + parsed_url.netloc.rsplit('.', 2)[-1]
    robots_url = parsed_url.scheme + '://' + filename + '/robots.txt'
    # parse the robots.txt and extract crawl_delay
    rp.set_url(robots_url)
    rp.read()
    crawl_delay = rp.crawl_delay("*") if rp.crawl_delay("*") else 0
    return crawl_delay

def already_saved(file, directory):
    for root, dirs, files in os.walk(directory):
        if file in files:
            return 1
    return 0

def clean_link(hyperlink, website):
    parsed_website = urlparse(website)
    parsed_hyperlink = urlparse(hyperlink)
    cleaned_link = hyperlink
    if not parsed_hyperlink.scheme and not parsed_hyperlink.netloc and not parsed_hyperlink.path and not parsed_hyperlink.params \
       and not parsed_hyperlink.query and not parsed_hyperlink.fragment:
        return ''
    if parsed_hyperlink.scheme and parsed_hyperlink.scheme != 'http':
        return ''
    if parsed_hyperlink.path == '/': # duplicate - self references
        return ''
    if parsed_hyperlink.fragment: # duplicate - bookmarks
        return ''
    # avoid pdfs, images. To do: the check below is not the best way to avoid all media
    if len(parsed_hyperlink.path) > 4 and (parsed_hyperlink.path[-4:] in [".pdf", ".jpg", ".png", ".svg", ".jpeg"]):
        return ''
    # take care of invalid characters - to do
    if not (".edu" in parsed_hyperlink.netloc): # only crawl edu pages
        return ''
    if not parsed_hyperlink.netloc:
        if parsed_hyperlink.path[0] == '/':
            cleaned_link = parsed_website.scheme + '://' + parsed_website.netloc + hyperlink
        else:
            # bug - hyperlink 'smi.ucr.edu' parsed as path and not netloc - ???
            if parsed_hyperlink.path[0:4] == 'tel:': # remove incorrectly formatted telephone links
                print('incorrect tel link: ' + hyperlink)
                return ''
            # double check
            cleaned_link = website + hyperlink
    return cleaned_link
The single-threaded code works fine, averaging about 0.7 seconds per scraped page. The multithreaded version scrapes at exactly the same speed??? The way I originally set it up (shown above, first cell) seems to just run one big loop in a single thread. I tried editing my code to fix this; the edited version is in the second cell, after the short sketch below.
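To illustrate what I mean by "one big loop in a single thread", here is a minimal toy sketch (a stand-in worker, not my real crawler); as far as I understand it, a single submit call hands the pool exactly one task, so only one of the ten threads ever does any work:

import concurrent.futures
import threading
import time

def toy_worker(iterations):
    # stand-in for crawler_thread: whichever pool thread picks up the task
    # runs the whole loop by itself
    for _ in range(iterations):
        time.sleep(0.1)
        print(threading.current_thread().name)

# one submit call = one task = one busy thread, even with max_workers = 10
with concurrent.futures.ThreadPoolExecutor(max_workers = 10) as executor:
    executor.submit(toy_worker, 5)      # only one thread name is ever printed

# submitting the worker several times is what spreads work across the pool
with concurrent.futures.ThreadPoolExecutor(max_workers = 10) as executor:
    for _ in range(10):
        executor.submit(toy_worker, 5)  # several different thread names print

If that reading is correct, my first cell keeps nine of the ten pool threads idle.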
def crawler(frontier, req_pages, req_hops, output_directory):
    f = open("report.txt", "w")
    f.write("")
    f.close()
    #multithreaded#
    # with concurrent.futures.ThreadPoolExecutor(max_workers = 10) as executor:
    #     executor.submit(crawler_thread, frontier, req_pages, req_hops, output_directory)
    #single threaded#
    # crawler_thread(frontier, req_pages, req_hops, output_directory)
    #testing stuff
    while not frontier.empty() and req_pages > 0 and req_hops > 0:
        with concurrent.futures.ThreadPoolExecutor(max_workers = 10) as executor:
            executor.submit(crawler_thread, frontier, req_pages, req_hops, output_directory)

def crawler_thread(frontier, req_pages, req_hops, output_directory):
    lock = threading.Lock()
    lock.acquire()
    if req_pages > 0 and req_hops > 0:
        lock.release()
        start_time0 = time.perf_counter()
        try:
            url = frontier.get()
            print("Trying to Crawl: ", url)
            if url == '*': # denotes end of current hop has been hit
                lock.acquire()
                req_hops = req_hops - 1
                lock.release()
                frontier.put('*') # mark end of next hop
            elif crawl_permitted(url):
                parsed_url = urlparse(url)
                filename = parsed_url.netloc[4:] if parsed_url.netloc.find('www') == 0 else parsed_url.netloc
                filename = filename + parsed_url.path.replace('/', '-')
                if not already_saved(filename, output_directory):
                    response = requests.get(url, verify = False)
                    #time.sleep(10) # sleep for a few seconds for politeness. To do: implement politeness windows based on the crawl_delay specified in the host's robots.txt
                    open(output_directory + '/' + filename, 'w', encoding='utf-8').write(response.text)
                    lock.acquire()
                    req_pages = req_pages - 1
                    lock.release()
                    soup = BeautifulSoup(response.text, 'html.parser')
                    for link in soup.find_all('a'):
                        cleaned_link = clean_link(hyperlink = link.get('href'), website = url)
                        if cleaned_link:
                            frontier.put(cleaned_link)
        except:
            f = open("report.txt", "a")
            f.write("crawler failed on url: " + url + "\n")
            f.close()
        end_time0 = time.perf_counter()
        runtimes.append(end_time0 - start_time0)
# refresh pages, robots.txt, politeness window - to do
#create url queue (frontier) parse argv to pass into crawler
frontier = seeds_to_frontier(seeds_filename = sys.argv[1])
req_pages = int(sys.argv[2])
req_hops = int(sys.argv[3])
output_directory = sys.argv[4]
start_time1 = time.perf_counter()
#crawl
crawler(frontier, req_pages, req_hops, output_directory)
end_time1 = time.perf_counter()
f = open("report.txt", "a")
f.write("Total Scrape Time: " + str(end_time1-start_time1) + "\n")
f.write("Average Scrape Time for Each Url: " + str(statistics.mean(runtimes)) + "\n")
f.close()
print("Total Scrape Time: " + str(end_time1-start_time1) + "\n")
plt.plot(runtimes)
plt.ylabel("execution time (s)")
plt.xlabel("url queue iteration")
plt.title("scrapetime for each url")
plt.savefig("execution_time_of_scraped_urls")
1) I'm not sure whether this actually sped anything up. 2) When I try to specify that it should stop after scraping 10 pages, the scraper keeps running until I force-quit it. The first version I had (cell 1) didn't do that.
What is wrong with my first and second implementations? Judging by the scrape speed, the first one doesn't seem to be multithreaded at all. The second one (I'm not yet sure about its scrape speed) doesn't stop at the req_pages argument. Where did I go wrong in my attempts at multithreading? Is my memory getting corrupted?
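For what it's worth, here is a small toy sketch of how the counter arguments behave (not my crawler, just the argument passing): if I understand Python's argument passing correctly, the int handed to the worker is rebound locally, so decrementing it inside the worker never changes the caller's variable.

import concurrent.futures

def toy_worker(counter):
    # rebinding the parameter only changes this call's local name,
    # not the variable the caller passed in
    counter = counter - 1
    return counter

req_pages = 10
with concurrent.futures.ThreadPoolExecutor(max_workers = 2) as executor:
    future = executor.submit(toy_worker, req_pages)
    print(future.result())  # 9 - the decrement only exists inside the worker

print(req_pages)  # still 10, so a loop testing req_pages here never sees it drop

I'm not sure whether that is the whole story behind question 2 above, though.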