我正在尝试通过crawl
命令运行名为“ReviewsScraper”的scrapy项目。当我运行时,蜘蛛名称是“hotels”:
scrapy crawl hotels -a city="تونس" -s filename="tunis_hotels.csv" --loglevel=ERROR
我收到此错误:ModuleNotFoundError: No module named 'ReviewsScraper'
我确保我在项目目录中......
我遇到了一个解决方案,其中提到把我的项目做成一个包并添加 main.py 和 __init__.py 文件,但无法理解该方法以及如何实现它……所以我仍然卡住了。
文件夹结构:
.
├── geckodriver.log
├── ReviewsScraper
│ ├── __init__.py
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── __pycache__
│ │ ├── __init__.cpython-38.pyc
│ │ ├── items.cpython-38.pyc
│ │ ├── pipelines.cpython-38.pyc
│ │ └── settings.cpython-38.pyc
│ ├── settings.py
│ └── spiders
│ ├── hotels_spider.py
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── hotels_spider.cpython-38.pyc
│ │ ├── __init__.cpython-38.pyc
│ │ └── reviews_spider.cpython-38.pyc
│ └── reviews_spider.py
└── scrapy.cfg
hotels_spider.py 代码:
from scrapy import Spider, Request
from requests import get
from bs4 import BeautifulSoup
from urllib.request import urljoin
from ..items import Hotel
from re import compile
from os import system
import time
class HotelsSpider(Spider):
    """Scrape hotel summary data from booking.com search results for one city.

    Launched as ``scrapy crawl hotels -a city=...``.  For every hotel found it
    yields a ``Hotel`` item and shells out to the ``reviews`` spider so that
    hotel's reviews land in a per-hotel CSV file.
    """

    name = "hotels"
    city = ''  # overwritten per run by the -a city=... spider argument

    def __init__(self, city='', *args, **kwargs):
        # Bug fix: the original never called Spider.__init__, which Scrapy
        # relies on to finish setting up the spider (extra -a arguments, etc.).
        super().__init__(*args, **kwargs)
        self.city = city

    def start_requests(self):
        """Resolve the final search-results URL for the city, then request it.

        ``requests.get`` is used only to follow booking.com's redirects and
        capture the resulting URL; the page itself is downloaded by Scrapy.
        """
        search_url = get(
            "https://www.booking.com/searchresults.ar.html",
            params={'ss': self.city},
        ).url
        yield Request(search_url, self.parse)

    def parse(self, response):
        """Parse one results page: yield a ``Hotel`` item per listing, spawn
        the ``reviews`` spider for it, and follow the pagination links.
        """
        from shlex import quote  # stdlib; escapes the shell command arguments below

        def getHotelName(hotelDiv):
            # Empty string signals "name not found" to the caller.
            try:
                name = hotelDiv.find('h3', {'class':"sr-hotel__title"}).find('span', {'class':"sr-hotel__name"}).get_text()
            except AttributeError:
                name = ''
            return name

        def getHotelLink(hotelDiv):
            # Rewrite the anchor so the link opens directly on the reviews tab.
            ptrn = compile(r'#hotelTmpl')
            try:
                link = hotelDiv.find('h3', {'class':"sr-hotel__title"}).find('a', {'class':["hotel_name_link", "url"]}).attrs['href']
                link = ptrn.sub("#tab-reviews", link)
            except AttributeError:
                link = ''
            return link

        def getHotelStars(hotelDiv):
            # Two markups exist on the site: a star icon whose title holds the
            # rating, or quality bars whose count is the rating.
            try:
                stars = hotelDiv.find('span', {'class':"sr-hotel__title-badges"}).find('i', {'class':["bk-icon-wrapper", "bk-icon-stars", "star_track"]}).attrs['title']
            except AttributeError:
                try:
                    stars = len(hotelDiv.find('span', {'class':"sr-hotel__title-badges"}).find('span', {'class':["bh-quality-bars", "bh-quality-bars--medium"]}).find_all('svg', {'class':["bk-icon", "-iconset-square_rating"]}))
                except AttributeError:
                    stars = ''
            return stars

        def getNbrOfReviews(hotelDiv):
            try:
                nbr = hotelDiv.find('div', {'class':"bui-review-score__content"}).find('div', {'class':"bui-review-score__text"}).get_text()
            except AttributeError:
                nbr = ''
            return nbr

        def getRating(hotelDiv):
            # Returns (label, score); both empty if either element is missing.
            try:
                ratingLabel = hotelDiv.find('div', {'class':"bui-review-score__content"}).find('div', {'class':"bui-review-score__title"}).get_text()
                ratingScore = hotelDiv.find('div', {'class':"bui-review-score__badge"}).get_text()
            except AttributeError:
                ratingLabel = ''
                ratingScore = ''
            return (ratingLabel, ratingScore)

        def getHotelPages(soupObject):
            """Return absolute URLs of the pagination links (possibly empty)."""
            try:
                lis = soupObject.find('nav', {'class':"bui-pagination__nav"}).find('li', {'class':"bui-pagination__pages"}).find('ul', {'class':"bui-pagination__list"}).find_all('li', {'class':["bui-pagination__item", "sr_pagination_item"]})
            except AttributeError:
                lis = []
            links = []
            for li in lis:
                try:
                    links.append(urljoin("https://www.booking.com/", li.a.attrs['href']))
                except AttributeError:
                    continue
            # Bug fix: the original returned None when no pagination <li> was
            # found, making response.follow_all(None) raise ValueError on the
            # last results page.  Always return a list.
            return links

        soup = BeautifulSoup(response.body, "html.parser")
        hotelsList = soup.find('div', {'id':"hotellist_inner"}).find_all('div', {'class':["sr_item", "sr_item_new", "sr_item_default", "sr_property_block", "sr_flex_layout", "sr_item_no_dates"]})
        for hotelDiv in hotelsList:
            hotel = Hotel()
            hotel['name'] = getHotelName(hotelDiv)
            hotel['stars'] = getHotelStars(hotelDiv)
            hotel['nbr_of_reviews'] = getNbrOfReviews(hotelDiv)
            # Call getRating once instead of twice (it re-parses the div).
            rating_label, rating_score = getRating(hotelDiv)
            hotel['rating_score'] = rating_score
            hotel['rating_label'] = rating_label
            if hotel['name'] != '':
                hotel['reviews_filename'] = "{}.csv".format(hotel['name'])
            else:
                # Nameless listings still get a unique reviews file.
                hotel['reviews_filename'] = "empty_file_{}.csv".format(time.time())
            yield hotel
            link = getHotelLink(hotelDiv)
            if link != '':
                # NOTE(review): launching a blocking subprocess per hotel stalls
                # Scrapy's async engine; scheduling a Request handled by the
                # reviews spider would be the idiomatic fix.  shlex.quote stops
                # scraped URLs / hotel-name-derived filenames from breaking or
                # injecting into the shell command (the original interpolated
                # them unescaped inside single quotes).
                system("scrapy crawl reviews -a hotel_link={0} -s filename={1} --loglevel=ERROR".format(
                    quote(urljoin("https://www.booking.com/", link)),
                    quote(hotel['reviews_filename']),
                ))
        yield from response.follow_all(getHotelPages(soup), callback=self.parse)
完整的回溯(不仅仅是一次,而是多次):
Traceback (most recent call last):
File "/usr/local/bin/scrapy", line 8, in <module>
sys.exit(execute())
File "/usr/local/lib64/python3.8/site-packages/scrapy/cmdline.py", line 112, in execute
settings = get_project_settings()
File "/usr/local/lib64/python3.8/site-packages/scrapy/utils/project.py", line 69, in get_project_settings
settings.setmodule(settings_module_path, priority='project')
File "/usr/local/lib64/python3.8/site-packages/scrapy/settings/__init__.py", line 287, in setmodule
module = import_module(module)
File "/usr/lib64/python3.8/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
File "<frozen importlib._bootstrap>", line 991, in _find_and_load
File "<frozen importlib._bootstrap>", line 961, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
File "<frozen importlib._bootstrap>", line 991, in _find_and_load
File "<frozen importlib._bootstrap>", line 973, in _find_and_load_unlocked
ModuleNotFoundError: No module named 'ReviewsScraper'
请问有什么帮助吗?提前致谢 :)