0

目标:爬取肥皂剧抄本网站,获取抄本语料库。成绩单位于http://tvmegasite.net/transcripts/amc/main/2001transcripts.shtml格式的页面中, 并具有正则表达式:

(https?:\/\/?tvmegasite\.net\/transcripts\/\w+\/main\/\d+\w+\.\w+)

1)问题 1:从结果 #1 来看,很明显,我没有做足够的工作来按照链接进入第 4 阶段,(即,到达类似于http://tvmegasite.net/transcripts/theshow的内容/main/ ,) 将包含上述正则表达式表单的所需成绩单页面。蜘蛛会在某个时候停止跟踪链接,而不是继续到最后。那么,在获得所需表格之前,继续关注链接的最佳方式是什么?

2)问题 2: 尽管如此,当我尝试测试刮板的工作情况时,例如通过在http://tvmegasite.net/amc/main开始刮板,我没有得到仅包含 html 的列表成绩单,但一堆其他链接,其中许多不满足正则表达式。我知道我的正则表达式是正确的,这是怎么回事?

我的代码:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from soapoperascrape.items import SoapoperascrapeItem

class SoapOperaSpider(CrawlSpider):
    name="S_O_"
    allowed_domains=["tvmegasite.net"]
    start_urls=["http://tvmegasite.net/transcripts"]
    rules=(Rule(LinkExtractor(allow=('https?://tvmegasite\.net/transcripts/\w+/main/\d+\w+\.\w+', )), callback='parse_dir_contents'),)

    def parse(self, response):
        hxs=Selector(response)
        for href in hxs.xpath('/html/body/pre/a/@href'):
            url=response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        for sel in response.xpath('/html/body/pre'):
            item=SoapoperascrapeItem()
            item['link']=sel.xpath('a/@href').extract()
            yield item

结果#1:

[{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "blueprnt/", "clearday/", "dpawin/", "expeditn/", "nature/", "strtedge/", "sumipntg/", "themes.inf", "tp-dis4/", "tp-mod2/", "tp-pnt9/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "bottom.htm", "top.htm"]},
{"link": ["?N=A", "?M=A", "?S=A", "?D=D", "/", "_borders/", "_fpclass/", "_private/", "_themes/", "amc/", "atwt/", "aw/", "bb/", "days/", "gh/", "gl/", "images/", "oltl/", "passions/", "pc/", "resources/", "test/", "yr/"]},
{"link": ["?N=A", "?M=A", "?S=A", "?D=A", "/", "yr/", "test/", "resources/", "pc/", "passions/", "oltl/", "images/", "gl/", "gh/", "days/", "bb/", "aw/", "atwt/", "amc/", "_themes/", "_private/", "_fpclass/", "_borders/"]},
{"link": ["?N=A", "?M=A", "?S=D", "?D=A", "/", "_borders/", "_fpclass/", "_private/", "_themes/", "amc/", "atwt/", "aw/", "bb/", "days/", "gh/", "gl/", "images/", "oltl/", "passions/", "pc/", "resources/", "test/", "yr/"]},
{"link": ["?N=A", "?M=D", "?S=A", "?D=A", "/", "_private/", "images/", "amc/", "atwt/", "aw/", "bb/", "days/", "gh/", "gl/", "oltl/", "passions/", "pc/", "yr/", "resources/", "_fpclass/", "_borders/", "_themes/", "test/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "menuh.css", "menuh_main.css", "new3.css"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "amc_menu.html", "atwt_menu.html", "aw_menu.html", "bb_menu.html", "city_menu.html", "classic_prime.html", "current_prime.html", "day_amc.html", "day_atwt.html", "day_aw.html", "day_bb.html", "day_bottom.html", "day_city.html", "day_days.html", "day_gh.html", "day_gl.html", "day_oltl.html", "day_passions.html", "day_pc.html", "day_sube.html", "day_yr.html", "days_menu.html", "gh_menu.html", "ghns_menu.html", "gl_menu.html", "home_gen.html", "menu.js", "menu_ie4.js", "menu_ie5.js", "menu_moz.js", "menu_ns4.js", "menu_op5.js", "menu_op6.js", "menu_style.txt", "pass_menu.html", "pc_menu.html", "plinks_menu.html", "sniffer.js", "style.js", "sube_menu.html", "yr_menu.html"]}]

结果 #2:当我在稍后阶段开始爬网时,例如,从http://tvmegasite.net/transcripts/amc/main开始,令人惊讶的是,我并没有得到仅包含成绩单 url 的列表,而是其他垃圾用它:

[{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/amc/main/"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/amc/main/", "adbannerinclude4.txt", "custom.js", "custom_orig.js", "exfile.htm", "exfileinclude.txt", "menu.txt", "webringsinclude.txt", "webringsinclude_copy(1).txt"]},
{"link": ["?N=D", "?M=A", "?S=A", "?D=A", "/transcripts/", "localresources/", "main/", "older/"]},
{"link": ["?N=A", "?M=A", "?S=A", "?D=D", "/transcripts/amc/", "1998transcripts.shtml", "1999transcripts.shtml", "2001transcripts.shtml", "2002transcripts.shtml", "2003transcripts.shtml", "2004transcripts.shtml", "2005transcripts.shtml", "2006transcripts.shtml", "2007transcripts.shtml", "2008transcripts.shtml", "2009transcripts.shtml", "2010transcripts.shtml", "2011transcripts.shtml", "2013transcripts.shtml", "_overlay/", "favicon.ico", "localresources/", "newtemplate.shtml"]},
{"link": ["?N=A", "?M=A", "?S=D", "?D=A", "/transcripts/amc/", "_overlay/", "localresources/", "favicon.ico", "newtemplate.shtml", "1999transcripts.shtml", "1998transcripts.shtml", "2013transcripts.shtml", "2001transcripts.shtml", "2011transcripts.shtml", "2007transcripts.shtml", "2008transcripts.shtml", "2009transcripts.shtml", "2006transcripts.shtml", "2005transcripts.shtml", "2004transcripts.shtml", "2003transcripts.shtml", "2010transcripts.shtml", "2002transcripts.shtml"]},
{"link": ["?N=A", "?M=D", "?S=A", "?D=A", "/transcripts/amc/", "_overlay/", "favicon.ico", "localresources/", "newtemplate.shtml", "1998transcripts.shtml", "2010transcripts.shtml", "1999transcripts.shtml", "2001transcripts.shtml", "2002transcripts.shtml", "2003transcripts.shtml", "2004transcripts.shtml", "2005transcripts.shtml", "2006transcripts.shtml", "2007transcripts.shtml", "2008transcripts.shtml", "2009transcripts.shtml", "2011transcripts.shtml", "2013transcripts.shtml"]},
{"link": ["?N=A", "?M=A", "?S=A", "?D=A", "/transcripts/amc/", "newtemplate.shtml", "localresources/", "favicon.ico", "_overlay/", "2013transcripts.shtml", "2011transcripts.shtml", "2010transcripts.shtml", "2009transcripts.shtml", "2008transcripts.shtml", "2007transcripts.shtml", "2006transcripts.shtml", "2005transcripts.shtml", "2004transcripts.shtml", "2003transcripts.shtml", "2002transcripts.shtml", "2001transcripts.shtml", "1999transcripts.shtml", "1998transcripts.shtml"]}]
4

0 回答 0