0

我已经创建了抓取网站的代码。两个问题。

  1. 该代码应该是递归的以获取来自同一域的所有链接,但它会停止而不检索所有链接。感觉循环函数有问题

  2. 模拟函数的测试代码失败。实现 foo 函数的类似代码对我有用,但这个没有。

类爬虫{

  val mainURL = "http://www.eldiario.es"


  def getLinksPage(urlToCrawl: String): List[String] = {

    val connURL: Try[Document] = Try(Jsoup.connect(urlToCrawl).get())

    def links(doc:Document):  Try[List[String]] = Try {
      val elements = doc.select("a[href]").asScala
      val links = elements.map(_.attr("abs:href")).toSeq
      val linksURL = links.map(new URL(_))

      val targetURL = (new URL(urlToCrawl)).getHost
      val listLinks = linksURL.filter(_.getHost == targetURL).map(_.toString).toList
      listLinks
    }

    val getListLinks: Try[List[String]] = for {
      a <- connURL
      b <- links(a)
    } yield  b

    val pageLinks: List[String] = getListLinks.getOrElse(List[String](urlToCrawl))
    println(pageLinks)
    pageLinks
  }


  def loop(ls: List[String], acc: List[String]): List[String] = ls match {
    case Nil => acc
    case hd::tl => if (!acc.contains(hd))   loop(getLinksPage(hd),hd::acc)
                        else  loop(tl, acc)
  }

  def getAllLinkPages(mainURL:String)= loop(getLinksPage(mainURL), List(mainURL))

}

   class CrawlerSpec extends WordSpec with MockFactory {

      trait LinksFixture {

        val getLinksPage = stubFunction[String, List[String]]

        lazy val crawlerMock = new Crawler() {
          override def getLinksPage(urlToCrawl: String) = LinksFixture.this.getLinksPage(urlToCrawl)
        }
      }

      "getLinksPage" should {
        "return the links" in new LinksFixture {

          getLinksPage when "http://example.com" returns  List("http://example.com", "http://example.com/a", "http://example.com/b")

          crawlerMock.getLinksPage("http://example.com") shouldBe  List("http://example.com", "http://example.com/a", "http://example.com/b")

        }
      }
    }

    [info] CrawlerSpec:
    [info] getLinksPage
    [info] - should return the links *** FAILED ***
    [info]   scala.MatchError: null
    [info]   at rbs.Crawler.loop(Crawler.scala:43)
    [info]   at rbs.Crawler.getAllLinkPages(Crawler.scala:47)
    [info]   at rbs.Crawler.<init>(Crawler.scala:49)

编辑 2:使用 mockFunction

[info] CrawlerSpec:
[info] getLinksPage
[info] - should return the links *** FAILED ***
[info]   Unexpected call: MockFunction1-1(http://www.eldiario.es)
[info]
[info]   Expected:
[info]   inAnyOrder {
[info]     MockFunction1-1(http://example.com) once (never called - UNSATISFIED)
[info]   }
[info]
[info]   Actual:
[info]     MockFunction1-1(http://www.eldiario.es) (Option.scala:121)
4

0 回答 0