-1

我一直在成功地从不同的网站提取数据并且到目前为止一直很成功,但现在我被困在一个网站上。我已经根据网络修改了我的代码,而且我是网络抓取的新手。

这是我的代码:

Option Explicit
Public Sub GetListings()
    Dim html As HTMLDocument, page As Long, html2 As HTMLDocument
    Dim results As Object, headers(), ws As Worksheet, i As Long

    Const START_PAGE As Long = 0
    Const END_PAGE As Long = 180

    Set ws = ThisWorkbook.Worksheets("Sheet1")
    headers = Array("Name", "Phone", "Address")
    Application.ScreenUpdating = False
    Set html = New HTMLDocument
    Set html2 = New HTMLDocument
    ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers

    With CreateObject("MSXML2.XMLHTTP")
        For page = START_PAGE To END_PAGE
            .Open "GET", "https://www.yelp.com/search?cflt=restaurants&find_loc=San%20Francisco%2C%20CA&start=" & page, False
            .send
            html.body.innerHTML = .responseText
            Set results = html.querySelectorAll(".lemon--ul__-27c0__1_cxs undefined list__373c0__2G8oH")
            Dim output(), r As Long
            ReDim output(1 To results.Length, 1 To 3)
            r = 1
            For i = 0 To results.Length - 1
                On Error Resume Next
                html2.body.innerHTML = results.Item(i).outerHTML
                output(r, 1) = html2.querySelector(".lemon--div__373c0__1mboc businessName__373c0__1fTgn border-color--default__373c0__2oFDT").innerText
                output(r, 2) = html2.querySelector(".lemon--div__373c0__1mboc display--inline-block__373c0__2de_K u-space-b1 border-color--default__373c0__2oFDT").innerText
                'output(r, 3) = html2.querySelector(".track-visit-website").href
                output(r, 3) = html2.querySelector(".lemon--div__373c0__1mboc display--inline-block__373c0__2de_K u-space-b1 border-color--default__373c0__2oFDT").innerText & " " & html2.querySelector(".lemon--div__373c0__1mboc u-space-b1 border-color--default__373c0__2oFDT").innerText
                On Error GoTo 0
                r = r + 1
            Next
            ws.Cells(GetLastRow(ws, 1) + 1, 1).Resize(UBound(output, 1), UBound(output, 2)) = output
            page = page + 30
        Next
    End With
    Application.ScreenUpdating = True
End Sub
Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
    With ws
        GetLastRow = .Cells(.Rows.Count, columnNumber).End(xlUp).Row
    End With
End Function

问题在下图中突出显示: 在此处输入图像描述

4

2 回答 2

2

尽管我不确定您所说的除法是什么意思,但我猜测并编写了一个脚本来达到目的。很难分离出要从中获取数据的元素部分。我几乎没有将我的代码放在两者之间On Error Resume NextOn Error GoTo 0但在这里我敢于这样做,因为我可以在你的脚本中看到相同的内容。地址块有两个不同的部分。我处理过一个。除法(我推测的)在地址块上。所以,当你看到脚本找不到地址时,它也不会找到分区。a[href]您可以通过在另一个中添加条件语句.querySelector()来查找丢失的地址来处理地址块。

Sub GetInfo()
    Const URL$ = "https://www.yelp.com/search?cflt=restaurants&find_loc=San%20Francisco%2C%20CA&start="
    Dim Http As New XMLHTTP60, Html As New HTMLDocument, Htmldoc As New HTMLDocument, page&, I&

    For page = 1 To 2 ' this is where you change the last number for the pages to traverse
        With Http
            .Open "GET", URL & page * 30, False
            .send
            Html.body.innerHTML = .responseText
        End With

        With Html.querySelectorAll("[class*='searchResult']")
            For I = 0 To .Length - 1
                Htmldoc.body.innerHTML = .item(I).outerHTML
                On Error Resume Next
                R = R + 1: Cells(R, 1) = Htmldoc.querySelector("[class*='heading--h3'] > a").innerText
                Cells(R, 2) = Htmldoc.querySelector("[class*='container'] > [class*='display--inline-block']").innerText
                Cells(R, 3) = Htmldoc.querySelector("[class*='container'] > address").innerText
                Cells(R, 4) = Htmldoc.querySelector("[class*='container'] > address").NextSibling.innerText
                On Error GoTo 0
            Next I
        End With
    Next page
End Sub

在运行脚本之前添加参考:

Microsoft Html Object Library
Microsoft xml, v6.0
于 2019-06-22T11:44:34.863 回答
0

部分解决了问题

这是修改后的代码。在某些情况下,它仍然无法获取地址

Set results = html.getElementsByClassName("lemon--div__373c0__1mboc largerScrollablePhotos__373c0__3FEIJ arrange__373c0__UHqhV border-color--default__373c0__2oFDT")
            Debug.Print results.Length
            Dim output(), r As Long
            ReDim output(1 To results.Length, 1 To 3)
            r = 1
            For i = 0 To results.Length - 1
                'On Error Resume Next
                html2.body.innerHTML = results.Item(i).innerHTML
                output(r, 1) = html2.getElementsByClassName("lemon--a__373c0__IEZFH link__373c0__29943 link-color--blue-dark__373c0__1mhJo link-size--inherit__373c0__2JXk5")(0).innerText
                output(r, 2) = html2.getElementsByClassName("lemon--p__373c0__3Qnnj text__373c0__2pB8f text-color--normal__373c0__K_MKN text-align--right__373c0__3ARv7")(0).innerText
                output(r, 3) = html2.getElementsByClassName("lemon--p__373c0__3Qnnj text__373c0__2pB8f text-color--normal__373c0__K_MKN text-align--right__373c0__3ARv7")(1).innerText
                'On Error GoTo 0
                r = r + 1
            Next
            ws.Cells(GetLastRow(ws, 1) + 1, 1).Resize(UBound(output, 1), UBound(output, 2)) = output

最好在不关闭屏幕更新的情况下运行脚本,因为即使是单个页面也需要花费大量时间。

于 2019-06-22T11:41:52.510 回答