9

我一直在尝试使用 IE 自动化在 Excel 中搜索一串文本。我想在 excel 的另一个单元格中返回第一个结果的网站的超链接。这可能吗?我有一个包含 60,000 条记录的列表,我需要用谷歌搜索并在第一个结果中返回网站的超链接。您会推荐另一种方法吗?我提前感谢您的帮助。

4

2 回答 2

19

由于它有 60,000 条记录,我建议使用 xmlHTTP 对象而不是使用 IE。
HTTP 请求更简单,更快

下载示例文件

Sub XMLHTTP()
    ' For every search term in column A (row 2 down to the last used row) of the
    ' active worksheet, request a Google results page over HTTP and write the
    ' FIRST result's title to column B and its link to column C.
    '
    ' Rows whose term is empty or an error value, whose request does not return
    ' HTTP 200, or whose page has no recognizable result are left untouched.
    ' Total elapsed minutes are printed to the Immediate window and shown in a
    ' message box when the run finishes.
    '
    ' NOTE(review): the "rso" container id and the "r" class on <h3> are taken
    ' from the original code; confirm they still match the markup Google serves.

    Dim url As String, lastRow As Long, i As Long
    Dim request As Object, html As Object, objResultDiv As Object, objH3 As Object
    Dim link As Object, anchors As Object
    Dim ws As Worksheet
    Dim queryText As String
    Dim start_time As Date
    Dim end_time As Date

    ' Bind to one sheet up front: DoEvents below lets the user switch sheets,
    ' and unqualified Range/Cells would then write to the wrong one.
    Set ws = ActiveSheet
    lastRow = ws.Range("A" & ws.Rows.Count).End(xlUp).Row

    ' Now (date + time) rather than Time, so the elapsed-minutes figure stays
    ' correct when a long run crosses midnight.
    start_time = Now
    Debug.Print "start_time:" & start_time

    ' One request object is enough; Open re-initializes it for each URL.
    Set request = CreateObject("MSXML2.serverXMLHTTP")

    For i = 2 To lastRow

        queryText = ""
        If Not IsError(ws.Cells(i, 1).Value) Then
            queryText = Trim$(CStr(ws.Cells(i, 1).Value))
        End If

        If Len(queryText) > 0 Then

            ' The term must be percent-encoded, otherwise spaces, "&", "#",
            ' "+" and non-ASCII characters corrupt the query string.
            url = "https://www.google.co.in/search?q=" & XMLHTTP_EncodeQuery(queryText) & _
                  "&rnd=" & WorksheetFunction.RandBetween(1, 10000)

            request.Open "GET", url, False
            request.setRequestHeader "Content-Type", "text/xml"
            request.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0"
            request.send

            If request.Status = 200 Then

                Set html = CreateObject("htmlfile")
                html.body.innerHTML = request.ResponseText
                Set objResultDiv = html.getelementbyid("rso")

                ' getElementById yields Nothing when the container is absent;
                ' dereferencing it would raise run-time error 91.
                If Not objResultDiv Is Nothing Then

                    Set objH3 = objResultDiv.getelementsbytagname("h3")

                    For Each link In objH3
                        If link.className = "r" Then
                            Set anchors = link.getelementsbytagname("a")
                            If anchors.Length > 0 Then
                                ws.Cells(i, 2) = link.innerText
                                ws.Cells(i, 3) = anchors(0).href
                                ' Stop at the first hit; continuing would
                                ' overwrite it with later results.
                                Exit For
                            End If
                        End If
                    Next link
                End If
            End If
        End If

        ' Keep Excel responsive during long runs.
        DoEvents
    Next i

    end_time = Now
    Debug.Print "end_time:" & end_time

    Debug.Print "done" & "Time taken : " & DateDiff("n", start_time, end_time) & " :minutes"
    MsgBox "done" & "Time taken : " & DateDiff("n", start_time, end_time)
End Sub

' Percent-encodes text as UTF-8 so it can be used as a URL query value.
' Unreserved characters (A-Z a-z 0-9 - . _ ~) pass through; every other
' character is emitted as one %XX group per UTF-8 byte.
Private Function XMLHTTP_EncodeQuery(ByVal text As String) As String
    Dim pos As Long, code As Long, low As Long
    Dim encoded As String

    pos = 1
    Do While pos <= Len(text)
        ' AscW returns a signed 16-bit value; mask it to 0..65535.
        code = AscW(Mid$(text, pos, 1)) And &HFFFF&

        ' Merge a UTF-16 surrogate pair into a single code point.
        If code >= &HD800& And code <= &HDBFF& And pos < Len(text) Then
            low = AscW(Mid$(text, pos + 1, 1)) And &HFFFF&
            If low >= &HDC00& And low <= &HDFFF& Then
                code = &H10000 + (code - &HD800&) * &H400& + (low - &HDC00&)
                pos = pos + 1
            End If
        End If

        Select Case code
            Case 48 To 57, 65 To 90, 97 To 122, 45, 46, 95, 126
                encoded = encoded & ChrW$(code)
            Case Is < &H80&
                encoded = encoded & "%" & Right$("0" & Hex$(code), 2)
            Case Is < &H800&
                encoded = encoded & "%" & Hex$(&HC0 Or (code \ &H40)) & _
                                    "%" & Hex$(&H80 Or (code And &H3F))
            Case Is < &H10000
                encoded = encoded & "%" & Hex$(&HE0 Or (code \ &H1000)) & _
                                    "%" & Hex$(&H80 Or ((code \ &H40) And &H3F)) & _
                                    "%" & Hex$(&H80 Or (code And &H3F))
            Case Else
                encoded = encoded & "%" & Hex$(&HF0 Or (code \ &H40000)) & _
                                    "%" & Hex$(&H80 Or ((code \ &H1000) And &H3F)) & _
                                    "%" & Hex$(&H80 Or ((code \ &H40) And &H3F)) & _
                                    "%" & Hex$(&H80 Or (code And &H3F))
        End Select

        pos = pos + 1
    Loop

    XMLHTTP_EncodeQuery = encoded
End Function

使用 CSS3 选择器

 Sub XMLHTTP1()
     ' CSS-selector variant: for every search term in column A (row 2 down to
     ' the last used row) of the active worksheet, request a Google results
     ' page and write the first element matching "div#rso h3.r a" to column B
     ' (link text) and column C (href).
     '
     ' HTMLDocument and HTMLAnchorElement are early-bound types, so the VBA
     ' project must reference the library that declares them.
     ' Rows whose term is empty or an error value, whose request does not
     ' return HTTP 200, or whose page has no match are left untouched.
     '
     ' NOTE(review): the selector is taken from the original code; confirm it
     ' still matches the markup Google serves.

     Dim url As String, i As Long, lastRow As Long
     Dim request As Object, html As HTMLDocument, objResultDiv As HTMLAnchorElement
     Dim ws As Worksheet
     Dim queryText As String

     ' Bind to one sheet up front: DoEvents below lets the user switch sheets,
     ' and unqualified Range/Cells would then write to the wrong one.
     Set ws = ActiveSheet
     lastRow = ws.Range("A" & ws.Rows.Count).End(xlUp).Row

     ' One request object is enough; Open re-initializes it for each URL.
     Set request = CreateObject("MSXML2.serverXMLHTTP")

     For i = 2 To lastRow

         queryText = ""
         If Not IsError(ws.Cells(i, 1).Value) Then
             queryText = Trim$(CStr(ws.Cells(i, 1).Value))
         End If

         If Len(queryText) > 0 Then

             ' The term must be percent-encoded, otherwise spaces, "&", "#",
             ' "+" and non-ASCII characters corrupt the query string.
             url = "https://www.google.co.in/search?q=" & XMLHTTP1_EncodeQuery(queryText) & _
                   "&rnd=" & WorksheetFunction.RandBetween(1, 10000)

             request.Open "GET", url, False
             request.setRequestHeader "Content-Type", "text/xml"
             request.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0"
             request.send

             If request.Status = 200 Then

                 Set html = New HTMLDocument
                 html.body.innerHTML = request.ResponseText
                 Set objResultDiv = html.querySelector("div#rso h3.r a")

                 ' querySelector yields Nothing when nothing matches;
                 ' dereferencing it would raise run-time error 91.
                 If Not objResultDiv Is Nothing Then
                     ws.Cells(i, 2) = objResultDiv.innerText
                     ws.Cells(i, 3) = objResultDiv.href
                 End If
             End If
         End If

         ' Keep Excel responsive during long runs.
         DoEvents
     Next i

 End Sub

 ' Percent-encodes text as UTF-8 so it can be used as a URL query value.
 ' Unreserved characters (A-Z a-z 0-9 - . _ ~) pass through; every other
 ' character is emitted as one %XX group per UTF-8 byte.
 Private Function XMLHTTP1_EncodeQuery(ByVal text As String) As String
     Dim pos As Long, code As Long, low As Long
     Dim encoded As String

     pos = 1
     Do While pos <= Len(text)
         ' AscW returns a signed 16-bit value; mask it to 0..65535.
         code = AscW(Mid$(text, pos, 1)) And &HFFFF&

         ' Merge a UTF-16 surrogate pair into a single code point.
         If code >= &HD800& And code <= &HDBFF& And pos < Len(text) Then
             low = AscW(Mid$(text, pos + 1, 1)) And &HFFFF&
             If low >= &HDC00& And low <= &HDFFF& Then
                 code = &H10000 + (code - &HD800&) * &H400& + (low - &HDC00&)
                 pos = pos + 1
             End If
         End If

         Select Case code
             Case 48 To 57, 65 To 90, 97 To 122, 45, 46, 95, 126
                 encoded = encoded & ChrW$(code)
             Case Is < &H80&
                 encoded = encoded & "%" & Right$("0" & Hex$(code), 2)
             Case Is < &H800&
                 encoded = encoded & "%" & Hex$(&HC0 Or (code \ &H40)) & _
                                     "%" & Hex$(&H80 Or (code And &H3F))
             Case Is < &H10000
                 encoded = encoded & "%" & Hex$(&HE0 Or (code \ &H1000)) & _
                                     "%" & Hex$(&H80 Or ((code \ &H40) And &H3F)) & _
                                     "%" & Hex$(&H80 Or (code And &H3F))
             Case Else
                 encoded = encoded & "%" & Hex$(&HF0 Or (code \ &H40000)) & _
                                     "%" & Hex$(&H80 Or ((code \ &H1000) And &H3F)) & _
                                     "%" & Hex$(&H80 Or ((code \ &H40) And &H3F)) & _
                                     "%" & Hex$(&H80 Or (code And &H3F))
         End Select

         pos = pos + 1
     Loop

     XMLHTTP1_EncodeQuery = encoded
 End Function

输出

在此处输入图像描述

HTH
桑托什

于 2013-07-06T03:51:11.450 回答
0

链接似乎始终在 H3 标记内。通常,您可以使用以下代码来等待页面加载完毕:

' Win32 Sleep: suspends the calling thread for the given number of milliseconds.
' 64-bit Office (VBA7) refuses Declare statements that lack PtrSafe, so both
' forms are provided behind conditional compilation.
#If VBA7 Then
    Private Declare PtrSafe Sub Sleep Lib "kernel32" (ByVal nMilliseconds As Long)
#Else
    Private Declare Sub Sleep Lib "kernel32" (ByVal nMilliseconds As Long)
#End If

Sub UseIE()
    ' Opens a hidden Internet Explorer instance, navigates to a page, waits for
    ' the load to finish, and exposes the loaded document in thePage for
    ' further processing. The browser is closed before the procedure returns.

    ' IE is created late-bound via CreateObject, so the READYSTATE_COMPLETE
    ' enum member is not in scope unless the project references the IE type
    ' library. Defining it locally keeps the wait loop correct either way.
    Const READYSTATE_COMPLETE As Long = 4

    Dim ie As Object
    Dim thePage As Object
    Dim strTextOfPage As String   ' not used yet; kept for the code added below

    Set ie = CreateObject("InternetExplorer.Application")
    'ie.FullScreen = True
    With ie
        '.Visible = True
        .Navigate "http://www.bbc.co.uk"
        Do While .Busy Or .ReadyState <> READYSTATE_COMPLETE
            Sleep 500      'wait 1/2 sec before trying again
            DoEvents       'keep the host application responsive while waiting
        Loop
    End With

    Set thePage = ie.Document
    'more code here

    ' Close the (invisible) browser; otherwise every run leaves an
    ' iexplore.exe process behind.
    Set thePage = Nothing
    ie.Quit
    Set ie = Nothing
End Sub

不过,我更倾向于反复尝试用 getElementsByTagName("H3") 引用第一个 H3 中的 A 元素:先取得这些 H3 元素中的第一个,然后在其中查找 A 链接及其 href 属性。

在 JavaScript 中,尝试引用不存在的元素会返回undefined,但从 VBA 中它可能需要错误处理代码。

一旦我获得了 href,我将停止导航(可能是用 ie.Stop,我不确定这个命令)或立即导航到下一页。

但是,第一个链接通常是赞助链接,并且返回的 href 有点乱码。这些赞助商链接的文本似乎包含em标签。我可能会使用这些信息来丢弃这些链接并在页面下方查看。

我不知道是否有更好的方法来做到这一点。

于 2013-07-05T20:23:33.567 回答