0

我需要做一个网络数据抓取工具。

  1. 我需要登录网站:用户,密码,点击登录按钮
  2. 单击第二个按钮
  3. 等待页面加载,这里是有问题的表格。该表是一个通话记录,动态添加新内容,所以总是在刷新。
  4. 我想从表格内容中排除一个表单并限制粘贴到 Excel 的行。

我通过 InternetExplorer.Application 代码使其工作,但我需要切换到 MSXML2.XMLHTTP 代码,因为它非常慢。

工作 InternetExplorer.Application 版本:

 Sub extractTablesData()
 'we define the essential variables

 Dim IE As Object, obj As Object
 Dim r As Integer, c As Integer, t As Integer
 Dim elemCollection As Object


 'add the "Microsoft Internet Controls" reference in your VBA Project indirectly
 Set IE = CreateObject("InternetExplorer.Application")

 With IE
 .Silent = True
 .Visible = True
 .navigate ("https://www.clickphone.ro")

 ' we ensure that the web page downloads completely before we fill the form automatically
 While IE.readyState <> 4
 DoEvents
 Wend
Application.Wait Now + TimeValue("00:00:03")
Set HTMLDoc = IE.document
 HTMLDoc.all.user.Value = "user or email" 'Enter your email/user id here
 HTMLDoc.all.pass.Value = "xXXxXXXxxXXXxx" 'Enter your password here
 'Login Button Click               
 With IE.document

    Set elems = .getElementsByTagName("a")
    For Each e In elems

        If (e.getAttribute("class") = "orange_button") Then
            e.Click
            Exit For
        End If

    Next e

End With

 'Needed Table page Button Click https://www.clickphone.ro/account/istoric_apel_in.html
 While IE.readyState <> 4
 DoEvents
 Wend
Set iedoc = IE.document

Set elems = iedoc.getElementsByClassName("black")(12)
    elems.Click

 ' again ensuring that the web page loads completely before we start scraping data
 While IE.readyState <> 4
 DoEvents
 Wend
 Application.Wait Now + TimeValue("00:00:05")
 Set iedoc = IE.document

'Clearing any unnecessary or old data in Sheet1

 ThisWorkbook.Sheets("Sheet1").Range("A1:K1000").ClearContents

'Scrapping Data and past to Sheet1
 Set elemCollection = IE.document.getElementsByTagName("table")

    For t = 0 To (elemCollection.Length - 1)
        For r = 0 To (elemCollection(t).Rows.Length - 1)
            For c = 0 To (elemCollection(t).Rows(r).Cells.Length - 1)
                ThisWorkbook.Worksheets(1).Cells(r + 1, c + 1) = elemCollection(t).Rows(r).Cells(c).innerText
            Next c
        Next r
    Next t

 End With

 ' cleaning up memory
 Set IE = Nothing

 End Sub

这是我对 MSXMLHTTP 的尝试:

Option Explicit
 'reference to Microsoft Internet Controls
 'reference to Microsoft HTML Object Library

Sub Web_Table_Option_One()
Dim xml    As Object
Dim html   As Object
Dim objTable As Object
Dim result As String
Dim lRow As Long
Dim lngTable As Long
Dim lngRow As Long
Dim lngCol As Long
Dim ActRw As Long

Set xml = CreateObject("MSXML2.XMLHTTP.6.0")

Set html = CreateObject("htmlFile")

With xml
.Open "POST", "https://www.clickphone.ro/login.html", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send "userName=USER&password=XXXXxxxxXxxxxXXX"
.Open "GET", "https://www.clickphone.ro/account/istoric_apel_in.html", False 
.setRequestHeader "Content-type", "text/xml"
.send
End With

html.body.innerHTML = xml.responseText

Set objTable = html.getElementsByTagName("table")
 For lngTable = 0 To objTable.Length - 1
        For lngRow = 0 To objTable(lngTable).Rows.Length - 1
            For lngCol = 0 To objTable(lngTable).Rows(lngRow).Cells.Length - 1
                ThisWorkbook.Sheets("Sheet2").Cells(ActRw + lngRow + 1, lngCol + 1) = objTable(lngTable).Rows(lngRow).Cells(lngCol).innerText
            Next lngCol
        Next lngRow
        ActRw = ActRw + objTable(lngTable).Rows.Length + 1
    Next lngTable
End Sub

HTML源代码:

对于用户,通过,登录按钮:

<form action="/login.html" id="toploginform" name="toploginform" method="post">
                                                                      <script>
                                            function processLoginForm(){
                                                with (document.toploginform) {
                                                    if (user.value=="Email"){alert('Email/Parola incorecte!'); return false}
                                                    document.getElementById('toploginform').submit();
                                                }
                                            }
                                        </script>

                                                                            <fieldset>
                                            <input name="userlogin" type="hidden" id="userlogin" value="true" />
                                            <span class="text">
                                            <input name="user" type="text" onFocus="if(this.value=='Email'){this.value=''}" onBlur="if(this.value==''){this.value='Email'}" value="Email">
                                            </span> <span class="text">
                                            <input name="pass" type="password" onFocus="if(this.value=='Password'){this.value=''}" onBlur="if(this.value==''){this.value='Password'}" value="Password">
                                            </span> 
                                            <input name="authcode" type="hidden" id="authcode" value="false" />
                                            <span><a href="#" class="orange_button" onClick="return processLoginForm()">Login</a></span>
                                             <span class="links"><a href="/login~parola.html">Am uitat parola</a><br/>
                                                <input class="css-checkbox" id="checkbox2" type="checkbox" name="rememberpass" value="da" />
                                                <label for="checkbox2" name="checkbox2_lbl" class="css-label lite-orange-check">Retin datele?</label>
                                        </span>
                                        </fieldset>                         
                                                                </form>

表格页面按钮:

<br />&nbsp;<img src="/images/sageata_orange.gif" width="7" height="8" />&nbsp;<a class="black" href="/account/istoric_apel_in.html">Apeluri primite</a>

表源代码:

<table class="TabelDate" cellspacing="0">
  <thead>
    <tr>
      <th width="130">Data</th>
      <th>Sursa</th>
      <th>Destinatie</th>
      <th>Durata</th>
      <th class="ultima">Status</th>
    </tr>
  </thead>
  <tr class="u">    <td class="prima">19-03-2017 17:31:16</td><td><font color="green"><form name="form24-1489937476.41719" method="post" action="">0720145931 <a class="TipFB" href="#"><span class="tip"><span class="tipTitle">Suna inapoi la 0720145931</span><span class="tipBody">Click si serverul te va suna gratuit pe numarul tau 0371780434.<br />Dupa ce raspunzi se formeaza automat numarul 0720145931.</span><span class="tipArrow"></span></span><input type="image" name="btn_opentextbox" src="/images/phone_small.gif" value="Submit" /></a>    <input name="numartel" type="hidden" id="numartel" value="0720145931" /></form></font></td><td align="center"><font color="green">0371780444</font></td><td align="center"><font color="green">00:00:07</font></td>
            <td class="ultima" align="center"><font color="green">Apel preluat</font></td></tr>  <tr class="gri">    <td class="prima">19-03-2017 17:30:48</td><td><font color="green"><form name="form24-1489937448.41715" method="post" action="">0728409617 <a class="TipFB" href="#"><span class="tip"><span class="tipTitle">Suna inapoi la 0728409617</span><span class="tipBody">Click si serverul te va suna gratuit pe numarul tau 0371780434.<br />Dupa ce raspunzi se formeaza automat numarul 0728409617.</span><span class="tipArrow"></span></span><input type="image" name="btn_opentextbox" src="/images/phone_small.gif" value="Submit" /></a>    <input name="numartel" type="hidden" id="numartel" value="0728409617" /></form></font></td><td align="center"><font color="green">0371780655</font></td><td align="center"><font color="green">00:00:07</font></td>
4

2 回答 2

0

我设法部分解决了我的问题。现在我可以使用 XmlHttp 登录并检索我需要的表。我将在此处发布工作代码,以便每个人都可以使用它(我不认为此代码有任何功劳,我是在不同论坛的帮助下完成的)

Option Explicit
 'reference to Microsoft Internet Controls
 'reference to Microsoft HTML Object Library

Sub CallLog()
Dim xml    As Object
Dim html   As Object
Dim objTable As Object
Dim result As String
Dim lRow As Long
Dim lngTable As Long
Dim lngRow As Long
Dim lngCol As Long
Dim ActRw As Long

Set xml = CreateObject("MSXML2.XMLHTTP.6.0")

Set html = CreateObject("htmlFile")

With xml
.Open "POST", "https://www.XXXXXX.xx/login.html", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded" 'send appropriate Headers
.send "userlogin=true&user=USERNAME&pass=PASSWORD&authcode=false" ' send login info
'MsgBox xml.responseText
.Open "GET", "https://www.XXXXXX.xx/account/callLog.html", False
.setRequestHeader "Content-type", "text/xml"
.send
End With

html.body.innerHTML = xml.responseText

Set objTable = html.getElementsByTagName("table")
    For lngTable = 0 To objTable.Length - 1
        For lngRow = 0 To objTable(lngTable).Rows.Length - 1
            For lngCol = 0 To objTable(lngTable).Rows(lngRow).Cells.Length - 1
                ThisWorkbook.Sheets("Sheet2").Cells(ActRw + lngRow + 1, lngCol + 1) = objTable(lngTable).Rows(lngRow).Cells(lngCol).innerText
            Next lngCol
        Next lngRow
        ActRw = ActRw + objTable(lngTable).Rows.Length + 1
    Next lngTable
End Sub

现在我剩下两个问题......我怎样才能从父“表”中获取子“表”(我要的表在一个更大的表中,见下面的源代码)我想得到只有第一行,但不包括行中的“表单”(它是一个 href 链接) 源代码

我怎么能连续得到这个(这个表是动态的,每次有人打电话给我时它都会更新,第一行,不断更新)

于 2017-03-28T12:29:38.330 回答
0

我的工作代码的 2.0 版:

Option Explicit
 'reference to Microsoft Internet Controls
 'reference to Microsoft HTML Object Library

Sub CallLog()
Dim xml    As Object
Dim html   As Object
Dim objTable As Object
Dim result As String
Dim lRow As Long
Dim lngTable As Long
Dim lngRow As Long
Dim lngCol As Long
Dim ActRw As Long

Set xml = CreateObject("MSXML2.XMLHTTP.6.0")

Set html = CreateObject("htmlFile")

With xml
.Open "POST", "https://www.XXXXXX.xx/login.html", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded" 'send appropriate Headers
.send "userlogin=true&user=USERNAME&pass=PASSWORD&authcode=false" ' send login info
'MsgBox xml.responseText
.Open "GET", "https://www.XXXXXX.xx/account/callLog.html", False
.setRequestHeader "Content-type", "text/xml"
.send
End With

html.body.innerHTML = xml.responseText

Set objTable = html.getElementsByTagName("table")
ThisWorkbook.Sheets("LogClickPhone").Range("A2") = objTable(1).Rows(1).Cells(0).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("B2") = objTable(1).Rows(1).Cells(1).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("C2") = objTable(1).Rows(1).Cells(2).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("D2") = objTable(1).Rows(1).Cells(3).innerText
ThisWorkbook.Sheets("LogClickPhone").Range("E2") = objTable(1).Rows(1).Cells(4).innerText
End Sub

我设法只得到我需要的行,但速度很慢,需要 38.5 秒才能完成。我想我会更好地使用 MSXML2.DOMDocument.6.0 结构来获取我需要的文本。但我不知道该怎么做。问题:我如何使这段代码自动化,使其每 60 秒左右运行一次?发送

于 2017-03-31T13:23:40.067 回答