web-scraping - 使用 rapidminer 对 imdb 电影评论进行文本分析

Question

我正在使用 RapidMiner 对特定电影的评论进行分析。我使用“Get Pages”算子从 IMDB 中提取评论。网站上列出了大约 94 条评论，但提取后我只得到了 21 条。XML 代码如下：

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process: read review-page URLs from an Excel sheet, download each page,
  cut every page into one segment per review, extract the review text and the rating
  from each segment, and write the resulting example set to a second Excel file.
-->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">
      <!-- Reads cells A1:A5 of rev.xlsx; the single column is imported as the text attribute "Link". -->
      <operator activated="true" class="read_excel" compatibility="5.3.015" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
        <parameter key="excel_file" value="C:\Users\Arbind\Desktop\review\rev.xlsx"/>
        <parameter key="imported_cell_range" value="A1:A5"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Link.true.text.attribute"/>
        </list>
      </operator>
      <!-- Retrieves the web page behind every URL stored in the "Link" attribute. -->
      <operator activated="true" class="web:retrieve_webpages" compatibility="5.3.001" expanded="true" height="60" name="Get Pages" width="90" x="179" y="30">
        <parameter key="link_attribute" value="Link"/>
      </operator>
      <!-- Keeps only the attribute "gensym1" (presumably the downloaded page content; TODO confirm). -->
      <operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes" width="90" x="313" y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="gensym1"/>
      </operator>
      <!-- Turns each page into documents; terms occurring in more than 90 percent of the documents are pruned. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="447" y="30">
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prune_above_percent" value="90.0"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <!--
            Splits a page into one segment per review using a "Regular Region" query.
            The query value holds the start and the end expression separated by an
            unescaped dot; "\." stands for a literal regex dot inside either expression.
            Both expressions match the <hr ... align="center"> separator between reviews.
            The end expression is wrapped in a zero-width lookahead (?=...) so that the
            closing separator is not consumed and can also open the next review. With two
            identical consuming expressions only every other review was extracted.
          -->
          <operator activated="true" class="text:cut_document" compatibility="5.3.002" expanded="true" height="60" name="Cut Document" width="90" x="112" y="30">
            <parameter key="query_type" value="Regular Region"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries">
              <parameter key="extract" value="&lt;hr[^&gt;]\.*align=&quot;center&quot;&gt;.(?=&lt;hr[^&gt;]\.*align=&quot;center&quot;&gt;)"/>
            </list>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <list key="index_queries"/>
            <process expanded="true">
              <!-- Per segment: "review" takes the text of the p elements, "rating" the alt text of the img elements. -->
              <operator activated="true" class="text:extract_information" compatibility="5.3.002" expanded="true" height="60" name="Extract Information" width="90" x="112" y="30">
                <parameter key="query_type" value="XPath"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries"/>
                <list key="xpath_queries">
                  <parameter key="review" value="//h:p/text()"/>
                  <parameter key="rating" value="//h:img/@alt"/>
                </list>
                <list key="namespaces"/>
                <list key="index_queries"/>
              </operator>
              <connect from_port="segment" to_op="Extract Information" to_port="document"/>
              <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="document" to_op="Cut Document" to_port="document"/>
          <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Writes the example set produced by Process Documents from Data to "imdb rev2.xlsx". -->
      <operator activated="true" class="write_excel" compatibility="5.3.015" expanded="true" height="76" name="Write Excel" width="90" x="514" y="210">
        <parameter key="excel_file" value="C:\Users\Arbind\Desktop\review\imdb rev2.xlsx"/>
      </operator>
      <connect from_op="Read Excel" from_port="output" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Write Excel" to_port="input"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

我在 Excel 文件中逐个给出了评论页面的链接。我还需要提取用户给出的“评级”。

score 0 · Accepted Answer

电子表格导入（Read Excel）的 imported cell range 设置为 A1:A5，这意味着存在一个隐含的 for 循环来依次获取这 5 个 URL。此外，Get Pages 在获取页面时可能会跟随重定向，从而产生更多的抓取请求。不知道具体的 URL 就无法确定实际情况。

您可以发布电子表格的片段吗？

score 0 · Accepted Answer

The regular region setting is finding every other review. This is because the regular expression for the start of the region is the same as the one for the end. Once an end has been found no start will be found until the end of the next review.

web-scraping - 使用 rapidminer 对 imdb 电影评论进行文本分析

2 回答 2

Related

Reference