0

我编写了以下 ruby​​ 脚本:

require 'open-uri'
require 'Nokogiri'
require 'anemone'

class JobFox

  attr_accessor :company_url,
                :jobs_page,
                :max_words,
                :jobs_part,
                :jobs_container,
                :element_score,
                :max_score,
                :jobs

  def calc_element_score(element)
    self.element_score += (element['class'].to_s.scan(/job|career|position|opening/).count + element['id'].to_s.scan(/job|career|position|opening/).count) * 100
    self.element_score += element.to_s.scan(/job|career|position|opening/).count * 5
    element.css('a').each do |a|
      self.element_score += a.to_s.scan(/job|career|position|opening/).count * 7
    end
    element.css('li').each do |li|
      self.element_score += li.to_s.scan(/job|career|position|opening/).count * 5
    end
    element.css('h').each do |h|
      self.element_score += h.to_s.scan(/job|career|position|opening/).count * 3
    end
    if self.element_score > self.max_score
      self.max_score = self.element_score
      self.jobs_part = element
    end
    if element.children.count == 0
      self.element_score = 0
    end
  end

end

fox = JobFox.new
fox.company_url = 'http://www.website.com'
fox.max_words = 0
fox.jobs = []

# CRAWL THE WEBSITE TO FIND THE JOBS LINK
Anemone.crawl(fox.company_url, :depth_limit => 3) do |anemone|
  anemone.on_pages_like(/job|jobs|career|careers|team|about/) do |page|
    begin
      puts "SCANNING: " + page.url.to_s
      # SCAN THE HTML AND FIND THE OCCURENCES OF THE WORD "JOB"
      source_html = open(page.url).read
      job_occurences = source_html.scan(/job|jobs|work|position/).count
      # IF MORE OCCURENCES THAN BEFORE, WE KEEP THE PAGE URL
      if job_occurences > fox.max_words
        fox.max_words = job_occurences
        fox.jobs_page = page.url
      end
    rescue Exception => e
      puts e
    end
  end
end

fox.jobs_container = Nokogiri::HTML(open(fox.jobs_page))
fox.element_score = fox.max_score = 0

fox.jobs_container.css('div, section').each do |container|
  container.traverse do |element|
    fox.calc_element_score(element)
  end
end

fox.jobs_part.traverse do |element|
  element.css('a').each do |job|
    fox.jobs << job.text
  end
end

# REMOVE POSSIBLE DUPLICATE ENTRIES
fox.jobs = fox.jobs.uniq
puts fox.jobs

我正在尝试将其移植到 Rails 应用程序 - 不是作为脚本/任务,而是作为模型函数:

require 'anemone'
require 'open-uri'
require 'Nokogiri'

class Company < ActiveRecord::Base

    has_many :jobs
    accepts_nested_attributes_for :jobs

    # CALCULATE THE RELATEDNESS OF EACH HTML ELEMENT
    def calculate_element_score(element)
        @jobs_expression = '/job|career|position|opening/'
    @element_score += (element['class'].to_s.scan(@jobs_expression).count + element['id'].to_s.scan(@jobs_expression).count) * 100
    @element_score += element.to_s.scan(@jobs_expression).count * 5
    element.css('a').each do |a|
      @element_score += a.to_s.scan(@jobs_expression).count * 7
    end
    element.css('li').each do |li|
      @element_score += li.to_s.scan(@jobs_expression).count * 5
    end
    element.css('h').each do |h|
      @element_score += h.to_s.scan(@jobs_expression).count * 3
    end
    if @element_score > @max_score
      @max_score = @element_score
      @jobs_part = element
    end
    if element.children.count == 0
      @element_score = 0
    end
  end

    # CRAWL THE WEBSITE TO FIND THE JOBS PAGE
    def find_jobs_page
        max_words = 0
        Anemone.crawl(self.website, :depth_limit => 3) do |anemone|
          anemone.on_pages_like(/job|jobs|career|careers|team|about/) do |page|
            begin
              # SCAN THE HTML AND FIND OCCURENCES OF RELEVANT WORDS
              source_html = open(page.url).read
              job_occurences = source_html.scan(/job|jobs|work|position/).count
              # IF MORE OCCURENCES THAN BEFORE, KEEP THE PAGE URL
              if job_occurences > max_words
                max_words = job_occurences
                self.jobs_page = page.url
              end
            rescue Exception => e
              puts e
            end
          end
        end
    end

    # FIND THE CONTAINER THAT HAS THE JOB LISTINGS
    def find_jobs_container
        jobs_container = Nokogiri::HTML(open(self.jobs_page))
        @element_score = @max_score = 0
        @jobs_expression = '/job|career|position|opening/'

        jobs_container.css('div, section').each do |container|
          container.traverse do |element|
            self.calculate_element_score(element)
          end
        end
    end 

    # ADD THE JOBS FROM THE PAGE TO THE COMPANY ASSOCIATION
    def extract_jobs
        @jobs_part.traverse do |element|
          element.css('a').each do |job|
            j = JOBS.new()
            j.title = job.text
            j.url = job
            self.jobs << j
          end
        end
    end

    # THE METHOD TO FIND ALL THE JOBS FOR A COMPANY
    def find_jobs
        self.find_jobs_page
        self.find_jobs_container
        self.extract_jobs
    end

end

除了calculate_element_score方法之外,一切都很好 - @elements_score 始终为 0。关于全局变量,我是否理解完全错误的东西?

4

0 回答 0