我编写了以下 ruby 脚本:
require 'open-uri'
require 'Nokogiri'
require 'anemone'
class JobFox
attr_accessor :company_url,
:jobs_page,
:max_words,
:jobs_part,
:jobs_container,
:element_score,
:max_score,
:jobs
def calc_element_score(element)
self.element_score += (element['class'].to_s.scan(/job|career|position|opening/).count + element['id'].to_s.scan(/job|career|position|opening/).count) * 100
self.element_score += element.to_s.scan(/job|career|position|opening/).count * 5
element.css('a').each do |a|
self.element_score += a.to_s.scan(/job|career|position|opening/).count * 7
end
element.css('li').each do |li|
self.element_score += li.to_s.scan(/job|career|position|opening/).count * 5
end
element.css('h').each do |h|
self.element_score += h.to_s.scan(/job|career|position|opening/).count * 3
end
if self.element_score > self.max_score
self.max_score = self.element_score
self.jobs_part = element
end
if element.children.count == 0
self.element_score = 0
end
end
end
fox = JobFox.new
fox.company_url = 'http://www.website.com'
fox.max_words = 0
fox.jobs = []
# CRAWL THE WEBSITE TO FIND THE JOBS LINK
Anemone.crawl(fox.company_url, :depth_limit => 3) do |anemone|
anemone.on_pages_like(/job|jobs|career|careers|team|about/) do |page|
begin
puts "SCANNING: " + page.url.to_s
# SCAN THE HTML AND FIND THE OCCURENCES OF THE WORD "JOB"
source_html = open(page.url).read
job_occurences = source_html.scan(/job|jobs|work|position/).count
# IF MORE OCCURENCES THAN BEFORE, WE KEEP THE PAGE URL
if job_occurences > fox.max_words
fox.max_words = job_occurences
fox.jobs_page = page.url
end
rescue Exception => e
puts e
end
end
end
fox.jobs_container = Nokogiri::HTML(open(fox.jobs_page))
fox.element_score = fox.max_score = 0
fox.jobs_container.css('div, section').each do |container|
container.traverse do |element|
fox.calc_element_score(element)
end
end
fox.jobs_part.traverse do |element|
element.css('a').each do |job|
fox.jobs << job.text
end
end
# REMOVE POSSIBLE DUPLICATE ENTRIES
fox.jobs = fox.jobs.uniq
puts fox.jobs
我正在尝试将其移植到 Rails 应用程序 - 不是作为脚本/任务,而是作为模型函数:
require 'anemone'
require 'open-uri'
require 'Nokogiri'
class Company < ActiveRecord::Base
has_many :jobs
accepts_nested_attributes_for :jobs
# CALCULATE THE RELATEDNESS OF EACH HTML ELEMENT
def calculate_element_score(element)
@jobs_expression = '/job|career|position|opening/'
@element_score += (element['class'].to_s.scan(@jobs_expression).count + element['id'].to_s.scan(@jobs_expression).count) * 100
@element_score += element.to_s.scan(@jobs_expression).count * 5
element.css('a').each do |a|
@element_score += a.to_s.scan(@jobs_expression).count * 7
end
element.css('li').each do |li|
@element_score += li.to_s.scan(@jobs_expression).count * 5
end
element.css('h').each do |h|
@element_score += h.to_s.scan(@jobs_expression).count * 3
end
if @element_score > @max_score
@max_score = @element_score
@jobs_part = element
end
if element.children.count == 0
@element_score = 0
end
end
# CRAWL THE WEBSITE TO FIND THE JOBS PAGE
def find_jobs_page
max_words = 0
Anemone.crawl(self.website, :depth_limit => 3) do |anemone|
anemone.on_pages_like(/job|jobs|career|careers|team|about/) do |page|
begin
# SCAN THE HTML AND FIND OCCURENCES OF RELEVANT WORDS
source_html = open(page.url).read
job_occurences = source_html.scan(/job|jobs|work|position/).count
# IF MORE OCCURENCES THAN BEFORE, KEEP THE PAGE URL
if job_occurences > max_words
max_words = job_occurences
self.jobs_page = page.url
end
rescue Exception => e
puts e
end
end
end
end
# FIND THE CONTAINER THAT HAS THE JOB LISTINGS
def find_jobs_container
jobs_container = Nokogiri::HTML(open(self.jobs_page))
@element_score = @max_score = 0
@jobs_expression = '/job|career|position|opening/'
jobs_container.css('div, section').each do |container|
container.traverse do |element|
self.calculate_element_score(element)
end
end
end
# ADD THE JOBS FROM THE PAGE TO THE COMPANY ASSOCIATION
def extract_jobs
@jobs_part.traverse do |element|
element.css('a').each do |job|
j = JOBS.new()
j.title = job.text
j.url = job
self.jobs << j
end
end
end
# THE METHOD TO FIND ALL THE JOBS FOR A COMPANY
def find_jobs
self.find_jobs_page
self.find_jobs_container
self.extract_jobs
end
end
除了calculate_element_score
方法之外,一切都很好 - @elements_score 始终为 0。关于全局变量,我是否理解完全错误的东西?