我正在使用 Neo4j 进行我的第一个项目。我正在解析维基百科的页面和页面链接转储以创建一个图形,其中节点是页面,边缘是链接。我已经定义了一些 rake 任务来下载转储、解析数据并将其保存在 Neo4j 数据库中。在 rake 任务结束时,我打印创建的页面和链接的数量,以及一些链接最多的页面。这是 zawiki 的 raks 任务的输出。
$ rake wiki[zawiki]
[ omitted ]
...
:: Done parsing zawiki
:: 1984 pages
:: 2144 links
:: The pages with the most links are:
9625.0 - Emijrp/List_of_Wikipedians_by_number_of_edits_(bots_included): 40
1363.0 - Gvangjsih_Bouxcuengh_Swcigih: 30
9112.0 - Fuzsuih: 27
1367.0 - Cungzcoj: 26
9279.0 - Vangz_Yenfanh: 19
看起来正在创建页面和链接,但是当我启动 rails 控制台或服务器时,找不到链接。
$ rails c
jruby-1.7.5 :013 > Pages.all.count
=> 1984
jruby-1.7.5 :003 > Pages.all.reduce(0) { |count, page| count + page.links.count}
=> 0
jruby-1.7.5 :012 > Pages.all.sort_by { |p| p.links.count }.reverse[0...5].map { |p| p.links.count }
=> [0, 0, 0, 0, 0]
这是 rake 任务,这是项目 github 页面。谁能告诉我为什么链接没有保存?
DUMP_DIR = Rails.root.join('lib','assets')
desc "Download wiki dumps and parse them"
task :wiki, [:wiki] => 'wiki:all'
namespace :wiki do
task :all, [:wiki] => [:get, :parse] do |t, args|
# Print info about the newly created pages and links.
link_count = 0
Pages.all.each do |page|
link_count += page.links.count
end
indent "Done parsing #{args[:wiki]}"
indent "#{Pages.count} pages"
indent "#{link_count} links"
indent "The pages with the most links are:"
Pages.all.sort_by { |a| a.links.count }.reverse[0...5].each do |page|
puts "#{page.page_id} - #{page.title}: #{page.links.count}"
end
end
desc "Download wiki page and page links database dumps to /lib/assets"
task :get, :wiki do |t, args|
indent "Downloading dumps"
sh "#{Rails.root.join('lib', "get_wiki").to_s} #{args[:wiki]}"
indent "Done"
end
desc "Parse all dumps"
task :parse, [:wiki] => 'parse:all'
namespace :parse do
task :all, [:wiki] => [:pages, :pagelinks]
desc "Read wiki page dumps from lib/assests into the database"
task :pages, [:wiki] => :environment do |t, args|
parse_dumps('page', args[:wiki]) do |obj|
page = Pages.create_from_dump(obj)
end
indent = "Created #{Pages.count} pages"
end
desc "Read wiki pagelink dumps from lib/assests into the database"
task :pagelinks, [:wiki] => :environment do |t, args|
errors = 0
parse_dumps('pagelinks', args[:wiki]) do |from_id, namespace, to_title|
from = Pages.find(:page_id => from_id)
to = Pages.find(:title => to_title)
if to.nil? || from.nil?
errors = errors.succ
else
from.links << to
from.save
end
end
end
end
end
def indent *args
print ":: "
puts args
end
def parse_dumps(dump, wiki_match, &block)
wiki_match ||= /\w+/
DUMP_DIR.entries.each do |file|
file, wiki = *(file.to_s.match(Regexp.new "(#{wiki_match})-#{dump}.sql"))
if file
indent "Parsing #{wiki} #{dump.pluralize} from #{file}"
each_value(DUMP_DIR.join(file), &block)
end
end
end
def each_value(filename)
f = File.open(filename)
num_read = 0
begin # read file until line starting with INSERT INTO
line = f.gets
end until line.match /^INSERT INTO/
begin
line = line.match(/\(.*\)[,;]/)[0] # ignore begining of line until (...) object
begin
yield line[1..-3].split(',').map { |e| e.match(/^['"].*['"]$/) ? e[1..-2] : e.to_f }
num_read = num_read.succ
line = f.gets.chomp
end while(line[0] == '(') # until next insert block, or end of file
end while line.match /^INSERT INTO/ # Until line doesn't start with (...
f.close
end
应用程序/模型/pages.rb
class Pages < Neo4j::Rails::Model
include Neo4j::NodeMixin
has_n(:links).to(Pages)
property :page_id
property :namespace, :type => Fixnum
property :title, :type => String
property :restrictions, :type => String
property :counter, :type => Fixnum
property :is_redirect, :type => Fixnum
property :is_new, :type => Fixnum
property :random, :type => Float
property :touched, :type => String
property :latest, :type => Fixnum
property :length, :type => Fixnum
property :no_title_convert, :type => Fixnum
def self.create_from_dump(obj)
# TODO: I wonder if there is a way to compine these calls
page = {}
# order of this array is important, it corresponds to the data in obj
attrs = [:page_id, :namespace, :title, :restrictions, :counter, :is_redirect,
:is_new, :random, :touched, :latest, :length, :no_title_convert]
attrs.each_index { |i| page[attrs[i]] = obj[i] }
page = Pages.create(page)
return page
end
end