I'm writing a simple HTTP crawler, but I'm running into a problem with the code at the bottom. I request 50 URLs but only get back 20+ of the responses. I generated a few files of 150 kB each to test the crawler, so I assumed the 20+ responses are limited by bandwidth? But: how do I tell the Erlang snippet not to exit until the last file has been fetched? The test data server is online, so please try the code; any hints are welcome :)
-module(crawler).

-define(BASE_URL, "http://46.4.117.69/").

-export([start/0, send_reqs/0, do_send_req/1]).

start() ->
  ibrowse:start(),
  proc_lib:spawn(?MODULE, send_reqs, []).

to_url(Id) ->
  ?BASE_URL ++ integer_to_list(Id).

fetch_ids() ->
  lists:seq(1, 50).

send_reqs() ->
  spawn_workers(fetch_ids()).

spawn_workers(Ids) ->
  lists:foreach(fun do_spawn/1, Ids).

do_spawn(Id) ->
  proc_lib:spawn_link(?MODULE, do_send_req, [Id]).

do_send_req(Id) ->
  io:format("Requesting ID ~p ... ~n", [Id]),
  Result = (catch ibrowse:send_req(to_url(Id), [], get, [], [], 10000)),
  case Result of
    {ok, Status, _H, B} ->
      io:format("OK -- ID: ~2..0w -- Status: ~p -- Content length: ~p~n", [Id, Status, length(B)]);
    Err ->
      io:format("ERROR -- ID: ~p -- Error: ~p~n", [Id, Err])
  end.
This is the output:
Requesting ID 1 ...
Requesting ID 2 ...
Requesting ID 3 ...
Requesting ID 4 ...
Requesting ID 5 ...
Requesting ID 6 ...
Requesting ID 7 ...
Requesting ID 8 ...
Requesting ID 9 ...
Requesting ID 10 ...
Requesting ID 11 ...
Requesting ID 12 ...
Requesting ID 13 ...
Requesting ID 14 ...
Requesting ID 15 ...
Requesting ID 16 ...
Requesting ID 17 ...
Requesting ID 18 ...
Requesting ID 19 ...
Requesting ID 20 ...
Requesting ID 21 ...
Requesting ID 22 ...
Requesting ID 23 ...
Requesting ID 24 ...
Requesting ID 25 ...
Requesting ID 26 ...
Requesting ID 27 ...
Requesting ID 28 ...
Requesting ID 29 ...
Requesting ID 30 ...
Requesting ID 31 ...
Requesting ID 32 ...
Requesting ID 33 ...
Requesting ID 34 ...
Requesting ID 35 ...
Requesting ID 36 ...
Requesting ID 37 ...
Requesting ID 38 ...
Requesting ID 39 ...
Requesting ID 40 ...
Requesting ID 41 ...
Requesting ID 42 ...
Requesting ID 43 ...
Requesting ID 44 ...
Requesting ID 45 ...
Requesting ID 46 ...
Requesting ID 47 ...
Requesting ID 48 ...
Requesting ID 49 ...
Requesting ID 50 ...
OK -- ID: 49 -- Status: "200" -- Content length: 150000
OK -- ID: 47 -- Status: "200" -- Content length: 150000
OK -- ID: 50 -- Status: "200" -- Content length: 150000
OK -- ID: 17 -- Status: "200" -- Content length: 150000
OK -- ID: 48 -- Status: "200" -- Content length: 150000
OK -- ID: 45 -- Status: "200" -- Content length: 150000
OK -- ID: 46 -- Status: "200" -- Content length: 150000
OK -- ID: 10 -- Status: "200" -- Content length: 150000
OK -- ID: 09 -- Status: "200" -- Content length: 150000
OK -- ID: 19 -- Status: "200" -- Content length: 150000
OK -- ID: 13 -- Status: "200" -- Content length: 150000
OK -- ID: 21 -- Status: "200" -- Content length: 150000
OK -- ID: 16 -- Status: "200" -- Content length: 150000
OK -- ID: 27 -- Status: "200" -- Content length: 150000
OK -- ID: 03 -- Status: "200" -- Content length: 150000
OK -- ID: 23 -- Status: "200" -- Content length: 150000
OK -- ID: 29 -- Status: "200" -- Content length: 150000
OK -- ID: 14 -- Status: "200" -- Content length: 150000
OK -- ID: 18 -- Status: "200" -- Content length: 150000
OK -- ID: 01 -- Status: "200" -- Content length: 150000
OK -- ID: 30 -- Status: "200" -- Content length: 150000
OK -- ID: 40 -- Status: "200" -- Content length: 150000
OK -- ID: 05 -- Status: "200" -- Content length: 150000
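One idea I have (untested against the server above) for making the snippet block until the last worker is done: monitor each spawned worker and collect one 'DOWN' message per monitor before returning. spawn_workers_and_wait/1 below is a rough sketch that would replace spawn_workers/1 in the module above; it only relies on erlang:spawn_monitor/3 and the standard {'DOWN', Ref, process, Pid, Reason} message, so it also unblocks if a worker crashes:

spawn_workers_and_wait(Ids) ->
  %% spawn_monitor/3 returns {Pid, MonitorRef}; keep only the monitor refs
  Refs = [element(2, spawn_monitor(?MODULE, do_send_req, [Id])) || Id <- Ids],
  %% wait for exactly one 'DOWN' message per worker (normal exit or crash)
  [receive {'DOWN', Ref, process, _Pid, _Reason} -> ok end || Ref <- Refs],
  ok.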
Update:
Thanks to stemm for the hint about wait_workers. I've combined your code with mine, but the behaviour is the same :(
-module(crawler).

-define(BASE_URL, "http://46.4.117.69/").

-export([start/0, send_reqs/0, do_send_req/2]).

start() ->
  ibrowse:start(),
  proc_lib:spawn(?MODULE, send_reqs, []).

to_url(Id) ->
  ?BASE_URL ++ integer_to_list(Id).

fetch_ids() ->
  lists:seq(1, 50).

send_reqs() ->
  spawn_workers(fetch_ids()).

spawn_workers(Ids) ->
  %% collect reference to each worker
  Refs = [ do_spawn(Id) || Id <- Ids ],
  %% wait for response from each worker
  wait_workers(Refs).

wait_workers(Refs) ->
  lists:foreach(fun receive_by_ref/1, Refs).

receive_by_ref(Ref) ->
  %% receive message only from worker with specific reference
  receive
    {Ref, done} ->
      done
  end.

do_spawn(Id) ->
  Ref = make_ref(),
  proc_lib:spawn_link(?MODULE, do_send_req, [Id, {self(), Ref}]),
  Ref.

do_send_req(Id, {Pid, Ref}) ->
  io:format("Requesting ID ~p ... ~n", [Id]),
  Result = (catch ibrowse:send_req(to_url(Id), [], get, [], [], 10000)),
  case Result of
    {ok, Status, _H, B} ->
      io:format("OK -- ID: ~2..0w -- Status: ~p -- Content length: ~p~n", [Id, Status, length(B)]),
      %% send message that work is done
      Pid ! {Ref, done};
    Err ->
      io:format("ERROR -- ID: ~p -- Error: ~p~n", [Id, Err]),
      %% repeat request if there was error while fetching a page,
      do_send_req(Id, {Pid, Ref})
      %% or - if you don't want to repeat request, put there:
      %% Pid ! {Ref, done}
  end.
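As a debugging aid (not part of the code above), I'm also considering giving receive_by_ref/1 a timeout, so that a worker that never reports back shows up as {error, timeout} instead of the collector blocking forever; 30000 ms is just an arbitrary value I picked:

receive_by_ref(Ref) ->
  %% same as above, but stop waiting after 30 seconds
  receive
    {Ref, done} ->
      done
  after 30000 ->
    {error, timeout}
  end.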
Running the crawler against a handful of files works fine, but the code doesn't even fetch the files completely (each file is 150000 bytes); the crawler only partially fetches some of them, see the following web server log :(
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /10 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /1 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /3 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /8 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /39 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /7 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /6 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /2 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /5 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /50 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /9 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /44 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /38 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /47 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /49 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /43 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /37 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /46 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /48 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /36 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /42 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /41 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /45 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /17 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /35 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /16 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /15 HTTP/1.1" 200 17020 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /21 HTTP/1.1" 200 120360 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /40 HTTP/1.1" 200 117600 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /34 HTTP/1.1" 200 60660 "-" "-"
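To cross-check on the client side how many bytes actually arrive, I'm thinking of dumping each body to disk right after the OK branch of do_send_req/2 and comparing the file sizes with the web server log; dump_body/2 and the "body_<Id>" file name are just made up for this check:

dump_body(Id, Body) ->
  %% ibrowse returns the body as a list of bytes by default; write_file accepts iodata
  ok = file:write_file("body_" ++ integer_to_list(Id), Body),
  io:format("ID ~p -- wrote ~p bytes~n", [Id, iolist_size(Body)]).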
Any hints welcome. I have no idea what's going wrong there :(