http - Erlang 中的 HTTP 爬虫

Question

我正在编写一个简单的 HTTP 爬虫，但在运行底部的代码时遇到问题。我请求 50 个 URL 并获得 20+ 的内容。我已经生成了几个大小为 150kB 的文件来测试爬虫。所以我认为 20+ 响应受到带宽的限制？但是：如何告诉 Erlang 片段在没有获取最后一个文件之前不要退出？测试数据服务器在线，所以请尝试代码，欢迎任何提示:)

-module(crawler).
-define(BASE_URL, "http://46.4.117.69/").
-export([start/0, send_reqs/0, do_send_req/1]).

start() ->
  ibrowse:start(),
  proc_lib:spawn(?MODULE, send_reqs, []).

to_url(Id) ->
  ?BASE_URL ++ integer_to_list(Id).

fetch_ids() ->
  lists:seq(1, 50).

send_reqs() ->
  spawn_workers(fetch_ids()).

spawn_workers(Ids) ->
  lists:foreach(fun do_spawn/1, Ids).

do_spawn(Id) ->
  proc_lib:spawn_link(?MODULE, do_send_req, [Id]).

do_send_req(Id) ->
  io:format("Requesting ID ~p ... ~n", [Id]),
  Result = (catch ibrowse:send_req(to_url(Id), [], get, [], [], 10000)),
  case Result of
{ok, Status, _H, B} ->
  io:format("OK -- ID: ~2..0w -- Status: ~p -- Content length: ~p~n", [Id, Status, length(B)]);
Err ->
  io:format("ERROR -- ID: ~p -- Error: ~p~n", [Id, Err])
  end.

这就是输出：

Requesting ID 1 ... 
Requesting ID 2 ... 
Requesting ID 3 ... 
Requesting ID 4 ... 
Requesting ID 5 ... 
Requesting ID 6 ... 
Requesting ID 7 ... 
Requesting ID 8 ... 
Requesting ID 9 ... 
Requesting ID 10 ... 
Requesting ID 11 ... 
Requesting ID 12 ... 
Requesting ID 13 ... 
Requesting ID 14 ... 
Requesting ID 15 ... 
Requesting ID 16 ... 
Requesting ID 17 ... 
Requesting ID 18 ... 
Requesting ID 19 ... 
Requesting ID 20 ... 
Requesting ID 21 ... 
Requesting ID 22 ... 
Requesting ID 23 ... 
Requesting ID 24 ... 
Requesting ID 25 ... 
Requesting ID 26 ... 
Requesting ID 27 ... 
Requesting ID 28 ... 
Requesting ID 29 ... 
Requesting ID 30 ... 
Requesting ID 31 ... 
Requesting ID 32 ... 
Requesting ID 33 ... 
Requesting ID 34 ... 
Requesting ID 35 ... 
Requesting ID 36 ... 
Requesting ID 37 ... 
Requesting ID 38 ... 
Requesting ID 39 ... 
Requesting ID 40 ... 
Requesting ID 41 ... 
Requesting ID 42 ... 
Requesting ID 43 ... 
Requesting ID 44 ... 
Requesting ID 45 ... 
Requesting ID 46 ... 
Requesting ID 47 ... 
Requesting ID 48 ... 
Requesting ID 49 ... 
Requesting ID 50 ... 
OK -- ID: 49 -- Status: "200" -- Content length: 150000
OK -- ID: 47 -- Status: "200" -- Content length: 150000
OK -- ID: 50 -- Status: "200" -- Content length: 150000
OK -- ID: 17 -- Status: "200" -- Content length: 150000
OK -- ID: 48 -- Status: "200" -- Content length: 150000
OK -- ID: 45 -- Status: "200" -- Content length: 150000
OK -- ID: 46 -- Status: "200" -- Content length: 150000
OK -- ID: 10 -- Status: "200" -- Content length: 150000
OK -- ID: 09 -- Status: "200" -- Content length: 150000
OK -- ID: 19 -- Status: "200" -- Content length: 150000
OK -- ID: 13 -- Status: "200" -- Content length: 150000
OK -- ID: 21 -- Status: "200" -- Content length: 150000
OK -- ID: 16 -- Status: "200" -- Content length: 150000
OK -- ID: 27 -- Status: "200" -- Content length: 150000
OK -- ID: 03 -- Status: "200" -- Content length: 150000
OK -- ID: 23 -- Status: "200" -- Content length: 150000
OK -- ID: 29 -- Status: "200" -- Content length: 150000
OK -- ID: 14 -- Status: "200" -- Content length: 150000
OK -- ID: 18 -- Status: "200" -- Content length: 150000
OK -- ID: 01 -- Status: "200" -- Content length: 150000
OK -- ID: 30 -- Status: "200" -- Content length: 150000
OK -- ID: 40 -- Status: "200" -- Content length: 150000
OK -- ID: 05 -- Status: "200" -- Content length: 150000

更新：

感谢stemm 对wait_workers 的提示。我已经结合了你和我的代码，但行为相同:(

-module(crawler).
-define(BASE_URL, "http://46.4.117.69/").
-export([start/0, send_reqs/0, do_send_req/2]).

start() ->
  ibrowse:start(),
  proc_lib:spawn(?MODULE, send_reqs, []).

to_url(Id) ->
  ?BASE_URL ++ integer_to_list(Id).

fetch_ids() ->
  lists:seq(1, 50).

send_reqs() ->
  spawn_workers(fetch_ids()).

spawn_workers(Ids) ->
  %% collect reference to each worker
  Refs = [ do_spawn(Id) || Id <- Ids ],
  %% wait for response from each worker
  wait_workers(Refs).

wait_workers(Refs) ->
  lists:foreach(fun receive_by_ref/1, Refs).

receive_by_ref(Ref) ->
  %% receive message only from worker with specific reference
  receive
{Ref, done} ->
    done
  end.

do_spawn(Id) ->
  Ref = make_ref(),
  proc_lib:spawn_link(?MODULE, do_send_req, [Id, {self(), Ref}]),
  Ref.

do_send_req(Id, {Pid, Ref}) ->
  io:format("Requesting ID ~p ... ~n", [Id]),
  Result = (catch ibrowse:send_req(to_url(Id), [], get, [], [], 10000)),
  case Result of
  {ok, Status, _H, B} ->
    io:format("OK -- ID: ~2..0w -- Status: ~p -- Content length: ~p~n", [Id, Status, length(B)]),
    %% send message that work is done
    Pid ! {Ref, done};
  Err ->
    io:format("ERROR -- ID: ~p -- Error: ~p~n", [Id, Err]),
    %% repeat request if there was error while fetching a page, 
    do_send_req(Id, {Pid, Ref})
    %% or - if you don't want to repeat request, put there:
    %% Pid ! {Ref, done}
end.

对少数文件运行爬虫分叉很好，但是代码甚至没有获取整个文件（每个文件大小为 150000 字节） - 他的爬虫部分获取了一些文件，请参阅以下 Web 服务器日志:(

82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /10 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /1 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /3 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /8 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /39 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /7 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /6 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /2 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /5 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /50 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /9 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /44 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /38 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /47 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /49 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /43 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /37 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /46 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /48 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /36 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /42 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /41 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /45 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /17 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /35 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /16 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /15 HTTP/1.1" 200 17020 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /21 HTTP/1.1" 200 120360 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /40 HTTP/1.1" 200 117600 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /34 HTTP/1.1" 200 60660 "-" "-"

欢迎任何提示。我不知道那里出了什么问题:(

score 1 · Accepted Answer

所以，如果我对你的理解正确——你不想在spawn_workers每个工作人员没有停止（并获取页面）之前从函数返回控制权？如果是这样 - 您可以通过以下方式更改您的代码：

spawn_workers(Ids) ->
  %% collect reference to each worker
  Refs = [ do_spawn(Id) || Id <- Ids ],
  %% wait for response from each worker
  wait_workers(Refs).

wait_workers(Refs) ->
  lists:foreach(fun receive_by_ref/1, Refs).

receive_by_ref(Ref) ->
  %% receive message only from worker with specific reference
  receive
    {Ref, done} ->
        done
  end.

do_spawn(Id) ->
  Ref = make_ref(),
  proc_lib:spawn_link(?MODULE, do_send_req, [Id, {self(), Ref}]),
  Ref.

do_send_req(Id, {Pid, Ref}) ->
  io:format("Requesting ID ~p ... ~n", [Id]),
  Result = (catch ibrowse:send_req(to_url(Id), [], get, [], [], 10000)),
  case Result of
      {ok, Status, _H, B} ->
        io:format("OK -- ID: ~2..0w -- Status: ~p -- Content length: ~p~n", [Id, Status, length(B)]),
        %% send message that work is done
        Pid ! {Ref, done};
      Err ->
        io:format("ERROR -- ID: ~p -- Error: ~p~n", [Id, Err]),
        %% repeat request if there was error while fetching a page, 
        do_send_req(Id, {Pid, Ref})
        %% or - if you don't want to repeat request, put there:
        %% Pid ! {Ref, done}
  end.

编辑：

我注意到您的入口点（函数start）返回控制权而不等待所有工作人员结束他们的任务（因为您在那里调用spawn）。如果您也想在那里等待 - 只需执行类似的技巧：

start() ->
  ibrowse:start(),
  Ref = make_ref(),
  proc_lib:spawn(?MODULE, send_reqs, [self(), Ref]),
  receive_by_ref(Ref).

send_reqs(Pid, Ref) ->
  spawn_workers(fetch_ids()),
  Pid ! {Ref, done}.

score 1 · Accepted Answer

您可以使用监督者和队列模块的组合：生成 N 个获取子项，每个子项获取队列的 1 项并处理它。完成后通知父进程继续队列中的下一个项目。这样你就可以限制并发请求的数量。
如果您当时产生 500 个请求，ibrowse 可能会感到困惑。您是否在控制台中收到任何错误？
见ibrowse:get_config_value/1和ibrowse:set_config_value/2

http - Erlang 中的 HTTP 爬虫

更新：

2 回答 2

Related

Reference