1

MongoDB新手问题:

我有很多 HTTP 日志存储到具有以下数据结构的集合中:

{
    'client': {
        'ip_address': '1.2.3.4',
        'referrer':"http://....",
        'user_agent':'Mozilla...'
    },
    'request':{
        "stream": "stream1",
        "method": "GET",
        "fragment_id": 97,
        "date": 13482181
    },
    'response':{
        'status':200,
        'size': 654
    }
}

每个文档都描述了一个 HTTP 请求(从客户端到内容流送器)。由于每个流都被分割成更小的部分,我想在我的集合上使用“mapReduce”,然后创建一个“通用流请求”文档,如下所示:

{
    'client_ip': '1.2.3.4',
    'user_agent': 'Mozilla',
    'streams':[
        {
        'stream':"stream1",
        'referrer':'http://...',
        'requests':[
          {
             'fragment_id':97,
             'status':200,
             'date': 13482181,
             'size': 654
             ...
          },
          {
             'fragment_id':98,
             'status':200,
             'date': 13482192,
             'size': 624
             ...
          }, [...]
         ]
        }, [...]
    ]
}

这是我尝试过的:

// Map step of the asker's attempt: emits one key/value pair for the log
// document bound to `this`.
//   key   - { client_ip, user_agent }, so entries are grouped per client.
//   value - a flat record holding stream, referrer, status, date, size and
//           fragment_id for this single request.
// NOTE(review): the sample document shown earlier uses client.ip_address,
// client.referrer, response.size and request.fragment_id, while this function
// reads client.ip, client.referer, response.total_size and
// request.fragment_infos[1] - confirm the accessors against the real schema.
map = function(){
    emit({client_ip:this.client.ip,user_agent:this.client.user_agent},{
                stream:this.request.stream,
                referrer:this.client.referer,
                status:this.response.status,
                date:this.request.date,
                size:this.response.total_size,
                fragment_id:this.request.fragment_infos[1]
    });
}

// Reduce step of the asker's attempt: counts the values received for a key
// and collects them, unchanged, into the `request` array of the result.
//   key    - the key emitted by map (not used in the body).
//   values - array of values to combine for that key.
// Returns { count, request }. This is a different shape from the flat record
// that map emits. Per the MongoDB mapReduce documentation, reduce may be
// invoked again with its own earlier results inside `values`; those results
// are then pushed as elements of `request`, which is consistent with the
// nested output shown in the question.
reduce = function(key,values){
    // `r` is assigned without var, so it is not a local variable.
    r = {'count':0,'request':[]};
    values.forEach(function(v){
        // Adds 1 per element, even if the element is itself an earlier
        // reduce result that already carries its own count.
        r.count += 1;
        r.request.push(v);
    });

    return r;
}

但这是我得到的结果:

"_id" : {
    "client_ip" : "1.2.3.4",
    "user_agent" : "Mozilla\/4.0"
 },
 "value" : {
    "client_ip" : "1.2.3.4",
    "user_agent" : "Mozilla\/4.0",
    "count" : 17,
    "request" : {
        "0" : {
            "client_ip" : "1.2.3.4",
            "user_agent" : "Mozilla\/4.0",
            "count" : 2,
            "request" : {
                "0" : {
                    "stream" : "stream1.isml",
                    "referrer" : null,
                    "status" : 200,
                    "date" : 1341706566,
                    "size" : 456,
                    "fragment_id" : null,
                    "count" : 1
                },
                "1" : {
                    "stream" : "stream1.isml",
                    "referrer" : null,
                    "status" : 200,
                    "date" : 1341706566,
                    "size" : null,
                    "fragment_id" : null,
                    "count" : 1
                }
            }
        },
        "1" : {
            "client_ip" : "1.2.3.4",
            "user_agent" : "Mozilla\/4.0",
            "count" : 3,
            "request" : {
                "0" : {
                    "client_ip" : "1.2.3.4",
                    "user_agent" : "Mozilla\/4.0",
                    "count" : 2,
                    "request" : {
                        "0" : {
                            "stream" : "stream1.isml",
                            "referrer" : null,
                            "status" : 200,
                            "date" : 1341706568,
                            "size" : null,
                            "fragment_id" : null,
                            "count" : 1
.........

我哪里错了?

4

1 回答 1

1

你总是会得到一条包含 _id 和 value 的记录,这是 MongoDB map/reduce 的固有特性。有一个尚未解决的工单请求改变这种行为:https://jira.mongodb.org/browse/SERVER-2517

至于如何让 value 与您的示例一致:map 函数 emit 的值必须与 reduce 函数返回的值具有相同的结构,因为 reduce 可能会针对同一个键被多次调用,并把之前的 reduce 结果作为输入。

/**
 * Map step: emits one partial "generic stream request" document per HTTP
 * log entry.
 *
 * The emitted value has the shape { client_ip, user_agent, streams }, where
 * `streams` is an object keyed by stream name and each entry holds the
 * referrer plus a one-element `requests` array. Emitting the same shape that
 * reduce is expected to return lets reduce merge values in the same way
 * whether they come from map or from an earlier reduce call.
 *
 * `this` is the log document being mapped; `emit` is supplied by MongoDB.
 *
 * NOTE(review): the field accessors (client.ip, client.referer,
 * response.total_size, request.fragment_infos[1]) are copied from the
 * asker's map function - confirm them against the real document schema.
 */
var map = function () {
  // An object literal cannot use `this.request.stream` as a property name,
  // so the per-stream entry is assigned with bracket notation instead.
  var streams = {};
  streams[this.request.stream] = {
    referrer: this.client.referer,
    requests: [
      {
        fragment_id: this.request.fragment_infos[1],
        status: this.response.status,
        date: this.request.date,
        size: this.response.total_size
      }
    ]
  };

  emit(
    { client_ip: this.client.ip, user_agent: this.client.user_agent },
    {
      client_ip: this.client.ip,
      user_agent: this.client.user_agent,
      streams: streams
    }
  );
};

您还需要修改 reduce 函数,使其能够合并多个这种形式的文档。如有必要,再编写一个 finalize 函数,把以流名称为键的散列(对象)转换为流数组,并把流名称放到每个数组元素内部。

于 2012-07-10T05:39:31.767 回答