0

I have multiple time series in a MongoDB database, with fields "ticker", "time", and "close", amongst others:

> db.bbticks.find().limit(2)
{ "_id" : ObjectId("522b2cf7d4236309a57c8f96"), "close" : 1.9432, "high" : 1.9433, "low" : 1.9426, "open" : 1.9427, "source" : "HIST", "systime" : ISODate("2013-09-07T13:41:13.383Z"), "ticker" : "USDTRY Curncy", "time" : ISODate("2013-08-01T15:14:00Z"), "type" : "BAR", "value" : 1.9432 }
{ "_id" : ObjectId("522b2cf7d4236309a57c8f97"), "close" : 1.9425, "high" : 1.9433, "low" : 1.9425, "open" : 1.9432, "source" : "HIST", "systime" : ISODate("2013-09-07T13:41:13.383Z"), "ticker" : "USDTRY Curncy", "time" : ISODate("2013-08-01T15:15:00Z"), "type" : "BAR", "value" : 1.9425 }

The timestamps are whole minutes. There are multiple time zones represented amongst the tickers, so, for example, the MEXBOL Mexican stock market is open only from 13h30 UTC, whereas the FTSEMIB Italian stock market is open from 07h00 UTC. I want to bring down all the time series, but only for the timestamps that they all have. Here is an example:

> db.bbticks.find({ticker: "FTSEMIB Index", type: "BAR", time: {$gte: ISODate("2013-08-01")}}, {_id: 0, ticker: 1, time: 1, close: 1}).sort({time: 1}).limit(5)
{ "close" : 16565.04, "ticker" : "FTSEMIB Index", "time" : ISODate("2013-08-01T07:00:00Z") }
{ "close" : 16585.56, "ticker" : "FTSEMIB Index", "time" : ISODate("2013-08-01T07:01:00Z") }
{ "close" : 16583.29, "ticker" : "FTSEMIB Index", "time" : ISODate("2013-08-01T07:02:00Z") }
{ "close" : 16578.95, "ticker" : "FTSEMIB Index", "time" : ISODate("2013-08-01T07:03:00Z") }
{ "close" : 16587.16, "ticker" : "FTSEMIB Index", "time" : ISODate("2013-08-01T07:04:00Z") }
> db.bbticks.find({ticker: "MEXBOL Index", type: "BAR", time: {$gte: ISODate("2013-08-01")}}, {_id: 0, ticker: 1, time: 1, close: 1}).sort({time: 1}).limit(5)
{ "close" : 41101.39, "ticker" : "MEXBOL Index", "time" : ISODate("2013-08-01T13:30:00Z") }
{ "close" : 41099.25, "ticker" : "MEXBOL Index", "time" : ISODate("2013-08-01T13:31:00Z") }
{ "close" : 41126.17, "ticker" : "MEXBOL Index", "time" : ISODate("2013-08-01T13:32:00Z") }
{ "close" : 41137.03, "ticker" : "MEXBOL Index", "time" : ISODate("2013-08-01T13:33:00Z") }
{ "close" : 41173.89, "ticker" : "MEXBOL Index", "time" : ISODate("2013-08-01T13:34:00Z") }

As you can see, for ticks on or after 1 August 2013, FTSEMIB starts at 07h00 and MEXBOL starts at 13h30. Data does exist for FTSEMIB after 13h30 too:

> db.bbticks.find({ticker: "FTSEMIB Index", type: "BAR", time: {$gte: ISODate("2013-08-01T13:30:00")}}, {_id: 0, ticker: 1, time: 1, close: 1}).sort({time: 1}).limit(5)
{ "close" : 16739.41, "ticker" : "FTSEMIB Index", "time" : ISODate("2013-08-01T13:30:00Z") }
{ "close" : 16748.21, "ticker" : "FTSEMIB Index", "time" : ISODate("2013-08-01T13:31:00Z") }
{ "close" : 16750.76, "ticker" : "FTSEMIB Index", "time" : ISODate("2013-08-01T13:32:00Z") }
{ "close" : 16747.89, "ticker" : "FTSEMIB Index", "time" : ISODate("2013-08-01T13:33:00Z") }
{ "close" : 16746.66, "ticker" : "FTSEMIB Index", "time" : ISODate("2013-08-01T13:34:00Z") }

So basically, wherever there is a "time" value that exists for both tickers, I want only those closes returned. There may be multiple time series in the query (not just two), and there may be missing values within otherwise contiguous blocks of a series (so, for example, at 14h31 on 1 August one series might not have a value for that time, in which case no series must be returned for that time).

Basically, I want to compare time series, I need the series returned only for timestamps that they all have.

Finally, ideally I would prefer to use the aggregation pipeline framework, rather than Map Reduce, if possible.

4

1 回答 1

1

See if the following is in line with what you want to accomplish:

// Tickers to compare; a timestamp is kept only when every one of them has a bar for it.
var tickers = [ "FTSEMIB Index", "MEXBOL Index" ];

db.bbticks.aggregate(
[
 // Restrict to BAR documents of the requested tickers on or after 1 August 2013.
 { $match: { ticker: { $in: tickers }, type: "BAR", time: { $gte: ISODate("2013-08-01") } } },
 // One group per timestamp, counting the documents and collecting each ticker's close.
 { $group: { _id: "$time", count: {$sum: 1}, tickers: { $push: { "ticker": "$ticker" , "close": "$close" } } } } ,
 // Keep only timestamps present in every series, not merely in more than one.
 // NOTE(review): assumes at most one BAR document per ticker per minute -- confirm.
 { $match: { count: tickers.length } }
]
)

-- break --

For the map-reduce, you could try the following (not very elegant, I think there are better ways but just some ideas to get you started). Also, as this will be a time series that grows, chances are you might want to use an incremental map-reduce (http://docs.mongodb.org/manual/tutorial/perform-incremental-map-reduce/). But the below can give you some ideas (like I said, it is ugly --- and it might be better to perform a second map-reduce operation rather than my last find statement, but up to you).

// Map step: for the current document, emit its time as the key and a
// one-element ticker list ({ ticker, close }) as the value.
var mapFunction = function() {
                      // The ticker/close pair taken from the document being mapped.
                      var entry = { ticker: this.ticker, close: this.close };

                      emit( this.time, { tickers: [ entry ] } );
                  };

// Reduce step: merge all ticker lists collected for one key into a single list.
//   keyObject   - the key the values were emitted under (not used here)
//   valuesArray - array of values shaped { tickers: [ { ticker, close }, ... ] }
// Returns { tickers: [...] } containing every entry of every input value, in order.
var reduceFunction = function(keyObject, valuesArray) {
                     var reducedValue = { tickers: [] };

                     for (var idx = 0; idx < valuesArray.length; idx++) {
                        // MongoDB may pass the output of an earlier reduce call back in as
                        // one of the values, so a value can already hold several tickers;
                        // take the whole list rather than only its first element.
                        reducedValue.tickers = reducedValue.tickers.concat( valuesArray[idx].tickers );
                     }

                     return reducedValue;
                  };


// Run the map-reduce over documents dated on or after 1 August 2013, reading them
// sorted by time, and write the results to the "mr_interim_results" collection.
db.bbticks.mapReduce(
    mapFunction,
    reduceFunction,
    {
        query: { time: { $gte: ISODate("2013-08-01") } },
        sort: { time: 1 },
        out: "mr_interim_results"
    }
);

// Return the interim results whose value.tickers array does not have exactly one element.
// NOTE(review): with more than two tickers this also admits timestamps that only some
// series share; matching $size against the ticker count would be stricter -- confirm intent.
db.mr_interim_results.find( { 'value.tickers': { $not: { $size: 1 } } }  )
于 2013-09-08T20:47:07.780 回答