4

这台机器是我的 mongodb 集群的一个分片。集群有 3 个 shard,shard1 在 machine1,shard2 在 machine2,这两台都是 8G 内存和 800G 磁盘。configdb、mongos 和 shard3 都在 machine3 上,这台机器的配置不同,是 16G 内存和 400G 磁盘。

现在的问题是:

machine3 上的 mongostat 输出是正常的,但是在 machine1 和 machine2 上,page faults 和 locked db 总是很高。

下面只列出 machine1 的一些状态。这是 top 命令的结果:

[]$top
Cpu(s):  0.2%us,  0.2%sy,  0.0%ni, 99.2%id,  0.3%wa,  0.0%hi,  0.0%si,  0.2%st
Mem:   7633792k total,  7302168k used,   331624k free,    84456k buffers
Swap:        0k total,        0k used,        0k free,  6209852k cached

 PID    PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
 7562   20   0  200g 1.0g 702m S  0.3 14.3   4:36.50 mongod

这是mongostat:

insert  query update delete getmore command flushes mapped  vsize    res non-mapped faults     locked db idx miss %     qr|qw   ar|aw  netIn netOut  conn       time 
 0      2     12      0       0      13       0   100g   201g   911m       101g     11  amazon:38.9%          0       3|0     1|0     4k     4k    14   01:45:35 
 0      0      3      0       0       7       1   100g   201g   912m       101g     28   amazon:1.2%          0       0|0     0|0     1k     3k    14   01:45:36 
 0      2     14      0       0      15       0   100g   201g   912m       101g     93   amazon:4.6%          0       0|0     0|0     7k     4k    14   01:45:37 
 0      0      0      0       0       1       0   100g   201g   911m       101g    141   amazon:0.2%          0       0|0     0|0    62b     2k    14   01:45:38 
 0      6     24      0       0      25       0   100g   201g   913m       101g    123   amazon:6.4%          0       0|0     0|0     8k     6k    14   01:45:39 
 0      1      9      0       0      10       0   100g   201g   912m       101g     33   amazon:4.2%          0       0|0     0|0     3k     3k    14   01:45:40 
 0     12     59      0       0      58       0   100g   201g   914m       101g    108  amazon:30.0%          0       1|0     0|1    24k    12k    14   01:45:41 
 0     20     93      0       0      96       0   100g   201g   911m       101g    114  amazon:36.1%          0       0|0     0|0    33k    17k    14   01:45:42 
 0     19     84      0       0      86       0   100g   201g   913m       101g    103  amazon:43.9%          0       0|0     1|0    28k    16k    14   01:45:43 
 0      9     29      0       0      26       0   100g   201g   914m       101g     37   amazon:5.5%          0       5|0     0|1    11k     6k    14   01:45:44 

这是服务器状态:

> db.serverStatus()
{
    "host" : "XX-XX-XX-XX:25018",
    "version" : "2.2.3",
    "process" : "mongod",
    "pid" : 7562,
    "uptime" : 1410,
    "uptimeMillis" : NumberLong(1410211),
    "uptimeEstimate" : 1390,
    "localTime" : ISODate("2013-03-22T01:49:01.459Z"),
    "locks" : {
        "." : {
            "timeLockedMicros" : {
                "R" : NumberLong(563437),
                "W" : NumberLong(22798453)
            },
            "timeAcquiringMicros" : {
                "R" : NumberLong(303677814),
                "W" : NumberLong(59991149)
            }
        },
        "admin" : {
            "timeLockedMicros" : {
            },
            "timeAcquiringMicros" : {   
            }
        },
        "local" : {
            "timeLockedMicros" : {
                "r" : NumberLong(6613),
                "w" : NumberLong(0)
            },
            "timeAcquiringMicros" : {
                "r" : NumberLong(1937433),
                "w" : NumberLong(0)
            }
        },
        "amazon" : {
            "timeLockedMicros" : {
                "r" : NumberLong(203845605),
                "w" : NumberLong(651848025)
            },
            "timeAcquiringMicros" : {
                "r" : NumberLong(621538184),
                "w" : NumberLong(1525509360)
            }
        },
        "test" : {
            "timeLockedMicros" : {
                "r" : NumberLong(5143),
                "w" : NumberLong(999532)
            },
            "timeAcquiringMicros" : {
                "r" : NumberLong(157712),
                "w" : NumberLong(60)
            }
        }
    },
    "globalLock" : {
        "totalTime" : NumberLong(1410211000),
        "lockTime" : NumberLong(22798453),
        "currentQueue" : {
            "total" : 0,
            "readers" : 0,
            "writers" : 0
        },
        "activeClients" : {
            "total" : 0,
            "readers" : 0,
            "writers" : 0
        }
    },
    "mem" : {
        "bits" : 64,
        "resident" : 945,
        "virtual" : 205577,
        "supported" : true,
        "mapped" : 102383,
        "mappedWithJournal" : 204766
    },
    "connections" : {
        "current" : 14,
        "available" : 805
    },
    "extra_info" : {
        "note" : "fields vary by platform",
        "heap_usage_bytes" : 190782680,
        "page_faults" : 68002
    },
    "indexCounters" : {
        "btree" : {
            "accesses" : 274412,
            "hits" : 274412,
            "misses" : 0,
            "resets" : 0,
            "missRatio" : 0
        }
    },
    "backgroundFlushing" : {
        "flushes" : 23,
        "total_ms" : 89781,
        "average_ms" : 3903.521739130435,
        "last_ms" : 929,
        "last_finished" : ISODate("2013-03-22T01:48:32.243Z")
    },
    "cursors" : {
        "totalOpen" : 0,
        "clientCursors_size" : 0,
        "timedOut" : 0
    },
    "network" : {
        "bytesIn" : 11325630,
        "bytesOut" : 181775584,
        "numRequests" : 67850
    },
    "opcounters" : {
        "insert" : 157,
        "query" : 6898,
        "update" : 29954,
        "delete" : 0,
        "getmore" : 0,
        "command" : 30902
    },
    "asserts" : {
        "regular" : 0,
        "warning" : 0,
        "msg" : 0,
        "user" : 1,
        "rollovers" : 0
    },
    "writeBacksQueued" : false,
    "dur" : {
        "commits" : 27,
        "journaledMB" : 0.36864,
        "writeToDataFilesMB" : 1.241313,
        "compression" : 0.2963027264769924,
        "commitsInWriteLock" : 0,
        "earlyCommits" : 0,
        "timeMs" : {
            "dt" : 3269,
            "prepLogBuffer" : 0,
            "writeToJournal" : 442,
            "writeToDataFiles" : 4,
            "remapPrivateView" : 23
        }
    },
    "recordStats" : {
        "accessesNotInMemory" : 32752,
        "pageFaultExceptionsThrown" : 1656,
        "amazon" : {
            "accessesNotInMemory" : 32752,
            "pageFaultExceptionsThrown" : 1656
        },
        "local" : {
            "accessesNotInMemory" : 0,
            "pageFaultExceptionsThrown" : 0
        },
        "test" : {
            "accessesNotInMemory" : 0,
            "pageFaultExceptionsThrown" : 0
        }
    },
    "ok" : 1
}

有人能给我一些建议吗?非常感谢。

4

1 回答 1

1

我遇到过类似的问题。服务器有时会挂起(无响应)。我对查询做了一些修改,希望能解决这个问题。现在这些错误出现的频率已经降低了。

我做了什么:

  1. 尽可能使用聚合而不是原始 js
  2. 使我的查询尽可能小。在我的 js 执行脚本中,我使循环变小并引入了分页。
  3. 使用 $lt 而不是 limit。
  4. 在 find() 中只获取所需的字段。
  5. 对我用来查找的键字段使用索引。索引字段的搜索速度更快。
  6. 限制 $in 查询的元素数量
  7. 我使用 php ORM 将我的对象保存为 mongo 文档。我的一些文档非常大。我将某些文档拆分为较小的文档,以便快速保存。

希望这些提示有所帮助。

于 2014-09-04T07:30:10.513 回答