
Setup:

  • 6-node cluster: 3 hosts with 12 hdd OSDs each (36 total) and 3 hosts with 24 ssd OSDs each (72 total).
  • 2 erasure-coded pools that take 100% of the data: one for the ssd device class and the other for the hdd class.

    # hdd: k=22 m=14, 64% overhead. Withstands 14 hdd OSD failures, i.e.
    # one full host failure plus 2 additional OSD failures on top.
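    # (one hdd host = 12 OSDs, so m = 12 + 2 = 14; overhead = m/k = 14/22 ≈ 64%)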
    ceph osd erasure-code-profile set hdd_k22_m14_osd \
    k=22 \
    m=14 \
    crush-device-class=hdd \
    crush-failure-domain=osd
    
    # ssd: k=44 m=28, 64% overhead. Withstands 28 ssd OSD failures, i.e.
    # one full host failure plus 4 additional OSD failures on top.
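    # (one ssd host = 24 OSDs, so m = 24 + 4 = 28; overhead = m/k = 28/44 ≈ 64%)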
    ceph osd erasure-code-profile set ssd_k44_m28_osd \
    k=44 \
    m=28 \
    crush-device-class=ssd \
    crush-failure-domain=osd
    
    # create the hdd erasure-coded pool; min_size = k+2 = 24
    ceph osd pool create cephfs.vol1.test.hdd.ec erasure hdd_k22_m14_osd 
    ceph osd pool set cephfs.vol1.test.hdd.ec allow_ec_overwrites true
    ceph osd pool set cephfs.vol1.test.hdd.ec pg_num 128
    ceph osd pool set cephfs.vol1.test.hdd.ec pgp_num 128
    ceph osd pool set cephfs.vol1.test.hdd.ec min_size 24
    
    # create the ssd erasure-coded pool; min_size = k+2 = 46
    ceph osd pool create cephfs.vol1.test.ssd.ec erasure ssd_k44_m28_osd 
    ceph osd pool set cephfs.vol1.test.ssd.ec allow_ec_overwrites true
    ceph osd pool set cephfs.vol1.test.ssd.ec pg_num 128
    ceph osd pool set cephfs.vol1.test.ssd.ec pgp_num 128
    ceph osd pool set cephfs.vol1.test.ssd.ec min_size 46

  • k=22 m=14; 128 PGs; failure domain = osd. The failure domain is osd so that the cluster can tolerate a full host failure (12 OSDs). All OSDs have the same capacity and the same device class (hdd).

    # ceph osd erasure-code-profile get hdd_k22_m14_osd
    crush-device-class=hdd
    crush-failure-domain=osd
    crush-root=default
    jerasure-per-chunk-alignment=false
    k=22
    m=14
    plugin=jerasure
    technique=reed_sol_van
    w=8


    # ceph osd pool ls detail | grep hdd
    pool 16 'cephfs.vol1.test.hdd.ec' erasure profile hdd_k22_m14_osd size 36 min_size 24 crush_rule 7 object_hash rjenkins pg_num 253 pgp_num 241 pg_num_target 128 pgp_num_target 128 autoscale_mode on last_change 17748 lfor 0/7144/7142 flags hashpspool,ec_overwrites stripe_width 90112 target_size_bytes 344147139493888 application cephfs
    # ceph osd pool ls detail | grep ssd
    pool 17 'cephfs.vol1.test.ssd.ec' erasure profile ssd_k44_m28_osd size 72 min_size 46 crush_rule 8 object_hash rjenkins pg_num 128 pgp_num 128 autoscale_mode on last_change 13591 lfor 0/0/7109 flags hashpspool,ec_overwrites stripe_width 180224 target_size_bytes 113249697660928 application cephfs


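    # crush_rule 7 backing the hdd pool (e.g. via: ceph osd crush rule dump cephfs.vol1.test.hdd.ec)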
    {
        "rule_id": 7,
        "rule_name": "cephfs.vol1.test.hdd.ec",
        "ruleset": 7,
        "type": 3,
        "min_size": 3,
        "max_size": 36,
        "steps": [
            {
                "op": "set_chooseleaf_tries",
                "num": 5
            },
            {
                "op": "set_choose_tries",
                "num": 100
            },
            {
                "op": "take",
                "item": -2,
                "item_name": "default~hdd"
            },
            {
                "op": "choose_indep",
                "num": 0,
                "type": "osd"
            },
            {
                "op": "emit"
            }
        ]
    }


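    # crush_rule 8 backing the ssd pool (e.g. via: ceph osd crush rule dump cephfs.vol1.test.ssd.ec)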
    {
        "rule_id": 8,
        "rule_name": "cephfs.vol1.test.ssd.ec",
        "ruleset": 8,
        "type": 3,
        "min_size": 3,
        "max_size": 72,
        "steps": [
            {
                "op": "set_chooseleaf_tries",
                "num": 5
            },
            {
                "op": "set_choose_tries",
                "num": 100
            },
            {
                "op": "take",
                "item": -12,
                "item_name": "default~ssd"
            },
            {
                "op": "choose_indep",
                "num": 0,
                "type": "osd"
            },
            {
                "op": "emit"
            }
        ]
    } 

Issues:

However, this setup does not seem to work and gives:


    # ceph -s 
      cluster:
        id:     <id>
        health: HEALTH_WARN
                Degraded data redundancy: 19 pgs undersized
                20 pgs not deep-scrubbed in time


    # ceph health detail
        pg 17.0 is stuck undersized for 7h, current state active+undersized, last acting [92,76,44,84,46,72,102,104,59,62,60,89,40,47,65,38,95,79,43,67,91,69,80,83,94,48,42,90,88,37,49,75,53,58,93,45,96,61,106,64,52,70,77,99,107,63,97,100,56,98,87,105,36,68,103,55,85,2147483647,82,66,51,101,81,54,78,74,39,50,73,71,57,41]
        pg 17.1 is stuck undersized for 7h, current state active+undersized, last acting [69,59,75,104,79,83,89,51,76,102,37,54,95,60,105,87,43,91,70,101,45,94,68,57,72,107,53,49,40,50,65,61,88,84,73,58,47,96,48,100,103,42,52,71,63,86,39,64,97,41,46,81,67,36,93,82,62,38,98,90,85,2147483647,44,99,55,80,78,56,92,66,106,77]
        pg 17.4 is stuck undersized for 7h, current state active+undersized, last acting [46,84,96,39,38,94,82,67,103,63,50,52,106,42,61,64,45,62,74,79,101,48,2147483647,85,105,59,72,81,91,60,56,71,102,77,70,57,54,100,49,75,36,53,92,98,58,83,51,69,44,89,65,47,43,41,99,107,90,76,37,68,80,40,55,93,104,66,95,78,86,97,73,88]
        pg 17.5 is stuck undersized for 7h, current state active+undersized, last acting [63,64,93,82,69,90,60,102,89,104,50,103,55,52,66,98,99,65,100,48,53,76,68,62,84,87,57,42,75,46,83,71,43,92,51,44,80,56,61,88,77,37,38,39,81,74,105,49,85,41,91,36,79,54,45,94,67,101,72,96,47,73,86,2147483647,106,97,70,107,59,78,40,95]
        pg 17.6 is stuck undersized for 7h, current state active+undersized, last acting [48,67,88,105,97,78,92,79,58,59,46,98,91,45,96,52,38,57,41,81,73,49,89,55,86,68,37,39,77,47,83,76,54,94,44,70,43,62,42,60,104,64,84,85,63,102,87,90,71,80,103,100,101,40,50,72,75,95,51,82,53,36,65,61,106,93,2147483647,99,56,74,107,66]
        pg 17.7 is stuck undersized for 7h, current state active+undersized, last acting [69,79,84,103,37,60,75,42,67,40,65,90,99,85,63,91,83,58,104,56,43,62,55,86,82,72,73,106,87,68,57,50,64,96,41,39,61,71,93,97,59,92,102,81,38,98,48,51,95,101,52,74,77,53,44,49,45,107,78,88,70,105,46,54,80,36,47,89,76,66,100,2147483647]
        pg 17.8 is stuck undersized for 7h, current state active+undersized, last acting [71,78,99,81,43,58,54,86,95,82,52,46,73,69,97,39,93,88,59,105,103,91,50,101,102,49,51,64,98,90,84,75,42,107,56,83,60,67,70,55,104,61,66,79,96,74,63,72,92,53,2147483647,100,62,77,45,87,85,89,76,80,37,44,68,57,41,94,40,48,38,47,65,36]
        pg 17.a is stuck undersized for 7h, current state active+undersized, last acting [65,42,58,61,52,57,60,85,100,75,98,40,74,79,38,72,91,48,93,80,54,41,83,95,76,49,46,71,55,88,63,94,73,44,45,102,89,107,92,86,53,103,47,43,56,82,104,106,51,37,36,39,99,97,59,81,64,66,84,96,90,77,87,78,50,105,62,67,69,70,101,2147483647]
        pg 17.b is stuck undersized for 7h, current state active+undersized, last acting [47,54,59,93,91,36,58,98,39,60,46,49,78,64,88,100,66,107,92,83,99,56,63,87,41,96,89,45,51,76,69,71,103,94,90,50,85,68,81,73,75,105,40,79,84,44,80,37,42,52,95,70,62,55,82,53,38,72,65,2147483647,48,106,43,101,104,86,61,57,102,77,74,67]
        pg 17.d is stuck undersized for 7h, current state active+undersized, last acting [92,83,39,44,75,98,96,61,41,64,38,97,63,37,70,68,87,90,36,77,73,60,69,55,93,47,2147483647,56,102,50,54,91,82,58,43,67,53,86,81,95,105,52,85,51,79,46,62,49,80,40,57,42,104,107,78,84,94,103,48,72,88,74,71,45,101,99,65,59,106,66,100,76]
        pg 17.10 is stuck undersized for 7h, current state active+undersized, last acting [96,94,52,46,43,50,82,97,75,84,53,106,91,78,64,65,42,95,98,87,69,99,77,59,76,2147483647,49,70,79,90,105,81,107,86,45,39,55,93,92,56,72,37,101,36,85,100,67,47,104,74,63,38,48,68,44,60,57,61,40,88,51,62,71,83,58,89,103,80,102,41,54,73]
        pg 17.13 is stuck undersized for 7h, current state active+undersized, last acting [46,55,50,77,73,97,45,57,67,95,103,38,90,106,66,87,36,44,82,49,100,107,84,88,102,40,65,60,43,70,42,86,48,39,71,74,99,56,59,96,72,92,101,62,93,51,47,52,85,53,104,76,37,79,58,94,81,64,83,68,69,63,54,80,98,61,78,105,2147483647,91,75,41]
        pg 17.14 is stuck undersized for 7h, current state active+undersized, last acting [105,62,66,55,53,51,97,50,65,90,104,56,74,52,70,100,42,107,101,40,58,63,44,49,59,69,38,80,73,102,36,76,106,75,39,99,92,60,94,91,89,41,46,72,88,2147483647,87,98,71,78,54,68,84,95,57,103,81,82,96,61,67,79,37,83,86,47,93,77,64,48,85,45]
        pg 17.19 is stuck undersized for 7h, current state active+undersized, last acting [50,90,73,99,45,101,72,93,85,47,59,78,95,83,96,58,76,39,43,49,44,92,91,102,81,74,62,86,54,56,103,87,70,105,75,48,88,97,67,38,57,46,36,84,107,66,65,69,106,41,80,42,52,63,64,61,98,100,79,60,51,94,53,89,37,68,40,55,77,71,2147483647,104]
        pg 17.1a is stuck undersized for 7h, current state active+undersized, last acting [70,95,59,78,87,85,66,68,40,63,90,73,89,101,86,80,82,50,107,74,55,49,72,48,43,104,62,97,81,94,103,58,77,52,2147483647,102,53,75,106,91,88,57,42,61,99,79,39,54,38,96,37,45,76,105,51,84,60,47,93,98,83,100,64,65,44,36,56,71,67,46,41,69]
        pg 17.1b is stuck undersized for 7h, current state active+undersized, last acting [84,37,62,58,87,36,94,77,53,55,45,93,43,82,75,78,101,104,95,106,98,107,61,99,38,46,52,76,56,51,66,83,42,80,63,81,79,86,100,90,88,65,47,60,44,103,2147483647,73,59,69,102,67,57,70,72,41,105,54,64,91,97,48,74,89,92,96,40,71,50,39,49,68]
        pg 17.1e is stuck undersized for 7h, current state active+undersized, last acting [103,48,71,70,104,47,77,56,55,89,68,97,72,82,36,69,40,83,107,38,80,76,39,100,92,79,57,37,42,66,98,53,62,43,84,95,75,105,59,94,106,45,88,54,96,67,91,46,44,58,2147483647,93,73,64,85,78,101,65,50,99,74,102,49,51,41,61,87,90,52,63,60,81]

In addition, PVC mounts from the Rook external cluster cannot write to it.

What was done wrong here? Why are the PGs undersized?


1 Answer


This is a really bad design; you should start from scratch. First, the number of chunks you're creating is far too high and there's no need for that. It's also a bad choice to involve all hosts, because in case of a host or even a single OSD failure there's no room for recovery, so your cluster will stay in a degraded state until the failed host or OSD is back online. Second, osd as the failure domain is not a good choice either; usually you'd go with host as the failure domain.

For your relatively small setup I would rather choose a replicated pool with size 6 (2 replicas per node, so you can lose 2 hosts without data loss). If you really need to go with EC, be aware that you won't be able to sustain the loss of a host since there's not enough space to recover. You could choose a profile like k=2 m=4 - or, if you want more chunks, k=3 m=6 - and keep osd as the failure domain, but as I said, it's not very resilient. You'd be better off with replicated pools.
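
For illustration, a minimal sketch of the smaller EC profile mentioned above, written in the same style as the commands in the question (the pool name cephfs.vol1.test.hdd.ec2 is a placeholder, and pg_num should be sized for your cluster):

    # smaller EC profile: k=2 m=4, hdd device class, osd failure domain
    ceph osd erasure-code-profile set hdd_k2_m4_osd \
    k=2 \
    m=4 \
    crush-device-class=hdd \
    crush-failure-domain=osd

    # pool backed by the smaller profile
    ceph osd pool create cephfs.vol1.test.hdd.ec2 erasure hdd_k2_m4_osd
    ceph osd pool set cephfs.vol1.test.hdd.ec2 allow_ec_overwrites true

A replicated pool with size 6 and two replicas per host would additionally need a custom CRUSH rule, which is not shown here.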

Why your PGs are degraded depends on a couple of things. If you want to keep your current setup (which I don't recommend), you could post your ceph osd tree and ceph osd df output to begin with.
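
For reference, that information (plus a listing of the undersized PGs) can be gathered with, for example:

    ceph osd tree
    ceph osd df tree
    ceph pg ls undersized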

answered 2021-01-13T10:22:22.263