Degraded data redundancy: 1 pg undersized (abnormal ceph status)

Posted on 2021-6-9 15:00:16

[root@controller ~]# ceph -s
  cluster:
    id:     a4bb5236-c8ca-11eb-a67b-000c29ad02de
    health: HEALTH_WARN
            Degraded data redundancy: 1 pg undersized

  services:
    mon: 1 daemons, quorum controller (age 87m)
    mgr: controller.horbtx(active, since 87m)
    osd: 6 osds: 6 up (since 6m), 6 in (since 6m); 1 remapped pgs

  data:
    pools:   1 pools, 1 pgs
    objects: 0 objects, 0 B
    usage:   6.0 GiB used, 114 GiB / 120 GiB avail
    pgs:     1 active+undersized+remapped

" z: f0 l- D" m& I解决过程:
: `1 Z9 R0 Y- @1 K8 L6 k8 `" C+ e' }
[root@controller ~]# vim /etc/ceph/ceph.conf 4 I4 g% R$ p0 ?

8 m$ c% ]: Z  N  O5 b/ B  osd_class_update_on_start = false
- y6 i4 x: J. F, H! c/ ^+ l0 W% U3 p6 H, @4 T" q* ]2 O3 O

* P, k' }- D0 c6 \/ V[root@controller ~]# ceph health detail
& M# x6 U6 N6 Y1 f" BHEALTH_WARN Degraded data redundancy: 1 pg undersized' I* ]! ^$ N/ J7 w* h
[WRN] PG_DEGRADED: Degraded data redundancy: 1 pg undersized2 c' J+ g8 G3 G" C* \& @) |( N
    pg 1.0 is stuck undersized for 86m, current state active+undersized+remapped, last acting [1,0]& ^4 {" N2 \4 ]2 }! Z
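Before (or instead of) changing any configuration, it can help to look at why pg 1.0 only has two acting OSDs. These are standard read-only queries (pg id 1.0 taken from the health output above; the exact output depends on the cluster):

[root@controller ~]# ceph pg map 1.0
[root@controller ~]# ceph pg 1.0 query
[root@controller ~]# ceph osd tree

On a cephadm-style cluster such as this one (note the fsid-based unit names below), the same option can usually also be applied through the central config database rather than by editing ceph.conf:

[root@controller ~]# ceph config set osd osd_class_update_on_start false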

After changing the configuration, the OSD services need to be restarted:

ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.0.service  ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.3.service
ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.1.service  ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.4.service
ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.2.service  ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.5.service

[root@controller ~]# systemctl restart ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.*
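Restarting all six OSDs at once is harmless on an empty test cluster like this one. On a cluster holding data, the usual precaution (not part of the original post) is to set the noout flag and restart the OSDs one at a time, so Ceph does not start rebalancing while the daemons bounce:

[root@controller ~]# ceph osd set noout
[root@controller ~]# systemctl restart ceph-a4bb5236-c8ca-11eb-a67b-000c29ad02de@osd.0.service
# ... wait for osd.0 to report up again, then repeat for osd.1 through osd.5 ...
[root@controller ~]# ceph osd unset noout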
[root@controller ~]# ceph -s
  cluster:
    id:     a4bb5236-c8ca-11eb-a67b-000c29ad02de
    health: HEALTH_WARN
            4 osds down
            Degraded data redundancy: 1 pg undersized

  services:
    mon: 1 daemons, quorum controller (age 89m)
    mgr: controller.horbtx(active, since 88m)
    osd: 6 osds: 2 up (since 0.641904s), 6 in (since 8m)

  data:
    pools:   1 pools, 1 pgs
    objects: 0 objects, 0 B
    usage:   6.0 GiB used, 114 GiB / 120 GiB avail
    pgs:     1 stale+active+undersized+remapped
[root@controller ~]# ceph -s
  cluster:
    id:     a4bb5236-c8ca-11eb-a67b-000c29ad02de
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum controller (age 89m)
    mgr: controller.horbtx(active, since 89m)
    osd: 6 osds: 6 up (since 6s), 6 in (since 8m); 1 remapped pgs

  data:
    pools:   1 pools, 1 pgs
    objects: 0 objects, 0 B
    usage:   6.0 GiB used, 114 GiB / 120 GiB avail
    pgs:     1 active+undersized+remapped
Repeating ceph -s over the next few seconds (8s, 9s and 10s after the restart) returns the same result: HEALTH_OK, with the single pg still reported as active+undersized+remapped.
[root@controller ~]# ceph health detail
HEALTH_OK
[root@controller ~]# ceph health detail
HEALTH_OK
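To confirm that no PG is still undersized once the OSDs have settled, a couple of read-only checks (a follow-up suggestion, not part of the original session) are:

[root@controller ~]# ceph pg stat
[root@controller ~]# ceph pg ls undersized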
OP | Posted on 2021-6-9 15:00:17
3.1.1 Explanation
Degraded: as described above, each PG has three replicas, each stored on a different OSD. Under normal, non-failure conditions the PG is in the active+clean state. If one of the PG's replica OSDs (say osd.4) goes down, the PG becomes degraded.
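The replica counts this relies on are per-pool settings. They can be confirmed before running the simulation, for example (pool name test_pool as used below; the values shown are what the size=3, min_size=2 setup described later would report):

$ bin/ceph osd pool get test_pool size
size: 3
$ bin/ceph osd pool get test_pool min_size
min_size: 2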
3.1.2 Failure simulation
a. Stop osd.1
$ systemctl stop ceph-osd@1
b. Check the PG status
$ bin/ceph pg stat
20 pgs: 20 active+undersized+degraded; 14512 kB data, 302 GB used, 6388 GB / 6691 GB avail; 12/36 objects degraded (33.333%)
c. Check the cluster health
$ bin/ceph health detail
HEALTH_WARN 1 osds down; Degraded data redundancy: 12/36 objects degraded (33.333%), 20 pgs unclean, 20 pgs degraded; application not enabled on 1 pool(s)
OSD_DOWN 1 osds down
    osd.1 (root=default,host=ceph-xx-cc00) is down
PG_DEGRADED Degraded data redundancy: 12/36 objects degraded (33.333%), 20 pgs unclean, 20 pgs degraded
    pg 1.0 is active+undersized+degraded, acting [0,2]
    pg 1.1 is active+undersized+degraded, acting [2,0]
d. Client I/O
# Write an object
$ bin/rados -p test_pool put myobject ceph.conf

# Read the object back into a file
$ bin/rados -p test_pool get myobject ceph.conf.old

# Check the files
$ ll ceph.conf*
-rw-r--r-- 1 root root 6211 Jun 25 14:01 ceph.conf
-rw-r--r-- 1 root root 6211 Jul 3 19:57 ceph.conf.old
Failure summary:
To simulate a failure (size = 3, min_size = 2) we manually stopped osd.1 and then checked the PG status. The PGs are now active+undersized+degraded: when an OSD hosting a PG goes down, that PG enters the undersized+degraded state. The [0,2] means two replicas still survive, on osd.0 and osd.2, and at this point the client can still read and write normally.
3.1.3 Summary
Degraded means that after a failure such as an OSD going down, Ceph marks all the PGs on that OSD as Degraded.
A degraded cluster can still read and write data normally; a degraded PG is only a minor ailment, not a serious problem.
Undersized means the number of surviving PG replicas is 2, which is less than the pool's replica count of 3. The PG is marked this way to show that it has too few surviving replicas; this is also not a serious problem.
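To finish the degraded-PG simulation, bringing the stopped OSD back is normally enough; Ceph recovers the PGs to active+clean on its own (a recovery step sketched here, not shown in the original log):

$ systemctl start ceph-osd@1
$ bin/ceph pg stat    # repeat until all pgs report active+clean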
3.2 Peered
3.2.1 Explanation
Peering has completed, but the PG's current acting set is smaller than the minimum replica count (min_size) configured for the pool.
3.2.2 Failure simulation
a. Stop two replicas, osd.1 and osd.0
$ systemctl stop ceph-osd@1
$ systemctl stop ceph-osd@0
b. Check the cluster health
$ bin/ceph health detail
HEALTH_WARN 1 osds down; Reduced data availability: 4 pgs inactive; Degraded data redundancy: 26/39 objects degraded (66.667%), 20 pgs unclean, 20 pgs degraded; application not enabled on 1 pool(s)
OSD_DOWN 1 osds down
    osd.0 (root=default,host=ceph-xx-cc00) is down
PG_AVAILABILITY Reduced data availability: 4 pgs inactive
    pg 1.6 is stuck inactive for 516.741081, current state undersized+degraded+peered, last acting [2]
    pg 1.10 is stuck inactive for 516.737888, current state undersized+degraded+peered, last acting [2]
    pg 1.11 is stuck inactive for 516.737408, current state undersized+degraded+peered, last acting [2]
    pg 1.12 is stuck inactive for 516.736955, current state undersized+degraded+peered, last acting [2]
PG_DEGRADED Degraded data redundancy: 26/39 objects degraded (66.667%), 20 pgs unclean, 20 pgs degraded
    pg 1.0 is undersized+degraded+peered, acting [2]
    pg 1.1 is undersized+degraded+peered, acting [2]
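A quicker way to list only the PGs that are stuck inactive in this situation (an optional extra check, showing the same information as the detail output above) is:

$ bin/ceph pg dump_stuck inactive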
c. Client I/O (hangs)

# Reading the object back into a file hangs
$ bin/rados -p test_pool get myobject ceph.conf.bak
Failure summary:

Only the replica on osd.2 is still alive now, and the PG has gained an extra state: peered. The English word means "to look closely"; here it can be understood as the PG negotiating with, or searching for, its peers.
If you try to read the file at this point, the command just hangs. Why can it no longer read? Because we set min_size=2: when the number of surviving replicas drops below 2 (here it is 1), the pool stops serving external I/O requests.
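Before lowering min_size it is worth confirming the pool's current replication settings in one place, e.g. (an optional check):

$ bin/ceph osd pool ls detail | grep test_pool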
d. Setting min_size=1 resolves the I/O hang

# Set min_size = 1
$ bin/ceph osd pool set test_pool min_size 1
set pool 1 min_size to 1
e. Check the cluster health
$ bin/ceph health detail
HEALTH_WARN 1 osds down; Degraded data redundancy: 26/39 objects degraded (66.667%), 20 pgs unclean, 20 pgs degraded, 20 pgs undersized; application not enabled on 1 pool(s)
OSD_DOWN 1 osds down
    osd.0 (root=default,host=ceph-xx-cc00) is down
PG_DEGRADED Degraded data redundancy: 26/39 objects degraded (66.667%), 20 pgs unclean, 20 pgs degraded, 20 pgs undersized
    pg 1.0 is stuck undersized for 65.958983, current state active+undersized+degraded, last acting [2]
    pg 1.1 is stuck undersized for 65.960092, current state active+undersized+degraded, last acting [2]
    pg 1.2 is stuck undersized for 65.960974, current state active+undersized+degraded, last acting [2]
f. Client I/O

# Reading the object into a file now succeeds
$ bin/rados -p test_pool get myobject ceph.conf.bak.1
$ ll -lh ceph.conf*
-rw-r--r-- 1 root root 6.1K Jun 25 14:01 ceph.conf
-rw-r--r-- 1 root root 6.1K Jul 3 20:11 ceph.conf.bak
-rw-r--r-- 1 root root 6.1K Jul 3 20:11 ceph.conf.bak.1
Failure summary:

As you can see, the Peered state is gone from the PGs, and client file I/O works normally again.
With min_size=1, as long as a single replica in the cluster is still alive, the pool can serve external I/O requests.