Background

After a physical machine lost power and was brought back, etcd in the Kubernetes cluster failed to start with "panic: lease ID must be 8-byte", caused by the power loss or a related fault. This post records how the etcd data corruption was diagnosed and repaired, and how the Kubernetes cluster was restored.

Problem Description

When trying to start the etcd service, the log showed the following error:

```bash
{"level":"warn","ts":"2025-06-26T08:46:12.178202Z","caller":"embed/config.go:687","msg":"Running http and grpc server on single port. This is not recommended for production."}
{"level":"info","ts":"2025-06-26T08:46:12.178400Z","caller":"etcdmain/etcd.go:73","msg":"Running: ","args":["etcd","--advertise-client-urls=https://10.0.10.188:2379","--cert-file=/etc/kubernetes/pki/etcd/server.crt","--client-cert-auth=true","--data-dir=/var/lib/etcd","--experimental-initial-corrupt-check=true","--experimental-watch-progress-notify-interval=5s","--initial-advertise-peer-urls=https://10.0.10.188:2380","--initial-cluster=buss-k8s-master=https://10.0.10.188:2380","--key-file=/etc/kubernetes/pki/etcd/server.key","--listen-client-urls=https://127.0.0.1:2379,https://10.0.10.188:2379","--listen-metrics-urls=http://127.0.0.1:2381","--listen-peer-urls=https://10.0.10.188:2380","--name=buss-k8s-master","--peer-cert-file=/etc/kubernetes/pki/etcd/peer.crt","--peer-client-cert-auth=true","--peer-key-file=/etc/kubernetes/pki/etcd/peer.key","--peer-trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt","--snapshot-count=10000","--trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt"]}
{"level":"info","ts":"2025-06-26T08:46:12.178527Z","caller":"etcdmain/etcd.go:116","msg":"server has been already initialized","data-dir":"/var/lib/etcd","dir-type":"member"}
{"level":"warn","ts":"2025-06-26T08:46:12.178591Z","caller":"embed/config.go:687","msg":"Running http and grpc server on single port. This is not recommended for production."}
{"level":"info","ts":"2025-06-26T08:46:12.178613Z","caller":"embed/etcd.go:128","msg":"configuring peer listeners","listen-peer-urls":["https://10.0.10.188:2380"]}
{"level":"info","ts":"2025-06-26T08:46:12.178682Z","caller":"embed/etcd.go:496","msg":"starting with peer TLS","tls-info":"cert = /etc/kubernetes/pki/etcd/peer.crt, key = /etc/kubernetes/pki/etcd/peer.key, client-cert=, client-key=, trusted-ca = /etc/kubernetes/pki/etcd/ca.crt, client-cert-auth = true, crl-file = ","cipher-suites":[]}
{"level":"info","ts":"2025-06-26T08:46:12.180487Z","caller":"embed/etcd.go:136","msg":"configuring client listeners","listen-client-urls":["https://127.0.0.1:2379","https://10.0.10.188:2379"]}
{"level":"info","ts":"2025-06-26T08:46:12.180772Z","caller":"embed/etcd.go:310","msg":"starting an etcd server","etcd-version":"3.5.15","git-sha":"9a5533382","go-version":"go1.21.12","go-os":"linux","go-arch":"amd64","max-cpu-set":8,"max-cpu-available":8,"member-initialized":true,"name":"buss-k8s-master","data-dir":"/var/lib/etcd","wal-dir":"","wal-dir-dedicated":"","member-dir":"/var/lib/etcd/member","force-new-cluster":false,"heartbeat-interval":"100ms","election-timeout":"1s","initial-election-tick-advance":true,"snapshot-count":10000,"max-wals":5,"max-snapshots":5,"snapshot-catchup-entries":5000,"initial-advertise-peer-urls":["https://10.0.10.188:2380"],"listen-peer-urls":["https://10.0.10.188:2380"],"advertise-client-urls":["https://10.0.10.188:2379"],"listen-client-urls":["https://127.0.0.1:2379","https://10.0.10.188:2379"],"listen-metrics-urls":["http://127.0.0.1:2381"],"cors":["*"],"host-whitelist":["*"],"initial-cluster":"","initial-cluster-state":"new","initial-cluster-token":"","quota-backend-bytes":2147483648,"max-request-bytes":1572864,"max-concurrent-streams":4294967295,"pre-vote":true,"initial-corrupt-check":true,"corrupt-check-time-interval":"0s","compact-check-time-enabled":false,"compact-check-time-interval":"1m0s","auto-compaction-mode":"periodic","auto-compaction-retention":"0s","auto-compaction-interval":"0s","discovery-url":"","discovery-proxy":"","downgrade-check-interval":"5s"}
{"level":"info","ts":"2025-06-26T08:46:12.196655Z","caller":"etcdserver/backend.go:81","msg":"opened backend db","path":"/var/lib/etcd/member/snap/db","took":"14.157616ms"}
{"level":"info","ts":"2025-06-26T08:46:13.588902Z","caller":"etcdserver/server.go:511","msg":"recovered v2 store from snapshot","snapshot-index":67867015,"snapshot-size":"7.2 kB"}
{"level":"info","ts":"2025-06-26T08:46:13.589033Z","caller":"etcdserver/server.go:524","msg":"recovered v3 backend from snapshot","backend-size-bytes":53522432,"backend-size":"54 MB","backend-size-in-use-bytes":143360,"backend-size-in-use":"143 kB"}
{"level":"info","ts":"2025-06-26T08:46:13.859083Z","caller":"etcdserver/raft.go:530","msg":"restarting local member","cluster-id":"39b983ddef836ae7","local-member-id":"9b72a468becd8ce1","commit-index":67871394}
{"level":"info","ts":"2025-06-26T08:46:13.860760Z","logger":"raft","caller":"etcdserver/zap_raft.go:77","msg":"9b72a468becd8ce1 switched to configuration voters=(11201195993008540897)"}
{"level":"info","ts":"2025-06-26T08:46:13.860902Z","logger":"raft","caller":"etcdserver/zap_raft.go:77","msg":"9b72a468becd8ce1 became follower at term 2"}
{"level":"info","ts":"2025-06-26T08:46:13.860942Z","logger":"raft","caller":"etcdserver/zap_raft.go:77","msg":"newRaft 9b72a468becd8ce1 [peers: [9b72a468becd8ce1], term: 2, commit: 67871394, applied: 67867015, lastindex: 67871394, lastterm: 2]"}
{"level":"info","ts":"2025-06-26T08:46:13.861173Z","caller":"api/capability.go:75","msg":"enabled capabilities for version","cluster-version":"3.5"}
{"level":"info","ts":"2025-06-26T08:46:13.861222Z","caller":"membership/cluster.go:278","msg":"recovered/added member from store","cluster-id":"39b983ddef836ae7","local-member-id":"9b72a468becd8ce1","recovered-remote-peer-id":"9b72a468becd8ce1","recovered-remote-peer-urls":["https://10.0.10.188:2380"]}
{"level":"info","ts":"2025-06-26T08:46:13.861246Z","caller":"membership/cluster.go:287","msg":"set cluster version from store","cluster-version":"3.5"}
panic: lease ID must be 8-byte

goroutine 1 [running]:
go.etcd.io/etcd/server/v3/lease.bytesToLeaseID(...)
        go.etcd.io/etcd/server/v3/lease/lessor.go:943
go.etcd.io/etcd/server/v3/lease.unsafeGetAllLeases.func1({0x7fec3c955070, 0x11, 0x10?}, {0x7fec3c955081, 0x1e1, 0x1e1})
        go.etcd.io/etcd/server/v3/lease/lessor.go:954 +0x179
go.etcd.io/bbolt.(*Bucket).ForEach(0xc0000dc0f8?, 0xc000112e90)
        go.etcd.io/bbolt@v1.3.10/bucket.go:397 +0x90
go.etcd.io/etcd/server/v3/mvcc/backend.unsafeForEach(0xc0000dc0e0, {0x12723d0?, 0x1a07d00?}, 0x41d601?)
        go.etcd.io/etcd/server/v3/mvcc/backend/batch_tx.go:236 +0x56
go.etcd.io/etcd/server/v3/mvcc/backend.(*batchTx).UnsafeForEach(...)
        go.etcd.io/etcd/server/v3/mvcc/backend/batch_tx.go:231
go.etcd.io/etcd/server/v3/lease.unsafeGetAllLeases({0x7feeba39f358, 0xc000363400})
        go.etcd.io/etcd/server/v3/lease/lessor.go:950 +0x92
go.etcd.io/etcd/server/v3/lease.(*lessor).initAndRecover(0xc0000003c0)
        go.etcd.io/etcd/server/v3/lease/lessor.go:801 +0x77
go.etcd.io/etcd/server/v3/lease.newLessor(0xc000158be0, {0x127b3c8?, 0xc000000000}, {0x1265220?, 0xc00068c420}, {0x1272878?, 0xc000112e20?, 0x5f5e100?, 0x0?})
        go.etcd.io/etcd/server/v3/lease/lessor.go:235 +0x288
go.etcd.io/etcd/server/v3/lease.NewLessor(...)
        go.etcd.io/etcd/server/v3/lease/lessor.go:206
go.etcd.io/etcd/server/v3/etcdserver.NewServer({{0x7fff6b55ee37, 0x11}, {0x0, 0x0}, {0x0, 0x0}, {0xc00015bef0, 0x1, 0x1}, {0xc0001c3170, ...}, ...})
        go.etcd.io/etcd/server/v3/etcdserver/server.go:601 +0x419a
go.etcd.io/etcd/server/v3/embed.StartEtcd(0xc000137500)
        go.etcd.io/etcd/server/v3/embed/etcd.go:247 +0x10d8
go.etcd.io/etcd/server/v3/etcdmain.startEtcd(0x7fff6b55ee37?)
        go.etcd.io/etcd/server/v3/etcdmain/etcd.go:228 +0x17
go.etcd.io/etcd/server/v3/etcdmain.startEtcdOrProxyV2({0xc000134000, 0x14, 0x14})
        go.etcd.io/etcd/server/v3/etcdmain/etcd.go:123 +0x13c5
go.etcd.io/etcd/server/v3/etcdmain.Main({0xc000134000?, 0x14, 0x14})
        go.etcd.io/etcd/server/v3/etcdmain/main.go:40 +0x105
main.main()
        go.etcd.io/etcd/server/v3/main.go:31 +0x28
```

Judging only from the error, the lease data that etcd stores in its bbolt database is corrupted: a lease key does not have the required 8-byte length, which makes the etcd server panic on startup.

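For context: etcd keys each lease in the lease bucket by its lease ID encoded as 8 big-endian bytes, and the lessor refuses to decode a key of any other length. A minimal, self-contained illustration of that idea (not etcd's actual source, just the shape of the check):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// leaseIDToBytes mirrors the idea behind etcd's lease key encoding:
// the int64 lease ID is written as an 8-byte big-endian value.
func leaseIDToBytes(id int64) []byte {
	b := make([]byte, 8)
	binary.BigEndian.PutUint64(b, uint64(id))
	return b
}

// bytesToLeaseID is the inverse; like etcd's lessor, it only accepts
// exactly 8 bytes. A key of any other length in the lease bucket is
// what triggers the "lease ID must be 8-byte" panic seen above.
func bytesToLeaseID(b []byte) (int64, error) {
	if len(b) != 8 {
		return 0, fmt.Errorf("lease ID must be 8-byte, got %d bytes", len(b))
	}
	return int64(binary.BigEndian.Uint64(b)), nil
}

func main() {
	key := leaseIDToBytes(0x694d7aa8a7c3a2b1)
	fmt.Printf("lease key: %x (len=%d)\n", key, len(key))

	// A truncated key, like one a power loss mid-write could leave behind:
	if _, err := bytesToLeaseID(key[:5]); err != nil {
		fmt.Println("decode failed:", err)
	}
}
```
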
Repair Process

Method 1: snapshot restore

  1. Check the snapshot status

    Use etcdctl or etcdutl to check the integrity of the etcd snapshot file:

    ```bash
    etcdctl snapshot status /var/lib/etcd/member/snap/db
    ```

    The output:

    ```bash
    dce40287, 64190322, 108, 54 MB
    ```
    • Hash: dce40287 (hash of the snapshot, used to verify its integrity)
    • Revision: 64190322 (revision of the snapshot)
    • Total Keys: 108 (number of keys stored in the snapshot)
    • Total Size: 54 MB (size of the snapshot file)

    Note: etcdctl snapshot status is marked as deprecated; etcdutl snapshot status is recommended instead.

Because there was no existing backup, I backed up the current data directory and then ran the restore against the current db file (/var/lib/etcd/member/snap/db).

  2. Back up the data

    Before any repair operation, back up the current etcd data directory to prevent further data loss:

    ```bash
    cp -r /var/lib/etcd /var/lib/etcd.bak
    ```

  3. Attempt to restore from the snapshot

    Use etcdutl to try restoring the database from the snapshot:

    ```bash
    $ etcdutl snapshot restore /var/lib/etcd/member/snap/db --data-dir /var/lib/etcd/restore
    2025-06-26T16:49:16+08:00       info    snapshot/v3_snapshot.go:260     restoring snapshot      {"path": "/var/lib/etcd/member/snap/db", "wal-dir": "/var/lib/etcd/restore/member/wal", "data-dir": "/var/lib/etcd/restore", "snap-dir": "/var/lib/etcd/restore/member/snap"}
    Error: snapshot missing hash but --skip-hash-check=false
    ```

    It failed with:

    ```bash
    Error: snapshot missing hash but --skip-hash-check=false
    ```

    This is expected for a db file taken straight from the data directory: only etcdctl snapshot save appends an integrity hash, so the hash check has to be skipped explicitly:

    ```text
    $ etcdutl snapshot restore /var/lib/etcd/member/snap/db --data-dir /var/lib/etcd/restore --skip-hash-check=true
    ```

Grok suggested the more permissive option of starting a new cluster with --force-new-cluster, and with it etcd was able to start:

```bash
$ etcd --force-new-cluster --data-dir=/var/lib/etcd
```

The etcd logs after recovery:

```text
{"level":"warn","ts":"2025-06-26T08:51:58.191181Z","caller":"embed/config.go:687","msg":"Running http and grpc server on single port. This is not recommended for production."}
{"level":"info","ts":"2025-06-26T08:51:58.191356Z","caller":"etcdmain/etcd.go:73","msg":"Running: ","args":["etcd","--force-new-cluster","--advertise-client-urls=https://10.0.10.188:2379","--cert-file=/etc/kubernetes/pki/etcd/server.crt","--client-cert-auth=true","--data-dir=/var/lib/etcd","--experimental-initial-corrupt-check=true","--experimental-watch-progress-notify-interval=5s","--initial-advertise-peer-urls=https://10.0.10.188:2380","--initial-cluster=buss-k8s-master=https://10.0.10.188:2380","--key-file=/etc/kubernetes/pki/etcd/server.key","--listen-client-urls=https://127.0.0.1:2379,https://10.0.10.188:2379","--listen-metrics-urls=http://127.0.0.1:2381","--listen-peer-urls=https://10.0.10.188:2380","--name=buss-k8s-master","--peer-cert-file=/etc/kubernetes/pki/etcd/peer.crt","--peer-client-cert-auth=true","--peer-key-file=/etc/kubernetes/pki/etcd/peer.key","--peer-trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt","--snapshot-count=10000","--trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt"]}
{"level":"info","ts":"2025-06-26T08:51:58.191490Z","caller":"etcdmain/etcd.go:116","msg":"server has been already initialized","data-dir":"/var/lib/etcd","dir-type":"member"}
{"level":"warn","ts":"2025-06-26T08:51:58.191558Z","caller":"embed/config.go:687","msg":"Running http and grpc server on single port. This is not recommended for production."}
{"level":"info","ts":"2025-06-26T08:51:58.191586Z","caller":"embed/etcd.go:128","msg":"configuring peer listeners","listen-peer-urls":["https://10.0.10.188:2380"]}
{"level":"info","ts":"2025-06-26T08:51:58.191669Z","caller":"embed/etcd.go:496","msg":"starting with peer TLS","tls-info":"cert = /etc/kubernetes/pki/etcd/peer.crt, key = /etc/kubernetes/pki/etcd/peer.key, client-cert=, client-key=, trusted-ca = /etc/kubernetes/pki/etcd/ca.crt, client-cert-auth = true, crl-file = ","cipher-suites":[]}
{"level":"info","ts":"2025-06-26T08:51:58.193868Z","caller":"embed/etcd.go:136","msg":"configuring client listeners","listen-client-urls":["https://127.0.0.1:2379","https://10.0.10.188:2379"]}
{"level":"info","ts":"2025-06-26T08:51:58.196250Z","caller":"embed/etcd.go:310","msg":"starting an etcd server","etcd-version":"3.5.15","git-sha":"9a5533382","go-version":"go1.21.12","go-os":"linux","go-arch":"amd64","max-cpu-set":8,"max-cpu-available":8,"member-initialized":true,"name":"buss-k8s-master","data-dir":"/var/lib/etcd","wal-dir":"","wal-dir-dedicated":"","member-dir":"/var/lib/etcd/member","force-new-cluster":true,"heartbeat-interval":"100ms","election-timeout":"1s","initial-election-tick-advance":true,"snapshot-count":10000,"max-wals":5,"max-snapshots":5,"snapshot-catchup-entries":5000,"initial-advertise-peer-urls":["https://10.0.10.188:2380"],"listen-peer-urls":["https://10.0.10.188:2380"],"advertise-client-urls":["https://10.0.10.188:2379"],"listen-client-urls":["https://127.0.0.1:2379","https://10.0.10.188:2379"],"listen-metrics-urls":["http://127.0.0.1:2381"],"cors":["*"],"host-whitelist":["*"],"initial-cluster":"","initial-cluster-state":"new","initial-cluster-token":"","quota-backend-bytes":2147483648,"max-request-bytes":1572864,"max-concurrent-streams":4294967295,"pre-vote":true,"initial-corrupt-check":true,"corrupt-check-time-interval":"0s","compact-check-time-enabled":false,"compact-check-time-interval":"1m0s","auto-compaction-mode":"periodic","auto-compaction-retention":"0s","auto-compaction-interval":"0s","discovery-url":"","discovery-proxy":"","downgrade-check-interval":"5s"}
{"level":"info","ts":"2025-06-26T08:51:58.212345Z","caller":"etcdserver/backend.go:81","msg":"opened backend db","path":"/var/lib/etcd/member/snap/db","took":"15.340047ms"}
{"level":"info","ts":"2025-06-26T08:51:58.213964Z","caller":"etcdserver/server.go:532","msg":"No snapshot found. Recovering WAL from scratch!"}
{"level":"info","ts":"2025-06-26T08:51:58.216418Z","caller":"etcdserver/raft.go:603","msg":"forcing restart member","cluster-id":"39b983ddef836ae7","local-member-id":"9b72a468becd8ce1","commit-index":1}
{"level":"info","ts":"2025-06-26T08:51:58.217035Z","logger":"raft","caller":"etcdserver/zap_raft.go:77","msg":"9b72a468becd8ce1 switched to configuration voters=()"}
{"level":"info","ts":"2025-06-26T08:51:58.217225Z","logger":"raft","caller":"etcdserver/zap_raft.go:77","msg":"9b72a468becd8ce1 became follower at term 0"}
{"level":"info","ts":"2025-06-26T08:51:58.217311Z","logger":"raft","caller":"etcdserver/zap_raft.go:77","msg":"newRaft 9b72a468becd8ce1 [peers: [], term: 0, commit: 1, applied: 0, lastindex: 1, lastterm: 0]"}
```

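After a --force-new-cluster start it is worth confirming that the single remaining member is healthy before pointing the control plane at it. A small check with the official Go client; this is only a sketch, with the endpoint and certificate paths taken from the startup arguments above (kubeadm defaults):

```go
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"go.etcd.io/etcd/client/pkg/v3/transport"
	clientv3 "go.etcd.io/etcd/client/v3"
)

func main() {
	// kubeadm default cert paths, as passed to etcd in the args above.
	tlsInfo := transport.TLSInfo{
		CertFile:      "/etc/kubernetes/pki/etcd/server.crt",
		KeyFile:       "/etc/kubernetes/pki/etcd/server.key",
		TrustedCAFile: "/etc/kubernetes/pki/etcd/ca.crt",
	}
	tlsConfig, err := tlsInfo.ClientConfig()
	if err != nil {
		log.Fatalf("build TLS config: %v", err)
	}

	endpoint := "https://10.0.10.188:2379"
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{endpoint},
		DialTimeout: 5 * time.Second,
		TLS:         tlsConfig,
	})
	if err != nil {
		log.Fatalf("connect: %v", err)
	}
	defer cli.Close()

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	// The member list should contain exactly one voter after --force-new-cluster.
	members, err := cli.MemberList(ctx)
	if err != nil {
		log.Fatalf("member list: %v", err)
	}
	for _, m := range members.Members {
		fmt.Printf("member %x name=%s peers=%v\n", m.ID, m.Name, m.PeerURLs)
	}

	// Endpoint status gives revision, raft term and db size.
	st, err := cli.Status(ctx, endpoint)
	if err != nil {
		log.Fatalf("status: %v", err)
	}
	fmt.Printf("revision=%d raftTerm=%d dbSize=%d\n", st.Header.Revision, st.RaftTerm, st.DbSize)
}
```
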
Method 2: repair the bbolt database

etcd uses bbolt (an enhanced fork of BoltDB) as its storage engine, and lease data is stored in the lease bucket. The bbolt command-line tool can be used to inspect, and to some extent repair, the database directly.

Install the bbolt tool

```text
go install go.etcd.io/bbolt/cmd/bbolt@latest
```

Inspect the database

```text
# First, look at the available bbolt commands
bbolt --help

# Show basic database info
bbolt info /var/lib/etcd/member/snap/db

# List all buckets
bbolt buckets /var/lib/etcd/member/snap/db

# List the keys in the lease bucket
bbolt keys /var/lib/etcd/member/snap/db lease

# Low-level page inspection is also available if needed
bbolt pages /var/lib/etcd/member/snap/db

# bbolt has no direct "repair the lease bucket" command, but it can verify overall integrity
bbolt check /var/lib/etcd/member/snap/db
```

Method 3: fix the lease values directly with a script

Since a bbolt db file can be operated on directly through its Go API, a Go program can also read the bbolt file and fix the corresponding lease entries: first an analysis pass to confirm the damage (sketched below), then the repair script.

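The analysis script referenced later (etcd_lease_analyze.go) is not included in the post; a minimal read-only sketch of what such a script might look like, assuming the same lease bucket and the 8-byte key rule:

```go
package main

import (
	"fmt"
	"log"
	"os"

	bolt "go.etcd.io/bbolt"
)

func main() {
	if len(os.Args) < 2 {
		log.Fatal("Usage: go run etcd_lease_analyze.go <path-to-etcd-db>")
	}

	// Open read-only so the db file is not modified during analysis.
	db, err := bolt.Open(os.Args[1], 0600, &bolt.Options{ReadOnly: true})
	if err != nil {
		log.Fatalf("failed to open database: %v", err)
	}
	defer db.Close()

	bad := 0
	err = db.View(func(tx *bolt.Tx) error {
		b := tx.Bucket([]byte("lease"))
		if b == nil {
			fmt.Println("no lease bucket found")
			return nil
		}
		return b.ForEach(func(k, v []byte) error {
			if len(k) != 8 {
				bad++
				fmt.Printf("corrupted lease key (len=%d): %x, value len=%d\n", len(k), k, len(v))
			}
			return nil
		})
	})
	if err != nil {
		log.Fatalf("failed to scan lease bucket: %v", err)
	}
	fmt.Printf("found %d corrupted lease key(s)\n", bad)
}
```

The repair script below then rewrites any key that is not exactly 8 bytes:
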
```go
package main

import (
	"fmt"
	"log"
	"os"

	bolt "go.etcd.io/bbolt"
)

func main() {
	if len(os.Args) < 2 {
		fmt.Println("Usage: go run etcd_lease_fix.go <path-to-etcd-db>")
		fmt.Println("Example: go run etcd_lease_fix.go /var/lib/etcd/member/snap/db")
		os.Exit(1)
	}

	dbPath := os.Args[1]
	
	// Back up the original database first
	backupPath := dbPath + ".backup"
	fmt.Printf("Creating backup: %s\n", backupPath)
	if err := copyFile(dbPath, backupPath); err != nil {
		log.Fatalf("Failed to create backup: %v", err)
	}

	// Open the database
	db, err := bolt.Open(dbPath, 0600, nil)
	if err != nil {
		log.Fatalf("Failed to open database: %v", err)
	}
	defer db.Close()

	// Check and repair the lease bucket
	err = db.Update(func(tx *bolt.Tx) error {
		// Make sure the lease bucket exists
		leaseBucket := tx.Bucket([]byte("lease"))
		if leaseBucket == nil {
			fmt.Println("Lease bucket not found, nothing to fix")
			return nil
		}

		fmt.Println("Found lease bucket, checking for corrupted entries...")

		// Collect the key/value pairs that need fixing
		var keysToFix []struct {
			oldKey []byte
			newKey []byte
			value  []byte
		}
		
		err := leaseBucket.ForEach(func(k, v []byte) error {
			// Check whether the key is exactly 8 bytes long
			if len(k) != 8 {
				fmt.Printf("Found corrupted lease key with length %d: %x\n", len(k), k)
				
				// Try to repair the key
				var newKey []byte
				if len(k) < 8 {
					// Key too short: zero-pad it to 8 bytes
					newKey = make([]byte, 8)
					copy(newKey, k)
					fmt.Printf("Padding short key %x to %x\n", k, newKey)
				} else if len(k) > 8 {
					// Key too long: keep only the first 8 bytes
					newKey = make([]byte, 8)
					copy(newKey, k[:8])
					fmt.Printf("Truncating long key %x to %x\n", k, newKey)
				}
				
				keysToFix = append(keysToFix, struct {
					oldKey []byte
					newKey []byte
					value  []byte
				}{
					oldKey: append([]byte(nil), k...),
					newKey: newKey,
					value:  append([]byte(nil), v...),
				})
			}
			return nil
		})
		
		if err != nil {
			return fmt.Errorf("failed to iterate lease bucket: %v", err)
		}

		// Repair the corrupted keys
		for _, fix := range keysToFix {
			fmt.Printf("Fixing lease key %x -> %x\n", fix.oldKey, fix.newKey)
			
			// Skip if the repaired key already exists
			if existing := leaseBucket.Get(fix.newKey); existing != nil {
				fmt.Printf("Warning: new key %x already exists, skipping repair\n", fix.newKey)
				continue
			}
			
			// Write the repaired key/value pair
			if err := leaseBucket.Put(fix.newKey, fix.value); err != nil {
				return fmt.Errorf("failed to put fixed key %x: %v", fix.newKey, err)
			}
			
			// Delete the original corrupted key
			if err := leaseBucket.Delete(fix.oldKey); err != nil {
				return fmt.Errorf("failed to delete old key %x: %v", fix.oldKey, err)
			}
		}

		if len(keysToFix) > 0 {
			fmt.Printf("Fixed %d corrupted lease entries\n", len(keysToFix))
		} else {
			fmt.Println("No corrupted lease entries found")
		}

		return nil
	})

	if err != nil {
		log.Fatalf("Failed to fix lease bucket: %v", err)
	}

	fmt.Println("Database repair completed successfully!")
	fmt.Printf("Original database backed up to: %s\n", backupPath)
}

func copyFile(src, dst string) error {
	data, err := os.ReadFile(src)
	if err != nil {
		return err
	}
	return os.WriteFile(dst, data, 0600)
}
```

This script forces the lease keys to 8 bytes, but etcd still failed to start with the rewritten keys; the fix that finally worked was to delete the corrupted lease entries instead.

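The deletion variant of the script is not shown in the post either; a minimal sketch under the same assumptions (hypothetical file name etcd_lease_delete.go), collecting the malformed keys first and deleting them afterwards. As with the repair script, work on a backed-up copy of the db file:

```go
package main

import (
	"fmt"
	"log"
	"os"

	bolt "go.etcd.io/bbolt"
)

func main() {
	if len(os.Args) < 2 {
		log.Fatal("Usage: go run etcd_lease_delete.go <path-to-etcd-db>")
	}

	db, err := bolt.Open(os.Args[1], 0600, nil)
	if err != nil {
		log.Fatalf("failed to open database: %v", err)
	}
	defer db.Close()

	err = db.Update(func(tx *bolt.Tx) error {
		b := tx.Bucket([]byte("lease"))
		if b == nil {
			fmt.Println("no lease bucket found, nothing to do")
			return nil
		}

		// Collect corrupted keys first; do not delete while iterating.
		var toDelete [][]byte
		if err := b.ForEach(func(k, v []byte) error {
			if len(k) != 8 {
				toDelete = append(toDelete, append([]byte(nil), k...))
			}
			return nil
		}); err != nil {
			return err
		}

		for _, k := range toDelete {
			fmt.Printf("deleting corrupted lease key (len=%d): %x\n", len(k), k)
			if err := b.Delete(k); err != nil {
				return err
			}
		}
		fmt.Printf("deleted %d corrupted lease key(s)\n", len(toDelete))
		return nil
	})
	if err != nil {
		log.Fatalf("failed to clean lease bucket: %v", err)
	}
}
```
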
```text
# First run the analysis script to see how bad the damage is
go run etcd_lease_analyze.go /var/lib/etcd/member/snap/db

# Then run the repair script
go run etcd_lease_fix.go /var/lib/etcd/member/snap/db
```

After the repair etcd could start, but data had been lost: the restarted cluster had no workload Pods left, and redeploying Pods produced errors:

```text
a6529859b8b3872d252fe18e02f871bd946ea08c703181ad92ad5e\\\",\\\"lastState\\\":{},\\\"name\\\":\\\"cluster-register\\\",\\\"ready\\\":false,\\\"restartCount\\\":47151,\\\"started\\\":false,\\\"state\\\":{\\\"terminated\\\":{\\\"containerID\\\":\\\"containerd://5fc6780c61418d5f01543cd25ffd6890adbaa62094c892cabc1c3ddf918bd779\\\",\\\"exitCode\\\":1,\\\"finishedAt\\\":\\\"2025-06-26T00:45:02Z\\\",\\\"reason\\\":\\\"Error\\\",\\\"startedAt\\\":\\\"2025-06-26T00:45:02Z\\\"}}}],\\\"podIP\\\":null,\\\"podIPs\\\":null}}\" for pod \"cattle-system\"/\"cattle-cluster-agent-54dbc88855-tg9ts\": namespaces \"cattle-system\" not found"
Jun 26 21:37:49 buss-k8s-worker-03 kubelet[4178]: E0626 21:37:49.719262    4178 remote_runtime.go:222] "StopPodSandbox from runtime service failed" err="rpc error: code = Unknown desc = failed to destroy network for sandbox \"f7eded679eee4ecd17cd2bc687e27ee626f637c0486dae47d8969b0698bc75d6\": plugin type=\"calico\" failed (delete): error getting ClusterInformation: Get \"https://10.96.0.1:443/apis/crd.projectcalico.org/v1/clusterinformations/default\": dial tcp 10.96.0.1:443: i/o timeout" podSandboxID="f7eded679eee4ecd17cd2bc687e27ee626f637c0486dae47d8969b0698bc75d6"
Jun 26 21:37:49 buss-k8s-worker-03 kubelet[4178]: E0626 21:37:49.719350    4178 kuberuntime_gc.go:180] "Failed to stop sandbox before removing" err="rpc error: code = Unknown desc = failed to destroy network for sandbox \"f7eded679eee4ecd17cd2bc687e27ee626f637c0486dae47d8969b0698bc75d6\": plugin type=\"calico\" failed (delete): error getting ClusterInformation: Get \"https://10.96.0.1:443/apis/crd.projectcalico.org/v1/clusterinformations/default\": dial tcp 10.96.0.1:443: i/o timeout" sandboxID="f7eded679eee4ecd17cd2bc687e27ee626f637c0486dae47d8969b0698bc75d6"
Jun 26 21:37:54 buss-k8s-worker-03 kubelet[4178]: I0626 21:37:54.339411    4178 scope.go:117] "RemoveContainer" containerID="4dee6c66c8e8a5c84c9791f37c2ba7a83b7740a25838e498d09e8ea0f032e51e"
Jun 26 21:37:54 buss-k8s-worker-03 kubelet[4178]: I0626 21:37:54.340648    4178 scope.go:117] "RemoveContainer" containerID="4dee6c66c8e8a5c84c9791f37c2ba7a83b7740a25838e498d09e8ea0f032e51e"
Jun 26 21:37:54 buss-k8s-worker-03 kubelet[4178]: E0626 21:37:54.343343    4178 remote_runtime.go:385] "RemoveContainer from runtime service failed" err="rpc error: code = Unknown desc = failed to set removing state for container \"4dee6c66c8e8a5c84c9791f37c2ba7a83b7740a25838e498d09e8ea0f032e51e\": container is already in removing state" containerID="4dee6c66c8e8a5c84c9791f37c2ba7a83b7740a25838e498d09e8ea0f032e51e"
Jun 26 21:37:54 buss-k8s-worker-03 kubelet[4178]: E0626 21:37:54.343482    4178 kuberuntime_container.go:867] failed to remove pod init container "install-cni": rpc error: code = Unknown desc = failed to set removing state for container "4dee6c66c8e8a5c84c9791f37c2ba7a83b7740a25838e498d09e8ea0f032e51e": container is already in removing state; Skipping pod "calico-node-hvtrx_kube-system(33e26ece-e14a-4b8c-b8d2-4805c1369315)"
Jun 26 21:37:54 buss-k8s-worker-03 kubelet[4178]: E0626 21:37:54.344463    4178 pod_workers.go:1298] "Error syncing pod, skipping" err="failed to \"StartContainer\" for \"install-cni\" with CrashLoopBackOff: \"back-off 20s restarting failed container=install-cni pod=calico-node-hvtrx_kube-system(33e26ece-e14a-4b8c-b8d2-4805c1369315)\"" pod="kube-system/calico-node-hvtrx" podUID="33e26ece-e14a-4b8c-b8d2-4805c1369315"
```

Cause: this is the classic symptom of Pods being unable to reach the apiserver. Checking kube-proxy showed that it had disappeared along with the lost data (in a kubeadm-deployed cluster, kube-proxy runs as a DaemonSet). It can be re-created with:

```text
kubeadm init phase addon kube-proxy
```

When the service cannot start at all, you can also use a script to read the data out first for inspection, and only then start the service:

```go
package main

import (
        "fmt"
        "log"
        "os"

        bolt "go.etcd.io/bbolt"
)

func main() {
        if len(os.Args) < 2 {
                log.Fatal("Usage: go run main.go <etcd-db-file-path>")
        }

        dbPath := os.Args[1]

        // Open the database in read-only mode
        db, err := bolt.Open(dbPath, 0600, &bolt.Options{ReadOnly: true})
        if err != nil {
                log.Fatal("failed to open database: ", err)
        }
        defer db.Close()

        fmt.Printf("Reading etcd database: %s\n", dbPath)
        fmt.Println("==========================================")

        // Walk every bucket
        err = db.View(func(tx *bolt.Tx) error {
                return tx.ForEach(func(name []byte, b *bolt.Bucket) error {
                        fmt.Printf("\n[Bucket]: %s\n", string(name))
                        fmt.Println("------------------------------------------")

                        return b.ForEach(func(k, v []byte) error {
                                fmt.Printf("Key: %s\n", string(k))
                                fmt.Printf("Value: %s\n", string(v))
                                fmt.Println("---")
                                return nil
                        })
                })
        })

        if err != nil {
                log.Fatal("failed to read data: ", err)
        }

        fmt.Println("\nDone reading the data!")
}
```

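Run it against the offline db file while etcd is stopped, for example go run main.go /var/lib/etcd/member/snap/db > etcd-dump.txt, so that a copy of the data is preserved before any further repair attempt.
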
Summary

When restoring the service I did not first back up and read out the data, so there is no way to tell whether the etcd data was overwritten during startup, and the root cause can no longer be determined after the fact. The lesson for next time: analyze the problem and preserve the evidence first, and only then start the service.