milvus数据库因etcd存储问题失败恢复
原创大约 5 分钟
milvus数据库因etcd存储问题失败恢复解决方案
环境
- milvus版本:2.3.9
原 docker 部署脚本
#!/usr/bin/env bash
# Licensed to the LF AI & Data foundation under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#######################################
# Write the etcd config file and launch the milvus-standalone container.
# Outputs:   /milvus/configs/embedEtcd.yaml on the host; the stdout of
#            `docker run` is discarded.
# Returns:   exit status of `docker run`.
#######################################
run_embed() {
  local etcd_config=/milvus/configs/embedEtcd.yaml
  local -a docker_args

  # The container reads this file through ETCD_CONFIG_PATH (see below).
  printf '%s\n' \
    'listen-client-urls: http://0.0.0.0:2379' \
    'advertise-client-urls: http://0.0.0.0:2379' \
    > "$etcd_config"

  docker_args=(
    -d
    --name milvus-standalone
    --security-opt seccomp:unconfined
    # Environment passed to Milvus: etcd and storage settings.
    -e ETCD_USE_EMBED=true
    -e ETCD_DATA_DIR=/var/lib/milvus/etcd
    -e ETCD_CONFIG_PATH=/milvus/configs/embedEtcd.yaml
    -e COMMON_STORAGETYPE=local
    # Host directories bind-mounted into the container.
    -v /milvus/data:/var/lib/milvus
    -v /milvus/configs:/milvus/configs
    -p 19530:19530
    -p 9091:9091
    -p 2379:2379
    # Container health check, polled by wait_for_milvus_running via docker ps.
    "--health-cmd=curl -f http://localhost:9091/healthz"
    --health-interval=30s
    --health-start-period=90s
    --health-timeout=20s
    --health-retries=3
    milvusdb/milvus:v2.3.9
    milvus run standalone
  )

  sudo docker run "${docker_args[@]}" 1> /dev/null
}
#######################################
# Block until the milvus-standalone container is reported as healthy.
# Polls `docker ps` once per second and loops indefinitely if the
# container never becomes healthy.
# Outputs:   progress messages on stdout.
#######################################
wait_for_milvus_running() {
  local healthy_count

  echo "Wait for Milvus Starting..."
  while true; do
    # Match the literal "(healthy)" marker: a bare "healthy" pattern also
    # matches "(unhealthy)" and would report a failed container as started.
    healthy_count=$(sudo docker ps | grep milvus-standalone | grep -cF '(healthy)')
    if [ "$healthy_count" -eq 1 ]; then
      echo "Start successfully."
      break
    fi
    sleep 1
  done
}
#######################################
# Start Milvus standalone.
# - Exits 0 immediately if the container is already running and healthy.
# - Restarts the existing milvus-standalone container if there is one,
#   otherwise creates it through run_embed.
# - Exits 1 if the container could not be started, else waits until it
#   is healthy.
# Outputs:   status messages on stdout.
#######################################
start() {
  local count
  local rc=0

  # Match the literal "(healthy)" marker so that a container reported as
  # "(unhealthy)" is not mistaken for a running instance.
  count=$(sudo docker ps | grep milvus-standalone | grep -cF '(healthy)')
  if [ "$count" -eq 1 ]; then
    echo "Milvus is running."
    exit 0
  fi

  # Container exists (in any state): start it; otherwise create it.
  count=$(sudo docker ps -a | grep -c milvus-standalone)
  if [ "$count" -eq 1 ]; then
    sudo docker start milvus-standalone 1> /dev/null || rc=$?
  else
    run_embed || rc=$?
  fi

  if [ "$rc" -ne 0 ]; then
    echo "Start failed."
    exit 1
  fi

  wait_for_milvus_running
}
#######################################
# Stop the milvus-standalone container.
# Outputs:   "Stop successfully." or "Stop failed." on stdout.
# Exits:     1 when `docker stop` fails.
#######################################
stop() {
  if ! sudo docker stop milvus-standalone 1> /dev/null; then
    echo "Stop failed."
    exit 1
  fi

  echo "Stop successfully."
}
#######################################
# Remove the milvus-standalone container and the local files under the
# current working directory (./volumes and ./embedEtcd.yaml).
# Refuses to run while the container is still running.
# Outputs:   status messages on stdout.
# Exits:     1 if the container is running or `docker rm` fails.
#
# NOTE(review): run_embed in this script bind-mounts /milvus/data and
# writes /milvus/configs/embedEtcd.yaml; neither is removed here.
# Confirm whether leaving them behind is intended.
#######################################
delete() {
  local running

  running=$(sudo docker ps | grep -c milvus-standalone)
  if [ "$running" -eq 1 ]; then
    echo "Please stop Milvus service before delete."
    exit 1
  fi

  if ! sudo docker rm milvus-standalone 1> /dev/null; then
    echo "Delete failed."
    exit 1
  fi

  # Quote the paths: an unquoted $(pwd) is word-split, so a working
  # directory containing spaces would make `rm -rf` hit unrelated paths.
  sudo rm -rf -- "$(pwd)/volumes"
  sudo rm -rf -- "$(pwd)/embedEtcd.yaml"
  echo "Delete successfully."
}
# Entry point: dispatch on the first command-line argument.
case "${1:-}" in
  start | stop | delete)
    # The argument is also the name of the function that implements it.
    "$1"
    ;;
  *)
    echo "please use bash standalone_embed.sh start|stop|delete"
    ;;
esac
运行一段时间某时刻报错如下
2024/08/12 03:00:06 maxprocs: Leaving GOMAXPROCS=8: CPU quota undefined
__ _________ _ ____ ______
/ |/ / _/ /| | / / / / / __/
/ /|_/ // // /_| |/ / /_/ /\ \
/_/ /_/___/____/___/\____/___/
Welcome to use Milvus!
Version: v2.3.9
Built: Mon Feb 19 04:43:50 UTC 2024
GitCommit: 35330ff8
GoVersion: go version go1.20.7 linux/amd64
TotalMem: 33583980544
UsedMem: 23867392
{"level":"info","ts":"2024-08-12T03:00:18.851Z","caller":"etcdserver/server.go:522","msg":"recovered v3 backend from snapshot","backend-size-bytes":2147717120,"backend-size":"2.1 GB","backend-size-in-use-bytes":2147696640,"backend-size-in-use":"2.1 GB"}
[2024/08/12 03:00:24.381 +00:00] [WARN] [sessionutil/session_util.go:361] ["Session Txn failed"] [key=id] [error="etcdserver: mvcc: database space exceeded"]
{"level":"warn","ts":"2024-08-12T03:00:24.381Z","caller":"etcdserver/util.go:123","msg":"failed to apply request","took":"3.129µs","request":"header:<ID:7587880692825299214 > txn:<compare:<target:VALUE key:\"by-dev/meta/session/id\" value_size:2 > success:<request_put:<key:\"by-dev/meta/session/id\" value_size:2 >> failure:<>>","response":"","error":"etcdserver: no space"}
[2024/08/12 03:00:24.381 +00:00] [DEBUG] [sessionutil/session_util.go:311] [getServerID] [reuse=true]
panic: etcdserver: mvcc: database space exceeded
goroutine 429 [IO wait]:
关键报错
etcdserver: no space
etcdserver: mvcc: database space exceeded
goroutine 429 [IO wait]:
此刻 milvus 数据库一直死循环在 goroutine 429 [IO wait]:,无法启动。
解决方案
思路:用 etcd 镜像挂载 milvus 的 etcd 数据目录,进入容器尝试恢复数据,处理后再启动 milvus。
使用到的镜像
bitnami/etcd:3.5.15
参考文件https://hub.docker.com/r/bitnami/etcd
参考文件https://etcd.io/docs/v3.4/op-guide/configuration/
https://github.com/etcd-io/etcd/blob/v3.5.5/server/etcdserver/backend.go
参考文件https://github.com/etcd-io/etcd/releases/tag/v3.5.15
- 启动容器
# Start a temporary etcd container named "etcd-test" on top of the Milvus
# etcd data directory, reusing the Milvus etcd config file.
# The container name must be given with --name; the later
# `docker exec` / `docker restart` steps refer to it as etcd-test.
docker run -itd --name etcd-test \
  -v /milvus/configs/embedEtcd.yaml:/opt/bitnami/etcd/conf/etcd.conf.yml \
  -v /milvus/data/etcd:/bitnami/etcd/data \
  --env ALLOW_NONE_AUTHENTICATION=yes \
  bitnami/etcd:3.5.15
- 进入容器
docker exec -it etcd-test /bin/bash
- 查看并恢复数据
ETCD_ENDPOINT="http://0.0.0.0:2379"
# 查看etcd状态
etcdctl --endpoints=${ETCD_ENDPOINT} endpoint status --write-out=table
+---------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------------------------------+
| ENDPOINT | ID | VERSION | DB SIZE | IS LEADER | IS LEARNER | RAFT TERM | RAFT INDEX | RAFT APPLIED INDEX | ERRORS |
+---------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------------------------------+
| http://0.0.0.0:2379 | 8e9e05c52164694d | 3.5.15 | 2.1 GB | true | false | 9362 | 10624047 | 10624047 | memberID:10276657743932975437 |
| | | | | | | | | | alarm:NOSPACE |
+---------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------------------------------+
# 获取etcd revision
etcdctl --endpoints=${ETCD_ENDPOINT} endpoint status --write-out="json" | egrep -o '"revision":[0-9]*' | egrep -o '[0-9]*'
10281210
# 压缩数据(删除历史版本数据):
etcdctl --endpoints=${ETCD_ENDPOINT} compact 10281210
# 碎片整理(释放未使用的存储空间)
etcdctl --endpoints=${ETCD_ENDPOINT} defrag
# 报警列表
etcdctl --endpoints=${ETCD_ENDPOINT} alarm list
# 解除警报
etcdctl --endpoints=${ETCD_ENDPOINT} alarm disarm
# 存一个数据
etcdctl --endpoints=${ETCD_ENDPOINT} put aa value123
- 退出容器
- docker restart etcd-test
- 查看日志是否正常启动
升级milvus到 v2.4.7
注意 etcd 的配置文件:增大了 quota-backend-bytes 参数,并新增了 auto-compaction-mode 和 auto-compaction-retention(自动压缩)配置。
参考文件https://github.com/milvus-io/milvus/blob/v2.4.7/scripts/standalone_embed.sh
#!/usr/bin/env bash
# Licensed to the LF AI & Data foundation under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#######################################
# Write the etcd config file and launch the milvus-standalone container.
# Outputs:   /milvus/configs/embedEtcd.yaml on the host; the stdout of
#            `docker run` is discarded.
# Returns:   exit status of `docker run`.
#######################################
run_embed() {
  local etcd_config=/milvus/configs/embedEtcd.yaml
  local -a docker_args

  # The container reads this file through ETCD_CONFIG_PATH (see below).
  # Besides the client URLs it sets a backend quota of 4294967296 bytes
  # and revision-based auto-compaction with a retention of 1000.
  printf '%s\n' \
    'listen-client-urls: http://0.0.0.0:2379' \
    'advertise-client-urls: http://0.0.0.0:2379' \
    'quota-backend-bytes: 4294967296' \
    'auto-compaction-mode: revision' \
    "auto-compaction-retention: '1000'" \
    > "$etcd_config"

  docker_args=(
    -d
    --name milvus-standalone
    --security-opt seccomp:unconfined
    # Environment passed to Milvus: etcd and storage settings.
    -e ETCD_USE_EMBED=true
    -e ETCD_DATA_DIR=/var/lib/milvus/etcd
    -e ETCD_CONFIG_PATH=/milvus/configs/embedEtcd.yaml
    -e COMMON_STORAGETYPE=local
    # Host directories bind-mounted into the container.
    -v /milvus/data:/var/lib/milvus
    -v /milvus/configs:/milvus/configs
    -p 19530:19530
    -p 9091:9091
    -p 2379:2379
    # Container health check, polled by wait_for_milvus_running via docker ps.
    "--health-cmd=curl -f http://localhost:9091/healthz"
    --health-interval=30s
    --health-start-period=90s
    --health-timeout=20s
    --health-retries=3
    milvusdb/milvus:v2.4.7
    milvus run standalone
  )

  sudo docker run "${docker_args[@]}" 1> /dev/null
}
#######################################
# Block until the milvus-standalone container is reported as healthy.
# Polls `docker ps` once per second and loops indefinitely if the
# container never becomes healthy.
# Outputs:   progress messages on stdout.
#######################################
wait_for_milvus_running() {
  local healthy_count

  echo "Wait for Milvus Starting..."
  while true; do
    # Match the literal "(healthy)" marker: a bare "healthy" pattern also
    # matches "(unhealthy)" and would report a failed container as started.
    healthy_count=$(sudo docker ps | grep milvus-standalone | grep -cF '(healthy)')
    if [ "$healthy_count" -eq 1 ]; then
      echo "Start successfully."
      break
    fi
    sleep 1
  done
}
#######################################
# Start Milvus standalone.
# - Exits 0 immediately if the container is already running and healthy.
# - Restarts the existing milvus-standalone container if there is one,
#   otherwise creates it through run_embed.
# - Exits 1 if the container could not be started, else waits until it
#   is healthy.
# Outputs:   status messages on stdout.
#######################################
start() {
  local count
  local rc=0

  # Match the literal "(healthy)" marker so that a container reported as
  # "(unhealthy)" is not mistaken for a running instance.
  count=$(sudo docker ps | grep milvus-standalone | grep -cF '(healthy)')
  if [ "$count" -eq 1 ]; then
    echo "Milvus is running."
    exit 0
  fi

  # Container exists (in any state): start it; otherwise create it.
  count=$(sudo docker ps -a | grep -c milvus-standalone)
  if [ "$count" -eq 1 ]; then
    sudo docker start milvus-standalone 1> /dev/null || rc=$?
  else
    run_embed || rc=$?
  fi

  if [ "$rc" -ne 0 ]; then
    echo "Start failed."
    exit 1
  fi

  wait_for_milvus_running
}
#######################################
# Stop the milvus-standalone container.
# Outputs:   "Stop successfully." or "Stop failed." on stdout.
# Exits:     1 when `docker stop` fails.
#######################################
stop() {
  if ! sudo docker stop milvus-standalone 1> /dev/null; then
    echo "Stop failed."
    exit 1
  fi

  echo "Stop successfully."
}
#######################################
# Remove the milvus-standalone container and the local files under the
# current working directory (./volumes and ./embedEtcd.yaml).
# Refuses to run while the container is still running.
# Outputs:   status messages on stdout.
# Exits:     1 if the container is running or `docker rm` fails.
#
# NOTE(review): run_embed in this script bind-mounts /milvus/data and
# writes /milvus/configs/embedEtcd.yaml; neither is removed here.
# Confirm whether leaving them behind is intended.
#######################################
delete() {
  local running

  running=$(sudo docker ps | grep -c milvus-standalone)
  if [ "$running" -eq 1 ]; then
    echo "Please stop Milvus service before delete."
    exit 1
  fi

  if ! sudo docker rm milvus-standalone 1> /dev/null; then
    echo "Delete failed."
    exit 1
  fi

  # Quote the paths: an unquoted $(pwd) is word-split, so a working
  # directory containing spaces would make `rm -rf` hit unrelated paths.
  sudo rm -rf -- "$(pwd)/volumes"
  sudo rm -rf -- "$(pwd)/embedEtcd.yaml"
  echo "Delete successfully."
}
# Entry point: dispatch on the first command-line argument.
case "${1:-}" in
  start | stop | delete)
    # The argument is also the name of the function that implements it.
    "$1"
    ;;
  *)
    echo "please use bash standalone_embed.sh start|stop|delete"
    ;;
esac