Skip to content

Commit b233d11

Browse files
author
yaokl
committed Jul 15, 2022
feat: add
1 parent 4832f28 commit b233d11

9 files changed

+1990
-1
lines changed
 

‎shell/aliyun.sh

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Remove cloud-vendor host agents when one of them shows up in the process
# list. The Aliyun branch wins if both vendors' processes are present.
if ps aux | grep -i '[a]liyun'; then
  # Vendor-hosted uninstallers run first, then leftovers are cleaned up.
  for uninstall_script in uninstall.sh quartz_uninstall.sh; do
    curl "http://update.aegis.aliyun.com/download/${uninstall_script}" | bash
  done
  pkill aliyun-service
  rm -rf /etc/init.d/agentwatch /usr/sbin/aliyun-service
  rm -rf /usr/local/aegis*
  systemctl stop aliyun.service
  systemctl disable aliyun.service
  service bcm-agent stop
  # Both package managers are tried unconditionally.
  yum remove bcm-agent -y
  apt-get remove bcm-agent -y
elif ps aux | grep -i '[y]unjing'; then
  for uninstaller in \
    /usr/local/qcloud/stargate/admin/uninstall.sh \
    /usr/local/qcloud/YunJing/uninst.sh \
    /usr/local/qcloud/monitor/barad/admin/uninstall.sh; do
    "${uninstaller}"
  done
fi

# Remove the cloud monitor agent: prefer the wrapper script, otherwise fall
# back to the amd64 CmsGoAgent binary. The install directory is deleted only
# when both the stop and the remove/uninstall step succeed.
if [ -f /usr/local/cloudmonitor/wrapper/bin/cloudmonitor.sh ]; then
  /usr/local/cloudmonitor/wrapper/bin/cloudmonitor.sh stop \
    && /usr/local/cloudmonitor/wrapper/bin/cloudmonitor.sh remove \
    && rm -rf /usr/local/cloudmonitor
else
  export ARCH=amd64
  if [ -f "/usr/local/cloudmonitor/CmsGoAgent.linux-${ARCH}" ]; then
    "/usr/local/cloudmonitor/CmsGoAgent.linux-${ARCH}" stop \
      && "/usr/local/cloudmonitor/CmsGoAgent.linux-${ARCH}" uninstall \
      && rm -rf /usr/local/cloudmonitor
  else
    echo "ali cloud monitor not running"
  fi
fi

‎shell/init/init_centos.sh

+457
Large diffs are not rendered by default.

‎shell/k8s/diagnose_k8s.sh

+338
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,338 @@
1+
#!/usr/bin/env bash
2+
3+
# Trace every command so the console output doubles as an execution log.
set -x

# Globals shared by the collector functions below.
is_ps_hang=false                           # set to true by check_ps_hang
timestamp="$(date +%s)"                    # suffix for directory and archive
current_dir="$(pwd)"                       # the archive is written here
diagnose_dir="/tmp/diagnose_${timestamp}"  # all collected output goes here
mkdir -p "${diagnose_dir}"
9+
10+
# Run a command between a header and a footer banner and report failures.
#
# Arguments: the command and its arguments, passed through verbatim.
# Outputs:   banners, the command's own output, and a
#            "failed to collect info" line when the command exits non-zero.
# Returns:   0 (the status of the final banner echo).
#
# External programs are limited to 10 seconds via timeout(1). timeout can only
# exec real programs, so shell functions and builtins (e.g. `run dist`,
# `run ulimit -a`) are invoked directly, without a time limit, instead of
# always failing with "No such file or directory".
run() {
  local rc
  echo
  echo "-----------------run $*------------------"
  if [[ "$(type -t -- "$1")" == "file" ]]; then
    timeout 10s "$@"
  else
    "$@"
  fi
  rc=$?
  if [[ "${rc}" != "0" ]]; then
    echo "failed to collect info: $*"
  fi
  echo "------------End of ${1}----------------"
}
19+
20+
# Detect the distribution by searching /etc/os-release for known markers and
# export the result as OS. The first matching marker wins, in the order listed.
# Exits the whole script with status 1 when no marker matches.
os_env() {
  local rule
  # Each rule is "<text searched for>=<value exported as OS>".
  for rule in \
    "Ubuntu=Ubuntu" \
    "SUSE=SUSE" \
    "Red Hat=RedHat" \
    "CentOS Linux=CentOS" \
    "Kylin Linux=CentOS" \
    "Aliyun Linux=AliyunOS" \
    "Alibaba Group Enterprise Linux=AliOS"; do
    if grep -q "${rule%%=*}" /etc/os-release; then
      export OS="${rule#*=}"
      return
    fi
  done

  echo "unknown os... exit."
  exit 1
}
33+
34+
# Print the contents of every file matching /etc/issue*.
dist() { cat /etc/issue*; }
37+
38+
# Tell whether every named command can be resolved by the shell.
#
# Arguments: one or more command names.
# Returns:   0 when all names resolve (program, builtin, function or alias),
#            1 when any name is missing or no name was given.
#
# Each name is checked on its own: a single `command -v a b` call can report
# success when only some of the names are found.
command_exists() {
  local name
  (( $# > 0 )) || return 1
  for name in "$@"; do
    command -v -- "${name}" > /dev/null 2>&1 || return 1
  done
  return 0
}
41+
42+
# Service status
43+
# Record the status of the firewall and time-sync services.
#
# Globals: diagnose_dir (read).
# Outputs: each status is echoed and appended to ${diagnose_dir}/service_status.
#
# tee runs in append mode so that all three reports are kept; without -a each
# call truncates the file and only the last service's report survives.
service_status() {
  local svc
  for svc in firewalld ntpd chronyd; do
    run service "${svc}" status | tee -a "${diagnose_dir}/service_status"
  done
}
48+
49+
50+
#system info
51+
52+
# Collect kernel, distribution, limit and sysctl information.
#
# Globals: diagnose_dir (read).
# Outputs: everything is echoed and appended to ${diagnose_dir}/system_info.
system_info() {
  {
    run uname -a
    run uname -r
    run dist
    # lsb_release is optional; it is skipped when not installed.
    if command_exists lsb_release; then
      run lsb_release
    fi
    run ulimit -a
    run sysctl -a
  } | tee -a "${diagnose_dir}/system_info"
}
63+
64+
#network
65+
# Collect interface, route, firewall and socket information.
#
# Globals: diagnose_dir (read).
# Outputs: everything is echoed and appended to ${diagnose_dir}/network_info.
network_info() {
  {
    run ip --details ad show
    run ip --details link show
    run ip route show
    run iptables-save
    # The netstat calls are invoked directly, i.e. without the banners and the
    # time limit that run adds.
    netstat -nt
    netstat -nu
    netstat -ln
  } | tee -a "${diagnose_dir}/network_info"
}
76+
77+
78+
# check ps -ef command is hung
79+
# Check whether `ps -ef` hangs and, if so, look for the tasks responsible.
#
# Globals: diagnose_dir (read), is_ps_hang (set to true when ps fails or does
#          not finish within 2 seconds).
# Outputs: progress and findings are echoed and appended to
#          ${diagnose_dir}/ps_command_status.
#
# When ps hangs, every task under /proc is inspected. A task in state D whose
# cmdline cannot be read within 2 seconds is reported together with its kernel
# stack. Only numeric /proc entries are scanned, so /proc/self and
# /proc/thread-self do not produce duplicate reports, and the reported value
# is the numeric PID rather than a /proc path.
check_ps_hang() {
  local report="${diagnose_dir}/ps_command_status"
  local status_file task_dir proc_dir pid

  echo "check if ps -ef command hang" | tee -a "${report}"
  if timeout -s 9 2 ps -ef > /dev/null; then
    echo "ps -ef command works fine" | tee -a "${report}"
    return 0
  fi

  echo "ps -ef command is hung" | tee -a "${report}"
  is_ps_hang=true
  echo "start to check which process lead to ps -ef command hang" | tee -a "${report}"
  for status_file in /proc/[0-9]*/task/[0-9]*/status; do
    # Tasks may exit between glob expansion and this read; skip them quietly.
    grep -q '^State:[[:space:]]*D' "${status_file}" 2> /dev/null || continue
    task_dir=${status_file%/status}
    proc_dir=${task_dir%/task/*}
    pid=${proc_dir##*/}
    if ! timeout -s 9 2 cat "${task_dir}/cmdline"; then
      echo "process ${pid} is in State D and lead to ps -ef process hang,stack info:" | tee -a "${report}"
      cat "${task_dir}/stack" | tee -a "${report}"
    fi
  done
  echo "finish to check which process lead to ps -ef command hang" | tee -a "${report}"
}
105+
106+
107+
#system status
108+
# Collect a snapshot of load, processes, sockets, mounts and open files.
#
# Globals: diagnose_dir (read), is_ps_hang (read; the ps-based collectors are
#          skipped when it is not "false").
# Outputs: everything is echoed and appended to ${diagnose_dir}/system_status.
system_status() {
  {
    run uptime
    run top -b -n 1
    if [[ "${is_ps_hang}" == "false" ]]; then
      run ps -ef
    else
      echo "ps -ef command hang, skip [ps -ef] check"
    fi
    run netstat -nt
    run netstat -nu
    run netstat -ln

    run df -h

    run cat /proc/mounts

    if [[ "${is_ps_hang}" == "false" ]]; then
      run pstree -al
    else
      echo "ps -ef command hang, skip [pstree -al] check"
    fi

    run lsof

    # One line per process: "<open fd count> fds (PID = <pid>), command: ...",
    # sorted by fd count, keeping the first ten lines.
    (
      cd /proc
      find -maxdepth 1 -type d -name '[0-9]*' \
        -exec bash -c "ls {}/fd/ | wc -l | tr '\n' ' '" \; \
        -printf "fds (PID = %P), command: " \
        -exec bash -c "tr '\0' ' ' < {}/cmdline" \; \
        -exec echo \; | sort -rn | head
    )
  } | tee -a "${diagnose_dir}/system_status"
}
142+
143+
144+
# Record `systemctl status` for the container runtime daemons and kubelet.
#
# Globals: diagnose_dir (read).
# Outputs: each unit's status is echoed and appended to
#          ${diagnose_dir}/<unit>_status.
daemon_status() {
  local unit
  for unit in docker containerd container-storaged kubelet; do
    run systemctl status "${unit}" -l | tee -a "${diagnose_dir}/${unit}_status"
  done
}
150+
151+
# Collect dockerd process information, `docker info`/`docker version`, and the
# log files found under /var/run/docker.
#
# Globals: diagnose_dir (read), is_ps_hang (read; the ps-based check is skipped
#          when it is not "false").
# Outputs: reports are echoed and appended to ${diagnose_dir}/docker_status;
#          log files are copied into ${diagnose_dir}.
#
# dockerd is sent SIGUSR1 only when its pid file exists and is non-empty, and
# the 10-second wait only happens after a signal was actually delivered.
# NOTE(review): the signal presumably makes dockerd write a dump into
# /var/run/docker/*.log, which is why the copy waits - confirm.
docker_status() {
  local report="${diagnose_dir}/docker_status"
  local pid_file=/var/run/docker.pid
  local events_log=/var/run/docker/libcontainerd/containerd/events.log
  local docker_pid log_file
  local signalled=false

  echo "check dockerd process"
  if [[ "${is_ps_hang}" == "false" ]]; then
    run ps -ef | grep -E 'dockerd|docker daemon' | grep -v grep | tee -a "${report}"
  else
    echo "ps -ef command hang, skip [ps -ef|grep -E 'dockerd|docker daemon'] check" | tee -a "${report}"
  fi

  run docker info | tee -a "${report}"
  run docker version | tee -a "${report}"

  if [[ -s "${pid_file}" ]]; then
    docker_pid=$(< "${pid_file}")
    if sudo kill -SIGUSR1 "${docker_pid}"; then
      signalled=true
    fi
  fi
  if [[ -f "${events_log}" ]]; then
    cp "${events_log}" "${diagnose_dir}/containerd_events.log"
  fi
  if [[ "${signalled}" == "true" ]]; then
    sleep 10
  fi
  # Copy file by file so that an unmatched glob does not produce a cp error.
  for log_file in /var/run/docker/*.log; do
    [[ -f "${log_file}" ]] || continue
    cp "${log_file}" "${diagnose_dir}"
  done
}
169+
170+
# Print the last 200 lines of a log file, if it exists.
#
# Arguments: $1 - path to the log file (may contain spaces).
# Outputs:   the tail of the file on stdout; nothing when the file is missing.
showlog() {
  local file=$1
  if [[ -f "${file}" ]]; then
    tail -n 200 -- "${file}"
  fi
}
176+
177+
#collect log
178+
# Collect kernel, syslog and docker service logs into ${diagnose_dir}/logs.
#
# Globals: diagnose_dir (read).
# Outputs: dmesg.log (last 10000 lines, also echoed), messages (last 500M of
#          /var/log/messages) and docker.log (last 10000 lines).
#
# The docker log source is chosen with an explicit if/else: the journal when a
# systemd process exists, the upstart log file otherwise. With the previous
# `a && b || c` form a failing journalctl also ran the upstart branch, which
# overwrote docker.log and lost the journalctl output.
common_logs() {
  local log_tail_lines=10000
  local log_dir="${diagnose_dir}/logs"

  mkdir -p "${log_dir}"
  run dmesg -T | tail -n "${log_tail_lines}" | tee "${log_dir}/dmesg.log"
  tail -c 500M /var/log/messages &> "${log_dir}/messages"
  if pidof systemd; then
    journalctl -n "${log_tail_lines}" -u docker.service &> "${log_dir}/docker.log"
  else
    tail -n "${log_tail_lines}" /var/log/upstart/docker.log &> "${log_dir}/docker.log"
  fi
}
185+
186+
# Pack the diagnose directory into a gzip-compressed tarball.
#
# Globals: current_dir (read; destination directory), timestamp (read),
#          diagnose_dir (read; directory to pack).
# Outputs: tar's verbose listing and a hint naming the created archive.
#
# Both paths are quoted so that a working directory containing spaces does not
# split the tar arguments.
archive() {
  tar -zcvf "${current_dir}/diagnose_${timestamp}.tar.gz" "${diagnose_dir}"
  echo "please get diagnose_${timestamp}.tar.gz for diagnostics"
}
190+
191+
# Save every /var/log/messages line mentioning cloud-init.
#
# Globals: diagnose_dir (read).
# Outputs: ${diagnose_dir}/varlogmessage.log (overwritten).
varlogmessage() {
  local out_file="${diagnose_dir}/varlogmessage.log"
  grep cloud-init /var/log/messages > "${out_file}"
}
194+
195+
# Save the output of `kubectl cluster-info dump`.
#
# Globals: diagnose_dir (read).
# Outputs: ${diagnose_dir}/cluster_dump.log (overwritten).
cluster_dump() {
  local out_file="${diagnose_dir}/cluster_dump.log"
  kubectl cluster-info dump > "${out_file}"
}
198+
199+
# Save the output of `kubectl get events`.
#
# Globals: diagnose_dir (read).
# Outputs: ${diagnose_dir}/events.log (overwritten).
events() {
  local out_file="${diagnose_dir}/events.log"
  kubectl get events > "${out_file}"
}
202+
203+
# Save the logs of every kube-system pod that matches a label selector.
#
# Globals:   diagnose_dir (read).
# Arguments: $1 - label value; also used as the output sub-directory name.
#            $2 - label key (the selector is "$2=$1").
# Outputs:   ${diagnose_dir}/cs/<$1>/<pod>.log, one file per pod.
#
# The table header is dropped by position (first line) rather than by
# filtering on the text "NAME", so a pod whose name happens to contain "NAME"
# is no longer skipped.
core_component() {
  local comp="$1"
  local label="$2"
  local out_dir="${diagnose_dir}/cs/${comp}"
  local pods po

  mkdir -p "${out_dir}/"
  pods=$(kubectl get -n kube-system po -l "${label}=${comp}" | awk 'NR > 1 {print $1}')
  while IFS= read -r po; do
    [[ -n "${po}" ]] || continue
    kubectl logs -n kube-system "${po}" &> "${out_dir}/${po}.log"
  done <<< "${pods}"
}
213+
214+
# Save the journal of the etcd unit.
#
# Globals: diagnose_dir (read).
# Outputs: ${diagnose_dir}/cs/etcd.log (overwritten; stdout and stderr).
#
# The cs/ directory is created here so the function does not depend on
# core_component having run first.
etcd() {
  mkdir -p "${diagnose_dir}/cs"
  journalctl -u etcd -xe &> "${diagnose_dir}/cs/etcd.log"
}
217+
218+
# Copy the files under /var/log/alicloud into the diagnose directory.
#
# Globals: diagnose_dir (read).
# Outputs: ${diagnose_dir}/storage/ (created if missing).
storageplugins() {
  local dest="${diagnose_dir}/storage/"
  mkdir -p "${dest}"
  cp /var/log/alicloud/* "${dest}"
}
222+
223+
# Download and run the sandbox-runtime status collector.
#
# Globals: diagnose_dir (read).
# Outputs: the collector's stdout is appended to
#          ${diagnose_dir}/sandbox_runtime.status.
# Returns: 0 when skipped or completed, non-zero when the download, the
#          extraction or entering the collector directory fails.
#
# Nothing is done when dockerd is running or containerd is not running.
# Every step is checked so that a failed download no longer leads to running
# script_collect.sh from the wrong directory.
# NOTE(review): the tarball is fetched over plain HTTP and executed without
# any integrity check - consider HTTPS plus a pinned checksum.
sandbox_runtime_status() {
  local url=http://aliacs-k8s-cn-hangzhou.oss-cn-hangzhou.aliyuncs.com/public/diagnose/sandbox-runtime-status.tgz
  local tarball="${diagnose_dir}/sandbox-runtime-status.tgz"

  if [[ -n "$(pidof dockerd)" || -z "$(pidof containerd)" ]]; then
    return 0
  fi

  if ! wget "${url}" -q -O "${tarball}"; then
    echo "failed to download ${url}" >&2
    return 1
  fi
  tar -xzvf "${tarball}" -C "${diagnose_dir}" || return 1
  pushd "${diagnose_dir}/sandbox-runtime-status" || return 1
  bash script_collect.sh >> "${diagnose_dir}/sandbox_runtime.status"
  popd
}
233+
234+
# Upload the diagnose archive to an OSS bucket with ossutil.
#
# Globals: UPLOAD_OSS (read; "<bucket>[/<prefix>]", nothing is done when
#          empty), ACCESS_KEY_ID / ACCESS_KEY_SECRET (read; when the id is
#          empty the role name returned by the metadata service is used),
#          OSS_PUBLIC_LINK (read; a signed URL is printed when non-empty),
#          timestamp (read).
# Returns: 0 when skipped; otherwise the status of the upload, or 1 when the
#          temporary config file cannot be created.
#
# The ossutil config holds credentials, so it is written to a private
# temporary file (not ./config in the working directory), written with
# command tracing switched off, and removed before returning.
# NOTE(review): ossutil itself is downloaded over plain HTTP when missing.
upload_oss() {
  if [[ -z "${UPLOAD_OSS}" ]]; then
    return 0
  fi

  local bucket_path=${UPLOAD_OSS}
  local bucket_name=${bucket_path%%/*}
  local diagnose_file="diagnose_${timestamp}.tar.gz"
  local region endpoint role_name oss_endpoint config_file
  local restore_xtrace=false
  local rc=0

  if ! command_exists ossutil; then
    curl -o /usr/local/bin/ossutil http://gosspublic.alicdn.com/ossutil/1.6.10/ossutil64
    chmod u+x /usr/local/bin/ossutil
  fi

  region=$(curl http://100.100.100.200/latest/meta-data/region-id)
  endpoint="oss-${region}.aliyuncs.com"

  if ! config_file=$(mktemp); then
    echo "failed to create a temporary ossutil config file" >&2
    return 1
  fi

  # Keep the access key secret out of the `set -x` trace.
  if [[ $- == *x* ]]; then
    restore_xtrace=true
    set +x
  fi
  if [[ -z "${ACCESS_KEY_ID}" ]]; then
    role_name=$(curl 100.100.100.200/latest/meta-data/ram/security-credentials/)
    printf '%s\n' \
      "" \
      "[Credentials]" \
      "language = CH" \
      "endpoint = ${endpoint}" \
      "[AkService]" \
      "ecsAk=http://100.100.100.200/latest/meta-data/Ram/security-credentials/${role_name}" \
      > "${config_file}"
  else
    printf '%s\n' \
      "" \
      "[Credentials]" \
      "language = CH" \
      "endpoint = ${endpoint}" \
      "accessKeyID = ${ACCESS_KEY_ID}" \
      "accessKeySecret = ${ACCESS_KEY_SECRET}" \
      "" \
      > "${config_file}"
  fi
  if [[ "${restore_xtrace}" == "true" ]]; then
    set -x
  fi

  # Prefer the endpoint the bucket itself reports, when it can be queried.
  oss_endpoint=$(ossutil stat "oss://${bucket_name}" --config-file "${config_file}" \
    | awk '/ExtranetEndpoint/ {print $3}')
  if [[ -n "${oss_endpoint}" ]]; then
    endpoint=${oss_endpoint}
  fi

  ossutil cp "./${diagnose_file}" "oss://${bucket_path}/${diagnose_file}" \
    --config-file "${config_file}" --endpoint "${endpoint}" || rc=$?

  if [[ -n "${OSS_PUBLIC_LINK}" ]]; then
    ossutil sign --timeout 7200 "oss://${bucket_path}/${diagnose_file}" \
      --config-file "${config_file}" --endpoint "${endpoint}"
  fi

  rm -f -- "${config_file}"
  return "${rc}"
}
278+
279+
# Parse command-line options into exported environment variables.
#
# Arguments: the script's arguments.
#   --oss <bucket[/prefix]>      -> UPLOAD_OSS
#   --oss-public-link            -> OSS_PUBLIC_LINK="true"
#   --access-key-id <id>         -> ACCESS_KEY_ID
#   --access-key-secret <secret> -> ACCESS_KEY_SECRET
# Outputs:   a message for each unknown option, and for a value-taking option
#            that is the last argument and so has no value.
# Returns:   0, or 1 when a value-taking option is missing its value.
parse_args() {
  local key
  while [[ $# -gt 0 ]]; do
    key="$1"

    case "${key}" in
      --oss | --access-key-id | --access-key-secret)
        if [[ $# -lt 2 ]]; then
          echo "option [${key}] requires a value"
          return 1
        fi
        case "${key}" in
          --oss) export UPLOAD_OSS="$2" ;;
          --access-key-id) export ACCESS_KEY_ID="$2" ;;
          --access-key-secret) export ACCESS_KEY_SECRET="$2" ;;
        esac
        shift
        ;;
      --oss-public-link)
        export OSS_PUBLIC_LINK="true"
        ;;
      *)
        echo "unknown option [${key}]"
        ;;
    esac
    shift
  done
}
308+
309+
# Run every collector in a fixed order and finish by packing the results.
# Order: host-level collectors, control-plane pod logs, cluster-level
# collectors, then the archive step.
pd_collect() {
  local step

  for step in \
    os_env system_info service_status network_info check_ps_hang \
    system_status docker_status sandbox_runtime_status common_logs \
    varlogmessage; do
    "${step}"
  done

  # Control-plane pods: the cloud controller is selected by the "app" label,
  # the others by the "component" label.
  core_component "cloud-controller-manager" "app"
  for step in kube-apiserver kube-controller-manager kube-scheduler; do
    core_component "${step}" "component"
  done

  for step in events storageplugins etcd cluster_dump archive; do
    "${step}"
  done
}
331+
332+
# Entry point: parse the options, collect everything, optionally upload the
# archive, then print a reminder (in Chinese) to upload the archive.
main() {
  parse_args "$@"
  pd_collect
  upload_oss
  echo "请上传 diagnose_${timestamp}.tar.gz"
}

main "$@"

‎shell/library.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -803,4 +803,4 @@ function utils::quote() {
803803
else
804804
echo "$@"
805805
fi
806-
}
806+
}

0 commit comments

Comments
 (0)