#!/usr/bin/env bash

set -x
current_dir=$(pwd)
timestamp=$(date +%s)
diagnose_dir=/tmp/diagnose_${timestamp}
mkdir -p "$diagnose_dir"
is_ps_hang=false

run() {
    echo
    echo "-----------------run $*------------------"
    timeout 10s "$@"
    if [ "$?" != "0" ]; then
        echo "failed to collect info: $*"
    fi
    echo "------------End of ${1}----------------"
}
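# Example: `run uname -a | tee -a ${diagnose_dir}/system_info` prints a banner,
# executes the command under a 10-second timeout, and logs a failure line if
# the command times out or exits non-zero.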
| 19 | + |
| 20 | +os_env() |
| 21 | +{ |
| 22 | + grep -q "Ubuntu" /etc/os-release && export OS="Ubuntu" && return |
| 23 | + grep -q "SUSE" /etc/os-release && export OS="SUSE" && return |
| 24 | + grep -q "Red Hat" /etc/os-release && export OS="RedHat" && return |
| 25 | + grep -q "CentOS Linux" /etc/os-release && export OS="CentOS" && return |
| 26 | + grep -q "Kylin Linux" /etc/os-release && export OS="CentOS" && return |
| 27 | + grep -q "Aliyun Linux" /etc/os-release && export OS="AliyunOS" && return |
| 28 | + grep -q "Alibaba Group Enterprise Linux" /etc/os-release && export OS="AliOS" && return |
| 29 | + |
| 30 | + echo "unknown os... exit." |
| 31 | + exit 1 |
| 32 | +} |
| 33 | + |
| 34 | +dist() { |
| 35 | + cat /etc/issue* |
| 36 | +} |
| 37 | + |
| 38 | +command_exists() { |
| 39 | + command -v "$@" > /dev/null 2>&1 |
| 40 | +} |
| 41 | + |
| 42 | +# Service status |
| 43 | +service_status() { |
| 44 | + run service firewalld status | tee $diagnose_dir/service_status |
| 45 | + run service ntpd status | tee $diagnose_dir/service_status |
| 46 | + run service chronyd status | tee $diagnose_dir/service_status |
| 47 | +} |


#system info

system_info() {
    # mkdir -p ${diagnose_dir}/system_info
    run uname -a | tee -a ${diagnose_dir}/system_info
    run uname -r | tee -a ${diagnose_dir}/system_info
    # `timeout` cannot execute shell functions or builtins, so call these directly
    dist | tee -a ${diagnose_dir}/system_info
    if command_exists lsb_release; then
        run lsb_release | tee -a ${diagnose_dir}/system_info
    fi
    ulimit -a | tee -a ${diagnose_dir}/system_info
    run sysctl -a | tee -a ${diagnose_dir}/system_info
}
| 63 | + |
| 64 | +#network |
| 65 | +network_info() { |
| 66 | + # mkdir -p ${diagnose_dir}/network_info |
| 67 | + #run ifconfig |
| 68 | + run ip --details ad show | tee -a ${diagnose_dir}/network_info |
| 69 | + run ip --details link show | tee -a ${diagnose_dir}/network_info |
| 70 | + run ip route show | tee -a ${diagnose_dir}/network_info |
| 71 | + run iptables-save | tee -a ${diagnose_dir}/network_info |
| 72 | + netstat -nt | tee -a ${diagnose_dir}/network_info |
| 73 | + netstat -nu | tee -a ${diagnose_dir}/network_info |
| 74 | + netstat -ln | tee -a ${diagnose_dir}/network_info |
| 75 | +} |
| 76 | + |
| 77 | + |
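# A task stuck in uninterruptible sleep (state D) can make `ps -ef` hang,
# because reading that task's /proc/<pid>/cmdline blocks until it leaves the
# D state. The check below first probes `ps -ef` under a short SIGKILL
# timeout, then scans /proc for D-state tasks and tries to read each one's
# cmdline (again under a timeout) to pinpoint the offender.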
# check whether the ps -ef command hangs
check_ps_hang() {
    echo "check if ps -ef command hangs" | tee -a ${diagnose_dir}/ps_command_status
    timeout -s 9 2 ps -ef > /dev/null
    if [ "$?" != "0" ]; then
        echo "ps -ef command is hung" | tee -a ${diagnose_dir}/ps_command_status
        is_ps_hang=true
        echo "checking which process causes the ps -ef hang" | tee -a ${diagnose_dir}/ps_command_status
        for f in $(find /proc/*/task -name status)
        do
            if grep -q "State.*D" "$f"; then
                cmdline=${f%%status}cmdline
                pid=${f%%status}
                stack=${f%%status}stack
                timeout -s 9 2 cat "$cmdline"
                if [ "$?" != "0" ]; then
                    echo "process $pid is in state D and causes the ps -ef hang, stack info:" | tee -a ${diagnose_dir}/ps_command_status
                    cat "$stack" | tee -a ${diagnose_dir}/ps_command_status
                fi
            fi
        done
        echo "finished checking which process causes the ps -ef hang" | tee -a ${diagnose_dir}/ps_command_status
    else
        echo "ps -ef command works fine" | tee -a ${diagnose_dir}/ps_command_status
    fi
}


#system status
system_status() {
    #mkdir -p ${diagnose_dir}/system_status
    run uptime | tee -a ${diagnose_dir}/system_status
    run top -b -n 1 | tee -a ${diagnose_dir}/system_status
    if [ "$is_ps_hang" == "false" ]; then
        run ps -ef | tee -a ${diagnose_dir}/system_status
    else
        echo "ps -ef command hangs, skip [ps -ef] check" | tee -a ${diagnose_dir}/system_status
    fi
    run netstat -nt | tee -a ${diagnose_dir}/system_status
    run netstat -nu | tee -a ${diagnose_dir}/system_status
    run netstat -ln | tee -a ${diagnose_dir}/system_status

    run df -h | tee -a ${diagnose_dir}/system_status

    run cat /proc/mounts | tee -a ${diagnose_dir}/system_status

    if [ "$is_ps_hang" == "false" ]; then
        run pstree -al | tee -a ${diagnose_dir}/system_status
    else
        echo "ps -ef command hangs, skip [pstree -al] check" | tee -a ${diagnose_dir}/system_status
    fi

    run lsof | tee -a ${diagnose_dir}/system_status

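    # For every numeric directory under /proc (one per PID), count the open
    # file descriptors in fd/, print the PID and its command line, then sort
    # numerically to show the ten processes holding the most descriptors.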
    (
        cd /proc
        find -maxdepth 1 -type d -name '[0-9]*' \
            -exec bash -c "ls {}/fd/ | wc -l | tr '\n' ' '" \; \
            -printf "fds (PID = %P), command: " \
            -exec bash -c "tr '\0' ' ' < {}/cmdline" \; \
            -exec echo \; | sort -rn | head | tee -a ${diagnose_dir}/system_status
    )
}


daemon_status() {
    run systemctl status docker -l | tee -a ${diagnose_dir}/docker_status
    run systemctl status containerd -l | tee -a ${diagnose_dir}/containerd_status
    run systemctl status container-storaged -l | tee -a ${diagnose_dir}/container-storaged_status
    run systemctl status kubelet -l | tee -a ${diagnose_dir}/kubelet_status
}

docker_status() {
    #mkdir -p ${diagnose_dir}/docker_status
    echo "check dockerd process"
    if [ "$is_ps_hang" == "false" ]; then
        run ps -ef | grep -E 'dockerd|docker daemon' | grep -v grep | tee -a ${diagnose_dir}/docker_status
    else
        echo "ps -ef command hangs, skip [ps -ef|grep -E 'dockerd|docker daemon'] check" | tee -a ${diagnose_dir}/docker_status
    fi

    #docker info
    run docker info | tee -a ${diagnose_dir}/docker_status
    run docker version | tee -a ${diagnose_dir}/docker_status
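    # SIGUSR1 asks dockerd to dump its goroutine stack traces into its run
    # directory; wait a moment for the dump to land, then collect the logs.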
    if [ -f /var/run/docker.pid ]; then
        sudo kill -SIGUSR1 $(cat /var/run/docker.pid)
        cp /var/run/docker/libcontainerd/containerd/events.log ${diagnose_dir}/containerd_events.log
        sleep 10
        cp /var/run/docker/*.log ${diagnose_dir}
    fi
}

showlog() {
    local file=$1
    if [ -f "$file" ]; then
        tail -n 200 "$file"
    fi
}

#collect log
common_logs() {
    log_tail_lines=10000
    mkdir -p ${diagnose_dir}/logs
    run dmesg -T | tail -n ${log_tail_lines} | tee ${diagnose_dir}/logs/dmesg.log
    tail -c 500M /var/log/messages &> ${diagnose_dir}/logs/messages
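    # If systemd is present, pull docker.service logs from journald; otherwise
    # fall back to the upstart log location used by older init systems.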
    pidof systemd && journalctl -n ${log_tail_lines} -u docker.service &> ${diagnose_dir}/logs/docker.log || tail -n ${log_tail_lines} /var/log/upstart/docker.log &> ${diagnose_dir}/logs/docker.log
}

archive() {
    tar -zcvf ${current_dir}/diagnose_${timestamp}.tar.gz ${diagnose_dir}
    echo "please get diagnose_${timestamp}.tar.gz for diagnostics"
}

varlogmessage() {
    grep cloud-init /var/log/messages > $diagnose_dir/varlogmessage.log
}

cluster_dump() {
    kubectl cluster-info dump > $diagnose_dir/cluster_dump.log
}

events() {
    kubectl get events > $diagnose_dir/events.log
}

core_component() {
    local comp="$1"
    local label="$2"
    mkdir -p $diagnose_dir/cs/$comp/
    local pods=$(kubectl get -n kube-system po -l $label=$comp | awk '{print $1}' | grep -v NAME)
    for po in ${pods}
    do
        kubectl logs -n kube-system ${po} &> $diagnose_dir/cs/${comp}/${po}.log
    done
}
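# Example (as invoked in pd_collect below):
#   core_component "kube-apiserver" "component"
# collects the logs of every kube-system pod matching component=kube-apiserver.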

etcd() {
    journalctl -u etcd -xe &> $diagnose_dir/cs/etcd.log
}

storageplugins() {
    mkdir -p ${diagnose_dir}/storage/
    cp /var/log/alicloud/* ${diagnose_dir}/storage/
}

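# Sandboxed-runtime status is only relevant on nodes that run containerd
# without dockerd; on any other node the function returns immediately.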
sandbox_runtime_status() {
    if [[ ! -z $(pidof dockerd) || -z $(pidof containerd) ]]; then
        return 0
    fi
    wget http://aliacs-k8s-cn-hangzhou.oss-cn-hangzhou.aliyuncs.com/public/diagnose/sandbox-runtime-status.tgz -q -O ${diagnose_dir}/sandbox-runtime-status.tgz
    tar -xzvf ${diagnose_dir}/sandbox-runtime-status.tgz -C ${diagnose_dir}
    pushd ${diagnose_dir}/sandbox-runtime-status
    bash script_collect.sh >> $diagnose_dir/sandbox_runtime.status
    popd
}

upload_oss() {
    if [[ "$UPLOAD_OSS" == "" ]]; then
        return 0
    fi

    bucket_path=${UPLOAD_OSS}
    diagnose_file=diagnose_${timestamp}.tar.gz

    if ! command_exists ossutil; then
        curl -s -o /usr/local/bin/ossutil http://gosspublic.alicdn.com/ossutil/1.6.10/ossutil64
        chmod u+x /usr/local/bin/ossutil
    fi

    region=$(curl -s http://100.100.100.200/latest/meta-data/region-id)
    endpoint="oss-$region.aliyuncs.com"
    if [[ "$ACCESS_KEY_ID" == "" ]]; then
        roleName=$(curl -s http://100.100.100.200/latest/meta-data/ram/security-credentials/)
        echo "
[Credentials]
    language = CH
    endpoint = $endpoint
[AkService]
    ecsAk=http://100.100.100.200/latest/meta-data/Ram/security-credentials/$roleName" > ./config
    else
        echo "
[Credentials]
    language = CH
    endpoint = $endpoint
    accessKeyID = $ACCESS_KEY_ID
    accessKeySecret = $ACCESS_KEY_SECRET
" > ./config
    fi
    bucket_name=${bucket_path%%/*}
    oss_endpoint=$(ossutil stat oss://$bucket_name --config-file ./config | grep ExtranetEndpoint | awk '{print $3}')
    if [[ "$oss_endpoint" != "" ]]; then
        endpoint=$oss_endpoint
    fi
    ossutil cp ./${diagnose_file} oss://$bucket_path/$diagnose_file --config-file ./config --endpoint $endpoint

    if [[ "$OSS_PUBLIC_LINK" != "" ]]; then
        ossutil sign --timeout 7200 oss://$bucket_path/$diagnose_file --config-file ./config --endpoint $endpoint
    fi
}

parse_args() {
    while [[ $# -gt 0 ]]
    do
        key="$1"

        case $key in
        --oss)
            export UPLOAD_OSS=$2
            shift
            ;;
        --oss-public-link)
            export OSS_PUBLIC_LINK="true"
            ;;
        --access-key-id)
            export ACCESS_KEY_ID=$2
            shift
            ;;
        --access-key-secret)
            export ACCESS_KEY_SECRET=$2
            shift
            ;;
        *)
            echo "unknown option [$key]"
            ;;
        esac
        shift
    done
}
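# Example invocation (script name, bucket path, and keys are placeholders):
#   bash diagnose.sh --oss my-bucket/diagnose --access-key-id AK --access-key-secret SK
# Without --access-key-id, upload_oss falls back to the instance RAM role.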

pd_collect() {
    os_env
    system_info
    service_status
    network_info
    check_ps_hang
    system_status
    docker_status
    sandbox_runtime_status
    common_logs

    varlogmessage
    core_component "cloud-controller-manager" "app"
    core_component "kube-apiserver" "component"
    core_component "kube-controller-manager" "component"
    core_component "kube-scheduler" "component"
    events
    storageplugins
    etcd
    cluster_dump
    archive
}

parse_args "$@"

pd_collect

upload_oss

echo "please upload diagnose_${timestamp}.tar.gz"