forked from facebook/rocksdb
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added simple monitoring script to monitor overusage of memory in db_b…
…ench Summary: RocksDB may use more memory than it was asked to. Monitor and report. Test Plan: run the program with conditions that simulate the overusage. It should report that the process is using more memory than needed. Reviewers: yhchiang, rven, sdong, igor Reviewed By: igor Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D33249
- Loading branch information
Ramki Balasubramanian
committed
Feb 12, 2015
1 parent
5f00af4
commit 5d1151d
Showing
2 changed files
with
319 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
#!/bin/bash | ||
# | ||
#(c) 2004-present, Facebook Inc. All rights reserved. | ||
# | ||
#see LICENSE file for more information on use/redistribution rights. | ||
# | ||
|
||
# | ||
#dbench_monitor: monitor db_bench process for violation of memory utilization | ||
# | ||
#default usage will monitor 'virtual memory size'. See below for standard options | ||
#passed to db_bench during this test. | ||
# | ||
# See also: ./pflag for the actual monitoring script that does the work | ||
# | ||
#NOTE: | ||
# You may end up with some /tmp/ files if db_bench OR | ||
# this script OR ./pflag was killed unceremoniously | ||
# | ||
# If you see the script taking a long time, trying "kill" | ||
# will usually cleanly exit. | ||
# | ||
# | ||
# Paths used by the rest of the script, all derived from where this script
# lives.  "$0" is quoted so a checkout path containing spaces still works.
DIR=$(dirname "$0")
# Per-run log that receives db_bench's stdout/stderr ($$ = this shell's PID).
LOG="/tmp/$(basename "$0").$$"
# The benchmark binary is expected one directory above this script.
DB_BENCH="$DIR/../db_bench"
# The monitoring helper is expected next to this script.
PFLAG="${DIR}/pflag"
|
||
# usage: print the help text on stdout and terminate the script.
# The here-doc is unquoted on purpose so that $0 expands to the name the
# script was invoked with.  Exits with status 0 (help was requested).
usage() {
  cat <<HELP
Usage: $0 [-h]
-h: prints this help message
This program will run the db_bench script to monitor memory usage
using the 'pflag' program. It launches db_bench with default settings
for certain arguments. You can change the defaults passed to
'db_bench' program, by setting the following environment
variables:
bs [block_size]
ztype [compression_type]
benches [benchmarks]
reads [reads]
threads [threads]
cs [cache_size]
vsize [value_size]
comp [compression_ratio]
num [num]
See the code for more info
HELP
  exit 0
}
|
||
# ---------------------------------------------------------------------------
# Main flow: launch db_bench in the background, then hand its PID to pflag.
# ---------------------------------------------------------------------------

# Help must work even when db_bench has not been built yet, so check it first.
[ "x$1" = "x-h" ] && usage

# Refuse to continue when the benchmark binary is missing or not executable.
if [ ! -x "${DB_BENCH}" ]; then
  echo "WARNING: ${DB_BENCH} doesn't exist, abort!" >&2
  # 255 is the status bash produced for the original, out-of-range 'exit -1'.
  exit 255
fi

# PID of the background db_bench.  Kept empty until the launch so the signal
# handler never runs 'kill' without a target.
PID=

# Signal handler: remove the log, stop db_bench if it was started, and really
# exit (the message says "exiting", so do not fall back into the script).
on_signal() {
  rm -f "${LOG}"
  if [ -n "${PID}" ]; then
    kill "${PID}" 2>/dev/null
  fi
  echo "Interrupted, exiting"
  exit 1
}
trap on_signal 1 2 3 15

touch "$LOG"

# Defaults for the db_bench flags; each may be overridden from the environment.
: "${bs:=16384}"
: "${ztype:=zlib}"
: "${benches:=readwhilewriting}"
: "${reads:=$((1*1024*1024))}"
: "${threads:=8}"
: "${vsize:=2000}"
: "${comp:=0.5}"
: "${num:=10000}"
: "${cs:=$((1*1024*1024*1024))}"

DEBUG=1 #Set to 0 to remove chattiness

# Build the argument list once so the debug echo and the real launch can never
# drift apart.
bench_args=(
  --block_size="$bs"
  --compression_type="$ztype"
  --benchmarks="$benches"
  --reads="$reads"
  --threads="$threads"
  --cache_size="$cs"
  --value_size="$vsize"
  --compression_ratio="$comp"
  --num="$num"
  --use_existing_db
)

# Treat both "0" and empty as quiet, matching the comment on DEBUG above.
if [ "${DEBUG:-0}" != "0" ]; then
  #
  #NOTE: under some circumstances, --use_existing_db may leave LOCK files under ${TMPDIR}/rocksdb/*
  #cleanup the dir and re-run
  #
  echo DEBUG: Will run "$DB_BENCH" "${bench_args[@]}"
fi

"$DB_BENCH" "${bench_args[@]}" >"$LOG" 2>&1 &
PID=$!

# '$?' right after '&' only reports that the fork worked, so it cannot detect
# a failed start.  Give db_bench a moment, and if it is already gone collect
# its real exit status with 'wait'.
sleep 1
if ! kill -0 "$PID" 2>/dev/null; then
  wait "$PID"
  bench_status=$?
  if [ "$bench_status" -ne 0 ]; then
    echo "WARNING: ${DB_BENCH} did not launch successfully! Abort!" >&2
    cat "$LOG" >&2
    rm -f "$LOG"
    exit 1
  fi
fi

#
#Start the monitoring. "vsz" is monitored up to the cache_size ($cs) value of
#virtual memory. pflag compares against ps(1) values, which are in KB, so the
#byte count in $cs is converted before being passed as the limit.
#You could also monitor RSS and CPUTIME (bsdtime). Try 'pflag -h' for how to do this
#
"${PFLAG}" -p "$PID" -l "$((cs / 1024))" -v

rm -f "$LOG"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,217 @@ | ||
#!/bin/bash | ||
# | ||
#(c) 2004-present, Facebook, all rights reserved. | ||
# See the LICENSE file for usage and distribution rights. | ||
# | ||
|
||
# On HUP/INT/QUIT/TERM: say so and leave with a failure status (a bare 'exit'
# would reuse the status of the preceding echo, i.e. success).
trap 'echo "Caught exception, dying"; exit 1' 1 2 3 15

# Script name (for messages) and host name (for reports).
ME=$(basename "$0")
SERVER=$(hostname 2>/dev/null) || SERVER=${HOSTNAME:-unknown}

#parameters used
#
Dump_Config=0   # set to 1 by -d: print the configuration and exit
DEBUG=          # non-empty enables verbose() output (-v)
OS=$(uname -s)  # looked up via PATH; uname is not under /bin everywhere
VMEM=           # ps column for virtual memory, filled in by oscheck
RSS=            # ps column for resident memory, filled in by oscheck
CPU=            # ps column for CPU time, filled in by oscheck
VERBOSE=
VAR=            # metric to watch (-x)
LIMIT=          # threshold for the metric (-l)
ACTION=         # what to do on breach (-a)
N=              # number of monitoring cycles (-n)
WAIT=           # seconds between cycles (-w)
|
||
# | ||
#supported OS: Linux only for now. Easy to add | ||
# | ||
# oscheck: select the ps(1) column names for the running OS.
# Globals read:    OS, VAR
# Globals written: VMEM, RSS, CPU, and VAR when it holds a symbolic name
# Calls die for any OS other than Linux.
oscheck() {
  case ${OS} in
    Linux)
      VMEM=vsz
      RSS=rss
      CPU=bsdtime
      ;;
    *)
      die "Unsupported OS ${OS}. Send a bug report with OS you need supported. Thanks."
      ;;
  esac

  # The help text advertises '-x {VMEM|RSS|CPU}', but ps only understands the
  # OS-specific column names chosen above.  Translate the symbolic names here;
  # anything else (including a raw ps column such as "vsz") passes through.
  case ${VAR} in
    VMEM) VAR=${VMEM} ;;
    RSS) VAR=${RSS} ;;
    CPU) VAR=${CPU} ;;
  esac
}
|
||
|
||
# verbose MESSAGE...: write MESSAGE to stderr, but only when DEBUG is
# non-empty (DEBUG is set by the -v option).
verbose() {
  if [ -n "$DEBUG" ]; then
    echo "$@" >&2
  fi
}
|
||
# warn MESSAGE...: write MESSAGE to stderr unconditionally.
warn() {
  echo "$@" >&2
}
|
||
# die MESSAGE...: report a fatal error on stderr and terminate the script.
# Exits with status 1; a bare 'exit' would reuse the status of the echo and
# report success to the caller.
die() {
  echo "ERROR: " "$@" >&2
  exit 1
}
|
||
# dump_config: print the effective configuration of this run on stdout.
# Globals read: ME, SERVER (falling back to bash's HOSTNAME), PID, VAR, LIMIT,
#               WAIT, N, ACTION
dump_config() {
  cat <<EOCONFIG
$ME running on ${SERVER:-${HOSTNAME}} at $(date)
Configuration for this run:
PID to monitor : ${PID}
Resource monitored : ${VAR}
Resource limit : ${LIMIT}
Check every : ${WAIT} seconds
No. of times run : ${N}
What to do : ${ACTION}
EOCONFIG
}
|
||
# usage [MESSAGE...]: print MESSAGE (if any) followed by the help text on
# stdout, then terminate the script.
# Exit status: 0 when called without a message (help was requested),
#              1 when a message is given (the caller is reporting an error).
usage() {
  local status=0
  if [ $# -gt 0 ]; then
    status=1
  fi
  cat <<USAGE
$*
Usage ${ME} -p pid [-x {VMEM|RSS|CPU}] -l limit [-a {warn|die|kill}] [-n cycles] [-w wait]
Monitor a process for set of violations. Options:
-p: PID of process to monitor
-x: metric to sense. Currently only VMEM/RSS/CPU are supported. Defaults to VMEM
-l: what is the threshold/limit for the metric that is being sensed.
Examples: "-l 100m", "-l 1.5g" (for VMEM/RSS), "-l 5:04" 5:04 in BSDTIME for CPU
NOTE: defaults to 1GB
-a: action. Currently {warn|die|kill} are supported.
The default action is to 'warn'. Here is the behavior:
warn: complain if usage exceeds threshold, but continue monitoring
kill: complain, kill the db_bench process and exit
die: if usage exceeds threshold, die immediately
-n: number of cycles to monitor. Default is to monitor until PID no longer exists.
-w: wait time per cycle of monitoring. Default is 5 seconds.
-v: verbose messaging
-d: dump the configuration for this run and exit
-h: print this help message and exit
USAGE
  exit "$status"
}
|
||
#set default values if none given | ||
# set_defaults_if_noopt_given: fill in every setting the command line left
# empty.  Values that are already set are not touched.
# Globals read:    VMEM
# Globals written: VAR, LIMIT, WAIT, N, ACTION
set_defaults_if_noopt_given() {
  # Default metric is virtual memory: the OS-specific column when oscheck has
  # already chosen one, otherwise the Linux name.
  : "${VAR:=${VMEM:-vsz}}"
  # The help text promises a 1GB default; ps reports memory in KB, so that is
  # 1024 * 1024 KB.
  : "${LIMIT:=1048576}"
  : "${WAIT:=5}"
  : "${N:=999999}"
  : "${ACTION:=warn}"
}
|
||
# validate_options: reject unusable settings before monitoring starts.
# Globals read: PID, Dump_Config, ACTION
# Calls usage (which terminates the script) when:
#   - no PID was given and the run is not a plain configuration dump (-d)
#   - the PID contains anything other than digits
#   - the action is not one of warn, die or kill (an unknown action would
#     otherwise end the monitoring silently on the first breach)
validate_options() {
  if [ -z "$PID" ] && [ "${Dump_Config:-0}" -ne 1 ]; then
    usage "PID is mandatory"
  fi

  case $PID in
    *[!0-9]*) usage "PID must be numeric, got '${PID}'" ;;
  esac

  case ${ACTION:-warn} in
    warn | die | kill) ;;
    *) usage "Unknown action '${ACTION}' (expected warn, die or kill)" ;;
  esac
}
|
||
###### START | ||
|
||
|
||
# parse_options ARGS...: read the command-line options into the globals
# Dump_Config, ACTION, DEBUG, PID, VAR, LIMIT, WAIT and N.
# The leading ':' in the option string selects getopts' silent mode, in which
# a missing argument is reported as ':' and an unknown option as '?'.
# The documented wait option is -w; the original option string listed 't:'
# instead, so -w was rejected.  -t is still accepted as a legacy spelling.
parse_options() {
  local opt OPTIND=1
  while getopts ":p:x:l:a:n:w:t:vhd" opt; do
    case $opt in
      d)
        Dump_Config=1
        ;;
      h)
        usage
        ;;
      a)
        ACTION=${OPTARG}
        ;;
      v)
        DEBUG=1
        ;;
      p)
        PID=$OPTARG
        ;;
      x)
        VAR=$OPTARG
        ;;
      l)
        LIMIT=$OPTARG
        ;;
      w | t)
        WAIT=$OPTARG
        ;;
      n)
        N=$OPTARG
        ;;
      :)
        usage "Option -${OPTARG} requires an argument"
        ;;
      \?)
        usage "Unknown option -${OPTARG}"
        ;;
    esac
  done
}

parse_options "$@"
|
||
# Start-up sequence.  The order matters: oscheck chooses the ps column names,
# the defaults are filled in afterwards, and only then are the settings
# validated.
oscheck
set_defaults_if_noopt_given
validate_options

# -d: print the effective configuration and stop without monitoring.
if [ "${Dump_Config:-0}" -eq 1 ]; then
  dump_config
  exit
fi
|
||
# to_number VALUE: print VALUE as a plain integer so it can be compared with
# the shell's integer tests.
#   "2048"  -> 2048      plain numbers pass through (ps memory columns, KB)
#   "100m"  -> 102400    'm' suffix: megabytes expressed in KB
#   "1.5g"  -> 1572864   'g' suffix: gigabytes expressed in KB
#   "5:04"  -> 304       colon-separated time (bsdtime MMM:SS) in seconds
#   ""      -> 0
to_number() {
  awk -v v="$1" 'BEGIN {
    gsub(/[ \t\r\n]/, "", v)
    if (v == "") { print 0; exit }
    if (index(v, ":") > 0) {
      n = split(v, part, ":")
      secs = 0
      for (i = 1; i <= n; i++) secs = secs * 60 + part[i]
      printf "%.0f\n", secs
      exit
    }
    scale = 1
    if (v ~ /[mM]$/) scale = 1024
    else if (v ~ /[gG]$/) scale = 1024 * 1024
    sub(/[mMgG]$/, "", v)
    printf "%.0f\n", v * scale
  }'
}

# monitor: sample metric VAR of process PID every WAIT seconds, for at most N
# cycles, and apply ACTION whenever the sample reaches LIMIT.
# Globals read:    PID, VAR, LIMIT, WAIT, N, ACTION, SERVER
# Globals written: VAL (the latest normalised sample)
# Returns when the process is gone, when N cycles have run, or when ACTION is
# not recognised; exits the script for the 'kill' and 'die' actions.
monitor() {
  local cycles=0 raw limit
  # Normalise the limit once so "-l 100m", "-l 1.5g" and "-l 5:04" work as
  # the help text documents; messages still show the limit as it was typed.
  limit=$(to_number "$LIMIT")

  verbose "Trying ${N} times, Waiting ${WAIT} seconds each iteration"

  while [ "$cycles" -lt "${N:-999999}" ]; do
    cycles=$((cycles + 1))

    # 'name=' asks ps for the column without a header line.
    raw=$(ps -p "$PID" -o "${VAR}=")
    VAL=$(to_number "$raw")

    # No output means the process is gone.  A zero memory reading is treated
    # the same way; time readings contain ':' and may legitimately be 0:00.
    if [ -z "$raw" ] || { [ "$VAL" -eq 0 ] && [ "${raw#*:}" = "$raw" ]; }; then
      warn "Process $PID ended without incident."
      return 0
    fi

    if [ "$VAL" -lt "$limit" ]; then
      echo "Value of '${VAR}' (${VAL}) is less than ${LIMIT} for PID ${PID}"
      sleep "$WAIT"
      continue
    fi

    # The limit has been reached: apply the configured action.
    case $ACTION in
      kill)
        # Documented behaviour is "complain, kill ... and exit".
        warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}; killing ${PID}"
        kill "${PID}" || kill -3 "${PID}"
        exit
        ;;
      warn)
        warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}"
        # Keep monitoring, but wait like any other cycle; without this the
        # loop spins and floods stderr for as long as the value stays high.
        sleep "$WAIT"
        ;;
      die)
        warn "WARNING: dying without killing process ${PID} on ${SERVER}"
        warn "The process details are below: "
        warn "$(ps -p "${PID}" -o pid,ppid,bsdtime,rss,vsz,cmd,args)"
        warn ""

        #should we send email/notify someone? TODO... for now, bail.

        # 255 is the status bash produced for the original 'exit -1'.
        exit 255
        ;;
      *)
        # Unknown action: stop monitoring, as the original loop did.
        return 0
        ;;
    esac
  done

  verbose "Stopped after ${cycles} monitoring cycles"
}

monitor
|