After digging around, I've located two culprits: one is the Hive Thrift server we run manually for Tableau connections, and the other is Hue.
To make a long story short, here's my solution:
#!/bin/bash
# This script is executed every morning via crontab to keep ZooKeeper from choking. It's been running happily for a few months now :)
# Function check thrift process id
#
# Reads the PID stored in ~/thrift.pid and checks whether exactly that
# process exists and is a java process.
# Globals written:
#   pid    - contents of ~/thrift.pid
#   result - 0 if a java process with that PID is running, 1 otherwise
# Returns: always 0; callers inspect $result instead of the return status.
check_pid() {
  pid=$(cat ~/thrift.pid)
  result=1
  # Only a plain decimal PID is acceptable; an empty or garbled pid file must
  # never be reported as a running server.
  case "$pid" in
    '' | *[!0-9]*) return 0 ;;
  esac
  # Look the PID up directly instead of grepping the whole process table,
  # where e.g. PID 123 would also match a java process with PID 1234.
  if ps -p "$pid" -o comm= 2>/dev/null | grep -q java; then
    result=0
  fi
  return 0
}
# Function restart Hue service
#
# POSTs a restart command for service "hue1" of cluster "TEST - CDH4" to the
# HTTP API on localhost:7180 (presumably the Cloudera Manager REST API --
# confirm against the deployment).
# Environment:
#   CM_CREDENTIALS - optional "user:password" for the API; defaults to the
#                    credentials this script has always used.
# Outputs: nothing (curl output is discarded).
# Returns: curl's exit status. --fail makes curl exit non-zero when the server
#          answers with an HTTP error, so the caller's status check can
#          actually detect a rejected restart request.
restart_hue() {
  local credentials=${CM_CREDENTIALS:-admin:admin}
  curl --fail --silent -X POST -u "$credentials" \
    'http://localhost:7180/api/v2/clusters/TEST%20-%20CDH4/services/hue1/commands/restart' \
    >/dev/null 2>&1
}
# Function kill thrift server
#
# Stops the process whose PID is in the global $pid: sends SIGTERM, waits two
# seconds, and escalates to SIGKILL if check_pid still reports it running.
# Globals read:    pid, result (result is refreshed by check_pid)
# Outputs:         error messages on stderr
# Exits the script with status 1 if $pid is not a usable PID or if the
# process is still running after SIGKILL.
kill_thrift() {
  # Never call kill with an empty or garbled PID; PID 0 is refused as well
  # because kill would then signal the whole process group of this script.
  case "$pid" in
    '' | 0 | *[!0-9]*)
      echo "Error: no valid thrift server PID to kill" >&2
      exit 1
      ;;
  esac

  # Polite request first.
  kill "$pid"
  sleep 2
  check_pid
  if [[ "$result" == 0 ]]; then
    # Still alive after SIGTERM: force it.
    kill -9 "$pid"
    sleep 2
  fi

  # Final verdict, whichever signal was needed.
  check_pid
  if [[ "$result" == 0 ]]; then
    echo "Error: Failed to kill server" >&2
    exit 1
  fi
}
# Sanity
#
# Abort early (exit status 1, message on stderr) unless the PID file
# ~/thrift.pid exists and the script is running as user "admin".
if [[ ! -f ~/thrift.pid ]]; then
  echo "Error: thrift process id file ~/thrift.pid not found." >&2
  exit 1
fi

# The command substitution is quoted on purpose: if whoami printed nothing,
# an unquoted test would be a syntax error that the `if` treats as "false",
# silently skipping this check.
if [[ "$(whoami)" != "admin" ]]; then
  echo "Error: you are not logged in or executing as user admin, please do so." >&2
  exit 1
fi
# Main
#
# 1. Restart Hue via restart_hue.
# 2. Send "srvr" to localhost:2181 and read the open-connection count from
#    the reply line containing "connections".
# 3. If more than 100 connections are open, kill the running Thrift server,
#    start a new one and store its PID in ~/thrift.pid.
# Exit status: 0 on success, 1 on any error (messages go to stderr).
if ! restart_hue; then
  echo "Error restarting Hue service, please check cluster integrity." >&2
  exit 1
fi

if ! zookeeper_connections=$(echo srvr | nc localhost 2181 | grep -i connections); then
  echo "Error: no reply from Zookeeper." >&2
  exit 1
fi

# Keep only what follows the last ": ", then drop anything after the leading
# digits (e.g. a stray carriage return).
zookeeper_connections=${zookeeper_connections##*: }
zookeeper_connections=${zookeeper_connections%%[!0-9]*}
if [[ -z "$zookeeper_connections" ]]; then
  # A malformed reply must not be mistaken for "below the threshold".
  echo "Error: could not parse connection count from Zookeeper reply." >&2
  exit 1
fi

exit_code=0
echo "Zookeeper has $zookeeper_connections open connections."
if [ "$zookeeper_connections" -gt 100 ]; then
  check_pid
  if [ "$result" -gt 0 ]; then
    echo "Error: can not find thrift server PID" >&2
    exit 1
  fi
  echo "Found Thrift server, restarting"
  kill_thrift

  # HIVE_PORT is exported for the hive launcher (presumably the port the new
  # server listens on -- confirm against the Hive configuration).
  export HIVE_PORT=10001
  hive --service hiveserver &
  pid=$!

  # Record the new PID for the next run; failing to do so is reported through
  # the exit status, as before.
  if ! echo "$pid" > ~/thrift.pid; then
    echo "Error: could not write ~/thrift.pid" >&2
    exit_code=1
  fi
fi
exit "$exit_code"