Skip to content
Snippets Groups Projects
Commit 792e8985 authored by root's avatar root
Browse files

Added check_hadoop_apps

parent b49c4283
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/python3
# Author: Devon Merner <dmerner>
# Date: 2022-01-13
# Purpose: To monitor Hadoop applications and kill applications that are stuck
import os
import sys
import glob
import re
import argparse
import subprocess
import time
# Command-line thresholds, both expressed in seconds of application run time.
# type=int makes argparse reject non-numeric values with a usage error up
# front, instead of a ValueError traceback surfacing later when the value is
# compared against an application's running time.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-c",
    default=28800,
    type=int,
    dest="critical",
    help="Critical and kill applications running above this amount of seconds",
)
parser.add_argument(
    "-w",
    default=14400,
    type=int,
    dest="warning",
    help="Warn applications running above this amount of seconds",
)
options = parser.parse_args()
# Plugin return codes understood by Icinga
STATE_OK, STATE_WARNING, STATE_CRITICAL, STATE_UNKNOWN = 0, 1, 2, 3

# Text label printed for each return code
STATE_LABEL = {
    STATE_OK: "OK",
    STATE_WARNING: "WARNING",
    STATE_CRITICAL: "CRITICAL",
    STATE_UNKNOWN: "UNKNOWN",
}

# Exit code used when nothing raises it
EXIT_CODE = STATE_OK
def hadoop_query_running_apps():
    """Return the applications YARN currently reports as RUNNING.

    Runs ``yarn application -list -appStates RUNNING`` and parses its output.

    Returns:
        dict: Maps each Application-Id to a dict holding that application's
        ``'Application-Name'``.

    Exits the process with STATE_UNKNOWN, after printing a plugin status
    line, when the ``yarn`` command cannot be found or finishes with a
    non-zero exit status.
    """
    command = ["yarn", "application", "-list", "-appStates", "RUNNING"]
    try:
        # communicate() drains the pipe and reaps the child, so neither a
        # zombie process nor an open pipe is left behind.
        with subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        ) as proc:
            raw_output, _ = proc.communicate()
    except FileNotFoundError:
        print("UNKNOWN - Could not detect yarn command. |")
        sys.exit(STATE_UNKNOWN)

    if proc.returncode != 0:
        # A failed listing would otherwise be parsed as if it were data.
        print("UNKNOWN - yarn application -list exited with status {0}. |".format(proc.returncode))
        sys.exit(STATE_UNKNOWN)

    data = {}
    for line in raw_output.decode("utf-8", errors="replace").splitlines():
        # Prefer tab-separated columns so that application names containing
        # spaces stay intact; fall back to whitespace splitting otherwise.
        raw_fields = line.split("\t") if "\t" in line else line.split()
        fields = [field.strip() for field in raw_fields]
        # stderr is merged into stdout, so the number of header/log lines is
        # not fixed. Keep only rows whose first column is an application id
        # rather than discarding a hard-coded count of leading lines.
        if len(fields) < 2 or not fields[0].startswith("application_"):
            continue
        # 0 = Application-Id, 1 = Application-Name
        data[fields[0]] = {"Application-Name": fields[1]}
    return data
def hadoop_query_app(appID):
    """Return the start time and running time of one YARN application.

    Runs ``yarn application -status <appID>`` and reads the Start-Time line
    of the report.

    Args:
        appID: Application-Id handed to ``yarn application -status``.

    Returns:
        dict: ``'Start-Time'`` (epoch seconds) and ``'Running-Time'``
        (seconds elapsed since the start time). The dict is empty when the
        report contains no parsable Start-Time line.

    Exits the process with STATE_UNKNOWN, after printing a plugin status
    line, when the ``yarn`` command cannot be found.
    """
    try:
        # communicate() drains the pipe and reaps the child process.
        with subprocess.Popen(
            ["yarn", "application", "-status", appID],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        ) as proc:
            raw_output, _ = proc.communicate()
    except FileNotFoundError:
        print("UNKNOWN - Could not detect yarn command. |")
        sys.exit(STATE_UNKNOWN)

    data = {}
    for line in raw_output.decode("utf-8", errors="replace").splitlines():
        # Only accept a line that really carries a numeric value, so a log
        # line that merely mentions Start-Time cannot raise while parsing.
        match = re.search(r"Start-Time\s*:\s*(\d+)", line)
        if match is None:
            continue
        # Convert Start-Time (ms) into seconds
        data["Start-Time"] = int(match.group(1)) // 1000
        data["Running-Time"] = int(time.time()) - data["Start-Time"]
    return data
def yarn_kill_app(appID):
    """Ask YARN to kill an application and wait for the command to finish.

    Args:
        appID: Application-Id handed to ``yarn application -kill``.

    Exits the process with STATE_UNKNOWN, after printing a plugin status
    line, when the ``yarn`` command cannot be found.
    """
    try:
        with subprocess.Popen(
            ["yarn", "application", "-kill", appID],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        ) as proc:
            # Wait for the kill command to complete: this drains the pipe and
            # reaps the child, so the check cannot exit while the kill request
            # is still in flight. The command's output is not used.
            proc.communicate()
    except FileNotFoundError:
        print("UNKNOWN - Could not detect yarn command. |")
        sys.exit(STATE_UNKNOWN)
def main():
    """Check every running YARN application against the time thresholds.

    An application running for at least ``options.critical`` seconds is
    killed and reported as CRITICAL; one running for at least
    ``options.warning`` seconds is reported as WARNING. The process exits
    with the worst state seen across all applications.
    """
    # Parse the thresholds once instead of on every loop iteration.
    critical_after = int(options.critical)
    warning_after = int(options.warning)

    exit_code = STATE_OK
    reported = False

    for app_id, app in hadoop_query_running_apps().items():
        running_time = hadoop_query_app(app_id).get('Running-Time')
        if running_time is None:
            # The status report carried no start time, so the application's
            # age is unknown; skip it rather than fail with a KeyError.
            continue
        running_time = int(running_time)
        hours = running_time // 3600

        if running_time >= critical_after:
            yarn_kill_app(app_id)
            exit_code = max(exit_code, STATE_CRITICAL)
            print("[{0}] {1}: {2}: Has been running for {3} hours. Killing application... | ".format(app_id, app['Application-Name'], STATE_LABEL[STATE_CRITICAL], str(hours)))
            reported = True
        elif running_time >= warning_after:
            exit_code = max(exit_code, STATE_WARNING)
            # Explain the WARNING exit code instead of exiting silently.
            print("[{0}] {1}: {2}: Has been running for {3} hours. | ".format(app_id, app['Application-Name'], STATE_LABEL[STATE_WARNING], str(hours)))
            reported = True

    if not reported:
        # A plugin should always emit at least one line of status output.
        print("OK - No Hadoop applications running above the warning threshold. |")

    sys.exit(exit_code)


if __name__ == "__main__":
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment