Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
I
icinga-plugins
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
cscf
icinga-plugins
Commits
792e8985
Commit
792e8985
authored
3 years ago
by
root
Browse files
Options
Downloads
Patches
Plain Diff
Added check_hadoop_apps
parent
b49c4283
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
check_hadoop_apps
+122
-0
122 additions, 0 deletions
check_hadoop_apps
with
122 additions
and
0 deletions
check_hadoop_apps
0 → 100755
+
122
−
0
View file @
792e8985
#!/usr/bin/python3
# Author: Devon Merner <dmerner>
# Date: 2022-01-13
# Purpose: To monitor Hadoop applications and kill applications that are stuck
import
os
import
sys
import
glob
import
re
import
argparse
import
subprocess
import
time
# Command-line thresholds, both expressed in seconds of application run time.
# type=int makes argparse reject a non-numeric value with a usage error up
# front, instead of a ValueError traceback surfacing later in main().
parser = argparse.ArgumentParser()
parser.add_argument(
    "-c",
    type=int,
    default=28800,
    dest="critical",
    help="Critical and kill applications running above this amount of seconds",
)
parser.add_argument(
    "-w",
    type=int,
    default=14400,
    dest="warning",
    help="Warn applications running above this amount of seconds",
)
options = parser.parse_args()
# Icinga plugin exit statuses, numbered 0 through 3
STATE_OK, STATE_WARNING, STATE_CRITICAL, STATE_UNKNOWN = range(4)

# Text shown for each exit status
STATE_LABEL = dict(
    zip(
        (STATE_OK, STATE_WARNING, STATE_CRITICAL, STATE_UNKNOWN),
        ("OK", "WARNING", "CRITICAL", "UNKNOWN"),
    )
)

# Exit status used until a check raises it
EXIT_CODE = STATE_OK
def hadoop_query_running_apps():
    """Return the applications YARN currently lists as RUNNING.

    Runs ``yarn application -list -appStates RUNNING`` and parses the
    whitespace-separated table it prints.

    Returns:
        dict: maps each Application-Id to ``{'Application-Name': <name>}``.
        Only the first whitespace-delimited token of the name column is
        kept, because rows are split on whitespace.

    Exits the plugin with STATE_UNKNOWN when the ``yarn`` command is missing
    or returns a non-zero exit status.
    """
    try:
        # run() waits for the command to finish, so no child is left unreaped.
        # stderr is merged into stdout, so log lines can be mixed into the table.
        hadoop = subprocess.run(
            ["yarn", "application", "-list", "-appStates", "RUNNING"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
    except FileNotFoundError:
        print("UNKNOWN - Could not detect yarn command. |")
        sys.exit(STATE_UNKNOWN)

    if hadoop.returncode != 0:
        # An empty result from a failed query must not be reported as "OK".
        print("UNKNOWN - yarn application -list exited with status {0}. |".format(hadoop.returncode))
        sys.exit(STATE_UNKNOWN)

    data = {}
    for res in hadoop.stdout.decode("UTF8", errors="replace").splitlines():
        # 0 = Application-Id, 1 = Application-Name
        fields = res.split()
        # Select real table rows by their id prefix rather than dropping a
        # fixed number of header lines: the number of banner/log lines varies,
        # and blank lines would otherwise raise IndexError.
        if len(fields) < 2 or not fields[0].startswith("application_"):
            continue
        data[fields[0]] = {"Application-Name": fields[1]}
    return data
def hadoop_query_app(appID):
    """Return the start time and running time of one YARN application.

    Runs ``yarn application -status <appID>`` and reads the ``Start-Time``
    line of the report, whose value is in milliseconds.

    Args:
        appID: the YARN application id to query.

    Returns:
        dict: ``{'Start-Time': <seconds>, 'Running-Time': <seconds since
        start>}``, or an empty dict when the report has no parsable
        ``Start-Time`` line.

    Exits the plugin with STATE_UNKNOWN when the ``yarn`` command is missing.
    """
    try:
        # run() waits for the command to finish, so no child is left unreaped.
        yarn = subprocess.run(
            ["yarn", "application", "-status", appID],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
    except FileNotFoundError:
        print("UNKNOWN - Could not detect yarn command. |")
        sys.exit(STATE_UNKNOWN)

    data = {}
    for line in yarn.stdout.decode("UTF8", errors="replace").splitlines():
        key, _, value = line.partition(":")
        # Require the field name to be exactly "Start-Time" and the value to
        # be numeric; stderr is merged in, so a log line merely mentioning
        # Start-Time must not reach int().
        if key.strip() != "Start-Time" or not value.strip().isdigit():
            continue
        # Convert Start-Time (ms) into seconds
        data["Start-Time"] = int(value) // 1000
        data["Running-Time"] = int(time.time()) - data["Start-Time"]
    return data
def yarn_kill_app(appID):
    """Kill one YARN application with ``yarn application -kill <appID>``.

    Blocks until the command has finished, so the plugin cannot exit while
    the kill request is still in flight. The command's output is discarded.

    Args:
        appID: the YARN application id to kill.

    Exits the plugin with STATE_UNKNOWN when the ``yarn`` command is missing.
    """
    try:
        # DEVNULL instead of an unread PIPE: nothing consumes the output, and
        # it must not leak into the plugin's own status line.
        subprocess.run(
            ["yarn", "application", "-kill", appID],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
        )
    except FileNotFoundError:
        print("UNKNOWN - Could not detect yarn command. |")
        sys.exit(STATE_UNKNOWN)
def main():
    """Check every running YARN application against the time thresholds.

    For each application returned by hadoop_query_running_apps():
      * running time >= options.critical: the application is killed and
        reported as CRITICAL;
      * running time >= options.warning: it is reported as WARNING;
      * running time unavailable: it is reported as UNKNOWN.

    One line is printed per reported application, or a single OK line when
    nothing needed reporting, so the plugin never finishes without output.
    The process then exits with the resulting state: CRITICAL overrides
    everything, WARNING overrides UNKNOWN and OK, UNKNOWN overrides only OK.
    """
    critical_after = int(options.critical)
    warning_after = int(options.warning)
    exit_code = STATE_OK
    reported = False

    for app_id, app in hadoop_query_running_apps().items():
        running_time = hadoop_query_app(app_id).get('Running-Time')

        if running_time is None:
            # The status report had no usable Start-Time; do not guess.
            print("[{0}] {1}: {2}: Could not determine running time. |".format(
                app_id, app['Application-Name'], STATE_LABEL[STATE_UNKNOWN]))
            reported = True
            if exit_code == STATE_OK:
                exit_code = STATE_UNKNOWN
            continue

        hours = str(int(running_time / 3600))

        if running_time >= critical_after:
            yarn_kill_app(app_id)
            exit_code = STATE_CRITICAL
            print("[{0}] {1}: {2}: Has been running for {3} hours. Killing application... |".format(
                app_id, app['Application-Name'], STATE_LABEL[STATE_CRITICAL], hours))
            reported = True
        elif running_time >= warning_after:
            if exit_code != STATE_CRITICAL:
                exit_code = STATE_WARNING
            print("[{0}] {1}: {2}: Has been running for {3} hours. |".format(
                app_id, app['Application-Name'], STATE_LABEL[STATE_WARNING], hours))
            reported = True

    if not reported:
        print("OK - No applications running above the warning threshold. |")

    sys.exit(exit_code)
# Run the check only when this file is executed as a script.
if __name__ == "__main__":
    main()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment