# File: //opt/perf/newconfig/alarmdef
# @(#)alarmdef 11.00.000 for PA/Linux =*=
#
# HP Performance Agent alarm definitions
#
# The perfalarm daemons read the alarmdef file to configure alarms.
# The /var/opt/perf/alarmdef file should be edited to meet the needs of your
# environment. User changes to this file will be preserved when Perf Agent is
# updated to a later release. The /opt/perf/newconfig/alarmdef file will
# always contain the installed release's default alarms. Additional
# examples exist under /opt/perf/examples/config.
# See your Perf Agent User's Manual for more information.
# Below are the primary CPU, Disk, Memory, and Network Bottleneck alarms.
# For each area, a bottleneck symptom is calculated, and the resulting
# bottleneck probability is used to define yellow or red alerts.
# The CPU bottleneck symptom default is influenced mostly by the overall
# cpu utilization. Note that cpu utilization may be high even though
# there is no bottleneck. The run queue is an indicator that processes are
# waiting for cpu resources, and that the cpu may be bottlenecked.
# CPU_Bottleneck: four rules at prob 25 each -- three CPU-utilization tiers
# (75/85/90) and one run-queue rule.
# NOTE(review): the prob values of the rules that hold are presumably summed
# into the symptom value -- confirm against the Perf Agent User's Manual.
symptom CPU_Bottleneck type=CPU
    rule GBL_CPU_TOTAL_UTIL > 75 prob 25
    rule GBL_CPU_TOTAL_UTIL > 85 prob 25
    rule GBL_CPU_TOTAL_UTIL > 90 prob 25
    rule GBL_RUN_QUEUE > 2 prob 25
# Alarm once CPU_Bottleneck has stayed above 50 for 5 minutes. The start
# action sends a red alert when the symptom is above 90 and a yellow alert
# otherwise; the same red/yellow choice is repeated every 10 minutes, and a
# reset alert is sent when the alarm ends.
alarm CPU_Bottleneck > 50 for 5 minutes
    type = "CPU"
    start
        if CPU_Bottleneck > 90 then
            red alert "CPU Bottleneck probability= ", CPU_Bottleneck, "%"
        else
            yellow alert "CPU Bottleneck probability= ", CPU_Bottleneck, "%"
    repeat every 10 minutes
        if CPU_Bottleneck > 90 then
            red alert "CPU Bottleneck probability= ", CPU_Bottleneck, "%"
        else
            yellow alert "CPU Bottleneck probability= ", CPU_Bottleneck, "%"
    end
        reset alert "End of CPU Bottleneck Alert"
# The Disk bottleneck symptom default is influenced mostly by the busiest
# disk's utilization. The disk request queue is an indicator that processes
# may be waiting for disk resources.
# Disk_Bottleneck: the first rule uses the peak disk utilization itself as
# its prob value once that utilization exceeds 50; the request-queue rule
# has a fixed prob of 25.
symptom Disk_Bottleneck type=DISK
    rule GBL_DISK_UTIL_PEAK > 50 prob GBL_DISK_UTIL_PEAK
    rule GBL_DISK_REQUEST_QUEUE > 3 prob 25
# Alarm once Disk_Bottleneck has stayed above 50 for 5 minutes. Red alert
# when the symptom is above 90, yellow otherwise; re-evaluated and re-sent
# every 10 minutes, with a reset alert when the alarm ends.
alarm Disk_Bottleneck > 50 for 5 minutes
    type = "Disk"
    start
        if Disk_Bottleneck > 90 then
            red alert "Disk Bottleneck probability= ", Disk_Bottleneck, "%"
        else
            yellow alert "Disk Bottleneck probability= ", Disk_Bottleneck, "%"
    repeat every 10 minutes
        if Disk_Bottleneck > 90 then
            red alert "Disk Bottleneck probability= ", Disk_Bottleneck, "%"
        else
            yellow alert "Disk Bottleneck probability= ", Disk_Bottleneck, "%"
    end
        reset alert "End of Disk Bottleneck Alert"
# The Memory bottleneck symptom default is triggered by a combination
# of several metrics. Excessive page outs can be an indicator of memory
# pressure when the memory utilization is high, however memory-mapped
# file writes also generate pageouts. Under heavy memory pressure, data
# will start to be swapped out.
# Memory_Bottleneck: two memory-utilization tiers (95/98), one page-out
# rate rule, and two swap-out rate tiers (>0 and >10).
# NOTE(review): the listed prob values total 140; how perfalarm treats a
# combined value above 100 is not visible here -- confirm in the manual.
symptom Memory_Bottleneck type=MEMORY
    rule GBL_MEM_UTIL > 95 prob 30
    rule GBL_MEM_UTIL > 98 prob 20
    rule GBL_MEM_PAGEOUT_BYTE_RATE > 200 prob 20
    rule GBL_MEM_SWAPOUT_BYTE_RATE > 0 prob 20
    rule GBL_MEM_SWAPOUT_BYTE_RATE > 10 prob 50
# Alarm once Memory_Bottleneck has stayed above 50 for 5 minutes. Red alert
# when the symptom is above 90, yellow otherwise; re-evaluated and re-sent
# every 10 minutes, with a reset alert when the alarm ends.
alarm Memory_Bottleneck > 50 for 5 minutes
    type = "Memory"
    start
        if Memory_Bottleneck > 90 then
            red alert "Memory Bottleneck probability= ", Memory_Bottleneck, "%"
        else
            yellow alert "Memory Bottleneck probability= ", Memory_Bottleneck, "%"
    repeat every 10 minutes
        if Memory_Bottleneck > 90 then
            red alert "Memory Bottleneck probability= ", Memory_Bottleneck, "%"
        else
            yellow alert "Memory Bottleneck probability= ", Memory_Bottleneck, "%"
    end
        reset alert "End of Memory Bottleneck Alert"
# The Network bottleneck symptom default relies on general throughput
# metrics. Not all network interfaces report collision data. To be
# useful as a bottleneck indicator, the rate thresholds should be
# adjusted based on values seen in historical data for a particular
# system or network. For example, 100mbit networks cannot sustain packet
# rates as high as gigabit networks can without becoming a bottleneck.
# Network_Bottleneck: one NFS call-rate rule, three collision-percentage
# tiers (10/25/50), and five packet-rate tiers (500 through 9000).
symptom Network_Bottleneck type=NETWORK
    rule GBL_NFS_CALL_RATE > 500 prob 25
    rule GBL_NET_COLLISION_PCT > 10 prob 10
    rule GBL_NET_COLLISION_PCT > 25 prob 20
    rule GBL_NET_COLLISION_PCT > 50 prob 30
    rule GBL_NET_PACKET_RATE > 500 prob 10
    rule GBL_NET_PACKET_RATE > 1000 prob 15
    rule GBL_NET_PACKET_RATE > 3000 prob 20
    rule GBL_NET_PACKET_RATE > 5000 prob 25
    rule GBL_NET_PACKET_RATE > 9000 prob 25
# Alarm once Network_Bottleneck has stayed above 50 for 5 minutes. Red alert
# when the symptom is above 90, yellow otherwise; re-evaluated and re-sent
# every 10 minutes, with a reset alert when the alarm ends.
alarm Network_Bottleneck > 50 for 5 minutes
    type = "Network"
    start
        if Network_Bottleneck > 90 then
            red alert "Network Bottleneck probability= ", Network_Bottleneck, "%"
        else
            yellow alert "Network Bottleneck probability= ", Network_Bottleneck, "%"
    repeat every 10 minutes
        if Network_Bottleneck > 90 then
            red alert "Network Bottleneck probability= ", Network_Bottleneck, "%"
        else
            yellow alert "Network Bottleneck probability= ", Network_Bottleneck, "%"
    end
        reset alert "End of Network Bottleneck Alert"
# The following alarm assumes that on a good network, few errors occur:
# Red alert (no yellow stage, no repeat) once GBL_NET_ERROR_RATE has stayed
# above 1 for 5 minutes; a reset alert is sent when the alarm ends.
alarm GBL_NET_ERROR_RATE > 1 for 5 minutes
    type = "Network"
    start
        red alert "Network error rate is greater than one per second"
    end
        reset alert "End of network error rate condition"
# Global swap space utilization alarm:
# Red alert (no yellow stage, no repeat) once GBL_SWAP_SPACE_UTIL has stayed
# above 95 for 5 minutes; a reset alert is sent when the alarm ends.
# This alarm declares no "type", unlike the bottleneck alarms in this file.
alarm GBL_SWAP_SPACE_UTIL > 95 for 5 minutes
    start
        red alert "Global swap space is nearly full"
    end
        reset alert "End of global swap space full condition"
##############################################################################
#
# The following are a few sample alarms that illustrate some of the aspects of
# performance alarming. Additional examples may be found under the directory
# /opt/perf/examples/config/
#
##############################################################################
#
# The following two alarms show the use of the EXEC statement to execute
# the local action of mailing a message.
#
##
# alarm other:APP_CPU_TOTAL_UTIL > 10 for 10 minutes
# start
# {
# yellow alert "other application using more than 10 percent of the cpu"
# exec "echo 'other application using > 10% cpu' | mail root"
# }
# end
# reset alert "other application cpu warning over"
#
# alarm GBL_STARTED_PROC_RATE > 100 for 5 minutes
# start
# exec "echo \"Processes being started at a high rate of ",
# GBL_STARTED_PROC_RATE,
# " per second\" | mail -s \"high fork rate on `hostname` \" root"
# end
# exec "echo \"end of high fork rate on `hostname` \" | mail root"
#
##############################################################################
#
# This alarm shows how to not trigger an alarm if your backup process is
# running. You will have to define an application called "backup" in the
# parm file that contains the appropriate backup processes and in the "log"
# line of your parm file, change "application" to "application=all" to
# request that scopeux log all applications every interval even if they
# have no processes alive.
#
##
# alarm GBL_CPU_TOTAL_UTIL > 50 and
# backup:APP_ALIVE_PROC == 0 for 5 minutes
# type = "CPU"
# start
# yellow alert "Warning high cpu util = ", GBL_CPU_TOTAL_UTIL, "%"
# repeat every 60 minutes
# red alert "cpu warning continuing"
# end
# reset alert "End of cpu utilization alarm"
#
##############################################################################
#
# This alarm monitors two file systems and sends an alert when the space
# utilization on a given filesystem exceeds the given value.
#
# Initialize the variables root_util and var_util which will hold the
# current FS_SPACE_UTIL value. The first time they are accessed, they will
# be initialized to zero. Loop through the filesystem each interval and save
# the FS_SPACE_UTIL for each one. Send an alert if the space utilization
# exceeds the given threshold. A repeat alert will be sent every 30 minutes
# for as long as the threshold is exceeded.
#
# Note how alert and print syntax allows you to format numeric values with
# the optional parameters [|width|decimals]. The values will be right
# justified by default. To left justify the field, add a "-" character
# before the width parameter.
#
##
# root_util = root_util
# var_util = var_util
#
# FILESYSTEM LOOP
# {
# if FS_DIRNAME == "/" then
# root_util = FS_SPACE_UTIL
#
# if FS_DIRNAME == "/var" then
# var_util = FS_SPACE_UTIL
# }
#
# alarm root_util > 90 for 10 minutes
# start
# yellow alert "root filesys space util too high at ", root_util|3|2, "%"
# repeat every 30 minutes
# red alert "root fs space is still too high at ", root_util|-3|2, "%"
# end
# reset alert "root space util below the threshold"
#
# alarm var_util > 75 for 10 minutes
# start
# yellow alert "/var space util is too high at ", var_util|3|2, "%"
# repeat every 30 minutes
# red alert "/var space util is still too high at ", var_util|3|2, "%"
# end
# reset alert "/var space util now below the threshold"
#
##############################################################################
#
# The following alarm is triggered when an application stops running.
# You would have to define your application to replace this "Finance" example
# in the parm file that contains the appropriate processes you are interested
# in. For this alarm to work reliably, you would also need to change
# "application" to "application=all" in the "log" line of the parm
# file to request that scopeux log all applications every interval even if
# they have no processes alive.
##
# alarm Finance:APP_ALIVE_PROC < 1 for 5 minutes
# start
# yellow alert "no processes alive for the Finance application"
# repeat every 360 minutes
# red alert "no processes alive for the Finance application"
# end
# reset alert "Finance application running again"
#
##############################################################################
#
# The following alarm shows how long an application has been running since
# perfalarm was started. It will create a file named /tmp/uptime which you
# can use, and it will send mail when the application stops running. You
# must, of course, have defined an application you are interested in
# monitoring in the parm file and change the "myapplication" name below to
# match.
#
##
# uptime = uptime
# alarm myapplication:APP_ALIVE_PROC > 2 for 5 minutes
# start
# {
# uptime = uptime + 5
# exec "echo '", date, " ", time, " uptime for myapplication is ",
# uptime|6|0, "' minutes > /tmp/uptime"
# }
# repeat every 5 minutes
# {
# uptime = uptime + 5
# exec "echo 'uptime for myapplication is ",
# uptime|6|0, "' minutes > /tmp/uptime"
# }
# end
# {
# uptime = 0
# exec "echo 'myapplication has stopped running!' | mail root"
# }
#
##############################################################################
#
# The following example will watch for process records being logged for
# a specific process name, and then check for its memory usage exceeding
# a high-water mark above a threshold.
#
# Note that scopeux only logs "interesting" processes according to
# parameters set in the parm file. There is a process threshold option
# for virtual memory consumption you can adjust.
#
# Local variable threshold will be set to 10000 (units in KB = 10 megabytes)
##
# threshold = 10000
# highwater = highwater
# PROCESS LOOP
# {
# if (PROC_PROC_NAME == "myprogram") and
# (PROC_MEM_VIRT > threshold) and
# (PROC_MEM_VIRT > highwater) then
# {
# yellow alert
# "High value for myprogram's memory virtual set size encountered: ",
# PROC_MEM_VIRT, "KB"
# highwater = PROC_MEM_VIRT
# }
# }
#
##############################################################################
#
# The following example may be useful after you've instrumented an
# application with ARM (see the Tracking Your Transactions manual).
# If you are logging transaction data and have tuned your Service Level
# Objective (slo) threshold in the ttd.conf file, you can then usefully
# use alerts when the slo is exceeded.
#
##
# TRANSACTION LOOP
# if TT_SLO_COUNT > 0 then
# yellow alert "Service Level Objective not met",
# TT_SLO_COUNT|4|0, " times for application ", TT_APP_NAME|20,
# "transaction ", TT_NAME
#
##############################################################################
#
# End of sample alarm section.
#
##############################################################################