HEX

File: //opt/perf/newconfig/alarmdef
# @(#)alarmdef            11.00.000     for PA/Linux             =*=
#
#  HP Performance Agent alarm definitions
#
# The perfalarm daemons read the alarmdef file to configure alarms.
# The /var/opt/perf/alarmdef file should be edited to meet the needs of your
# environment.  User changes to this file will be preserved when Perf Agent is 
# updated to a later release.  The /opt/perf/newconfig/alarmdef file will
# always contain the installed release's default alarms.  Additional
# examples exist under /opt/perf/examples/config.
# See your Perf Agent User's Manual for more information.

# Below are the primary CPU, Disk, Memory, and Network Bottleneck alarms.
# For each area, a bottleneck symptom is calculated, and the resulting
# bottleneck probability is used to define yellow or red alerts.
 
# The CPU bottleneck symptom default is influenced mostly by the overall
# cpu utilization.  Note that cpu utilization may be high even though 
# there is no bottleneck.  The run queue is an indicator that processes are
# waiting for cpu resources, and that the cpu may be bottlenecked.
# CPU_Bottleneck is built from four rules of prob 25 each: three stepped
# GBL_CPU_TOTAL_UTIL thresholds (75, 85, 90) and one GBL_RUN_QUEUE
# threshold (2).
# NOTE(review): the probabilities of all rules that evaluate true are
# presumably added together to give the symptom value -- confirm in the
# Perf Agent User's Manual.
symptom CPU_Bottleneck type=CPU
rule GBL_CPU_TOTAL_UTIL       > 75   prob 25
rule GBL_CPU_TOTAL_UTIL       > 85   prob 25
rule GBL_CPU_TOTAL_UTIL       > 90   prob 25
rule GBL_RUN_QUEUE            >  2   prob 25
 
# Alarm on the CPU_Bottleneck symptom staying above 50 for 5 minutes.
# Severity is red when the probability exceeds 90 and yellow otherwise.
# The same check is re-issued every 10 minutes, and a reset alert is sent
# when the alarm ends.
alarm CPU_Bottleneck > 50 for 5 minutes
  type = "CPU"
  start
  {
    if CPU_Bottleneck > 90 then
      red alert "CPU Bottleneck probability= ", CPU_Bottleneck, "%"
    else
      yellow alert "CPU Bottleneck probability= ", CPU_Bottleneck, "%"
  }
  repeat every 10 minutes
  {
    if CPU_Bottleneck > 90 then
      red alert "CPU Bottleneck probability= ", CPU_Bottleneck, "%"
    else
      yellow alert "CPU Bottleneck probability= ", CPU_Bottleneck, "%"
  }
  end
  {
    reset alert "End of CPU Bottleneck Alert"
  }
 

# The Disk bottleneck symptom default is influenced mostly by the busiest
# disk's utilization.  The disk request queue is an indicator that processes 
# may be waiting for disk resources.
# Disk_Bottleneck has two rules: once GBL_DISK_UTIL_PEAK exceeds 50, the
# metric's own value is used as that rule's probability; a
# GBL_DISK_REQUEST_QUEUE above 3 carries a fixed prob of 25.
symptom Disk_Bottleneck type=DISK
rule GBL_DISK_UTIL_PEAK     > 50   prob GBL_DISK_UTIL_PEAK
rule GBL_DISK_REQUEST_QUEUE >  3   prob 25
 
# Alarm on the Disk_Bottleneck symptom staying above 50 for 5 minutes.
# Severity is red when the probability exceeds 90 and yellow otherwise.
# The same check is re-issued every 10 minutes, and a reset alert is sent
# when the alarm ends.
alarm Disk_Bottleneck > 50 for 5 minutes
  type = "Disk"
  start
  {
    if Disk_Bottleneck > 90 then
      red alert "Disk Bottleneck probability= ", Disk_Bottleneck, "%"
    else
      yellow alert "Disk Bottleneck probability= ", Disk_Bottleneck, "%"
  }
  repeat every 10 minutes
  {
    if Disk_Bottleneck > 90 then
      red alert "Disk Bottleneck probability= ", Disk_Bottleneck, "%"
    else
      yellow alert "Disk Bottleneck probability= ", Disk_Bottleneck, "%"
  }
  end
  {
    reset alert "End of Disk Bottleneck Alert"
  }


# The Memory bottleneck symptom default is triggered by a combination
# of several metrics.  Excessive page outs can be an indicator of memory
# pressure when the memory utilization is high, however memory-mapped
# file writes also generate pageouts.  Under heavy memory pressure, data
# will start to be swapped out.
# Memory_Bottleneck rules:
#   GBL_MEM_UTIL              > 95 / > 98   prob 30 / 20
#   GBL_MEM_PAGEOUT_BYTE_RATE > 200         prob 20
#   GBL_MEM_SWAPOUT_BYTE_RATE > 0 / > 10    prob 20 / 50
# The listed probabilities total 140 if every rule is true.
# NOTE(review): presumably the symptom value is capped at 100 -- confirm in
# the Perf Agent User's Manual.
symptom Memory_Bottleneck type=MEMORY
rule GBL_MEM_UTIL              >   95  prob 30
rule GBL_MEM_UTIL              >   98  prob 20
rule GBL_MEM_PAGEOUT_BYTE_RATE >  200  prob 20
rule GBL_MEM_SWAPOUT_BYTE_RATE >    0  prob 20
rule GBL_MEM_SWAPOUT_BYTE_RATE >   10  prob 50
 
# Alarm on the Memory_Bottleneck symptom staying above 50 for 5 minutes.
# Severity is red when the probability exceeds 90 and yellow otherwise.
# The same check is re-issued every 10 minutes, and a reset alert is sent
# when the alarm ends.
alarm Memory_Bottleneck > 50 for 5 minutes
  type = "Memory"
  start
  {
    if Memory_Bottleneck > 90 then
      red alert "Memory Bottleneck probability= ", Memory_Bottleneck, "%"
    else
      yellow alert "Memory Bottleneck probability= ", Memory_Bottleneck, "%"
  }
  repeat every 10 minutes
  {
    if Memory_Bottleneck > 90 then
      red alert "Memory Bottleneck probability= ", Memory_Bottleneck, "%"
    else
      yellow alert "Memory Bottleneck probability= ", Memory_Bottleneck, "%"
  }
  end
  {
    reset alert "End of Memory Bottleneck Alert"
  }
 

# The Network bottleneck symptom default relies on general throughput
# metrics.  Not all network interfaces report collision data.  To be
# useful as a bottleneck indicator, the rate thresholds should be 
# adjusted based on values seen in historical data for a particular 
# system or network.  For example, 100mbit networks cannot handle packet
# rates as high as gigabit networks can without becoming a bottleneck.
# Network_Bottleneck rules:
#   GBL_NFS_CALL_RATE     > 500                     prob 25
#   GBL_NET_COLLISION_PCT > 10 / 25 / 50            prob 10 / 20 / 30
#   GBL_NET_PACKET_RATE   > 500/1000/3000/5000/9000 prob 10/15/20/25/25
# The stepped thresholds let a higher metric value satisfy several rules
# at once.
# NOTE(review): the probabilities of all true rules are presumably summed
# (and capped at 100) -- confirm in the Perf Agent User's Manual.
symptom Network_Bottleneck type=NETWORK     
rule GBL_NFS_CALL_RATE         >  500  prob 25
rule GBL_NET_COLLISION_PCT     >   10  prob 10
rule GBL_NET_COLLISION_PCT     >   25  prob 20
rule GBL_NET_COLLISION_PCT     >   50  prob 30
rule GBL_NET_PACKET_RATE       >  500  prob 10
rule GBL_NET_PACKET_RATE       > 1000  prob 15
rule GBL_NET_PACKET_RATE       > 3000  prob 20
rule GBL_NET_PACKET_RATE       > 5000  prob 25
rule GBL_NET_PACKET_RATE       > 9000  prob 25
 
# Alarm on the Network_Bottleneck symptom staying above 50 for 5 minutes.
# Severity is red when the probability exceeds 90 and yellow otherwise.
# The same check is re-issued every 10 minutes, and a reset alert is sent
# when the alarm ends.
alarm Network_Bottleneck > 50 for 5 minutes
  type = "Network"
  start
  {
    if Network_Bottleneck > 90 then
      red alert "Network Bottleneck probability= ", Network_Bottleneck, "%"
    else
      yellow alert "Network Bottleneck probability= ", Network_Bottleneck, "%"
  }
  repeat every 10 minutes
  {
    if Network_Bottleneck > 90 then
      red alert "Network Bottleneck probability= ", Network_Bottleneck, "%"
    else
      yellow alert "Network Bottleneck probability= ", Network_Bottleneck, "%"
  }
  end
  {
    reset alert "End of Network Bottleneck Alert"
  }

# The following alarm assumes that on a good network, few errors occur:
# Sends a red alert once GBL_NET_ERROR_RATE has stayed above 1 for
# 5 minutes.  There is no repeat section, so the only other message is the
# reset alert sent when the alarm ends.
alarm GBL_NET_ERROR_RATE > 1 for 5 minutes
  type = "Network"
  start
  {
    red alert "Network error rate is greater than one per second"
  }
  end
  {
    reset alert "End of network error rate condition"
  }


# Global swap space utilization alarm:
# Sends a red alert once GBL_SWAP_SPACE_UTIL has stayed above 95 for
# 5 minutes, and a reset alert when the alarm ends.  No type is assigned
# to this alarm and it has no repeat section.
alarm GBL_SWAP_SPACE_UTIL > 95 for 5 minutes
  start
  {
    red alert "Global swap space is nearly full"
  }
  end
  {
    reset alert "End of global swap space full condition"
  }

     
############################################################################## 
#
# The following are a few sample alarms that illustrate some aspects of
# performance alarming.  Additional examples may be found under the directory
# /opt/perf/examples/config/
#
############################################################################## 
#
# The following two alarms show the use of the EXEC statement to execute
# the local action of mailing a message.
#
##
# alarm other:APP_CPU_TOTAL_UTIL > 10 for 10 minutes 
#   start
#   {
#     yellow alert "other application using more than 10 percent of the cpu" 
#     exec "echo 'other application using > 10% cpu' | mail root"
#   }
#   end
#     reset alert "other application cpu warning over" 
#
# alarm GBL_STARTED_PROC_RATE > 100 for 5 minutes 
#   start
#     exec "echo \"Processes being started at a high rate of ", 
#        GBL_STARTED_PROC_RATE, 
#        " per second\" | mail -s \"high fork rate on `hostname` \" root"
#   end
#     exec "echo \"end of high fork rate on `hostname` \" | mail root" 
#
############################################################################## 
#
# This alarm shows how to not trigger an alarm if your backup process is 
# running.  You will have to define an application called "backup" in the 
# parm file that contains the appropriate backup processes and in the "log"
# line of your parm file, change "application" to "application=all" to 
# request that scopeux log all applications every interval even if they 
# have no processes alive.
#
##
# alarm GBL_CPU_TOTAL_UTIL > 50 and
#       backup:APP_ALIVE_PROC == 0  for 5 minutes 
#   type = "CPU"
#   start 
#     yellow alert "Warning high cpu util = ", GBL_CPU_TOTAL_UTIL, "%" 
#   repeat every 60 minutes
#     red alert "cpu warning continuing"
#   end 
#     reset alert "End of cpu utilization alarm" 
# 
############################################################################## 
#
# This alarm monitors two file systems and sends an alert when the space
# utilization on a given filesystem exceeds the given value.
#
# Initialize the variables root_util and var_util which will hold the
# current FS_SPACE_UTIL value.  The first time they are accessed, they will
# be initialized to zero.  Loop through the filesystem each interval and save
# the FS_SPACE_UTIL for each one.  Send an alert if the space utilization
# exceeds the given threshold.  A repeat alert will be sent every 30 minutes
# for as long as the threshold is exceeded.
#
# Note how the alert and print syntax allows you to format numeric values with
# the optional parameters [|width|decimals].  The values will be right 
# justified by default.  To left justify the field, add a "-" character 
# before the width parameter.
#
##
# root_util = root_util
# var_util  = var_util
#
# FILESYSTEM LOOP
# {
#   if FS_DIRNAME == "/" then
#     root_util = FS_SPACE_UTIL
# 
#   if FS_DIRNAME == "/var" then
#     var_util = FS_SPACE_UTIL
# }
#
# alarm root_util > 90 for 10 minutes 
#   start
#     yellow alert "root filesys space util too high at ", root_util|3|2, "%" 
#   repeat every 30 minutes
#     red alert    "root fs space is still too high at ", root_util|-3|2, "%" 
#   end
#     reset alert  "root space util below the threshold" 
# 
# alarm var_util > 75 for 10 minutes 
#   start
#     yellow alert "/var space util is too high at ", var_util|3|2, "%" 
#   repeat every 30 minutes
#     red alert    "/var space util is still too high at ", var_util|3|2, "%" 
#   end
#     reset alert  "/var space util now below the threshold" 
#
############################################################################## 
#
# The following alarm is triggered when an application stops running. 
# You would have to define your application to replace this "Finance" example
# in the parm file that contains the appropriate processes you are interested 
# in.  For this alarm to work reliably, you would also need to change
# "application" to "application=all" in the "log" line of the parm
# file to request that scopeux log all applications every interval even if 
# they have no processes alive.
##
# alarm Finance:APP_ALIVE_PROC < 1 for 5 minutes 
#   start
#     yellow alert "no processes alive for the Finance application" 
#   repeat every 360 minutes
#     red alert "no processes alive for the Finance application" 
#   end
#     reset alert "Finance application running again" 
#    
############################################################################## 
#
# The following alarm shows how long an application has been running since 
# perfalarm was started.  It will create a file named /tmp/uptime which you
# can use, and it will send mail when the application stops running.  You
# must, of course, have defined an application you are interested in
# monitoring in the parm file and change the "myapplication" name below to
# match.
#
##
# uptime = uptime
# alarm myapplication:APP_ALIVE_PROC > 2 for 5 minutes 
#   start
#   {
#     uptime = uptime + 5
#     exec "echo '", date, " ", time, " uptime for myapplication is ", 
#          uptime|6|0, "' minutes > /tmp/uptime"
#   }
#   repeat every 5 minutes
#   {
#     uptime = uptime + 5
#     exec "echo 'uptime for myapplication is ", 
#          uptime|6|0, "' minutes > /tmp/uptime" 
#   }
#   end
#   {
#     uptime = 0
#     exec "echo 'myapplication has stopped running!' | mail root" 
#   }
#
############################################################################## 
#
# The following example will watch for process records being logged for
# a specific process name, and then check for its memory usage exceeding
# a high-water mark above a threshold.  
#
# Note that scopeux only logs "interesting" processes according to
# parameters set in the parm file.  There is a process threshold option
# for virtual memory consumption you can adjust.
#
# The local variable threshold is set to 10000 (in KB, i.e. about 10 megabytes).
##
# threshold = 10000
# highwater = highwater
# PROCESS LOOP
# {
#   if (PROC_PROC_NAME == "myprogram") and
#      (PROC_MEM_VIRT > threshold) and
#      (PROC_MEM_VIRT > highwater) then
#   {
#     yellow alert 
#       "High value for myprogram's memory virtual set size encountered: ",
#       PROC_MEM_VIRT, "KB"
#     highwater = PROC_MEM_VIRT
#   }
# }
#
############################################################################## 
#
# The following example may be useful after you've instrumented an 
# application with ARM (see the Tracking Your Transactions manual).
# If you are logging transaction data and have tuned your Service Level 
# Objective (slo) threshold in the ttd.conf file, you can then usefully 
# use alerts when the slo is exceeded.
#
##
# TRANSACTION LOOP
#   if TT_SLO_COUNT > 0 then
#       yellow alert "Service Level Objective not met",
#              TT_SLO_COUNT|4|0, " times for application ", TT_APP_NAME|20,
#              "transaction ", TT_NAME
#
############################################################################## 
#
# End of sample alarm section.
#
##############################################################################