# HEX
#
# File: /opt/perf/examples/config/alarmdef_procloop
# The following are some example alarms which use Perf Agent process data in
# alarms. Caution should be used when writing process alarms into alarmdef,
# as improperly written loops could increase perfalarm daemon overhead, 
# and possibly trigger too often.  You can use utility's "analyze" command 
# to see how your custom alarmdef syntax works when run against historical 
# data.

# Remember that scopeux does not usually log all processes on the system
# every interval.  Thus, the process loop executed by perfalarm will only
# see processes that exceed scope's interesting process thresholds, that
# are defined in the /var/opt/perf/parm file.


#######################################################################
#
# This first alarm should detect runaway shell processes and send an
# email message to alert the sysadmin of them.  The alarmpid variable
# is used to control repeated email messages if a process continues
# to hog the cpu interval after interval.  Note that on multiprocessor
# systems it is possible for more than one process to have a cpu util
# of over 95% at the same time.  Because only a single pid is remembered,
# such processes can take turns re-triggering this alarm, causing it to
# fire too often.

# Declare alarmpid without overwriting it, so the pid reported in an
# earlier interval is still available for the comparison below.
alarmpid = alarmpid
PROCESS LOOP
{
  # A process is reported when all of the following hold:
  #   - it is not the pid that was reported last,
  #   - it is using more than 95% cpu in this interval,
  #   - it has accumulated more than 1000 seconds of cpu time,
  #   - its name is sh, bash or ksh, with or without a leading "-".
  if (PROC_PROC_ID != alarmpid) and
     (PROC_CPU_TOTAL_UTIL > 95) and
     (PROC_CPU_TOTAL_TIME_CUM > 1000) and
     ((PROC_PROC_NAME == "sh")   or (PROC_PROC_NAME == "-sh")   or
      (PROC_PROC_NAME == "bash") or (PROC_PROC_NAME == "-bash") or
      (PROC_PROC_NAME == "ksh")  or (PROC_PROC_NAME == "-ksh")) then
  {
    # Mail a two-line description of the offender to root.
    exec "echo 'Possible runaway shell process detected by PA, pid ",
         PROC_PROC_ID,
         ", cpu util = ", PROC_CPU_TOTAL_UTIL,
         "\\nfor username ", PROC_USER_NAME|8,
         " on ", DATE, " at ", TIME,
         "' | mailx -s 'shell cpu process alert' root"

    # Remember this pid so it is not mailed about again next interval.
    alarmpid = PROC_PROC_ID
  }
}


#######################################################################
#
# The following alarms catch cpu hog processes in a specific application.
# It will avoid unwanted alarm repeats on multiprocessor systems if two 
# processes happen to be looping at the same time because it compares both 
# the first-highest and second-highest cpu-consuming processes.

#
# Initialize variables for cpu hog alarms
#
# Utilization readings are cleared on every pass; the process loop
# fills them in again for any tracked process that is still a hog.
cpuhog1_util = 0
cpuhog2_util = 0

# The tracked pids are self-assigned, which declares them without
# discarding the value carried over from the previous pass.
cpuhog1_pid = cpuhog1_pid
cpuhog2_pid = cpuhog2_pid
     
# This alarm will work well as long as there are fewer than 3 processes
# in application number 2 that have over 60% cpu utilization in an
# interval.  If one of the same 2 processes continue to use over 60%
# cpu util, then one of the cpuhog?_util variables will be set that we 
# then alarm on.  If a different process hogs the cpu, then the util 
# variables will be initialized to zero and the alarm will not go off.
#
# Note: Application names are not available at the process-data level, but
# the application number is shown by the name in utility "scan" command
# output.
#
PROCESS LOOP
{
   # Only processes in application number 2 that used more than 60% cpu
   # in this interval are considered.
   #
   # A process that already occupies a slot just refreshes that slot's
   # utilization.  A process seen for the first time claims the first
   # free slot (pid == 0) and records its utilization immediately: the
   # statements after this loop clear any slot whose utilization is
   # still zero, so a newly claimed slot without a utilization value
   # would be released again before the next interval and the alarms
   # on cpuhog?_util could never trigger.
   #
   # If both slots are taken by other processes, additional hogs are
   # ignored.
   if PROC_CPU_TOTAL_UTIL > 60 and PROC_APP_ID == 2 then 
   {
      if PROC_PROC_ID == cpuhog1_pid then
         cpuhog1_util = PROC_CPU_TOTAL_UTIL

      else if PROC_PROC_ID == cpuhog2_pid then
         cpuhog2_util = PROC_CPU_TOTAL_UTIL

      else
      {
         if cpuhog1_pid == 0 then
         {
            cpuhog1_name = PROC_PROC_NAME
            cpuhog1_pid  = PROC_PROC_ID
            cpuhog1_util = PROC_CPU_TOTAL_UTIL
         }
         else if cpuhog2_pid == 0 then
         {
            cpuhog2_name = PROC_PROC_NAME
            cpuhog2_pid  = PROC_PROC_ID
            cpuhog2_util = PROC_CPU_TOTAL_UTIL
         }
      }
   }
}

#
# If the top two cpu hogs did not consume more than 60%
# of the cpu in the last interval, then reset the pid variables.
#

# A utilization of zero at this point means the slot was not refreshed
# by the process loop in this pass, so the slot is freed for reuse.
if cpuhog2_util == 0 then
{
   cpuhog2_pid = 0
}

if cpuhog1_util == 0 then
{
   cpuhog1_pid = 0
}

#
# Will alarm if a cpuhog process in application 2 was detected which 
# continued to use the cpu for over 3 minutes. The alarm will repeat 
# every 5 minutes.
#
# First tracked slot (cpuhog1_*): the same red alert text is issued
# when the alarm starts and on every repeat.
alarm cpuhog1_util > 60 for 3 minutes
   start
      red alert "CPU hog alert: myapp Process ",
                cpuhog1_pid|5|0,
                " ",
                cpuhog1_name|10,
                " using ",
                cpuhog1_util,
                "% CPU"
   repeat every 5 minutes
      red alert "CPU hog alert: myapp Process ",
                cpuhog1_pid|5|0,
                " ",
                cpuhog1_name|10,
                " using ",
                cpuhog1_util,
                "% CPU"
     
#
# Same alarm for the second tracked process (cpuhog2_*): will alarm if
# a cpuhog process in application 2 was detected which continued to
# use the cpu for over 3 minutes. The alarm will repeat every 5 minutes.
#
# Second tracked slot (cpuhog2_*): the same red alert text is issued
# when the alarm starts and on every repeat.
alarm cpuhog2_util > 60 for 3 minutes
   start
      red alert "CPU hog alert: myapp Process ",
                cpuhog2_pid|5|0,
                " ",
                cpuhog2_name|10,
                " using ",
                cpuhog2_util,
                "% CPU"
   repeat every 5 minutes
      red alert "CPU hog alert: myapp Process ",
                cpuhog2_pid|5|0,
                " ",
                cpuhog2_name|10,
                " using ",
                cpuhog2_util,
                "% CPU"
     
     

#######################################################################
#
# The following alarms catch possible memory hog processes.  The parm
# file has an interesting process threshold that looks for excessive
# virtual memory, and if you are concerned about monitoring for memory
# leaks you may want to adjust this threshold.  If a process exceeds the
# memory threshold in the parm file, it will be logged with "M" in the
# PROC_INTEREST metric.
 
# Note that virtual set size (VSS), the total working set for a process
# which includes regions which may be shared by other processes, is 
# quite different from the resident set size (RSS), which is the amount
# of physical memory that the process currently is accessing.  A memory
# hog can allocate much more memory space than will actually be in 
# physical RAM, so we use VSS not RSS for our alarm.

# Initialize variables for memory hog alarms:
# Advance the time elapsed since the last memory hog alarm by 60 on
# every pass.
# NOTE(review): this assumes one pass per 60-second interval -- confirm
# against the configured collection interval.
seconds_since_last_alarm = seconds_since_last_alarm + 60

# No memory hog candidate until the process loop finds one in this pass.
memhog_pid = 0

# Change the following VSS threshold to a value that makes sense for 
# the workload on your system.  This default example is set to 50000 
# kilobytes (~50 megabytes) Virtual Set Size, meaning that no alarm
# will be generated unless a process has allocated over 50mb of 
# virtual memory.  The actual threshold increases if the alarm gets
# triggered and resets back to this original value if the alarm doesn't.
# Baseline VSS threshold, in kilobytes.
starting_vss_threshold = 50000

# The working threshold is raised whenever a memory hog is reported, so
# that the same process does not alarm repeatedly.  It is declared by
# self-assignment to keep its value between passes, and takes the
# baseline value while it is still unset (zero).
current_vss_threshold = current_vss_threshold
if current_vss_threshold == 0 then
{
  current_vss_threshold = starting_vss_threshold
}

# Largest qualifying VSS seen so far in this pass.  It is seeded with
# the current threshold so that only processes above the threshold can
# qualify, and raised each time a bigger process is found so that the
# loop ends with the single largest process rather than whichever
# qualifying process happened to be visited last.
memhog_highest_vss = current_vss_threshold

PROCESS LOOP
{
  # Set memhog_pid to the pid of the process with the highest virtual 
  # set size above the current threshold, as long as it has been
  # running for at least ten minutes (600 seconds).
  if (PROC_MEM_VIRT > memhog_highest_vss) and
     (PROC_RUN_TIME > 600) then
  {
    memhog_pid = PROC_PROC_ID
    memhog_name = PROC_PROC_NAME
    memhog_vss = PROC_MEM_VIRT
    memhog_highest_vss = PROC_MEM_VIRT
  }
}

# Only start this alarm at most once every 10 minutes by checking
# how many seconds it has been since the last time we alarmed:
alarm (seconds_since_last_alarm > 600) and (memhog_pid != 0) for 1 minute
   start
   {
    # Mail the details of the candidate to the administrator.  To use
    # normal Perf Agent alarming instead, replace this exec with a red
    # alert as in the cpu hog example.
    exec "echo \"Possible memory hog process detected by PA",
         "\\nname = ",
         memhog_name,
         "\\npid = ",
         memhog_pid|5|0,
         "\\nVSS = ",
         memhog_vss|5|0,
         "KB",
         "\\ndetected on ",
         DATE,
         " at ",
         TIME,
         "\" | mailx -s \"memory hog process alert from `hostname` \" ",
         "root@adminsystem"

    # Raise the working threshold to 1000 KB above the reported VSS so
    # this process must grow further before it is reported again.
    current_vss_threshold = memhog_vss + 1000

    # Restart the timer that spaces out consecutive alarms.
    seconds_since_last_alarm = 0
   }

# Reset the VSS threshold back to its original value if there has been
# no alarm for 10 hours (10*60*60 = 36000 seconds):
# Only touch the working threshold when it has actually been raised
# above the baseline and the quiet period has elapsed.
if (current_vss_threshold > starting_vss_threshold) and
   (seconds_since_last_alarm >= 36000) then
{
  current_vss_threshold = starting_vss_threshold
}
  


#######################################################################
#
# This example will send email *and* a special message to OVO via opcmsg 
# when a single process is using a significant amount of cpu (over 60%),
# and has accumulated over 10 minutes of cpu time in total.  We skip
# processes with pids of 100 or less, assuming they're system processes
# and they know what they're doing.
#

# Declare hogpid without overwriting it, so the pid reported in an
# earlier interval is still available for the comparison below.
hogpid = hogpid
PROCESS LOOP
{

  # Report a process that is using more than 60% cpu, has accumulated
  # more than 600 seconds of cpu time, has a pid above 100, and is not
  # the pid that was reported last.
  if (PROC_CPU_TOTAL_UTIL > 60) and
     (PROC_CPU_TOTAL_TIME_CUM > 600) and
     (PROC_PROC_ID > 100) and
     (PROC_PROC_ID != hogpid) then
  {

    # NOTE(review): PROC_PROC_NAME and PROC_USER_NAME are pasted
    # unescaped into double-quoted shell strings in both exec commands
    # below.  A process name containing shell metacharacters (backquotes,
    # $(...), double quotes) would be interpreted by the shell -- confirm
    # whether this is acceptable for the account perfalarm runs under.
    #
    # NOTE(review): "ps -ef | grep <pid>" matches the pid as a substring
    # anywhere on the line, so unrelated processes may be listed too.

    # First exec sends mail (be sure to change destination address):
    exec "echo \"Possible runaway process detected by PA",
      "\\nname=", PROC_PROC_NAME,
      "\\npid=", PROC_PROC_ID,
      "\\noriginal parent pid=", PROC_PARENT_PROC_ID,
      "\\ncpu util=", PROC_CPU_TOTAL_UTIL,
      "\\ncumulative cpu seconds=", PROC_CPU_TOTAL_TIME_CUM,
      "\\nruntime=", PROC_RUN_TIME,
      "\\nusername=", PROC_USER_NAME,
      "\\ndetected on ", DATE, " at ", TIME,
      "\\n `ps -ef | grep ", PROC_PROC_ID, " | grep -v grep`",
      "\" | mailx -s \"runaway process alert from `hostname`\" ",
            "root@mgrhost"
  
    # Second exec invokes opcmsg with non-default msg group.  You can
    # see in the output of 'agsysdb -l' whether OVO messages are
    # automatically being generated.
    # The cumulative cpu time and run time are converted from seconds
    # to minutes for the message text.
    cpu_minutes = PROC_CPU_TOTAL_TIME_CUM / 60
    runtime_minutes = PROC_RUN_TIME / 60
    exec "/opt/OV/bin/OpC/opcmsg app=OVPA obj=CPU msg_g=DNS sev=MAJOR ",
      "msg_t=\"",
      "Process ", PROC_PROC_NAME|18,
      "User    ", PROC_USER_NAME|18,
      "PID ",  PROC_PROC_ID|5, "                 ",
      "CPU Utilization", PROC_CPU_TOTAL_UTIL|6|1, "     ",
      "Minutes of CPU time", cpu_minutes|5|0, "  ",
      "Minutes of Run time", runtime_minutes|5|0, "  ",
      "ps -ef output:",
      "`ps -ef | grep ", PROC_PROC_ID, " | grep -v grep`",
      "\""
  
    # Remember this pid so it is not reported again next interval.
    hogpid=PROC_PROC_ID
  }

}