web_log.conf 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. # unmatched lines
  2. # the following alarms trigger only when there is enough data.
  3. # we assume there is enough data when:
  4. #
  5. # $web_log_1m_total_requests > 120
  6. #
  7. # i.e. when there are more than 120 requests during the last minute
  # Helper value (no warn/crit lines): sum of web_log.requests over the
  # last minute, re-evaluated every 10s.
  # The calc line substitutes 1 when the sum is 0, so expressions that
  # divide by $web_log_1m_total_requests never divide by zero.
  8. template: web_log_1m_total_requests
  9. on: web_log.requests
  10. class: Workload
  11. type: Web Server
  12. component: Web log
  13. lookup: sum -1m unaligned
  14. calc: ($this == 0)?(1):($this)
  15. units: requests
  16. every: 10s
  17. info: number of HTTP requests in the last minute
  # Share of the "unmatched" dimension of web_log.excluded_requests,
  # expressed as a percentage of $web_log_1m_total_requests.
  # The warning is evaluated only when $web_log_1m_total_requests > 120;
  # otherwise the warn expression yields 0. It is raised when the
  # percentage exceeds 1.
  18. template: web_log_1m_unmatched
  19. on: web_log.excluded_requests
  20. class: Errors
  21. type: Web Server
  22. component: Web log
  23. lookup: sum -1m unaligned of unmatched
  24. calc: $this * 100 / $web_log_1m_total_requests
  25. units: %
  26. every: 10s
  27. warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
  28. delay: up 1m down 5m multiplier 1.5 max 1h
  29. info: percentage of unparsed log lines over the last minute
  30. to: webmaster
  31. # -----------------------------------------------------------------------------
  32. # high level response code alarms
  33. # the following alarms trigger only when there is enough data.
  34. # we assume there is enough data when:
  35. #
  36. # $web_log_1m_requests > 120
  37. #
  38. # i.e. when there are more than 120 requests during the last minute
  # Helper value (no warn/crit lines): sum of web_log.type_requests over
  # the last minute, re-evaluated every 10s.
  # The calc line substitutes 1 when the sum is 0, so expressions that
  # divide by $web_log_1m_requests never divide by zero.
  39. template: web_log_1m_requests
  40. on: web_log.type_requests
  41. class: Workload
  42. type: Web Server
  43. component: Web log
  44. lookup: sum -1m unaligned
  45. calc: ($this == 0)?(1):($this)
  46. units: requests
  47. every: 10s
  48. info: number of HTTP requests in the last minute
  # Percentage of the "success" dimension relative to $web_log_1m_requests.
  # Both expressions yield 0 unless $web_log_1m_requests > 120.
  # The thresholds depend on the current $status, giving hysteresis:
  #   warn: threshold is 85 normally, 95 once $status >= $WARNING
  #   crit: threshold is 75 normally, 85 once $status == $CRITICAL
  # (the alarm fires when the value is BELOW the threshold).
  49. template: web_log_1m_successful
  50. on: web_log.type_requests
  51. class: Workload
  52. type: Web Server
  53. component: Web log
  54. lookup: sum -1m unaligned of success
  55. calc: $this * 100 / $web_log_1m_requests
  56. units: %
  57. every: 10s
  58. warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
  59. crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
  60. delay: up 2m down 15m multiplier 1.5 max 1h
  61. info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
  62. to: webmaster
  # Percentage of the "redirect" dimension relative to $web_log_1m_requests.
  # The warn expression yields 0 unless $web_log_1m_requests > 120.
  # Threshold depends on the current $status (hysteresis): the value must
  # exceed 20 to raise the warning, and exceed 1 to keep it once
  # $status >= $WARNING. There is no crit line.
  63. template: web_log_1m_redirects
  64. on: web_log.type_requests
  65. class: Workload
  66. type: Web Server
  67. component: Web log
  68. lookup: sum -1m unaligned of redirect
  69. calc: $this * 100 / $web_log_1m_requests
  70. units: %
  71. every: 10s
  72. warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
  73. delay: up 2m down 15m multiplier 1.5 max 1h
  74. info: ratio of redirection HTTP requests over the last minute (3xx except 304)
  75. to: webmaster
  # Percentage of the "bad" dimension relative to $web_log_1m_requests.
  # The warn expression yields 0 unless $web_log_1m_requests > 120.
  # Threshold depends on the current $status (hysteresis): the value must
  # exceed 30 to raise the warning, and exceed 10 to keep it once
  # $status >= $WARNING. There is no crit line.
  76. template: web_log_1m_bad_requests
  77. on: web_log.type_requests
  78. class: Errors
  79. type: Web Server
  80. component: Web log
  81. lookup: sum -1m unaligned of bad
  82. calc: $this * 100 / $web_log_1m_requests
  83. units: %
  84. every: 10s
  85. warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
  86. delay: up 2m down 15m multiplier 1.5 max 1h
  87. info: ratio of client error HTTP requests over the last minute (4xx except 401)
  88. to: webmaster
  # Percentage of the "error" dimension relative to $web_log_1m_requests.
  # Both expressions yield 0 unless $web_log_1m_requests > 120.
  # Thresholds depend on the current $status (hysteresis):
  #   warn: value must exceed 2 to raise, 1 to keep once $status >= $WARNING
  #   crit: value must exceed 5 to raise, 2 to keep once $status == $CRITICAL
  89. template: web_log_1m_internal_errors
  90. on: web_log.type_requests
  91. class: Errors
  92. type: Web Server
  93. component: Web log
  94. lookup: sum -1m unaligned of error
  95. calc: $this * 100 / $web_log_1m_requests
  96. units: %
  97. every: 10s
  98. warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
  99. crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
  100. delay: up 2m down 15m multiplier 1.5 max 1h
  101. info: ratio of server error HTTP requests over the last minute (5xx)
  102. to: webmaster
  103. # -----------------------------------------------------------------------------
  104. # web slow
  105. # the following alarms trigger only when there is enough data.
  106. # we assume there is enough data when:
  107. #
  108. # $web_log_1m_requests > 120
  109. #
  110. # i.e. when there are more than 120 requests during the last minute
  # Helper value (no warn/crit lines): average of the "avg" dimension of
  # web_log.request_processing_time over the last 10 minutes, re-evaluated
  # every 30s. web_log_web_slow compares its 1-minute average against this.
  # NOTE(review): this stanza uses "type: System" while every other
  # template in this file uses "type: Web Server" - confirm this is intended.
  111. template: web_log_10m_response_time
  112. on: web_log.request_processing_time
  113. class: Latency
  114. type: System
  115. component: Web log
  116. lookup: average -10m unaligned of avg
  117. units: ms
  118. every: 30s
  119. info: average HTTP response time over the last 10 minutes
  # 1-minute average of the "avg" dimension of
  # web_log.request_processing_time, re-evaluated every 10s.
  # Both expressions yield 0 unless $web_log_1m_requests > 120.
  #   warn: value above $green (500) AND above 2x $web_log_10m_response_time
  #   crit: value above $red (1000) AND above 4x $web_log_10m_response_time
  # Requiring both conditions means a steadily slow server (high 10-minute
  # average) does not alarm; only a jump relative to that average does.
  120. template: web_log_web_slow
  121. on: web_log.request_processing_time
  122. class: Latency
  123. type: Web Server
  124. component: Web log
  125. lookup: average -1m unaligned of avg
  126. units: ms
  127. every: 10s
  128. green: 500
  129. red: 1000
  130. warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
  131. crit: ($web_log_1m_requests > 120) ? ($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
  132. delay: down 15m multiplier 1.5 max 1h
  133. info: average HTTP response time over the last 1 minute
  134. options: no-clear-notification
  135. to: webmaster
  136. # -----------------------------------------------------------------------------
  137. # web too many or too few requests
  138. # the following alarms trigger only when there is enough data.
  139. # we assume there is enough data when:
  140. #
  141. # $web_log_5m_successful_old > 120
  142. #
  143. # i.e. when the average rate of successful requests (requests/s) was above
  144. # 120 during the 5 minutes starting at -10m and ending at -5m
  # Helper value (no warn/crit lines): average of the "success" dimension
  # over a 5-minute window offset by 5 minutes ("-5m at -5m"), i.e. the
  # window from 10 minutes ago to 5 minutes ago. Re-evaluated every 30s.
  # Used as the baseline and as the "enough data" gate in
  # web_log_5m_requests_ratio.
  145. template: web_log_5m_successful_old
  146. on: web_log.type_requests
  147. class: Workload
  148. type: Web Server
  149. component: Web log
  150. lookup: average -5m at -5m unaligned of success
  151. units: requests/s
  152. every: 30s
  153. info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
  # Helper value (no warn/crit lines): average of the "success" dimension
  # over the most recent 5 minutes, re-evaluated every 30s.
  # Used as the numerator in web_log_5m_requests_ratio.
  154. template: web_log_5m_successful
  155. on: web_log.type_requests
  156. class: Workload
  157. type: Web Server
  158. component: Web log
  159. lookup: average -5m unaligned of success
  160. units: requests/s
  161. every: 30s
  162. info: average number of successful HTTP requests over the last 5 minutes
  # Current 5-minute success rate as a percentage of the previous 5-minute
  # success rate ($web_log_5m_successful * 100 / $web_log_5m_successful_old).
  # There is no lookup line: the value comes only from the calc expression,
  # which falls back to 100 when the old value is not above 0 so the
  # division is never by zero.
  # Both expressions yield 0 unless $web_log_5m_successful_old > 120.
  #   warn: ratio above 200 or below 50
  #   crit: ratio above 400 or below 25
  163. template: web_log_5m_requests_ratio
  164. on: web_log.type_requests
  165. class: Workload
  166. type: Web Server
  167. component: Web log
  168. calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100)
  169. units: %
  170. every: 30s
  171. warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
  172. crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
  173. delay: down 15m multiplier 1.5 max 1h
  174. options: no-clear-notification
  175. info: ratio of successful HTTP requests over the last 5 minutes, \
  176. compared with the previous 5 minutes \
  177. (clear notification for this alarm will not be sent)
  178. to: webmaster