web_log.conf 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. # unmatched lines
  2. # the following alarms trigger only when there is enough data.
  3. # we assume there is enough data when:
  4. #
  5. # $web_log_1m_total_requests > 120
  6. #
  7. # i.e. when there are more than 120 requests during the last minute
  8. template: web_log_1m_total_requests
  9. on: web_log.requests
  10. class: Workload
  11. type: Web Server
  12. component: Web log
  13. families: *
  14. lookup: sum -1m unaligned
  15. calc: ($this == 0)?(1):($this)
  16. units: requests
  17. every: 10s
  18. info: number of HTTP requests in the last minute
  19. template: web_log_1m_unmatched
  20. on: web_log.excluded_requests
  21. class: Errors
  22. type: Web Server
  23. component: Web log
  24. families: *
  25. lookup: sum -1m unaligned of unmatched
  26. calc: $this * 100 / $web_log_1m_total_requests
  27. units: %
  28. every: 10s
  29. warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
  30. delay: up 1m down 5m multiplier 1.5 max 1h
  31. info: percentage of unparsed log lines over the last minute
  32. to: webmaster
  33. # -----------------------------------------------------------------------------
  34. # high level response code alarms
  35. # the following alarms trigger only when there is enough data.
  36. # we assume there is enough data when:
  37. #
  38. # $web_log_1m_requests > 120
  39. #
  40. # i.e. when there are more than 120 requests during the last minute
  41. template: web_log_1m_requests
  42. on: web_log.type_requests
  43. class: Workload
  44. type: Web Server
  45. component: Web log
  46. families: *
  47. lookup: sum -1m unaligned
  48. calc: ($this == 0)?(1):($this)
  49. units: requests
  50. every: 10s
  51. info: number of HTTP requests in the last minute
  52. template: web_log_1m_successful
  53. on: web_log.type_requests
  54. class: Workload
  55. type: Web Server
  56. component: Web log
  57. families: *
  58. lookup: sum -1m unaligned of success
  59. calc: $this * 100 / $web_log_1m_requests
  60. units: %
  61. every: 10s
  62. warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
  63. crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
  64. delay: up 2m down 15m multiplier 1.5 max 1h
  65. info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
  66. to: webmaster
  67. template: web_log_1m_redirects
  68. on: web_log.type_requests
  69. class: Workload
  70. type: Web Server
  71. component: Web log
  72. families: *
  73. lookup: sum -1m unaligned of redirect
  74. calc: $this * 100 / $web_log_1m_requests
  75. units: %
  76. every: 10s
  77. warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
  78. delay: up 2m down 15m multiplier 1.5 max 1h
  79. info: ratio of redirection HTTP requests over the last minute (3xx except 304)
  80. to: webmaster
  81. template: web_log_1m_bad_requests
  82. on: web_log.type_requests
  83. class: Errors
  84. type: Web Server
  85. component: Web log
  86. families: *
  87. lookup: sum -1m unaligned of bad
  88. calc: $this * 100 / $web_log_1m_requests
  89. units: %
  90. every: 10s
  91. warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
  92. delay: up 2m down 15m multiplier 1.5 max 1h
  93. info: ratio of client error HTTP requests over the last minute (4xx except 401)
  94. to: webmaster
  95. template: web_log_1m_internal_errors
  96. on: web_log.type_requests
  97. class: Errors
  98. type: Web Server
  99. component: Web log
  100. families: *
  101. lookup: sum -1m unaligned of error
  102. calc: $this * 100 / $web_log_1m_requests
  103. units: %
  104. every: 10s
  105. warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
  106. crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
  107. delay: up 2m down 15m multiplier 1.5 max 1h
  108. info: ratio of server error HTTP requests over the last minute (5xx)
  109. to: webmaster
  110. # -----------------------------------------------------------------------------
  111. # web slow
  112. # the following alarms trigger only when there is enough data.
  113. # we assume there is enough data when:
  114. #
  115. # $web_log_1m_requests > 120
  116. #
  117. # i.e. when there are more than 120 requests during the last minute
  118. template: web_log_10m_response_time
  119. on: web_log.request_processing_time
  120. class: Latency
  121. type: Web Server
  122. component: Web log
  123. families: *
  124. lookup: average -10m unaligned of avg
  125. units: ms
  126. every: 30s
  127. info: average HTTP response time over the last 10 minutes
  128. template: web_log_web_slow
  129. on: web_log.request_processing_time
  130. class: Latency
  131. type: Web Server
  132. component: Web log
  133. families: *
  134. lookup: average -1m unaligned of avg
  135. units: ms
  136. every: 10s
  137. green: 500
  138. red: 1000
  139. warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
  140. crit: ($web_log_1m_requests > 120) ? ($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
  141. delay: down 15m multiplier 1.5 max 1h
  142. info: average HTTP response time over the last 1 minute
  143. options: no-clear-notification
  144. to: webmaster
  145. # -----------------------------------------------------------------------------
  146. # web too many or too few requests
  147. # the following alarms trigger only when there is enough data.
  148. # we assume there is enough data when:
  149. #
  150. # $web_log_5m_successful_old > 120
  151. #
  152. # i.e. when the average rate of successful requests was above 120 requests/s
  153. # during the 5 minutes starting at -10m and ending at -5m
  154. template: web_log_5m_successful_old
  155. on: web_log.type_requests
  156. class: Workload
  157. type: Web Server
  158. component: Web log
  159. families: *
  160. lookup: average -5m at -5m unaligned of success
  161. units: requests/s
  162. every: 30s
  163. info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
  164. template: web_log_5m_successful
  165. on: web_log.type_requests
  166. class: Workload
  167. type: Web Server
  168. component: Web log
  169. families: *
  170. lookup: average -5m unaligned of success
  171. units: requests/s
  172. every: 30s
  173. info: average number of successful HTTP requests over the last 5 minutes
  174. template: web_log_5m_requests_ratio
  175. on: web_log.type_requests
  176. class: Workload
  177. type: Web Server
  178. component: Web log
  179. families: *
  180. calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100)
  181. units: %
  182. every: 30s
  183. warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
  184. crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
  185. delay: down 15m multiplier 1.5 max 1h
  186. options: no-clear-notification
  187. info: ratio of successful HTTP requests over the last 5 minutes, \
  188. compared with the previous 5 minutes \
  189. (clear notification for this alarm will not be sent)
  190. to: webmaster