web_log.conf 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. # unmatched (unparsed) log lines
  2. # the following alarms trigger only when there are enough data.
  3. # we assume there are enough data when:
  4. #
  5. # $web_log_1m_total_requests > 120
  6. # i.e. when there are more than 120 requests during the last minute.
  7. # a total of 0 is reported as 1, so the percentage calc can always divide by it
  8. template: web_log_1m_total_requests
  9. on: web_log.requests
  10. class: Workload
  11. type: Web Server
  12. component: Web log
  13. lookup: sum -1m unaligned
  14. calc: ($this == 0)?(1):($this)
  15. units: requests
  16. every: 10s
  17. info: number of HTTP requests in the last minute
  18. template: web_log_1m_unmatched
  19. on: web_log.excluded_requests
  20. class: Errors
  21. type: Web Server
  22. component: Web log
  23. lookup: sum -1m unaligned of unmatched
  24. calc: $this * 100 / $web_log_1m_total_requests
  25. units: %
  26. every: 10s
  27. warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
  28. delay: up 1m down 5m multiplier 1.5 max 1h
  29. summary: Web log unparsed
  30. info: Percentage of unparsed log lines over the last minute
  31. to: webmaster
  32. # -----------------------------------------------------------------------------
  33. # high level response code alarms
  34. # the following alarms trigger only when there are enough data.
  35. # we assume there are enough data when:
  36. #
  37. # $web_log_1m_requests > 120
  38. # i.e. when there are more than 120 requests during the last minute.
  39. # thresholds depend on $status: a raised alarm is compared against a different value than a clear one
  40. template: web_log_1m_requests
  41. on: web_log.type_requests
  42. class: Workload
  43. type: Web Server
  44. component: Web log
  45. lookup: sum -1m unaligned
  46. calc: ($this == 0)?(1):($this)
  47. units: requests
  48. every: 10s
  49. info: number of HTTP requests in the last minute
  50. template: web_log_1m_successful
  51. on: web_log.type_requests
  52. class: Workload
  53. type: Web Server
  54. component: Web log
  55. lookup: sum -1m unaligned of success
  56. calc: $this * 100 / $web_log_1m_requests
  57. units: %
  58. every: 10s
  59. warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
  60. crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
  61. delay: up 2m down 15m multiplier 1.5 max 1h
  62. summary: Web log successful
  63. info: Ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
  64. to: webmaster
  65. template: web_log_1m_redirects
  66. on: web_log.type_requests
  67. class: Workload
  68. type: Web Server
  69. component: Web log
  70. lookup: sum -1m unaligned of redirect
  71. calc: $this * 100 / $web_log_1m_requests
  72. units: %
  73. every: 10s
  74. warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
  75. delay: up 2m down 15m multiplier 1.5 max 1h
  76. summary: Web log redirects
  77. info: Ratio of redirection HTTP requests over the last minute (3xx except 304)
  78. to: webmaster
  79. template: web_log_1m_bad_requests
  80. on: web_log.type_requests
  81. class: Errors
  82. type: Web Server
  83. component: Web log
  84. lookup: sum -1m unaligned of bad
  85. calc: $this * 100 / $web_log_1m_requests
  86. units: %
  87. every: 10s
  88. warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
  89. delay: up 2m down 15m multiplier 1.5 max 1h
  90. summary: Web log bad requests
  91. info: Ratio of client error HTTP requests over the last minute (4xx except 401)
  92. to: webmaster
  93. template: web_log_1m_internal_errors
  94. on: web_log.type_requests
  95. class: Errors
  96. type: Web Server
  97. component: Web log
  98. lookup: sum -1m unaligned of error
  99. calc: $this * 100 / $web_log_1m_requests
  100. units: %
  101. every: 10s
  102. warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
  103. crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
  104. delay: up 2m down 15m multiplier 1.5 max 1h
  105. summary: Web log server errors
  106. info: Ratio of server error HTTP requests over the last minute (5xx)
  107. to: webmaster
  108. # -----------------------------------------------------------------------------
  109. # web slow
  110. # the following alarms trigger only when there are enough data.
  111. # we assume there are enough data when:
  112. #
  113. # $web_log_1m_requests > 120
  114. # i.e. when there are more than 120 requests during the last minute.
  115. # the 1m average must exceed both a fixed limit ($green / $red) and a multiple of the 10m average
  116. template: web_log_10m_response_time
  117. on: web_log.request_processing_time
  118. class: Latency
  119. type: Web Server
  120. component: Web log
  121. lookup: average -10m unaligned of avg
  122. units: ms
  123. every: 30s
  124. info: average HTTP response time over the last 10 minutes
  125. template: web_log_web_slow
  126. on: web_log.request_processing_time
  127. class: Latency
  128. type: Web Server
  129. component: Web log
  130. lookup: average -1m unaligned of avg
  131. units: ms
  132. every: 10s
  133. green: 500
  134. red: 1000
  135. warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
  136. crit: ($web_log_1m_requests > 120) ? ($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
  137. delay: down 15m multiplier 1.5 max 1h
  138. summary: Web log processing time
  139. info: Average HTTP response time over the last 1 minute
  140. options: no-clear-notification
  141. to: webmaster
  142. # -----------------------------------------------------------------------------
  143. # web too many or too few requests (last 5 minutes compared with the 5 minutes before them)
  144. # the following alarms trigger only when there are enough data.
  145. # we assume there are enough data when:
  146. #
  147. # $web_log_5m_successful_old > 120
  148. #
  149. # i.e. when successful requests averaged more than 120 requests/s during the
  150. # 5 minutes starting at -10m and ending at -5m
  151. template: web_log_5m_successful_old
  152. on: web_log.type_requests
  153. class: Workload
  154. type: Web Server
  155. component: Web log
  156. lookup: average -5m at -5m unaligned of success
  157. units: requests/s
  158. every: 30s
  159. info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
  160. template: web_log_5m_successful
  161. on: web_log.type_requests
  162. class: Workload
  163. type: Web Server
  164. component: Web log
  165. lookup: average -5m unaligned of success
  166. units: requests/s
  167. every: 30s
  168. info: average number of successful HTTP requests over the last 5 minutes
  169. template: web_log_5m_requests_ratio
  170. on: web_log.type_requests
  171. class: Workload
  172. type: Web Server
  173. component: Web log
  174. calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100)
  175. units: %
  176. every: 30s
  177. warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
  178. crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
  179. delay: down 15m multiplier 1.5 max 1h
  180. options: no-clear-notification
  181. summary: Web log 5 minutes requests ratio
  182. info: Ratio of successful HTTP requests over the last 5 minutes, \
  183. compared with the previous 5 minutes \
  184. (clear notification for this alarm will not be sent)
  185. to: webmaster