web_log.conf 13 KB


  1. # -----------------------------------------------------------------------------
  2. # high level response code alarms
  3. # the following alarms trigger only when there are enough data.
  4. # we assume there are enough data when:
  5. #
  6. # $1m_requests > 120
  7. #
  8. # i.e. when there are at least 120 requests during the last minute
  # Helper template: sum of web_log.response_statuses over the last minute (no
  # "of" clause, so no single dimension is selected). It has no warn/crit of its
  # own; the other 1m_* templates read it as $1m_requests, both as the divisor
  # of their percentage and as the "enough data" gate (> 120).
  # The calc turns a sum of 0 into 1 so those divisions never divide by zero.
  9. template: 1m_requests
  10. on: web_log.response_statuses
  11. class: Workload
  12. type: Web Server
  13. component: Web log
  14. families: *
  15. lookup: sum -1m unaligned
  16. calc: ($this == 0)?(1):($this)
  17. units: requests
  18. every: 10s
  19. info: number of HTTP requests in the last minute
  # Percentage of successful requests over the last minute:
  # sum of successful_requests * 100 / $1m_requests.
  # warn/crit evaluate to 0 unless $1m_requests > 120.
  # Hysteresis through $status: WARNING is raised below 85% and held while the
  # value stays below 95%; CRITICAL is raised below 75% and held while below 85%.
  20. template: 1m_successful
  21. on: web_log.response_statuses
  22. class: Workload
  23. type: Web Server
  24. component: Web log
  25. families: *
  26. lookup: sum -1m unaligned of successful_requests
  27. calc: $this * 100 / $1m_requests
  28. units: %
  29. every: 10s
  30. warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
  31. crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
  32. delay: up 2m down 15m multiplier 1.5 max 1h
  33. info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
  34. to: webmaster
  # Percentage of redirects over the last minute:
  # sum of redirects * 100 / $1m_requests.
  # Warning only (no crit line), evaluated only when $1m_requests > 120.
  # Hysteresis through $status: raised above 20%, held while the value stays above 1%.
  35. template: 1m_redirects
  36. on: web_log.response_statuses
  37. class: Workload
  38. type: Web Server
  39. component: Web log
  40. families: *
  41. lookup: sum -1m unaligned of redirects
  42. calc: $this * 100 / $1m_requests
  43. units: %
  44. every: 10s
  45. warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
  46. delay: up 2m down 15m multiplier 1.5 max 1h
  47. info: ratio of redirection HTTP requests over the last minute (3xx except 304)
  48. to: webmaster
  # Percentage of client-error requests over the last minute:
  # sum of bad_requests * 100 / $1m_requests.
  # Warning only (no crit line), evaluated only when $1m_requests > 120.
  # Hysteresis through $status: raised above 30%, held while the value stays above 10%.
  49. template: 1m_bad_requests
  50. on: web_log.response_statuses
  51. class: Errors
  52. type: Web Server
  53. component: Web log
  54. families: *
  55. lookup: sum -1m unaligned of bad_requests
  56. calc: $this * 100 / $1m_requests
  57. units: %
  58. every: 10s
  59. warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
  60. delay: up 2m down 15m multiplier 1.5 max 1h
  61. info: ratio of client error HTTP requests over the last minute (4xx except 401)
  62. to: webmaster
  # Percentage of server-error requests over the last minute:
  # sum of server_errors * 100 / $1m_requests.
  # warn/crit evaluate to 0 unless $1m_requests > 120.
  # Hysteresis through $status: WARNING is raised above 2% and held while above 1%;
  # CRITICAL is raised above 5% and held while above 2%.
  63. template: 1m_internal_errors
  64. on: web_log.response_statuses
  65. class: Errors
  66. type: Web Server
  67. component: Web log
  68. families: *
  69. lookup: sum -1m unaligned of server_errors
  70. calc: $this * 100 / $1m_requests
  71. units: %
  72. every: 10s
  73. warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
  74. crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
  75. delay: up 2m down 15m multiplier 1.5 max 1h
  76. info: ratio of server error HTTP requests over the last minute (5xx)
  77. to: webmaster
  78. # unmatched lines
  79. # the following alarms trigger only when there are enough data.
  80. # we assume there are enough data when:
  81. #
  82. # $1m_total_requests > 120
  83. #
  84. # i.e. when there are at least 120 requests during the last minute
  # Helper template: sum of web_log.response_codes over the last minute (no "of"
  # clause, so no single dimension is selected). It has no warn/crit of its own;
  # 1m_unmatched reads it as $1m_total_requests, as divisor and as the
  # "enough data" gate (> 120).
  # The calc turns a sum of 0 into 1 so that division never divides by zero.
  85. template: 1m_total_requests
  86. on: web_log.response_codes
  87. class: Workload
  88. type: Web Server
  89. component: Web log
  90. families: *
  91. lookup: sum -1m unaligned
  92. calc: ($this == 0)?(1):($this)
  93. units: requests
  94. every: 10s
  95. info: number of HTTP requests over the last minute
  # Percentage of the "unmatched" dimension over the last minute:
  # sum of unmatched * 100 / $1m_total_requests.
  # Warning only (no crit line): raised when the value exceeds 1% and
  # $1m_total_requests > 120. There is no $status hysteresis on this one.
  96. template: 1m_unmatched
  97. on: web_log.response_codes
  98. class: Errors
  99. type: Web Server
  100. component: Web log
  101. families: *
  102. lookup: sum -1m unaligned of unmatched
  103. calc: $this * 100 / $1m_total_requests
  104. units: %
  105. every: 10s
  106. warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 )
  107. delay: up 1m down 5m multiplier 1.5 max 1h
  108. info: percentage of unparsed log lines over the last minute
  109. to: webmaster
  110. # -----------------------------------------------------------------------------
  111. # web slow
  112. # the following alarms trigger only when there are enough data.
  113. # we assume there are enough data when:
  114. #
  115. # $1m_requests > 120
  116. #
  117. # i.e. when there are at least 120 requests during the last minute
  # Helper template: average of the "avg" dimension of web_log.response_time over
  # the last 10 minutes. It has no warn/crit of its own; web_slow reads it as
  # $10m_response_time to compare the last minute against this baseline.
  # NOTE(review): type is "System" here while every other template in this file
  # uses "Web Server" - confirm this is intentional.
  118. template: 10m_response_time
  119. on: web_log.response_time
  120. class: Latency
  121. type: System
  122. component: Web log
  123. families: *
  124. lookup: average -10m unaligned of avg
  125. units: ms
  126. every: 30s
  127. info: average HTTP response time over the last 10 minutes
  # Average of the "avg" dimension of web_log.response_time over the last minute.
  # WARNING needs both: value above $green (500) and above 2x $10m_response_time.
  # CRITICAL needs both: value above $red (1000) and above 4x $10m_response_time.
  # Both are evaluated only when $1m_requests > 120; $1m_requests is the helper
  # template defined on web_log.response_statuses.
  # options: no-clear-notification - no notification is sent when the alarm clears.
  128. template: web_slow
  129. on: web_log.response_time
  130. class: Latency
  131. type: Web Server
  132. component: Web log
  133. families: *
  134. lookup: average -1m unaligned of avg
  135. units: ms
  136. every: 10s
  137. green: 500
  138. red: 1000
  139. warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
  140. crit: ($1m_requests > 120) ? ($this > $red && $this > ($10m_response_time * 4) ) : ( 0 )
  141. delay: down 15m multiplier 1.5 max 1h
  142. info: average HTTP response time over the last minute
  143. options: no-clear-notification
  144. to: webmaster
  145. # -----------------------------------------------------------------------------
  146. # web too many or too few requests
  147. # the following alarms trigger only when there are enough data.
  148. # we assume there are enough data when:
  149. #
  150. # $5m_successful_old > 120
  151. #
  152. # i.e. when successful requests averaged more than 120 (in requests/s, the
  153. # unit of 5m_successful_old) over the 5 minutes from -10m to -5m
  # Helper template: average of successful_requests over the 5-minute window that
  # ended 5 minutes ago ("-5m at -5m"), i.e. from -10m to -5m.
  # It has no warn/crit of its own; 5m_requests_ratio reads it as
  # $5m_successful_old, as the baseline and as the "enough data" gate.
  154. template: 5m_successful_old
  155. on: web_log.response_statuses
  156. class: Workload
  157. type: Web Server
  158. component: Web log
  159. families: *
  160. lookup: average -5m at -5m unaligned of successful_requests
  161. units: requests/s
  162. every: 30s
  163. info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
  # Helper template: average of successful_requests over the last 5 minutes.
  # It has no warn/crit of its own; 5m_requests_ratio reads it as $5m_successful
  # and compares it with $5m_successful_old.
  164. template: 5m_successful
  165. on: web_log.response_statuses
  166. class: Workload
  167. type: Web Server
  168. component: Web log
  169. families: *
  170. lookup: average -5m unaligned of successful_requests
  171. units: requests/s
  172. every: 30s
  173. info: average number of successful HTTP requests over the last 5 minutes
  # Traffic change detector. There is no lookup; the value is computed as
  # $5m_successful * 100 / $5m_successful_old, or 100 when $5m_successful_old is
  # not above 0 (which avoids dividing by zero).
  # WARNING when the ratio is above 200% or below 50%; CRITICAL when above 400% or
  # below 25%. Both are evaluated only when $5m_successful_old > 120.
  # NOTE(review): $5m_successful_old is an average in requests/s, so the gate
  # compares a rate, not a request count, with 120 - confirm this is intended.
  # NOTE(review): this template is attached to web_log.response_codes while the
  # two templates it reads are on web_log.response_statuses - confirm the
  # variables resolve across charts.
  174. template: 5m_requests_ratio
  175. on: web_log.response_codes
  176. class: Workload
  177. type: Web Server
  178. component: Web log
  179. families: *
  180. calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
  181. units: %
  182. every: 30s
  183. warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
  184. crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
  185. delay: down 15m multiplier 1.5 max 1h
  186. options: no-clear-notification
  187. info: ratio of successful HTTP requests over the last 5 minutes, \
  188. compared with the previous 5 minutes \
  189. (clear notification for this alarm will not be sent)
  190. to: webmaster
  191. # ---------------------------------------------------GO-VERSION---------------------------------------------------------
  192. # unmatched lines
  193. # the following alarms trigger only when there are enough data.
  194. # we assume there are enough data when:
  195. #
  196. # $web_log_1m_total_requests > 120
  197. #
  198. # i.e. when there are at least 120 requests during the last minute
  # Helper template: sum of web_log.requests over the last minute (no "of" clause,
  # so no single dimension is selected). It has no warn/crit of its own;
  # web_log_1m_unmatched reads it as $web_log_1m_total_requests, as divisor and
  # as the "enough data" gate (> 120).
  # The calc turns a sum of 0 into 1 so that division never divides by zero.
  199. template: web_log_1m_total_requests
  200. on: web_log.requests
  201. class: Workload
  202. type: Web Server
  203. component: Web log
  204. families: *
  205. lookup: sum -1m unaligned
  206. calc: ($this == 0)?(1):($this)
  207. units: requests
  208. every: 10s
  209. info: number of HTTP requests in the last minute
  # Percentage of the "unmatched" dimension of web_log.excluded_requests over the
  # last minute: sum of unmatched * 100 / $web_log_1m_total_requests.
  # Warning only (no crit line): raised when the value exceeds 1% and
  # $web_log_1m_total_requests > 120. There is no $status hysteresis on this one.
  210. template: web_log_1m_unmatched
  211. on: web_log.excluded_requests
  212. class: Errors
  213. type: Web Server
  214. component: Web log
  215. families: *
  216. lookup: sum -1m unaligned of unmatched
  217. calc: $this * 100 / $web_log_1m_total_requests
  218. units: %
  219. every: 10s
  220. warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
  221. delay: up 1m down 5m multiplier 1.5 max 1h
  222. info: percentage of unparsed log lines over the last minute
  223. to: webmaster
  224. # -----------------------------------------------------------------------------
  225. # high level response code alarms
  226. # the following alarms trigger only when there are enough data.
  227. # we assume there are enough data when:
  228. #
  229. # $web_log_1m_requests > 120
  230. #
  231. # i.e. when there are at least 120 requests during the last minute
  # Helper template: sum of web_log.type_requests over the last minute (no "of"
  # clause, so no single dimension is selected). It has no warn/crit of its own;
  # the other web_log_1m_* templates and web_log_web_slow read it as
  # $web_log_1m_requests, as divisor and/or as the "enough data" gate (> 120).
  # The calc turns a sum of 0 into 1 so those divisions never divide by zero.
  232. template: web_log_1m_requests
  233. on: web_log.type_requests
  234. class: Workload
  235. type: Web Server
  236. component: Web log
  237. families: *
  238. lookup: sum -1m unaligned
  239. calc: ($this == 0)?(1):($this)
  240. units: requests
  241. every: 10s
  242. info: number of HTTP requests in the last minute
  # Percentage of successful requests over the last minute:
  # sum of success * 100 / $web_log_1m_requests.
  # warn/crit evaluate to 0 unless $web_log_1m_requests > 120.
  # Hysteresis through $status: WARNING is raised below 85% and held while the
  # value stays below 95%; CRITICAL is raised below 75% and held while below 85%.
  243. template: web_log_1m_successful
  244. on: web_log.type_requests
  245. class: Workload
  246. type: Web Server
  247. component: Web log
  248. families: *
  249. lookup: sum -1m unaligned of success
  250. calc: $this * 100 / $web_log_1m_requests
  251. units: %
  252. every: 10s
  253. warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
  254. crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
  255. delay: up 2m down 15m multiplier 1.5 max 1h
  256. info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
  257. to: webmaster
  # Percentage of redirects over the last minute:
  # sum of redirect * 100 / $web_log_1m_requests.
  # Warning only (no crit line), evaluated only when $web_log_1m_requests > 120.
  # Hysteresis through $status: raised above 20%, held while the value stays above 1%.
  258. template: web_log_1m_redirects
  259. on: web_log.type_requests
  260. class: Workload
  261. type: Web Server
  262. component: Web log
  263. families: *
  264. lookup: sum -1m unaligned of redirect
  265. calc: $this * 100 / $web_log_1m_requests
  266. units: %
  267. every: 10s
  268. warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
  269. delay: up 2m down 15m multiplier 1.5 max 1h
  270. info: ratio of redirection HTTP requests over the last minute (3xx except 304)
  271. to: webmaster
  # Percentage of client-error requests over the last minute:
  # sum of bad * 100 / $web_log_1m_requests.
  # Warning only (no crit line), evaluated only when $web_log_1m_requests > 120.
  # Hysteresis through $status: raised above 30%, held while the value stays above 10%.
  272. template: web_log_1m_bad_requests
  273. on: web_log.type_requests
  274. class: Errors
  275. type: Web Server
  276. component: Web log
  277. families: *
  278. lookup: sum -1m unaligned of bad
  279. calc: $this * 100 / $web_log_1m_requests
  280. units: %
  281. every: 10s
  282. warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
  283. delay: up 2m down 15m multiplier 1.5 max 1h
  284. info: ratio of client error HTTP requests over the last minute (4xx except 401)
  285. to: webmaster
  # Percentage of server-error requests over the last minute:
  # sum of error * 100 / $web_log_1m_requests.
  # warn/crit evaluate to 0 unless $web_log_1m_requests > 120.
  # Hysteresis through $status: WARNING is raised above 2% and held while above 1%;
  # CRITICAL is raised above 5% and held while above 2%.
  286. template: web_log_1m_internal_errors
  287. on: web_log.type_requests
  288. class: Errors
  289. type: Web Server
  290. component: Web log
  291. families: *
  292. lookup: sum -1m unaligned of error
  293. calc: $this * 100 / $web_log_1m_requests
  294. units: %
  295. every: 10s
  296. warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
  297. crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
  298. delay: up 2m down 15m multiplier 1.5 max 1h
  299. info: ratio of server error HTTP requests over the last minute (5xx)
  300. to: webmaster
  301. # -----------------------------------------------------------------------------
  302. # web slow
  303. # the following alarms trigger only when there are enough data.
  304. # we assume there are enough data when:
  305. #
  306. # $web_log_1m_requests > 120
  307. #
  308. # i.e. when there are at least 120 requests during the last minute
  # Helper template: average of the "avg" dimension of
  # web_log.request_processing_time over the last 10 minutes. It has no warn/crit
  # of its own; web_log_web_slow reads it as $web_log_10m_response_time to compare
  # the last minute against this baseline.
  # NOTE(review): type is "System" here while every other template in this file
  # uses "Web Server" - confirm this is intentional.
  309. template: web_log_10m_response_time
  310. on: web_log.request_processing_time
  311. class: Latency
  312. type: System
  313. component: Web log
  314. families: *
  315. lookup: average -10m unaligned of avg
  316. units: ms
  317. every: 30s
  318. info: average HTTP response time over the last 10 minutes
  # Average of the "avg" dimension of web_log.request_processing_time over the
  # last minute.
  # WARNING needs both: value above $green (500) and above 2x $web_log_10m_response_time.
  # CRITICAL needs both: value above $red (1000) and above 4x $web_log_10m_response_time.
  # Both are evaluated only when $web_log_1m_requests > 120; that variable is the
  # helper template defined on web_log.type_requests.
  # options: no-clear-notification - no notification is sent when the alarm clears.
  319. template: web_log_web_slow
  320. on: web_log.request_processing_time
  321. class: Latency
  322. type: Web Server
  323. component: Web log
  324. families: *
  325. lookup: average -1m unaligned of avg
  326. units: ms
  327. every: 10s
  328. green: 500
  329. red: 1000
  330. warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
  331. crit: ($web_log_1m_requests > 120) ? ($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
  332. delay: down 15m multiplier 1.5 max 1h
  333. info: average HTTP response time over the last 1 minute
  334. options: no-clear-notification
  335. to: webmaster
  336. # -----------------------------------------------------------------------------
  337. # web too many or too few requests
  338. # the following alarms trigger only when there are enough data.
  339. # we assume there are enough data when:
  340. #
  341. # $web_log_5m_successful_old > 120
  342. #
  343. # i.e. when successful requests averaged more than 120 (in requests/s, the
  344. # unit of web_log_5m_successful_old) over the 5 minutes from -10m to -5m
  # Helper template: average of the "success" dimension over the 5-minute window
  # that ended 5 minutes ago ("-5m at -5m"), i.e. from -10m to -5m.
  # It has no warn/crit of its own; web_log_5m_requests_ratio reads it as
  # $web_log_5m_successful_old, as the baseline and as the "enough data" gate.
  345. template: web_log_5m_successful_old
  346. on: web_log.type_requests
  347. class: Workload
  348. type: Web Server
  349. component: Web log
  350. families: *
  351. lookup: average -5m at -5m unaligned of success
  352. units: requests/s
  353. every: 30s
  354. info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
  # Helper template: average of the "success" dimension over the last 5 minutes.
  # It has no warn/crit of its own; web_log_5m_requests_ratio reads it as
  # $web_log_5m_successful and compares it with $web_log_5m_successful_old.
  355. template: web_log_5m_successful
  356. on: web_log.type_requests
  357. class: Workload
  358. type: Web Server
  359. component: Web log
  360. families: *
  361. lookup: average -5m unaligned of success
  362. units: requests/s
  363. every: 30s
  364. info: average number of successful HTTP requests over the last 5 minutes
  # Traffic change detector. There is no lookup; the value is computed as
  # $web_log_5m_successful * 100 / $web_log_5m_successful_old, or 100 when
  # $web_log_5m_successful_old is not above 0 (which avoids dividing by zero).
  # WARNING when the ratio is above 200% or below 50%; CRITICAL when above 400% or
  # below 25%. Both are evaluated only when $web_log_5m_successful_old > 120.
  # NOTE(review): $web_log_5m_successful_old is an average in requests/s, so the
  # gate compares a rate, not a request count, with 120 - confirm this is intended.
  365. template: web_log_5m_requests_ratio
  366. on: web_log.type_requests
  367. class: Workload
  368. type: Web Server
  369. component: Web log
  370. families: *
  371. calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100)
  372. units: %
  373. every: 30s
  374. warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
  375. crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
  376. delay: down 15m multiplier 1.5 max 1h
  377. options: no-clear-notification
  378. info: ratio of successful HTTP requests over the last 5 minutes, \
  379. compared with the previous 5 minutes \
  380. (clear notification for this alarm will not be sent)
  381. to: webmaster