postgres.conf 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214
# You can disable an alarm's notifications by setting its 'to' line to: silent
  # Total connection utilization.
  # Value: average of the `used` dimension of postgres.connections_utilization
  # over the last minute, re-evaluated every minute.
  # Thresholds use $status for hysteresis:
  #   WARNING  - raised above 80, kept until the value is no longer above 70.
  #   CRITICAL - raised above 90, kept until the value is no longer above 80.
   template: postgres_total_connection_utilization
         on: postgres.connections_utilization
      class: Utilization
       type: Database
  component: PostgreSQL
      hosts: *
     lookup: average -1m unaligned of used
      units: %
      every: 1m
       warn: $this > (($status >= $WARNING) ? (70) : (80))
       crit: $this > (($status == $CRITICAL) ? (80) : (90))
      delay: down 15m multiplier 1.5 max 1h
       info: average total connection utilization over the last minute
         to: dba
  # Acquired locks utilization.
  # Value: average of the `used` dimension of postgres.locks_utilization over
  # the last minute, re-evaluated every minute.
  # WARNING only (no crit line): raised above 20, kept until the value is no
  # longer above 15.
   template: postgres_acquired_locks_utilization
         on: postgres.locks_utilization
      class: Utilization
       type: Database
  component: PostgreSQL
      hosts: *
     lookup: average -1m unaligned of used
      units: %
      every: 1m
       warn: $this > (($status >= $WARNING) ? (15) : (20))
      delay: down 15m multiplier 1.5 max 1h
       info: average acquired locks utilization over the last minute
         to: dba
  # Transaction ID exhaustion.
  # Value: the chart variable $txid_exhaustion taken as-is (no lookup window).
  # WARNING only, with a fixed threshold: raised whenever the value is above 90.
  # There is no hysteresis and no crit line on this template.
   template: postgres_txid_exhaustion_perc
         on: postgres.txid_exhaustion_perc
      class: Utilization
       type: Database
  component: PostgreSQL
      hosts: *
       calc: $txid_exhaustion
      units: %
      every: 1m
       warn: $this > 90
      delay: down 15m multiplier 1.5 max 1h
       info: percent towards TXID wraparound
         to: dba
# Database alarms
  # Per-database cache hit ratio.
  # Value: the lookup averages the `miss` dimension over the last minute and
  # calc turns it into its complement (100 - miss), so $this in warn/crit is
  # the hit percentage, not the miss percentage.
  # Thresholds fire on LOW values, with hysteresis via $status:
  #   WARNING  - raised below 60, kept until the value is no longer below 70.
  #   CRITICAL - raised below 50, kept until the value is no longer below 60.
   template: postgres_db_cache_io_ratio
         on: postgres.db_cache_io_ratio
      class: Workload
       type: Database
  component: PostgreSQL
      hosts: *
     lookup: average -1m unaligned of miss
       calc: 100 - $this
      units: %
      every: 1m
       warn: $this < (($status >= $WARNING) ? (70) : (60))
       crit: $this < (($status == $CRITICAL) ? (60) : (50))
      delay: down 15m multiplier 1.5 max 1h
       info: average cache hit ratio in db ${label:database} over the last minute
         to: dba
  # Per-database rollback ratio.
  # Value: average of the `rollback` dimension of postgres.db_transactions_ratio
  # over the last five minutes, re-evaluated every minute.
  # WARNING only: raised above 2, and once raised it is kept for as long as the
  # value stays above 0.
   template: postgres_db_transactions_rollback_ratio
         on: postgres.db_transactions_ratio
      class: Workload
       type: Database
  component: PostgreSQL
      hosts: *
     lookup: average -5m unaligned of rollback
      units: %
      every: 1m
       warn: $this > (($status >= $WARNING) ? (0) : (2))
      delay: down 15m multiplier 1.5 max 1h
       info: average aborted transactions percentage in db ${label:database} over the last five minutes
         to: dba
  # Per-database deadlocks.
  # Value: sum (not average) of the `deadlocks` dimension over the last minute.
  # WARNING only: raised above 10, and once raised it is kept for as long as the
  # one-minute sum stays above 0.
   template: postgres_db_deadlocks_rate
         on: postgres.db_deadlocks_rate
      class: Errors
       type: Database
  component: PostgreSQL
      hosts: *
     lookup: sum -1m unaligned of deadlocks
      units: deadlocks
      every: 1m
       warn: $this > (($status >= $WARNING) ? (0) : (10))
      delay: down 15m multiplier 1.5 max 1h
       info: number of deadlocks detected in db ${label:database} in the last minute
         to: dba
# Table alarms
  # Per-table cache hit ratio.
  # Value: one-minute average of the `miss` dimension, converted by calc to its
  # complement (100 - miss); warn/crit therefore compare the hit percentage.
  # Thresholds fire on LOW values, with hysteresis via $status:
  #   WARNING  - raised below 60, kept until the value is no longer below 70.
  #   CRITICAL - raised below 50, kept until the value is no longer below 60.
   template: postgres_table_cache_io_ratio
         on: postgres.table_cache_io_ratio
      class: Workload
       type: Database
  component: PostgreSQL
      hosts: *
     lookup: average -1m unaligned of miss
       calc: 100 - $this
      units: %
      every: 1m
       warn: $this < (($status >= $WARNING) ? (70) : (60))
       crit: $this < (($status == $CRITICAL) ? (60) : (50))
      delay: down 15m multiplier 1.5 max 1h
       info: average cache hit ratio in db ${label:database} table ${label:table} over the last minute
         to: dba
  # Per-table index cache hit ratio.
  # Value: one-minute average of the `miss` dimension of
  # postgres.table_index_cache_io_ratio, converted by calc to 100 - miss, so
  # warn/crit compare the hit percentage.
  # Thresholds fire on LOW values, with hysteresis via $status:
  #   WARNING  - raised below 60, kept until the value is no longer below 70.
  #   CRITICAL - raised below 50, kept until the value is no longer below 60.
   template: postgres_table_index_cache_io_ratio
         on: postgres.table_index_cache_io_ratio
      class: Workload
       type: Database
  component: PostgreSQL
      hosts: *
     lookup: average -1m unaligned of miss
       calc: 100 - $this
      units: %
      every: 1m
       warn: $this < (($status >= $WARNING) ? (70) : (60))
       crit: $this < (($status == $CRITICAL) ? (60) : (50))
      delay: down 15m multiplier 1.5 max 1h
       info: average index cache hit ratio in db ${label:database} table ${label:table} over the last minute
         to: dba
  # Per-table TOAST cache hit ratio.
  # Value: one-minute average of the `miss` dimension of
  # postgres.table_toast_cache_io_ratio, converted by calc to 100 - miss, so
  # warn/crit compare the hit percentage.
  # Thresholds fire on LOW values, with hysteresis via $status:
  #   WARNING  - raised below 60, kept until the value is no longer below 70.
  #   CRITICAL - raised below 50, kept until the value is no longer below 60.
   template: postgres_table_toast_cache_io_ratio
         on: postgres.table_toast_cache_io_ratio
      class: Workload
       type: Database
  component: PostgreSQL
      hosts: *
     lookup: average -1m unaligned of miss
       calc: 100 - $this
      units: %
      every: 1m
       warn: $this < (($status >= $WARNING) ? (70) : (60))
       crit: $this < (($status == $CRITICAL) ? (60) : (50))
      delay: down 15m multiplier 1.5 max 1h
       info: average TOAST hit ratio in db ${label:database} table ${label:table} over the last minute
         to: dba
  # Per-table TOAST index cache hit ratio.
  # Value: one-minute average of the `miss` dimension of
  # postgres.table_toast_index_cache_io_ratio, converted by calc to 100 - miss,
  # so warn/crit compare the hit percentage.
  # Thresholds fire on LOW values, with hysteresis via $status:
  #   WARNING  - raised below 60, kept until the value is no longer below 70.
  #   CRITICAL - raised below 50, kept until the value is no longer below 60.
   template: postgres_table_toast_index_cache_io_ratio
         on: postgres.table_toast_index_cache_io_ratio
      class: Workload
       type: Database
  component: PostgreSQL
      hosts: *
     lookup: average -1m unaligned of miss
       calc: 100 - $this
      units: %
      every: 1m
       warn: $this < (($status >= $WARNING) ? (70) : (60))
       crit: $this < (($status == $CRITICAL) ? (60) : (50))
      delay: down 15m multiplier 1.5 max 1h
       info: average index TOAST hit ratio in db ${label:database} table ${label:table} over the last minute
         to: dba
  # Per-table bloat percentage.
  # Value: $bloat, but only when $table_size exceeds 1024 * 1024 * 100;
  # otherwise calc forces the value to 0, so tables at or under that size can
  # never trip the thresholds.
  # NOTE(review): the size cut-off reads as 100 MiB if $table_size is in bytes -
  # confirm the unit against the collector.
  # Thresholds, with hysteresis via $status:
  #   WARNING  - raised above 70, kept until the value is no longer above 60.
  #   CRITICAL - raised above 80, kept until the value is no longer above 70.
   template: postgres_table_bloat_size_perc
         on: postgres.table_bloat_size_perc
      class: Errors
       type: Database
  component: PostgreSQL
      hosts: *
       calc: ($table_size > (1024 * 1024 * 100)) ? ($bloat) : (0)
      units: %
      every: 1m
       warn: $this > (($status >= $WARNING) ? (60) : (70))
       crit: $this > (($status == $CRITICAL) ? (70) : (80))
      delay: down 15m multiplier 1.5 max 1h
       info: bloat size percentage in db ${label:database} table ${label:table}
         to: dba
  # Time since a table was last autovacuumed.
  # Value: the chart variable $time taken as-is.
  # WARNING only: raised when the value is not nan and is greater than
  # 60 * 60 * 24 * 7 (604800; seven days, given units of seconds).
  # This template has no delay line and no crit line.
  # NOTE(review): `hosts: !*` is a negated match-all pattern, so this template
  # presumably applies to no host, i.e. it is switched off - confirm that this
  # is intentional before relying on the alarm.
   template: postgres_table_last_autovacuum_time
         on: postgres.table_autovacuum_since_time
      class: Errors
       type: Database
  component: PostgreSQL
      hosts: !*
       calc: $time
      units: seconds
      every: 1m
       warn: $this != nan AND $this > (60 * 60 * 24 * 7)
       info: time elapsed since db ${label:database} table ${label:table} was vacuumed by the autovacuum daemon
         to: dba
  # Time since a table was last autoanalyzed.
  # Value: the chart variable $time taken as-is.
  # WARNING only: raised when the value is not nan and is greater than
  # 60 * 60 * 24 * 7 (604800; seven days, given units of seconds).
  # This template has no delay line and no crit line.
  # NOTE(review): `hosts: !*` is a negated match-all pattern, so this template
  # presumably applies to no host, i.e. it is switched off - confirm that this
  # is intentional before relying on the alarm.
   template: postgres_table_last_autoanalyze_time
         on: postgres.table_autoanalyze_since_time
      class: Errors
       type: Database
  component: PostgreSQL
      hosts: !*
       calc: $time
      units: seconds
      every: 1m
       warn: $this != nan AND $this > (60 * 60 * 24 * 7)
       info: time elapsed since db ${label:database} table ${label:table} was analyzed by the autovacuum daemon
         to: dba
# Index alarms
  # Per-index bloat percentage.
  # Value: $bloat, but only when $index_size exceeds 1024 * 1024 * 10;
  # otherwise calc forces the value to 0, so indexes at or under that size can
  # never trip the thresholds.
  # NOTE(review): the size cut-off reads as 10 MiB if $index_size is in bytes -
  # confirm the unit against the collector.
  # Thresholds, with hysteresis via $status:
  #   WARNING  - raised above 70, kept until the value is no longer above 60.
  #   CRITICAL - raised above 80, kept until the value is no longer above 70.
   template: postgres_index_bloat_size_perc
         on: postgres.index_bloat_size_perc
      class: Errors
       type: Database
  component: PostgreSQL
      hosts: *
       calc: ($index_size > (1024 * 1024 * 10)) ? ($bloat) : (0)
      units: %
      every: 1m
       warn: $this > (($status >= $WARNING) ? (60) : (70))
       crit: $this > (($status == $CRITICAL) ? (70) : (80))
      delay: down 15m multiplier 1.5 max 1h
       info: bloat size percentage in db ${label:database} table ${label:table} index ${label:index}
         to: dba