# monitoring_controller.rb
# Copyright (C) 2012-2016 Zammad Foundation, http://zammad-foundation.org/
  # Monitoring endpoints: a health check, instance statistics, a ticket
  # amount check, and admin actions to rotate the monitoring token and to
  # restart failed scheduler jobs.
  #
  # health_check, status and amount_check accept either a session/user with
  # the 'admin.monitoring' permission or the monitoring token passed as
  # `?token=`. All other actions go through the regular authentication check.
  class MonitoringController < ApplicationController
    prepend_before_action -> { authentication_check(permission: 'admin.monitoring') }, except: %i[health_check status amount_check]
    skip_before_action :verify_csrf_token

    # Suffixes accepted as last character of the `periode` parameter, mapped to
    # the Integer duration method used to build the time window.
    PERIODE_SCALES = {
      's' => :seconds,
      'm' => :minutes,
      'h' => :hours,
      'd' => :days,
    }.freeze

    # Thresholds evaluated by amount_check, in priority order (critical first).
    # type 'gt' fires when the count is above the limit, 'lt' when the count is
    # not above the limit.
    AMOUNT_THRESHOLDS = [
      { param: :max_critical, notice: 'critical', type: 'gt' },
      { param: :min_critical, notice: 'critical', type: 'lt' },
      { param: :max_warning, notice: 'warning', type: 'gt' },
      { param: :min_warning, notice: 'warning', type: 'lt' },
    ].freeze

    # Resource:
    # GET /api/v1/monitoring/health_check?token=XXX
    #
    # Response (no issues found):
    # {
    #   "healthy": true,
    #   "message": "success",
    #   "token": "XXX"
    # }
    #
    # Response (issues found, "message" is the issues joined with ";"):
    # {
    #   "healthy": false,
    #   "message": "authentication of XXX failed;issue #2",
    #   "issues": ["authentication of XXX failed", "issue #2"],
    #   "actions": ["restart_failed_jobs"],
    #   "token": "XXX"
    # }
    #
    # Test:
    # curl http://localhost/api/v1/monitoring/health_check?token=XXX
    def health_check
      token_or_permission_check

      # Follow-up actions offered to the client; filled by the scheduler check.
      actions = Set.new

      # The order of the checks defines the order of the reported issues.
      issues = [
        channel_issues,
        unprocessable_mail_issues,
        scheduler_issues(actions),
        background_job_issues,
        import_job_issues,
      ].flatten

      monitoring_token = Setting.get('monitoring_token')

      if issues.blank?
        render json: {
          healthy: true,
          message: 'success',
          token: monitoring_token,
        }
        return
      end

      render json: {
        healthy: false,
        message: issues.join(';'),
        issues: issues,
        actions: actions,
        token: monitoring_token,
      }
    end

    # Resource:
    # GET /api/v1/monitoring/status?token=XXX
    #
    # Response ("counts" and "last_created_at" carry one entry for each of
    # users, groups, overviews, tickets, ticket_articles and text_modules;
    # "storage" is only present when the database adapter is postgresql):
    # {
    #   "agents": 8123,
    #   "last_login": "2016-11-21T14:14:14Z",
    #   "counts": {
    #     "users": 12313,
    #     "tickets": 23123,
    #     "ticket_articles": 131451
    #   },
    #   "last_created_at": {
    #     "users": "2016-11-21T14:14:14Z",
    #     "tickets": "2016-11-21T14:14:14Z",
    #     "ticket_articles": "2016-11-21T14:14:14Z"
    #   },
    #   "storage": { "kB": 1024, "MB": 1, "GB": 0 }
    # }
    #
    # Test:
    # curl http://localhost/api/v1/monitoring/status?token=XXX
    def status
      token_or_permission_check

      # Most recent login over all users, nil if nobody has logged in yet.
      last_login = User.where.not(last_login: nil).order(last_login: :desc).first&.last_login

      result = {
        counts: {},
        last_created_at: {},
        last_login: last_login,
        agents: User.with_permissions('ticket.agent').count,
      }

      models = {
        users: User,
        groups: Group,
        overviews: Overview,
        tickets: Ticket,
        ticket_articles: Ticket::Article,
        text_modules: TextModule,
      }
      models.each do |key, model|
        result[:counts][key] = model.count
        result[:last_created_at][key] = model.last&.created_at
      end

      storage = storage_usage
      result[:storage] = storage if storage

      render json: result
    end

    # Get the count of tickets created in a certain time slot and compare it
    # with optional limits. The `periode` parameter is an integer followed by
    # s, m, h or d. Limits are checked in the order max_critical, min_critical,
    # max_warning, min_warning; the first limit that fires wins.
    #
    # Resource:
    # GET /api/v1/monitoring/amount_check?token=XXX&max_warning=2000&max_critical=3000&periode=1h
    # GET /api/v1/monitoring/amount_check?token=XXX&min_warning=2000&min_critical=3000&periode=1h
    # GET /api/v1/monitoring/amount_check?token=XXX&periode=1h
    #
    # Response:
    # {
    #   "state": "ok",
    #   "message": "",
    #   "count": 123
    # }
    #
    # {
    #   "state": "warning",
    #   "message": "The limit of 2000 was exceeded with 2500 in the last 1h",
    #   "count": 2500
    # }
    #
    # {
    #   "state": "critical",
    #   "message": "The minimum of 3000 was undercut by 123 in the last 1h",
    #   "count": 123
    # }
    #
    # Raises Exceptions::UnprocessableEntity if periode is missing or malformed
    # or if a given limit evaluates to 0 via to_i.
    #
    # Test:
    # curl http://localhost/api/v1/monitoring/amount_check?token=XXX&max_warning=2000&max_critical=3000&periode=1h
    # curl http://localhost/api/v1/monitoring/amount_check?token=XXX&min_warning=2000&min_critical=3000&periode=1h
    # curl http://localhost/api/v1/monitoring/amount_check?token=XXX&periode=1h
    def amount_check
      token_or_permission_check

      raise Exceptions::UnprocessableEntity, 'periode is missing!' if params[:periode].blank?

      scale = PERIODE_SCALES[params[:periode][-1, 1]]
      raise Exceptions::UnprocessableEntity, 'periode need to have s, m, h or d as last!' if scale.nil?

      amount = params[:periode][0...-1].to_i
      raise Exceptions::UnprocessableEntity, 'periode need to be an integer!' if amount.zero?

      created_at = Time.zone.now - amount.public_send(scale)

      # One query for the whole request, so every threshold (and the reported
      # count) is evaluated against the same number.
      count = Ticket.where('created_at >= ?', created_at).count

      result = nil
      AMOUNT_THRESHOLDS.each do |row|
        raw_limit = params[row[:param]]
        next if raw_limit.blank?

        limit = raw_limit.to_i
        raise Exceptions::UnprocessableEntity, "#{row[:param]} need to be an integer!" if limit.zero?

        if row[:type] == 'gt'
          next if count <= limit

          message = "The limit of #{raw_limit} was exceeded with #{count} in the last #{params[:periode]}"
        else
          next if count > limit

          message = "The minimum of #{raw_limit} was undercut by #{count} in the last #{params[:periode]}"
        end

        result = {
          state: row[:notice],
          message: message,
          count: count,
        }
        break
      end

      result ||= {
        state: 'ok',
        message: '',
        count: count,
      }

      render json: result
    end

    # Generates a new random monitoring token, stores it in the
    # 'monitoring_token' setting and returns it with status 201.
    def token
      access_check

      new_token = SecureRandom.urlsafe_base64(40)
      Setting.set('monitoring_token', new_token)

      render json: { token: new_token }, status: :created
    end

    # Asks the scheduler to restart its failed jobs; responds with an empty
    # JSON object.
    def restart_failed_jobs
      access_check

      Scheduler.restart_failed_jobs

      render json: {}, status: :ok
    end

    private

    # Allows the request if it is authenticated with the 'admin.monitoring'
    # permission or carries the valid monitoring token.
    # Raises Exceptions::NotAuthorized otherwise.
    def token_or_permission_check
      return if authentication_check_only(permission: 'admin.monitoring')
      return if monitoring_token_valid?(params[:token])

      raise Exceptions::NotAuthorized
    end

    # True only if a monitoring token is configured and the given value is a
    # String equal to it. A blank configured token never matches, so a missing
    # `token` parameter cannot authenticate against an unset setting. The
    # comparison is constant-time to avoid leaking the token via timing.
    def monitoring_token_valid?(given_token)
      expected_token = Setting.get('monitoring_token')
      return false if expected_token.blank?
      return false if !given_token.is_a?(String)

      ActiveSupport::SecurityUtils.secure_compare(expected_token.to_s, given_token)
    end

    # Passes if an active 'admin.monitoring' permission record exists. This
    # does not look at the current user; the actions using it are covered by
    # the authentication_check before action declared at the top of the class.
    # Raises Exceptions::NotAuthorized otherwise.
    def access_check
      return if Permission.find_by(name: 'admin.monitoring', active: true)

      raise Exceptions::NotAuthorized
    end

    # Issues of active channels: inbound errors, channels not fetched within
    # the last hour and outbound errors (in this order per channel).
    def channel_issues
      last_run_tolerance = Time.zone.now - 1.hour

      Channel.where(active: true).each_with_object([]) do |channel, issues|
        inbound_label = channel_label(channel, 'in')

        if channel.status_in == 'error'
          issues.push "#{inbound_label} #{channel.last_log_in}"
        end

        last_fetch = channel.preferences && channel.preferences['last_fetch']
        if last_fetch && last_fetch < last_run_tolerance
          issues.push "#{inbound_label} channel is active but not fetched for #{helpers.time_ago_in_words(last_fetch)}"
        end

        next if channel.status_out != 'error'

        issues.push "#{channel_label(channel, 'out')} #{channel.last_log_out}"
      end
    end

    # Builds the "Channel: <area> <direction> key:<value>;..." prefix from the
    # non-blank host, user and uid options of the channel.
    # NOTE(review): every value is prefixed with the literal text "key:" (not
    # the option name); kept as is so existing monitoring output is unchanged.
    def channel_label(channel, direction)
      details = %w[host user uid].map { |key| channel.options[key] }
                                 .reject(&:blank?)
                                 .map { |value| "key:#{value};" }
                                 .join

      "Channel: #{channel.area} #{direction} #{details}"
    end

    # Reports the number of *.eml files in tmp/unprocessable_mail, if any.
    def unprocessable_mail_issues
      directory = Rails.root.join('tmp', 'unprocessable_mail').to_s
      return [] if !File.exist?(directory)

      count = Dir.glob("#{directory}/*.eml").size
      return [] if count.zero?

      ["unprocessable mails: #{count}"]
    end

    # Scheduler related issues: overdue schedulers, scheduler never run and
    # failed scheduled jobs. Adds :restart_failed_jobs to the given actions set
    # for every failed scheduled job.
    def scheduler_issues(actions)
      issues = []

      # Only the first scheduler that is at least 8 minutes over its period is reported.
      Scheduler.where('active = ? AND period > 300', true).where.not(last_run: nil).order(last_run: :asc, period: :asc).each do |scheduler|
        overdue_by = Time.zone.now - (scheduler.last_run + scheduler.period.seconds)
        next if overdue_by < 8.minutes

        issues.push "scheduler may not run (last execution of #{scheduler.method} #{helpers.time_ago_in_words(Time.zone.now - overdue_by.seconds)} over) - please contact your system administrator"
        break
      end

      # No active scheduler has a last_run at all.
      if Scheduler.where(active: true, last_run: nil).count == Scheduler.where(active: true).count
        issues.push 'scheduler not running'
      end

      Scheduler.failed_jobs.each do |job|
        issues.push "Failed to run scheduled job '#{job.name}'. Cause: #{job.error_message}"
        actions.add(:restart_failed_jobs)
      end

      issues
    end

    # Delayed::Job related issues: more than 10 failing jobs, a per-name
    # summary of up to 10 failing jobs and a queue of more than 8000 jobs
    # older than 15 minutes.
    def background_job_issues
      issues = []

      failed_jobs = Delayed::Job.where('attempts > 0')
      failed_total = failed_jobs.count
      if failed_total > 10
        issues.push "#{failed_total} failing background jobs"
      end

      # Group the listed jobs by name, most frequent name first.
      failed_jobs.select(:handler, :attempts).limit(10)
                 .group_by(&:name)
                 .sort_by { |_name, jobs| jobs.length }
                 .reverse_each.with_index(1) do |(name, jobs), position|
        issues.push "Failed to run background job ##{position} '#{name}' #{jobs.count} time(s) with #{jobs.sum(&:attempts)} attempt(s)."
      end

      queued_total = Delayed::Job.where('created_at < ?', Time.zone.now - 15.minutes).count
      if queued_total > 8000
        issues.push "#{queued_total} background jobs in queue"
      end

      issues
    end

    # Import related issues per backend: imports finished within the last 5
    # minutes with an error result, followed by unfinished imports without an
    # update for 5 minutes.
    def import_job_issues
      issues = []
      import_backends = ImportJob.backends

      # failed import jobs
      import_backends.each do |backend|
        job = ImportJob.where(
          name: backend,
          dry_run: false,
        ).where('finished_at >= ?', 5.minutes.ago).limit(1).first
        next if job.blank?
        next if !job.result.is_a?(Hash)

        error_message = job.result[:error]
        next if error_message.blank?

        issues.push "Failed to run import backend '#{backend}'. Cause: #{error_message}"
      end

      # stuck import jobs
      import_backends.each do |backend|
        job = ImportJob.where(
          name: backend,
          dry_run: false,
          finished_at: nil,
        ).where('updated_at <= ?', 5.minutes.ago).limit(1).first
        next if job.blank?

        issues.push "Stuck import backend '#{backend}' detected. Last update: #{job.updated_at}"
      end

      issues
    end

    # Summed store size as kB/MB/GB (integer division) or nil if the database
    # adapter is not postgresql or the query returns no sum.
    def storage_usage
      return if ActiveRecord::Base.connection_config[:adapter] != 'postgresql'

      sql = "SELECT SUM(CAST(coalesce(size, '0') AS INTEGER)) FROM stores WHERE id IN (SELECT DISTINCT(store_file_id) FROM stores)"
      row = ActiveRecord::Base.connection.exec_query(sql)[0]
      bytes = row && row['sum']
      return if !bytes

      {
        kB: bytes / 1024,
        MB: bytes / 1024 / 1024,
        GB: bytes / 1024 / 1024 / 1024,
      }
    end
  end