# Copyright (C) 2012-2022 Zammad Foundation, https://zammad-foundation.org/

  2. class MonitoringController < ApplicationController
  3. prepend_before_action { authorize! }
  4. prepend_before_action -> { authentication_check }, except: %i[health_check status amount_check]
  5. prepend_before_action -> { authentication_check_only }, only: %i[health_check status amount_check]
  6. skip_before_action :verify_csrf_token
  7. =begin
  8. Resource:
  9. GET /api/v1/monitoring/health_check?token=XXX
  10. Response:
  11. {
  12. "healthy": true,
  13. "message": "success",
  14. }
  15. {
  16. "healthy": false,
  17. "message": "authentication of XXX failed; issue #2",
  18. "issues": ["authentication of XXX failed", "issue #2"],
  19. }
  20. Test:
  21. curl http://localhost/api/v1/monitoring/health_check?token=XXX
  22. =end
  23. def health_check
  24. issues = []
  25. actions = Set.new
  26. # channel check
  27. last_run_tolerance = 1.hour.ago
  28. options_keys = %w[host user uid]
  29. Channel.where(active: true).each do |channel|
  30. # inbound channel
  31. if channel.status_in == 'error'
  32. message = "Channel: #{channel.area} in "
  33. options_keys.each do |key|
  34. next if channel.options[key].blank?
  35. message += "key:#{channel.options[key]};"
  36. end
  37. issues.push "#{message} #{channel.last_log_in}"
  38. end
  39. if channel.preferences && channel.preferences['last_fetch'] && channel.preferences['last_fetch'] < last_run_tolerance
  40. diff = Time.zone.now - channel.preferences['last_fetch']
  41. issues.push "#{message} channel is active but not fetched for #{helpers.time_ago_in_words(Time.zone.now - diff.seconds)}"
  42. end
  43. # outbound channel
  44. next if channel.status_out != 'error'
  45. message = "Channel: #{channel.area} out "
  46. options_keys.each do |key|
  47. next if channel.options[key].blank?
  48. message += "key:#{channel.options[key]};"
  49. end
  50. issues.push "#{message} #{channel.last_log_out}"
  51. end
  52. # unprocessable mail check
  53. directory = Rails.root.join('tmp/unprocessable_mail').to_s
  54. if File.exist?(directory)
  55. count = 0
  56. Dir.glob("#{directory}/*.eml") do |_entry|
  57. count += 1
  58. end
  59. if count.nonzero?
  60. issues.push "unprocessable mails: #{count}"
  61. end
  62. end
  63. # scheduler running check
  64. Scheduler.where('active = ? AND period > 300', true).where.not(last_run: nil).order(last_run: :asc, period: :asc).each do |scheduler|
  65. diff = Time.zone.now - (scheduler.last_run + scheduler.period.seconds)
  66. next if diff < 8.minutes
  67. issues.push "scheduler may not run (last execution of #{scheduler.method} #{helpers.time_ago_in_words(Time.zone.now - diff.seconds)} over) - please contact your system administrator"
  68. break
  69. end
  70. if Scheduler.where(active: true, last_run: nil).count == Scheduler.where(active: true).count
  71. issues.push 'scheduler not running'
  72. end
  73. Scheduler.failed_jobs.each do |job|
  74. issues.push "Failed to run scheduled job '#{job.name}'. Cause: #{job.error_message}"
  75. actions.add(:restart_failed_jobs)
  76. end
  77. # failed jobs check
  78. failed_jobs = Delayed::Job.where('attempts > 0')
  79. count_failed_jobs = failed_jobs.count
  80. if count_failed_jobs > 10
  81. issues.push "#{count_failed_jobs} failing background jobs"
  82. end
  83. handler_attempts_map = {}
  84. failed_jobs.order(:created_at).limit(10).each do |job|
  85. job_name = if job.instance_of?(Delayed::Backend::ActiveRecord::Job) && job.payload_object.respond_to?(:job_data)
  86. job.payload_object.job_data['job_class']
  87. else
  88. job.name
  89. end
  90. handler_attempts_map[job_name] ||= {
  91. count: 0,
  92. attempts: 0,
  93. }
  94. handler_attempts_map[job_name][:count] += 1
  95. handler_attempts_map[job_name][:attempts] += job.attempts
  96. end
  97. handler_attempts_map.sort.to_h.each_with_index do |(job_name, job_data), index|
  98. issues.push "Failed to run background job ##{index + 1} '#{job_name}' #{job_data[:count]} time(s) with #{job_data[:attempts]} attempt(s)."
  99. end
  100. # job count check
  101. total_jobs = Delayed::Job.where('created_at < ?', 15.minutes.ago).count
  102. if total_jobs > 8000
  103. issues.push "#{total_jobs} background jobs in queue"
  104. end
  105. # import jobs
  106. import_backends = ImportJob.backends
  107. # failed import jobs
  108. import_backends.each do |backend|
  109. job = ImportJob.where(
  110. name: backend,
  111. dry_run: false,
  112. ).where('finished_at >= ?', 5.minutes.ago).limit(1).first
  113. next if job.blank?
  114. next if !job.result.is_a?(Hash)
  115. error_message = job.result[:error]
  116. next if error_message.blank?
  117. issues.push "Failed to run import backend '#{backend}'. Cause: #{error_message}"
  118. end
  119. # stuck import jobs
  120. import_backends.each do |backend| # rubocop:disable Style/CombinableLoops
  121. job = ImportJob.where(
  122. name: backend,
  123. dry_run: false,
  124. finished_at: nil,
  125. ).where('updated_at <= ?', 5.minutes.ago).limit(1).first
  126. next if job.blank?
  127. issues.push "Stuck import backend '#{backend}' detected. Last update: #{job.updated_at}"
  128. end
  129. # stuck data privacy tasks
  130. DataPrivacyTask.where.not(state: 'completed').where('updated_at <= ?', 30.minutes.ago).find_each do |task|
  131. issues.push "Stuck data privacy task (ID #{task.id}) detected. Last update: #{task.updated_at}"
  132. end
  133. token = Setting.get('monitoring_token')
  134. if issues.blank?
  135. result = {
  136. healthy: true,
  137. message: 'success',
  138. issues: issues,
  139. token: token,
  140. }
  141. render json: result
  142. return
  143. end
  144. result = {
  145. healthy: false,
  146. message: issues.join(';'),
  147. issues: issues,
  148. actions: actions,
  149. token: token,
  150. }
  151. render json: result
  152. end
  153. =begin
  154. Resource:
  155. GET /api/v1/monitoring/status?token=XXX
  156. Response:
  157. {
  158. "agents": 8123,
  159. "last_login": "2016-11-21T14:14:14Z",
  160. "counts": {
  161. "users": 12313,
  162. "tickets": 23123,
  163. "ticket_articles": 131451,
  164. },
  165. "last_created_at": {
  166. "users": "2016-11-21T14:14:14Z",
  167. "tickets": "2016-11-21T14:14:14Z",
  168. "ticket_articles": "2016-11-21T14:14:14Z",
  169. },
  170. }
  171. Test:
  172. curl http://localhost/api/v1/monitoring/status?token=XXX
  173. =end
  174. def status
  175. last_login = nil
  176. last_login_user = User.where.not(last_login: nil).order(last_login: :desc).limit(1).first
  177. if last_login_user
  178. last_login = last_login_user.last_login
  179. end
  180. status = {
  181. counts: {},
  182. last_created_at: {},
  183. last_login: last_login,
  184. agents: User.with_permissions('ticket.agent').count,
  185. }
  186. map = {
  187. users: User,
  188. groups: Group,
  189. overviews: Overview,
  190. tickets: Ticket,
  191. ticket_articles: Ticket::Article,
  192. text_modules: TextModule,
  193. taskbars: Taskbar,
  194. object_manager_attributes: ObjectManager::Attribute,
  195. knowledge_base_categories: KnowledgeBase::Category,
  196. knowledge_base_answers: KnowledgeBase::Answer,
  197. }
  198. map.each do |key, class_name|
  199. status[:counts][key] = class_name.count
  200. last = class_name.last
  201. status[:last_created_at][key] = last&.created_at
  202. end
  203. if ActiveRecord::Base.connection_db_config.configuration_hash[:adapter] == 'postgresql'
  204. sql = 'SELECT SUM(CAST(coalesce(size, \'0\') AS INTEGER)) FROM stores'
  205. records_array = ActiveRecord::Base.connection.exec_query(sql)
  206. if records_array[0] && records_array[0]['sum']
  207. sum = records_array[0]['sum']
  208. status[:storage] = {
  209. kB: sum / 1024,
  210. MB: sum / 1024 / 1024,
  211. GB: sum / 1024 / 1024 / 1024,
  212. }
  213. end
  214. end
  215. render json: status
  216. end
  217. =begin
  218. get counts about created ticket in certain time slot. s, m, h and d possible.
  219. Resource:
  220. GET /api/v1/monitoring/amount_check?token=XXX&max_warning=2000&max_critical=3000&periode=1h
  221. GET /api/v1/monitoring/amount_check?token=XXX&min_warning=2000&min_critical=3000&periode=1h
  222. GET /api/v1/monitoring/amount_check?token=XXX&periode=1h
  223. Response:
  224. {
  225. "state": "ok",
  226. "message": "",
  227. "count": 123,
  228. }
  229. {
  230. "state": "warning",
  231. "message": "limit of 2000 tickets in 1h reached",
  232. "count": 123,
  233. }
  234. {
  235. "state": "critical",
  236. "message": "limit of 3000 tickets in 1h reached",
  237. "count": 123,
  238. }
  239. Test:
  240. curl http://localhost/api/v1/monitoring/amount_check?token=XXX&max_warning=2000&max_critical=3000&periode=1h
  241. curl http://localhost/api/v1/monitoring/amount_check?token=XXX&min_warning=2000&min_critical=3000&periode=1h
  242. curl http://localhost/api/v1/monitoring/amount_check?token=XXX&periode=1h
  243. =end
  244. def amount_check
  245. raise Exceptions::UnprocessableEntity, 'periode is missing!' if params[:periode].blank?
  246. scale = params[:periode][-1, 1]
  247. raise Exceptions::UnprocessableEntity, 'periode need to have s, m, h or d as last!' if !scale.match?(%r{^(s|m|h|d)$})
  248. periode = params[:periode][0, params[:periode].length - 1]
  249. raise Exceptions::UnprocessableEntity, 'periode needs to be an integer!' if periode.to_i.zero?
  250. case scale
  251. when 's'
  252. created_at = Time.zone.now - periode.to_i.seconds
  253. when 'm'
  254. created_at = Time.zone.now - periode.to_i.minutes
  255. when 'h'
  256. created_at = Time.zone.now - periode.to_i.hours
  257. when 'd'
  258. created_at = Time.zone.now - periode.to_i.days
  259. end
  260. map = [
  261. { param: :max_critical, notice: 'critical', type: 'gt' },
  262. { param: :min_critical, notice: 'critical', type: 'lt' },
  263. { param: :max_warning, notice: 'warning', type: 'gt' },
  264. { param: :min_warning, notice: 'warning', type: 'lt' },
  265. ]
  266. result = {}
  267. state_param = false
  268. map.each do |row|
  269. next if params[row[:param]].blank?
  270. raise Exceptions::UnprocessableEntity, "#{row[:param]} needs to be an integer!" if params[row[:param]].to_i.zero?
  271. state_param = true
  272. count = Ticket.where('created_at >= ?', created_at).count
  273. if row[:type] == 'gt'
  274. if count > params[row[:param]].to_i
  275. result = {
  276. state: row[:notice],
  277. message: "The limit of #{params[row[:param]]} was exceeded with #{count} in the last #{params[:periode]}",
  278. count: count,
  279. }
  280. break
  281. end
  282. next
  283. end
  284. next if count > params[row[:param]].to_i
  285. result = {
  286. state: row[:notice],
  287. message: "The minimum of #{params[row[:param]]} was undercut by #{count} in the last #{params[:periode]}",
  288. count: count,
  289. }
  290. break
  291. end
  292. if result.blank?
  293. result = {
  294. state: 'ok',
  295. count: Ticket.where('created_at >= ?', created_at).count,
  296. }
  297. end
  298. if state_param == false
  299. result.delete(:state)
  300. end
  301. render json: result
  302. end
  303. def token
  304. token = SecureRandom.urlsafe_base64(40)
  305. Setting.set('monitoring_token', token)
  306. result = {
  307. token: token,
  308. }
  309. render json: result, status: :created
  310. end
  311. def restart_failed_jobs
  312. Scheduler.restart_failed_jobs
  313. render json: {}, status: :ok
  314. end
  315. end