monitoring_controller.rb 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. # Copyright (C) 2012-2016 Zammad Foundation, http://zammad-foundation.org/
  2. class MonitoringController < ApplicationController
  3. prepend_before_action -> { authentication_check(permission: 'admin.monitoring') }, except: %i[health_check status]
  4. skip_before_action :verify_csrf_token
=begin

Resource:
GET /api/v1/monitoring/health_check?token=XXX

Response (no issues found):
{
  "healthy": true,
  "message": "success",
  "token": "XXX"
}

Response (issues found; "message" is the list of issues joined with ";"):
{
  "healthy": false,
  "message": "authentication of XXX failed;issue #2",
  "issues": ["authentication of XXX failed", "issue #2"],
  "actions": ["restart_failed_jobs"],
  "token": "XXX"
}

Test:
curl http://localhost/api/v1/monitoring/health_check?token=XXX

=end
  # Renders a JSON health report for the installation.
  #
  # Access requires either a user with the 'admin.monitoring' permission or
  # the monitoring token (see token_or_permission_check).
  #
  # The checks run in a fixed order and each contributes human readable issue
  # strings: channels, unprocessable mails, scheduler, failed scheduler jobs,
  # failing background jobs and import jobs.
  #
  # Rendered JSON:
  #   no issues: { healthy: true,  message: 'success', token: ... }
  #   issues:    { healthy: false, message: <issues joined with ';'>,
  #                issues: [...], actions: [...], token: ... }
  def health_check
    token_or_permission_check

    issues  = []
    actions = Set.new

    issues.concat(health_check_channel_issues)
    issues.concat(health_check_unprocessable_mail_issues)
    issues.concat(health_check_scheduler_issues)

    # Failed scheduler jobs are the only check that also suggests an action.
    failed_scheduler_job_issues = health_check_failed_scheduler_job_issues
    if failed_scheduler_job_issues.any?
      issues.concat(failed_scheduler_job_issues)
      actions.add(:restart_failed_jobs)
    end

    issues.concat(health_check_background_job_issues)
    issues.concat(health_check_import_job_issues)

    token = Setting.get('monitoring_token')

    if issues.empty?
      render json: { healthy: true, message: 'success', token: token }
      return
    end

    render json: {
      healthy: false,
      message: issues.join(';'),
      issues:  issues,
      actions: actions,
      token:   token,
    }
  end

  # Issues of active channels: inbound errors, inbound channels whose last
  # fetch is older than one hour, and outbound errors.
  private def health_check_channel_issues
    issues             = []
    last_run_tolerance = Time.zone.now - 1.hour

    Channel.where(active: true).each do |channel|
      # The options are now read for every channel (not only failing ones),
      # so guard against a channel without options.
      options = channel.options || {}

      # NOTE(review): every value is prefixed with the literal text "key:"
      # rather than the option name. Kept as-is because monitoring consumers
      # may match on these strings.
      details = %w[host user uid].map { |key| options[key] }
                                 .reject(&:blank?)
                                 .map { |value| "key:#{value};" }
                                 .join

      # Built unconditionally: the "not fetched" issue below needs the label
      # even when the inbound side is not in error state. Previously it
      # interpolated a variable that was only set in the error branch.
      inbound_label = "Channel: #{channel.area} in #{details}"

      # inbound channel
      if channel.status_in == 'error'
        issues.push "#{inbound_label} #{channel.last_log_in}"
      end

      last_fetch = channel.preferences && channel.preferences['last_fetch']
      if last_fetch && last_fetch < last_run_tolerance
        issues.push "#{inbound_label} channel is active but not fetched for 1 hour"
      end

      # outbound channel
      next if channel.status_out != 'error'

      issues.push "Channel: #{channel.area} out #{details} #{channel.last_log_out}"
    end

    issues
  end

  # Issue for *.eml files found in tmp/unprocessable_mail (if the directory exists).
  private def health_check_unprocessable_mail_issues
    directory = Rails.root.join('tmp', 'unprocessable_mail').to_s
    return [] if !File.exist?(directory)

    count = Dir.glob("#{directory}/*.eml").size
    return [] if count.zero?

    ["unprocessable mails: #{count}"]
  end

  # Issue when the scheduler looks stalled: either an active scheduler with a
  # period above 300 seconds is more than five minutes overdue, or no active
  # scheduler has ever run.
  private def health_check_scheduler_issues
    issues            = []
    active_schedulers = Scheduler.where(active: true)

    overdue = active_schedulers.where.not(last_run: nil).to_a.any? do |scheduler|
      scheduler.period > 300 &&
        scheduler.last_run + scheduler.period.seconds <= Time.zone.now - 5.minutes
    end
    issues.push 'scheduler not running' if overdue

    # Also true when there is no active scheduler at all (0 == 0), as before.
    if active_schedulers.where(last_run: nil).count == active_schedulers.count
      issues.push 'scheduler not running'
    end

    issues
  end

  # One issue per entry of Scheduler.failed_jobs.
  private def health_check_failed_scheduler_job_issues
    Scheduler.failed_jobs.map do |job|
      "Failed to run scheduled job '#{job.name}'. Cause: #{job.error_message}"
    end
  end

  # Issues for Delayed::Job records with at least one attempt: a summary when
  # there are more than ten, plus a per-name breakdown of up to ten listed jobs.
  private def health_check_background_job_issues
    issues      = []
    failed_jobs = Delayed::Job.where('attempts > 0')

    count_failed_jobs = failed_jobs.count
    if count_failed_jobs > 10
      issues.push "#{count_failed_jobs} failing background jobs."
    end

    jobs_by_name = failed_jobs.select(:handler, :attempts).limit(10).group_by(&:name)

    # Most frequently failing job name first; positions are reported 1-based.
    jobs_by_name.sort_by { |_name, jobs| jobs.length }.reverse.each.with_index(1) do |(name, jobs), position|
      attempts = jobs.sum(&:attempts)
      issues.push "Failed to run background job ##{position} '#{name}' #{jobs.count} time(s) with #{attempts} attempt(s)."
    end

    issues
  end

  # Issues for import backends: first all backends whose job finished within
  # the last five minutes with an error result, then all backends with an
  # unfinished job that was not updated for five minutes.
  private def health_check_import_job_issues
    issues          = []
    import_backends = ImportJob.backends

    # failed import jobs
    import_backends.each do |backend|
      job = ImportJob.where(
        name:    backend,
        dry_run: false,
      ).where('finished_at >= ?', 5.minutes.ago).limit(1).first

      next if job.blank?
      next if !job.result.is_a?(Hash)

      error_message = job.result[:error]
      next if error_message.blank?

      issues.push "Failed to run import backend '#{backend}'. Cause: #{error_message}"
    end

    # stuck import jobs
    import_backends.each do |backend|
      job = ImportJob.where(
        name:        backend,
        dry_run:     false,
        finished_at: nil,
      ).where('updated_at <= ?', 5.minutes.ago).limit(1).first

      next if job.blank?

      issues.push "Stuck import backend '#{backend}' detected. Last update: #{job.updated_at}"
    end

    issues
  end
=begin

Resource:
GET /api/v1/monitoring/status?token=XXX

Response:
{
  "agents": 8123,
  "last_login": "2016-11-21T14:14:14Z",
  "counts": {
    "users": 12313,
    "groups": 12,
    "overviews": 10,
    "tickets": 23123,
    "ticket_articles": 131451
  },
  "last_created_at": {
    "users": "2016-11-21T14:14:14Z",
    "groups": "2016-11-21T14:14:14Z",
    "overviews": "2016-11-21T14:14:14Z",
    "tickets": "2016-11-21T14:14:14Z",
    "ticket_articles": "2016-11-21T14:14:14Z"
  },
  "storage": {
    "kB": 1048576,
    "MB": 1024,
    "GB": 1
  }
}

The "storage" entry is only included when the database adapter is PostgreSQL.

Test:
curl http://localhost/api/v1/monitoring/status?token=XXX

=end
  # Renders usage statistics as JSON: record counts and the created_at of the
  # last record for users, groups, overviews, tickets and ticket articles, the
  # most recent user login, the number of users with the 'ticket.agent'
  # permission and, on PostgreSQL only, the summed store size.
  #
  # Access requires either a user with the 'admin.monitoring' permission or
  # the monitoring token (see token_or_permission_check).
  def status
    token_or_permission_check

    newest_login_user = User.where('last_login IS NOT NULL').order(last_login: :desc).limit(1).first

    summary = {
      counts:          {},
      last_created_at: {},
      last_login:      newest_login_user&.last_login,
      agents:          User.with_permissions('ticket.agent').count,
    }

    models = {
      users:           User,
      groups:          Group,
      overviews:       Overview,
      tickets:         Ticket,
      ticket_articles: Ticket::Article,
    }
    models.each do |key, model|
      summary[:counts][key]          = model.count
      summary[:last_created_at][key] = model.last&.created_at
    end

    # The storage figures rely on a raw SQL query that is only issued on PostgreSQL.
    if ActiveRecord::Base.connection_config[:adapter] == 'postgresql'
      sql       = "SELECT SUM(CAST(coalesce(size, '0') AS INTEGER)) FROM stores WHERE id IN (SELECT DISTINCT(store_file_id) FROM stores)"
      first_row = ActiveRecord::Base.connection.exec_query(sql)[0]
      total     = first_row && first_row['sum']

      if total
        summary[:storage] = {
          kB: total / 1024,
          MB: total / 1024 / 1024,
          GB: total / 1024 / 1024 / 1024,
        }
      end
    end

    render json: summary
  end
  # Generates a new random monitoring token, stores it in the
  # 'monitoring_token' setting and renders it as { token: ... } with
  # status 201 (created).
  #
  # Raises Exceptions::NotAuthorized via access_check.
  def token
    access_check

    new_token = SecureRandom.urlsafe_base64(40)
    Setting.set('monitoring_token', new_token)

    render json: { token: new_token }, status: :created
  end
  # Asks the Scheduler to restart its failed jobs and renders an empty JSON
  # object with status 200 (ok).
  #
  # Raises Exceptions::NotAuthorized via access_check.
  def restart_failed_jobs
    access_check

    Scheduler.restart_failed_jobs

    render status: :ok, json: {}
  end
  203. private
  # Grants access to a user with the 'admin.monitoring' permission or to a
  # request whose `token` parameter matches the 'monitoring_token' setting.
  #
  # Returns nil when access is granted.
  # Raises Exceptions::NotAuthorized otherwise.
  def token_or_permission_check
    return if authentication_check_only(permission: 'admin.monitoring')

    expected_token = Setting.get('monitoring_token').to_s
    provided_token = params[:token].to_s

    # An unset/empty monitoring token or a request without a token must never
    # authorize: a plain `==` would treat nil == nil (or '' == '') as a match.
    # secure_compare keeps the comparison time independent of where the two
    # strings first differ.
    if !expected_token.empty? && !provided_token.empty? &&
       ActiveSupport::SecurityUtils.secure_compare(provided_token, expected_token)
      return
    end

    raise Exceptions::NotAuthorized
  end
  # Raises Exceptions::NotAuthorized unless an active Permission record named
  # 'admin.monitoring' exists. Only the existence of that record is checked
  # here; nothing in this method looks at the requesting user.
  def access_check
    monitoring_permission = Permission.find_by(name: 'admin.monitoring', active: true)

    raise Exceptions::NotAuthorized if !monitoring_permission
  end
  214. end