scheduler.rb 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403
  1. # Copyright (C) 2012-2016 Zammad Foundation, http://zammad-foundation.org/
  2. class Scheduler < ApplicationModel
  3. extend ::Mixin::StartFinishLogger
  4. # rubocop:disable Style/ClassVars
  5. @@jobs_started = {}
  6. # rubocop:enable Style/ClassVars
  7. # start threads
  8. def self.threads
  9. Thread.abort_on_exception = true
  10. # reconnect in case db connection is lost
  11. begin
  12. ActiveRecord::Base.connection.reconnect!
  13. rescue => e
  14. logger.error "Can't reconnect to database #{e.inspect}"
  15. end
  16. # cleanup old background jobs
  17. cleanup
  18. # start worker for background jobs
  19. worker
  20. # start loop to execute scheduler jobs
  21. loop do
  22. logger.info 'Scheduler running...'
  23. # reconnect in case db connection is lost
  24. begin
  25. ActiveRecord::Base.connection.reconnect!
  26. rescue => e
  27. logger.error "Can't reconnect to database #{e.inspect}"
  28. end
  29. # read/load jobs and check if it is alredy started
  30. jobs = Scheduler.where('active = ?', true).order('prio ASC')
  31. jobs.each do |job|
  32. # ignore job is still running
  33. next if skip_job?(job)
  34. # check job.last_run
  35. next if job.last_run && job.period && job.last_run > (Time.zone.now - job.period)
  36. # run job as own thread
  37. @@jobs_started[ job.id ] = start_job(job)
  38. sleep 10
  39. end
  40. sleep 60
  41. end
  42. end
  43. # Checks if a Scheduler Job should get started or not.
  44. # The decision is based on if there is a running thread or not.
  45. # Invalid threads get canceled and new threads can get started.
  46. #
  47. # @param [Scheduler] job The job that should get checked for running threads.
  48. #
  49. # @example
  50. # Scheduler.skip_job(job)
  51. #
  52. # return [Boolean]
  53. def self.skip_job?(job)
  54. thread = @@jobs_started[ job.id ]
  55. return false if thread.blank?
  56. # check for validity of thread instance
  57. if !thread.respond_to?(:status)
  58. logger.error "Invalid thread stored for job '#{job.name}' (#{job.method}): #{thread.inspect}. Deleting and resting job."
  59. @@jobs_started.delete(job.id)
  60. return false
  61. end
  62. # check thread state:
  63. # http://devdocs.io/ruby~2.4/thread#method-i-status
  64. status = thread.status
  65. # non falsly state means it has some literal running state
  66. if status.present?
  67. logger.info "Running job thread for '#{job.name}' (#{job.method}) status is: #{status}"
  68. return true
  69. end
  70. # the following cases should not happen since the
  71. # @@jobs_started cleanup is performed inside of the
  72. # thread itself
  73. # therefore we have to log an error and remove it
  74. # from our threadpool @@jobs_started
  75. how = 'unknownly'
  76. if status.nil?
  77. how = 'via an exception'
  78. elsif status == false
  79. how = 'normally'
  80. end
  81. logger.error "Job thread terminated #{how} found for '#{job.name}' (#{job.method}). This should not happen. Please report."
  82. @@jobs_started.delete(job.id)
  83. false
  84. end
  85. # Checks all delayed jobs that are locked and cleans them up.
  86. # Should only get called when the Scheduler gets started.
  87. #
  88. # @see Scheduler#cleanup_delayed
  89. #
  90. # @param [Boolean] force forces the cleanup if not called in Scheduler starting context.
  91. #
  92. # @example
  93. # Scheduler.cleanup
  94. #
  95. # @raise [RuntimeError] If called without force and not when Scheduler gets started.
  96. #
  97. # return [nil]
  98. def self.cleanup(force: false)
  99. if !force && caller_locations(1..1).first.label != 'threads'
  100. raise 'This method should only get called when Scheduler.threads are initialized. Use `force: true` to start anyway.'
  101. end
  102. start_time = Time.zone.now
  103. cleanup_delayed_jobs(start_time)
  104. cleanup_import_jobs(start_time)
  105. end
  106. # Checks for locked delayed jobs and tries to reschedule or destroy each of them.
  107. #
  108. # @param [ActiveSupport::TimeWithZone] after the time the cleanup was started
  109. #
  110. # @example
  111. # Scheduler.cleanup_delayed_jobs(TimeZone.now)
  112. #
  113. # return [nil]
  114. def self.cleanup_delayed_jobs(after)
  115. log_start_finish(:info, "Cleanup of left over locked delayed jobs #{after}") do
  116. Delayed::Job.where('updated_at < ?', after).where.not(locked_at: nil).each do |job|
  117. log_start_finish(:info, "Checking left over delayed job #{job.inspect}") do
  118. cleanup_delayed(job)
  119. end
  120. end
  121. end
  122. end
  123. # Checks if the given delayed job can be rescheduled or destroys it. Logs the action as warn.
  124. # Works only for locked delayed jobs. Delayed jobs that are not locked are ignored and
  125. # should get destroyed directly.
  126. # Checks the Delayed::Job instance for a method called .reschedule?. The method is called
  127. # with the Delayed::Job instance as a parameter. The result value is expected to be a Boolean.
  128. # If the result is true the lock gets removed and the delayed job gets rescheduled.
  129. # If the return value is false it will get destroyed which is the default behaviour.
  130. #
  131. # @param [Delayed::Job] job the job that should get checked for destroying/rescheduling.
  132. #
  133. # @example
  134. # Scheduler.cleanup_delayed(job)
  135. #
  136. # return [nil]
  137. def self.cleanup_delayed(job)
  138. return if job.locked_at.blank?
  139. job_name = job.name
  140. payload_object = job.payload_object
  141. reschedule = false
  142. if payload_object.present?
  143. if payload_object.respond_to?(:object)
  144. object = payload_object.object
  145. if object.respond_to?(:id)
  146. job_name += " (id: #{object.id})"
  147. end
  148. if object.respond_to?(:reschedule?) && object.reschedule?(job)
  149. reschedule = true
  150. end
  151. end
  152. if payload_object.respond_to?(:args)
  153. job_name += " - ARGS: #{payload_object.args.inspect}"
  154. end
  155. end
  156. if reschedule
  157. action = 'Rescheduling'
  158. job.unlock
  159. job.save
  160. else
  161. action = 'Destroyed'
  162. job.destroy
  163. end
  164. logger.warn "#{action} locked delayed job: #{job_name}"
  165. end
  166. # Checks for killed import jobs and marks them as finished and adds a note.
  167. #
  168. # @param [ActiveSupport::TimeWithZone] after the time the cleanup was started
  169. #
  170. # @example
  171. # Scheduler.cleanup_import_jobs(TimeZone.now)
  172. #
  173. # return [nil]
  174. def self.cleanup_import_jobs(after)
  175. log_start_finish(:info, "Cleanup of left over import jobs #{after}") do
  176. error = 'Interrupted by scheduler restart. Please restart manually or wait till next execution time.'.freeze
  177. # we need to exclude jobs that were updated at or since we started
  178. # cleaning up (via the #reschedule? call) because they might
  179. # were started `.delay`-ed and are flagged for restart
  180. ImportJob.running.where('updated_at < ?', after).each do |job|
  181. job.update!(
  182. finished_at: after,
  183. result: {
  184. error: error
  185. }
  186. )
  187. end
  188. end
  189. end
  190. def self.start_job(job)
  191. # start job and return thread handle
  192. Thread.new do
  193. ApplicationHandleInfo.current = 'scheduler'
  194. logger.info "Started job thread for '#{job.name}' (#{job.method})..."
  195. # start loop for periods equal or under 5 minutes
  196. if job.period && job.period <= 5.minutes
  197. loop_count = 0
  198. loop do
  199. loop_count += 1
  200. _start_job(job)
  201. job = Scheduler.lookup(id: job.id)
  202. # exit is job got deleted
  203. break if !job
  204. # exit if job is not active anymore
  205. break if !job.active
  206. # exit if there is no loop period defined
  207. break if !job.period
  208. # only do a certain amount of loops in this thread
  209. break if loop_count == 1800
  210. # wait until next run
  211. sleep job.period
  212. end
  213. else
  214. _start_job(job)
  215. end
  216. job.pid = ''
  217. job.save
  218. logger.info " ...stopped thread for '#{job.method}'"
  219. ActiveRecord::Base.connection.close
  220. # release thread lock and remove thread handle
  221. @@jobs_started.delete(job.id)
  222. end
  223. end
  224. def self._start_job(job, try_count = 0, try_run_time = Time.zone.now)
  225. job.update!(
  226. last_run: Time.zone.now,
  227. pid: Thread.current.object_id,
  228. status: 'ok',
  229. error_message: '',
  230. )
  231. logger.info "execute #{job.method} (try_count #{try_count})..."
  232. eval job.method() # rubocop:disable Security/Eval
  233. rescue => e
  234. logger.error "execute #{job.method} (try_count #{try_count}) exited with error #{e.inspect}"
  235. # reconnect in case db connection is lost
  236. begin
  237. ActiveRecord::Base.connection.reconnect!
  238. rescue => e
  239. logger.error "Can't reconnect to database #{e.inspect}"
  240. end
  241. try_run_max = 10
  242. try_count += 1
  243. # reset error counter if to old
  244. if try_run_time + (60 * 5) < Time.zone.now
  245. try_count = 0
  246. end
  247. try_run_time = Time.zone.now
  248. # restart job again
  249. if try_run_max > try_count
  250. # wait between retries (see https://github.com/zammad/zammad/issues/1950)
  251. sleep(try_count) if Rails.env.production?
  252. _start_job(job, try_count, try_run_time)
  253. else
  254. # release thread lock and remove thread handle
  255. @@jobs_started.delete(job.id)
  256. error = "Failed to run #{job.method} after #{try_count} tries #{e.inspect}"
  257. logger.error error
  258. job.update!(
  259. error_message: error,
  260. status: 'error',
  261. active: false,
  262. )
  263. end
  264. # rescue any other Exceptions that are not StandardError or childs of it
  265. # https://stackoverflow.com/questions/10048173/why-is-it-bad-style-to-rescue-exception-e-in-ruby
  266. # http://rubylearning.com/satishtalim/ruby_exceptions.html
  267. rescue Exception => e # rubocop:disable Lint/RescueException
  268. logger.error "execute #{job.method} (try_count #{try_count}) exited with a non standard-error #{e.inspect}"
  269. raise
  270. end
  271. def self.worker(foreground = false)
  272. # used for tests
  273. if foreground
  274. original_interface_handle = ApplicationHandleInfo.current
  275. ApplicationHandleInfo.current = 'scheduler'
  276. original_user_id = UserInfo.current_user_id
  277. UserInfo.current_user_id = nil
  278. loop do
  279. success, failure = Delayed::Worker.new.work_off
  280. if failure.nonzero?
  281. raise "ERROR: #{failure} failed background jobs: #{Delayed::Job.where('last_error IS NOT NULL').inspect}"
  282. end
  283. break if success.zero?
  284. end
  285. UserInfo.current_user_id = original_user_id
  286. ApplicationHandleInfo.current = original_interface_handle
  287. return
  288. end
  289. # used for production
  290. wait = 4
  291. Thread.new do
  292. sleep wait
  293. logger.info "Starting worker thread #{Delayed::Job}"
  294. loop do
  295. ApplicationHandleInfo.current = 'scheduler'
  296. result = nil
  297. realtime = Benchmark.realtime do
  298. logger.debug { "*** worker thread, #{Delayed::Job.all.count} in queue" }
  299. result = Delayed::Worker.new.work_off
  300. end
  301. count = result.sum
  302. if count.zero?
  303. sleep wait
  304. logger.debug { '*** worker thread loop' }
  305. else
  306. format "*** #{count} jobs processed at %<jps>.4f j/s, %<failed>d failed ...\n", jps: count / realtime, failed: result.last
  307. end
  308. end
  309. logger.info ' ...stopped worker thread'
  310. ActiveRecord::Base.connection.close
  311. end
  312. end
  313. # This function returns a list of failed jobs
  314. #
  315. # @example
  316. # Scheduler.failed_jobs
  317. #
  318. # return [Array]
  319. def self.failed_jobs
  320. where(status: 'error', active: false)
  321. end
  322. # This function restarts failed jobs to retry them
  323. #
  324. # @example
  325. # Scheduler.restart_failed_jobs
  326. #
  327. # return [true]
  328. def self.restart_failed_jobs
  329. failed_jobs.each do |job|
  330. job.update!(active: true)
  331. end
  332. true
  333. end
  334. end