scheduler.rb 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429
  1. # Copyright (C) 2012-2022 Zammad Foundation, https://zammad-foundation.org/
  2. class Scheduler < ApplicationModel
  3. include ChecksHtmlSanitized
  4. include HasTimeplan
  5. extend ::Mixin::StartFinishLogger
  6. sanitized_html :note
  7. # rubocop:disable Style/ClassVars
  8. @@jobs_started = {}
  9. # rubocop:enable Style/ClassVars
  10. # start threads
  11. def self.threads
  12. Thread.abort_on_exception = true
  13. # reconnect in case db connection is lost
  14. begin
  15. ActiveRecord::Base.connection.reconnect!
  16. rescue => e
  17. logger.error "Can't reconnect to database #{e.inspect}"
  18. end
  19. # cleanup old background jobs
  20. cleanup
  21. # start worker for background jobs
  22. worker
  23. # start loop to execute scheduler jobs
  24. loop do
  25. logger.info 'Scheduler running...'
  26. # reconnect in case db connection is lost
  27. begin
  28. ActiveRecord::Base.connection.reconnect!
  29. rescue => e
  30. logger.error "Can't reconnect to database #{e.inspect}"
  31. end
  32. # read/load jobs and check if each has already been started
  33. jobs = Scheduler.where(active: true).order(prio: :asc)
  34. jobs.each do |job|
  35. _try_job(job)
  36. sleep 10
  37. end
  38. sleep 60
  39. end
  40. end
  41. # Checks if a Scheduler Job should get started or not.
  42. # The decision is based on if there is a running thread or not.
  43. # Invalid threads get cancelled and new threads can get started.
  44. #
  45. # @param [Scheduler] job The job that should get checked for running threads.
  46. #
  47. # @example
  48. # Scheduler.skip_job(job)
  49. #
  50. # return [Boolean]
  51. def self.skip_job?(job)
  52. thread = @@jobs_started[ job.id ]
  53. return false if thread.blank?
  54. # check for validity of thread instance
  55. if !thread.respond_to?(:status)
  56. logger.error "Invalid thread stored for job '#{job.name}' (#{job.method}): #{thread.inspect}. Deleting and resting job."
  57. @@jobs_started.delete(job.id)
  58. return false
  59. end
  60. # check thread state:
  61. # http://devdocs.io/ruby~2.4/thread#method-i-status
  62. status = thread.status
  63. # non falsly state means it has some literal running state
  64. if status.present?
  65. logger.info "Running job thread for '#{job.name}' (#{job.method}) status is: #{status}"
  66. return true
  67. end
  68. # the following cases should not happen since the
  69. # @@jobs_started cleanup is performed inside of the
  70. # thread itself
  71. # therefore we have to log an error and remove it
  72. # from our threadpool @@jobs_started
  73. how = 'unknownly'
  74. if status.nil?
  75. how = 'via an exception'
  76. elsif status == false
  77. how = 'normally'
  78. end
  79. logger.error "Job thread terminated #{how} found for '#{job.name}' (#{job.method}). This should not happen. Please report."
  80. @@jobs_started.delete(job.id)
  81. false
  82. end
  83. # Checks all delayed jobs that are locked and cleans them up.
  84. # Should only get called when the Scheduler gets started.
  85. #
  86. # @see Scheduler#cleanup_delayed
  87. #
  88. # @param [Boolean] force forces the cleanup if not called in Scheduler starting context.
  89. #
  90. # @example
  91. # Scheduler.cleanup
  92. #
  93. # @raise [RuntimeError] If called without force and not when Scheduler gets started.
  94. #
  95. # return [nil]
  96. def self.cleanup(force: false)
  97. if !force && caller_locations(1..1).first.label != 'threads'
  98. raise 'This method should only get called when Scheduler.threads are initialized. Use `force: true` to start anyway.' # rubocop:disable Zammad/DetectTranslatableString
  99. end
  100. start_time = Time.zone.now
  101. cleanup_delayed_jobs(start_time)
  102. cleanup_import_jobs(start_time)
  103. end
  104. # Checks for locked delayed jobs and tries to reschedule or destroy each of them.
  105. #
  106. # @param [ActiveSupport::TimeWithZone] after the time the cleanup was started
  107. #
  108. # @example
  109. # Scheduler.cleanup_delayed_jobs(TimeZone.now)
  110. #
  111. # return [nil]
  112. def self.cleanup_delayed_jobs(after)
  113. log_start_finish(:info, "Cleanup of left over locked delayed jobs #{after}") do
  114. Delayed::Job.where('updated_at < ?', after).where.not(locked_at: nil).each do |job|
  115. log_start_finish(:info, "Checking left over delayed job #{job.inspect}") do
  116. cleanup_delayed(job)
  117. end
  118. end
  119. end
  120. end
  121. # Checks if the given delayed job can be rescheduled or destroys it. Logs the action as warn.
  122. # Works only for locked delayed jobs. Delayed jobs that are not locked are ignored and
  123. # should get destroyed directly.
  124. # Checks the Delayed::Job instance for a method called .reschedule?. The method is called
  125. # with the Delayed::Job instance as a parameter. The result value is expected to be a Boolean.
  126. # If the result is true the lock gets removed and the delayed job gets rescheduled.
  127. # If the return value is false it will get destroyed which is the default behaviour.
  128. #
  129. # @param [Delayed::Job] job the job that should get checked for destroying/rescheduling.
  130. #
  131. # @example
  132. # Scheduler.cleanup_delayed(job)
  133. #
  134. # return [nil]
  135. def self.cleanup_delayed(job)
  136. return if job.locked_at.blank?
  137. job_name = job.name
  138. payload_object = job.payload_object
  139. reschedule = false
  140. if payload_object.present?
  141. if payload_object.respond_to?(:object)
  142. object = payload_object.object
  143. if object.respond_to?(:id)
  144. job_name += " (id: #{object.id})"
  145. end
  146. if object.respond_to?(:reschedule?) && object.reschedule?(job)
  147. reschedule = true
  148. end
  149. end
  150. if payload_object.respond_to?(:args)
  151. job_name += " - ARGS: #{payload_object.args.inspect}"
  152. end
  153. end
  154. if reschedule
  155. action = 'Rescheduling'
  156. job.unlock
  157. job.save
  158. else
  159. action = 'Destroyed'
  160. job.destroy
  161. end
  162. logger.warn "#{action} locked delayed job: #{job_name}"
  163. end
  164. # Checks for killed import jobs and marks them as finished and adds a note.
  165. #
  166. # @param [ActiveSupport::TimeWithZone] after the time the cleanup was started
  167. #
  168. # @example
  169. # Scheduler.cleanup_import_jobs(TimeZone.now)
  170. #
  171. # return [nil]
  172. def self.cleanup_import_jobs(after)
  173. log_start_finish(:info, "Cleanup of left over import jobs #{after}") do
  174. error = __('Interrupted by scheduler restart. Please restart manually or wait till next execution time.').freeze
  175. # we need to exclude jobs that were updated at or since we started
  176. # cleaning up (via the #reschedule? call) because they might
  177. # were started `.delay`-ed and are flagged for restart
  178. ImportJob.running.where('updated_at < ?', after).each do |job|
  179. job.update!(
  180. finished_at: after,
  181. result: {
  182. error: error
  183. }
  184. )
  185. end
  186. end
  187. end
  188. def self.start_job(job)
  189. # start job and return thread handle
  190. Thread.new do
  191. ApplicationHandleInfo.current = 'scheduler'
  192. logger.debug { "Started job thread for '#{job.name}' (#{job.method})..." }
  193. # start loop for periods equal or under 5 minutes
  194. if job.period && job.period <= 5.minutes
  195. loop_count = 0
  196. loop do
  197. loop_count += 1
  198. _start_job(job)
  199. job = Scheduler.lookup(id: job.id)
  200. # exit is job got deleted
  201. break if !job
  202. # exit if job is not active anymore
  203. break if !job.active
  204. # exit if there is no loop period defined
  205. break if !job.period
  206. # only do a certain amount of loops in this thread
  207. break if loop_count == 1800
  208. # wait until next run
  209. sleep job.period
  210. end
  211. else
  212. _start_job(job)
  213. end
  214. if job.present?
  215. job.pid = ''
  216. job.save
  217. logger.debug { " ...stopped thread for '#{job.method}'" }
  218. # release thread lock and remove thread handle
  219. @@jobs_started.delete(job.id)
  220. else
  221. logger.warn ' ...Job got deleted while running'
  222. end
  223. ActiveRecord::Base.connection.close
  224. end
  225. end
  226. def self._try_job(job)
  227. # ignore job is still running
  228. return if skip_job?(job)
  229. # check job.last_run
  230. return if job.last_run && job.period && job.last_run > (Time.zone.now - job.period)
  231. # timeplan is optional
  232. # but if timeplan is present
  233. return if job.timeplan.present? && !job.in_timeplan?(Time.zone.now)
  234. @@jobs_started[ job.id ] = start_job(job)
  235. end
  236. def self._start_job(job, try_count = 0, try_run_time = Time.zone.now)
  237. started_at = Time.zone.now
  238. job.update!(
  239. last_run: started_at,
  240. pid: Thread.current.object_id,
  241. status: 'ok',
  242. error_message: '',
  243. )
  244. logger.info "execute #{job.method} (try_count #{try_count})..."
  245. eval job.method # rubocop:disable Security/Eval
  246. took = Time.zone.now - started_at
  247. logger.info "ended #{job.method} took: #{took} seconds."
  248. rescue => e
  249. took = Time.zone.now - started_at
  250. logger.error "execute #{job.method} (try_count #{try_count}) exited with error #{e.inspect} in: #{took} seconds."
  251. # reconnect in case db connection is lost
  252. begin
  253. ActiveRecord::Base.connection.reconnect!
  254. rescue => e
  255. logger.error "Can't reconnect to database #{e.inspect}"
  256. end
  257. try_run_max = 10
  258. try_count += 1
  259. # reset error counter if to old
  260. if try_run_time + (60 * 5) < Time.zone.now
  261. try_count = 0
  262. end
  263. try_run_time = Time.zone.now
  264. # restart job again
  265. if try_run_max > try_count
  266. # wait between retries (see https://github.com/zammad/zammad/issues/1950)
  267. sleep(try_count) if Rails.env.production?
  268. _start_job(job, try_count, try_run_time)
  269. else
  270. # release thread lock and remove thread handle
  271. @@jobs_started.delete(job.id)
  272. error = "Failed to run #{job.method} after #{try_count} tries #{e.inspect}"
  273. logger.error error
  274. job.update!(
  275. error_message: error,
  276. status: 'error',
  277. active: false,
  278. )
  279. end
  280. # rescue any other Exceptions that are not StandardError or childs of it
  281. # https://stackoverflow.com/questions/10048173/why-is-it-bad-style-to-rescue-exception-e-in-ruby
  282. # http://rubylearning.com/satishtalim/ruby_exceptions.html
  283. rescue Exception => e # rubocop:disable Lint/RescueException
  284. took = Time.zone.now - started_at
  285. logger.error "execute #{job.method} (try_count #{try_count}) exited with a non standard-error #{e.inspect} in: #{took} seconds."
  286. raise
  287. ensure
  288. ActiveSupport::CurrentAttributes.clear_all
  289. end
  290. def self.worker(foreground = false)
  291. # used for tests
  292. if foreground
  293. original_interface_handle = ApplicationHandleInfo.current
  294. ApplicationHandleInfo.current = 'scheduler'
  295. original_user_id = UserInfo.current_user_id
  296. UserInfo.current_user_id = nil
  297. loop do
  298. success, failure = Delayed::Worker.new.work_off
  299. if failure.nonzero?
  300. raise "#{failure} failed background jobs: #{Delayed::Job.where.not(last_error: nil).inspect}"
  301. end
  302. break if success.zero?
  303. end
  304. UserInfo.current_user_id = original_user_id
  305. ApplicationHandleInfo.current = original_interface_handle
  306. return
  307. end
  308. # used for production
  309. wait = 4
  310. Thread.new do
  311. sleep wait
  312. logger.info "Starting worker thread #{Delayed::Job}"
  313. loop do
  314. ApplicationHandleInfo.current = 'scheduler'
  315. result = nil
  316. realtime = Benchmark.realtime do
  317. logger.debug { "*** worker thread, #{Delayed::Job.all.count} in queue" }
  318. result = Delayed::Worker.new.work_off
  319. end
  320. count = result.sum
  321. if count.zero?
  322. sleep wait
  323. logger.debug { '*** worker thread loop' }
  324. else
  325. format "*** #{count} jobs processed at %<jps>.4f j/s, %<failed>d failed ...\n", jps: count / realtime, failed: result.last
  326. end
  327. end
  328. logger.info ' ...stopped worker thread'
  329. ActiveRecord::Base.connection.close
  330. end
  331. end
  332. # This function returns a list of failed jobs
  333. #
  334. # @example
  335. # Scheduler.failed_jobs
  336. #
  337. # return [Array]
  338. def self.failed_jobs
  339. where(status: 'error', active: false)
  340. end
  341. # This function restarts failed jobs to retry them
  342. #
  343. # @example
  344. # Scheduler.restart_failed_jobs
  345. #
  346. # return [true]
  347. def self.restart_failed_jobs
  348. failed_jobs.each do |job|
  349. job.update!(active: true)
  350. end
  351. true
  352. end
  353. end