diff options
Diffstat (limited to 'src/com/github')
| -rw-r--r-- | src/com/github/ivarref/yoltq.clj | 539 | ||||
| -rw-r--r-- | src/com/github/ivarref/yoltq/error_poller.clj | 124 | ||||
| -rw-r--r-- | src/com/github/ivarref/yoltq/ext_sys.clj | 27 | ||||
| -rw-r--r-- | src/com/github/ivarref/yoltq/impl.clj | 248 | ||||
| -rw-r--r-- | src/com/github/ivarref/yoltq/migrate.clj | 61 | ||||
| -rw-r--r-- | src/com/github/ivarref/yoltq/poller.clj | 64 | ||||
| -rw-r--r-- | src/com/github/ivarref/yoltq/report_queue.clj | 459 | ||||
| -rw-r--r-- | src/com/github/ivarref/yoltq/slow_executor_detector.clj | 36 | ||||
| -rw-r--r-- | src/com/github/ivarref/yoltq/test_queue.clj | 191 | ||||
| -rw-r--r-- | src/com/github/ivarref/yoltq/utils.clj | 179 |
10 files changed, 1928 insertions, 0 deletions
diff --git a/src/com/github/ivarref/yoltq.clj b/src/com/github/ivarref/yoltq.clj new file mode 100644 index 0000000..8c8ca7a --- /dev/null +++ b/src/com/github/ivarref/yoltq.clj @@ -0,0 +1,539 @@ +(ns com.github.ivarref.yoltq + (:require + [clojure.tools.logging :as log] + [com.github.ivarref.yoltq.error-poller :as errpoller] + [com.github.ivarref.yoltq.impl :as i] + [com.github.ivarref.yoltq.migrate :as migrate] + [com.github.ivarref.yoltq.poller :as poller] + [com.github.ivarref.yoltq.report-queue :as rq] + [com.github.ivarref.yoltq.slow-executor-detector :as slow-executor] + [com.github.ivarref.yoltq.utils :as u] + [datomic.api :as d]) + (:import (datomic Connection) + (java.lang.management ManagementFactory) + (java.time Duration Instant ZoneOffset ZonedDateTime) + (java.util.concurrent BlockingQueue ExecutorService Executors ScheduledExecutorService TimeUnit))) + +(defonce ^:dynamic *config* (atom nil)) +(defonce threadpool (atom nil)) +(defonce ^:dynamic *running?* (atom false)) +(defonce ^:dynamic *test-mode* false) + +(def default-opts + (-> {; Default number of times a queue job will be retried before giving up + ; Can be overridden on a per-consumer basis with + ; (yq/add-consumer! :q (fn [payload] ...) {:max-retries 200}) + ; If you want no limit on the number of retries, specify + ; the value `0`. That will set the effective retry limit to + ; 9223372036854775807 times. + :max-retries 9223372036854775807 + + ; Minimum amount of time to wait before a failed queue job is retried + :error-backoff-time (Duration/ofSeconds 5) + + ; Max time a queue job can execute before an error is logged + :max-execute-time (Duration/ofMinutes 5) + + ; Amount of time an in progress queue job can run before it is considered failed + ; and will be marked as such. + :hung-backoff-time (Duration/ofMinutes 30) + + ; Most queue jobs in init state will be consumed by the tx-report-queue listener. + ; However, in the case where an init job was added right before the application + ; was shut down and did not have time to be processed by the tx-report-queue listener, + ; it will be consumer by the init poller. This init poller backs off by + ; :init-backoff-time to avoid unnecessary compare-and-swap lock failures that could + ; otherwise occur if competing with the tx-report-queue listener. + :init-backoff-time (Duration/ofSeconds 60) + + ; If you are dealing with a flaky downstream service, you may not want + ; yoltq to mark itself as unhealthy on the first failure encounter with + ; the downstream service. Change this setting to let yoltq mark itself + ; as healthy even though a queue item has been failing for some time. + :healthy-allowed-error-time (Duration/ofMinutes 15) + + ; How frequent polling for init, error and hung jobs should be done. + :poll-delay (Duration/ofSeconds 10) + + ; Specifies the number of threads available for executing queue and polling jobs. + ; The final thread pool will be this size + 2. + ; + ; One thread is permanently allocated for listening to the + ; tx-report-queue. + ; + ; Another thread is permanently allocated for checking :max-execute-time. + ; This means that if all executing queue jobs are stuck and the thread pool is unavailable + ; as such, at least an error will be logged about this. The log entry will + ; contain the stacktrace of the stuck threads. + :pool-size 4 + + :capture-bindings (if-let [s (resolve (symbol "taoensso.timbre/*context*"))] + [s] + []) + + ; How often should the system be polled for failed queue jobs + :system-error-poll-delay (Duration/ofMinutes 1) + + ; How often should the system invoke + :system-error-callback-backoff (Duration/ofHours 1) + + ; Should old, possibly stalled jobs be automatically be migrated + ; as part of `start!`? + :auto-migrate? true} + + u/duration->millis)) + + +(defn init! [{:keys [conn tx-report-queue] :as cfg}] + (assert (instance? Connection conn) (str "Expected :conn to be of type datomic Connection. Was: " (or (some-> conn class str) "nil"))) + (when (some? tx-report-queue) + (assert (instance? BlockingQueue tx-report-queue) + (str "Expected :tx-report-queue to be of type java.util.concurrent.BlockingQueue"))) + (locking threadpool + @(d/transact conn i/schema) + (let [new-cfg (swap! *config* + (fn [old-conf] + (-> (merge-with (fn [_ b] b) + {:running-queues (atom #{}) + :start-execute-time (atom {}) + :system-error (atom {}) + :healthy? (atom nil) + :slow? (atom nil) + :slow-thread-watcher-done? (promise)} + default-opts + (if *test-mode* old-conf (select-keys old-conf [:handlers])) + cfg) + u/duration->millis)))] + new-cfg))) + + +(defn get-queue-id + [queue-id-or-var] + (cond (and (var? queue-id-or-var) + (keyword? (:yoltq/queue-id (meta queue-id-or-var)))) + (:yoltq/queue-id (meta queue-id-or-var)) + + (keyword? queue-id-or-var) + queue-id-or-var + + :else + (throw (ex-info (str "Could not get queue-id for " queue-id-or-var) {:queue-id queue-id-or-var})))) + +(defn add-consumer! + ([queue-id f] + (add-consumer! queue-id f {})) + ([queue-id f opts] + (swap! *config* (fn [old-config] (assoc-in old-config [:handlers (get-queue-id queue-id)] (merge opts {:f f})))))) + + +(defn put + ([queue-id payload] (put queue-id payload {})) + ([queue-id payload opts] + (let [{:keys [bootstrap-poller! conn] :as cfg} @*config*] + (when (and *test-mode* bootstrap-poller!) + (bootstrap-poller! conn)) + (i/put cfg (get-queue-id queue-id) payload opts)))) + + +(defn- do-start! [] + (let [{:keys [poll-delay pool-size system-error-poll-delay auto-migrate? slow-thread-watcher-done?] :as cfg} @*config*] + (when auto-migrate? + (future (migrate/migrate! cfg))) + (let [pool (reset! threadpool (Executors/newScheduledThreadPool (+ 1 pool-size))) + queue-listener-ready (promise)] + (reset! *running?* true) + (.scheduleAtFixedRate ^ScheduledExecutorService pool (fn [] (poller/poll-all-queues! *running?* *config* pool)) 0 poll-delay TimeUnit/MILLISECONDS) + (.scheduleAtFixedRate ^ScheduledExecutorService pool (fn [] (errpoller/poll-errors *running?* *config*)) 0 system-error-poll-delay TimeUnit/MILLISECONDS) + (.execute ^ScheduledExecutorService pool + (fn [] + (try + (log/debug "report-queue-listener starting") + (rq/report-queue-listener *running?* queue-listener-ready pool *config*) + (finally + (log/debug "report-queue-listener exiting") + (deliver queue-listener-ready :finally))))) + (future (try + (slow-executor/show-slow-threads pool *config*) + (finally + (deliver slow-thread-watcher-done? :done)))) + (let [q-listener-retval (deref queue-listener-ready 30000 :timeout)] + (cond (= :timeout q-listener-retval) + (do + (log/error "Timed out waiting for report-queue-listener to start") + (throw (IllegalStateException. "Timed out waiting for report-queue-listener to start"))) + + (= :finally q-listener-retval) + (do + (log/error "report-queue-listener did not start") + (throw (IllegalStateException. "report-queue-listener did not start"))) + + (= :ready q-listener-retval) + (do + (log/debug "report-queue-listener is ready")) + + :else + (do + (log/error (str "Unexpected queue-listener-retval: " (pr-str q-listener-retval))) + (throw (IllegalStateException. (str "Unexpected queue-listener-retval: " (pr-str q-listener-retval)))))))))) + + +(defn start! [] + (locking threadpool + (cond (true? *test-mode*) + (log/info "test mode enabled, doing nothing for start!") + + (true? @*running?*) + nil + + (false? @*running?*) + (do-start!)))) + + +(defn stop! [] + (locking threadpool + (cond (true? *test-mode*) + (log/info "test mode enabled, doing nothing for stop!") + + (false? @*running?*) + nil + + (true? @*running?*) + (do + (reset! *running?* false) + (when-let [^ExecutorService tp @threadpool] + (log/debug "shutting down threadpool") + (.shutdown tp) + (while (not (.awaitTermination tp 1 TimeUnit/SECONDS)) + (log/trace "waiting for threadpool to stop")) + (log/debug "stopped!") + (reset! threadpool nil)) + (when-let [wait-slow-threads (some->> *config* deref :slow-thread-watcher-done?)] + (log/debug "waiting for slow-thread-watcher to stop ...") + @wait-slow-threads + (log/debug "waiting for slow-thread-watcher to stop ... OK")))))) + + +(defn healthy? [] + (cond + (< (.toMinutes (Duration/ofMillis (.getUptime (ManagementFactory/getRuntimeMXBean)))) 10) + true + + (false? (some->> @*config* + :healthy? + (deref))) + false + + (true? (some->> @*config* + :slow? + (deref))) + false + + :else + true)) + +(defn unhealthy? + "Returns `true` if there are queues in error or a thread is slow, and the application has been up for over 10 minutes, otherwise `false`." + [] + (false? (healthy?))) + +(defn queue-stats [] + (let [{:keys [conn]} @*config* + db (d/db conn)] + (->> (d/q '[:find ?e ?qname ?status + :in $ + :where + [?e :com.github.ivarref.yoltq/queue-name ?qname] + [?e :com.github.ivarref.yoltq/status ?status]] + db) + (mapv (partial zipmap [:e :qname :status])) + (mapv #(select-keys % [:qname :status])) + (mapv (fn [qitem] {qitem 1})) + (reduce (partial merge-with +) {}) + (mapv (fn [[{:keys [qname status]} v]] + (array-map + :qname qname + :status status + :count v))) + (sort-by (juxt :qname :status)) + (vec)))) + +(defn job-group-progress [queue-name job-group] + (let [{:keys [conn]} @*config* + db (d/db conn)] + (->> (d/q '[:find ?e ?qname ?job-group ?status + :keys :e :qname :job-group :status + :in $ ?qname ?job-group + :where + [?e :com.github.ivarref.yoltq/queue-name ?qname] + [?e :com.github.ivarref.yoltq/job-group ?job-group] + [?e :com.github.ivarref.yoltq/status ?status]] + db queue-name job-group) + (mapv #(select-keys % [:qname :job-group :status])) + (mapv (fn [qitem] {qitem 1})) + (reduce (partial merge-with +) {}) + (mapv (fn [[{:keys [qname job-group status]} v]] + (array-map + :qname qname + :job-group job-group + :status status + :count v))) + (sort-by (juxt :qname :job-group :status)) + (vec)))) + +(defn get-errors [qname] + (let [{:keys [conn]} @*config* + db (d/db conn)] + (->> (d/q '[:find [?id ...] + :in $ ?qname ?status + :where + [?e :com.github.ivarref.yoltq/queue-name ?qname] + [?e :com.github.ivarref.yoltq/status ?status] + [?e :com.github.ivarref.yoltq/id ?id]] + db + qname + :error) + (mapv (partial u/get-queue-item db))))) + +(defn retry-one-error! [qname] + (let [{:keys [handlers] :as cfg} @*config* + _ (assert (contains? handlers qname) "Queue not found") + cfg (assoc-in cfg [:handlers qname :max-retries] Long/MAX_VALUE)] + (poller/poll-once! cfg qname :error))) + +(defn retry-stats + "Gather retry statistics. + + Optional keyword arguments: + * :age-days — last number of days to look at data from. Defaults to 30. + * :queue-name — only gather statistics for this queue name. Defaults to nil, meaning all queues. + + Example return value: + {:queue-a {:ok 100, :retries 2, :retry-percentage 2.0} + :queue-b {:ok 100, :retries 75, :retry-percentage 75.0}} + + From the example value above, we can see that :queue-b fails at a much higher rate than :queue-a. + Assuming that the queue consumers are correctly implemented, this means that the service representing :queue-b + is much more unstable than the one representing :queue-a. This again implies + that you will probably want to fix the downstream service of :queue-b, if that is possible. + " + [{:keys [age-days queue-name now db] + :or {age-days 30 + now (ZonedDateTime/now ZoneOffset/UTC)}}] + (let [{:keys [conn]} @*config* + db (or db (d/db conn))] + (->> (d/query {:query {:find '[?qname ?status ?tries ?init-time] + :in (into '[$] (when queue-name '[?qname])) + :where '[[?e :com.github.ivarref.yoltq/queue-name ?qname] + [?e :com.github.ivarref.yoltq/status ?status] + [?e :com.github.ivarref.yoltq/tries ?tries] + [?e :com.github.ivarref.yoltq/init-time ?init-time]]} + :args (remove nil? [db queue-name])}) + (mapv (partial zipmap [:qname :status :tries :init-time])) + (mapv #(update % :init-time (fn [init-time] (.atZone (Instant/ofEpochMilli init-time) ZoneOffset/UTC)))) + (mapv #(assoc % :age-days (.toDays (Duration/between (:init-time %) now)))) + (filter #(<= (:age-days %) age-days)) + (group-by :qname) + (mapv (fn [[q values]] + {q (let [{:keys [ok retries] :as m} (->> values + (mapv (fn [{:keys [tries status]}] + (condp = status + u/status-init {} + u/status-processing {:processing 1 :retries (dec tries)} + u/status-done {:ok 1 :retries (dec tries)} + u/status-error {:error 1 :retries (dec tries)}))) + (reduce (partial merge-with +) {}))] + (into (sorted-map) (merge m + (when (pos-int? ok) + {:retry-percentage (double (* 100 (/ retries ok)))}))))})) + (into (sorted-map))))) + +(defn- percentile [n values] + (let [idx (int (Math/floor (* (count values) (/ n 100))))] + (nth values idx))) + +(defn processing-time-stats + "Gather processing time statistics. Default unit is seconds. + + Optional keyword arguments: + * :age-days — last number of days to look at data from. Defaults to 30. + Use nil to have no limit. + + * :queue-name — only gather statistics for this queue name. Defaults to nil, meaning all queues. + + * :duration->long - Specify what unit should be used for values. + Must take a java.time.Duration as input and return a long. + + Defaults to (fn [duration] (.toSeconds duration). + I.e. the default unit is seconds. + + Example return value: + {:queue-a {:avg 1 + :max 10 + :min 0 + :p50 ... + :p90 ... + :p95 ... + :p99 ...}}" + [{:keys [age-days queue-name now db duration->long] + :or {age-days 30 + now (ZonedDateTime/now ZoneOffset/UTC) + duration->long (fn [duration] (.toSeconds ^Duration duration))}}] + (let [{:keys [conn]} @*config* + db (or db (d/db conn)) + ->zdt #(.atZone (Instant/ofEpochMilli %) ZoneOffset/UTC)] + (->> (d/query {:query {:find '[?qname ?status ?init-time ?done-time] + :in (into '[$ ?status] (when queue-name '[?qname])) + :where '[[?e :com.github.ivarref.yoltq/queue-name ?qname] + [?e :com.github.ivarref.yoltq/status ?status] + [?e :com.github.ivarref.yoltq/init-time ?init-time] + [?e :com.github.ivarref.yoltq/done-time ?done-time]]} + :args (vec (remove nil? [db u/status-done queue-name]))}) + (mapv (partial zipmap [:qname :status :init-time :done-time])) + (mapv #(update % :init-time ->zdt)) + (mapv #(update % :done-time ->zdt)) + (mapv #(assoc % :age-days (.toDays (Duration/between (:init-time %) now)))) + (mapv #(assoc % :spent-time (duration->long (Duration/between (:init-time %) (:done-time %))))) + (filter #(or (nil? age-days) (<= (:age-days %) age-days))) + (group-by :qname) + (mapv (fn [[q values]] + (let [values (vec (sort (mapv :spent-time values)))] + {q (sorted-map + :max (apply max values) + :avg (int (Math/floor (/ (reduce + 0 values) (count values)))) + :p50 (percentile 50 values) + :p90 (percentile 90 values) + :p95 (percentile 95 values) + :p99 (percentile 99 values) + :min (apply min values))}))) + (into (sorted-map))))) + +(defn get-tx-report-queue-multicast! + "Multicast the datomic.api/tx-report-queue to different consumers. + A multicaster thread is started on demand per connection. `conn` and `id` identifies the consumer. + Repeated calls using the same `conn` and `id` returns the same queue. + + The optional third parameter, `send-end-token?`, if set to `true`, instructs the multicaster thread + to send `:end` if the queue is stopped. + The default value for `send-end-token?` is `false`. + + A queue may be stopped using `stop-multicaster-id!`, `stop-multicaster!` or `stop-all-multicasters!`. + + Returns a `java.util.concurrent.BlockingQueue` like `datomic.api/tx-report-queue`." + ([conn id] + (get-tx-report-queue-multicast! conn id false)) + ([conn id send-end-token?] + (assert (instance? Connection conn)) + (assert (boolean? send-end-token?)) + (rq/get-tx-report-queue-multicast! conn id send-end-token?))) + +(defn stop-multicast-consumer-id! + "Stop forwarding reports from datomic.api/tx-report-queue to the queue identified by `conn` and `id`. + If this is the last report destination for the given `conn`, the multicaster thread will exit. + Repeated calls are no-op. + + The multicaster thread will send `:end` if `send-end-token?` was `true` when `get-tx-report-queue-multicast!` + was called. + + Returns `true` if the queue was stopped. + Return `false` if the queue does not exist." + [conn id] + (assert (instance? Connection conn)) + (rq/stop-multicast-consumer-id! conn id)) + +(defn stop-multicaster! + "Stop forwarding reports from datomic.api/tx-report-queue to any queues belonging to `conn`. + The multicaster thread will exit. + Repeated calls are no-op. + + The multicaster thread will send `:end` if `send-end-token?` was `true` when `get-tx-report-queue-multicast!` + was called. + + Returns `true` if any queue belonging to `conn` was stopped. + Returns `false` is `conn` did not have any associated queues." + [conn] + (assert (instance? Connection conn)) + (rq/stop-multicaster! conn)) + +(defn stop-all-multicasters! + "Stop forwarding all reports from datomic.api/tx-report-queue for any `conn`. + All multicaster threads will exit. + Repeated calls are no-op. + + The multicaster thread will send `:end` if `send-end-token?` was `true` when `get-tx-report-queue-multicast!` + was called. + + Returns `true` if any queue was stopped. + Returns `false` if no queues existed." + [] + (rq/stop-all-multicasters!)) + +(comment + (do + (require 'com.github.ivarref.yoltq.log-init) + (com.github.ivarref.yoltq.log-init/init-logging! + [[#{"datomic.*" "com.datomic.*" "org.apache.*"} :warn] + [#{"ivarref.yoltq.report-queue"} :info] + [#{"ivarref.yoltq.poller"} :info] + [#{"com.github.ivarref.yoltq"} :debug] + ;[#{"ivarref.yoltq*"} :info] + [#{"*"} :info]]) + (stop!) + (future (let [received (atom []) + uri (str "datomic:mem://demo")] + (d/delete-database uri) + (d/create-database uri) + (let [conn (d/connect uri) + started-consuming? (promise) + n 1] + (init! {:conn conn + :error-backoff-time (Duration/ofSeconds 1) + :poll-delay (Duration/ofSeconds 1) + :max-execute-time (Duration/ofSeconds 3) + :slow-thread-show-stacktrace? false}) + (add-consumer! :q (fn [_] + (deliver started-consuming? true) + (log/info "sleeping...") + (Thread/sleep (.toMillis (Duration/ofSeconds 60))) + (log/info "done sleeping"))) + (start!) + @(d/transact conn [(put :q {:work 123})]) + @started-consuming? + (stop!) + nil))))) + +(comment + (do + (require 'com.github.ivarref.yoltq.log-init) + (com.github.ivarref.yoltq.log-init/init-logging! + [[#{"datomic.*" "com.datomic.*" "org.apache.*"} :warn] + [#{"com.github.ivarref.yoltq.report-queue"} :debug] + [#{"com.github.ivarref.yoltq.poller"} :info] + [#{"com.github.ivarref.yoltq.migrate"} :warn] + [#{"com.github.ivarref.yoltq"} :debug] + ;[#{"ivarref.yoltq*"} :info] + [#{"*"} :info]]) + (stop!) + (let [received (atom []) + uri (str "datomic:mem://demo")] + (d/delete-database uri) + (d/create-database uri) + (let [conn (d/connect uri) + started-consuming? (promise) + n 1] + (init! {:conn conn + :tx-report-queue (get-tx-report-queue-multicast! conn :yoltq true) + :slow-thread-show-stacktrace? false}) + (add-consumer! :q (fn [_] + (deliver started-consuming? true))) + (log/info "begin start! ...") + (start!) + (log/info "begin start! ... Done") + (Thread/sleep 2000) + (log/info "*******************************************") + @(d/transact conn [(put :q {:work 123})]) + @started-consuming? + (stop-multicaster! conn) + (log/info "*******************************************") + (stop!) + (log/info "stop! done") + nil))))
\ No newline at end of file diff --git a/src/com/github/ivarref/yoltq/error_poller.clj b/src/com/github/ivarref/yoltq/error_poller.clj new file mode 100644 index 0000000..dffff28 --- /dev/null +++ b/src/com/github/ivarref/yoltq/error_poller.clj @@ -0,0 +1,124 @@ +(ns com.github.ivarref.yoltq.error-poller + (:require [clojure.tools.logging :as log] + [com.github.ivarref.yoltq.ext-sys :as ext] + [com.github.ivarref.yoltq.utils :as u] + [datomic.api :as d])) + + +(defn get-state [v] + (case v + [:error :none] :recovery + [:error :some] :error + [:error :all] :error + [:recovery :none] :recovery + [:recovery :some] :recovery + [:recovery :all] :error + nil)) + + +(defn handle-error-count [{:keys [errors last-notify state] + :or {errors [] + last-notify 0 + state :recovery}} + {:keys [system-error-min-count system-error-callback-backoff] + :or {system-error-min-count 3}} + now-ms + error-count] + (let [new-errors (->> (conj errors error-count) + (take-last system-error-min-count) + (vec)) + classify (fn [coll] + (cond + (not= system-error-min-count (count coll)) + :missing + + (every? pos-int? coll) + :all + + (every? zero? coll) + :none + + :else + :some)) + old-state state] + (merge + {:errors new-errors + :last-notify last-notify} + (when-let [new-state (get-state [old-state (classify new-errors)])] + (merge + {:state new-state} + (when (and (= old-state :recovery) + (= new-state :error)) + {:run-callback :error + :last-notify now-ms}) + + (when (and (= new-state :error) + (= old-state :error) + (> now-ms + (+ last-notify system-error-callback-backoff))) + {:run-callback :error + :last-notify now-ms}) + + (when (and (= new-state :recovery) + (= old-state :error)) + {:run-callback :recovery})))))) + + +(defn do-poll-errors [{:keys [conn + system-error + on-system-error + on-system-recovery + healthy? + healthy-allowed-error-time] + :or {on-system-error (fn [] + (log/error "There are yoltq queues which have errors") + nil) + on-system-recovery (fn [] + (log/info "Yoltq recovered"))} + :as config} + now-ms] + (assert (some? conn) "expected :conn to be present") + (assert (some? system-error) "expected :system-error to be present") + (assert (nat-int? healthy-allowed-error-time) "expected :healthy-allowed-error-time to be present") + (let [max-init-time (- now-ms healthy-allowed-error-time) + error-count (or (d/q '[:find (count ?e) . + :in $ ?status ?max-init-time + :where + [?e :com.github.ivarref.yoltq/status ?status] + [?e :com.github.ivarref.yoltq/init-time ?init-time] + [(<= ?init-time ?max-init-time)]] + (d/db conn) + u/status-error + max-init-time) + 0)] + (if (pos-int? error-count) + (do + (log/debug "poll-errors found" error-count "errors in system") + (reset! healthy? false)) + (reset! healthy? true)) + (let [{:keys [run-callback] :as new-state} (swap! system-error handle-error-count config now-ms error-count)] + (when run-callback + (cond (= run-callback :error) + (on-system-error) + + (= run-callback :recovery) + (on-system-recovery) + + :else + (log/error "unhandled callback-type" run-callback)) + (log/debug "run-callback is" run-callback)) + error-count))) + + +(defn poll-errors [running? config-atom] + (try + (when @running? + (do-poll-errors @config-atom (ext/now-ms))) + (catch Throwable t + (log/error t "unexpected error in poll-errors:" (ex-message t)) + nil))) + + +(comment + (do-poll-errors @com.github.ivarref.yoltq/*config* (ext/now-ms))) + diff --git a/src/com/github/ivarref/yoltq/ext_sys.clj b/src/com/github/ivarref/yoltq/ext_sys.clj new file mode 100644 index 0000000..692b934 --- /dev/null +++ b/src/com/github/ivarref/yoltq/ext_sys.clj @@ -0,0 +1,27 @@ +(ns com.github.ivarref.yoltq.ext-sys + (:require [datomic.api :as d]) + (:refer-clojure :exclude [random-uuid]) + (:import (java.util UUID))) + + +(def ^:dynamic *now-ms-atom* nil) +(def ^:dynamic *squuid-atom* nil) +(def ^:dynamic *random-atom* nil) + + +(defn now-ms [] + (if *now-ms-atom* + @*now-ms-atom* + (System/currentTimeMillis))) + + +(defn squuid [] + (if *squuid-atom* + (UUID/fromString (str "00000000-0000-0000-0000-" (format "%012d" (swap! *squuid-atom* inc)))) + (d/squuid))) + + +(defn random-uuid [] + (if *random-atom* + (UUID/fromString (str "00000000-0000-0000-0000-" (format "%012d" (swap! *random-atom* inc)))) + (UUID/randomUUID))) diff --git a/src/com/github/ivarref/yoltq/impl.clj b/src/com/github/ivarref/yoltq/impl.clj new file mode 100644 index 0000000..ffb1ad8 --- /dev/null +++ b/src/com/github/ivarref/yoltq/impl.clj @@ -0,0 +1,248 @@ +(ns com.github.ivarref.yoltq.impl + (:require [clojure.edn :as edn] + [clojure.string :as str] + [clojure.tools.logging :as log] + [com.github.ivarref.double-trouble :as dt] + [com.github.ivarref.yoltq.ext-sys :as ext] + [com.github.ivarref.yoltq.utils :as u] + [datomic.api :as d]) + (:import (java.time Year))) + +(def schema + [#:db{:ident :com.github.ivarref.yoltq/id, :cardinality :db.cardinality/one, :valueType :db.type/uuid, :unique :db.unique/identity} + #:db{:ident :com.github.ivarref.yoltq/ext-id, :cardinality :db.cardinality/one, :valueType :db.type/string, :unique :db.unique/value} + #:db{:ident :com.github.ivarref.yoltq/queue-name, :cardinality :db.cardinality/one, :valueType :db.type/keyword, :index true} + #:db{:ident :com.github.ivarref.yoltq/job-group, :cardinality :db.cardinality/one, :valueType :db.type/keyword, :index true} + #:db{:ident :com.github.ivarref.yoltq/status, :cardinality :db.cardinality/one, :valueType :db.type/keyword, :index true} + #:db{:ident :com.github.ivarref.yoltq/payload, :cardinality :db.cardinality/one, :valueType :db.type/string} + #:db{:ident :com.github.ivarref.yoltq/payload-bytes, :cardinality :db.cardinality/one, :valueType :db.type/bytes} + #:db{:ident :com.github.ivarref.yoltq/opts, :cardinality :db.cardinality/one, :valueType :db.type/string} + #:db{:ident :com.github.ivarref.yoltq/bindings, :cardinality :db.cardinality/one, :valueType :db.type/string} + #:db{:ident :com.github.ivarref.yoltq/tries, :cardinality :db.cardinality/one, :valueType :db.type/long, :noHistory true} + #:db{:ident :com.github.ivarref.yoltq/lock, :cardinality :db.cardinality/one, :valueType :db.type/uuid, :noHistory true} + #:db{:ident :com.github.ivarref.yoltq/init-time, :cardinality :db.cardinality/one, :valueType :db.type/long} + #:db{:ident :com.github.ivarref.yoltq/processing-time, :cardinality :db.cardinality/one, :valueType :db.type/long} + #:db{:ident :com.github.ivarref.yoltq/done-time, :cardinality :db.cardinality/one, :valueType :db.type/long} + #:db{:ident :com.github.ivarref.yoltq/error-time, :cardinality :db.cardinality/one, :valueType :db.type/long} + #:db{:ident :com.github.ivarref.yoltq/version, :cardinality :db.cardinality/one, :valueType :db.type/string, :index true}]) + +(defn pr-str-inner [x] + (binding [*print-dup* false + *print-meta* false + *print-readably* true + *print-length* nil + *print-level* nil + *print-namespace-maps* false] + (pr-str x))) + +(defn pr-str-safe [what x] + (try + (if (= x (edn/read-string (pr-str-inner x))) + (pr-str-inner x) + (throw (ex-info (str "Could not read-string " what) {:input x}))) + (catch Exception e + (log/error "could not read-string" what ":" (ex-message e)) + (throw e)))) + +(defn default-partition-fn [_queue-keyword] + (keyword "yoltq" (str "queue_" (.getValue (Year/now))))) + +(defn put [{:keys [capture-bindings conn encode partition-fn] + :or {partition-fn default-partition-fn + encode (partial pr-str-safe :payload)} + :as config} + queue-name + payload + opts] + (if-let [q-config (get-in config [:handlers queue-name])] + (let [id (u/squuid) + encode (get q-config :encode encode) + partition-fn (get q-config :partition-fn partition-fn) + partition (partition-fn queue-name) + _ (assert (keyword? partition) "Partition must be a keyword") + depends-on (get q-config :depends-on (fn [_] nil)) + valid-payload? (get q-config :valid-payload? (fn [_] true)) + opts (merge + (when-let [deps (depends-on payload)] + {:depends-on deps}) + (or opts {})) + str-bindings (->> (reduce (fn [o k] + (assoc o (symbol k) (deref k))) + {} + (or capture-bindings [])) + (pr-str-safe :capture-bindings)) + _ (when-not (valid-payload? payload) + (log/error "Payload was not valid. Payload was:" payload) + (throw (ex-info (str "Payload was not valid: " payload) {:payload payload}))) + encoded (encode payload) + _ (when (not (or (bytes? encoded) (string? encoded))) + (log/error "Payload must be encoded to either a string or a byte array") + (throw (ex-info (str "Payload must be encoded to a string or a byte array. Payload: " payload) {:payload payload})))] + (log/debug "queue item" (str id) "for queue" queue-name "is pending status" u/status-init) + (do + (dt/ensure-partition! conn partition) + (merge + (if (bytes? encoded) + {:com.github.ivarref.yoltq/payload-bytes encoded} + {:com.github.ivarref.yoltq/payload encoded}) + {:db/id (d/tempid partition) + :com.github.ivarref.yoltq/id id + :com.github.ivarref.yoltq/queue-name queue-name + :com.github.ivarref.yoltq/status u/status-init + :com.github.ivarref.yoltq/bindings str-bindings + :com.github.ivarref.yoltq/opts (pr-str-safe :opts opts) + :com.github.ivarref.yoltq/lock (u/random-uuid) + :com.github.ivarref.yoltq/tries 0 + :com.github.ivarref.yoltq/init-time (u/now-ms) + :com.github.ivarref.yoltq/version "2"} + (when-let [[q ext-id] (:depends-on opts)] + (when-not (d/q '[:find ?e . + :in $ ?ext-id + :where + [?e :com.github.ivarref.yoltq/ext-id ?ext-id]] + (d/db conn) + (pr-str-safe :depends-on [q ext-id])) + (throw (ex-info (str ":depends-on not found in database. Queue: " q ", id: " ext-id) opts)))) + (when-let [ext-id (:id opts)] + {:com.github.ivarref.yoltq/ext-id (pr-str-safe :id [queue-name ext-id])}) + (when-let [job-group (:job-group opts)] + {:com.github.ivarref.yoltq/job-group job-group})))) + (do + (log/error "Did not find registered handler for queue" queue-name) + (throw (ex-info (str "Did not find registered handler for queue: " queue-name) {:queue queue-name}))))) + + +(defn depends-on-waiting? [{:keys [conn]} + q-item] + (let [db (d/db conn)] + (when-let [{:com.github.ivarref.yoltq/keys [opts]} (u/get-queue-item db (:id q-item))] + (when-let [[q id :as depends-on] (:depends-on opts)] + (when-not (d/q '[:find ?e . + :in $ ?ext-id + :where + [?e :com.github.ivarref.yoltq/ext-id ?ext-id] + [?e :com.github.ivarref.yoltq/status :done]] + db + (pr-str [q id])) + (log/info "queue item" (str (:id q-item)) "is waiting on" depends-on) + {:depends-on depends-on}))))) + + +(defn take! [{:keys [conn cas-failures hung-log-level tx-spent-time!] + :or {hung-log-level :error}} + {:keys [tx id queue-name was-hung? to-error?] :as queue-item-info}] + (when queue-item-info + (try + (cond to-error? + (log/logp hung-log-level "queue-item" (str id) "was hung and retried too many times. Giving up!") + + was-hung? + (log/logp hung-log-level "queue-item" (str id) "was hung, retrying ...") + + :else + nil) + (let [start-time (System/nanoTime) + {:keys [db-after]} @(d/transact conn tx) + _ (when tx-spent-time! (tx-spent-time! (- (System/nanoTime) start-time))) + {:com.github.ivarref.yoltq/keys [status] :as q-item} (u/get-queue-item db-after id)] + (log/debug "queue item" (str id) "for queue" queue-name "now has status" status) + q-item) + (catch Throwable t + (let [{:db/keys [error] :as m} (u/db-error-map t)] + (cond + (= :db.error/cas-failed error) + (do + (log/info "take! :db.error/cas-failed for queue item" (str id) "and attribute" (:a m)) + (when cas-failures + (swap! cas-failures inc)) + nil) + + :else + (do + (log/error t "Unexpected failure for queue item" (str id) ":" (ex-message t)) + nil))))))) + + +(defn mark-status! [{:keys [conn tx-spent-time!]} + {:com.github.ivarref.yoltq/keys [id lock tries]} + new-status] + (try + (let [tx [[:db/cas [:com.github.ivarref.yoltq/id id] :com.github.ivarref.yoltq/lock lock (u/random-uuid)] + [:db/cas [:com.github.ivarref.yoltq/id id] :com.github.ivarref.yoltq/tries tries (inc tries)] + [:db/cas [:com.github.ivarref.yoltq/id id] :com.github.ivarref.yoltq/status u/status-processing new-status] + (if (= new-status u/status-done) + {:db/id [:com.github.ivarref.yoltq/id id] :com.github.ivarref.yoltq/done-time (u/now-ms)} + {:db/id [:com.github.ivarref.yoltq/id id] :com.github.ivarref.yoltq/error-time (u/now-ms)})] + start-time (System/nanoTime) + {:keys [db-after]} @(d/transact conn tx)] + (when tx-spent-time! (tx-spent-time! (- (System/nanoTime) start-time))) + (u/get-queue-item db-after id)) + (catch Throwable t + (log/error t "unexpected error in mark-status!: " (ex-message t)) + nil))) + + +(defn fmt [id queue-name new-status tries spent-ns] + (str/join " " ["queue-item" (str id) + "for queue" queue-name + "now has status" new-status + "after" tries (if (= 1 tries) + "try" + "tries") + "in" (format "%.1f" (double (/ spent-ns 1e6))) "ms"])) + + +(defn execute! [{:keys [decode handlers mark-status-fn! start-execute-time collect-spent-time!] + :or {mark-status-fn! mark-status! + decode edn/read-string} + :as cfg} + {:com.github.ivarref.yoltq/keys [status id queue-name payload payload-bytes] :as queue-item}] + (when queue-item + (if (= :error status) + (assoc queue-item :failed? true) + (if-let [queue (get handlers queue-name)] + (let [{:keys [f allow-cas-failure?]} queue + decode (get queue :decode decode)] + (log/debug "queue item" (str id) "for queue" queue-name "is now processing") + (let [{:keys [retval exception]} + (try + (swap! start-execute-time assoc (Thread/currentThread) [(ext/now-ms) id queue-name]) + (let [payload (decode (or payload payload-bytes)) + v (f payload)] + {:retval v}) + (catch Throwable t + {:exception t}) + (finally + (swap! start-execute-time dissoc (Thread/currentThread)))) + {:db/keys [error] :as m} (u/db-error-map exception)] + (cond + (and (some? exception) + allow-cas-failure? + (= :db.error/cas-failed error) + (or (true? allow-cas-failure?) + (allow-cas-failure? (:a m)))) + (when-let [q-item (mark-status-fn! cfg queue-item u/status-done)] + (let [{:com.github.ivarref.yoltq/keys [init-time done-time tries]} q-item] + (log/info (fmt id queue-name u/status-done tries (- done-time init-time))) + (when collect-spent-time! (collect-spent-time! (- (u/now-ms) init-time))) + (assoc q-item :retval retval :success? true :allow-cas-failure? true))) + + (some? exception) + (when-let [q-item (mark-status-fn! cfg queue-item u/status-error)] + (let [{:com.github.ivarref.yoltq/keys [init-time error-time tries]} q-item + level (if (>= tries 3) :error :warn)] + (log/logp level exception (fmt id queue-name u/status-error tries (- error-time init-time))) + (log/logp level exception "error message was:" (str \" (ex-message exception) \") "for queue-item" (str id)) + (log/logp level exception "ex-data was:" (ex-data exception) "for queue-item" (str id)) + (when collect-spent-time! (collect-spent-time! (- (u/now-ms) init-time))) + (assoc q-item :exception exception))) + + :else + (when-let [q-item (mark-status-fn! cfg queue-item u/status-done)] + (let [{:com.github.ivarref.yoltq/keys [init-time done-time tries]} q-item] + (log/info (fmt id queue-name u/status-done tries (- done-time init-time))) + (when collect-spent-time! (collect-spent-time! (- (u/now-ms) init-time))) + (assoc q-item :retval retval :success? true)))))) + (do + (log/error "no handler for queue" queue-name) + nil))))) diff --git a/src/com/github/ivarref/yoltq/migrate.clj b/src/com/github/ivarref/yoltq/migrate.clj new file mode 100644 index 0000000..c97f679 --- /dev/null +++ b/src/com/github/ivarref/yoltq/migrate.clj @@ -0,0 +1,61 @@ +(ns com.github.ivarref.yoltq.migrate + (:require [datomic.api :as d] + [clojure.tools.logging :as log])) + +(defn to->v2-ent [{:keys [conn]} now-ms id] + (log/info "Migrating id" id) + (let [attr-val (fn [attr] + (when-let [old (d/q '[:find ?time . + :in $ ?e ?a + :where + [?e ?a ?time]] + (d/db conn) + [:com.github.ivarref.yoltq/id id] + attr)] + (let [now-ms (or now-ms + (.getTime (d/q '[:find (max ?txinst) . + :in $ ?e ?a + :where + [?e ?a _ ?tx true] + [?tx :db/txInstant ?txinst]] + (d/history (d/db conn)) + [:com.github.ivarref.yoltq/id id] + attr)))] + (log/info "Updating" id attr "to" now-ms) + [[:db/cas [:com.github.ivarref.yoltq/id id] + attr old now-ms]])))] + (vec (concat [[:db/cas [:com.github.ivarref.yoltq/id id] + :com.github.ivarref.yoltq/version nil "2"]] + (mapcat attr-val [:com.github.ivarref.yoltq/init-time + :com.github.ivarref.yoltq/processing-time + :com.github.ivarref.yoltq/done-time + :com.github.ivarref.yoltq/error-time]))))) + +(defn to->v2 [{:keys [conn loop? now-ms] + :or {loop? true} + :as cfg}] + (loop [tx-vec []] + (if-let [id (some->> (d/q '[:find [?id ...] + :in $ + :where + [?e :com.github.ivarref.yoltq/id ?id] + [(missing? $ ?e :com.github.ivarref.yoltq/version)]] + (d/db conn)) + (sort) + (not-empty) + (first))] + (let [tx (to->v2-ent cfg now-ms id)] + @(d/transact conn tx) + (if loop? + (recur (vec (take 10 (conj tx-vec tx)))) + tx)) + (do + (log/info "No items left to migrate") + tx-vec)))) + + +(defn migrate! [cfg] + (to->v2 cfg)) + +(comment + (migrate! @com.github.ivarref.yoltq/*config*)) diff --git a/src/com/github/ivarref/yoltq/poller.clj b/src/com/github/ivarref/yoltq/poller.clj new file mode 100644 index 0000000..9cf81c7 --- /dev/null +++ b/src/com/github/ivarref/yoltq/poller.clj @@ -0,0 +1,64 @@ +(ns com.github.ivarref.yoltq.poller + (:require [com.github.ivarref.yoltq.utils :as u] + [com.github.ivarref.yoltq.impl :as i] + [clojure.tools.logging :as log])) + + +(defn poll-once! [cfg q status] + (when-let [item (case status + :init (u/get-init cfg q) + :error (u/get-error cfg q) + :hung (u/get-hung cfg q))] + (with-bindings (get item :bindings {}) + (if (i/depends-on-waiting? cfg item) + nil + (some->> item + (i/take! cfg) + (i/execute! cfg)))))) + + +(defn poll-queue! [running? + {:keys [running-queues] :as cfg} + [queue-name status :as q]] + (try + (let [[old _] (swap-vals! running-queues conj q)] + (if-not (contains? old q) + (try + (log/debug "polling queue" queue-name "for status" status) + (let [start-time (u/now-ms) + last-res (loop [prev-res nil] + (when @running? + (let [res (poll-once! cfg queue-name status)] + (log/debug "poll-once! returned" res) + (if (and res (:success? res)) + (recur res) + prev-res))))] + (let [spent-ms (- (u/now-ms) start-time)] + (log/trace "done polling queue" q "in" spent-ms "ms")) + last-res) + (finally + (swap! running-queues disj q))) + (log/debug "queue" q "is already being polled, doing nothing..."))) + (catch Throwable t + (log/error t "poll-queue! crashed:" (ex-message t))) + (finally))) + +(comment + (def cfg @com.github.ivarref.yoltq/*config*)) + +(comment + (poll-queue! + (atom true) + @com.github.ivarref.yoltq/*config* + [:add-message-thread :init])) + +(defn poll-all-queues! [running? config-atom pool] + (try + (when @running? + (let [{:keys [handlers]} @config-atom] + (doseq [q (shuffle (vec (for [q-name (keys handlers) + status [:init :error :hung]] + [q-name status])))] + (.execute pool (fn [] (poll-queue! running? @config-atom q)))))) + (catch Throwable t + (log/error t "poll-all-queues! crashed:" (ex-message t))))) diff --git a/src/com/github/ivarref/yoltq/report_queue.clj b/src/com/github/ivarref/yoltq/report_queue.clj new file mode 100644 index 0000000..f83e3ba --- /dev/null +++ b/src/com/github/ivarref/yoltq/report_queue.clj @@ -0,0 +1,459 @@ +(ns com.github.ivarref.yoltq.report-queue + (:require [com.github.ivarref.yoltq.utils :as u] + [com.github.ivarref.yoltq.impl :as i] + [datomic.api :as d] + [clojure.tools.logging :as log]) + (:import (datomic Connection Datom) + (java.util.concurrent LinkedBlockingQueue ScheduledExecutorService BlockingQueue TimeUnit))) + +; Private API, subject to change! + +(defn process-poll-result! [cfg id-ident poll-result consumer] + (let [{:keys [tx-data db-after]} poll-result] + (when-let [new-ids (->> tx-data + (filter (fn [^Datom datom] (and + (= (.a datom) id-ident) + (.added datom)))) + (mapv (fn [^Datom datom] (.v datom))) + (into []) + (not-empty))] + (doseq [id new-ids] + (consumer (fn [] + (try + (let [{:com.github.ivarref.yoltq/keys [lock id status queue-name bindings]} (u/get-queue-item db-after id)] + (with-bindings (or bindings {}) + (if (i/depends-on-waiting? cfg {:id id}) + nil + (some->> + (u/prepare-processing db-after id queue-name lock status) + (i/take! cfg) + (i/execute! cfg))))) + (catch Throwable t + (log/error t "Unexpected error in process-poll-result!"))))))))) + +(defn report-queue-listener [running? + ready? + ^ScheduledExecutorService pool + config-atom] + (let [cfg @config-atom + conn (:conn cfg) + tx-report-queue-given (contains? cfg :tx-report-queue) + ^BlockingQueue q (if tx-report-queue-given + (get cfg :tx-report-queue) + (d/tx-report-queue conn)) + id-ident (d/q '[:find ?e . + :where [?e :db/ident :com.github.ivarref.yoltq/id]] + (d/db conn))] + (assert (instance? BlockingQueue q)) + (log/debug "tx-report-queue-given:" tx-report-queue-given) + (try + (let [running-local? (atom true)] + (while (and @running? @running-local?) + (when-let [poll-result (.poll ^BlockingQueue q 1 TimeUnit/SECONDS)] + (if (= poll-result :end) + (do + (log/debug "report-queue-listener received :end token. Exiting") + (reset! running-local? false)) + ;(log/warn "yoltq report-queue-listener received :end token. If the rest of the system is kept running, it will result in a partially broken system.")) + (process-poll-result! @config-atom + id-ident + poll-result + (fn [f] + (when @running? + (.execute ^ScheduledExecutorService pool f)))))) + (deliver ready? :ready))) + (catch Throwable t + (log/error t "Unexpected error in report-queue-listener:" (.getMessage t))) + (finally + (if tx-report-queue-given + (log/debug "Remove tx-report-queue handled elsewhere") + (do + (log/debug "Remove tx-report-queue") + (d/remove-tx-report-queue conn))))))) + +; https://stackoverflow.com/a/14488425 +(defn- dissoc-in + "Dissociates an entry from a nested associative structure returning a new + nested structure. keys is a sequence of keys. Any empty maps that result + will not be present in the new structure." + [m [k & ks :as keys]] + (if ks + (if-let [nextmap (get m k)] + (let [newmap (dissoc-in nextmap ks)] + (if (seq newmap) + (assoc m k newmap) + (dissoc m k))) + m) + (dissoc m k))) + +(defn- queues-to-shutdown [old-state new-state] + (assert (map? old-state)) + (assert (map? new-state)) + (doseq [x (vals new-state)] + (assert (vector? x))) + (doseq [x (vals old-state)] + (assert (vector? x))) + (let [new-qs (into #{} (mapv second (vals new-state)))] + (reduce + (fn [o [send-end-token? old-q]] + ;(assert (boolean? send-end-token?)) + ;(assert (instance? BlockingQueue old-q)) + (if (contains? new-qs old-q) + o + (conj o [send-end-token? old-q]))) + [] + (vals old-state)))) + +(comment + (queues-to-shutdown {:a [true 999] :b [false 777]} + {:a [true 123] :b [true 777]})) + +(defn- multicast-once [conn work-item old-state new-state] + (assert (map? old-state)) + (assert (map? new-state)) + (doseq [[send-end-token? q-to-shutdown] (queues-to-shutdown old-state new-state)] + (if send-end-token? + (do + #_(log/debug "offering :end token") + (if (.offer ^BlockingQueue q-to-shutdown :end 1 TimeUnit/MICROSECONDS) + (log/debug "Multicaster sent :end token") + (log/debug "Multicaster failed to send :end token"))) + (do + (log/debug "Multicaster not sending :end token")))) + (when (seq new-state) + (if (some? work-item) + (reduce-kv + (fn [m id [send-end-token? q]] + (let [ok-offer (.offer ^BlockingQueue q work-item 1 TimeUnit/MICROSECONDS)] + (if (true? ok-offer) + (assoc m id [send-end-token? q]) + (log/error "Multicaster failed to offer item for connection" conn "and queue id" id)))) + {} + new-state) + new-state))) + +(defonce ^:private multicast-state-lock (Object.)) +(defonce ^:private consumer-state-lock (Object.)) +(defonce ^:private multicast-state (atom {})) +(defonce ^:private thread-count (atom 0)) + +(defn- multicaster-loop [init-state conn ready?] + (assert (instance? Connection conn)) + (let [input-queue (d/tx-report-queue conn)] + (deliver ready? true) + (loop [old-state init-state] + (let [work-item (.poll ^BlockingQueue input-queue 16 TimeUnit/MILLISECONDS) + new-state (locking multicast-state-lock + ; writer to `multicast-state` must be protected by `multicast-state-lock` + ; it should block minimally / spend minimum amount of time + (swap! multicast-state (fn [old-state] (update-in old-state [:iter-count conn] (fnil inc 0)))) + (if-let [new-state (multicast-once conn work-item old-state (get-in @multicast-state [:queues conn] {}))] + new-state + (do (swap! multicast-state (fn [old-state] (dissoc-in old-state [:queues conn]))) + (swap! multicast-state (fn [old-state] (update-in old-state [:thread-count conn] dec))) + (d/remove-tx-report-queue conn) + (log/debug "Multicaster removed tx-report-queue for conn" conn) + nil)))] + (if new-state + (recur new-state) + nil))))) + +(defn- start-multicaster! [conn] + (assert (instance? Connection conn)) + (let [ready? (promise)] + (future + (log/debug "Multicaster starting for conn" conn) + (try + (swap! thread-count inc) + (let [new-state (swap! multicast-state (fn [old-state] (update-in old-state [:thread-count conn] (fnil inc 0))))] + (assert (= 1 (get-in new-state [:thread-count conn]))) + ; "parent" thread holds `multicast-state-lock` and + ; waits for `ready?` promise, so effectively this new thread also holds + ; the lock until `ready?` is delivered. That is: it is safe + ; for this thread to modify multicast-state regardless of what other threads are doing + (multicaster-loop (get-in new-state [:queues conn]) conn ready?)) + (catch Throwable t + (log/error t "Unexpected error in multicaster:" (.getMessage t)) + (log/error "Multicaster exiting for conn")) + (finally + (swap! thread-count dec) + (log/debug "Multicaster exiting for conn" conn)))) + (when (= :timeout (deref ready? 30000 :timeout)) + (throw (RuntimeException. "Timed out waiting for multicaster to start"))))) + +(defn- wait-multicast-thread-step + [conn state] + ; `get-tx-report-queue-multicast!` should return only when the multicaster thread + ; has picked up the new queue. + ; + ; Otherwise the following could happen: + ; 1. multicast thread is sleeping + ; 2: user-thread calls get-tx-report-queue-multicast! with `send-end-token?` `true` + ; 3: user-thread (or somebody else) calls `stop-multicaster`. + ; The multicast-state atom is now identical as it was in step 1. + ; , Step 2 and 3 happened while the multicast thread was sleeping. + ; 4: The multicast thread is scheduled and does _not_ detect any state change. + ; Therefore the multicast thread does _not_ send out an :end token as one would expect. + ; + ; The new queue is written to memory at this point. No other thread can remove it because + ; we are still, and have been during the modification of multicast-state, holding consumer-state-lock. + ; This means that the multicast thread cannot exit at this point. Also, because we hold the consumer-state-lock, + ; we can be sure that no other thread changes or has changed the state. + ; + ; Once [:iter-count conn] has changed, we know that the multicaster thread + ; has seen the new queue. This means that we can be sure that the queue + ; will receive the `:end` token if the queue is stopped. + (let [start-ms (System/currentTimeMillis) + iter-count (get-in state [:iter-count conn] -1)] + (loop [spin-count 0] + (if (not= iter-count (locking multicast-state-lock + (get-in @multicast-state [:iter-count conn] -1))) + nil + (let [spent-ms (- (System/currentTimeMillis) start-ms)] + (if (> spent-ms 30000) + (throw (RuntimeException. "Timed out waiting for multicaster thread")) + (do + (Thread/sleep 16) + (recur (inc spin-count))))))))) + +(defn get-tx-report-queue-multicast! + "Multicast the datomic.api/tx-report-queue to different consumers. + A multicaster thread is started on demand per connection. `conn` and `id` identifies the consumer. + Repeated calls using the same `conn` and `id` returns the same queue. + + The optional third parameter, `send-end-token?`, if set to `true`, instructs the multicaster thread + to send `:end` if the queue is stopped. The default value is `false`. + + A queue may be stopped using `stop-multicaster-id!`, `stop-multicaster!` or `stop-all-multicasters!`. + + Returns a `java.util.concurrent.BlockingQueue` like `datomic.api/tx-report-queue`." + ([conn id] + (get-tx-report-queue-multicast! conn id false)) + ([conn id send-end-token?] + (assert (instance? Connection conn)) + (locking consumer-state-lock + (let [[new-state the-q] + (locking multicast-state-lock + (assert (map? @multicast-state)) + (if-let [existing-q (get-in @multicast-state [:queues conn id])] + (do + (let [new-state (swap! multicast-state + (fn [old-state] + (update-in old-state [:queues conn id] (fn [[end-token? q]] + (if (not= end-token? send-end-token?) + (log/debug "flipped `send-end-token?`") + (log/debug "identical `send-end-token?`")) + [send-end-token? q]))))] + (log/debug "Returning existing queue for id" id) + (assert (instance? BlockingQueue (second existing-q))) + [new-state (second existing-q)])) + (let [needs-multicaster? (nil? (get-in @multicast-state [:queues conn])) + new-q (LinkedBlockingQueue.) + new-state (swap! multicast-state (fn [old-state] (assoc-in old-state [:queues conn id] [send-end-token? new-q])))] + (if needs-multicaster? + (do + (start-multicaster! conn) + (log/debug "Returning new queue for id" id "(multicaster thread started)") + [new-state new-q]) + (do + (log/debug "Returning new queue for id" id "(multicaster thread already running)") + [new-state new-q])))))] + ; wait for multicaster thread to pick up current Queue + (wait-multicast-thread-step conn new-state) + the-q)))) + +(defn- wait-multicast-threads-exit [[old-state new-state]] + (assert (map? old-state)) + (assert (map? new-state)) + (assert (map? (get old-state :queues {}))) + (assert (map? (get new-state :queues {}))) + (assert (map? (get old-state :thread-count {}))) + (assert (map? (get new-state :thread-count {}))) + (locking consumer-state-lock + ; No new multicast threads will be launched inside this block. + ; The lock is already held by parent function. + ; + ; Why do we need to _wait_ for multicaster thread(s) to exit after + ; removing all queue ids for a given connection? + ; Otherwise the following could happen: + ; 1. multicaster thread is sleeping + ; 2. user calls stop-multicaster! + ; One would expect that multicaster thread would exit, but it is still sleeping + ; 3. user calls get-tx-report-queue-multicast! with the same conn + ; The state is now empty, so a new multicaster thread is spawned. + ; 4. Now there is two multicaster threads for the same connection! + ; ... and since the datomic report queue can be shared between threads + ; it will seemingly work, but when the end event is sent, it will be + ; sent by multiple threads. + (let [old-conns (into #{} (keys (get old-state :queues {}))) + new-conns (into #{} (keys (get new-state :queues {})))] + (assert (every? + (fn [x] (instance? Connection x)) + old-conns)) + (assert (every? + (fn [x] (instance? Connection x)) + new-conns)) + (doseq [old-conn old-conns] + (when-not (contains? new-conns old-conn) + (let [old-threadcount (get-in old-state [:thread-count old-conn] nil)] + (assert (= 1 old-threadcount)) + (let [start-ms (System/currentTimeMillis)] + (loop [] + (if (= 0 (get-in @multicast-state [:thread-count old-conn])) + :ok + (do + (let [spent-ms (- (System/currentTimeMillis) start-ms)] + (if (> spent-ms 30000) + (throw (RuntimeException. "Timed out waiting for multicaster thread to exit")) + (do + (Thread/sleep 16) + (recur)))))))))))))) + +(defn- all-queues [state] + (->> (mapcat (fn [[conn qmap]] + (mapv (fn [q-id] [conn q-id]) + (keys qmap))) + (seq (get state :queues {}))) + (into #{}))) + +(comment + (do + (assert (= #{} + (all-queues {}))) + (assert (= #{} + (all-queues {:queues {}}))) + (assert (= #{[:conn-a :q-id]} + (all-queues {:queues {:conn-a {:q-id 1}}}))) + (assert (= #{[:conn-a :q-id] [:conn-a :q-id-2]} + (all-queues {:queues {:conn-a {:q-id 1 :q-id-2 2}}}))) + (assert (= #{[:conn-a :q-id-2] [:conn-b :q-id-3] [:conn-a :q-id]} + (all-queues {:queues {:conn-a {:q-id 1 :q-id-2 2} + :conn-b {:q-id-3 3}}}))))) + +(defn- removed-queues? [old new] + (not= (all-queues old) + (all-queues new))) + +(defn stop-multicast-consumer-id! [conn id] + (assert (instance? Connection conn)) + (let [did-remove? (atom nil)] + (locking consumer-state-lock + (wait-multicast-threads-exit + (locking multicast-state-lock + (let [[old new] (swap-vals! multicast-state (fn [old-state] + (let [new-state (dissoc-in old-state [:queues conn id])] + (if (= {} (get-in new-state [:queues conn])) + (dissoc-in old-state [:queues conn]) + new-state))))] + (reset! did-remove? (removed-queues? old new)) + [old new])))) + @did-remove?)) + +(defn stop-multicaster! [conn] + (assert (instance? Connection conn)) + (let [did-remove? (atom nil)] + (locking consumer-state-lock + (wait-multicast-threads-exit + (locking multicast-state-lock + (let [[old new] (swap-vals! multicast-state (fn [old-state] (dissoc-in old-state [:queues conn])))] + (reset! did-remove? (removed-queues? old new)) + [old new])))) + @did-remove?)) + +(defn stop-all-multicasters! [] + (let [did-remove? (atom nil)] + (locking consumer-state-lock + (wait-multicast-threads-exit + (locking multicast-state-lock + (let [[old new] (swap-vals! multicast-state (fn [old-state] (assoc old-state :queues {})))] + (reset! did-remove? (removed-queues? old new)) + [old new])))) + @did-remove?)) + +(comment + (do + (require 'com.github.ivarref.yoltq.log-init) + (require '[datomic.api :as d]) + (com.github.ivarref.yoltq.log-init/init-logging! + [[#{"datomic.*" "com.datomic.*" "org.apache.*"} :warn] + [#{"com.github.ivarref.yoltq.report-queue"} :debug] + [#{"com.github.ivarref.yoltq.poller"} :info] + [#{"com.github.ivarref.yoltq"} :debug] + ;[#{"ivarref.yoltq*"} :info] + [#{"*"} :info]]) + (defonce conn (let [uri (str "datomic:mem://demo") + _ (d/delete-database uri) + _ (d/create-database uri) + conn (d/connect uri)] + conn)))) + +(comment + (do + (require 'com.github.ivarref.yoltq.log-init) + (defn drain! [^BlockingQueue q] + (loop [items []] + (if-let [elem (.poll q 100 TimeUnit/MILLISECONDS)] + (recur (conj items elem)) + items))) + (com.github.ivarref.yoltq.log-init/init-logging! + [[#{"datomic.*" "com.datomic.*" "org.apache.*"} :warn] + [#{"com.github.ivarref.yoltq.report-queue"} :debug] + [#{"com.github.ivarref.yoltq.poller"} :info] + [#{"com.github.ivarref.yoltq"} :debug] + ;[#{"ivarref.yoltq*"} :info] + [#{"*"} :info]]) + (log/info "********************************") + (defonce conn (let [uri (str "datomic:mem://demo") + _ (d/delete-database uri) + _ (d/create-database uri) + conn (d/connect uri)] + conn)) + (log/info "stop-all!") + (stop-all-multicasters!) + (assert (= 0 @thread-count)) + (let [q1 (get-tx-report-queue-multicast! conn :q1 false) + q2 (get-tx-report-queue-multicast! conn :q2 false) + _ (get-tx-report-queue-multicast! conn :q1 true)] + @(d/transact conn [{:db/doc "demo"}]) + @(d/transact conn [{:db/doc "demo"}]) + @(d/transact conn [{:db/doc "demo"}]) + (log/info "begin drain q1") + (stop-multicast-consumer-id! conn :q1) + (stop-multicast-consumer-id! conn :q1) + (println "thread count" @thread-count) + (let [qitems-2 (drain! q2) + qitems-1 (drain! q1)] + (assert (= :end (last qitems-1))) + (println "drain count q1:" (count qitems-1)) + (println "drain count q2:" (count qitems-2)))))) + +(comment + (do + (let [q (get-tx-report-queue-multicast! conn :q1 true)] + (log/debug "stopping id :q1") + (stop-multicaster-id! conn :q1) + (let [drained (drain! q)] + (println "drained:" drained) + (assert (= [:end] drained))) + @multicast-state))) + +(comment + (stop-all-multicasters!)) + +(comment + (do + (let [q (get-tx-report-queue-multicast! conn :q2 false)] + (println "drain count:" (count (drain! q))) + @multicast-state + nil))) + +(comment + (get-tx-report-queue-multicast! conn :q1 false) + (get-tx-report-queue-multicast! conn :q1 true)) + +(comment + (do + @(d/transact conn [{:db/doc "demo"}]) + @(d/transact conn [{:db/doc "demo"}]) + @(d/transact conn [{:db/doc "demo"}]) + :yay))
\ No newline at end of file diff --git a/src/com/github/ivarref/yoltq/slow_executor_detector.clj b/src/com/github/ivarref/yoltq/slow_executor_detector.clj new file mode 100644 index 0000000..53dfe89 --- /dev/null +++ b/src/com/github/ivarref/yoltq/slow_executor_detector.clj @@ -0,0 +1,36 @@ +(ns com.github.ivarref.yoltq.slow-executor-detector + (:require [clojure.string :as str] + [clojure.tools.logging :as log] + [com.github.ivarref.yoltq.ext-sys :as ext]) + (:import (java.util.concurrent ExecutorService))) + +(defn- do-show-slow-threads [{:keys [start-execute-time + max-execute-time + slow? + slow-thread-show-stacktrace?] + :or {slow-thread-show-stacktrace? true}}] + (let [new-slow-val (atom false)] + (doseq [[^Thread thread [start-time queue-id queue-name]] @start-execute-time] + (when (> (ext/now-ms) (+ start-time max-execute-time)) + (reset! new-slow-val true) + (log/error "thread" (.getName thread) "spent too much time on" + "queue item" (str queue-id) + "for queue" queue-name + (if slow-thread-show-stacktrace? + (str "stacktrace: \n" (str/join "\n" (mapv str (seq (.getStackTrace thread))))) + "")))) + (reset! slow? @new-slow-val))) + +(defn show-slow-threads [^ExecutorService pool config-atom] + (try + (while (not (.isTerminated pool)) + (try + (do-show-slow-threads @config-atom) + (catch Throwable t + (log/error t "do-show-slow-threads crashed:" (ex-message t)))) + (dotimes [_ 3] + (when (not (.isTerminated pool)) + (Thread/sleep 1000)))) + (log/debug "show-slow-threads exiting") + (catch Throwable t + (log/error t "reap! crashed:" (ex-message t))))) diff --git a/src/com/github/ivarref/yoltq/test_queue.clj b/src/com/github/ivarref/yoltq/test_queue.clj new file mode 100644 index 0000000..ee9cd54 --- /dev/null +++ b/src/com/github/ivarref/yoltq/test_queue.clj @@ -0,0 +1,191 @@ +(ns com.github.ivarref.yoltq.test-queue + (:require [clojure.tools.logging :as log] + [com.github.ivarref.yoltq.report-queue :as rq] + [com.github.ivarref.yoltq.ext-sys :as ext] + [com.github.ivarref.yoltq :as yq] + [datomic.api :as d] + [com.github.ivarref.yoltq.poller :as poller] + [clojure.test :as test] + [com.github.ivarref.yoltq.utils :as u] + [com.github.ivarref.yoltq.impl :as i]) + (:import (java.util.concurrent BlockingQueue TimeUnit) + (datomic Datom))) + + +(defn bootstrap-poller! [txq running? poller-exited? bootstrapped? conn] + (let [ready? (promise)] + (future + (let [q (d/tx-report-queue conn)] + (try + (while @running? + (when-let [poll-result (.poll ^BlockingQueue q 10 TimeUnit/MILLISECONDS)] + (swap! txq conj poll-result)) + (deliver ready? true) + (reset! bootstrapped? true)) + (catch Throwable t + (log/error t "test-poller crashed: " (ex-message t))) + (finally + (try + (d/remove-tx-report-queue conn) + (catch Throwable t + (log/warn t "remove-tx-report-queue failed:" (ex-message t)))) + (deliver poller-exited? true))))) + @ready?)) + + +(defmacro with-virtual-queue! + [& body] + `(let [txq# (atom []) + poller-exited?# (promise) + bootstrapped?# (atom false) + running?# (atom true) + config# (atom {:bootstrap-poller! (partial bootstrap-poller! txq# running?# poller-exited?# bootstrapped?#) + :init-backoff-time 0 + :hung-log-level :warn + :prev-consumed (atom {}) + :tx-queue txq#})] + (with-bindings {#'yq/*config* config# + #'yq/*running?* (atom false) + #'yq/*test-mode* true + #'ext/*now-ms-atom* (atom 0) + #'ext/*random-atom* (atom 0) + #'ext/*squuid-atom* (atom 0)} + (try + ~@body + (finally + (reset! running?# false) + (when @bootstrapped?# + @poller-exited?#)))))) + + +(defn call-with-virtual-queue! + [f] + (with-virtual-queue! + (f))) + + +(defn run-report-queue! [min-items] + (let [{:keys [tx-queue conn]} @yq/*config* + id-ident (d/q '[:find ?e . + :where [?e :db/ident :com.github.ivarref.yoltq/id]] + (d/db conn))] + (let [timeout (+ 3000 (System/currentTimeMillis))] + (while (and (< (System/currentTimeMillis) timeout) + (< (count @tx-queue) min-items)) + (Thread/sleep 10))) + (when (< (count @tx-queue) min-items) + (let [msg (str "run-report-queue: timeout waiting for " min-items " items")] + (log/error msg) + (throw (ex-info msg {})))) + (let [res (atom [])] + (doseq [itm (first (swap-vals! tx-queue (constantly [])))] + (rq/process-poll-result! + @yq/*config* + id-ident + itm + (fn [f] (swap! res conj (f))))) + @res))) + + +(defn run-one-report-queue! [] + (first (run-report-queue! 1))) + + +(defn run-queue-once! [q status] + (poller/poll-once! @yq/*config* q status)) + + +(defn put! [q payload] + @(d/transact (:conn @yq/*config*) [(yq/put q payload)])) + + +(defn transact-result->maps [{:keys [tx-data db-after]}] + (let [m (->> tx-data + (group-by (fn [^Datom d] (.e d))) + (vals) + (mapv (fn [datoms] + (reduce (fn [o ^Datom d] + (if (.added d) + (assoc o (d/q '[:find ?r . + :in $ ?e + :where [?e :db/ident ?r]] + db-after + (.a d)) + (.v d)) + o)) + {} + datoms))))] + m)) + +(defn contains-queue-job? + [queue-id conn {::yq/keys [id queue-name status] :as m}] + (when (and (= queue-id queue-name) + (= status :init) + (d/q '[:find ?e . + :in $ ?id + :where + [?e ::yq/id ?id] + [?e ::yq/status :init]] + (d/db conn) + id)) + m)) + + +(defn get-tx-q-job [q-id] + (let [{:keys [tx-queue conn]} @yq/*config*] + (loop [timeout (+ 3000 (System/currentTimeMillis))] + (if-let [job (->> @tx-queue + (mapcat transact-result->maps) + (filter (partial contains-queue-job? q-id conn)) + (first))] + (u/get-queue-item (d/db conn) (::yq/id job)) + (if (< (System/currentTimeMillis) timeout) + (do (Thread/sleep 10) + (recur timeout)) + nil))))) + +(defmacro consume-expect! [queue-name expected-status] + `(if-let [job# (get-tx-q-job ~queue-name)] + (try + (with-bindings (:com.github.ivarref.yoltq/bindings job#) + (let [prep# (u/prepare-processing (d/db (:conn @yq/*config*)) + (:com.github.ivarref.yoltq/id job#) + ~queue-name + (:com.github.ivarref.yoltq/lock job#) + (:com.github.ivarref.yoltq/status job#))] + (if-let [depends-on# (i/depends-on-waiting? @yq/*config* prep#)] + depends-on# + (let [res# (some->> prep# + (i/take! @yq/*config*) + (i/execute! @yq/*config*))] + (swap! (:prev-consumed @yq/*config*) assoc ~queue-name res#) + (test/is (= ~expected-status (:com.github.ivarref.yoltq/status res#))) + (if (:retval res#) + (:retval res#) + (:exception res#)))))) + (catch Throwable t# + (log/error t# "unexpected error in consume-expect:" (ex-message t#)) + (throw t#))) + (test/is false (str "No job found for queue " ~queue-name)))) + +(defmacro consume! [queue-name] + `(consume-expect! ~queue-name :done)) + + +(defmacro force-retry! [queue-name] + `(if-let [job# (some-> @yq/*config* :prev-consumed deref (get ~queue-name))] + (let [db-res# @(d/transact (:conn @yq/*config*) [{:com.github.ivarref.yoltq/id (:com.github.ivarref.yoltq/id job#) + :com.github.ivarref.yoltq/status :init}]) + res# (some->> (u/prepare-processing (:db-after db-res#) + (:com.github.ivarref.yoltq/id job#) + ~queue-name + (:com.github.ivarref.yoltq/lock job#) + :init) + (i/take! @yq/*config*) + (i/execute! @yq/*config*))] + (swap! (:prev-consumed @yq/*config*) assoc ~queue-name res#) + (test/is (= :done (:com.github.ivarref.yoltq/status res#))) + (if (:retval res#) + (:retval res#) + (:exception res#))) + (test/is false "Expected to have previously consumed something. Was nil."))) diff --git a/src/com/github/ivarref/yoltq/utils.clj b/src/com/github/ivarref/yoltq/utils.clj new file mode 100644 index 0000000..9defd0e --- /dev/null +++ b/src/com/github/ivarref/yoltq/utils.clj @@ -0,0 +1,179 @@ +(ns com.github.ivarref.yoltq.utils + (:require [datomic.api :as d] + [clojure.edn :as edn] + [com.github.ivarref.yoltq.ext-sys :as ext] + [clojure.tools.logging :as log]) + (:refer-clojure :exclude [random-uuid]) + (:import (datomic Connection) + (java.time Duration))) + + +(def status-init :init) +(def status-processing :processing) +(def status-done :done) +(def status-error :error) + +(def current-version "2") + +(defn duration->millis [m] + (reduce-kv (fn [o k v] + (if (instance? Duration v) + (assoc o k (.toMillis v)) + (assoc o k v))) + {} + m)) + + +(defn squuid [] + (ext/squuid)) + + +(defn random-uuid [] + (ext/random-uuid)) + + +(defn now-ms [] + (ext/now-ms)) + + +(defn root-cause [e] + (if-let [root (ex-cause e)] + (root-cause root) + e)) + + +(defn db-error-map [^Throwable t] + (loop [e t] + (cond (nil? e) nil + + (and (map? (ex-data e)) + (contains? (ex-data e) :db/error)) + (ex-data e) + + :else + (recur (ex-cause e))))) + + +(defn get-queue-item [db id] + (-> (d/pull db '[:*] [:com.github.ivarref.yoltq/id id]) + (dissoc :db/id) + (update :com.github.ivarref.yoltq/opts (fn [s] (or (when s (edn/read-string s)) {}))) + (update :com.github.ivarref.yoltq/bindings + (fn [s] + (when s + (->> s + (edn/read-string) + (reduce-kv (fn [o k v] + (assoc o (resolve k) v)) + {}))))))) + + +(defn prepare-processing [db id queue-name old-lock old-status] + (let [new-lock (random-uuid)] + {:id id + :lock new-lock + :queue-name queue-name + :bindings (get (get-queue-item db id) :com.github.ivarref.yoltq/bindings {}) + :tx [[:db/cas [:com.github.ivarref.yoltq/id id] :com.github.ivarref.yoltq/lock old-lock new-lock] + [:db/cas [:com.github.ivarref.yoltq/id id] :com.github.ivarref.yoltq/status old-status status-processing] + {:db/id [:com.github.ivarref.yoltq/id id] :com.github.ivarref.yoltq/processing-time (now-ms)}]})) + + +(defn get-init [{:keys [conn db init-backoff-time] :as cfg} queue-name] + (assert (instance? Connection conn) (str "Expected conn to be of type datomic.Connection. Was: " + (str (if (nil? conn) "nil" conn)) + "\nConfig was: " (str cfg))) + (let [db (or db (d/db conn))] + (if-let [ids (->> (d/q '[:find ?id ?lock + :in $ ?queue-name ?backoff ?current-version + :where + [?e :com.github.ivarref.yoltq/status :init] + [?e :com.github.ivarref.yoltq/queue-name ?queue-name] + [?e :com.github.ivarref.yoltq/init-time ?init-time] + [(>= ?backoff ?init-time)] + [?e :com.github.ivarref.yoltq/id ?id] + [?e :com.github.ivarref.yoltq/lock ?lock] + [?e :com.github.ivarref.yoltq/version ?current-version]] + db + queue-name + (- (now-ms) init-backoff-time) + current-version) + (not-empty))] + (let [[id old-lock] (rand-nth (into [] ids))] + (prepare-processing db id queue-name old-lock :init)) + (log/debug "no new-items in :init status for queue" queue-name)))) + +(defn- get-max-retries [cfg queue-name] + (let [v (get-in cfg [:handlers queue-name :max-retries] (:max-retries cfg))] + (if (and (number? v) (pos-int? v)) + v + Long/MAX_VALUE))) + +(defn get-error [{:keys [conn db error-backoff-time] :as cfg} queue-name] + (assert (instance? Connection conn) (str "Expected conn to be of type datomic.Connection. Was: " + (str (if (nil? conn) "nil" conn)) + "\nConfig was: " (str cfg))) + (let [db (or db (d/db conn)) + max-retries (get-max-retries cfg queue-name)] + (when-let [ids (->> (d/q '[:find ?id ?lock + :in $ ?queue-name ?backoff ?max-tries ?current-version + :where + [?e :com.github.ivarref.yoltq/status :error] + [?e :com.github.ivarref.yoltq/queue-name ?queue-name] + [?e :com.github.ivarref.yoltq/error-time ?time] + [(>= ?backoff ?time)] + [?e :com.github.ivarref.yoltq/tries ?tries] + [(>= ?max-tries ?tries)] + [?e :com.github.ivarref.yoltq/id ?id] + [?e :com.github.ivarref.yoltq/lock ?lock] + [?e :com.github.ivarref.yoltq/version ?current-version]] + db + queue-name + (- (now-ms) error-backoff-time) + max-retries + current-version) + (not-empty))] + (let [[id old-lock] (rand-nth (into [] ids))] + (prepare-processing db id queue-name old-lock :error))))) + + +(defn get-hung [{:keys [conn db now hung-backoff-time] :as cfg} queue-name] + (assert (instance? Connection conn) (str "Expected conn to be of type datomic.Connection. Was: " + (str (if (nil? conn) "nil" conn)) + "\nConfig was: " (str cfg))) + (let [now (or now (now-ms)) + max-retries (get-max-retries cfg queue-name) + db (or db (d/db conn))] + (when-let [ids (->> (d/q '[:find ?id ?lock ?tries + :in $ ?qname ?backoff ?current-version + :where + [?e :com.github.ivarref.yoltq/status :processing] + [?e :com.github.ivarref.yoltq/queue-name ?qname] + [?e :com.github.ivarref.yoltq/processing-time ?time] + [(>= ?backoff ?time)] + [?e :com.github.ivarref.yoltq/tries ?tries] + [?e :com.github.ivarref.yoltq/id ?id] + [?e :com.github.ivarref.yoltq/lock ?lock] + [?e :com.github.ivarref.yoltq/version ?current-version]] + db + queue-name + (- now hung-backoff-time) + current-version) + (not-empty))] + (let [new-lock (random-uuid) + [id old-lock tries _t] (rand-nth (into [] ids)) + to-error? (>= tries max-retries)] + {:id id + :lock new-lock + :queue-name queue-name + :was-hung? true + :to-error? to-error? + :bindings (get (get-queue-item db id) :com.github.ivarref.yoltq/bindings {}) + :tx (if (not to-error?) + [[:db/cas [:com.github.ivarref.yoltq/id id] :com.github.ivarref.yoltq/lock old-lock new-lock] + [:db/cas [:com.github.ivarref.yoltq/id id] :com.github.ivarref.yoltq/tries tries (inc tries)] + {:db/id [:com.github.ivarref.yoltq/id id] :com.github.ivarref.yoltq/error-time now}] + [[:db/cas [:com.github.ivarref.yoltq/id id] :com.github.ivarref.yoltq/lock old-lock new-lock] + [:db/cas [:com.github.ivarref.yoltq/id id] :com.github.ivarref.yoltq/tries tries (inc tries)] + [:db/cas [:com.github.ivarref.yoltq/id id] :com.github.ivarref.yoltq/status status-processing status-error] + {:db/id [:com.github.ivarref.yoltq/id id] :com.github.ivarref.yoltq/error-time now}])})))) |
