Prevent the watcher from submitting too many jobs
Each time the watcher runs on a node group and can obtain the group
lock, it submits a GROUP_VERIFY_DISKS job for that group. A cron job
triggers this every 5 minutes. To prevent job congestion, this patch
stops the watcher from submitting another verify disks job for a group
when one is already running or pending in the queue.
Signed-off-by: Federico Morg Pareschi <morg@google.com>
Reviewed-by: Brian Foley <bpfoley@google.com>
diff --git a/lib/watcher/__init__.py b/lib/watcher/__init__.py
index 4e946b3..881ac83 100644
--- a/lib/watcher/__init__.py
+++ b/lib/watcher/__init__.py
@@ -345,10 +345,34 @@
return compat.any(nodes[node_name].offline for node_name in instance.snodes)
+def _GetPendingVerifyDisks(cl, uuid):
+  """Return the ids of running or pending verify-disks jobs for a group.
+
+  Queries C{cl} for running, queued or waiting jobs and returns, as a list,
+  the ids of those whose opcode summaries name group C{uuid}; summaries are
+  matched by membership, so jobs with several opcodes do not raise.
+  """
+  wanted = "GROUP_VERIFY_DISKS(%s)" % uuid
+  qfilter = qlang.MakeSimpleFilter("status",
+                                   frozenset([constants.JOB_STATUS_RUNNING,
+                                              constants.JOB_STATUS_QUEUED,
+                                              constants.JOB_STATUS_WAITING]))
+  qresult = cl.Query(constants.QR_JOB, ["id", "summary"], qfilter)
+  return [jobid for ((_, jobid), (_, ops)) in qresult.data
+          if wanted in (ops or ())]
+
+
def _VerifyDisks(cl, uuid, nodes, instances):
"""Run a per-group "gnt-cluster verify-disks".
"""
+
+ existing_jobs = _GetPendingVerifyDisks(cl, uuid)
+ if existing_jobs:
+ logging.info("There are verify disks jobs already pending (%s), skipping "
+ "VerifyDisks step for %s.",
+ utils.CommaJoin(existing_jobs), uuid)
+ return
+
op = opcodes.OpGroupVerifyDisks(
group_name=uuid, priority=constants.OP_PRIO_LOW)
op.reason = [(constants.OPCODE_REASON_SRC_WATCHER,