Prevent the watcher from submitting too many jobs

When the watcher runs on each node group, if it can obtain the group lock, it submits a GROUP_VERIFY_DISKS job for each group. This happens every 5 minutes due to a cron job. To prevent job congestion, this patch stops the watcher from submitting unnecessary verify-disks jobs if some are already pending in the queue.

Signed-off-by: Federico Morg Pareschi <morg@google.com>
Reviewed-by: Brian Foley <bpfoley@google.com>

commit: 76506388218085746241fe5d0b40151ee9a1b87f [log] [tgz]
author: Federico Morg Pareschi <morg@google.com> Thu Jun 23 12:57:01 2016 +0100
committer: Brian Foley <bpfoley@google.com> Thu Jun 23 14:56:37 2016 +0100
tree: 235ba844f43bc2ba0c7b1b8949a3202e3b720852
parent: e1db65e44d3d712706f8ddd303b27867a047caf0 [diff]
diff --git a/lib/watcher/__init__.py b/lib/watcher/__init__.py
index 4e946b3..881ac83 100644
--- a/lib/watcher/__init__.py
+++ b/lib/watcher/__init__.py

@@ -345,10 +345,34 @@
   return compat.any(nodes[node_name].offline for node_name in instance.snodes)
 
 
+def _GetPendingVerifyDisks(cl, uuid):
+  """Returns the ids of unfinished verify-disks jobs for a node group.
+
+  Queries the job queue through C{cl.Query} for all jobs whose status is
+  running, queued or waiting, and keeps those whose summary contains
+  C{GROUP_VERIFY_DISKS(<uuid>)}.
+
+  The result is a list of the matching job ids; it is empty when no such
+  job exists.
+
+  """
+  qfilter = qlang.MakeSimpleFilter("status",
+                                    frozenset([constants.JOB_STATUS_RUNNING,
+                                               constants.JOB_STATUS_QUEUED,
+                                               constants.JOB_STATUS_WAITING]))
+  qresult = cl.Query(constants.QR_JOB, ["id", "summary"], qfilter)
+
+  wanted = "GROUP_VERIFY_DISKS(%s)" % uuid
+
+  # Each row holds one (status, value) pair per requested field. The summary
+  # value is a sequence of strings (presumably one per opcode of the job --
+  # TODO confirm against the job queue documentation). Test for membership
+  # instead of unpacking exactly one element, so that a summary with zero or
+  # several entries, or with no value at all, cannot raise an exception here.
+  return [jobid for ((_, jobid), (_, summary)) in qresult.data
+          if wanted in (summary or ())]
+
+
 def _VerifyDisks(cl, uuid, nodes, instances):
   """Run a per-group "gnt-cluster verify-disks".
 
   """
+
+  existing_jobs = _GetPendingVerifyDisks(cl, uuid)
+  if existing_jobs:
+    logging.info("There are verify disks jobs already pending (%s), skipping "
+                 "VerifyDisks step for %s.",
+                 utils.CommaJoin(existing_jobs), uuid)
+    return
+
   op = opcodes.OpGroupVerifyDisks(
     group_name=uuid, priority=constants.OP_PRIO_LOW)
   op.reason = [(constants.OPCODE_REASON_SRC_WATCHER,
commit	76506388218085746241fe5d0b40151ee9a1b87f	[log] [tgz]
author	Federico Morg Pareschi <morg@google.com>	Thu Jun 23 12:57:01 2016 +0100
committer	Brian Foley <bpfoley@google.com>	Thu Jun 23 14:56:37 2016 +0100
tree	235ba844f43bc2ba0c7b1b8949a3202e3b720852
parent	e1db65e44d3d712706f8ddd303b27867a047caf0 [diff]