Introduce a non-strict flag for group verify disks

The group verify disks Ganeti job is executed every 5 minutes by a
cron job. This type of job tries to take all node locks in a node
group in shared mode and can easily get stuck for a long time if
there are conflicting long-running jobs (such as an instance creation).

With this commit, we introduce a --no-strict flag for the watcher and
the gnt-cluster verify-disks command, allowing the group verify disks
job to only verify those nodes that are available for locking. The
watcher runs a fully strict verification only every 30 minutes.
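
For illustration, the non-strict check can also be run by hand (a
usage sketch based on the flags added below):

  # Best-effort disk verification; nodes with busy locks are skipped
  gnt-cluster verify-disks --no-strict

  # The watcher equivalent, as scheduled by the 5-minute cron entry
  ganeti-watcher --no-strict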

Signed-off-by: Federico Morg Pareschi <morg@google.com>
Reviewed-by: Brian Foley <bpfoley@google.com>
diff --git a/doc/examples/ganeti.cron.in b/doc/examples/ganeti.cron.in
index eedb58b..ad5c79c 100644
--- a/doc/examples/ganeti.cron.in
+++ b/doc/examples/ganeti.cron.in
@@ -3,8 +3,11 @@
 # On reboot, continue a Ganeti upgrade, if one was in progress
 @reboot root [ -x @SBINDIR@/gnt-cluster ] && @SBINDIR@/gnt-cluster upgrade --resume
 
-# Restart failed instances (every 5 minutes)
-*/5 * * * * root [ -x @SBINDIR@/ganeti-watcher ] && @SBINDIR@/ganeti-watcher
+# Restart failed instances and verify disks in non-strict mode (every 5 minutes)
+5-25/5,35-55/5 * * * * root [ -x @SBINDIR@/ganeti-watcher ] && @SBINDIR@/ganeti-watcher --no-strict
+
+# Restart failed instances and verify disks in strict mode (every 30 minutes)
+*/30 * * * * root [ -x @SBINDIR@/ganeti-watcher ] && @SBINDIR@/ganeti-watcher
 
 # Clean job archive (at 01:45 AM)
 45 1 * * * @GNTMASTERUSER@ [ -x @SBINDIR@/ganeti-cleaner ] && @SBINDIR@/ganeti-cleaner master
diff --git a/lib/client/gnt_cluster.py b/lib/client/gnt_cluster.py
index e3056b5..5e5f6ff 100644
--- a/lib/client/gnt_cluster.py
+++ b/lib/client/gnt_cluster.py
@@ -90,6 +90,10 @@
     "--data-collector-interval", default={}, type="keyval",
     help="Set collection intervals in seconds of data collectors.")
 
+STRICT_OPT = cli_option("--no-strict", default=False,
+                        dest="no_strict", action="store_true",
+                        help="Do not run group verify in strict mode")
+
 _EPO_PING_INTERVAL = 30 # 30 seconds between pings
 _EPO_PING_TIMEOUT = 1 # 1 second
 _EPO_REACHABLE_TIMEOUT = 15 * 60 # 15 minutes
@@ -802,7 +806,8 @@
   """
   cl = GetClient()
 
-  op = opcodes.OpClusterVerifyDisks(group_name=opts.nodegroup)
+  op = opcodes.OpClusterVerifyDisks(group_name=opts.nodegroup,
+                                    is_strict=not opts.no_strict)
 
   result = SubmitOpCode(op, cl=cl, opts=opts)
 
@@ -2519,7 +2524,7 @@
      VERIFY_CLUTTER_OPT],
     "", "Does a check on the cluster configuration"),
   "verify-disks": (
-    VerifyDisks, ARGS_NONE, [PRIORITY_OPT, NODEGROUP_OPT],
+    VerifyDisks, ARGS_NONE, [PRIORITY_OPT, NODEGROUP_OPT, STRICT_OPT],
     "", "Does a check on the cluster disk status"),
   "repair-disk-sizes": (
     RepairDiskSizes, ARGS_MANY_INSTANCES, [DRY_RUN_OPT, PRIORITY_OPT],
diff --git a/lib/cmdlib/cluster/verify.py b/lib/cmdlib/cluster/verify.py
index 76809b6..b5cbf1f 100644
--- a/lib/cmdlib/cluster/verify.py
+++ b/lib/cmdlib/cluster/verify.py
@@ -259,8 +259,10 @@
       return ResultWithJobs([])
     else:
       # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
-      return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)]
-                             for group in group_names])
+      return ResultWithJobs(
+          [[opcodes.OpGroupVerifyDisks(group_name=group,
+                                       is_strict=self.op.is_strict)]
+           for group in group_names])
 
 
 class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
diff --git a/lib/cmdlib/group.py b/lib/cmdlib/group.py
index 91f8752..2cf3483 100644
--- a/lib/cmdlib/group.py
+++ b/lib/cmdlib/group.py
@@ -851,6 +851,13 @@
     self.dont_collate_locks[locking.LEVEL_NODEGROUP] = True
     self.dont_collate_locks[locking.LEVEL_NODE] = True
 
+    # If run in strict mode, require locks for all nodes in the node group
+    # so we can verify all the disks. In non-strict mode, just verify the
+    # nodes that are available for locking.
+    if not self.op.is_strict:
+      self.opportunistic_locks[locking.LEVEL_NODE] = True
+      self.opportunistic_locks[locking.LEVEL_INSTANCE] = True
+
   def DeclareLocks(self, level):
     if level == locking.LEVEL_INSTANCE:
       assert not self.needed_locks[locking.LEVEL_INSTANCE]
@@ -893,8 +900,9 @@
 
     assert self.group_uuid in owned_groups
 
-    # Check if locked instances are still correct
-    CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_inst_names)
+    if self.op.is_strict:
+      # Check if locked instances are still correct
+      CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_inst_names)
 
     # Get instance information
     self.instances = dict(self.cfg.GetMultiInstanceInfoByName(owned_inst_names))
@@ -937,6 +945,7 @@
 
   def _VerifyDrbdStates(self, node_errors, offline_disk_instance_names):
     node_to_inst = {}
+    owned_node_uuids = set(self.owned_locks(locking.LEVEL_NODE))
     for inst in self.instances.values():
       disks = self.cfg.GetInstanceDisks(inst.uuid)
       if not (inst.disks_active and
@@ -944,8 +953,10 @@
         continue
 
       secondary_nodes = self.cfg.GetInstanceSecondaryNodes(inst.uuid)
-      for node_uuid in itertools.chain([inst.primary_node],
-                                       secondary_nodes):
+      for node_uuid in itertools.chain([inst.primary_node], secondary_nodes):
+        if node_uuid not in owned_node_uuids:
+          logging.info("Node %s is not locked, skipping check.", node_uuid)
+          continue
         node_to_inst.setdefault(node_uuid, []).append(inst)
 
     for (node_uuid, insts) in node_to_inst.items():
diff --git a/lib/watcher/__init__.py b/lib/watcher/__init__.py
index 881ac83..700c843 100644
--- a/lib/watcher/__init__.py
+++ b/lib/watcher/__init__.py
@@ -361,7 +361,7 @@
   return ids
 
 
-def _VerifyDisks(cl, uuid, nodes, instances):
+def _VerifyDisks(cl, uuid, nodes, instances, is_strict):
   """Run a per-group "gnt-cluster verify-disks".
 
   """
@@ -374,7 +374,7 @@
     return
 
   op = opcodes.OpGroupVerifyDisks(
-    group_name=uuid, priority=constants.OP_PRIO_LOW)
+    group_name=uuid, priority=constants.OP_PRIO_LOW, is_strict=is_strict)
   op.reason = [(constants.OPCODE_REASON_SRC_WATCHER,
                 "Verifying disks of group %s" % uuid,
                 utils.EpochNano())]
@@ -501,6 +501,9 @@
                     help="Don't wait for child processes")
   parser.add_option("--no-verify-disks", dest="no_verify_disks", default=False,
                     action="store_true", help="Do not verify disk status")
+  parser.add_option("--no-strict", dest="no_strict",
+                    default=False, action="store_true",
+                    help="Do not run group verify in strict mode")
   parser.add_option("--rapi-ip", dest="rapi_ip",
                     default=constants.IP4_ADDRESS_LOCALHOST,
                     help="Use this IP to talk to RAPI.")
@@ -911,7 +914,8 @@
   # This check needs to be revisited if ES_ACTION_VERIFY on ExtStorageDevice
   # is implemented.
   if not opts.no_verify_disks and not only_ext:
-    _VerifyDisks(client, group_uuid, nodes, instances)
+    is_strict = not opts.no_strict
+    _VerifyDisks(client, group_uuid, nodes, instances, is_strict=is_strict)
 
   return constants.EXIT_SUCCESS
 
diff --git a/man/ganeti-watcher.rst b/man/ganeti-watcher.rst
index 539ba2e..533428e 100644
--- a/man/ganeti-watcher.rst
+++ b/man/ganeti-watcher.rst
@@ -10,7 +10,7 @@
 --------
 
 **ganeti-watcher** [\--debug] [\--job-age=*age* ] [\--ignore-pause]
-[\--rapi-ip=*IP*] [\--no-verify-disks]
+[\--rapi-ip=*IP*] [\--no-verify-disks] [\--no-strict]
 
 DESCRIPTION
 -----------
@@ -30,6 +30,11 @@
 The ``--debug`` option will increase the verbosity of the watcher
 and also activate logging to the standard error.
 
+The ``--no-strict`` option runs the group verify disks job in
+non-strict mode. In this mode, only the disks of nodes whose locks
+could be acquired in a best-effort attempt are verified; nodes that
+are busy with other jobs are skipped.
+
 The ``--rapi-ip`` option needs to be set if the RAPI daemon was
 started with a particular IP (using the ``-b`` option). The two
 options need to be exactly the same to ensure that the watcher
diff --git a/man/gnt-cluster.rst b/man/gnt-cluster.rst
index 7bed7cf..cdcc474 100644
--- a/man/gnt-cluster.rst
+++ b/man/gnt-cluster.rst
@@ -1069,7 +1069,7 @@
 VERIFY-DISKS
 ~~~~~~~~~~~~
 
-**verify-disks** [\--node-group *nodegroup*]
+**verify-disks** [\--node-group *nodegroup*] [\--no-strict]
 
 The command checks which instances have degraded DRBD disks and
 activates the disks of those instances.
@@ -1077,6 +1077,11 @@
 With ``--node-group``, restrict the verification to those nodes and
 instances that live in the named group.
 
+The ``--no-strict`` option runs the group verify disks job in
+non-strict mode. In this mode, only the disks of nodes whose locks
+could be acquired in a best-effort attempt are verified; nodes that
+are busy with other jobs are skipped.
+
 This command is run from the **ganeti-watcher** tool, which also
 has a different, complementary algorithm for doing this check.
 Together, these two should ensure that DRBD disks are kept
diff --git a/src/Ganeti/OpCodes.hs b/src/Ganeti/OpCodes.hs
index 8b0dc91..4a3660e 100644
--- a/src/Ganeti/OpCodes.hs
+++ b/src/Ganeti/OpCodes.hs
@@ -191,12 +191,14 @@
      [t| JobIdListOnly |],
      OpDoc.opClusterVerifyDisks,
      [ pOptGroupName
+     , pIsStrict
      ],
      [])
   , ("OpGroupVerifyDisks",
      [t| (Map String String, [String], Map String [[String]]) |],
      OpDoc.opGroupVerifyDisks,
      [ pGroupName
+     , pIsStrict
      ],
      "group_name")
   , ("OpClusterRepairDiskSizes",
diff --git a/src/Ganeti/OpParams.hs b/src/Ganeti/OpParams.hs
index 83fd2c4..e507a4a 100644
--- a/src/Ganeti/OpParams.hs
+++ b/src/Ganeti/OpParams.hs
@@ -311,6 +311,7 @@
   , pNodeSetup
   , pVerifyClutter
   , pLongSleep
+  , pIsStrict
   ) where
 
 import Control.Monad (liftM, mplus)
@@ -1970,3 +1971,9 @@
   withDoc "Whether to allow long instance shutdowns during exports" .
   defaultField [| False |] $
   simpleField "long_sleep" [t| Bool |]
+
+pIsStrict :: Field
+pIsStrict =
+  withDoc "Whether the operation is in strict mode or not." .
+  defaultField [| True |] $
+  simpleField "is_strict" [t| Bool |]
diff --git a/test/hs/Test/Ganeti/OpCodes.hs b/test/hs/Test/Ganeti/OpCodes.hs
index 603c394..bc16c9e 100644
--- a/test/hs/Test/Ganeti/OpCodes.hs
+++ b/test/hs/Test/Ganeti/OpCodes.hs
@@ -191,9 +191,9 @@
           arbitrary <*> genListSet Nothing <*> genListSet Nothing <*>
           arbitrary <*> arbitrary
       "OP_CLUSTER_VERIFY_DISKS" ->
-        OpCodes.OpClusterVerifyDisks <$> genMaybe genNameNE
+        OpCodes.OpClusterVerifyDisks <$> genMaybe genNameNE <*> arbitrary
       "OP_GROUP_VERIFY_DISKS" ->
-        OpCodes.OpGroupVerifyDisks <$> genNameNE
+        OpCodes.OpGroupVerifyDisks <$> genNameNE <*> arbitrary
       "OP_CLUSTER_REPAIR_DISK_SIZES" ->
         OpCodes.OpClusterRepairDiskSizes <$> genNodeNamesNE
       "OP_CLUSTER_CONFIG_QUERY" ->