Introduce a non-strict flag for group verify disk
The group verify-disks Ganeti job is executed every 5 minutes by a cron
job. This type of job tries to take all node locks in a node group in
shared mode and can easily get stuck for a long time if there are
conflicting long-running jobs (such as an instance creation).

With this commit, we introduce a --no-strict flag for the watcher and
the gnt-cluster verify-disks command, allowing the group verify-disks
job to verify only those nodes that are available for locking. The
watcher now runs a fully strict verification only every 30 minutes.
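For illustration only (not part of the patch itself), a minimal Python
sketch of what the new parameter means at the opcode layer, mirroring
the updated VerifyDisks handler in lib/client/gnt_cluster.py below:

    # Illustrative sketch: submit a cluster-wide verify-disks job in
    # non-strict mode, as gnt-cluster verify-disks --no-strict will do
    # once this change is applied.
    from ganeti import opcodes
    from ganeti.cli import GetClient, SubmitOpCode

    cl = GetClient()
    # group_name=None means "all node groups"; is_strict defaults to
    # True (see pIsStrict in OpParams.hs), so passing False is what the
    # new --no-strict flag does.
    op = opcodes.OpClusterVerifyDisks(group_name=None, is_strict=False)
    result = SubmitOpCode(op, cl=cl)

In non-strict mode the per-group jobs spawned by this opcode take node
and instance locks opportunistically, so busy nodes are simply skipped
instead of blocking the whole job.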
Signed-off-by: Federico Morg Pareschi <morg@google.com>
Reviewed-by: Brian Foley <bpfoley@google.com>
diff --git a/doc/examples/ganeti.cron.in b/doc/examples/ganeti.cron.in
index eedb58b..ad5c79c 100644
--- a/doc/examples/ganeti.cron.in
+++ b/doc/examples/ganeti.cron.in
@@ -3,8 +3,11 @@
# On reboot, continue a Ganeti upgrade, if one was in progress
@reboot root [ -x @SBINDIR@/gnt-cluster ] && @SBINDIR@/gnt-cluster upgrade --resume
-# Restart failed instances (every 5 minutes)
-*/5 * * * * root [ -x @SBINDIR@/ganeti-watcher ] && @SBINDIR@/ganeti-watcher
+# Restart failed instances (in non-strict mode every 5 minutes)
+5-25/5,35-55/5 * * * * root [ -x @SBINDIR@/ganeti-watcher ] && @SBINDIR@/ganeti-watcher --no-strict
+
+# Restart failed instances (in strict mode every 30 minutes)
+*/30 * * * * root [ -x @SBINDIR@/ganeti-watcher ] && @SBINDIR@/ganeti-watcher
# Clean job archive (at 01:45 AM)
45 1 * * * @GNTMASTERUSER@ [ -x @SBINDIR@/ganeti-cleaner ] && @SBINDIR@/ganeti-cleaner master
diff --git a/lib/client/gnt_cluster.py b/lib/client/gnt_cluster.py
index e3056b5..5e5f6ff 100644
--- a/lib/client/gnt_cluster.py
+++ b/lib/client/gnt_cluster.py
@@ -90,6 +90,10 @@
"--data-collector-interval", default={}, type="keyval",
help="Set collection intervals in seconds of data collectors.")
+STRICT_OPT = cli_option("--no-strict", default=False,
+ dest="no_strict", action="store_true",
+ help="Do not run group verify in strict mode")
+
_EPO_PING_INTERVAL = 30 # 30 seconds between pings
_EPO_PING_TIMEOUT = 1 # 1 second
_EPO_REACHABLE_TIMEOUT = 15 * 60 # 15 minutes
@@ -802,7 +806,8 @@
"""
cl = GetClient()
- op = opcodes.OpClusterVerifyDisks(group_name=opts.nodegroup)
+ op = opcodes.OpClusterVerifyDisks(group_name=opts.nodegroup,
+ is_strict=not opts.no_strict)
result = SubmitOpCode(op, cl=cl, opts=opts)
@@ -2519,7 +2524,7 @@
VERIFY_CLUTTER_OPT],
"", "Does a check on the cluster configuration"),
"verify-disks": (
- VerifyDisks, ARGS_NONE, [PRIORITY_OPT, NODEGROUP_OPT],
+ VerifyDisks, ARGS_NONE, [PRIORITY_OPT, NODEGROUP_OPT, STRICT_OPT],
"", "Does a check on the cluster disk status"),
"repair-disk-sizes": (
RepairDiskSizes, ARGS_MANY_INSTANCES, [DRY_RUN_OPT, PRIORITY_OPT],
diff --git a/lib/cmdlib/cluster/verify.py b/lib/cmdlib/cluster/verify.py
index 76809b6..b5cbf1f 100644
--- a/lib/cmdlib/cluster/verify.py
+++ b/lib/cmdlib/cluster/verify.py
@@ -259,8 +259,10 @@
return ResultWithJobs([])
else:
# Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
- return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)]
- for group in group_names])
+ return ResultWithJobs(
+ [[opcodes.OpGroupVerifyDisks(group_name=group,
+ is_strict=self.op.is_strict)]
+ for group in group_names])
class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
diff --git a/lib/cmdlib/group.py b/lib/cmdlib/group.py
index 91f8752..2cf3483 100644
--- a/lib/cmdlib/group.py
+++ b/lib/cmdlib/group.py
@@ -851,6 +851,13 @@
self.dont_collate_locks[locking.LEVEL_NODEGROUP] = True
self.dont_collate_locks[locking.LEVEL_NODE] = True
+ # If run in strict mode, require locks for all nodes in the node group
+ # so we can verify all the disks. In non-strict mode, just verify the
+ # nodes that are available for locking.
+ if not self.op.is_strict:
+ self.opportunistic_locks[locking.LEVEL_NODE] = True
+ self.opportunistic_locks[locking.LEVEL_INSTANCE] = True
+
def DeclareLocks(self, level):
if level == locking.LEVEL_INSTANCE:
assert not self.needed_locks[locking.LEVEL_INSTANCE]
@@ -893,8 +900,9 @@
assert self.group_uuid in owned_groups
- # Check if locked instances are still correct
- CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_inst_names)
+ if self.op.is_strict:
+ # Check if locked instances are still correct
+ CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_inst_names)
# Get instance information
self.instances = dict(self.cfg.GetMultiInstanceInfoByName(owned_inst_names))
@@ -937,6 +945,7 @@
def _VerifyDrbdStates(self, node_errors, offline_disk_instance_names):
node_to_inst = {}
+ owned_node_uuids = set(self.owned_locks(locking.LEVEL_NODE))
for inst in self.instances.values():
disks = self.cfg.GetInstanceDisks(inst.uuid)
if not (inst.disks_active and
@@ -944,8 +953,10 @@
continue
secondary_nodes = self.cfg.GetInstanceSecondaryNodes(inst.uuid)
- for node_uuid in itertools.chain([inst.primary_node],
- secondary_nodes):
+ for node_uuid in itertools.chain([inst.primary_node], secondary_nodes):
+ if node_uuid not in owned_node_uuids:
+ logging.info("Node %s is not locked, skipping check.", node_uuid)
+ continue
node_to_inst.setdefault(node_uuid, []).append(inst)
for (node_uuid, insts) in node_to_inst.items():
diff --git a/lib/watcher/__init__.py b/lib/watcher/__init__.py
index 881ac83..700c843 100644
--- a/lib/watcher/__init__.py
+++ b/lib/watcher/__init__.py
@@ -361,7 +361,7 @@
return ids
-def _VerifyDisks(cl, uuid, nodes, instances):
+def _VerifyDisks(cl, uuid, nodes, instances, is_strict):
"""Run a per-group "gnt-cluster verify-disks".
"""
@@ -374,7 +374,7 @@
return
op = opcodes.OpGroupVerifyDisks(
- group_name=uuid, priority=constants.OP_PRIO_LOW)
+ group_name=uuid, priority=constants.OP_PRIO_LOW, is_strict=is_strict)
op.reason = [(constants.OPCODE_REASON_SRC_WATCHER,
"Verifying disks of group %s" % uuid,
utils.EpochNano())]
@@ -501,6 +501,9 @@
help="Don't wait for child processes")
parser.add_option("--no-verify-disks", dest="no_verify_disks", default=False,
action="store_true", help="Do not verify disk status")
+ parser.add_option("--no-strict", dest="no_strict",
+ default=False, action="store_true",
+ help="Do not run group verify in strict mode")
parser.add_option("--rapi-ip", dest="rapi_ip",
default=constants.IP4_ADDRESS_LOCALHOST,
help="Use this IP to talk to RAPI.")
@@ -911,7 +914,8 @@
# This check needs to be revisited if ES_ACTION_VERIFY on ExtStorageDevice
# is implemented.
if not opts.no_verify_disks and not only_ext:
- _VerifyDisks(client, group_uuid, nodes, instances)
+ is_strict = not opts.no_strict
+ _VerifyDisks(client, group_uuid, nodes, instances, is_strict=is_strict)
return constants.EXIT_SUCCESS
diff --git a/man/ganeti-watcher.rst b/man/ganeti-watcher.rst
index 539ba2e..533428e 100644
--- a/man/ganeti-watcher.rst
+++ b/man/ganeti-watcher.rst
@@ -10,7 +10,7 @@
--------
**ganeti-watcher** [\--debug] [\--job-age=*age* ] [\--ignore-pause]
-[\--rapi-ip=*IP*] [\--no-verify-disks]
+[\--rapi-ip=*IP*] [\--no-verify-disks] [\--no-strict]
DESCRIPTION
-----------
@@ -30,6 +30,11 @@
The ``--debug`` option will increase the verbosity of the watcher
and also activate logging to the standard error.
+The ``--no-strict`` option runs the group verify-disks job in
+non-strict mode. In this mode, only those disks whose node locks could
+be acquired in a best-effort attempt are verified, and nodes that are
+busy with other jobs are skipped.
+
The ``--rapi-ip`` option needs to be set if the RAPI daemon was
started with a particular IP (using the ``-b`` option). The two
options need to be exactly the same to ensure that the watcher
diff --git a/man/gnt-cluster.rst b/man/gnt-cluster.rst
index 7bed7cf..cdcc474 100644
--- a/man/gnt-cluster.rst
+++ b/man/gnt-cluster.rst
@@ -1069,7 +1069,7 @@
VERIFY-DISKS
~~~~~~~~~~~~
-**verify-disks** [\--node-group *nodegroup*]
+**verify-disks** [\--node-group *nodegroup*] [\--no-strict]
The command checks which instances have degraded DRBD disks and
activates the disks of those instances.
@@ -1077,6 +1077,11 @@
With ``--node-group``, restrict the verification to those nodes and
instances that live in the named group.
+The ``--no-strict`` option runs the group verify-disks job in
+non-strict mode. In this mode, only those disks whose node locks could
+be acquired in a best-effort attempt are verified, and nodes that are
+busy with other jobs are skipped.
+
This command is run from the **ganeti-watcher** tool, which also
has a different, complementary algorithm for doing this check.
Together, these two should ensure that DRBD disks are kept
diff --git a/src/Ganeti/OpCodes.hs b/src/Ganeti/OpCodes.hs
index 8b0dc91..4a3660e 100644
--- a/src/Ganeti/OpCodes.hs
+++ b/src/Ganeti/OpCodes.hs
@@ -191,12 +191,14 @@
[t| JobIdListOnly |],
OpDoc.opClusterVerifyDisks,
[ pOptGroupName
+ , pIsStrict
],
[])
, ("OpGroupVerifyDisks",
[t| (Map String String, [String], Map String [[String]]) |],
OpDoc.opGroupVerifyDisks,
[ pGroupName
+ , pIsStrict
],
"group_name")
, ("OpClusterRepairDiskSizes",
diff --git a/src/Ganeti/OpParams.hs b/src/Ganeti/OpParams.hs
index 83fd2c4..e507a4a 100644
--- a/src/Ganeti/OpParams.hs
+++ b/src/Ganeti/OpParams.hs
@@ -311,6 +311,7 @@
, pNodeSetup
, pVerifyClutter
, pLongSleep
+ , pIsStrict
) where
import Control.Monad (liftM, mplus)
@@ -1970,3 +1971,9 @@
withDoc "Whether to allow long instance shutdowns during exports" .
defaultField [| False |] $
simpleField "long_sleep" [t| Bool |]
+
+pIsStrict :: Field
+pIsStrict =
+ withDoc "Whether the operation is in strict mode or not." .
+ defaultField [| True |] $
+ simpleField "is_strict" [t| Bool |]
diff --git a/test/hs/Test/Ganeti/OpCodes.hs b/test/hs/Test/Ganeti/OpCodes.hs
index 603c394..bc16c9e 100644
--- a/test/hs/Test/Ganeti/OpCodes.hs
+++ b/test/hs/Test/Ganeti/OpCodes.hs
@@ -191,9 +191,9 @@
arbitrary <*> genListSet Nothing <*> genListSet Nothing <*>
arbitrary <*> arbitrary
"OP_CLUSTER_VERIFY_DISKS" ->
- OpCodes.OpClusterVerifyDisks <$> genMaybe genNameNE
+ OpCodes.OpClusterVerifyDisks <$> genMaybe genNameNE <*> arbitrary
"OP_GROUP_VERIFY_DISKS" ->
- OpCodes.OpGroupVerifyDisks <$> genNameNE
+ OpCodes.OpGroupVerifyDisks <$> genNameNE <*> arbitrary
"OP_CLUSTER_REPAIR_DISK_SIZES" ->
OpCodes.OpClusterRepairDiskSizes <$> genNodeNamesNE
"OP_CLUSTER_CONFIG_QUERY" ->