| {-| Implementation of node evacuation |
| |
| -} |
| |
| {- |
| |
| Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc. |
| All rights reserved. |
| |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions are |
| met: |
| |
| 1. Redistributions of source code must retain the above copyright notice, |
| this list of conditions and the following disclaimer. |
| |
| 2. Redistributions in binary form must reproduce the above copyright |
| notice, this list of conditions and the following disclaimer in the |
| documentation and/or other materials provided with the distribution. |
| |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS |
| IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
| TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| -} |
| |
| module Ganeti.HTools.Cluster.Evacuate |
| ( EvacSolution(..) |
| , nodeEvacInstance |
| , tryNodeEvac |
| , emptyEvacSolution |
| , updateEvacSolution |
| , reverseEvacSolution |
| ) where |
| |
| import qualified Data.IntSet as IntSet |
| import Data.List (foldl') |
| import Data.Maybe (fromJust) |
| |
| import Ganeti.BasicTypes |
| import Ganeti.HTools.AlgorithmParams (AlgorithmOptions(..)) |
| import Ganeti.HTools.Cluster.Metrics (compCVNodes) |
| import Ganeti.HTools.Cluster.Moves (applyMoveEx) |
| import Ganeti.HTools.Cluster.Utils ( splitCluster, iMoveToJob |
| , instancePriGroup, availableGroupNodes) |
| import qualified Ganeti.HTools.Container as Container |
| import qualified Ganeti.HTools.Group as Group |
| import qualified Ganeti.HTools.Instance as Instance |
| import qualified Ganeti.HTools.Node as Node |
| import Ganeti.HTools.Types |
| import qualified Ganeti.OpCodes as OpCodes |
| import Ganeti.Types |
| |
| -- | Node evacuation/group change iallocator result type. This result |
| -- type consists of actual opcodes (a restricted subset) that are |
| -- transmitted back to Ganeti. |
| data EvacSolution = EvacSolution |
| { esMoved :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully |
| , esFailed :: [(Idx, String)] -- ^ Instances which were not |
| -- relocated |
| , esOpCodes :: [[OpCodes.OpCode]] -- ^ List of jobs |
| } deriving (Show) |
| |
| -- | The empty evac solution. |
| emptyEvacSolution :: EvacSolution |
| emptyEvacSolution = EvacSolution { esMoved = [] |
| , esFailed = [] |
| , esOpCodes = [] |
| } |
| |
| -- | Reverses an evacuation solution. |
| -- |
| -- Rationale: we always concat the results to the top of the lists, so |
| -- for proper jobset execution, we should reverse all lists. |
| reverseEvacSolution :: EvacSolution -> EvacSolution |
| reverseEvacSolution (EvacSolution f m o) = |
| EvacSolution (reverse f) (reverse m) (reverse o) |
| |
| -- | A simple type for the running solution of evacuations. |
| type EvacInnerState = |
| Either String (Node.List, Instance.Instance, Score, Ndx) |
| |
| -- | Function which fails if the requested mode is change secondary. |
| -- |
| -- This is useful since except DRBD, no other disk template can |
| -- execute change secondary; thus, we can just call this function |
| -- instead of always checking for secondary mode. After the call to |
| -- this function, whatever mode we have is just a primary change. |
| failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m () |
| failOnSecondaryChange ChangeSecondary dt = |
| fail $ "Instances with disk template '" ++ diskTemplateToRaw dt ++ |
| "' can't execute change secondary" |
| failOnSecondaryChange _ _ = return () |
| |
| |
| -- | Inner fold function for changing one node of an instance. |
| -- |
| -- Depending on the instance disk template, this will either change |
| -- the secondary (for DRBD) or the primary node (for shared |
| -- storage). However, the operation is generic otherwise. |
| -- |
| -- The running solution is either a @Left String@, which means we |
| -- don't have yet a working solution, or a @Right (...)@, which |
| -- represents a valid solution; it holds the modified node list, the |
| -- modified instance (after evacuation), the score of that solution, |
| -- and the new secondary node index. |
| evacOneNodeInner :: AlgorithmOptions |
| -> Node.List -- ^ Cluster node list |
| -> Instance.Instance -- ^ Instance being evacuated |
| -> Gdx -- ^ The group index of the instance |
| -> (Ndx -> IMove) -- ^ Operation constructor |
| -> EvacInnerState -- ^ Current best solution |
| -> Ndx -- ^ Node we're evaluating as target |
| -> EvacInnerState -- ^ New best solution |
| evacOneNodeInner opts nl inst gdx op_fn accu ndx = |
| case applyMoveEx (algIgnoreSoftErrors opts) nl inst (op_fn ndx) of |
| Bad fm -> let fail_msg = " Node " ++ Container.nameOf nl ndx ++ |
| " failed: " ++ show fm ++ ";" |
| in either (Left . (++ fail_msg)) Right accu |
| Ok (nl', inst', _, _) -> |
| let nodes = Container.elems nl' |
| -- The fromJust below is ugly (it can fail nastily), but |
| -- at this point we should have any internal mismatches, |
| -- and adding a monad here would be quite involved |
| grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes) |
| new_cv = compCVNodes grpnodes |
| new_accu = Right (nl', inst', new_cv, ndx) |
| in case accu of |
| Left _ -> new_accu |
| Right (_, _, old_cv, _) -> |
| if old_cv < new_cv |
| then accu |
| else new_accu |
| |
| -- | Generic function for changing one node of an instance. |
| -- |
| -- This is similar to 'nodeEvacInstance' but will be used in a few of |
| -- its sub-patterns. It folds the inner function 'evacOneNodeInner' |
| -- over the list of available nodes, which results in the best choice |
| -- for relocation. |
| evacOneNodeOnly :: AlgorithmOptions |
| -> Node.List -- ^ The node list (cluster-wide) |
| -> Instance.List -- ^ Instance list (cluster-wide) |
| -> Instance.Instance -- ^ The instance to be evacuated |
| -> Gdx -- ^ The group we're targetting |
| -> [Ndx] -- ^ The list of available nodes |
| -- for allocation |
| -> Result (Node.List, Instance.List, [OpCodes.OpCode]) |
| evacOneNodeOnly opts nl il inst gdx avail_nodes = do |
| op_fn <- case Instance.mirrorType inst of |
| MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances" |
| MirrorInternal -> Ok ReplaceSecondary |
| MirrorExternal -> Ok FailoverToAny |
| (nl', inst', _, ndx) <- annotateResult "Can't find any good node" . |
| eitherToResult $ |
| foldl' (evacOneNodeInner opts nl inst gdx op_fn) |
| (Left "") avail_nodes |
| let idx = Instance.idx inst |
| il' = Container.add idx inst' il |
| ops = iMoveToJob nl' il' idx (op_fn ndx) |
| return (nl', il', ops) |
| |
| -- | Compute result of changing all nodes of a DRBD instance. |
| -- |
| -- Given the target primary and secondary node (which might be in a |
| -- different group or not), this function will 'execute' all the |
| -- required steps and assuming all operations succceed, will return |
| -- the modified node and instance lists, the opcodes needed for this |
| -- and the new group score. |
| evacDrbdAllInner :: AlgorithmOptions |
| -> Node.List -- ^ Cluster node list |
| -> Instance.List -- ^ Cluster instance list |
| -> Instance.Instance -- ^ The instance to be moved |
| -> Gdx -- ^ The target group index |
| -- (which can differ from the |
| -- current group of the |
| -- instance) |
| -> (Ndx, Ndx) -- ^ Tuple of new |
| -- primary\/secondary nodes |
| -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score) |
| evacDrbdAllInner opts nl il inst gdx (t_pdx, t_sdx) = do |
| let primary = Container.find (Instance.pNode inst) nl |
| idx = Instance.idx inst |
| apMove = applyMoveEx $ algIgnoreSoftErrors opts |
| -- if the primary is offline, then we first failover |
| (nl1, inst1, ops1) <- |
| if Node.offline primary |
| then do |
| (nl', inst', _, _) <- |
| annotateResult "Failing over to the secondary" . |
| opToResult $ apMove nl inst Failover |
| return (nl', inst', [Failover]) |
| else return (nl, inst, []) |
| let (o1, o2, o3) = (ReplaceSecondary t_pdx, |
| Failover, |
| ReplaceSecondary t_sdx) |
| -- we now need to execute a replace secondary to the future |
| -- primary node |
| (nl2, inst2, _, _) <- |
| annotateResult "Changing secondary to new primary" . |
| opToResult $ |
| apMove nl1 inst1 o1 |
| let ops2 = o1:ops1 |
| -- we now execute another failover, the primary stays fixed now |
| (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" . |
| opToResult $ apMove nl2 inst2 o2 |
| let ops3 = o2:ops2 |
| -- and finally another replace secondary, to the final secondary |
| (nl4, inst4, _, _) <- |
| annotateResult "Changing secondary to final secondary" . |
| opToResult $ |
| apMove nl3 inst3 o3 |
| let ops4 = o3:ops3 |
| il' = Container.add idx inst4 il |
| ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4 |
| let nodes = Container.elems nl4 |
| -- The fromJust below is ugly (it can fail nastily), but |
| -- at this point we should have any internal mismatches, |
| -- and adding a monad here would be quite involved |
| grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes) |
| new_cv = compCVNodes grpnodes |
| return (nl4, il', ops, new_cv) |
| |
| -- | Run evacuation for a single instance. |
| -- |
| -- /Note:/ this function should correctly execute both intra-group |
| -- evacuations (in all modes) and inter-group evacuations (in the |
| -- 'ChangeAll' mode). Of course, this requires that the correct list |
| -- of target nodes is passed. |
| nodeEvacInstance :: AlgorithmOptions |
| -> Node.List -- ^ The node list (cluster-wide) |
| -> Instance.List -- ^ Instance list (cluster-wide) |
| -> EvacMode -- ^ The evacuation mode |
| -> Instance.Instance -- ^ The instance to be evacuated |
| -> Gdx -- ^ The group we're targetting |
| -> [Ndx] -- ^ The list of available nodes |
| -- for allocation |
| -> Result (Node.List, Instance.List, [OpCodes.OpCode]) |
| nodeEvacInstance opts nl il mode inst@(Instance.Instance |
| {Instance.diskTemplate = dt@DTDiskless}) |
| gdx avail_nodes = |
| failOnSecondaryChange mode dt >> |
| evacOneNodeOnly opts nl il inst gdx avail_nodes |
| |
| nodeEvacInstance _ _ _ _ (Instance.Instance |
| {Instance.diskTemplate = DTPlain}) _ _ = |
| fail "Instances of type plain cannot be relocated" |
| |
| nodeEvacInstance _ _ _ _ (Instance.Instance |
| {Instance.diskTemplate = DTFile}) _ _ = |
| fail "Instances of type file cannot be relocated" |
| |
| nodeEvacInstance opts nl il mode inst@(Instance.Instance |
| {Instance.diskTemplate = dt@DTSharedFile}) |
| gdx avail_nodes = |
| failOnSecondaryChange mode dt >> |
| evacOneNodeOnly opts nl il inst gdx avail_nodes |
| |
| nodeEvacInstance opts nl il mode inst@(Instance.Instance |
| {Instance.diskTemplate = dt@DTBlock}) |
| gdx avail_nodes = |
| failOnSecondaryChange mode dt >> |
| evacOneNodeOnly opts nl il inst gdx avail_nodes |
| |
| nodeEvacInstance opts nl il mode inst@(Instance.Instance |
| {Instance.diskTemplate = dt@DTRbd}) |
| gdx avail_nodes = |
| failOnSecondaryChange mode dt >> |
| evacOneNodeOnly opts nl il inst gdx avail_nodes |
| |
| nodeEvacInstance opts nl il mode inst@(Instance.Instance |
| {Instance.diskTemplate = dt@DTExt}) |
| gdx avail_nodes = |
| failOnSecondaryChange mode dt >> |
| evacOneNodeOnly opts nl il inst gdx avail_nodes |
| |
| nodeEvacInstance opts nl il mode inst@(Instance.Instance |
| {Instance.diskTemplate = dt@DTGluster}) |
| gdx avail_nodes = |
| failOnSecondaryChange mode dt >> |
| evacOneNodeOnly opts nl il inst gdx avail_nodes |
| |
| nodeEvacInstance opts nl il ChangePrimary |
| inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) |
| _ _ = |
| do |
| (nl', inst', _, _) <- opToResult |
| $ applyMoveEx (algIgnoreSoftErrors opts) nl inst |
| Failover |
| let idx = Instance.idx inst |
| il' = Container.add idx inst' il |
| ops = iMoveToJob nl' il' idx Failover |
| return (nl', il', ops) |
| |
| nodeEvacInstance opts nl il ChangeSecondary |
| inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) |
| gdx avail_nodes = |
| evacOneNodeOnly opts nl il inst gdx avail_nodes |
| |
| -- The algorithm for ChangeAll is as follows: |
| -- |
| -- * generate all (primary, secondary) node pairs for the target groups |
| -- * for each pair, execute the needed moves (r:s, f, r:s) and compute |
| -- the final node list state and group score |
| -- * select the best choice via a foldl that uses the same Either |
| -- String solution as the ChangeSecondary mode |
| nodeEvacInstance opts nl il ChangeAll |
| inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8}) |
| gdx avail_nodes = |
| do |
| let no_nodes = Left "no nodes available" |
| node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s] |
| (nl', il', ops, _) <- |
| annotateResult "Can't find any good nodes for relocation" . |
| eitherToResult $ |
| foldl' |
| (\accu nodes -> case evacDrbdAllInner opts nl il inst gdx nodes of |
| Bad msg -> |
| case accu of |
| Right _ -> accu |
| -- we don't need more details (which |
| -- nodes, etc.) as we only selected |
| -- this group if we can allocate on |
| -- it, hence failures will not |
| -- propagate out of this fold loop |
| Left _ -> Left $ "Allocation failed: " ++ msg |
| Ok result@(_, _, _, new_cv) -> |
| let new_accu = Right result in |
| case accu of |
| Left _ -> new_accu |
| Right (_, _, _, old_cv) -> |
| if old_cv < new_cv |
| then accu |
| else new_accu |
| ) no_nodes node_pairs |
| |
| return (nl', il', ops) |
| |
| -- | Updates the evac solution with the results of an instance |
| -- evacuation. |
| updateEvacSolution :: (Node.List, Instance.List, EvacSolution) |
| -> Idx |
| -> Result (Node.List, Instance.List, [OpCodes.OpCode]) |
| -> (Node.List, Instance.List, EvacSolution) |
| updateEvacSolution (nl, il, es) idx (Bad msg) = |
| (nl, il, es { esFailed = (idx, msg):esFailed es}) |
| updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) = |
| (nl, il, es { esMoved = new_elem:esMoved es |
| , esOpCodes = opcodes:esOpCodes es }) |
| where inst = Container.find idx il |
| new_elem = (idx, |
| instancePriGroup nl inst, |
| Instance.allNodes inst) |
| |
| -- | Compute the list of nodes that are to be evacuated, given a list |
| -- of instances and an evacuation mode. |
| nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list |
| -> EvacMode -- ^ The evacuation mode we're using |
| -> [Idx] -- ^ List of instance indices being evacuated |
| -> IntSet.IntSet -- ^ Set of node indices |
| nodesToEvacuate il mode = |
| IntSet.delete Node.noSecondary . |
| foldl' (\ns idx -> |
| let i = Container.find idx il |
| pdx = Instance.pNode i |
| sdx = Instance.sNode i |
| dt = Instance.diskTemplate i |
| withSecondary = case dt of |
| DTDrbd8 -> IntSet.insert sdx ns |
| _ -> ns |
| in case mode of |
| ChangePrimary -> IntSet.insert pdx ns |
| ChangeSecondary -> withSecondary |
| ChangeAll -> IntSet.insert pdx withSecondary |
| ) IntSet.empty |
| |
| -- | Node-evacuation IAllocator mode main function. |
| tryNodeEvac :: AlgorithmOptions |
| -> Group.List -- ^ The cluster groups |
| -> Node.List -- ^ The node list (cluster-wide, not per group) |
| -> Instance.List -- ^ Instance list (cluster-wide) |
| -> EvacMode -- ^ The evacuation mode |
| -> [Idx] -- ^ List of instance (indices) to be evacuated |
| -> Result (Node.List, Instance.List, EvacSolution) |
| tryNodeEvac opts _ ini_nl ini_il mode idxs = |
| let evac_ndx = nodesToEvacuate ini_il mode idxs |
| offline = map Node.idx . filter Node.offline $ Container.elems ini_nl |
| excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline |
| group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx |
| (Container.elems nl))) $ |
| splitCluster ini_nl ini_il |
| (fin_nl, fin_il, esol) = |
| foldl' (\state@(nl, il, _) inst -> |
| let gdx = instancePriGroup nl inst |
| pdx = Instance.pNode inst in |
| updateEvacSolution state (Instance.idx inst) $ |
| availableGroupNodes group_ndx |
| (IntSet.insert pdx excl_ndx) gdx >>= |
| nodeEvacInstance opts nl il mode inst gdx |
| ) |
| (ini_nl, ini_il, emptyEvacSolution) |
| (map (`Container.find` ini_il) idxs) |
| in return (fin_nl, fin_il, reverseEvacSolution esol) |