blob: 422063559fc26b64e6d730edaea346096be8b987 [file] [log] [blame]
{-| Implementation of the auto-repair logic for Ganeti.
-}
{-
Copyright (C) 2013, 2015 Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-}
module Ganeti.HTools.Repair
( InstanceData(..)
, parseInitTag
, getArData
, arStateName
, delCurTag
, setInitialState
, arStatusCmp
, updateTag
, detectBroken
) where
import Control.Monad (mplus, foldM)
import Data.Function (on)
import Data.List (sortBy, groupBy, intercalate)
import Data.Maybe (mapMaybe, fromJust)
import Data.Ord (comparing)
import System.Time (ClockTime(TOD))
import Ganeti.BasicTypes (GenericResult(..), Result)
import qualified Ganeti.Constants as C
import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Instance as Instance
import qualified Ganeti.HTools.Node as Node
import qualified Ganeti.HTools.Tags.Constants as Tags
import Ganeti.HTools.Types
import Ganeti.OpCodes (OpCode(..))
import Ganeti.OpParams ( RecreateDisksInfo(RecreateDisksAll)
, ReplaceDisksMode(ReplaceNewSecondary)
)
import Ganeti.Types (makeJobIdS, fromJobId, mkNonEmpty, mkNonNegative)
import Ganeti.Utils (chompPrefix, sepSplit, tryRead, clockTimeToString)
-- | Description of an instance annotated with repair-related information.
data InstanceData = InstanceData { arInstance :: Instance.Instance
, arState :: AutoRepairStatus
, tagsToRemove :: [String]
}
deriving (Eq, Show)
-- | Parse a tag into an 'AutoRepairData' record.
--
-- @Nothing@ is returned if the tag is not an auto-repair tag, or if it's
-- malformed.
parseInitTag :: String -> Maybe AutoRepairData
parseInitTag tag =
let parsePending = do
subtag <- chompPrefix Tags.autoRepairTagPending tag
case sepSplit ':' subtag of
[rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs
_ -> fail ("Invalid tag: " ++ show tag)
parseResult = do
subtag <- chompPrefix Tags.autoRepairTagResult tag
case sepSplit ':' subtag of
[rtype, uuid, ts, result, jobs] -> do
arData <- makeArData rtype uuid ts jobs
result' <- autoRepairResultFromRaw result
return arData { arResult = Just result' }
_ -> fail ("Invalid tag: " ++ show tag)
makeArData rtype uuid ts jobs = do
rtype' <- autoRepairTypeFromRaw rtype
ts' <- tryRead "auto-repair time" ts
jobs' <- mapM makeJobIdS $ sepSplit '+' jobs
return AutoRepairData { arType = rtype'
, arUuid = uuid
, arTime = TOD ts' 0
, arJobs = jobs'
, arResult = Nothing
, arTag = tag
}
in
parsePending `mplus` parseResult
-- | Return the 'AutoRepairData' element of an 'AutoRepairStatus' type.
getArData :: AutoRepairStatus -> Maybe AutoRepairData
getArData status =
case status of
ArHealthy (Just d) -> Just d
ArFailedRepair d -> Just d
ArPendingRepair d -> Just d
ArNeedsRepair d -> Just d
_ -> Nothing
-- | Return a short name for each auto-repair status.
--
-- This is a more concise representation of the status, because the default
-- "Show" formatting includes all the accompanying auto-repair data.
arStateName :: AutoRepairStatus -> String
arStateName status =
case status of
ArHealthy _ -> "Healthy"
ArFailedRepair _ -> "Failure"
ArPendingRepair _ -> "Pending repair"
ArNeedsRepair _ -> "Needs repair"
-- | Return a new list of tags to remove that includes @arTag@ if present.
delCurTag :: InstanceData -> [String]
delCurTag instData =
let arData = getArData $ arState instData
rmTags = tagsToRemove instData
in
case arData of
Just d -> arTag d : rmTags
Nothing -> rmTags
-- | Set the initial auto-repair state of an instance from its auto-repair tags.
--
-- The rules when there are multiple tags is:
--
-- * the earliest failure result always wins
--
-- * two or more pending repairs results in a fatal error
--
-- * a pending result from id X and a success result from id Y result in error
-- if Y is newer than X
--
-- * if there are no pending repairs, the newest success result wins,
-- otherwise the pending result is used.
setInitialState :: Instance.Instance -> Result InstanceData
setInitialState inst =
let arData = mapMaybe parseInitTag $ Instance.allTags inst
-- Group all the AutoRepairData records by id (i.e. by repair task), and
-- present them from oldest to newest.
arData' = sortBy (comparing arUuid) arData
arGroups = groupBy ((==) `on` arUuid) arData'
arGroups' = sortBy (comparing $ minimum . map arTime) arGroups
in
foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups'
-- | Update the initial status of an instance with new repair task tags.
--
-- This function gets called once per repair group in an instance's tag, and it
-- determines whether to set the status of the instance according to this new
-- group, or to keep the existing state. See the documentation for
-- 'setInitialState' for the rules to be followed when determining this.
arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData
arStatusCmp instData arData =
let curSt = arState instData
arData' = sortBy (comparing keyfn) arData
keyfn d = (arResult d, arTime d)
newData = last arData'
newSt = case arResult newData of
Just ArSuccess -> ArHealthy $ Just newData
Just ArEnoperm -> ArHealthy $ Just newData
Just ArFailure -> ArFailedRepair newData
Nothing -> ArPendingRepair newData
in
case curSt of
ArFailedRepair _ -> Ok instData -- Always keep the earliest failure.
ArHealthy _ -> Ok instData { arState = newSt
, tagsToRemove = delCurTag instData
}
ArPendingRepair d -> Bad (
"An unfinished repair was found in instance " ++
Instance.name (arInstance instData) ++ ": found tag " ++
show (arTag newData) ++ ", but older pending tag " ++
show (arTag d) ++ "exists.")
ArNeedsRepair _ -> Bad
"programming error: ArNeedsRepair found as an initial state"
-- | Update the tag of an 'AutoRepairData' record to match all the other fields.
updateTag :: AutoRepairData -> AutoRepairData
updateTag arData =
let ini = [autoRepairTypeToRaw $ arType arData,
arUuid arData,
clockTimeToString $ arTime arData]
end = [intercalate "+" . map (show . fromJobId) $ arJobs arData]
(pfx, middle) =
case arResult arData of
Nothing -> (Tags.autoRepairTagPending, [])
Just rs -> (Tags.autoRepairTagResult, [autoRepairResultToRaw rs])
in
arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) }
-- | Detect brokenness with an instance and suggest repair type and jobs to run.
detectBroken :: Node.List -> Instance.Instance
-> Maybe (AutoRepairType, [OpCode])
detectBroken nl inst =
let disk = Instance.diskTemplate inst
iname = Instance.name inst
offPri = Node.offline $ Container.find (Instance.pNode inst) nl
offSec = Node.offline $ Container.find (Instance.sNode inst) nl
in
case disk of
DTDrbd8
| offPri && offSec ->
Just (
ArReinstall,
[ OpInstanceRecreateDisks { opInstanceName = iname
, opInstanceUuid = Nothing
, opRecreateDisksInfo = RecreateDisksAll
, opNodes = []
-- FIXME: there should be a better way to
-- specify opcode parameters than abusing
-- mkNonEmpty in this way (using the fact
-- that Maybe is used both for optional
-- fields, and to express failure).
, opNodeUuids = Nothing
, opIallocator = mkNonEmpty "hail"
}
, OpInstanceReinstall { opInstanceName = iname
, opInstanceUuid = Nothing
, opOsType = Nothing
, opTempOsParams = Nothing
, opOsparamsPrivate = Nothing
, opOsparamsSecret = Nothing
, opForceVariant = False
}
])
| offPri ->
Just (
ArFailover,
[ OpInstanceFailover { opInstanceName = iname
, opInstanceUuid = Nothing
-- FIXME: ditto, see above.
, opShutdownTimeout = fromJust $ mkNonNegative
C.defaultShutdownTimeout
, opIgnoreConsistency = False
, opTargetNode = Nothing
, opTargetNodeUuid = Nothing
, opIgnoreIpolicy = False
, opIallocator = Nothing
, opMigrationCleanup = False
}
])
| offSec ->
Just (
ArFixStorage,
[ OpInstanceReplaceDisks { opInstanceName = iname
, opInstanceUuid = Nothing
, opReplaceDisksMode = ReplaceNewSecondary
, opReplaceDisksList = []
, opRemoteNode = Nothing
-- FIXME: ditto, see above.
, opRemoteNodeUuid = Nothing
, opIallocator = mkNonEmpty "hail"
, opEarlyRelease = False
, opIgnoreIpolicy = False
}
])
| otherwise -> Nothing
DTPlain
| offPri ->
Just (
ArReinstall,
[ OpInstanceRecreateDisks { opInstanceName = iname
, opInstanceUuid = Nothing
, opRecreateDisksInfo = RecreateDisksAll
, opNodes = []
-- FIXME: ditto, see above.
, opNodeUuids = Nothing
, opIallocator = mkNonEmpty "hail"
}
, OpInstanceReinstall { opInstanceName = iname
, opInstanceUuid = Nothing
, opOsType = Nothing
, opTempOsParams = Nothing
, opOsparamsPrivate = Nothing
, opOsparamsSecret = Nothing
, opForceVariant = False
}
])
| otherwise -> Nothing
_ -> Nothing -- Other cases are unimplemented for now: DTDiskless,
-- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt.