Improve missing job error handling in maintd

If luxi returned an error when getting statuses for any maintenance job
that was submitted while handling an incident and has not yet completed,
maintd would immediately fail, and could not recover from this condition
on subsequent rounds of maintenance.

This could happen if, for example, ganeti was manually shut down while
the job was running, and the job file was archived after ganeti
restarted.

Fix by treating missing/unparseable jobs as failed.

Signed-off-by: Brian Foley <bpfoley@google.com>
Reviewed-by: Viktor Bachraty <vbachraty@google.com>
diff --git a/src/Ganeti/BasicTypes.hs b/src/Ganeti/BasicTypes.hs
index c686283..caec414 100644
--- a/src/Ganeti/BasicTypes.hs
+++ b/src/Ganeti/BasicTypes.hs
@@ -43,6 +43,7 @@
   , ResultT(..)
   , mkResultT
   , mkResultT'
+  , mkResultTEither
   , withError
   , withErrorT
   , toError
@@ -320,6 +321,11 @@
            => m (GenericResult s a) -> ResultT e m a
 mkResultT' = mkResultT . liftM (genericResult (Bad . show) Ok)
 
+-- | Generalisation of mkResultT accepting any showable failures.
+mkResultTEither :: (Monad m, FromString e, Show s)
+           => m (Either s a) -> ResultT e m a
+mkResultTEither = mkResultT . liftM (either (Bad . show) Ok)
+
 -- | Simple checker for whether a 'GenericResult' is OK.
 isOk :: GenericResult a b -> Bool
 isOk (Ok _) = True
diff --git a/src/Ganeti/Jobs.hs b/src/Ganeti/Jobs.hs
index 21b8815..e31d74e 100644
--- a/src/Ganeti/Jobs.hs
+++ b/src/Ganeti/Jobs.hs
@@ -152,7 +152,7 @@
       callback jids'
       waitForJobs jids' client
 
--- | Wait for one job units it is finished, using the WaitForJobChange
+-- | Wait for one job until it is finished, using the WaitForJobChange
 -- luxi command. Return the JobId and the and the final job status.
 waitForJob :: L.Client -> L.JobId -> ResultT String IO (L.JobId, JobStatus)
 waitForJob c jid = waitForJob' J.JSNull 0 where
diff --git a/src/Ganeti/MaintD/Server.hs b/src/Ganeti/MaintD/Server.hs
index 670d2bb..b88b23e 100644
--- a/src/Ganeti/MaintD/Server.hs
+++ b/src/Ganeti/MaintD/Server.hs
@@ -56,7 +56,7 @@
 import qualified Text.JSON as J
 
 import Ganeti.BasicTypes ( GenericResult(..), ResultT, runResultT, mkResultT
-                         , withErrorT, isBad)
+                         , mkResultTEither, withErrorT, isBad, isOk)
 import qualified Ganeti.Constants as C
 import Ganeti.Daemon ( OptType, CheckFn, PrepFn, MainFn, oDebug
                      , oNoVoting, oYesDoIt, oPort, oBindAddress, oNoDaemonize)
@@ -76,7 +76,7 @@
 import qualified Ganeti.Path as Path
 import Ganeti.Runtime (GanetiDaemon(GanetiMaintd))
 import Ganeti.Types (JobId(..), JobStatus(..))
-import Ganeti.Utils (threadDelaySeconds)
+import Ganeti.Utils (threadDelaySeconds, partitionM)
 import Ganeti.Utils.Http (httpConfFromOpts, plainJSON, error404)
 import Ganeti.WConfd.Client ( runNewWConfdClient, maintenanceRoundDelay
                             , maintenanceBalancing)
@@ -128,10 +128,27 @@
   logDebug $ "Jobs submitted in the last round: "
              ++ show (map fromJobId oldjobs)
   luxiSocket <- liftIO Path.defaultQuerySocket
-  jobresults <- bracket (mkResultT . liftM (either (Bad . show) Ok)
-                         . tryIOError $ L.getLuxiClient luxiSocket)
-                  (liftIO . L.closeClient)
-                  $ mkResultT . waitForJobs oldjobs
+
+  -- Filter out any jobs in the maintenance list which can't be parsed by luxi
+  -- anymore. This can happen if the job file is corrupted, missing or archived.
+  -- We have to query one job at a time, as luxi returns a single error if any
+  -- job in the query list can't be read/parsed.
+  (okjobs, badjobs) <- bracket
+       (mkResultTEither . tryIOError $ L.getLuxiClient luxiSocket)
+       (liftIO . L.closeClient)
+       $  mkResultT . liftM Ok
+       . (\c -> partitionM (\j -> liftM isOk $ L.queryJobsStatus c [j]) oldjobs)
+
+  unless (null badjobs) $ do
+    logInfo . (++) "Unparsable jobs (marking as failed): "
+        . show $ map fromJobId badjobs
+    mapM_ (failIncident memstate) badjobs
+
+  jobresults <- bracket
+      (mkResultTEither . tryIOError $ L.getLuxiClient luxiSocket)
+      (liftIO . L.closeClient)
+      $ mkResultT . (\c -> waitForJobs okjobs c)
+
   let failedjobs = map fst $ filter ((/=) JOB_STATUS_SUCCESS . snd) jobresults
   unless (null failedjobs) $ do
     logInfo . (++) "Failed jobs: " . show $ map fromJobId failedjobs
diff --git a/src/Ganeti/Utils.hs b/src/Ganeti/Utils.hs
index fbe3a36..42a8db9 100644
--- a/src/Ganeti/Utils.hs
+++ b/src/Ganeti/Utils.hs
@@ -101,6 +101,7 @@
   , threadDelaySeconds
   , monotoneFind
   , iterateJust
+  , partitionM
   ) where
 
 import Prelude ()
@@ -871,7 +872,15 @@
              else monotoneFind heuristics p xs'
     _ -> Nothing
 
--- | Iterate a funtion as long as it returns Just values, collecting
+-- | Iterate a function as long as it returns Just values, collecting
 -- all the Justs that where obtained.
 iterateJust :: (a -> Maybe a) -> a -> [a]
 iterateJust f a = a : maybe [] (iterateJust f) (f a)
+
+-- | A version of partition with a monadic predicate
+-- Implementation taken from David Fox's Extras package.
+partitionM :: (Monad m) => (a -> m Bool) -> [a] -> m ([a], [a])
+partitionM p xs = foldM f ([], []) xs
+  where f (a, b) x = do
+        pv <- p x
+        return $ if pv then (x : a, b) else (a, x : b)