diff --git a/.gitignore b/.gitignore
index e653ffc..0d19ea9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -167,6 +167,7 @@
 /src/ganeti-kvmd
 /src/ganeti-luxid
 /src/ganeti-metad
+/src/ganeti-maintd
 /src/ganeti-mond
 /src/rpc-test
 
diff --git a/Makefile.am b/Makefile.am
index 75fb5d8..cb81b6a 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -151,6 +151,7 @@
 	src/Ganeti/JQueue \
 	src/Ganeti/Locking \
 	src/Ganeti/Logging \
+	src/Ganeti/MaintD \
 	src/Ganeti/Monitoring \
 	src/Ganeti/Metad \
 	src/Ganeti/Objects \
@@ -302,6 +303,8 @@
 	$(addsuffix /*.py[co],$(DIRS)) \
 	$(addsuffix /*.hi,$(HS_DIRS)) \
 	$(addsuffix /*.o,$(HS_DIRS)) \
+	$(addsuffix /*.dyn_hi,$(HS_DIRS)) \
+	$(addsuffix /*.dyn_o,$(HS_DIRS)) \
 	$(addsuffix /*.$(HTEST_SUFFIX)_hi,$(HS_DIRS)) \
 	$(addsuffix /*.$(HTEST_SUFFIX)_o,$(HS_DIRS)) \
 	$(HASKELL_PACKAGE_VERSIONS_FILE) \
@@ -352,6 +355,7 @@
 	src/ganeti-confd \
 	src/ganeti-wconfd \
 	src/ganeti-luxid \
+	src/ganeti-maintd \
 	src/ganeti-metad \
 	src/ganeti-mond \
 	.hpc/*.mix src/*.tix test/hs/*.tix *.tix \
@@ -371,7 +375,7 @@
 HS_GENERATED_FILES = $(HS_PROGS) src/hluxid src/ganeti-luxid \
 	src/hconfd src/ganeti-confd
 if ENABLE_MOND
-HS_GENERATED_FILES += src/ganeti-mond
+HS_GENERATED_FILES += src/ganeti-mond src/ganeti-maintd
 endif
 if ENABLE_METADATA
 HS_GENERATED_FILES += src/ganeti-metad
@@ -410,6 +414,7 @@
 	doc/examples/systemd/ganeti-kvmd.service \
 	doc/examples/systemd/ganeti-luxid.service \
 	doc/examples/systemd/ganeti-metad.service \
+	doc/examples/systemd/ganeti-maintd.service \
 	doc/examples/systemd/ganeti-mond.service \
 	doc/examples/systemd/ganeti-noded.service \
 	doc/examples/systemd/ganeti-rapi.service \
@@ -660,6 +665,7 @@
 	doc/design-2.14.rst \
 	doc/design-2.15.rst \
 	doc/design-2.16.rst \
+	doc/design-2.17.rst \
 	doc/design-allocation-efficiency.rst \
 	doc/design-autorepair.rst \
 	doc/design-bulk-create.rst \
@@ -691,11 +697,15 @@
 	doc/design-location.rst \
 	doc/design-linuxha.rst \
 	doc/design-lu-generated-jobs.rst \
+	doc/design-macvtap.rst \
+	doc/design-memory-over-commitment.rst \
+	doc/design-migration-speed-hbal.rst \
 	doc/design-monitoring-agent.rst \
 	doc/design-move-instance-improvements.rst \
 	doc/design-multi-reloc.rst \
 	doc/design-multi-storage-htools.rst \
 	doc/design-multi-version-tests.rst \
+	doc/design-n-m-redundancy.rst \
 	doc/design-network.rst \
 	doc/design-network2.rst \
 	doc/design-node-add.rst \
@@ -769,7 +779,7 @@
 	src/hs2py \
 	src/rpc-test
 if ENABLE_MOND
-HS_COMPILE_PROGS += src/ganeti-mond
+HS_COMPILE_PROGS += src/ganeti-mond src/ganeti-maintd
 endif
 if ENABLE_METADATA
 HS_COMPILE_PROGS += src/ganeti-metad
@@ -885,6 +895,7 @@
 	$(patsubst src.%,--exclude Test.%,$(subst /,.,$(patsubst %.hs,%, $(HS_LIB_SRCS))))
 
 HS_LIB_SRCS = \
+	src/Ganeti/Prelude.hs \
 	src/Ganeti/BasicTypes.hs \
 	src/Ganeti/Codec.hs \
 	src/Ganeti/Common.hs \
@@ -906,10 +917,12 @@
 	src/Ganeti/DataCollectors.hs \
 	src/Ganeti/DataCollectors/CLI.hs \
 	src/Ganeti/DataCollectors/CPUload.hs \
+	src/Ganeti/DataCollectors/Diagnose.hs \
 	src/Ganeti/DataCollectors/Diskstats.hs \
 	src/Ganeti/DataCollectors/Drbd.hs \
 	src/Ganeti/DataCollectors/InstStatus.hs \
 	src/Ganeti/DataCollectors/InstStatusTypes.hs \
+	src/Ganeti/DataCollectors/KvmRSS.hs \
 	src/Ganeti/DataCollectors/Lv.hs \
 	src/Ganeti/DataCollectors/Program.hs \
 	src/Ganeti/DataCollectors/Types.hs \
@@ -929,6 +942,8 @@
 	src/Ganeti/HTools/Cluster/AllocationSolution.hs \
 	src/Ganeti/HTools/Cluster/Evacuate.hs \
 	src/Ganeti/HTools/Cluster/Metrics.hs \
+	src/Ganeti/HTools/Cluster/MetricsComponents.hs \
+	src/Ganeti/HTools/Cluster/MetricsTH.hs \
 	src/Ganeti/HTools/Cluster/Moves.hs \
 	src/Ganeti/HTools/Cluster/Utils.hs \
 	src/Ganeti/HTools/Container.hs \
@@ -952,6 +967,8 @@
 	src/Ganeti/HTools/Program/Hsqueeze.hs \
 	src/Ganeti/HTools/Program/Hroller.hs \
 	src/Ganeti/HTools/Program/Main.hs \
+	src/Ganeti/HTools/RedundancyLevel.hs \
+	src/Ganeti/HTools/Repair.hs \
 	src/Ganeti/HTools/Tags.hs \
 	src/Ganeti/HTools/Tags/Constants.hs \
 	src/Ganeti/HTools/Types.hs \
@@ -981,12 +998,23 @@
 	src/Ganeti/Logging/Lifted.hs \
 	src/Ganeti/Logging/WriterLog.hs \
 	src/Ganeti/Luxi.hs \
+	src/Ganeti/MaintD/Autorepairs.hs \
+	src/Ganeti/MaintD/Balance.hs \
+	src/Ganeti/MaintD/CleanupIncidents.hs \
+	src/Ganeti/MaintD/CollectIncidents.hs \
+	src/Ganeti/MaintD/FailIncident.hs \
+	src/Ganeti/MaintD/HandleIncidents.hs \
+        src/Ganeti/MaintD/MemoryState.hs \
+	src/Ganeti/MaintD/Server.hs \
+	src/Ganeti/MaintD/Utils.hs \
 	src/Ganeti/Network.hs \
 	src/Ganeti/Objects.hs \
 	src/Ganeti/Objects/BitArray.hs \
 	src/Ganeti/Objects/Disk.hs \
 	src/Ganeti/Objects/Instance.hs \
+	src/Ganeti/Objects/HvState.hs \
 	src/Ganeti/Objects/Lens.hs \
+	src/Ganeti/Objects/Maintenance.hs \
 	src/Ganeti/Objects/Nic.hs \
 	src/Ganeti/OpCodes.hs \
 	src/Ganeti/OpCodes/Lens.hs \
@@ -1034,6 +1062,7 @@
 	src/Ganeti/Utils.hs \
 	src/Ganeti/Utils/Atomic.hs \
 	src/Ganeti/Utils/AsyncWorker.hs \
+	src/Ganeti/Utils/Http.hs \
 	src/Ganeti/Utils/IORef.hs \
 	src/Ganeti/Utils/Livelock.hs \
 	src/Ganeti/Utils/Monad.hs \
@@ -1503,7 +1532,7 @@
 	cp -f $< $@
 
 if ENABLE_MOND
-nodist_sbin_SCRIPTS += src/ganeti-mond
+nodist_sbin_SCRIPTS += src/ganeti-mond src/ganeti-maintd
 endif
 
 if ENABLE_METADATA
@@ -1614,6 +1643,7 @@
 	daemons/ganeti-cleaner.in \
 	$(pkglib_python_scripts) \
 	devel/build_chroot \
+	devel/cert_digest.py \
 	devel/upload \
 	devel/webserver \
 	tools/kvm-ifup.in \
@@ -1714,6 +1744,9 @@
 	test/autotools/autotools-check-news.test \
 	test/data/htools/clean-nonzero-score.data \
 	test/data/htools/common-suffix.data \
+	test/data/htools/dyn1.json \
+	test/data/htools/dyn2.json \
+	test/data/htools/dyn3.json \
 	test/data/htools/empty-cluster.data \
 	test/data/htools/hail-alloc-dedicated-1.json \
 	test/data/htools/hail-alloc-desired-location.json \
@@ -1728,23 +1761,28 @@
 	test/data/htools/hail-alloc-secondary.json \
 	test/data/htools/hail-alloc-spindles.json \
 	test/data/htools/hail-alloc-twodisks.json \
+	test/data/htools/hail-alloc-memory-over-commitment.json \
 	test/data/htools/hail-change-group.json \
 	test/data/htools/hail-invalid-reloc.json \
 	test/data/htools/hail-node-evac.json \
 	test/data/htools/hail-reloc-drbd.json \
 	test/data/htools/hail-reloc-drbd-crowded.json \
+	test/data/htools/hbal-avoid-disk-moves.data \
 	test/data/htools/hbal-cpu-speed.data \
 	test/data/htools/hbal-desiredlocation-1.data \
 	test/data/htools/hbal-desiredlocation-2.data \
 	test/data/htools/hbal-desiredlocation-3.data \
 	test/data/htools/hbal-desiredlocation-4.data \
 	test/data/htools/hbal-dyn.data \
+	test/data/htools/hbal-dyn2.data \
 	test/data/htools/hbal-evac.data \
 	test/data/htools/hbal-excl-tags.data \
 	test/data/htools/hbal-forth.data \
 	test/data/htools/hbal-location-1.data \
 	test/data/htools/hbal-location-exclusion.data \
 	test/data/htools/hbal-location-2.data \
+	test/data/htools/hbal-memory-over-commitment.data \
+	test/data/htools/hbal-memory-over-commitment-2.data \
 	test/data/htools/hbal-migration-1.data \
 	test/data/htools/hbal-migration-2.data \
 	test/data/htools/hbal-migration-3.data \
@@ -1837,6 +1875,8 @@
 	test/data/cluster_config_2.13.json \
 	test/data/cluster_config_2.14.json \
 	test/data/cluster_config_2.15.json \
+	test/data/cluster_config_2.16.json \
+	test/data/cluster_config_2.17.json \
 	test/data/instance-minor-pairing.txt \
 	test/data/instance-disks.txt \
 	test/data/ip-addr-show-dummy0.txt \
diff --git a/NEWS b/NEWS
index f22825e..944fd0d 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,43 @@
 ====
 
 
+Version 2.17.0 beta1
+--------------------
+
+*(Released Mon, 22 Feb 2016)*
+
+Incompatible/important changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- The IAllocator protocol has been extended by a new ``hv_state`` parameter.
+  This new parameter is used to estimate the amount of memory utilized by
+  the node. It replaces ``reserved_mem`` on hypervisors other than ``xen-pvm``
+  and ``xen-hvm`` because ``reserved_mem`` was reported incorrectly on them.
+  If this ``hv_state`` parameter is not present in an iallocator input, the
+  old ``reserved_mem`` will be used.
+- Tools now log into a separate log file ``tools.log``. Also, each log
+  message of tools is now properly labelled with the name of the tool
+  that submitted the message.
+- The options ``--debug`` and ``--verbose`` of ``gnt-cluster
+  renew-crypto`` and ``gnt-node {add,remove,modify}`` now (also) control the
+  log level of the SSH calls to all nodes.
+
+New features
+~~~~~~~~~~~~
+
+- There is a new daemon, the :doc:`Ganeti Maintenance Daemon <design-repaird>`,
+  that coordinates all maintenance operations on a cluster, i.e. rebalancing,
+  disk activation, ERROR_down handling and node repair actions.
+- ``htools`` support memory over-commitment now. Look at
+  :doc:`Memory Over Commitment <design-memory-over-commitment>` for the
+  details.
+- ``hbal`` has a new option ``--avoid-disk-moves *factor*`` that allows disk
+  moves only if the gain in the cluster metrics is ``*factor*`` times higher
+  than with no disk moves.
+- ``hcheck`` reports the level of redundancy for each node group as a new
+  output parameter, see :doc:`N+M Redundancy <design-n-m-redundancy>`.
+
+
 Version 2.16.0 rc1
 ------------------
 
diff --git a/README b/README
index 4327d89..6f3e88b 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-Ganeti 2.16
+Ganeti 2.17
 ===========
 
 For installation instructions, read the INSTALL and the doc/install.rst
diff --git a/configure.ac b/configure.ac
index 4d57798..e9be40a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,8 +1,8 @@
 # Configure script for Ganeti
 m4_define([gnt_version_major], [2])
-m4_define([gnt_version_minor], [16])
+m4_define([gnt_version_minor], [17])
 m4_define([gnt_version_revision], [0])
-m4_define([gnt_version_suffix], [~rc1])
+m4_define([gnt_version_suffix], [~beta1])
 m4_define([gnt_version_full],
           m4_format([%d.%d.%d%s],
                     gnt_version_major, gnt_version_minor,
diff --git a/daemons/daemon-util.in b/daemons/daemon-util.in
index 6af85c2..0cdbbe5 100644
--- a/daemons/daemon-util.in
+++ b/daemons/daemon-util.in
@@ -56,7 +56,7 @@
 }
 
 if _mond_enabled; then
-  DAEMONS+=( ganeti-mond )
+  DAEMONS+=( ganeti-mond ganeti-maintd)
 fi
 
 # The full list of all daemons we know about
@@ -111,6 +111,9 @@
     metad)
       echo "@GNTMETADUSER@:@GNTMETADGROUP@"
       ;;
+    maintd)
+      echo "@GNTMONDUSER@:@GNTMONDGROUP@"
+      ;;
     *)
       echo "root:@GNTDAEMONSGROUP@"
       ;;
diff --git a/devel/build_chroot b/devel/build_chroot
index b6a6379..d1160f6 100755
--- a/devel/build_chroot
+++ b/devel/build_chroot
@@ -42,10 +42,13 @@
 SHA1_LIST='
 cabal-install-1.18.0.2.tar.gz 2d1f7a48d17b1e02a1e67584a889b2ff4176a773
 cabal-install-1.22.4.0.tar.gz b98eea96d321cdeed83a201c192dac116e786ec2
+cabal-install-1.22.6.0.tar.gz d474b0eef6944af1abef92419cea13cee50993f3
 ghc-7.6.3-i386-unknown-linux.tar.bz2 f042b4171a2d4745137f2e425e6949c185f8ea14
 ghc-7.6.3-x86_64-unknown-linux.tar.bz2 46ec3f3352ff57fba0dcbc8d9c20f7bcb6924b77
 ghc-7.8.4-i386-unknown-linux-deb7.tar.bz2 4f523f854c37a43b738359506a89a37a9fa9fc5f
 ghc-7.8.4-x86_64-unknown-linux-deb7.tar.bz2 3f68321b064e5c1ffcb05838b85bcc00aa2315b4
+ghc-7.10.2-i386-unknown-linux-deb7.tar.bz2 c759ab9af566f5c3c9b75b702615f1d0c2f999fd
+ghc-7.10.2-x86_64-unknown-linux-deb7.tar.bz2 f028e4a07995353a47286478fc8644f66defa227
 '
 
 # export all variables needed in the schroot
@@ -407,6 +410,136 @@
        'hlint>=1.9.12'
 ;;
 
+  jessie-ghc710)
+
+    GHC_VERSION="7.10.2"
+    GHC_VARIANT="-deb7"
+    CABAL_INSTALL_VERSION="1.22.6.0"
+    # the version of the Cabal library below must match the version used by
+    # CABAL_INSTALL_VERSION, see the dependencies of cabal-install
+    CABAL_LIB_VERSION=">=1.22.2 && <1.23"
+    export GHC_VERSION GHC_VARIANT CABAL_INSTALL_VERSION
+
+    in_chroot -- \
+      $APT_INSTALL \
+        autoconf automake \
+        zlib1g-dev \
+        libgmp3-dev \
+        libcurl4-openssl-dev \
+        libpcre3-dev \
+        happy \
+        hlint hscolour pandoc \
+        shelltestrunner \
+        graphviz qemu-utils \
+        python-docutils \
+        python-simplejson \
+        python-pyparsing \
+        python-pyinotify \
+        python-pycurl \
+        python-ipaddr \
+        python-yaml \
+        python-paramiko \
+        git \
+        git-email \
+        vim
+
+    in_chroot -- \
+      $APT_INSTALL python-setuptools python-dev build-essential
+
+    in_chroot -- \
+      easy_install \
+        logilab-astng==0.24.1 \
+        logilab-common==0.58.3 \
+        mock==1.0.1 \
+        pylint==0.26.0
+
+    in_chroot -- \
+      easy_install \
+        sphinx==1.1.3 \
+        pep8==1.3.3 \
+        coverage==3.4 \
+        bitarray==0.8.0
+
+    install_ghc
+
+    install_cabal
+
+    in_chroot -- \
+      cabal update
+
+    in_chroot -- \
+      cabal install --global \
+        HUnit-1.2.5.2 \
+        PSQueue-1.1 \
+        StateVar-1.1.0.0 \
+        ansi-terminal-0.6.2.1 \
+        ansi-wl-pprint-0.6.7.2 \
+        base-orphans-0.4.1 \
+        base64-bytestring-1.0.0.1 \
+        blaze-builder-0.4.0.1 \
+        bytestring-builder-0.10.6.0.0 \
+        bytestring-mmap-0.2.2 \
+        curl-1.3.8 \
+        enumerator-0.4.20 \
+        extensible-exceptions-0.1.1.4 \
+        hashable-1.2.3.3 \
+        case-insensitive-1.2.0.4 \
+        hinotify-0.3.7 \
+        hostname-1.0 \
+        hslogger-1.2.9 \
+        monads-tf-0.1.0.2 \
+        MonadCatchIO-transformers-0.3.1.3 \
+        nats-1 \
+        parallel-3.2.0.6 \
+        prelude-extras-0.4 \
+        primitive-0.6 \
+        reflection-2 \
+        regex-base-0.93.2 \
+        regex-pcre-0.94.4 \
+        regex-posix-0.95.2 \
+        scientific-0.3.3.8 \
+        attoparsec-0.12.1.6 \
+        attoparsec-enumerator-0.3.4 \
+        streaming-commons-0.1.12.1 \
+        blaze-builder-enumerator-0.2.1.0 \
+        syb-0.5.1 \
+        json-0.9.1 \
+        tagged-0.8.1 \
+        tf-random-0.5 \
+        QuickCheck-2.7.6 \
+        Crypto-4.2.5.1 \
+        transformers-compat-0.4.0.4 \
+        distributive-0.4.4 \
+        exceptions-0.8.0.2 \
+        temporary-1.2.0.3 \
+        transformers-base-0.4.4 \
+        monad-control-1.0.0.4 \
+        lifted-base-0.2.3.6 \
+        unix-compat-0.4.1.4 \
+        unordered-containers-0.2.5.1 \
+        semigroups-0.16.2.2 \
+        bifunctors-5 \
+        utf8-string-0.3.8 \
+        vector-0.11.0.0 \
+        void-0.7 \
+        contravariant-1.3.2 \
+        comonad-4.2.7.2 \
+        profunctors-5.1.1 \
+        semigroupoids-5.0.0.2 \
+        free-4.12.1 \
+        adjunctions-4.2.1 \
+        kan-extensions-4.2.2 \
+        lens-4.12.3 \
+        xml-1.3.14 \
+        test-framework-0.8.1.1 \
+        test-framework-hunit-0.3.0.1 \
+        test-framework-quickcheck2-0.3.0.3 \
+        zlib-bindings-0.1.1.5 \
+        zlib-enum-0.2.3.1 \
+        snap-core-0.9.7.2 \
+        snap-server-0.9.5.1 \
+;;
+
   jessie-ghc78)
 
     GHC_VERSION="7.8.4"
@@ -560,7 +693,8 @@
         test-framework-0.8.0.3 \
         test-framework-hunit-0.3.0.1 \
         test-framework-quickcheck2-0.3.0.2 \
-        'transformers>=0.3.0.0'
+        'transformers>=0.3.0.0' \
+        zlib-0.5.4.2
     ;;
 
   *)
diff --git a/devel/cert_digest.py b/devel/cert_digest.py
new file mode 100755
index 0000000..683fbd3
--- /dev/null
+++ b/devel/cert_digest.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python
+
+# Copyright (C) 2015 Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# This is a test script to ease debugging of SSL problems. It can be
+# applied on any of Ganeti's SSL certificates (for example client.pem
+# and server.pem) and will output a digest.
+
+import sys
+import OpenSSL
+
+
+def usage():
+    """Print a short usage message for this script to stdout."""
+    print "%s filename" % sys.argv[0]
+    print
+    print "'filename' must be a filename of an SSL certificate in PEM format."
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+      usage()
+      sys.exit(1)  # no certificate file given, nothing more to do
+
+    with open(sys.argv[1], "r") as cert_fd:
+      cert_plain = cert_fd.read()
+
+    print "Certificate:"
+    print cert_plain
+
+    cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
+                                           cert_plain)
+
+    print "Digest:"
+    print cert.digest("sha1")
diff --git a/doc/design-2.17.rst b/doc/design-2.17.rst
new file mode 100644
index 0000000..bd1414f
--- /dev/null
+++ b/doc/design-2.17.rst
@@ -0,0 +1,10 @@
+==================
+Ganeti 2.17 design
+==================
+
+The following designs' implementations were completed in Ganeti 2.17.
+
+- :doc:`design-memory-over-commitment`
+- :doc:`design-migration-speed-hbal`
+- :doc:`design-n-m-redundancy`
+- :doc:`design-repaird`
\ No newline at end of file
diff --git a/doc/design-draft.rst b/doc/design-draft.rst
index b2ce6a2..e7c47a3 100644
--- a/doc/design-draft.rst
+++ b/doc/design-draft.rst
@@ -2,7 +2,7 @@
 Design document drafts
 ======================
 
-.. Last updated for Ganeti 2.16
+.. Last updated for Ganeti 2.17
 
 .. toctree::
    :maxdepth: 2
@@ -24,7 +24,7 @@
    design-network2.rst
    design-configlock.rst
    design-multi-storage-htools.rst
-   design-repaird.rst
+   design-macvtap.rst
    design-scsi-kvm.rst
    design-disks.rst
 
diff --git a/doc/design-macvtap.rst b/doc/design-macvtap.rst
new file mode 100644
index 0000000..1440ab9
--- /dev/null
+++ b/doc/design-macvtap.rst
@@ -0,0 +1,266 @@
+===============
+MacVTap support
+===============
+
+.. contents:: :depth: 3
+
+This is a design document detailing the implementation of `MacVTap`
+support in Ganeti. The initial implementation targets the KVM
+hypervisor, but it is intended to be ported to the XEN hypervisor as
+well.
+
+Current state and shortcomings
+==============================
+
+Currently, Ganeti provides a number of options for networking a virtual
+machine, that are the ``bridged``, ``routed``, and ``openvswitch``
+modes.  ``MacVTap`` is another virtual network interface in Linux that
+is not supported by Ganeti and that could be added to the currently
+supported solutions. It is an interface that acts as a regular TUN/TAP
+device, and thus it is transparently supported by QEMU. Because of its
+design, it can greatly simplify Ganeti setups using bridged instances.
+
+In brief, the MacVTap interface is based on the ``MacVLan`` Linux
+driver, which basically allows a single physical interface to be
+associated with multiple IPs and MAC addresses. It is meant to replace
+the combination of the TUN/TAP and bridge drivers with a more
+lightweight setup that doesn't require any extra configuration on the
+host. MacVTap driver is supposed to be more efficient than using a
+regular bridge.  Unlike bridges, it doesn't need to do STP or to
+discover/learn MAC addresses of other connected devices on a given
+domain, as it knows every MAC address it can receive. In fact, it
+introduces a bridge-like behavior for virtual machines but without the
+need to have a real bridge setup on the host. Instead, each virtual
+interface extends an existing network device by attaching directly to
+it, having its own MAC address, and providing a separate virtual
+interface to be used by the userspace processes. The MacVTap MAC address
+is used on the external network and the guest OS cannot spoof or change
+that address.
+
+Background
+==========
+
+This section provides some extra information for the MacVTap interface,
+that we took into account for the rest of this design document.
+
+MacVTap modes of operation
+--------------------------
+
+A MacVTap device can operate in one of four modes, just like the MacVLan
+driver does. These modes determine how the tap endpoints communicate
+between each other providing various levels of isolation between them.
+Those modes are the following:
+
+* `VEPA (Virtual Ethernet Port Aggregator) mode`: The default mode that
+  is compatible with virtualization-enabled switches. The communication
+  between endpoints on the same lower device, happens through the
+  external switch.
+
+* `Bridge mode`: It works almost like a traditional bridge, connecting
+  all endpoints directly to each other.
+
+* `Private mode`: An endpoint in this mode can never communicate to any
+  other endpoint on the same lower device.
+
+* `Passthru mode`: This mode was added later to work on some limitations
+  on MacVLans (more details here_).
+
+MacVTap internals
+-----------------
+
+The creation of a MacVTap device is *not* done by opening the
+`/dev/net/tun` device and issuing a corresponding `ioctl()` to register
+a network device as happens in tap devices. Instead, there are two ways
+to create a MacVTap device. The first one is using the `rtnetlink(7)`
+interface directly, just like the `libvirt` or the `iproute2` utilities
+do, and the second one is to use the high-level `ip-link` command. Since
+creating a MacVTap interface programmatically using the netlink protocol
+is a bit more complicated than creating a normal TUN/TAP device, we
+propose using the ip-link tool for the MacVTap handling, which is
+much simpler and more straightforward to use, and also fulfills all our
+needs. Additionally, since Ganeti already depends on `iproute2` being
+installed in the system, this does not introduce an extra dependency.
+
+The following example creates a MacVTap device using the `ip-link`
+tool, named `macvtap0`, operating in `bridge` mode, and which is using
+`eth0` as its lower device:
+
+::
+
+  ip link add link eth0 name macvtap0 address 1a:36:1b:aa:b3:77 type macvtap mode bridge
+
+Once a MacVTap interface is created, an actual character device appears
+under `/dev`, called ``/dev/tapXX``, where ``XX`` is the interface index
+of the device.
+
+Proposed changes
+================
+
+In order to be able to create instances using the MacVTap device driver,
+we propose some modifications that affect the ``nicparams`` slot of
+Ganeti's configuration ``NIC`` object, and also the part of the code
+related to the KVM hypervisor, as detailed in the following sections.
+
+Configuration changes
+---------------------
+
+The nicparams ``mode`` attribute will be extended to support the
+``macvtap`` mode. When using the MacVTap mode, the ``link`` attribute
+will specify the network device where the MacVTap interfaces will be
+attached to, the *lower device*. Note that the lower device must
+exist, otherwise the operation will fail. If no link is specified, the
+cluster-wide default NIC `link` param will be used instead.
+
+We propose the MacVTap mode to be configurable, and so the nicparams
+object will be extended with an extra slot named ``mvtap_mode``. This
+parameter will only be used if the network mode is set to MacVTap since
+it does not make sense in other modes, similarly to the `vlan` slot of
+the `openvswitch` mode.
+
+Below there is a snippet of some of the ``gnt-network`` commands'
+output:
+
+Network connection
+~~~~~~~~~~~~~~~~~~
+
+::
+
+  gnt-network connect -N mode=macvtap,link=eth0,mvtap_mode=bridge vtap-net vtap_group
+
+Network listing
+~~~~~~~~~~~~~~~
+
+::
+
+  gnt-network list
+
+  Network  Subnet           Gateway       MacPrefix GroupList
+  br-net   10.48.1.0/24     10.48.1.254   -         default (bridged, br0, , )
+  vtap-net 192.168.100.0/24 192.168.100.1 -         vtap_group (macvtap, eth0, , bridge)
+
+Network information
+~~~~~~~~~~~~~~~~~~~
+
+::
+
+  gnt-network info
+
+  Network name: vtap-net
+  UUID: 4f139b48-3f08-46b1-911f-d37de7e12dcf
+  Serial number: 1
+  Subnet: 192.168.100.0/28
+  Gateway: 192.168.100.1
+  IPv6 Subnet: 2001:db8:2ffc::/64
+  IPv6 Gateway: 2001:db8:2ffc::1
+  Mac Prefix: None
+  size: 16
+  free: 10 (62.50%)
+  usage map:
+        0 XXXXX..........X                                   63
+         (X) used    (.) free
+  externally reserved IPs:
+    192.168.100.0, 192.168.100.1, 192.168.100.15
+  connected to node groups:
+    vtap_group (mode:macvtap link:eth0 vlan: mvtap_mode:bridge)
+  used by 2 instances:
+    inst1.example.com: 0:192.168.100.2
+    inst2.example.com: 0:192.168.100.3
+
+
+Hypervisor changes
+------------------
+
+A new method will be introduced in the KVM's `netdev.py` module, named
+``OpenVTap``, similar to the ``OpenTap`` method, that will be
+responsible for creating a MacVTap device using the `ip-link` command,
+and returning its file descriptor. The ``OpenVTap`` method will receive
+as arguments the network's `link`, the mode of the MacVTap device
+(``mvtap_mode``), and also the ``interface name`` of the device to be
+created, since otherwise we would not be able to retrieve and open
+the created device.
+
+Since we want the names among the MacVTap devices to be unique on the
+same node, we will make use of the existing ``_GenerateKvmTapName``
+method to generate device names but with some modifications, to be
+adapted to our needs. This method is actually a wrapper over the
+``GenerateTapName`` method which currently is being used to generate TAP
+interface names for NICs meant to be used in instance communication
+using the ``gnt.com`` prefix. We propose extending this method to
+generate names for the MacVTap interface too, using the ``vtap`` prefix.
+To do so, we could add an extra boolean argument in that method, named
+`inst_comm`, to differentiate the two cases, so that the method will
+return the appropriate name depending on its usage. This argument will
+be optional and defaulted to `True`, to not affect the existing API.
+
+Currently, the `OpenTap` method handles the `vhost-net`, `mq`, and the
+`vnet_hdr` features. The `vhost-net` feature will be normally supported
+for the MacVTap devices too, and so is the `multiqueue` feature, which
+can be enabled using the `numrxqueues` and `numtxqueues` parameters of
+the `ip-link` command. The only drawback seems to be the `vnet_hdr`
+feature modification. For a MacVTap device this flag is enabled by
+default, and it cannot be disabled even if a user requests it.
+
+A last hypervisor change will be the introduction of a new method named
+``_RemoveStaleMacvtapDevs`` that will remove any remaining MacVTap
+devices, and which is detailed in the following section.
+
+Tools changes
+-------------
+
+Some of the Ganeti tools should also be extended to support MacVTap
+devices. Those are the ``kvm-ifup`` and ``net-common`` scripts. These
+modifications will include a new method named ``setup_macvtap`` that
+will simply change the device status to `UP` just before an instance is
+started:
+
+::
+
+  ip link set $INTERFACE up
+
+As mentioned in the `Background` section, MacVTap devices are
+persistent. So, we have to manually delete the MacVTap device after an
+instance shutdown. To do so, we propose creating a ``kvm-ifdown``
+script, that will be invoked after an instance shutdown in order to
+remove the relevant MacVTap devices. The ``kvm-ifdown`` script should
+explicitly call the following commands and currently will be functional
+for MacVTap NICs only:
+
+::
+
+  ip link set $INTERFACE down
+  ip link delete $INTERFACE
+
+To be able to call the `kvm-ifdown` script we should extend the KVM's
+``_ConfigureNIC`` method with an extra argument that is the name of the
+script to be invoked, instead of calling by default the `kvm-ifup`
+script, as it currently happens.
+
+The invocation of the `kvm-ifdown` script will be made through a
+separate method that we will create, named ``_RemoveStaleMacvtapDevs``.
+This method will read the NIC runtime files of an instance and will
+remove any devices using the MacVTap interface. This method will be
+included in the ``CleanupInstance`` method in order to cover all the
+cases where an instance using MacVTap NICs needs to be cleaned up.
+
+Besides the instance shutdown, there are a couple of cases where the
+MacVTap NICs will need to be cleaned up too. In case of an internal
+instance shutdown, where the ``kvmd`` is not enabled, the instance will
+be in ``ERROR_DOWN`` state. In that case, when the instance is started
+either by the `ganeti-watcher` or by the admin, the ``CleanupInstance``
+method, and consequently the `kvm-ifdown` script, will not be called and
+so the MacVTap NICs will have to manually be deleted. Otherwise starting
+the instance will result in more than one MacVTap device using the same
+MAC address. An instance migration is another case where deleting an
+instance will keep stale MacVTap devices on the source node.  In order
+to solve those potential issues, we will explicitly call the
+``_RemoveStaleMacvtapDevs`` method after a successful instance migration
+on the source node, and also before creating a new device for a NIC that
+is using the MacVTap interface to remove any stale devices.
+
+.. _here: http://thread.gmane.org/gmane.comp.emulators.kvm.devel/61824/
+
+.. vim: set textwidth=72 :
+.. Local Variables:
+.. mode: rst
+.. fill-column: 72
+.. End:
diff --git a/doc/design-memory-over-commitment.rst b/doc/design-memory-over-commitment.rst
new file mode 100644
index 0000000..281a6ef
--- /dev/null
+++ b/doc/design-memory-over-commitment.rst
@@ -0,0 +1,181 @@
+======================
+Memory Over Commitment
+======================
+
+.. contents:: :depth: 4
+
+This document describes the proposed changes to support memory
+overcommitment in Ganeti.
+
+Background
+==========
+
+Memory is a non-preemptable resource, and thus cannot be shared, e.g.,
+in a round-robin fashion. Therefore, Ganeti is very careful to make
+sure there is always enough physical memory for the memory promised
+to the instances. In fact, even in an N+1 redundant way: should one
+node fail, its instances can be relocated to other nodes while still
+having enough physical memory for the memory promised to all instances.
+
+Overview over the current memory model
+--------------------------------------
+
+To make decisions, ``htools`` query the following parameters from Ganeti.
+
+- The amount of memory used by each instance. This is the state-of-record
+  backend parameter ``maxmem`` for that instance (maybe inherited from
+  group-level or cluster-level backend parameters). It tells the hypervisor
+  the maximal amount of memory that instance may use.
+
+- The state-of-world parameters for the node memory. They are collected
+  live and are hypervisor specific. The following parameters are collected.
+
+  - memory_total: the total memory size on the node
+
+  - memory_free: the available memory on the node for instances
+
+  - memory_dom0: the memory used by the node itself, if available
+
+  For Xen, the amount of total and free memory are obtained by parsing
+  the output of Xen ``info`` command (e.g., ``xm info``). The dom0
+  memory is obtained by looking in the output of the ``list`` command
+  for ``Domain-0``.
+
+  For the ``kvm`` hypervisor, all these parameters are obtained by
+  reading ``/proc/meminfo``, where the entries ``MemTotal`` and
+  ``Active`` are considered the values for ``memory_total`` and
+  ``memory_dom0``, respectively. The value for ``memory_free`` is
+  taken as the sum of the entries ``MemFree``, ``Buffers``, and ``Cached``.
+
+
+Current state and shortcomings
+==============================
+
+While the current model of never over committing memory serves well
+to provide reliability guarantees to instances, it does not suit well
+situations where the actual use of memory in the instances is spiky. Consider
+a scenario where instances only touch a small portion of their memory most
+of the time, but occasionally use a large amount of memory. Then, at any moment,
+a large fraction of the memory used for the instances sits around without
+being actively used. By swapping out the not actively used memory, resources
+can be used more efficiently.
+
+Proposed changes
+================
+
+We propose to support over commitment of memory if desired by the
+administrator. Memory will change from being a hard constraint to
+being a question of policy. The default will be not to over commit
+memory.
+
+Extension of the policy by a new parameter
+------------------------------------------
+
+The instance policy is extended by a new real-number field ``memory-ratio``.
+Policies on groups inherit this parameter from the cluster wide policy in the
+same way as all other parameters of the instance policy.
+
+When a cluster is upgraded from an earlier version not containing
+``memory-ratio``, the value ``1.0`` is inserted for this new field in
+the cluster-level ``ipolicy``; in this way, the status quo of not over
+committing memory is preserved via upgrades. The ``gnt-cluster
+modify`` and ``gnt-group modify`` commands are extended to allow
+setting of the ``memory-ratio``.
+
+The ``htools`` text format is extended to also contain this new
+ipolicy parameter. It is added as an optional entry at the end of the
+parameter list of an ipolicy line, to remain backwards compatible.
+If the parameter is missing, the value ``1.0`` is assumed.
+
+Changes to the memory reporting on non ``xen-hvm`` and ``xen-pvm``
+------------------------------------------------------------------
+
+For all hypervisors ``memory_dom0`` corresponds to the amount of memory used
+by Ganeti itself and all other non-hypervisor processes running on this node.
+The amount of memory currently reported for ``memory_dom0`` on hypervisors
+other than ``xen-hvm`` and ``xen-pvm``, however, includes the amount of active
+memory of the hypervisor processes. This is in conflict with the underlying
+assumption that ``memory_dom0`` memory is not available for instances.
+
+Therefore, for hypervisors other than ``xen-pvm`` and ``xen-hvm`` we will use
+a new state-of-record hypervisor parameter called ``mem_node`` in htools
+instead of the reported ``memory_dom0``. As a hypervisor state parameter, it is
+run-time tunable and inheritable at group and cluster levels. If this parameter
+is not present, a default value of ``1024M`` will be used, which is a
+conservative estimate of the amount of memory used by Ganeti on a medium-sized
+cluster. The reason for using a state-of-record value is to have a stable
+amount of reserved memory, irrespective of the current activity of Ganeti.
+
+Currently, hypervisor state parameters are partly implemented but not used
+by Ganeti.
+
+Changes to the memory policy
+----------------------------
+
+The memory policy will be changed in that we assume that one byte
+of physical node memory can hold ``memory-ratio`` bytes of instance
+memory, but still only one byte of Ganeti memory. Of course, in practice
+this has to be backed by swap space; it is the administrator's responsibility
+to ensure that each node has swap of at
+least ``(memory-ratio - 1.0) * (memory_total - memory_dom0)``. Ganeti
+will warn if the amount of swap space is not big enough.
+
+
+The new memory policy will be as follows.
+
+- The difference between the total memory of a node and its dom0
+  memory will be considered the amount of *available memory*.
+
+- The amount of *used memory* will be (as is now) the sum of
+  the memory of all instances and the reserved memory.
+
+- The *relative memory usage* is the ratio of used to available
+  memory. Note that the relative usage can be bigger than ``1.0``.
+
+- The memory-related constraint for instance placement is that
+  afterwards the relative memory usage be at most the
+  memory-ratio. Again, if the ratio of the memory of the real
+  instances on the node to available memory is bigger than the
+  memory-ratio this is considered a hard violation, otherwise
+  it is considered a soft violation.
+
+- The definition of N+1 redundancy (including
+  :doc:`design-shared-storage-redundancy`) is kept literally as is.
+  Note, however, that the meaning does change, as the definition depends
+  on the notion of allowed moves, which is changed by this proposal.
+
+
+Changes to cluster verify
+-------------------------
+
+The only place where the Ganeti core handles memory is
+when ``gnt-cluster verify`` verifies N+1 redundancy. This code will be changed
+to follow the new memory model.
+
+Additionally, ``gnt-cluster verify`` will warn if the sum of available memory
+and swap space is not at least as big as the used memory.
+
+Changes to ``htools``
+---------------------
+
+The underlying model of the cluster will be changed in accordance with
+the suggested change of the memory policy. As all higher-level ``htools``
+operations go through only the primitives of adding/moving an instance
+if possible, and inspecting the cluster metrics, changing the base
+model will make all ``htools`` compliant with the new memory model.
+
+Balancing
+---------
+
+The cluster metric components will not be changed. Note the standard
+deviation of relative memory usage is already one of the components.
+For dynamic (load-based) balancing, the amount of not immediately
+discardable memory will serve as an indication of memory activity;
+as usual, the measure will be the standard deviation of the relative
+value (i.e., the ratio of non-discardable memory to available
+memory). The weighting for this metric component will have to be
+determined by experimentation and will depend on the memory ratio;
+for a memory ratio of ``1.0`` the weight will be ``0.0``, as memory
+need not be taken into account if no over-commitment is in place.
+For memory ratios bigger than ``1.0``, the weight will be positive
+and grow with the ratio.
diff --git a/doc/design-migration-speed-hbal.rst b/doc/design-migration-speed-hbal.rst
new file mode 100644
index 0000000..a0dcfe0
--- /dev/null
+++ b/doc/design-migration-speed-hbal.rst
@@ -0,0 +1,28 @@
+==================================
+Migration speed accounting in Hbal
+==================================
+
+.. contents:: :depth: 2
+
+Hbal usually performs a complex sequence of moves during cluster balancing in
+order to achieve a locally optimal cluster state. Unfortunately, each move may take
+a significant amount of time. Thus, during the sequence of moves the situation on
+cluster may change (e.g., because of adding new instance or because of instance
+or node parameters change) and desired moves can become unprofitable.
+
+Usually disk moves become a bottleneck and require a significant amount of time.
+:ref:`Instance move improvements <move-performance>` considers
+disk moves speed in more detail. Currently, ``hbal`` has a ``--no-disk-moves``
+option preventing disk moves during cluster balancing in order to perform fast
+(but of course non optimal) balancing. It may be useful, but ideally we need to
+find a balance between optimal configuration and time to reach this
+configuration.
+
+Avoiding insignificant disk moves
+=================================
+
+Allowing only profitable enough disk moves may become a first step to reach
+a compromise between moves speed and optimal scoring. This can be implemented
+by introducing ``--avoid-disk-moves *FACTOR*`` option which will admit disk
+moves only if the gain in the cluster metrics is *FACTOR* times
+higher than the gain achievable by non disk moves.
diff --git a/doc/design-move-instance-improvements.rst b/doc/design-move-instance-improvements.rst
index c64b4bf..6948fd8 100644
--- a/doc/design-move-instance-improvements.rst
+++ b/doc/design-move-instance-improvements.rst
@@ -31,6 +31,8 @@
 aspects of the problem, they do not exclude each other and will be presented
 independently.
 
+.. _move-performance:
+
 The performance of Ganeti moves
 ===============================
 
diff --git a/doc/design-n-m-redundancy.rst b/doc/design-n-m-redundancy.rst
new file mode 100644
index 0000000..4536f4c
--- /dev/null
+++ b/doc/design-n-m-redundancy.rst
@@ -0,0 +1,77 @@
+===========================
+Checking for N+M redundancy
+===========================
+
+.. contents:: :depth: 4
+
+This document describes how the level of redundancy is estimated
+in Ganeti.
+
+
+Current state and shortcomings
+==============================
+
+Ganeti keeps the cluster N+1 redundant, also taking into account
+:doc:`design-shared-storage-redundancy`. In other words, Ganeti
+tries to keep the cluster in a state, where after failure of a single
+node, no matter which one, all instances can be started immediately.
+However, e.g., for planning
+maintenance, it is sometimes desirable to know from how many node
+losses the cluster can recover. This is also useful information,
+when operating big clusters and expecting long times for hardware repair.
+
+
+Proposed changes
+================
+
+Higher redundancy as a sequential concept
+-----------------------------------------
+
+The intuitive meaning of an N+M redundant cluster is that M nodes can
+fail without instances being lost. However, when DRBD is used, already
+failure of 2 nodes can cause complete loss of an instance. Therefore, the
+best we can hope for, is to be able to recover from M sequential failures.
+This intuition that a cluster is N+M redundant, if M nodes can fail one-by-one,
+leaving enough time for a rebalance in between, without losing instances, is
+formalized in the next definition.
+
+Definition of N+M redundancy
+----------------------------
+
+We keep the definition of :doc:`design-shared-storage-redundancy`. Moreover,
+for M a non-negative integer, we define a cluster to be N+(M+2) redundant,
+if after draining any node the standard rebalancing procedure (as, e.g.,
+provided by `hbal`) will fully evacuate that node and result in an N+(M+1)
+redundant cluster.
+
+Independence of Groups
+----------------------
+
+Immediately from the definition, we see that the redundancy level, i.e.,
+the maximal M such that the cluster is N+M redundant, can be computed
+in a group-by-group manner: the standard balancing algorithm will never
+move instances between node groups. The redundancy level of the cluster
+is then the minimum of the redundancy level of the independent groups.
+
+Estimation of the redundancy level
+----------------------------------
+
+The definition of N+M redundancy requires to consider M failures in
+arbitrary order, thus considering super-exponentially many cases for
+large M. As, however, balancing moves instances anyway, the redundancy
+level mainly depends on the amount of node resources available to the
+instances in a node group. So we can get a good approximation of the
+redundancy level of a node group by only considering draining one largest
+node in that group. This is how Ganeti will estimate the redundancy level.
+
+Modifications to existing tools
+-------------------------------
+
+As redundancy levels higher than N+1 are mainly about planning capacity,
+the level of redundancy only needs to be computed on demand. Hence, we
+keep the tool changes minimal.
+
+- ``hcheck`` will report the level of redundancy for each node group as
+  a new output parameter
+
+The rest of Ganeti will not be changed.
diff --git a/doc/design-node-security.rst b/doc/design-node-security.rst
index 1215277..f4f10aa 100644
--- a/doc/design-node-security.rst
+++ b/doc/design-node-security.rst
@@ -129,48 +129,19 @@
 access and a compromised normal node, one can make this node a master
 candidate and then still have the power to compromise the whole cluster.
 
-To mitigate this issue, we propose the following changes:
+Various options have been explored to mitigate this, with no feasible
+solution so far. We generally advise to not expose RAPI to the Internet.
+For more details on making Ganeti secure, see :doc:`security`.
 
-- Add a flag ``master_capability_rapi_modifiable`` to the cluster
-  configuration which indicates whether or not it should be possible
-  to modify the ``master_capable`` flag of nodes via RAPI. The flag is
-  set to ``False`` by default and can itself only be changed on the
-  commandline. In this design doc, we refer to the flag as the
-  "rapi flag" from here on.
-- Only if the ``master_capabability_rapi_modifiable`` switch is set to
-  ``True``, it is possible to modify the master-capability flag of
-  nodes.
-
-With this setup, there are the following definitions of "potential
-master candidates" depending on the rapi flag:
-
-- If the rapi flag is set to ``True``, all cluster nodes are potential
-  master candidates, because as described above, all of them can
-  eventually be made master candidates via RAPI and thus security-wise,
-  we haven't won anything above the current SSH handling.
-- If the rapi flag is set to ``False``, only the master capable nodes
-  are considered potential master candidates, as it is not possible to
-  make them master candidates via RAPI at all.
-
-Note that when the rapi flag is changed, the state of the
-``ganeti_pub_keys`` file on all nodes  has to be updated accordingly.
-This should be done in the client script ``gnt_cluster`` before the
-RPC call to update the configuration is made, because this way, if
-someone would try to perform that RPC call on master to trick it into
-thinking that the flag is enabled, this would not help as the content of
-the ``ganeti_pub_keys`` file is a crucial part in the design of the
-distribution of the SSH keys.
-
-Note: One could think of always allowing to disable the master-capability
-via RAPI and just restrict the enabling of it, thus making it possible
-to RAPI-"freeze" the nodes' master-capability state once it disabled.
-However, we think these are rather confusing semantics of the involved
-flags and thus we go with proposed design.
-
-Note that this change will break RAPI compatibility, at least if the
-rapi flag is not explicitely set to ``True``. We made this choice to
-have the more secure option as default, because otherwise it is
-unlikely to be widely used.
+Alternatively, there was the idea of adding a flag to the cluster config
+that would 'freeze' the ``master_capable`` state of nodes. This turned
+out to be infeasible, as promoting a node from not ``master_capable``
+to ``master_capable`` would mean to add the node's key to the
+``ganeti_pub_keys`` file. Due to security reasons, this needed to be
+done in the client (similar to when adding a node). That would have
+meant that it would no longer be possible to set this flag via RAPI. As
+setting this flag via RAPI is a feature our users depend on and that
+has been available in the past, we refrain from breaking this feature.
 
 
 Cluster initialization
diff --git a/doc/design-repaird.rst b/doc/design-repaird.rst
index 6dad3e7..67fe45b 100644
--- a/doc/design-repaird.rst
+++ b/doc/design-repaird.rst
@@ -189,7 +189,7 @@
 Returns a list of all non-cleared incidents. Each incident is reported
 as a JSON object with at least the following information.
 
-- ``id`` The unique identifier assigned to the event.
+- ``uuid`` The unique identifier assigned to the event.
 
 - ``node`` The UUID of the node on which the even was observed.
 
diff --git a/doc/examples/ganeti.default b/doc/examples/ganeti.default
index 49b7d8a..f0649a2 100644
--- a/doc/examples/ganeti.default
+++ b/doc/examples/ganeti.default
@@ -5,3 +5,4 @@
 MOND_ARGS=""
 WCONFD_ARGS=""
 LUXID_ARGS=""
+MAINTD_ARGS=""
diff --git a/doc/examples/ganeti.default-debug b/doc/examples/ganeti.default-debug
index 00dece4..249f3fa 100644
--- a/doc/examples/ganeti.default-debug
+++ b/doc/examples/ganeti.default-debug
@@ -5,3 +5,4 @@
 MOND_ARGS="-d"
 WCONFD_ARGS="-d"
 LUXID_ARGS="-d"
+MAINTD_ARGS="-d"
diff --git a/doc/examples/systemd/ganeti-maintd.service.in b/doc/examples/systemd/ganeti-maintd.service.in
new file mode 100644
index 0000000..f7e906e
--- /dev/null
+++ b/doc/examples/systemd/ganeti-maintd.service.in
@@ -0,0 +1,18 @@
+[Unit]
+Description = Ganeti maintenance daemon (maintd)
+Documentation = man:ganeti-maintd(8)
+Requires = ganeti-common.service
+After = ganeti-common.service
+PartOf = ganeti-master.target
+ConditionPathExists = @LOCALSTATEDIR@/lib/ganeti/config.data
+
+[Service]
+Type = simple
+User = @GNTMONDUSER@
+Group = @GNTMONDGROUP@
+ExecStart = @SBINDIR@/ganeti-maintd -f
+Restart = on-failure
+SuccessExitStatus = 0 11
+
+[Install]
+WantedBy = ganeti-master.target ganeti.target
diff --git a/doc/hooks.rst b/doc/hooks.rst
index de794bb..667906b 100644
--- a/doc/hooks.rst
+++ b/doc/hooks.rst
@@ -1,7 +1,7 @@
 Ganeti customisation using hooks
 ================================
 
-Documents Ganeti version 2.16
+Documents Ganeti version 2.17
 
 .. contents::
 
@@ -9,7 +9,8 @@
 ------------
 
 In order to allow customisation of operations, Ganeti runs scripts in
-sub-directories of ``@SYSCONFDIR@/ganeti/hooks``. These sub-directories
+sub-directories of ``@SYSCONFDIR@/ganeti/hooks`` (that is usually
+``/etc/ganeti/hooks``). These sub-directories
 are named ``$hook-$phase.d``, where ``$phase`` is either ``pre`` or
 ``post`` and ``$hook`` matches the directory name given for a hook (e.g.
 ``cluster-verify-post.d`` or ``node-add-pre.d``).
@@ -17,6 +18,10 @@
 This is similar to the ``/etc/network/`` structure present in Debian
 for network interface handling.
 
+Note that Ganeti does not create its ``hooks`` directory by default.
+If you want to use hooks scripts, create it on all nodes. This applies
+also to all sub directories such as ``node-add-pre.d``.
+
 Organisation
 ------------
 
@@ -31,6 +36,11 @@
 Note that, even though we call them scripts, we are actually talking
 about any executable.
 
+The filenames of the scripts need to match the regular expression
+``^[a-zA-Z0-9_-]+$``. This means in particular, that scripts having
+a filename extension (such as ``myhook.sh``) are silently ignored
+by Ganeti.
+
 *pre* scripts
 ~~~~~~~~~~~~~
 
diff --git a/doc/iallocator.rst b/doc/iallocator.rst
index 406f52a..5e59857 100644
--- a/doc/iallocator.rst
+++ b/doc/iallocator.rst
@@ -1,7 +1,7 @@
 Ganeti automatic instance allocation
 ====================================
 
-Documents Ganeti version 2.16
+Documents Ganeti version 2.17
 
 .. contents::
 
diff --git a/doc/index.rst b/doc/index.rst
index 225c88f..a8b3fba 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -82,6 +82,7 @@
    design-2.14.rst
    design-2.15.rst
    design-2.16.rst
+   design-2.17.rst
 
 Draft designs
 -------------
@@ -118,11 +119,14 @@
    design-location.rst
    design-linuxha.rst
    design-lu-generated-jobs.rst
+   design-memory-over-commitment.rst
+   design-migration-speed-hbal.rst
    design-monitoring-agent.rst
    design-move-instance-improvements.rst
    design-multi-reloc.rst
    design-multi-version-tests.rst
    design-network.rst
+   design-n-m-redundancy.rst
    design-node-add.rst
    design-node-security.rst
    design-oob.rst
@@ -137,6 +141,7 @@
    design-query2.rst
    design-query-splitting.rst
    design-reason-trail.rst
+   design-repaird.rst
    design-restricted-commands.rst
    design-shared-storage.rst
    design-shared-storage-redundancy.rst
diff --git a/doc/rapi.rst b/doc/rapi.rst
index dc7784a..d6cab78 100644
--- a/doc/rapi.rst
+++ b/doc/rapi.rst
@@ -232,7 +232,8 @@
                                      constants.ISPECS_STD,
                                      constants.IPOLICY_DTS,
                                      constants.IPOLICY_VCPU_RATIO,
-                                     constants.IPOLICY_SPINDLE_RATIO])
+                                     constants.IPOLICY_SPINDLE_RATIO,
+                                     constants.IPOLICY_MEMORY_RATIO])
 
 
 .. pyassert::
@@ -280,6 +281,8 @@
   Maximum ratio of virtual to physical CPUs (`float`)
 :pyeval:`constants.IPOLICY_SPINDLE_RATIO`
   Maximum ratio of instances to their node's ``spindle_count`` (`float`)
+:pyeval:`constants.IPOLICY_MEMORY_RATIO`
+  Maximum ratio of memory overcommitment (`float`)
 
 Usage examples
 --------------
diff --git a/doc/security.rst b/doc/security.rst
index ea86a8a..ff3ac0d 100644
--- a/doc/security.rst
+++ b/doc/security.rst
@@ -1,7 +1,7 @@
 Security in Ganeti
 ==================
 
-Documents Ganeti version 2.16
+Documents Ganeti version 2.17
 
 Ganeti was developed to run on internal, trusted systems. As such, the
 security model is all-or-nothing.
diff --git a/doc/virtual-cluster.rst b/doc/virtual-cluster.rst
index 7213a80..e4614fd 100644
--- a/doc/virtual-cluster.rst
+++ b/doc/virtual-cluster.rst
@@ -1,7 +1,7 @@
 Virtual cluster support
 =======================
 
-Documents Ganeti version 2.16
+Documents Ganeti version 2.17
 
 .. contents::
 
diff --git a/lib/backend.py b/lib/backend.py
index 51273f6..e597410 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -1463,7 +1463,9 @@
                   pub_key_file=pathutils.SSH_PUB_KEYS,
                   ssconf_store=None,
                   noded_cert_file=pathutils.NODED_CERT_FILE,
-                  run_cmd_fn=ssh.RunSshCmdWithStdin):
+                  run_cmd_fn=ssh.RunSshCmdWithStdin,
+                  ssh_update_debug=False,
+                  ssh_update_verbose=False):
   """Distributes a node's public SSH key across the cluster.
 
   Note that this function should only be executed on the master node, which
@@ -1499,7 +1501,9 @@
                            pub_key_file=pub_key_file,
                            ssconf_store=ssconf_store,
                            noded_cert_file=noded_cert_file,
-                           run_cmd_fn=run_cmd_fn)
+                           run_cmd_fn=run_cmd_fn,
+                           ssh_update_debug=ssh_update_debug,
+                           ssh_update_verbose=ssh_update_verbose)
 
 
 # Node info named tuple specifically for the use with AddNodeSshKeyBulk
@@ -1517,7 +1521,9 @@
                       pub_key_file=pathutils.SSH_PUB_KEYS,
                       ssconf_store=None,
                       noded_cert_file=pathutils.NODED_CERT_FILE,
-                      run_cmd_fn=ssh.RunSshCmdWithStdin):
+                      run_cmd_fn=ssh.RunSshCmdWithStdin,
+                      ssh_update_debug=False,
+                      ssh_update_verbose=False):
   """Distributes a node's public SSH key across the cluster.
 
   Note that this function should only be executed on the master node, which
@@ -1595,13 +1601,14 @@
         (constants.SSHS_OVERRIDE, all_keys)
 
       try:
+        backoff = 5  # seconds
         utils.RetryByNumberOfTimes(
-            constants.SSHS_MAX_RETRIES,
+            constants.SSHS_MAX_RETRIES, backoff,
             errors.SshUpdateError,
             run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
             ssh_port_map.get(node_info.name), node_data,
-            debug=False, verbose=False, use_cluster_key=False,
-            ask_key=False, strict_host_check=False)
+            debug=ssh_update_debug, verbose=ssh_update_verbose,
+            use_cluster_key=False, ask_key=False, strict_host_check=False)
       except errors.SshUpdateError as e:
         # Clean up the master's public key file if adding key fails
         if node_info.to_public_keys:
@@ -1641,13 +1648,13 @@
     if node in potential_master_candidates:
       logging.debug("Updating SSH key files of node '%s'.", node)
       try:
+        backoff = 5  # seconds
         utils.RetryByNumberOfTimes(
-            constants.SSHS_MAX_RETRIES,
-            errors.SshUpdateError,
+            constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
             run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
             ssh_port_map.get(node), pot_mc_data,
-            debug=False, verbose=False, use_cluster_key=False,
-            ask_key=False, strict_host_check=False)
+            debug=ssh_update_debug, verbose=ssh_update_verbose,
+            use_cluster_key=False, ask_key=False, strict_host_check=False)
       except errors.SshUpdateError as last_exception:
         error_msg = ("When adding the key of node '%s', updating SSH key"
                      " files of node '%s' failed after %s retries."
@@ -1663,12 +1670,15 @@
       if to_authorized_keys:
         run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
                    ssh_port_map.get(node), base_data,
-                   debug=False, verbose=False, use_cluster_key=False,
-                   ask_key=False, strict_host_check=False)
+                   debug=ssh_update_debug, verbose=ssh_update_verbose,
+                   use_cluster_key=False, ask_key=False,
+                   strict_host_check=False)
 
   return node_errors
 
 
+# TODO: will be fixed with pending patch series.
+# pylint: disable=R0913
 def RemoveNodeSshKey(node_uuid, node_name,
                      master_candidate_uuids,
                      potential_master_candidates,
@@ -1682,7 +1692,9 @@
                      ssconf_store=None,
                      noded_cert_file=pathutils.NODED_CERT_FILE,
                      readd=False,
-                     run_cmd_fn=ssh.RunSshCmdWithStdin):
+                     run_cmd_fn=ssh.RunSshCmdWithStdin,
+                     ssh_update_debug=False,
+                     ssh_update_verbose=False):
   """Removes the node's SSH keys from the key files and distributes those.
 
   Note that at least one of the flags C{from_authorized_keys},
@@ -1736,7 +1748,9 @@
                               ssconf_store=ssconf_store,
                               noded_cert_file=noded_cert_file,
                               readd=readd,
-                              run_cmd_fn=run_cmd_fn)
+                              run_cmd_fn=run_cmd_fn,
+                              ssh_update_debug=ssh_update_debug,
+                              ssh_update_verbose=ssh_update_verbose)
 
 
 # Node info named tuple specifically for the use with RemoveNodeSshKeyBulk
@@ -1759,7 +1773,9 @@
                          ssconf_store=None,
                          noded_cert_file=pathutils.NODED_CERT_FILE,
                          readd=False,
-                         run_cmd_fn=ssh.RunSshCmdWithStdin):
+                         run_cmd_fn=ssh.RunSshCmdWithStdin,
+                         ssh_update_debug=False,
+                         ssh_update_verbose=False):
   """Removes the node's SSH keys from the key files and distributes those.
 
   Note that at least one of the flags C{from_authorized_keys},
@@ -1902,13 +1918,13 @@
           logging.debug("Updating key setup of potential master candidate node"
                         " %s.", node)
           try:
+            backoff = 5  # seconds
             utils.RetryByNumberOfTimes(
-                constants.SSHS_MAX_RETRIES,
-                errors.SshUpdateError,
+                constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
                 run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
                 ssh_port, pot_mc_data,
-                debug=False, verbose=False, use_cluster_key=False,
-                ask_key=False, strict_host_check=False)
+                debug=ssh_update_debug, verbose=ssh_update_verbose,
+                use_cluster_key=False, ask_key=False, strict_host_check=False)
           except errors.SshUpdateError as last_exception:
             error_msg = error_msg_final % (
                 node_info.name, node, last_exception)
@@ -1919,13 +1935,13 @@
           if from_authorized_keys:
             logging.debug("Updating key setup of normal node %s.", node)
             try:
+              backoff = 5  # seconds
               utils.RetryByNumberOfTimes(
-                  constants.SSHS_MAX_RETRIES,
-                  errors.SshUpdateError,
+                  constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
                   run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
                   ssh_port, base_data,
-                  debug=False, verbose=False, use_cluster_key=False,
-                  ask_key=False, strict_host_check=False)
+                  debug=ssh_update_debug, verbose=ssh_update_verbose,
+                  use_cluster_key=False, ask_key=False, strict_host_check=False)
             except errors.SshUpdateError as last_exception:
               error_msg = error_msg_final % (
                   node_info.name, node, last_exception)
@@ -1973,13 +1989,14 @@
       logging.debug("Updating SSH key setup of target node '%s'.",
                     node_info.name)
       try:
+        backoff = 5  # seconds
         utils.RetryByNumberOfTimes(
-            constants.SSHS_MAX_RETRIES,
+            constants.SSHS_MAX_RETRIES, backoff,
             errors.SshUpdateError,
             run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
             ssh_port, data,
-            debug=False, verbose=False, use_cluster_key=False,
-            ask_key=False, strict_host_check=False)
+            debug=ssh_update_debug, verbose=ssh_update_verbose,
+            use_cluster_key=False, ask_key=False, strict_host_check=False)
       except errors.SshUpdateError as last_exception:
         result_msgs.append(
             (node_info.name,
@@ -1992,18 +2009,52 @@
       ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
 
   return result_msgs
+# pylint: enable=R0913
 
 
-def _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map, ssh_key_type,
-                        ssh_key_bits, pub_key_file=pathutils.SSH_PUB_KEYS,
+def RemoveSshKeyFromPublicKeyFile(node_name,
+                                  pub_key_file=pathutils.SSH_PUB_KEYS,
+                                  ssconf_store=None):
+  """Removes a stray SSH key from the master's public key file.
+
+  Only used to clean up after failed operations (for example failed hooks
+  before adding a node). To prevent abuse of this function (and the matching
+  RPC call), only keys of nodes not in the cluster (anymore) can be removed.
+
+  @type node_name: string
+  @param node_name: the name of the node whose key is removed
+  @param pub_key_file: public key file the key is removed from
+  @raise errors.SshUpdateError: if the node still belongs to the cluster
+
+  """
+  if not ssconf_store:
+    ssconf_store = ssconf.SimpleStore()
+
+  node_list = ssconf_store.GetNodeList()
+
+  if node_name in node_list:
+    raise errors.SshUpdateError("Cannot remove key of node '%s',"
+                                " because it still belongs to the cluster."
+                                % node_name)
+
+  keys_by_name = ssh.QueryPubKeyFile([node_name], key_file=pub_key_file)
+  if not keys_by_name or node_name not in keys_by_name:
+    logging.info("The node '%s' whose key is supposed to be removed does not"
+                 " have an entry in the public key file. Hence, there is"
+                 " nothing left to do.", node_name)
+  else:
+    ssh.RemovePublicKey(node_name, key_file=pub_key_file)
+
+
+def _GenerateNodeSshKey(node_name, ssh_port_map, ssh_key_type, ssh_key_bits,
                         ssconf_store=None,
                         noded_cert_file=pathutils.NODED_CERT_FILE,
                         run_cmd_fn=ssh.RunSshCmdWithStdin,
-                        suffix=""):
+                        suffix="",
+                        ssh_update_debug=False,
+                        ssh_update_verbose=False):
   """Generates the root SSH key pair on the node.
 
-  @type node_uuid: str
-  @param node_uuid: UUID of the node whose key is removed
   @type node_name: str
   @param node_name: name of the node whose key is remove
   @type ssh_port_map: dict of str to int
@@ -2017,12 +2068,6 @@
   if not ssconf_store:
     ssconf_store = ssconf.SimpleStore()
 
-  keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
-  if not keys_by_uuid or node_uuid not in keys_by_uuid:
-    raise errors.SshUpdateError("Node %s (UUID: %s) whose key is requested to"
-                                " be regenerated is not registered in the"
-                                " public keys file." % (node_name, node_uuid))
-
   data = {}
   _InitSshUpdateData(data, noded_cert_file, ssconf_store)
   cluster_name = data[constants.SSHS_CLUSTER_NAME]
@@ -2030,8 +2075,8 @@
 
   run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
              ssh_port_map.get(node_name), data,
-             debug=False, verbose=False, use_cluster_key=False,
-             ask_key=False, strict_host_check=False)
+             debug=ssh_update_debug, verbose=ssh_update_verbose,
+             use_cluster_key=False, ask_key=False, strict_host_check=False)
 
 
 def _GetMasterNodeUUID(node_uuid_name_map, master_node_name):
@@ -2055,58 +2100,15 @@
   return old_master_keys_by_uuid
 
 
-def _GetNewMasterKey(root_keyfiles, master_node_uuid):
-  new_master_keys = []
-  for (_, (_, public_key_file)) in root_keyfiles.items():
-    public_key_dir = os.path.dirname(public_key_file)
-    public_key_file_tmp_filename = \
-        os.path.splitext(os.path.basename(public_key_file))[0] \
-        + constants.SSHS_MASTER_SUFFIX + ".pub"
-    public_key_path_tmp = os.path.join(public_key_dir,
-                                       public_key_file_tmp_filename)
-    if os.path.exists(public_key_path_tmp):
-      # for some key types, there might not be any keys
-      key = utils.ReadFile(public_key_path_tmp)
-      new_master_keys.append(key)
-  if not new_master_keys:
-    raise errors.SshUpdateError("Cannot find any type of temporary SSH key.")
-  return {master_node_uuid: new_master_keys}
-
-
-def _ReplaceMasterKeyOnMaster(root_keyfiles):
-  number_of_moves = 0
-  for (_, (private_key_file, public_key_file)) in root_keyfiles.items():
-    key_dir = os.path.dirname(public_key_file)
-    private_key_file_tmp = \
-      os.path.basename(private_key_file) + constants.SSHS_MASTER_SUFFIX
-    public_key_file_tmp = private_key_file_tmp + ".pub"
-    private_key_path_tmp = os.path.join(key_dir,
-                                        private_key_file_tmp)
-    public_key_path_tmp = os.path.join(key_dir,
-                                       public_key_file_tmp)
-    if os.path.exists(public_key_file):
-      utils.CreateBackup(public_key_file)
-      utils.RemoveFile(public_key_file)
-    if os.path.exists(private_key_file):
-      utils.CreateBackup(private_key_file)
-      utils.RemoveFile(private_key_file)
-    if os.path.exists(public_key_path_tmp) and \
-        os.path.exists(private_key_path_tmp):
-      # for some key types, there might not be any keys
-      shutil.move(public_key_path_tmp, public_key_file)
-      shutil.move(private_key_path_tmp, private_key_file)
-      number_of_moves += 1
-  if not number_of_moves:
-    raise errors.SshUpdateError("Could not move at least one master SSH key.")
-
-
 def RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
                  potential_master_candidates, old_key_type, new_key_type,
                  new_key_bits,
                  ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS,
                  ssconf_store=None,
                  noded_cert_file=pathutils.NODED_CERT_FILE,
-                 run_cmd_fn=ssh.RunSshCmdWithStdin):
+                 run_cmd_fn=ssh.RunSshCmdWithStdin,
+                 ssh_update_debug=False,
+                 ssh_update_verbose=False):
   """Renews all SSH keys and updates authorized_keys and ganeti_pub_keys.
 
   @type node_uuids: list of str
@@ -2144,11 +2146,9 @@
     raise errors.ProgrammerError("List of nodes UUIDs and node names"
                                  " does not match in length.")
 
-  (_, root_keyfiles) = \
-    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
-  (_, old_pub_keyfile) = root_keyfiles[old_key_type]
-  (_, new_pub_keyfile) = root_keyfiles[new_key_type]
-  old_master_key = utils.ReadFile(old_pub_keyfile)
+  old_pub_keyfile = ssh.GetSshPubKeyFilename(old_key_type)
+  new_pub_keyfile = ssh.GetSshPubKeyFilename(new_key_type)
+  old_master_key = ssh.ReadLocalSshPubKeys([old_key_type])
 
   node_uuid_name_map = zip(node_uuids, node_names)
 
@@ -2179,20 +2179,13 @@
     node_list.append((node_uuid, node_name, master_candidate,
                       potential_master_candidate))
 
-    keys_by_uuid = ssh.QueryPubKeyFile([node_uuid],
-                                       key_file=ganeti_pub_keys_file)
-    if not keys_by_uuid:
-      raise errors.SshUpdateError("No public key of node %s (UUID %s) found,"
-                                  " not generating a new key."
-                                  % (node_name, node_uuid))
-
     if master_candidate:
       logging.debug("Fetching old SSH key from node '%s'.", node_name)
-      old_pub_key = ssh.ReadRemoteSshPubKeys(old_pub_keyfile,
-                                             node_name, cluster_name,
-                                             ssh_port_map[node_name],
-                                             False, # ask_key
-                                             False) # key_check
+      old_pub_key = ssh.ReadRemoteSshPubKey(old_pub_keyfile,
+                                            node_name, cluster_name,
+                                            ssh_port_map[node_name],
+                                            False, # ask_key
+                                            False) # key_check
       if old_pub_key != old_master_key:
         # If we are already in a multi-key setup (that is past Ganeti 2.12),
         # we can safely remove the old key of the node. Otherwise, we cannot
@@ -2216,7 +2209,13 @@
         node_info_to_remove,
         master_candidate_uuids,
         potential_master_candidates,
-        master_uuid=master_node_uuid)
+        master_uuid=master_node_uuid,
+        pub_key_file=ganeti_pub_keys_file,
+        ssconf_store=ssconf_store,
+        noded_cert_file=noded_cert_file,
+        run_cmd_fn=run_cmd_fn,
+        ssh_update_debug=ssh_update_debug,
+        ssh_update_verbose=ssh_update_verbose)
     if node_errors:
       all_node_errors = all_node_errors + node_errors
 
@@ -2224,19 +2223,20 @@
       in node_list:
 
     logging.debug("Generating new SSH key for node '%s'.", node_name)
-    _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map, new_key_type,
-                        new_key_bits, pub_key_file=ganeti_pub_keys_file,
+    _GenerateNodeSshKey(node_name, ssh_port_map, new_key_type, new_key_bits,
                         ssconf_store=ssconf_store,
                         noded_cert_file=noded_cert_file,
-                        run_cmd_fn=run_cmd_fn)
+                        run_cmd_fn=run_cmd_fn,
+                        ssh_update_verbose=ssh_update_verbose,
+                        ssh_update_debug=ssh_update_debug)
 
     try:
       logging.debug("Fetching newly created SSH key from node '%s'.", node_name)
-      pub_key = ssh.ReadRemoteSshPubKeys(new_pub_keyfile,
-                                         node_name, cluster_name,
-                                         ssh_port_map[node_name],
-                                         False, # ask_key
-                                         False) # key_check
+      pub_key = ssh.ReadRemoteSshPubKey(new_pub_keyfile,
+                                        node_name, cluster_name,
+                                        ssh_port_map[node_name],
+                                        False, # ask_key
+                                        False) # key_check
     except:
       raise errors.SshUpdateError("Could not fetch key of node %s"
                                   " (UUID %s)" % (node_name, node_uuid))
@@ -2256,7 +2256,9 @@
       node_keys_to_add, potential_master_candidates,
       pub_key_file=ganeti_pub_keys_file, ssconf_store=ssconf_store,
       noded_cert_file=noded_cert_file,
-      run_cmd_fn=run_cmd_fn)
+      run_cmd_fn=run_cmd_fn,
+      ssh_update_debug=ssh_update_debug,
+      ssh_update_verbose=ssh_update_verbose)
   if node_errors:
     all_node_errors = all_node_errors + node_errors
 
@@ -2268,19 +2270,21 @@
 
   # Generate a new master key with a suffix, don't touch the old one for now
   logging.debug("Generate new ssh key of master.")
-  _GenerateNodeSshKey(master_node_uuid, master_node_name, ssh_port_map,
+  _GenerateNodeSshKey(master_node_name, ssh_port_map,
                       new_key_type, new_key_bits,
-                      pub_key_file=ganeti_pub_keys_file,
                       ssconf_store=ssconf_store,
                       noded_cert_file=noded_cert_file,
                       run_cmd_fn=run_cmd_fn,
-                      suffix=constants.SSHS_MASTER_SUFFIX)
+                      suffix=constants.SSHS_MASTER_SUFFIX,
+                      ssh_update_debug=ssh_update_debug,
+                      ssh_update_verbose=ssh_update_verbose)
   # Read newly created master key
-  new_master_key_dict = _GetNewMasterKey(root_keyfiles, master_node_uuid)
+  new_master_keys = ssh.ReadLocalSshPubKeys(
+      [new_key_type], suffix=constants.SSHS_MASTER_SUFFIX)
 
   # Replace master key in the master nodes' public key file
   ssh.RemovePublicKey(master_node_uuid, key_file=ganeti_pub_keys_file)
-  for pub_key in new_master_key_dict[master_node_uuid]:
+  for pub_key in new_master_keys:
     ssh.AddPublicKey(master_node_uuid, pub_key, key_file=ganeti_pub_keys_file)
 
   # Add new master key to all node's public and authorized keys
@@ -2290,12 +2294,15 @@
       to_authorized_keys=True, to_public_keys=True,
       get_public_keys=False, pub_key_file=ganeti_pub_keys_file,
       ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
-      run_cmd_fn=run_cmd_fn)
+      run_cmd_fn=run_cmd_fn,
+      ssh_update_debug=ssh_update_debug,
+      ssh_update_verbose=ssh_update_verbose)
   if node_errors:
     all_node_errors = all_node_errors + node_errors
 
   # Remove the old key file and rename the new key to the non-temporary filename
-  _ReplaceMasterKeyOnMaster(root_keyfiles)
+  ssh.ReplaceSshKeys(new_key_type, new_key_type,
+                     src_key_suffix=constants.SSHS_MASTER_SUFFIX)
 
   # Remove old key from authorized keys
   (auth_key_file, _) = \
@@ -2310,7 +2317,13 @@
       potential_master_candidates,
       keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
       from_public_keys=False, clear_authorized_keys=False,
-      clear_public_keys=False)
+      clear_public_keys=False,
+      pub_key_file=ganeti_pub_keys_file,
+      ssconf_store=ssconf_store,
+      noded_cert_file=noded_cert_file,
+      run_cmd_fn=run_cmd_fn,
+      ssh_update_debug=ssh_update_debug,
+      ssh_update_verbose=ssh_update_verbose)
   if node_errors:
     all_node_errors = all_node_errors + node_errors
 
@@ -5775,18 +5788,25 @@
   return _verify_cmd(path, cmd)
 
 
-def RunRestrictedCmd(cmd,
-                     _lock_timeout=_RCMD_LOCK_TIMEOUT,
-                     _lock_file=pathutils.RESTRICTED_COMMANDS_LOCK_FILE,
-                     _path=pathutils.RESTRICTED_COMMANDS_DIR,
-                     _sleep_fn=time.sleep,
-                     _prepare_fn=_PrepareRestrictedCmd,
-                     _runcmd_fn=utils.RunCmd,
-                     _enabled=constants.ENABLE_RESTRICTED_COMMANDS):
-  """Executes a restricted command after performing strict tests.
+def RunConstrainedCmd(cmd,
+                      lock_file,
+                      path,
+                      inp=None,
+                      _lock_timeout=_RCMD_LOCK_TIMEOUT,
+                      _sleep_fn=time.sleep,
+                      _prepare_fn=_PrepareRestrictedCmd,
+                      _runcmd_fn=utils.RunCmd,
+                      _enabled=constants.ENABLE_RESTRICTED_COMMANDS):
+  """Executes a command after performing strict tests.
 
   @type cmd: string
   @param cmd: Command name
+  @type lock_file: string
+  @param lock_file: path to the lock file
+  @type path: string
+  @param path: path to the directory in which the command is present
+  @type inp: string
+  @param inp: Input to be passed to the command
   @rtype: string
   @return: Command output
   @raise RPCFail: In case of an error
@@ -5801,14 +5821,24 @@
   try:
     cmdresult = None
     try:
-      lock = utils.FileLock.Open(_lock_file)
+      lock = utils.FileLock.Open(lock_file)
       lock.Exclusive(blocking=True, timeout=_lock_timeout)
 
-      (status, value) = _prepare_fn(_path, cmd)
+      (status, value) = _prepare_fn(path, cmd)
 
       if status:
+        if inp:
+          input_fd = tempfile.TemporaryFile()
+          input_fd.write(inp)
+          input_fd.flush()
+          input_fd.seek(0)
+        else:
+          input_fd = None
         cmdresult = _runcmd_fn([value], env={}, reset_env=True,
-                               postfork_fn=lambda _: lock.Unlock())
+                               postfork_fn=lambda _: lock.Unlock(),
+                               input_fd=input_fd)
+        if input_fd:
+          input_fd.close()
       else:
         logging.error(value)
     except Exception: # pylint: disable=W0703
diff --git a/lib/bootstrap.py b/lib/bootstrap.py
index a824977..fc19a06 100644
--- a/lib/bootstrap.py
+++ b/lib/bootstrap.py
@@ -867,6 +867,7 @@
     default_nodegroup.uuid: default_nodegroup,
     }
   now = time.time()
+  maintenance = objects.Maintenance(serial_no=1, ctime=now, mtime=now)
   config_data = objects.ConfigData(version=version,
                                    cluster=cluster_config,
                                    nodegroups=nodegroups,
@@ -875,6 +876,7 @@
                                    networks={},
                                    disks={},
                                    filters={},
+                                   maintenance=maintenance,
                                    serial_no=1,
                                    ctime=now, mtime=now)
   utils.WriteFile(cfg_file,
@@ -934,6 +936,8 @@
     constants.NDS_CLUSTER_NAME: cluster_name,
     constants.NDS_NODE_DAEMON_CERTIFICATE:
       utils.ReadFile(pathutils.NODED_CERT_FILE),
+    constants.NDS_HMAC:
+      utils.ReadFile(pathutils.CONFD_HMAC_KEY),
     constants.NDS_SSCONF: ssconf.SimpleStore().ReadAll(),
     constants.NDS_START_NODE_DAEMON: True,
     constants.NDS_NODE_NAME: node,
diff --git a/lib/cli.py b/lib/cli.py
index 362f2ae..1a1815c 100644
--- a/lib/cli.py
+++ b/lib/cli.py
@@ -2894,6 +2894,7 @@
                           ipolicy_disk_templates=None,
                           ipolicy_vcpu_ratio=None,
                           ipolicy_spindle_ratio=None,
+                          ipolicy_memory_ratio=None,
                           group_ipolicy=False,
                           allowed_values=None,
                           fill_all=False):
@@ -2931,6 +2932,8 @@
     ipolicy_out[constants.IPOLICY_VCPU_RATIO] = ipolicy_vcpu_ratio
   if ipolicy_spindle_ratio is not None:
     ipolicy_out[constants.IPOLICY_SPINDLE_RATIO] = ipolicy_spindle_ratio
+  if ipolicy_memory_ratio is not None:
+    ipolicy_out[constants.IPOLICY_MEMORY_RATIO] = ipolicy_memory_ratio
 
   assert not (frozenset(ipolicy_out.keys()) - constants.IPOLICY_ALL_KEYS)
 
diff --git a/lib/cli_opts.py b/lib/cli_opts.py
index 9f4d530..73a2ca9 100644
--- a/lib/cli_opts.py
+++ b/lib/cli_opts.py
@@ -82,6 +82,7 @@
   "DST_NODE_OPT",
   "EARLY_RELEASE_OPT",
   "ENABLED_DATA_COLLECTORS_OPT",
+  "DIAGNOSE_DATA_COLLECTOR_FILENAME_OPT",
   "ENABLED_DISK_TEMPLATES_OPT",
   "ENABLED_HV_OPT",
   "ENABLED_USER_SHUTDOWN_OPT",
@@ -123,6 +124,7 @@
   "IGNORE_SOFT_ERRORS_OPT",
   "IGNORE_SIZE_OPT",
   "INCLUDEDEFAULTS_OPT",
+  "INPUT_OPT",
   "INSTALL_IMAGE_OPT",
   "INSTANCE_COMMUNICATION_NETWORK_OPT",
   "INSTANCE_COMMUNICATION_OPT",
@@ -134,8 +136,12 @@
   "IPOLICY_STD_SPECS_OPT",
   "IPOLICY_STD_SPECS_STR",
   "IPOLICY_VCPU_RATIO",
+  "IPOLICY_MEMORY_RATIO",
   "LONG_SLEEP_OPT",
   "MAC_PREFIX_OPT",
+  "MAINT_BALANCE_OPT",
+  "MAINT_BALANCE_THRESHOLD_OPT",
+  "MAINT_INTERVAL_OPT",
   "MAINTAIN_NODE_HEALTH_OPT",
   "MASTER_NETDEV_OPT",
   "MASTER_NETMASK_OPT",
@@ -807,6 +813,13 @@
                                    help=("The maximum allowed instances to"
                                          " spindle ratio"))
 
+IPOLICY_MEMORY_RATIO = cli_option("--ipolicy-memory-ratio",
+                                   dest="ipolicy_memory_ratio",
+                                   type="maybefloat", default=None,
+                                   help=("The maximum allowed used memory to"
+                                         " physical memory ratio (in terms of"
+                                         " memory overcommitment)"))
+
 HYPERVISOR_OPT = cli_option("-H", "--hypervisor-parameters", dest="hypervisor",
                             help="Hypervisor and hypervisor options, in the"
                             " format hypervisor:option=value,option=value,...",
@@ -1099,6 +1112,21 @@
                help="Comma-separated list of compression tools which are"
                     " allowed to be used by Ganeti in various operations")
 
+MAINT_INTERVAL_OPT = \
+  cli_option("--maintenance-interval", dest="maint_round_delay", type="int",
+             default=None, help="Minimal time in seconds, the maintenance"
+             " daemon waits between rounds")
+
+MAINT_BALANCE_OPT = \
+  cli_option("--auto-balance-cluster", dest="maint_balance", type="bool",
+             default=None, metavar=_YORNO, help="Whether the maintenance"
+             " daemon should balance the cluster")
+
+MAINT_BALANCE_THRESHOLD_OPT = \
+  cli_option("--auto-balance-threshold", dest="maint_balance_threshold",
+             type="float", default=None, metavar="CLUSTERSCORE",
+             help="Minimal gain for an auto-balancing step to be taken")
+
 VG_NAME_OPT = cli_option("--vg-name", dest="vg_name",
                          help=("Enables LVM and specifies the volume group"
                                " name (cluster-wide) for disk allocation"
@@ -1587,6 +1615,17 @@
                "in the format collector=bool, where collector is one of %s."
                % ", ".join(constants.DATA_COLLECTOR_NAMES))
 
+DIAGNOSE_DATA_COLLECTOR_FILENAME_OPT = \
+    cli_option("--diagnose-data-collector-filename",
+                         dest="diagnose_data_collector_filename",
+                         help=("Sets the file name of the script"
+                               " the diagnose data collector should run."
+                               " If this value is empty string, the collector"
+                               " will return a success value"
+                               " without running anything"),
+                         type="string")
+
+
 VERIFY_CLUTTER_OPT = cli_option(
     "--verify-ssh-clutter", default=False, dest="verify_clutter",
     help="Verify that Ganeti did not clutter"
@@ -1596,6 +1635,11 @@
     "--long-sleep", default=False, dest="long_sleep",
     help="Allow long shutdowns when backing up instances", action="store_true")
 
+INPUT_OPT = cli_option("--input", dest="input", default=None,
+                       help=("input to be passed as stdin"
+                             " to the repair command"),
+                       type="string")
+
 SSH_KEY_TYPE_OPT = \
     cli_option("--ssh-key-type", default=None,
                choices=list(constants.SSHK_ALL), dest="ssh_key_type",
@@ -1653,6 +1697,7 @@
   IPOLICY_DISK_TEMPLATES,
   IPOLICY_VCPU_RATIO,
   IPOLICY_SPINDLE_RATIO,
+  IPOLICY_MEMORY_RATIO,
   ]
 
 # instance policy split specs options
diff --git a/lib/client/gnt_cluster.py b/lib/client/gnt_cluster.py
index eb3bc9d..e3056b5 100644
--- a/lib/client/gnt_cluster.py
+++ b/lib/client/gnt_cluster.py
@@ -58,6 +58,7 @@
 from ganeti import ssh
 from ganeti import uidpool
 from ganeti import utils
+from ganeti import wconfd
 from ganeti.client import base
 
 
@@ -240,6 +241,7 @@
     ipolicy_disk_templates=opts.ipolicy_disk_templates,
     ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
     ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
+    ipolicy_memory_ratio=opts.ipolicy_memory_ratio,
     fill_all=True)
 
   if opts.candidate_pool_size is None:
@@ -1208,7 +1210,9 @@
         node_certificates=new_node_cert or new_cluster_cert,
         renew_ssh_keys=new_ssh_keys,
         ssh_key_type=ssh_key_type,
-        ssh_key_bits=ssh_key_bits)
+        ssh_key_bits=ssh_key_bits,
+        verbose=verbose,
+        debug=debug)
     SubmitOpCode(renew_op, cl=cl)
 
   ToStdout("All requested certificates and keys have been replaced."
@@ -1265,10 +1269,10 @@
 
   # get the key files of all non-master nodes
   for node in nonmaster_nodes:
-    pub_key = ssh.ReadRemoteSshPubKeys(pub_key_filename, node, cluster_name,
-                                       ssh_port_map[node],
-                                       options.ssh_key_check,
-                                       options.ssh_key_check)
+    pub_key = ssh.ReadRemoteSshPubKey(pub_key_filename, node, cluster_name,
+                                      ssh_port_map[node],
+                                      options.ssh_key_check,
+                                      options.ssh_key_check)
     ssh.AddPublicKey(node_uuid_map[node], pub_key, key_file=pub_key_file)
 
 
@@ -1388,6 +1392,7 @@
           opts.ipolicy_disk_templates is not None or
           opts.ipolicy_vcpu_ratio is not None or
           opts.ipolicy_spindle_ratio is not None or
+          opts.ipolicy_memory_ratio is not None or
           opts.modify_etc_hosts is not None or
           opts.file_storage_dir is not None or
           opts.install_image is not None or
@@ -1397,7 +1402,11 @@
           opts.compression_tools is not None or
           opts.shared_file_storage_dir is not None or
           opts.enabled_user_shutdown is not None or
+          opts.maint_round_delay is not None or
+          opts.maint_balance is not None or
+          opts.maint_balance_threshold is not None or
           opts.data_collector_interval or
+          opts.diagnose_data_collector_filename is not None or
           opts.enabled_data_collectors):
     ToStderr("Please give at least one of the parameters.")
     return 1
@@ -1441,6 +1450,7 @@
     ipolicy_disk_templates=opts.ipolicy_disk_templates,
     ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
     ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
+    ipolicy_memory_ratio=opts.ipolicy_memory_ratio,
     )
 
   mnh = opts.maintain_node_health
@@ -1542,8 +1552,12 @@
     shared_file_storage_dir=opts.shared_file_storage_dir,
     compression_tools=compression_tools,
     enabled_user_shutdown=opts.enabled_user_shutdown,
+    maint_round_delay=opts.maint_round_delay,
+    maint_balance=opts.maint_balance,
+    maint_balance_threshold=opts.maint_balance_threshold,
     enabled_data_collectors=enabled_data_collectors,
     data_collector_interval=data_collector_interval,
+    diagnose_data_collector_filename=opts.diagnose_data_collector_filename
     )
   return base.GetResult(None, opts, SubmitOrSend(op, opts))
 
@@ -1938,6 +1952,21 @@
     return _off_fn(opts, node_list, inst_map)
 
 
+def RemoveRepair(opts, args):
+  """Unconditionally remove a repair event
+
+  @param opts: the command line options selected by the user (ignored)
+  @type args: list
+  @param args: one element, the uuid of the event to remove
+  @rtype: int
+  @return: the desired exit code
+
+  """
+  uuid = args[0]
+  wconfd.Client().RmMaintdIncident(uuid)
+  return 0
+
+
 def _GetCreateCommand(info):
   buf = StringIO()
   buf.write("gnt-cluster init")
@@ -2556,7 +2585,9 @@
      INSTANCE_POLICY_OPTS +
      [GLOBAL_FILEDIR_OPT, GLOBAL_SHARED_FILEDIR_OPT, ZEROING_IMAGE_OPT,
       COMPRESSION_TOOLS_OPT] +
-     [ENABLED_DATA_COLLECTORS_OPT, DATA_COLLECTOR_INTERVAL_OPT],
+     [ENABLED_DATA_COLLECTORS_OPT, DATA_COLLECTOR_INTERVAL_OPT,
+      DIAGNOSE_DATA_COLLECTOR_FILENAME_OPT,
+      MAINT_INTERVAL_OPT, MAINT_BALANCE_OPT, MAINT_BALANCE_THRESHOLD_OPT],
     "[opts...]",
     "Alters the parameters of the cluster"),
   "renew-crypto": (
@@ -2586,6 +2617,9 @@
   "upgrade": (
     UpgradeGanetiCommand, ARGS_NONE, [TO_OPT, RESUME_OPT], "",
     "Upgrade (or downgrade) to a new Ganeti version"),
+  "remove-repair": (
+    RemoveRepair, [ArgUnknown()], [], "<uuid>",
+    "Remove a repair event from the list of pending events"),
   }
 
 
diff --git a/lib/client/gnt_group.py b/lib/client/gnt_group.py
index 5f44001..8b3c9e5 100644
--- a/lib/client/gnt_group.py
+++ b/lib/client/gnt_group.py
@@ -63,6 +63,7 @@
     minmax_ispecs=opts.ipolicy_bounds_specs,
     ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
     ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
+    ipolicy_memory_ratio=opts.ipolicy_memory_ratio,
     ipolicy_disk_templates=opts.ipolicy_disk_templates,
     group_ipolicy=True)
 
@@ -170,7 +171,8 @@
   allmods = [opts.ndparams, opts.alloc_policy, opts.diskparams, opts.hv_state,
              opts.disk_state, opts.ipolicy_bounds_specs,
              opts.ipolicy_vcpu_ratio, opts.ipolicy_spindle_ratio,
-             opts.diskparams, opts.ipolicy_disk_templates]
+             opts.ipolicy_memory_ratio, opts.diskparams,
+             opts.ipolicy_disk_templates]
   if allmods.count(None) == len(allmods):
     ToStderr("Please give at least one of the parameters.")
     return 1
@@ -190,6 +192,7 @@
     ipolicy_disk_templates=opts.ipolicy_disk_templates,
     ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
     ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
+    ipolicy_memory_ratio=opts.ipolicy_memory_ratio,
     group_ipolicy=True,
     allowed_values=[constants.VALUE_DEFAULT])
 
diff --git a/lib/client/gnt_node.py b/lib/client/gnt_node.py
index 972376d..59b7a77 100644
--- a/lib/client/gnt_node.py
+++ b/lib/client/gnt_node.py
@@ -250,9 +250,9 @@
                          strict_host_check=options.ssh_key_check)
 
   (_, pub_keyfile) = root_keyfiles[ssh_key_type]
-  pub_key = ssh.ReadRemoteSshPubKeys(pub_keyfile, node, cluster_name, ssh_port,
-                                     options.ssh_key_check,
-                                     options.ssh_key_check)
+  pub_key = ssh.ReadRemoteSshPubKey(pub_keyfile, node, cluster_name, ssh_port,
+                                    options.ssh_key_check,
+                                    options.ssh_key_check)
   # Unfortunately, we have to add the key with the node name rather than
   # the node's UUID here, because at this point, we do not have a UUID yet.
   # The entry will be corrected in noded later.
@@ -357,7 +357,9 @@
                          master_capable=opts.master_capable,
                          disk_state=disk_state,
                          hv_state=hv_state,
-                         node_setup=modify_ssh_setup)
+                         node_setup=modify_ssh_setup,
+                         verbose=opts.verbose,
+                         debug=opts.debug > 0)
   SubmitOpCode(op, opts=opts)
 
 
@@ -660,7 +662,9 @@
   @return: the desired exit code
 
   """
-  op = opcodes.OpNodeRemove(node_name=args[0])
+  op = opcodes.OpNodeRemove(node_name=args[0],
+                            debug=opts.debug > 0,
+                            verbose=opts.verbose)
   SubmitOpCode(op, opts=opts)
   return 0
 
@@ -1001,7 +1005,9 @@
                                auto_promote=opts.auto_promote,
                                powered=opts.node_powered,
                                hv_state=hv_state,
-                               disk_state=disk_state)
+                               disk_state=disk_state,
+                               verbose=opts.verbose,
+                               debug=opts.debug > 0)
 
   # even if here we process the result, we allow submit only
   result = SubmitOrSend(op, opts)
@@ -1054,6 +1060,19 @@
   return exit_code
 
 
+def RepairCommand(opts, args):
+  cl = GetClient()
+  if opts.input:
+    inp = opts.input.decode('string_escape')
+  else:
+    inp = None
+  op = opcodes.OpRepairCommand(command=args[0], node_name=args[1],
+                               input=inp)
+  result = SubmitOrSend(op, opts, cl=cl)
+  print result
+  return constants.EXIT_SUCCESS
+
+
 class ReplyStatus(object):
   """Class holding a reply status for synchronous confd clients.
 
@@ -1148,7 +1167,7 @@
      CAPAB_MASTER_OPT, CAPAB_VM_OPT, NODE_PARAMS_OPT, HV_STATE_OPT,
      DISK_STATE_OPT],
     "[-s ip] [--readd] [--no-ssh-key-check] [--force-join]"
-    " [--no-node-setup] [--verbose] [--network] <node_name>",
+    " [--no-node-setup] [--verbose] [--network] [--debug] <node_name>",
     "Add a node to the cluster"),
   "evacuate": (
     EvacuateNode, ARGS_ONE_NODE,
@@ -1194,7 +1213,7 @@
     [MC_OPT, DRAINED_OPT, OFFLINE_OPT,
      CAPAB_MASTER_OPT, CAPAB_VM_OPT, SECONDARY_IP_OPT,
      AUTO_PROMOTE_OPT, DRY_RUN_OPT, PRIORITY_OPT, NODE_PARAMS_OPT,
-     NODE_POWERED_OPT, HV_STATE_OPT, DISK_STATE_OPT],
+     NODE_POWERED_OPT, HV_STATE_OPT, DISK_STATE_OPT, VERBOSE_OPT],
     "<node_name>", "Alters the parameters of a node"),
   "powercycle": (
     PowercycleNode, ARGS_ONE_NODE,
@@ -1211,8 +1230,8 @@
     "on|off|cycle|status [nodes...]",
     "Change power state of node by calling out-of-band helper."),
   "remove": (
-    RemoveNode, ARGS_ONE_NODE, [DRY_RUN_OPT, PRIORITY_OPT],
-    "<node_name>", "Removes a node from the cluster"),
+    RemoveNode, ARGS_ONE_NODE, [DRY_RUN_OPT, PRIORITY_OPT, VERBOSE_OPT],
+    "[--verbose] [--debug] <node_name>", "Removes a node from the cluster"),
   "volumes": (
     ListVolumes, [ArgNode()],
     [NOHDR_OPT, SEP_OPT, USEUNITS_OPT, FIELDS_OPT, PRIORITY_OPT],
@@ -1263,6 +1282,10 @@
     [SYNC_OPT, PRIORITY_OPT] + SUBMIT_OPTS + [SHOW_MACHINE_OPT, NODEGROUP_OPT],
     "<command> <node_name> [<node_name>...]",
     "Executes a restricted command on node(s)"),
+  "repair-command": (
+    RepairCommand, [ArgUnknown(min=1, max=1), ArgNode(min=1, max=1)],
+    [SUBMIT_OPT, INPUT_OPT], "{--input <input>} <command> <node_name>",
+    "Executes a repair command on a node"),
   }
 
 #: dictionary with aliases for commands
diff --git a/lib/cmdlib/__init__.py b/lib/cmdlib/__init__.py
index 5fd9b8d..08d9616 100644
--- a/lib/cmdlib/__init__.py
+++ b/lib/cmdlib/__init__.py
@@ -126,7 +126,8 @@
 from ganeti.cmdlib.misc import \
   LUOobCommand, \
   LUExtStorageDiagnose, \
-  LURestrictedCommand
+  LURestrictedCommand, \
+  LURepairCommand
 from ganeti.cmdlib.test import \
   LUTestOsParams, \
   LUTestDelay, \
diff --git a/lib/cmdlib/base.py b/lib/cmdlib/base.py
index 57eb8d5..1e8b2d9 100644
--- a/lib/cmdlib/base.py
+++ b/lib/cmdlib/base.py
@@ -438,6 +438,30 @@
     # pylint: disable=W0613,R0201
     return lu_result
 
+  def HooksAbortCallBack(self, phase, feedback_fn, exception):
+    """Called when the hooks get aborted by an exception.
+
+    This method is called every time a hooks phase is aborted by an exception.
+    This exception is most likely of type C{errors.HooksAbort}. However, we
+    keep the design of this function broad enough to handle any kind of
+    exception.
+
+    The intended purpose of this call back is to run any action that is
+    necessary to bring the cluster back to a clean state from the point
+    in time before calling the hook.
+
+    @type phase: string
+    @param phase: one of L{constants.HOOKS_PHASE_POST} or
+        L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
+    @type feedback_fn: callable
+    @param feedback_fn: function used to send feedback back to the caller
+    @type exception: Exception
+    @param exception: The exception that was raised during the execution of
+    hooks.
+
+    """
+    pass
+
   def _ExpandAndLockInstance(self, allow_forthcoming=False):
     """Helper function to expand and lock an instance.
 
diff --git a/lib/cmdlib/cluster/__init__.py b/lib/cmdlib/cluster/__init__.py
index 43df844..74d109c 100644
--- a/lib/cmdlib/cluster/__init__.py
+++ b/lib/cmdlib/cluster/__init__.py
@@ -191,7 +191,9 @@
       potential_master_candidates,
       cluster_info.ssh_key_type, # Old key type
       self.ssh_key_type,         # New key type
-      self.ssh_key_bits)         # New key bits
+      self.ssh_key_bits,         # New key bits
+      self.op.debug,
+      self.op.verbose)
     result[master_uuid].Raise("Could not renew the SSH keys of all nodes")
 
     # After the keys have been successfully swapped, time to commit the change
@@ -1477,6 +1479,20 @@
         feedback_fn("Cluster LVM configuration already in desired"
                     " state, not changing")
 
+  def _SetDiagnoseDataCollectorFilename(self, feedback_fn):
+    """Determines and sets the filename of the script
+    diagnose data collector should run.
+
+    """
+    if self.op.diagnose_data_collector_filename is not None:
+      fn = self.op.diagnose_data_collector_filename
+      if fn != self.cfg.GetDiagnoseDataCollectorFilename():
+        self.cfg.SetDiagnoseDataCollectorFilename(fn)
+      else:
+        feedback_fn("Diagnose data collector filename"
+                    " configuration already in desired"
+                    " state, not changing")
+
   def _SetFileStorageDir(self, feedback_fn):
     """Set the file storage directory.
 
@@ -1644,6 +1660,7 @@
     self._SetSharedFileStorageDir(feedback_fn)
     self.cfg.Update(self.cluster, feedback_fn)
     self._SetDrbdHelper(feedback_fn)
+    self._SetDiagnoseDataCollectorFilename(feedback_fn)
 
     # re-read the fresh configuration again
     self.cluster = self.cfg.GetClusterInfo()
@@ -1822,6 +1839,15 @@
     if self.op.compression_tools is not None:
       self.cfg.SetCompressionTools(self.op.compression_tools)
 
+    if self.op.maint_round_delay is not None:
+      self.cfg.SetMaintdRoundDelay(self.op.maint_round_delay)
+
+    if self.op.maint_balance is not None:
+      self.cfg.SetMaintdBalance(self.op.maint_balance)
+
+    if self.op.maint_balance_threshold is not None:
+      self.cfg.SetMaintdBalanceThreshold(self.op.maint_balance_threshold)
+
     network_name = self.op.instance_communication_network
     if network_name is not None:
       return self._ModifyInstanceCommunicationNetwork(self.cfg,
diff --git a/lib/cmdlib/cluster/verify.py b/lib/cmdlib/cluster/verify.py
index 8785fbc..76809b6 100644
--- a/lib/cmdlib/cluster/verify.py
+++ b/lib/cmdlib/cluster/verify.py
@@ -388,6 +388,8 @@
     @ivar sbp: dictionary of {primary-node: list of instances} for all
         instances for which this node is secondary (config)
     @ivar mfree: free memory, as reported by hypervisor (runtime)
+    @ivar mtotal: total memory, as reported by hypervisor (runtime)
+    @ivar mdom0: domain0 memory, as reported by hypervisor (runtime)
     @ivar dfree: free disk, as reported by the node (runtime)
     @ivar offline: the offline status (config)
     @type rpc_fail: boolean
@@ -419,6 +421,8 @@
       self.sinst = []
       self.sbp = {}
       self.mfree = 0
+      self.mtotal = 0
+      self.mdom0 = 0
       self.dfree = 0
       self.offline = offline
       self.vm_capable = vm_capable
@@ -985,6 +989,10 @@
 
     """
     cluster_info = self.cfg.GetClusterInfo()
+    ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster_info,
+                                                            self.group_info)
+    memory_ratio = ipolicy[constants.IPOLICY_MEMORY_RATIO]
+
     for node_uuid, n_img in node_image.items():
       # This code checks that every node which is now listed as
       # secondary has enough memory to host all instances it is
@@ -994,8 +1002,9 @@
       # WARNING: we currently take into account down instances as well
       # as up ones, considering that even if they're down someone
       # might want to start them even in the event of a node failure.
+      node_cfg = self.all_node_info[node_uuid]
       if n_img.offline or \
-         self.all_node_info[node_uuid].group != self.group_uuid:
+         node_cfg.group != self.group_uuid:
         # we're skipping nodes marked offline and nodes in other groups from
         # the N+1 warning, since most likely we don't have good memory
         # information from them; we already list instances living on such
@@ -1008,7 +1017,13 @@
           bep = cluster_info.FillBE(all_insts[inst_uuid])
           if bep[constants.BE_AUTO_BALANCE]:
             needed_mem += bep[constants.BE_MINMEM]
-        test = n_img.mfree < needed_mem
+        mnode = n_img.mdom0
+        (hv, hv_state) = self.cfg.GetFilledHvStateParams(node_cfg).items()[0]
+        if hv != constants.HT_XEN_PVM and hv != constants.HT_XEN_HVM:
+          mnode = hv_state["mem_node"]
+        # minimum allowed free memory (it's negative due to over-commitment)
+        mem_treshold = (n_img.mtotal - mnode) * (memory_ratio - 1)
+        test = n_img.mfree - needed_mem < mem_treshold
         self._ErrorIf(test, constants.CV_ENODEN1,
                       self.cfg.GetNodeName(node_uuid),
                       "not enough memory to accomodate instance failovers"
@@ -1596,12 +1611,16 @@
     """
     # try to read free memory (from the hypervisor)
     hv_info = nresult.get(constants.NV_HVINFO, None)
-    test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
+    test = not isinstance(hv_info, dict) or "memory_free" not in hv_info \
+                                         or "memory_total" not in hv_info \
+                                         or "memory_dom0" not in hv_info
     self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
                   "rpc call to node failed (hvinfo)")
     if not test:
       try:
         nimg.mfree = int(hv_info["memory_free"])
+        nimg.mtotal = int(hv_info["memory_total"])
+        nimg.mdom0 = int(hv_info["memory_dom0"])
       except (ValueError, TypeError):
         self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
                       "node returned invalid nodeinfo, check hypervisor")
diff --git a/lib/cmdlib/common.py b/lib/cmdlib/common.py
index 638abd7..a15f95e 100644
--- a/lib/cmdlib/common.py
+++ b/lib/cmdlib/common.py
@@ -483,7 +483,9 @@
     potential_master_candidates,
     True, # add node's key to all node's 'authorized_keys'
     True, # all nodes are potential master candidates
-    False) # do not update the node's public keys
+    False, # do not update the node's public keys
+    lu.op.debug,
+    lu.op.verbose)
   ssh_result[master_node].Raise(
     "Could not update the SSH setup of node '%s' after promotion"
     " (UUID: %s)." % (node.name, node.uuid))
diff --git a/lib/cmdlib/misc.py b/lib/cmdlib/misc.py
index 62bff52..d0bad88 100644
--- a/lib/cmdlib/misc.py
+++ b/lib/cmdlib/misc.py
@@ -40,7 +40,11 @@
 from ganeti import query
 from ganeti import utils
 from ganeti.cmdlib.base import NoHooksLU, QueryBase
-from ganeti.cmdlib.common import GetWantedNodes, SupportsOob
+from ganeti.cmdlib.common import (
+  GetWantedNodes,
+  SupportsOob,
+  ExpandNodeUuidAndName
+)
 
 
 class LUOobCommand(NoHooksLU):
@@ -418,3 +422,35 @@
         result.append((True, nres.payload))
 
     return result
+
+
+class LURepairCommand(NoHooksLU):
+  """Logical unit for executing repair commands.
+
+  """
+  REQ_BGL = False
+
+  def ExpandNames(self):
+    self.node_uuid, _ = ExpandNodeUuidAndName(self.cfg, None, self.op.node_name)
+
+    self.needed_locks = {
+      locking.LEVEL_NODE: self.node_uuid,
+      }
+    self.share_locks = {
+      locking.LEVEL_NODE: False,
+      }
+
+  def CheckPrereq(self):
+    """Check prerequisites.
+
+    """
+
+  def Exec(self, feedback_fn):
+    """Execute repair command and return output.
+
+    """
+    owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
+    assert self.node_uuid in owned_nodes
+    return self.rpc.call_repair_command(self.op.node_name,
+                                            self.op.command,
+                                            self.op.input).data[1]
diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py
index 210fd97..d1eae5e 100644
--- a/lib/cmdlib/node.py
+++ b/lib/cmdlib/node.py
@@ -151,6 +151,24 @@
   def PreparePostHookNodes(self, post_hook_node_uuids):
     return post_hook_node_uuids + [self.new_node.uuid]
 
+  def HooksAbortCallBack(self, phase, feedback_fn, exception):
+    """Cleans up if the hooks fail.
+
+    This function runs actions that are necessary to bring the cluster into a
+    clean state again. This is necessary if for example the hooks of this
+    operation failed and leave the node in an inconsistent state.
+
+    """
+    if phase == constants.HOOKS_PHASE_PRE:
+      feedback_fn("Pre operation hook failed. Rolling back preparations.")
+
+      master_node = self.cfg.GetMasterNodeInfo().name
+      remove_result = self.rpc.call_node_ssh_key_remove_light(
+        [master_node],
+        self.op.node_name)
+      remove_result[master_node].Raise(
+        "Error removing SSH key of node '%s'." % self.op.node_name)
+
   def CheckPrereq(self):
     """Check prerequisites.
 
@@ -358,7 +376,9 @@
         True, # from public keys
         False, # clear authorized keys
         True, # clear public keys
-        True) # it's a readd
+        True, # it's a readd
+        self.op.debug,
+        self.op.verbose)
       remove_result[master_node].Raise(
         "Could not remove SSH keys of node %s before readding,"
         " (UUID: %s)." % (new_node_name, new_node_uuid))
@@ -368,7 +388,7 @@
       [master_node], new_node_uuid, new_node_name,
       potential_master_candidates,
       is_master_candidate, is_potential_master_candidate,
-      is_potential_master_candidate)
+      is_potential_master_candidate, self.op.debug, self.op.verbose)
 
     result[master_node].Raise("Could not update the node's SSH setup.")
     WarnAboutFailedSshUpdates(result, master_node, feedback_fn)
@@ -874,7 +894,9 @@
             False, # currently, all nodes are potential master candidates
             False, # do not clear node's 'authorized_keys'
             False, # do not clear node's 'ganeti_pub_keys'
-            False) # no readd
+            False, # no readd
+            self.op.debug,
+            self.op.verbose)
           ssh_result[master_node].Raise(
             "Could not adjust the SSH setup after demoting node '%s'"
             " (UUID: %s)." % (node.name, node.uuid))
@@ -1574,7 +1596,9 @@
         potential_master_candidate, # from_public_keys
         True, # clear node's 'authorized_keys'
         True, # clear node's 'ganeti_public_keys'
-        False) # no readd
+        False, # no readd
+        self.op.debug,
+        self.op.verbose)
       result[master_node].Raise(
         "Could not remove the SSH key of node '%s' (UUID: %s)." %
         (self.op.node_name, self.node.uuid))
diff --git a/lib/config/__init__.py b/lib/config/__init__.py
index 096b213..346ac26 100644
--- a/lib/config/__init__.py
+++ b/lib/config/__init__.py
@@ -225,6 +225,30 @@
     """
     return self._UnlockedGetNdParams(node)
 
+  def _UnlockedGetFilledHvStateParams(self, node):
+    cfg = self._ConfigData()
+    cluster_hv_state = cfg.cluster.hv_state_static
+    def_hv = self._UnlockedGetHypervisorType()
+    cluster_fv = constants.HVST_DEFAULTS if def_hv not in cluster_hv_state \
+                                         else cluster_hv_state[def_hv]
+    group_hv_state = self._UnlockedGetNodeGroup(node.group).hv_state_static
+    group_fv = cluster_fv if def_hv not in group_hv_state else \
+               objects.FillDict(cluster_fv, group_hv_state[def_hv])
+    node_fv = group_fv if def_hv not in node.hv_state_static else \
+              objects.FillDict(group_fv, node.hv_state_static[def_hv])
+    return {def_hv: node_fv}
+
+  @ConfigSync(shared=1)
+  def GetFilledHvStateParams(self, node):
+    """Get the node's hv_state params filled with group and cluster defaults.
+
+    @type node: L{objects.Node}
+    @param node: The node we want to know the params for
+    @return: A dict with the filled in node hv_state params for the default hv
+
+    """
+    return self._UnlockedGetFilledHvStateParams(node)
+
   @ConfigSync(shared=1)
   def GetNdGroupParams(self, nodegroup):
     """Get the node groups params populated with cluster defaults.
@@ -1268,12 +1292,18 @@
     """
     return self._ConfigData().cluster.gluster_storage_dir
 
+  def _UnlockedGetHypervisorType(self):
+    """Get the hypervisor type for this cluster.
+
+    """
+    return self._ConfigData().cluster.enabled_hypervisors[0]
+
   @ConfigSync(shared=1)
   def GetHypervisorType(self):
     """Get the hypervisor type for this cluster.
 
     """
-    return self._ConfigData().cluster.enabled_hypervisors[0]
+    return self._UnlockedGetHypervisorType()
 
   @ConfigSync(shared=1)
   def GetRsaHostKey(self):
@@ -2950,6 +2980,21 @@
     self._ConfigData().cluster.serial_no += 1
 
   @ConfigSync(shared=1)
+  def GetDiagnoseDataCollectorFilename(self):
+    """Return the diagnose data collector filename
+
+    """
+    return self._ConfigData().cluster.diagnose_data_collector_filename
+
+  @ConfigSync()
+  def SetDiagnoseDataCollectorFilename(self, fn):
+    """Set the diagnose data collector filename.
+
+    """
+    self._ConfigData().cluster.diagnose_data_collector_filename = fn
+    self._ConfigData().cluster.serial_no += 1
+
+  @ConfigSync(shared=1)
   def GetDRBDHelper(self):
     """Return DRBD usermode helper.
 
@@ -3378,6 +3423,21 @@
       if disk_uuid in inst_info.disks:
         return inst_uuid
 
+  def SetMaintdRoundDelay(self, delay):
+    """Set the minimal time the maintenance daemon should wait between rounds"""
+    utils.SimpleRetry(True, self._wconfd.SetMaintdRoundDelay, 0.1, 30,
+                      args=[delay])
+
+  def SetMaintdBalance(self, flag):
+    """Enable/disable auto-balancing by the maintenance daemon"""
+    utils.SimpleRetry(True, self._wconfd.SetMaintdBalance, 0.1, 30,
+                      args=[flag])
+
+  def SetMaintdBalanceThreshold(self, score):
+    """Set the minimal score improvement per move for balancing steps"""
+    utils.SimpleRetry(True, self._wconfd.SetMaintdBalanceThreshold, 0.1, 30,
+                      args=[score])
+
 
 class DetachedConfig(ConfigWriter):
   """Read-only snapshot of the config."""
diff --git a/lib/masterd/iallocator.py b/lib/masterd/iallocator.py
index ed6b358..631acff 100644
--- a/lib/masterd/iallocator.py
+++ b/lib/masterd/iallocator.py
@@ -572,6 +572,7 @@
       "master_capable": ninfo.master_capable,
       "vm_capable": ninfo.vm_capable,
       "ndparams": cfg.GetNdParams(ninfo),
+      "hv_state": cfg.GetFilledHvStateParams(ninfo)
       })
       for ninfo in node_cfg.values())
 
diff --git a/lib/mcpu.py b/lib/mcpu.py
index ff8ef1f..0ab5cc0 100644
--- a/lib/mcpu.py
+++ b/lib/mcpu.py
@@ -485,7 +485,16 @@
     lu.CheckPrereq()
 
     hm = self.BuildHooksManager(lu)
-    h_results = hm.RunPhase(constants.HOOKS_PHASE_PRE)
+    try:
+      h_results = hm.RunPhase(constants.HOOKS_PHASE_PRE)
+    except Exception, err:  # pylint: disable=W0703
+      # This gives the LU a chance of cleaning up in case of an hooks failure.
+      # The type of exception is deliberately broad to be able to react to
+      # any kind of failure.
+      lu.HooksAbortCallBack(constants.HOOKS_PHASE_PRE, self.Log, err)
+      # A bare raise re-raises the original exception with its original
+      # traceback; "raise err" would restart the traceback at this line.
+      raise
     lu.HooksCallBack(constants.HOOKS_PHASE_PRE, h_results,
                      self.Log, None)
 
diff --git a/lib/objects.py b/lib/objects.py
index e91719e..f53f846 100644
--- a/lib/objects.py
+++ b/lib/objects.py
@@ -63,7 +63,7 @@
 
 __all__ = ["ConfigObject", "ConfigData", "NIC", "Disk", "Instance",
            "OS", "Node", "NodeGroup", "Cluster", "FillDict", "Network",
-           "Filter"]
+           "Filter", "Maintenance"]
 
 _TIMESTAMPS = ["ctime", "mtime"]
 _UUID = ["uuid"]
@@ -416,6 +416,7 @@
     "networks",
     "disks",
     "filters",
+    "maintenance",
     "serial_no",
     ] + _TIMESTAMPS
 
@@ -428,6 +429,7 @@
     """
     mydict = super(ConfigData, self).ToDict(_with_private=_with_private)
     mydict["cluster"] = mydict["cluster"].ToDict()
+    mydict["maintenance"] = mydict["maintenance"].ToDict()
     for key in ("nodes", "instances", "nodegroups", "networks", "disks",
                 "filters"):
       mydict[key] = outils.ContainerToDicts(mydict[key])
@@ -449,6 +451,7 @@
     obj.networks = outils.ContainerFromDicts(obj.networks, dict, Network)
     obj.disks = outils.ContainerFromDicts(obj.disks, dict, Disk)
     obj.filters = outils.ContainerFromDicts(obj.filters, dict, Filter)
+    obj.maintenance = Maintenance.FromDict(obj.maintenance)
     return obj
 
   def DisksOfType(self, dev_type):
@@ -491,6 +494,9 @@
       disk.UpgradeConfig()
     if self.filters is None:
       self.filters = {}
+    if self.maintenance is None:
+      self.maintenance = Maintenance.FromDict({})
+    self.maintenance.UpgradeConfig()
 
   def _UpgradeEnabledDiskTemplates(self):
     """Upgrade the cluster's enabled disk templates by inspecting the currently
@@ -549,6 +555,20 @@
                "predicates", "action", "reason_trail"] + _UUID
 
 
+class Maintenance(ConfigObject):
+  """Config object representing the state of the maintenance daemon"""
+  __slots__ = ["roundDelay", "jobs", "evacuated", "balance", "balanceThreshold",
+               "incidents", "serial_no"] + _TIMESTAMPS
+
+  def UpgradeConfig(self):
+    if self.serial_no is None:
+      self.serial_no = 1
+    if self.mtime is None:
+      self.mtime = time.time()
+    if self.ctime is None:
+      self.ctime = time.time()
+
+
 class Disk(ConfigObject):
   """Config object representing a block device."""
   __slots__ = [
@@ -1493,6 +1513,11 @@
     if self.powered is None:
       self.powered = True
 
+    if self.hv_state_static is None:
+      self.hv_state_static = {}
+    if self.disk_state_static is None:
+      self.disk_state_static = {}
+
   def ToDict(self, _with_private=False):
     """Custom function for serializing.
 
@@ -1590,6 +1615,11 @@
     if self.ipolicy is None:
       self.ipolicy = MakeEmptyIPolicy()
 
+    if self.hv_state_static is None:
+      self.hv_state_static = {}
+    if self.disk_state_static is None:
+      self.disk_state_static = {}
+
     if self.networks is None:
       self.networks = {}
 
@@ -1675,6 +1705,7 @@
     "compression_tools",
     "enabled_user_shutdown",
     "data_collectors",
+    "diagnose_data_collector_filename",
     "ssh_key_type",
     "ssh_key_bits",
     ] + _TIMESTAMPS + _UUID
diff --git a/lib/pathutils.py b/lib/pathutils.py
index 77a1cc4..78e321a 100644
--- a/lib/pathutils.py
+++ b/lib/pathutils.py
@@ -123,6 +123,7 @@
 HOOKS_BASE_DIR = CONF_DIR + "/hooks"
 FILE_STORAGE_PATHS_FILE = CONF_DIR + "/file-storage-paths"
 RESTRICTED_COMMANDS_DIR = CONF_DIR + "/restricted-commands"
+REPAIR_COMMANDS_DIR = CONF_DIR + "/node-repair-commands"
 
 #: Node daemon certificate path
 NODED_CERT_FILE = DATA_DIR + "/server.pem"
@@ -134,6 +135,9 @@
 #: Locked in exclusive mode while noded verifies a remote command
 RESTRICTED_COMMANDS_LOCK_FILE = LOCK_DIR + "/ganeti-restricted-commands.lock"
 
+#: Locked in exclusive mode while noded verifies a repair command
+REPAIR_COMMANDS_LOCK_FILE = LOCK_DIR + "/ganeti-repair-commands.lock"
+
 #: Lock file for watcher, locked in shared mode by watcher; lock in exclusive
 # mode to block watcher (see L{cli._RunWhileDaemonsStoppedHelper.Call}
 WATCHER_LOCK_FILE = LOCK_DIR + "/ganeti-watcher.lock"
@@ -190,3 +194,4 @@
 LOG_WATCHER = GetLogFilename("watcher")
 LOG_COMMANDS = GetLogFilename("commands")
 LOG_BURNIN = GetLogFilename("burnin")
+LOG_TOOLS = GetLogFilename("tools")
diff --git a/lib/query.py b/lib/query.py
index 43d8fad..6cea103 100644
--- a/lib/query.py
+++ b/lib/query.py
@@ -1309,32 +1309,6 @@
     return _FS_UNAVAIL
 
 
-def _GetNodeHvState(_, node):
-  """Converts node's hypervisor state for query result.
-
-  """
-  hv_state = node.hv_state
-
-  if hv_state is None:
-    return _FS_UNAVAIL
-
-  return dict((name, value.ToDict()) for (name, value) in hv_state.items())
-
-
-def _GetNodeDiskState(_, node):
-  """Converts node's disk state for query result.
-
-  """
-  disk_state = node.disk_state
-
-  if disk_state is None:
-    return _FS_UNAVAIL
-
-  return dict((disk_kind, dict((name, value.ToDict())
-                               for (name, value) in kind_state.items()))
-              for (disk_kind, kind_state) in disk_state.items())
-
-
 def _BuildNodeFields():
   """Builds list of fields for node queries.
 
@@ -1361,10 +1335,16 @@
     (_MakeField("custom_ndparams", "CustomNodeParameters", QFT_OTHER,
                 "Custom node parameters"),
       NQ_GROUP, 0, _GetItemAttr("ndparams")),
-    (_MakeField("hv_state", "HypervisorState", QFT_OTHER, "Hypervisor state"),
-     NQ_CONFIG, 0, _GetNodeHvState),
+    # FIXME: The code below returns custom hv_state instead of the filled one.
+    # Anyway, this functionality is unlikely to be used.
+    (_MakeField("hv_state", "HypervisorState", QFT_OTHER,
+                "Static hypervisor state for default hypervisor only"),
+     NQ_CONFIG, 0, _GetItemAttr("hv_state_static")),
+    (_MakeField("custom_hv_state", "CustomHypervisorState", QFT_OTHER,
+                "Custom static hypervisor state"),
+     NQ_CONFIG, 0, _GetItemAttr("hv_state_static")),
     (_MakeField("disk_state", "DiskState", QFT_OTHER, "Disk state"),
-     NQ_CONFIG, 0, _GetNodeDiskState),
+     NQ_CONFIG, 0, _GetItemAttr("disk_state_static")),
     ]
 
   fields.extend(_BuildNDFields(False))
@@ -2448,6 +2428,9 @@
     (_MakeField("ipolicy", "InstancePolicy", QFT_OTHER,
                 "Instance policy limitations (merged)"),
      GQ_CONFIG, 0, lambda ctx, _: ctx.group_ipolicy),
+    (_MakeField("networks", "Networks", QFT_OTHER,
+                "Node group networks"),
+     GQ_CONFIG, 0, _GetItemAttr("networks")),
     (_MakeField("custom_ipolicy", "CustomInstancePolicy", QFT_OTHER,
                 "Custom instance policy limitations"),
      GQ_CONFIG, 0, _GetItemAttr("ipolicy")),
@@ -2463,6 +2446,11 @@
     (_MakeField("custom_diskparams", "CustomDiskParameters", QFT_OTHER,
                 "Custom disk parameters"),
      GQ_CONFIG, 0, _GetItemAttr("diskparams")),
+    (_MakeField("hv_state", "HypervisorState", QFT_OTHER,
+                "Custom static hypervisor state"),
+     GQ_CONFIG, 0, _GetItemAttr("hv_state_static")),
+    (_MakeField("disk_state", "DiskState", QFT_OTHER, "Disk state"),
+     GQ_CONFIG, 0, _GetItemAttr("disk_state_static")),
     ])
 
   # ND parameters
@@ -2775,6 +2763,11 @@
     (_MakeField("master_node", "Master", QFT_TEXT, "Master node name"),
      CQ_CONFIG, QFF_HOSTNAME,
      lambda ctx, cluster: _GetNodeName(ctx, None, cluster.master_node)),
+    (_MakeField("hv_state", "HypervisorState", QFT_OTHER,
+                "Custom static hypervisor state"),
+     CQ_CONFIG, 0, _GetItemAttr("hv_state_static")),
+    (_MakeField("disk_state", "DiskState", QFT_OTHER, "Disk state"),
+     CQ_CONFIG, 0, _GetItemAttr("disk_state_static")),
     ]
 
   # Simple fields
diff --git a/lib/rapi/rlib2.py b/lib/rapi/rlib2.py
index 14c12ac..8514fcb 100644
--- a/lib/rapi/rlib2.py
+++ b/lib/rapi/rlib2.py
@@ -93,7 +93,7 @@
 
 N_FIELDS = ["name", "offline", "master_candidate", "drained",
             "dtotal", "dfree", "sptotal", "spfree",
-            "mtotal", "mnode", "mfree",
+            "mtotal", "mnode", "mfree", "hv_state",
             "pinst_cnt", "sinst_cnt",
             "ctotal", "cnos", "cnodes", "csockets",
             "pip", "sip", "role",
@@ -121,7 +121,7 @@
   "diskparams",
   "custom_diskparams",
   "ndparams",
-  "custom_ndparams",
+  "custom_ndparams"
   ] + _COMMON_FIELDS
 
 FILTER_RULE_FIELDS = [
diff --git a/lib/rpc_defs.py b/lib/rpc_defs.py
index 71fa231..48f2ecb 100644
--- a/lib/rpc_defs.py
+++ b/lib/rpc_defs.py
@@ -543,7 +543,9 @@
     ("to_public_keys", None, "Whether the node's key should be added"
      " to all nodes' public key file"),
     ("get_public_keys", None, "Whether the node should get the other nodes'"
-     " public keys")],
+     " public keys"),
+    ("debug", None, "Set loglevel of ssh calls to 'debug'."),
+    ("verbose", None, "Set loglevel of ssh calls to 'verbose'.")],
     None, None, "Distribute a new node's public SSH key on the cluster."),
   ("node_ssh_key_remove", MULTI, None, constants.RPC_TMO_FAST, [
     ("node_uuid", None, "UUID of the node whose key is removed"),
@@ -559,7 +561,9 @@
     ("clear_public_keys", None,
      "If the 'ganeti_pub_keys' file of the node should be cleared."),
     ("readd", None,
-     "Whether this is a readd operation.")],
+     "Whether this is a readd operation."),
+    ("debug", None, "Set loglevel of ssh calls to 'debug'."),
+    ("verbose", None, "Set loglevel of ssh calls to 'verbose'.")],
     None, None, "Remove a node's SSH key from the other nodes' key files."),
   ("node_ssh_keys_renew", MULTI, None, constants.RPC_TMO_4HRS, [
     ("node_uuids", None, "UUIDs of the nodes whose key is renewed"),
@@ -568,8 +572,13 @@
     ("potential_master_candidates", None, "Potential master candidates"),
     ("old_key_type", None, "The type of key previously used"),
     ("new_key_type", None, "The type of key to generate"),
-    ("new_key_bits", None, "The length of the key to generate")],
+    ("new_key_bits", None, "The length of the key to generate"),
+    ("debug", None, "Set logging of SSH update tool to 'debug'."),
+    ("verbose", None, "Set logging of SSH update tool to 'info'.")],
     None, None, "Renew all SSH key pairs of all nodes nodes."),
+  ("node_ssh_key_remove_light", MULTI, None, constants.RPC_TMO_FAST, [
+    ("node_name", None, "Name of the node whose key is removed")],
+    None, None, "Remove a node's SSH key from the master's public key file."),
   ]
 
 _MISC_CALLS = [
@@ -593,6 +602,10 @@
   ("restricted_command", MULTI, None, constants.RPC_TMO_SLOW, [
     ("cmd", None, "Command name"),
     ], None, None, "Runs restricted command"),
+  ("repair_command", SINGLE, None, constants.RPC_TMO_SLOW, [
+    ("cmd", None, "Command name"),
+    ("inp", None, "Input to be passed as stdin"),
+    ], None, None, "Runs repair command"),
   ("run_oob", SINGLE, None, constants.RPC_TMO_NORMAL, [
     ("oob_program", None, None),
     ("command", None, None),
diff --git a/lib/server/noded.py b/lib/server/noded.py
index a5e05dd..1397fbd 100644
--- a/lib/server/noded.py
+++ b/lib/server/noded.py
@@ -932,12 +932,15 @@
 
     """
     (node_uuid, node_name, potential_master_candidates,
-     to_authorized_keys, to_public_keys, get_public_keys) = params
+     to_authorized_keys, to_public_keys, get_public_keys,
+     debug, verbose) = params
     return backend.AddNodeSshKey(node_uuid, node_name,
                                  potential_master_candidates,
                                  to_authorized_keys=to_authorized_keys,
                                  to_public_keys=to_public_keys,
-                                 get_public_keys=get_public_keys)
+                                 get_public_keys=get_public_keys,
+                                 ssh_update_debug=debug,
+                                 ssh_update_verbose=verbose)
 
   @staticmethod
   def perspective_node_ssh_keys_renew(params):
@@ -946,10 +949,12 @@
     """
     (node_uuids, node_names, master_candidate_uuids,
      potential_master_candidates, old_key_type, new_key_type,
-     new_key_bits) = params
+     new_key_bits, debug, verbose) = params
     return backend.RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
                                 potential_master_candidates, old_key_type,
-                                new_key_type, new_key_bits)
+                                new_key_type, new_key_bits,
+                                ssh_update_debug=debug,
+                                ssh_update_verbose=verbose)
 
   @staticmethod
   def perspective_node_ssh_key_remove(params):
@@ -959,7 +964,7 @@
     (node_uuid, node_name,
      master_candidate_uuids, potential_master_candidates,
      from_authorized_keys, from_public_keys, clear_authorized_keys,
-     clear_public_keys, readd) = params
+     clear_public_keys, readd, debug, verbose) = params
     return backend.RemoveNodeSshKey(node_uuid, node_name,
                                     master_candidate_uuids,
                                     potential_master_candidates,
@@ -967,7 +972,17 @@
                                     from_public_keys=from_public_keys,
                                     clear_authorized_keys=clear_authorized_keys,
                                     clear_public_keys=clear_public_keys,
-                                    readd=readd)
+                                    readd=readd,
+                                    ssh_update_debug=debug,
+                                    ssh_update_verbose=verbose)
+
+  @staticmethod
+  def perspective_node_ssh_key_remove_light(params):
+    """Removes a node's SSH key from the master's public key file.
+
+    """
+    (node_name, ) = params
+    return backend.RemoveSshKeyFromPublicKeyFile(node_name)
 
   # cluster --------------------------
 
@@ -1024,7 +1039,23 @@
     """
     (cmd, ) = params
 
-    return backend.RunRestrictedCmd(cmd)
+    return backend.RunConstrainedCmd(
+      cmd,
+      lock_file=pathutils.RESTRICTED_COMMANDS_LOCK_FILE,
+      path=pathutils.RESTRICTED_COMMANDS_DIR)
+
+  @staticmethod
+  def perspective_repair_command(params):
+    """Run a repair command.
+
+    """
+    (cmd, inp, ) = params
+
+    return backend.RunConstrainedCmd(
+      cmd,
+      lock_file=pathutils.REPAIR_COMMANDS_LOCK_FILE,
+      path=pathutils.REPAIR_COMMANDS_DIR,
+      inp=inp)
 
   @staticmethod
   def perspective_write_ssconf_files(params):
diff --git a/lib/ssh.py b/lib/ssh.py
index a8fe86d..0fb592b 100644
--- a/lib/ssh.py
+++ b/lib/ssh.py
@@ -35,6 +35,7 @@
 
 import logging
 import os
+import shutil
 import tempfile
 
 from collections import namedtuple
@@ -1073,8 +1074,8 @@
                              (result.cmd, result.fail_reason))
 
 
-def ReadRemoteSshPubKeys(pub_key_file, node, cluster_name, port, ask_key,
-                         strict_host_check):
+def ReadRemoteSshPubKey(pub_key_file, node, cluster_name, port, ask_key,
+                        strict_host_check):
   """Fetches a public SSH key from a node via SSH.
 
   @type pub_key_file: string
@@ -1100,6 +1101,153 @@
   return result.stdout
 
 
+def GetSshKeyFilenames(key_type, suffix=""):
+  """Get filenames of the SSH key pair of the given type.
+
+  @type key_type: string
+  @param key_type: type of SSH key, must be element of C{constants.SSHK_ALL}
+  @type suffix: string
+  @param suffix: optional suffix for the key filenames
+  @rtype: tuple of (string, string)
+  @returns: a tuple containing the name of the private key file and the
+       public key file.
+
+  """
+  if key_type not in constants.SSHK_ALL:
+    raise errors.SshUpdateError("Unsupported key type '%s'. Supported key types"
+                                " are: %s." % (key_type, constants.SSHK_ALL))
+  (_, root_keyfiles) = \
+      GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
+  if not key_type in root_keyfiles.keys():
+    raise errors.SshUpdateError("No keyfile for key type '%s' available."
+                                % key_type)
+
+  key_filenames = root_keyfiles[key_type]
+  if suffix:
+    key_filenames = [_ComputeKeyFilePathWithSuffix(key_filename, suffix)
+                     for key_filename in key_filenames]
+
+  return key_filenames
+
+
+def GetSshPubKeyFilename(key_type, suffix=""):
+  """Get filename of the public SSH key of the given type.
+
+  @type key_type: string
+  @param key_type: type of SSH key, must be element of C{constants.SSHK_ALL}
+  @type suffix: string
+  @param suffix: optional suffix for the key filenames
+  @rtype: string
+  @returns: file name of the public key file
+
+  """
+  return GetSshKeyFilenames(key_type, suffix=suffix)[1]
+
+
+def _ComputeKeyFilePathWithSuffix(key_filepath, suffix):
+  """Converts the given key filename to a key filename with a suffix.
+
+  @type key_filepath: string
+  @param key_filepath: path of the key file
+  @type suffix: string
+  @param suffix: suffix to be appended to the basename of the file
+
+  """
+  path = os.path.dirname(key_filepath)
+  ext = os.path.splitext(os.path.basename(key_filepath))[1]
+  basename = os.path.splitext(os.path.basename(key_filepath))[0]
+  return os.path.join(path, basename + suffix + ext)
+
+
+def ReplaceSshKeys(src_key_type, dest_key_type,
+                   src_key_suffix="", dest_key_suffix=""):
+  """Replaces an SSH key pair by another SSH key pair.
+
+  Note that both parts, the private and the public key, are replaced.
+
+  @type src_key_type: string
+  @param src_key_type: key type of key pair that is replacing the other
+      key pair
+  @type dest_key_type: string
+  @param dest_key_type: key type of the key pair that is being replaced
+      by the source key pair
+  @type src_key_suffix: string
+  @param src_key_suffix: optional suffix of the key files of the source
+      key pair
+  @type dest_key_suffix: string
+  @param dest_key_suffix: optional suffix of the key files of the
+      destination key pair
+
+  """
+  (src_priv_filename, src_pub_filename) = GetSshKeyFilenames(
+      src_key_type, suffix=src_key_suffix)
+  (dest_priv_filename, dest_pub_filename) = GetSshKeyFilenames(
+      dest_key_type, suffix=dest_key_suffix)
+
+  if not (os.path.exists(src_priv_filename) and
+          os.path.exists(src_pub_filename)):
+    raise errors.SshUpdateError(
+        "At least one of the source key files is missing: %s" %
+        ", ".join([src_priv_filename, src_pub_filename]))
+
+  for dest_file in [dest_priv_filename, dest_pub_filename]:
+    if os.path.exists(dest_file):
+      utils.CreateBackup(dest_file)
+      utils.RemoveFile(dest_file)
+
+  shutil.move(src_priv_filename, dest_priv_filename)
+  shutil.move(src_pub_filename, dest_pub_filename)
+
+
+def ReadLocalSshPubKeys(key_types, suffix=""):
+  """Reads the local root user SSH key.
+
+  @type key_types: list of string
+  @param key_types: types of SSH keys. Must be subset of constants.SSHK_ALL. If
+      'None' or [], all available keys are returned.
+  @type suffix: string
+  @param suffix: optional suffix to be attached to key names when reading
+      them. Used for temporary key files.
+  @rtype: list of string
+  @return: list of public keys
+
+  """
+  fetch_key_types = []
+  if key_types:
+    fetch_key_types += key_types
+  else:
+    fetch_key_types = constants.SSHK_ALL
+
+  (_, root_keyfiles) = \
+      GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
+
+  result_keys = []
+  for (public_key_type, (_, public_key_file)) in root_keyfiles.items():
+
+    if public_key_type not in fetch_key_types:
+      continue
+
+    public_key_dir = os.path.dirname(public_key_file)
+    public_key_filename = ""
+    if suffix:
+      public_key_filename = \
+          os.path.splitext(os.path.basename(public_key_file))[0] \
+          + suffix + ".pub"
+    else:
+      public_key_filename = public_key_file
+    public_key_path = os.path.join(public_key_dir,
+                                   public_key_filename)
+
+    if not os.path.exists(public_key_path):
+      raise errors.SshUpdateError("Cannot find SSH public key of type '%s'."
+                                  % public_key_type)
+    else:
+      key = utils.ReadFile(public_key_path)
+      result_keys.append(key)
+
+  return result_keys
+
+
 # Update gnt-cluster.rst when changing which combinations are valid.
 KeyBitInfo = namedtuple('KeyBitInfo', ['default', 'validation_fn'])
 SSH_KEY_VALID_BITS = {
diff --git a/lib/tools/cfgupgrade.py b/lib/tools/cfgupgrade.py
index 14e2e20..59ab1e1 100644
--- a/lib/tools/cfgupgrade.py
+++ b/lib/tools/cfgupgrade.py
@@ -59,11 +59,11 @@
 #: Target major version we will upgrade to
 TARGET_MAJOR = 2
 #: Target minor version we will upgrade to
-TARGET_MINOR = 16
+TARGET_MINOR = 17
 #: Target major version for downgrade
 DOWNGRADE_MAJOR = 2
 #: Target minor version for downgrade
-DOWNGRADE_MINOR = 15
+DOWNGRADE_MINOR = 16
 
 # map of legacy device types
 # (mapping differing old LD_* constants to new DT_* constants)
@@ -183,8 +183,8 @@
       self._Downgrade(config_major, config_minor, config_version,
                       config_revision)
 
-    # Upgrade from 2.{0..15} to 2.16
-    elif config_major == 2 and config_minor in range(0, 16):
+    # Upgrade from 2.{0..n-1} to 2.n
+    elif config_major == 2 and config_minor in range(0, TARGET_MINOR):
       if config_revision != 0:
         logging.warning("Config revision is %s, not 0", config_revision)
       if not self.UpgradeAll():
@@ -340,6 +340,8 @@
         cluster["data_collectors"].get(
             name, dict(active=True,
                        interval=constants.MOND_TIME_INTERVAL * 1e6))
+    if "diagnose_data_collector_filename" not in cluster:
+      cluster["diagnose_data_collector_filename"] = ""
 
     # These parameters are set to pre-2.16 default values, which
     # differ from post-2.16 default values
@@ -696,6 +698,14 @@
         else:
           disk["nodes"] = []
 
+  @OrFail("Upgrading maintenance data")
+  def UpgradeMaintenance(self):
+    # pylint can't infer config_data type
+    # pylint: disable=E1103
+    maintenance = self.config_data.get("maintenance", None)
+    if maintenance is None:
+      self.config_data["maintenance"] = {}
+
   def UpgradeAll(self):
     self.config_data["version"] = version.BuildVersion(TARGET_MAJOR,
                                                        TARGET_MINOR, 0)
@@ -711,48 +721,33 @@
              self.UpgradeInstanceIndices,
              self.UpgradeFilters,
              self.UpgradeDiskNodes,
-             self.UpgradeDiskTemplate]
+             self.UpgradeDiskTemplate,
+             self.UpgradeMaintenance]
     for s in steps:
       s()
     return not self.errors
 
   # DOWNGRADE ------------------------------------------------------------
 
-  @OrFail("Removing SSH parameters")
-  def DowngradeSshKeyParams(self):
-    """Removes the SSH key type and bits parameters from the config.
-
-    Also fails if these have been changed from values appropriate in lower
-    Ganeti versions.
-
-    """
-    # pylint: disable=E1103
-    # Because config_data is a dictionary which has the get method.
-    cluster = self.config_data.get("cluster", None)
-    if cluster is None:
-      raise Error("Can't find the cluster entry in the configuration")
-
-    def _FetchAndDelete(key):
-      val = cluster.get(key, None)
-      if key in cluster:
-        del cluster[key]
-      return val
-
-    ssh_key_type = _FetchAndDelete("ssh_key_type")
-    _FetchAndDelete("ssh_key_bits")
-
-    if ssh_key_type is not None and ssh_key_type != "dsa":
-      raise Error("The current Ganeti setup is using non-DSA SSH keys, and"
-                  " versions below 2.16 do not support these. To downgrade,"
-                  " please perform a gnt-cluster renew-crypto using the "
-                  " --new-ssh-keys and --ssh-key-type=dsa options, generating"
-                  " DSA keys that older versions can also use.")
-
   def DowngradeAll(self):
+    if "maintenance" in self.config_data:
+      del self.config_data["maintenance"]
+    if "cluster" in self.config_data:
+      cluster = self.config_data["cluster"]
+      if "diagnose_data_collector_filename" in cluster:
+        del cluster["diagnose_data_collector_filename"]
+      if "data_collectors" in cluster:
+        if constants.DATA_COLLECTOR_DIAGNOSE in cluster["data_collectors"]:
+          del cluster["data_collectors"][constants.DATA_COLLECTOR_DIAGNOSE]
+        if constants.DATA_COLLECTOR_KVM_R_S_S in cluster["data_collectors"]:
+          del cluster["data_collectors"][constants.DATA_COLLECTOR_KVM_R_S_S]
+      if "ipolicy" in cluster:
+        ipolicy = cluster["ipolicy"]
+        if "memory-ratio" in ipolicy:
+          del ipolicy["memory-ratio"]
     self.config_data["version"] = version.BuildVersion(DOWNGRADE_MAJOR,
                                                        DOWNGRADE_MINOR, 0)
 
-    self.DowngradeSshKeyParams()
     return not self.errors
 
   def _ComposePaths(self):
diff --git a/lib/tools/common.py b/lib/tools/common.py
index 60fe169..d8f1588 100644
--- a/lib/tools/common.py
+++ b/lib/tools/common.py
@@ -182,6 +182,19 @@
   return name
 
 
+def VerifyHmac(data, error_fn):
+  """Verifies the presence of the hmac secret.
+
+  @type data: dict
+
+  """
+  hmac = data.get(constants.NDS_HMAC)
+  if not hmac:
+    raise error_fn("Hmac key must be provided")
+
+  return hmac
+
+
 def LoadData(raw, data_check):
   """Parses and verifies input data.
 
diff --git a/lib/tools/ensure_dirs.py b/lib/tools/ensure_dirs.py
index 0a197ba..66b37e8 100644
--- a/lib/tools/ensure_dirs.py
+++ b/lib/tools/ensure_dirs.py
@@ -250,7 +250,9 @@
   """
   (opts, args) = ParseOptions()
 
-  utils.SetupToolLogging(opts.debug, opts.verbose)
+  utils.SetupToolLogging(
+      opts.debug, opts.verbose,
+      toolname=os.path.splitext(os.path.basename(__file__))[0])
 
   if args:
     logging.error("No arguments are expected")
diff --git a/lib/tools/node_cleanup.py b/lib/tools/node_cleanup.py
index f8ec076..08a9548 100644
--- a/lib/tools/node_cleanup.py
+++ b/lib/tools/node_cleanup.py
@@ -80,7 +80,9 @@
   """
   opts = ParseOptions()
 
-  utils.SetupToolLogging(opts.debug, opts.verbose)
+  utils.SetupToolLogging(
+      opts.debug, opts.verbose,
+      toolname=os.path.splitext(os.path.basename(__file__))[0])
 
   try:
     # List of files to delete. Contains tuples consisting of the absolute path
diff --git a/lib/tools/node_daemon_setup.py b/lib/tools/node_daemon_setup.py
index e45e2e0..c971d15 100644
--- a/lib/tools/node_daemon_setup.py
+++ b/lib/tools/node_daemon_setup.py
@@ -51,6 +51,7 @@
 _DATA_CHECK = ht.TStrictDict(False, True, {
   constants.NDS_CLUSTER_NAME: ht.TNonEmptyString,
   constants.NDS_NODE_DAEMON_CERTIFICATE: ht.TNonEmptyString,
+  constants.NDS_HMAC: ht.TNonEmptyString,
   constants.NDS_SSCONF: ht.TDictOf(ht.TNonEmptyString, ht.TString),
   constants.NDS_START_NODE_DAEMON: ht.TBool,
   constants.NDS_NODE_NAME: ht.TString,
@@ -117,7 +118,9 @@
   """
   opts = ParseOptions()
 
-  utils.SetupToolLogging(opts.debug, opts.verbose)
+  utils.SetupToolLogging(
+      opts.debug, opts.verbose,
+      toolname=os.path.splitext(os.path.basename(__file__))[0])
 
   try:
     getent = runtime.GetEnts()
@@ -127,11 +130,18 @@
     cluster_name = common.VerifyClusterName(data, SetupError,
                                             constants.NDS_CLUSTER_NAME)
     cert_pem = common.VerifyCertificateStrong(data, SetupError)
+    hmac_key = common.VerifyHmac(data, SetupError)
     ssdata = VerifySsconf(data, cluster_name)
 
     logging.info("Writing ssconf files ...")
     ssconf.WriteSsconfFiles(ssdata, dry_run=opts.dry_run)
 
+    logging.info("Writing hmac.key ...")
+    utils.WriteFile(pathutils.CONFD_HMAC_KEY, data=hmac_key,
+                    mode=pathutils.NODED_CERT_MODE,
+                    uid=getent.masterd_uid, gid=getent.masterd_gid,
+                    dry_run=opts.dry_run)
+
     logging.info("Writing node daemon certificate ...")
     utils.WriteFile(pathutils.NODED_CERT_FILE, data=cert_pem,
                     mode=pathutils.NODED_CERT_MODE,
diff --git a/lib/tools/prepare_node_join.py b/lib/tools/prepare_node_join.py
index fa45a58..0a0e2c8 100644
--- a/lib/tools/prepare_node_join.py
+++ b/lib/tools/prepare_node_join.py
@@ -195,7 +195,9 @@
   """
   opts = ParseOptions()
 
-  utils.SetupToolLogging(opts.debug, opts.verbose)
+  utils.SetupToolLogging(
+      opts.debug, opts.verbose,
+      toolname=os.path.splitext(os.path.basename(__file__))[0])
 
   try:
     data = common.LoadData(sys.stdin.read(), _DATA_CHECK)
diff --git a/lib/tools/ssh_update.py b/lib/tools/ssh_update.py
index b37972e..23f5077 100644
--- a/lib/tools/ssh_update.py
+++ b/lib/tools/ssh_update.py
@@ -210,7 +210,9 @@
   """
   opts = ParseOptions()
 
-  utils.SetupToolLogging(opts.debug, opts.verbose)
+  utils.SetupToolLogging(
+      opts.debug, opts.verbose,
+      toolname=os.path.splitext(os.path.basename(__file__))[0])
 
   try:
     data = common.LoadData(sys.stdin.read(), _DATA_CHECK)
diff --git a/lib/tools/ssl_update.py b/lib/tools/ssl_update.py
index 56e8d6a..05be975 100644
--- a/lib/tools/ssl_update.py
+++ b/lib/tools/ssl_update.py
@@ -114,7 +114,9 @@
   """
   opts = ParseOptions()
 
-  utils.SetupToolLogging(opts.debug, opts.verbose)
+  utils.SetupToolLogging(
+      opts.debug, opts.verbose,
+      toolname=os.path.splitext(os.path.basename(__file__))[0])
 
   try:
     data = common.LoadData(sys.stdin.read(), _DATA_CHECK)
diff --git a/lib/utils/log.py b/lib/utils/log.py
index 3703221..903d993 100644
--- a/lib/utils/log.py
+++ b/lib/utils/log.py
@@ -34,10 +34,10 @@
 import os.path
 import logging
 import logging.handlers
-from cStringIO import StringIO
 
 from ganeti import constants
 from ganeti import compat
+from ganeti import pathutils
 
 
 class _ReopenableLogHandler(logging.handlers.BaseRotatingHandler):
@@ -188,7 +188,8 @@
 
 def SetupLogging(logfile, program, debug=0, stderr_logging=False,
                  multithreaded=False, syslog=constants.SYSLOG_USAGE,
-                 console_logging=False, root_logger=None):
+                 console_logging=False, root_logger=None,
+                 verbose=True):
   """Configures the logging module.
 
   @type logfile: str
@@ -212,6 +213,8 @@
       the system console if logging fails
   @type root_logger: logging.Logger
   @param root_logger: Root logger to use (for unittests)
+  @type verbose: boolean
+  @param verbose: whether to log at 'info' level already (logfile logging only)
   @raise EnvironmentError: if we can't open the log file and
       syslog/stderr logging is disabled
   @rtype: callable
@@ -252,7 +255,7 @@
     syslog_handler.setLevel(logging.INFO)
     root_logger.addHandler(syslog_handler)
 
-  if syslog != constants.SYSLOG_ONLY:
+  if syslog != constants.SYSLOG_ONLY and logfile:
     # this can fail, if the logging directories are not setup or we have
     # a permisssion problem; in this case, it's best to log but ignore
     # the error if stderr_logging is True, and if false we re-raise the
@@ -267,8 +270,10 @@
       logfile_handler.setFormatter(formatter)
       if debug:
         logfile_handler.setLevel(logging.DEBUG)
-      else:
+      elif verbose:
         logfile_handler.setLevel(logging.INFO)
+      else:
+        logfile_handler.setLevel(logging.WARN)
       root_logger.addHandler(logfile_handler)
       reopen_handlers.append(logfile_handler)
     except EnvironmentError:
@@ -282,45 +287,37 @@
 
 
 def SetupToolLogging(debug, verbose, threadname=False,
-                     _root_logger=None, _stream=None):
+                     toolname=None, logfile=pathutils.LOG_TOOLS):
   """Configures the logging module for tools.
 
-  All log messages are sent to stderr.
+  All log messages are sent to the tools.log logfile.
 
+  @type toolname: string
+  @param toolname: name of the tool that's logging
   @type debug: boolean
   @param debug: Disable log message filtering
   @type verbose: boolean
   @param verbose: Enable verbose log messages
   @type threadname: boolean
   @param threadname: Whether to include thread name in output
+  @type logfile: string
+  @param logfile: the path of the log file to use, use "None"
+    for tools which don't necessarily run on Ganeti nodes (and
+    thus don't have the Ganeti log directory).
 
   """
-  if _root_logger is None:
-    root_logger = logging.getLogger("")
-  else:
-    root_logger = _root_logger
+  if not toolname:
+    toolname = "unspecified_tool"
 
-  fmt = StringIO()
-  fmt.write("%(asctime)s:")
-
-  if threadname:
-    fmt.write(" %(threadName)s")
-
-  if debug or verbose:
-    fmt.write(" %(levelname)s")
-
-  fmt.write(" %(message)s")
-
-  formatter = logging.Formatter(fmt.getvalue())
-
-  stderr_handler = logging.StreamHandler(_stream)
-  stderr_handler.setFormatter(formatter)
+  # 'SetupLogging' takes a quite unintuitive 'debug' option that
+  # is '0' for 'log higher than debug level' and '1' for
+  # 'log at NOTSET' level. Hence this conversion.
+  debug_int = 0
   if debug:
-    stderr_handler.setLevel(logging.NOTSET)
-  elif verbose:
-    stderr_handler.setLevel(logging.INFO)
-  else:
-    stderr_handler.setLevel(logging.WARNING)
+    debug_int = 1
 
-  root_logger.setLevel(logging.NOTSET)
-  root_logger.addHandler(stderr_handler)
+  SetupLogging(logfile,
+               program=toolname,
+               debug=debug_int,
+               multithreaded=threadname,
+               verbose=verbose)
diff --git a/lib/utils/process.py b/lib/utils/process.py
index 268ff54..5933929 100644
--- a/lib/utils/process.py
+++ b/lib/utils/process.py
@@ -185,7 +185,8 @@
   @type noclose_fds: list
   @param noclose_fds: list of additional (fd >=3) file descriptors to leave
                       open for the child process
-  @type input_fd: C{file}-like object or numeric file descriptor
+  @type input_fd: C{file}-like object containing an actual file descriptor
+                  or numeric file descriptor
   @param input_fd: File descriptor for process' standard input
   @type postfork_fn: Callable receiving PID as parameter
   @param postfork_fn: Callback run after fork but before timeout
@@ -526,7 +527,8 @@
   @type noclose_fds: list
   @param noclose_fds: list of additional (fd >=3) file descriptors to leave
                       open for the child process
-  @type input_fd: C{file}-like object or numeric file descriptor
+  @type input_fd: C{file}-like object containing an actual file descriptor
+                  or numeric file descriptor
   @param input_fd: File descriptor for process' standard input
   @type postfork_fn: Callable receiving PID as parameter
   @param postfork_fn: Function run after fork but before timeout
diff --git a/lib/utils/retry.py b/lib/utils/retry.py
index 8079303..895cc0e 100644
--- a/lib/utils/retry.py
+++ b/lib/utils/retry.py
@@ -253,7 +253,8 @@
                      wait_fn=inc_tries, _time_fn=get_tries)
 
 
-def RetryByNumberOfTimes(max_retries, exception_class, fn, *args, **kwargs):
+def RetryByNumberOfTimes(max_retries, backoff, exception_class, fn, *args,
+                         **kwargs):
   """Retries calling a function up to the specified number of times.
 
   @type max_retries: integer
@@ -264,9 +265,23 @@
   @type fn: callable
   @param fn: Function to be called (up to the specified maximum number of
              retries.
+  @type backoff: int
+  @param backoff: this enables and configures the back off behavior after
+     failed tries. If value is '0', there will be no delay between failed
+     tries. If the value is a positive integer, it is interpreted as the
+     base length of the back off delay (in seconds). That means there will be a
+     delay between failed tries of the length specified in this parameter. With
+     each next retry, the delay is increased by the factor of two. For example,
+     if the value is '2', the first delay is 2 seconds, the second 4 seconds,
+     the third 8 seconds (until the max_retries are hit or the function call
+     succeeds).
 
   """
+  if backoff < 0:
+    raise exception_class("Backoff must be a non-negative integer.")
+
   last_exception = None
+  delay = backoff
   for i in range(max_retries):
     try:
       fn(*args, **kwargs)
@@ -274,6 +289,8 @@
     except errors.OpExecError as e:
       logging.error("Error after retry no. %s: %s.", i, e)
       last_exception = e
+      time.sleep(delay)
+      delay *= 2
   else:
     if last_exception:
       raise exception_class("Error after %s retries. Last exception: %s."
diff --git a/lib/watcher/__init__.py b/lib/watcher/__init__.py
index d8df6bf..881ac83 100644
--- a/lib/watcher/__init__.py
+++ b/lib/watcher/__init__.py
@@ -345,10 +345,34 @@
   return compat.any(nodes[node_name].offline for node_name in instance.snodes)
 
 
+def _GetPendingVerifyDisks(cl, uuid):
+  """Checks if there are any currently running or pending group verify jobs and
+  if so, returns their id.
+
+  """
+  qfilter = qlang.MakeSimpleFilter("status",
+                                    frozenset([constants.JOB_STATUS_RUNNING,
+                                               constants.JOB_STATUS_QUEUED,
+                                               constants.JOB_STATUS_WAITING]))
+  qresult = cl.Query(constants.QR_JOB, ["id", "summary"], qfilter)
+
+  ids = [jobid for ((_, jobid), (_, (job, ))) in qresult.data
+         if job == ("GROUP_VERIFY_DISKS(%s)" % uuid)]
+  return ids
+
+
 def _VerifyDisks(cl, uuid, nodes, instances):
   """Run a per-group "gnt-cluster verify-disks".
 
   """
+
+  existing_jobs = _GetPendingVerifyDisks(cl, uuid)
+  if existing_jobs:
+    logging.info("There are verify disks jobs already pending (%s), skipping "
+                 "VerifyDisks step for %s.",
+                 utils.CommaJoin(existing_jobs), uuid)
+    return
+
   op = opcodes.OpGroupVerifyDisks(
     group_name=uuid, priority=constants.OP_PRIO_LOW)
   op.reason = [(constants.OPCODE_REASON_SRC_WATCHER,
@@ -704,6 +728,7 @@
   # we are on master now
   utils.EnsureDaemon(constants.RAPI)
   utils.EnsureDaemon(constants.WCONFD)
+  utils.EnsureDaemon(constants.MAINTD)
 
   # If RAPI isn't responding to queries, try one restart
   logging.debug("Attempting to talk to remote API on %s",
@@ -843,7 +868,7 @@
 
   logging.debug("Using state file %s", state_path)
 
-  # Global watcher
+  # Group watcher file lock
   statefile = state.OpenStateFile(state_path) # pylint: disable=E0602
   if not statefile:
     return constants.EXIT_FAILURE
@@ -866,26 +891,27 @@
 
     started = _CheckInstances(client, notepad, instances, locks)
     _CheckDisks(client, notepad, nodes, instances, started)
-
-    # Check if the nodegroup only has ext storage type
-    only_ext = compat.all(i.disk_template == constants.DT_EXT
-                          for i in instances.values())
-
-    # We skip current NodeGroup verification if there are only external storage
-    # devices. Currently we provide an interface for external storage provider
-    # for disk verification implementations, however current ExtStorageDevice
-    # does not provide an API for this yet.
-    #
-    # This check needs to be revisited if ES_ACTION_VERIFY on ExtStorageDevice
-    # is implemented.
-    if not opts.no_verify_disks and not only_ext:
-      _VerifyDisks(client, group_uuid, nodes, instances)
   except Exception, err:
     logging.info("Not updating status file due to failure: %s", err)
     raise
   else:
     # Save changes for next run
     notepad.Save(state_path)
+    notepad.Close()
+
+  # Check if the nodegroup only has ext storage type
+  only_ext = compat.all(i.disk_template == constants.DT_EXT
+                        for i in instances.values())
+
+  # We skip current NodeGroup verification if there are only external storage
+  # devices. Currently we provide an interface for external storage provider
+  # for disk verification implementations, however current ExtStorageDevice
+  # does not provide an API for this yet.
+  #
+  # This check needs to be revisited if ES_ACTION_VERIFY on ExtStorageDevice
+  # is implemented.
+  if not opts.no_verify_disks and not only_ext:
+    _VerifyDisks(client, group_uuid, nodes, instances)
 
   return constants.EXIT_SUCCESS
 
diff --git a/lib/watcher/state.py b/lib/watcher/state.py
index 5c51b5b..b8ff4ef 100644
--- a/lib/watcher/state.py
+++ b/lib/watcher/state.py
@@ -111,7 +111,7 @@
     self._orig_data = serializer.Dump(self._data)
 
   def Save(self, filename):
-    """Save state to file, then unlock and close it.
+    """Save state to file.
 
     """
     assert self.statefile
diff --git a/man/ganeti-maintd.rst b/man/ganeti-maintd.rst
new file mode 100644
index 0000000..d04fa6a
--- /dev/null
+++ b/man/ganeti-maintd.rst
@@ -0,0 +1,101 @@
+ganeti-maintd(8) Ganeti | Version @GANETI_VERSION@
+==================================================
+
+Name
+----
+
+ganeti-maintd - Ganeti maintenance daemon
+
+Synopsis
+--------
+**ganeti-maintd** [-f] [-d] [-p *PORT*] [-b *ADDRESS*] [--no-voting --yes-do-it]
+
+DESCRIPTION
+-----------
+
+**ganeti-maintd** is the daemon carrying out regular maintenance
+of the cluster.
+
+For testing purposes, you can give the ``-f`` option and the
+program won't detach from the running terminal.
+
+Debug-level messages can be activated by giving the ``-d`` option.
+
+The **ganeti-maintd** daemon listens to port 1816 TCP, on all interfaces,
+by default. The port can be overridden by an entry in the services database
+or by passing the ``-p`` option.
+The ``-b`` option can be used to specify the address to bind to
+(defaults to ``0.0.0.0``).
+
+The daemon will refuse to start if it cannot verify that the majority
+of cluster nodes believes that it is running on the master node. To
+allow failover in a two-node cluster, this can be overridden by the
+``--no-voting`` option. In this case, the ``--yes-do-it`` option has
+to be given as well.
+
+Operation
+~~~~~~~~~
+
+The maintenance daemon will carry out precisely the same jobs that
+**harep**\(1) would do if continuously run. In particular, it can
+be controlled by the same set of opt-in tags.
+
+Communication
+~~~~~~~~~~~~~
+
+The daemon will expose its internal state via HTTP. The answer is
+encoded in JSON format and is specific to the particular request.
+
+``/``
++++++
+The root resource. It will return the list of supported protocol
+versions. At the moment, only version ``1`` is supported.
+
+``/1/status``
++++++++++++++
+
+List of all currently ongoing incidents. This is a list of JSON
+objects, each containing at least the following information.
+
+- ``uuid`` The unique identifier assigned to the event.
+
+- ``node`` The UUID of the node on which the event was observed.
+
+- ``original`` The very JSON object reported by self-diagnose data collector.
+
+- ``repair-status`` A string describing the progress made on this event so
+  far. It is one of the following.
+
+  + ``noted`` The event has been observed, but no action has been taken yet
+
+  + ``pending`` At least one job has been submitted in reaction to the event
+    and none of the submitted jobs has failed so far.
+
+  + ``canceled`` The event has been canceled, i.e., ordered to be ignored, but
+    is still observed.
+
+  + ``failed`` At least one of the submitted jobs has failed. To avoid further
+    damage, the repair daemon will not take any further action for this event.
+
+  + ``completed`` All Ganeti actions associated with this event have been
+    completed successfully, including tagging the node.
+
+- ``jobs`` The list of the numbers of ganeti jobs submitted in response to
+  this event.
+
+- ``tag`` A string that is the tag that either has been added to the node, or,
+  if the repair event is not yet finalized, will be added in case of success.
+
+
+``/1/jobs``
++++++++++++
+The list of jobs the daemon will wait for to finish, before starting
+the next round of maintenance.
+
+``/1/evacuated``
+++++++++++++++++
+The list of instance names the daemon does not expect to have load
+data available because they have been recently evacuated from an
+offline (or drained) node. Currently, this affects only Xen instances,
+as for other hypervisors the overall CPU load on the node is taken
+as balancing measure.
diff --git a/man/ganeti.rst b/man/ganeti.rst
index d3b37e8..b68ad08 100644
--- a/man/ganeti.rst
+++ b/man/ganeti.rst
@@ -179,8 +179,8 @@
   discovered or set manually. Only used for estimating how many VCPUs
   are left for instances
 
-Note that currently this option is unused by Ganeti; values will be
-recorded but will not influence the Ganeti operation.
+Note that currently only ``mem_node`` is used by Ganeti; other values
+will be recorded but will not influence the Ganeti operation.
 
 
 Disk State Parameters
diff --git a/man/gnt-cluster.rst b/man/gnt-cluster.rst
index 9b0374c..7bed7cf 100644
--- a/man/gnt-cluster.rst
+++ b/man/gnt-cluster.rst
@@ -198,6 +198,7 @@
 | [\--ipolicy-disk-templates *template* [,*template*...]]
 | [\--ipolicy-spindle-ratio *ratio*]
 | [\--ipolicy-vcpu-ratio *ratio*]
+| [\--ipolicy-memory-ratio *ratio*]
 | [\--disk-state *diskstate*]
 | [\--hypervisor-state *hvstate*]
 | [\--drbd-usermode-helper *helper*]
@@ -587,6 +588,7 @@
 
 - ``--ipolicy-spindle-ratio`` limits the instances-spindles ratio
 - ``--ipolicy-vcpu-ratio`` limits the vcpu-cpu ratio
+- ``--ipolicy-memory-ratio`` limits the memory over-commitment ratio
 
 All the instance policy elements can be overridden at group level. Group
 level overrides can be removed by specifying ``default`` as the value of
@@ -736,6 +738,10 @@
 | [\--user-shutdown {yes \| no}]
 | [\--enabled-data-collectors *collectors*]
 | [\--data-collector-interval *intervals*]
+| [\--maintenance-interval *seconds*]
+| [\--auto-balance-cluster {yes \| no }]
+| [\--auto-balance-threshold *score* ]
+| [\--diagnose-data-collector-filename *filename*]
 
 
 Modify the options for the cluster.
@@ -807,6 +813,21 @@
 and number of seconds specifying the interval at which the collector
 shall be collected.
 
+The ``--diagnose-data-collector-filename`` option specifies the filename
+of the script the diagnose data collector should run. If this value is an
+empty string, the data collector will return success without running
+anything. The default value is the empty string.
+
+The ``--maintenance-interval`` option specifies the minimal waiting
+time by the maintenance daemon between maintenance rounds.
+The ``--auto-balance-cluster`` option tells the maintenance daemon
+whether to also keep the cluster in a balanced fashion. If so, it
+will carry out moves, provided the gain in the cluster score for
+that move is at least the value specified by ``--auto-balance-threshold``
+in absolute terms, unless the cluster score is at least 10 times that
+value, in which case all beneficial steps will be done if auto-balancing
+is enabled.
+
 See **gnt-cluster init** for a description of ``--install-image`` and
 ``--zeroing-image``.
 
@@ -856,6 +877,16 @@
 See **ganeti**\(7) for a description of ``--submit`` and other common
 options.
 
+REMOVE-REPAIR
+~~~~~~~~~~~~~
+
+**remove-repair** *uuid*
+
+Unconditionally remove the specified repair event from the list of repair
+events tracked by the maintenance daemon. Note that if the node still reports
+the same breakage, a new event for this breakage will be created at next
+node querying by the daemon.
+
 RENAME
 ~~~~~~
 
@@ -882,6 +913,7 @@
 | [\--new-ssh-keys] [\--no-ssh-key-check]
 | [\--new-cluster-domain-secret] [\--cluster-domain-secret *filename*]
 | [\--ssh-key-type *type*] | [\--ssh-key-bits *bits*]
+| [\--verbose] | [\--debug]
 
 This command will stop all Ganeti daemons in the cluster and start
 them again once the new certificates and keys are replicated. The
@@ -927,6 +959,11 @@
 properties of the disk types used. They are described in more detail
 in the ``init`` option description.
 
+The options ``--verbose`` and ``--debug`` increase the log level
+of underlying ssh calls to all nodes. If running ``renew-crypto``
+causes any problems, use them and inspect the ``tools.log`` file
+for any unusual output.
+
 REPAIR-DISK-SIZES
 ~~~~~~~~~~~~~~~~~
 
diff --git a/man/gnt-group.rst b/man/gnt-group.rst
index 1c313b2..7864687 100644
--- a/man/gnt-group.rst
+++ b/man/gnt-group.rst
@@ -31,6 +31,7 @@
 | [\--ipolicy-disk-templates *template* [,*template*...]]
 | [\--ipolicy-spindle-ratio *ratio*]
 | [\--ipolicy-vcpu-ratio *ratio*]
+| [\--ipolicy-memory-ratio *ratio*]
 | [\--disk-state *diskstate*]
 | [\--hypervisor-state *hvstate*]
 | {*group*}
@@ -103,6 +104,7 @@
 | [\--ipolicy-disk-templates *template* [,*template*...]]
 | [\--ipolicy-spindle-ratio *ratio*]
 | [\--ipolicy-vcpu-ratio *ratio*]
+| [\--ipolicy-memory-ratio *ratio*]
 | {*group*}
 
 Modifies some parameters from the node group.
diff --git a/man/gnt-node.rst b/man/gnt-node.rst
index bf3fff3..65eb6a3 100644
--- a/man/gnt-node.rst
+++ b/man/gnt-node.rst
@@ -30,6 +30,7 @@
 | [\--disk-state *diskstate*]
 | [\--hypervisor-state *hvstate*]
 | [\--no-node-setup]
+| [\--verbose] | [\--debug]
 | {*nodename*}
 
 Adds the given node to the cluster.
@@ -87,6 +88,10 @@
 running, the ``node-cleanup`` tool can be run on the machine to be added
 to clean remains of the previous cluster from the node.
 
+The options ``--verbose`` and ``--debug`` control the log level of the
+operation, in particular the one of the underlying SSH calls that
+Ganeti makes when adding a node.
+
 Example::
 
     # gnt-node add node5.example.com
@@ -339,6 +344,7 @@
 | [\--node-powered=``yes|no``]
 | [\--hypervisor-state *hvstate*]
 | [\--disk-state *diskstate*]
+| [\--verbose] [\--debug]
 | {*node*}
 
 This command changes the role of the node. Each options takes
@@ -372,6 +378,11 @@
 ``--force`` is needed as well, and the target node for the first change
 must be the master.
 
+The options ``--verbose`` and ``--debug`` control the log level of the
+operation, in particular the one of the underlying SSH calls that
+Ganeti makes when modifying some parameters of a node (e.g. promoting
+or demoting a node to or from 'master candidate' status).
+
 See **ganeti**\(7) for a description of ``--submit`` and other common
 options.
 
@@ -383,11 +394,16 @@
 REMOVE
 ~~~~~~
 
-**remove** {*nodename*}
+**remove** [\--verbose] [\--debug] {*nodename*}
 
 Removes a node from the cluster. Instances must be removed or
 migrated to another cluster before.
 
+The options ``--verbose`` and ``--debug`` control the log level of the
+operation, in particular the one of the underlying SSH calls that
+Ganeti makes when removing a node.
+
+
 Example::
 
     # gnt-node remove node5.example.com
@@ -652,6 +668,23 @@
 output lines. ``--sync`` forces the opcode to acquire the node lock(s)
 in exclusive mode.
 
+REPAIR-COMMAND
+~~~~~~~~~~~~~~~~~~
+
+| **repair-command** { \--input *input* } *command* *node*
+
+Executes a repair command. Repair commands reside in
+``@SYSCONFDIR@/ganeti/node-repair-commands`` on a node, either as a regular
+file or as a symlink. The directory must be owned by root and not be
+world- or group-writable. If a command fails verification or otherwise
+fails to start, the node daemon log must be consulted for more detailed
+information.
+
+Example for running a command::
+
+    # gnt-node repair-command --input "input string" \
+      mycommand node.example.com
+
 Tags
 ~~~~
 
diff --git a/man/hbal.rst b/man/hbal.rst
index ec2e3d1..9910de1 100644
--- a/man/hbal.rst
+++ b/man/hbal.rst
@@ -28,12 +28,15 @@
 **[ -g *delta* ]** **[ \--min-gain-limit *threshold* ]**
 **[ -O *name...* ]**
 **[ \--no-disk-moves ]**
+**[ \--avoid-disk-moves *factor* ]**
 **[ \--no-instance-moves ]**
 **[ -U *util-file* ]**
+**[ \--idle-default ]**
 **[ \--ignore-dynu ]**
 **[ \--ignore-soft-errors ]**
 **[ \--mond *yes|no* ]**
 **[ \--mond-xen ]**
+**[ \--mond-kvm-rss ]**
 **[ \--exit-on-missing-mond-data ]**
 **[ \--evac-mode ]**
 **[ \--restricted-migration ]**
@@ -363,6 +366,12 @@
   a much quicker balancing, but of course the improvements are
   limited. It is up to the user to decide when to use one or another.
 
+\--avoid-disk-moves=*factor*
+  This parameter prevents hbal from making disk moves that are not
+  profitable enough. During each balancing step it will admit a disk move
+  only if the gain in the cluster metrics is *factor* times higher than
+  the gain achievable without disk moves.
+
 \--no-instance-moves
   This parameter prevents hbal from using instance moves
   (i.e. "gnt-instance migrate/failover") operations. This will only use
@@ -414,6 +423,13 @@
   metrics and thus the influence of the dynamic utilisation will be
   practically insignificant.
 
+\--idle-default
+  If given, all dynamic utilisation information not provided explicitly
+  by the ``-U`` option or by the MonDs, if ``--mond`` is given, will be
+  assumed to be 0. Note that without this option the default assumption
+  about utilization will apply for the unspecified resources, which is 1.0,
+  i.e., full load, for every instance.
+
 \--ignore-dynu
   If given, all dynamic utilisation information will be ignored by
   assuming it to be 0. This option will take precedence over any data
@@ -448,6 +464,14 @@
   If given, also query Xen-specific collectors from MonD, provided
   that monitoring daemons are queried at all.
 
+\--mond-kvm-rss
+  If given, also query the resident set size for kvm instances, provided
+  that monitoring daemons are queried at all.
+
+\--mem-weight=*factor*
+  Scale the weight of the dynamic memory utilization in the cluster metrics
+  by the given factor.
+
 \--exit-on-missing-mond-data
   If given, abort if the data obtainable from querying MonDs is incomplete.
   The default behavior is to continue with a best guess based on the static
diff --git a/man/htools.rst b/man/htools.rst
index f1ff44b..cdf3c8d 100644
--- a/man/htools.rst
+++ b/man/htools.rst
@@ -224,6 +224,7 @@
   - disk templates
   - vcpu ratio
   - spindle ratio
+  - memory ratio (optional)
 
 \--mond=*yes|no*
   If given the program will query all MonDs to fetch data from the
diff --git a/qa/qa_cluster.py b/qa/qa_cluster.py
index 2199d00..9105018 100644
--- a/qa/qa_cluster.py
+++ b/qa/qa_cluster.py
@@ -1450,14 +1450,6 @@
     nodes = qa_config.AcquireManyNodes(n)
     live_instances.append(cf(nodes))
 
-  # 2.16 only - prior to performing a downgrade, we have to make sure that the
-  # SSH keys used are such that the lower version can still use them,
-  # regardless of cluster defaults.
-  if constants.VERSION_MINOR != 16:
-    raise qa_error.Error("Please remove the key type downgrade code in 2.17")
-  AssertCommand(["gnt-cluster", "renew-crypto", "--no-ssh-key-check", "-f",
-                 "--new-ssh-keys", "--ssh-key-type=dsa"])
-
   AssertRedirectedCommand(["gnt-cluster", "upgrade", "--to", other_version])
   AssertRedirectedCommand(["gnt-cluster", "verify"])
 
diff --git a/qa/qa_node.py b/qa/qa_node.py
index 1ed6bbe..55af8b8 100644
--- a/qa/qa_node.py
+++ b/qa/qa_node.py
@@ -93,6 +93,31 @@
     if node != master:
       NodeAdd(node, readd=False)
 
+  for node in qa_config.get("nodes"):
+    def GetNonStartDaemons():
+      cmd = utils.ShellQuoteArgs(["ps", "-Ao", "comm"])
+      prcs = AssertCommand(cmd, node=node)[1]
+
+      non_start_daemons = []
+
+      def AddIfNotStarted(daemon):
+        if daemon not in prcs:
+          non_start_daemons.append(daemon)
+
+      AddIfNotStarted('ganeti-noded')
+      if constants.ENABLE_MOND:
+        AddIfNotStarted('ganeti-mond')
+      if node == master:
+        AddIfNotStarted('ganeti-wconfd')
+        AddIfNotStarted('ganeti-rapi')
+        AddIfNotStarted('ganeti-luxid')
+        AddIfNotStarted('ganeti-maintd')
+      return non_start_daemons
+
+    nsd = GetNonStartDaemons()
+    for daemon in nsd:
+      raise qa_error.Error(daemon + ' is not running at %s' % node.primary)
+
 
 def MarkNodeAddedAll():
   """Mark all nodes as added.
diff --git a/qa/qa_rapi.py b/qa/qa_rapi.py
index a008247..9830066 100644
--- a/qa/qa_rapi.py
+++ b/qa/qa_rapi.py
@@ -729,7 +729,8 @@
   # Identifying the node - RAPI provides these itself
   IDENTIFIERS = ["node_name", "node_uuid"]
   # As the name states, these can be set but not retrieved yet
-  NOT_EXPOSED_YET = ["hv_state", "disk_state", "auto_promote"]
+  NOT_EXPOSED_YET = ["hv_state", "disk_state", "auto_promote",
+                     "debug", "verbose"]
 
   _DoGetPutTests("/2/nodes/%s" % node.primary,
                  "/2/nodes/%s/modify" % node.primary,
diff --git a/src/Ganeti/BasicTypes.hs b/src/Ganeti/BasicTypes.hs
index 15a26a3..caec414 100644
--- a/src/Ganeti/BasicTypes.hs
+++ b/src/Ganeti/BasicTypes.hs
@@ -8,7 +8,7 @@
 
 {-
 
-Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
+Copyright (C) 2009, 2010, 2011, 2012, 2015 Google Inc.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -42,6 +42,8 @@
   , Result
   , ResultT(..)
   , mkResultT
+  , mkResultT'
+  , mkResultTEither
   , withError
   , withErrorT
   , toError
@@ -50,6 +52,7 @@
   , tryError
   , Error(..) -- re-export from Control.Monad.Error
   , MonadIO(..) -- re-export from Control.Monad.IO.Class
+  , FromString(..)
   , isOk
   , isBad
   , justOk
@@ -75,8 +78,12 @@
   , compareNameComponent
   , ListSet(..)
   , emptyListSet
+  , Down(..)
   ) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Applicative
 import Control.Exception (try)
 import Control.Monad
@@ -85,13 +92,15 @@
 import Control.Monad.Trans
 import Control.Monad.Trans.Control
 import Data.Function
-import Data.List
+import Data.List (find, isPrefixOf)
 import Data.Maybe
-import Data.Monoid
 import Data.Set (Set)
 import qualified Data.Set as Set (empty)
 import Text.JSON (JSON)
 import qualified Text.JSON as JSON (readJSON, showJSON)
+#if MIN_VERSION_base(4,6,0)
+import Data.Ord
+#endif
 
 -- Remove after we require >= 1.8.58
 -- See: https://github.com/ndmitchell/hlint/issues/24
@@ -112,26 +121,42 @@
 -- | Type alias for a string Result.
 type Result = GenericResult String
 
+-- | Type class for things that can be built from strings.
+class FromString a where
+  mkFromString :: String -> a
+
+-- | Trivial 'String' instance; requires FlexibleInstances extension
+-- though.
+instance FromString [Char] where
+  mkFromString = id
+
+instance FromString IOError where
+  mkFromString = userError
+
 -- | 'Monad' instance for 'GenericResult'.
-instance (Error a) => Monad (GenericResult a) where
+instance (FromString a) => Monad (GenericResult a) where
   (>>=) (Bad x) _ = Bad x
   (>>=) (Ok x) fn = fn x
   return = Ok
-  fail   = Bad . strMsg
+  fail   = Bad . mkFromString
 
 instance Functor (GenericResult a) where
   fmap _ (Bad msg) = Bad msg
   fmap fn (Ok val) = Ok (fn val)
 
-instance (Error a, Monoid a) => MonadPlus (GenericResult a) where
-  mzero = Bad $ strMsg "zero Result when used as MonadPlus"
+instance (FromString a, Monoid a) => Alternative (GenericResult a) where
+  empty = Bad $ mkFromString "zero Result when used as empty"
   -- for mplus, when we 'add' two Bad values, we concatenate their
   -- error descriptions
-  (Bad x) `mplus` (Bad y) = Bad (x `mappend` strMsg "; " `mappend` y)
-  (Bad _) `mplus` x = x
-  x@(Ok _) `mplus` _ = x
+  (Bad x) <|> (Bad y) = Bad (x `mappend` mkFromString "; " `mappend` y)
+  (Bad _) <|> x = x
+  x@(Ok _) <|> _ = x
 
-instance (Error a) => MonadError a (GenericResult a) where
+instance (FromString a, Monoid a) => MonadPlus (GenericResult a) where
+  mzero = empty
+  mplus = (<|>)
+
+instance (FromString a) => MonadError a (GenericResult a) where
   throwError = Bad
   {-# INLINE throwError #-}
   catchError x h = genericResult h (const x) x
@@ -143,10 +168,6 @@
   _       <*> (Bad x) = Bad x
   (Ok f)  <*> (Ok x)  = Ok $ f x
 
-instance (Error a, Monoid a) => Alternative (GenericResult a) where
-  empty = mzero
-  (<|>) = mplus
-
 -- | This is a monad transformation for Result. It's implementation is
 -- based on the implementations of MaybeT and ErrorT.
 --
@@ -154,7 +175,6 @@
 -- If 'mplus' combines two failing operations, errors of both of them
 -- are combined.
 newtype ResultT a m b = ResultT {runResultT :: m (GenericResult a b)}
-  deriving (Functor)
 
 -- | Eliminates a 'ResultT' value given appropriate continuations
 elimResultT :: (Monad m)
@@ -168,16 +188,19 @@
     result (Bad e)  = l e
 {-# INLINE elimResultT #-}
 
-instance (Applicative m, Monad m, Error a) => Applicative (ResultT a m) where
+instance (Monad m) => Functor (ResultT a m) where
+  fmap f = ResultT . liftM (fmap f) . runResultT
+
+instance (Monad m, FromString a) => Applicative (ResultT a m) where
   pure = return
   (<*>) = ap
 
-instance (Monad m, Error a) => Monad (ResultT a m) where
-  fail err = ResultT (return . Bad $ strMsg err)
+instance (Monad m, FromString a) => Monad (ResultT a m) where
+  fail err = ResultT (return . Bad $ mkFromString err)
   return   = lift . return
   (>>=)    = flip (elimResultT throwError)
 
-instance (Monad m, Error a) => MonadError a (ResultT a m) where
+instance (Monad m, FromString a) => MonadError a (ResultT a m) where
   throwError = ResultT . return . Bad
   catchError = catchErrorT
 
@@ -185,24 +208,24 @@
   lift = ResultT . liftM Ok
 
 -- | The instance catches any 'IOError' using 'try' and converts it into an
--- error message using 'strMsg'.
+-- error message using 'mkFromString'.
 --
 -- This way, monadic code within 'ResultT' that uses solely 'liftIO' to
 -- include 'IO' actions ensures that all IO exceptions are handled.
 --
 -- Other exceptions (see instances of 'Exception') are not currently handled.
 -- This might be revised in the future.
-instance (MonadIO m, Error a) => MonadIO (ResultT a m) where
+instance (MonadIO m, FromString a) => MonadIO (ResultT a m) where
   liftIO = ResultT . liftIO
                    . liftM (either (failError . show) return)
                    . (try :: IO a -> IO (Either IOError a))
 
-instance (MonadBase IO m, Error a) => MonadBase IO (ResultT a m) where
+instance (MonadBase IO m, FromString a) => MonadBase IO (ResultT a m) where
   liftBase = ResultT . liftBase
                    . liftM (either (failError . show) return)
                    . (try :: IO a -> IO (Either IOError a))
 
-instance (Error a) => MonadTransControl (ResultT a) where
+instance (FromString a) => MonadTransControl (ResultT a) where
 #if MIN_VERSION_monad_control(1,0,0)
 -- Needs Undecidable instances
   type StT (ResultT a) b = GenericResult a b
@@ -216,7 +239,7 @@
   {-# INLINE liftWith #-}
   {-# INLINE restoreT #-}
 
-instance (Error a, MonadBaseControl IO m)
+instance (FromString a, MonadBaseControl IO m)
          => MonadBaseControl IO (ResultT a m) where
 #if MIN_VERSION_monad_control(1,0,0)
 -- Needs Undecidable instances
@@ -233,17 +256,18 @@
   {-# INLINE liftBaseWith #-}
   {-# INLINE restoreM #-}
 
-instance (Monad m, Error a, Monoid a) => MonadPlus (ResultT a m) where
-  mzero = ResultT $ return mzero
+instance (Monad m, FromString a, Monoid a)
+         => Alternative (ResultT a m) where
+  empty = ResultT $ return mzero
   -- Ensure that 'y' isn't run if 'x' contains a value. This makes it a bit
   -- more complicated than 'mplus' of 'GenericResult'.
-  mplus x y = elimResultT combine return x
+  x <|> y = elimResultT combine return x
     where combine x' = ResultT $ liftM (mplus (Bad x')) (runResultT y)
 
-instance (Alternative m, Monad m, Error a, Monoid a)
-         => Alternative (ResultT a m) where
-  empty = mzero
-  (<|>) = mplus
+instance (Monad m, FromString a, Monoid a)
+         => MonadPlus (ResultT a m) where
+  mzero = empty
+  mplus = (<|>)
 
 -- | Changes the error message of a result value, if present.
 -- Note that since 'GenericResult' is also a 'MonadError', this function
@@ -253,7 +277,7 @@
 withError f = genericResult (throwError . f) return
 
 -- | Changes the error message of a @ResultT@ value, if present.
-withErrorT :: (Monad m, Error e)
+withErrorT :: (Monad m, FromString e)
            => (e' -> e) -> ResultT e' m a -> ResultT e m a
 withErrorT f = ResultT . liftM (withError f) . runResultT
 
@@ -269,10 +293,10 @@
 toErrorBase = (toError =<<) . liftBase . runResultT
 {-# INLINE toErrorBase #-}
 
--- | An alias for @withError strMsg@, which is often used to lift a pure error
--- to a monad stack. See also 'annotateResult'.
-toErrorStr :: (MonadError e m, Error e) => Result a -> m a
-toErrorStr = withError strMsg
+-- | An alias for @withError mkFromString@, which is often
+-- used to lift a pure error to a monad stack. See also 'annotateResult'.
+toErrorStr :: (MonadError e m, FromString e) => Result a -> m a
+toErrorStr = withError mkFromString
 
 -- | Run a given computation and if an error occurs, return it as `Left` of
 -- `Either`.
@@ -289,9 +313,19 @@
 -- should be handled by the given action.
 --
 -- See also 'toErrorStr'.
-mkResultT :: (Monad m, Error e) => m (Result a) -> ResultT e m a
+mkResultT :: (Monad m, FromString e) => m (Result a) -> ResultT e m a
 mkResultT = ResultT . liftM toErrorStr
 
+-- | Generalisation of mkResultT accepting any showable failures.
+mkResultT' :: (Monad m, FromString e, Show s)
+           => m (GenericResult s a) -> ResultT e m a
+mkResultT' = mkResultT . liftM (genericResult (Bad . show) Ok)
+
+-- | Like mkResultT', but for an 'Either' with a showable 'Left' failure.
+mkResultTEither :: (Monad m, FromString e, Show s)
+           => m (Either s a) -> ResultT e m a
+mkResultTEither = mkResultT . liftM (either (Bad . show) Ok)
+
 -- | Simple checker for whether a 'GenericResult' is OK.
 isOk :: GenericResult a b -> Bool
 isOk (Ok _) = True
@@ -329,32 +363,33 @@
 -- 'MonadError'. Since 'Result' is an instance of 'MonadError' itself,
 -- it's a generalization of type @String -> Result a -> Result a@.
 -- See also 'toErrorStr'.
-annotateResult :: (MonadError e m, Error e) => String -> Result a -> m a
+annotateResult :: (MonadError e m, FromString e) => String -> Result a -> m a
 annotateResult owner = toErrorStr . annotateError owner
 
 -- | Annotate an error with an ownership information inside a 'MonadError'.
 -- See also 'annotateResult'.
-annotateError :: (MonadError e m, Error e, Monoid e) => String -> m a -> m a
+annotateError :: (MonadError e m, FromString e, Monoid e)
+              => String -> m a -> m a
 annotateError owner =
-  flip catchError (throwError . mappend (strMsg $ owner ++ ": "))
+  flip catchError (throwError . mappend (mkFromString $ owner ++ ": "))
 {-# INLINE annotateError #-}
 
 -- | Throws a 'String' message as an error in a 'MonadError'.
 -- This is a generalization of 'Bad'.
 -- It's similar to 'fail', but works within a 'MonadError', avoiding the
 -- unsafe nature of 'fail'.
-failError :: (MonadError e m, Error e) => String -> m a
-failError = throwError . strMsg
+failError :: (MonadError e m, FromString e) => String -> m a
+failError = throwError . mkFromString
 
 -- | A synonym for @flip@ 'catchErrorT'.
-handleErrorT :: (Monad m, Error e)
+handleErrorT :: (Monad m, FromString e)
              => (e' -> ResultT e m a) -> ResultT e' m a -> ResultT e m a
 handleErrorT handler = elimResultT handler return
 {-# INLINE handleErrorT #-}
 
 -- | Catches an error in a @ResultT@ value. This is similar to 'catchError',
 -- but in addition allows to change the error type.
-catchErrorT :: (Monad m, Error e)
+catchErrorT :: (Monad m, FromString e)
             => ResultT e' m a -> (e' -> ResultT e m a) -> ResultT e m a
 catchErrorT = flip handleErrorT
 {-# INLINE catchErrorT #-}
@@ -471,3 +506,52 @@
 
 emptyListSet :: ListSet a
 emptyListSet = ListSet Set.empty
+
+#if MIN_VERSION_base(4,6,0)
+-- Down already defined in Data.Ord
+#else
+-- Copyright   :  (c) The University of Glasgow 2005
+-- License     :  BSD-style
+
+newtype Down a = Down a deriving (Eq, Show, Read)
+
+instance Ord a => Ord (Down a) where
+    compare (Down x) (Down y) = y `compare` x
+
+{- License text of the above code fragment:
+
+The Glasgow Haskell Compiler License
+
+Copyright 2004, The University Court of the University of Glasgow.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+- Neither name of the University nor the names of its contributors may be
+used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY COURT OF THE UNIVERSITY OF
+GLASGOW AND THE CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+UNIVERSITY COURT OF THE UNIVERSITY OF GLASGOW OR THE CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGE.
+
+-}
+
+#endif
diff --git a/src/Ganeti/Codec.hs b/src/Ganeti/Codec.hs
index 404c70b..6f54c0d 100644
--- a/src/Ganeti/Codec.hs
+++ b/src/Ganeti/Codec.hs
@@ -37,12 +37,17 @@
   , decompressZlib
   ) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Codec.Compression.Zlib
 import qualified Codec.Compression.Zlib.Internal as I
-import Control.Monad.Error
+import Control.Monad (liftM)
+import Control.Monad.Error.Class (MonadError(..))
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.ByteString.Lazy.Internal as BL
-import Data.Monoid (mempty)
+
+import Ganeti.BasicTypes
 
 
 -- | Compresses a lazy bytestring.
@@ -52,11 +57,12 @@
 
 -- | Decompresses a lazy bytestring, throwing decoding errors using
 -- 'throwError'.
-decompressZlib :: (MonadError e m, Error e) => BL.ByteString -> m BL.ByteString
+decompressZlib :: (MonadError e m, FromString e)
+               => BL.ByteString -> m BL.ByteString
 decompressZlib = I.foldDecompressStream
                      (liftM . BL.chunk)
                      (return mempty)
-                     (const $ throwError . strMsg . ("Zlib: " ++))
+                     (const $ throwError . mkFromString . ("Zlib: " ++))
                  . I.decompressWithErrors
                      I.zlibFormat
                      I.defaultDecompressParams
diff --git a/src/Ganeti/Confd/Client.hs b/src/Ganeti/Confd/Client.hs
index 49ab5fd..ae77090 100644
--- a/src/Ganeti/Confd/Client.hs
+++ b/src/Ganeti/Confd/Client.hs
@@ -82,7 +82,7 @@
       hmac = hmacKey client
       jobs = map (queryOneServer semaphore answer crType cQuery hmac) dest
       watchdog reqAnswers = do
-        threadDelay $ 1000000 * C.confdClientExpireTimeout
+        threadDelaySeconds C.confdClientExpireTimeout
         _ <- swapMVar reqAnswers 0
         putMVar semaphore ()
       waitForResult reqAnswers = do
diff --git a/src/Ganeti/Confd/ClientFunctions.hs b/src/Ganeti/Confd/ClientFunctions.hs
index 3213669..a119d99 100644
--- a/src/Ganeti/Confd/ClientFunctions.hs
+++ b/src/Ganeti/Confd/ClientFunctions.hs
@@ -35,6 +35,7 @@
 module Ganeti.Confd.ClientFunctions
   ( getInstances
   , getInstanceDisks
+  , getDiagnoseCollectorFilename
   ) where
 
 import Control.Monad (liftM)
@@ -89,3 +90,15 @@
 getInstanceDisks node srvAddr srvPort =
   liftM (uncurry (++)) (getInstances node srvAddr srvPort) >>=
     mapM (\i -> liftM ((,) i) (getDisks i srvAddr srvPort))
+
+-- | Get the name of the diagnose collector.
+getDiagnoseCollectorFilename
+  :: Maybe String -> Maybe Int -> BT.ResultT String IO String
+getDiagnoseCollectorFilename srvAddr srvPort = do
+  client <- liftIO $ getConfdClient srvAddr srvPort
+  reply <- liftIO . query client ReqConfigQuery
+             $ PlainQuery "/cluster/diagnose_data_collector_filename"
+  case fmap (J.readJSON . confdReplyAnswer) reply of
+    Just (J.Ok filename) -> return filename
+    Just (J.Error msg) -> fail msg
+    Nothing -> fail "No answer from the Confd server"
diff --git a/src/Ganeti/Confd/Server.hs b/src/Ganeti/Confd/Server.hs
index b32eb70..a2ec0a9 100644
--- a/src/Ganeti/Confd/Server.hs
+++ b/src/Ganeti/Confd/Server.hs
@@ -40,7 +40,9 @@
   , prepMain
   ) where
 
-import Control.Applicative((<$>))
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Concurrent
 import Control.Monad (forever, liftM)
 import Data.IORef
diff --git a/src/Ganeti/Confd/Utils.hs b/src/Ganeti/Confd/Utils.hs
index afb8e4f..ba5585f 100644
--- a/src/Ganeti/Confd/Utils.hs
+++ b/src/Ganeti/Confd/Utils.hs
@@ -47,7 +47,9 @@
 
 import qualified Data.Attoparsec.Text as P
 
-import Control.Applicative ((*>))
+import Prelude ()
+import Ganeti.Prelude
+
 import qualified Data.ByteString as B
 import Data.Text (pack)
 import qualified Text.JSON as J
diff --git a/src/Ganeti/Config.hs b/src/Ganeti/Config.hs
index 4d0e5a0..5687b54 100644
--- a/src/Ganeti/Config.hs
+++ b/src/Ganeti/Config.hs
@@ -68,6 +68,7 @@
     , getInstDisksFromObj
     , getDrbdMinorsForDisk
     , getDrbdMinorsForInstance
+    , getFilledHvStateParams
     , getFilledInstHvParams
     , getFilledInstBeParams
     , getFilledInstOsParams
@@ -82,10 +83,11 @@
     , instNodes
     ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Arrow ((&&&))
-import Control.Monad
-import Control.Monad.State
+import Control.Monad (liftM)
 import qualified Data.ByteString as BS
 import qualified Data.ByteString.UTF8 as UTF8
 import qualified Data.Foldable as F
@@ -99,6 +101,7 @@
 
 import Ganeti.BasicTypes
 import qualified Ganeti.Constants as C
+import qualified Ganeti.ConstantUtils as CU
 import Ganeti.Errors
 import Ganeti.JSON (fromJResult, fromContainer, GenericContainer(..))
 import Ganeti.Objects
@@ -364,6 +367,36 @@
       ginsts = map (getNodeInstances cfg) gnodes in
   (concatMap fst ginsts, concatMap snd ginsts)
 
+-- | default FilledHvStateParams.
+defaultHvStateParams :: FilledHvStateParams
+defaultHvStateParams = FilledHvStateParams
+  { hvstateCpuNode  = CU.hvstDefaultCpuNode
+  , hvstateCpuTotal = CU.hvstDefaultCpuTotal
+  , hvstateMemHv    = CU.hvstDefaultMemoryHv
+  , hvstateMemNode  = CU.hvstDefaultMemoryNode
+  , hvstateMemTotal = CU.hvstDefaultMemoryTotal
+  }
+
+-- | Retrieves the node's static hypervisor state parameters, missing values
+-- filled with group's parameters, missing group parameters are filled
+-- with cluster's parameters. Currently, returns hvstate parameters only for
+-- the default hypervisor.
+getFilledHvStateParams :: ConfigData -> Node -> FilledHvState
+getFilledHvStateParams cfg n =
+  let cluster_hv_state =
+        fromContainer . clusterHvStateStatic $ configCluster cfg
+      def_hv = getDefaultHypervisor cfg
+      cluster_fv = fromMaybe defaultHvStateParams $ M.lookup def_hv
+                                                    cluster_hv_state
+      group_fv = case getGroupOfNode cfg n >>=
+                      M.lookup def_hv . fromContainer . groupHvStateStatic of
+                   Just pv -> fillParams cluster_fv pv
+                   Nothing -> cluster_fv
+      node_fv = case M.lookup def_hv . fromContainer $ nodeHvStateStatic n of
+                      Just pv -> fillParams group_fv pv
+                      Nothing -> group_fv
+  in GenericContainer $ M.fromList [(def_hv, node_fv)]
+
 -- | Retrieves the instance hypervisor params, missing values filled with
 -- cluster defaults.
 getFilledInstHvParams :: [String] -> ConfigData -> Instance -> HvParams
diff --git a/src/Ganeti/ConstantUtils.hs b/src/Ganeti/ConstantUtils.hs
index 6a61cf2..dc966d6 100644
--- a/src/Ganeti/ConstantUtils.hs
+++ b/src/Ganeti/ConstantUtils.hs
@@ -37,8 +37,10 @@
 -}
 module Ganeti.ConstantUtils where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Data.Char (ord)
-import Data.Monoid (Monoid(..))
 import Data.Set (Set)
 import qualified Data.Set as Set (difference, fromList, toList, union)
 
@@ -204,8 +206,31 @@
 ipolicySpindleRatio :: String
 ipolicySpindleRatio = "spindle-ratio"
 
+ipolicyMemoryRatio :: String
+ipolicyMemoryRatio = "memory-ratio"
+
 ipolicyDefaultsVcpuRatio :: Double
 ipolicyDefaultsVcpuRatio = 4.0
 
 ipolicyDefaultsSpindleRatio :: Double
 ipolicyDefaultsSpindleRatio = 32.0
+
+ipolicyDefaultsMemoryRatio :: Double
+ipolicyDefaultsMemoryRatio = 1.0
+
+-- * Hypervisor state default parameters
+
+hvstDefaultCpuNode :: Int
+hvstDefaultCpuNode = 1
+
+hvstDefaultCpuTotal :: Int
+hvstDefaultCpuTotal = 1
+
+hvstDefaultMemoryHv :: Int
+hvstDefaultMemoryHv = 1024
+
+hvstDefaultMemoryTotal :: Int
+hvstDefaultMemoryTotal = 1024
+
+hvstDefaultMemoryNode :: Int
+hvstDefaultMemoryNode = 4096
diff --git a/src/Ganeti/Constants.hs b/src/Ganeti/Constants.hs
index 420ccb6..13bff2e 100644
--- a/src/Ganeti/Constants.hs
+++ b/src/Ganeti/Constants.hs
@@ -367,6 +367,9 @@
 mond :: String
 mond = Runtime.daemonName GanetiMond
 
+maintd :: String
+maintd = Runtime.daemonName GanetiMaintd
+
 noded :: String
 noded = Runtime.daemonName GanetiNoded
 
@@ -398,6 +401,9 @@
 defaultMondPort :: Int
 defaultMondPort = 1815
 
+defaultMaintdPort :: Int
+defaultMaintdPort = 1816
+
 defaultMetadPort :: Int
 defaultMetadPort = 80
 
@@ -413,6 +419,7 @@
   [ (confd, (Udp, defaultConfdPort))
   , (metad, (Tcp, defaultMetadPort))
   , (mond, (Tcp, defaultMondPort))
+  , (maintd, (Tcp, defaultMaintdPort))
   , (noded, (Tcp, defaultNodedPort))
   , (rapi, (Tcp, defaultRapiPort))
   , (ssh, (Tcp, 22))
@@ -2028,11 +2035,12 @@
 hvstDefaults :: Map String Int
 hvstDefaults =
   Map.fromList
-  [(hvstCpuNode, 1),
-   (hvstCpuTotal, 1),
-   (hvstMemoryHv, 0),
-   (hvstMemoryTotal, 0),
-   (hvstMemoryNode, 0)]
+  [ (hvstCpuNode    , ConstantUtils.hvstDefaultCpuNode    )
+  , (hvstCpuTotal   , ConstantUtils.hvstDefaultCpuTotal   )
+  , (hvstMemoryHv   , ConstantUtils.hvstDefaultMemoryHv   )
+  , (hvstMemoryTotal, ConstantUtils.hvstDefaultMemoryTotal)
+  , (hvstMemoryNode , ConstantUtils.hvstDefaultMemoryNode )
+  ]
 
 hvstsParameterTypes :: Map String VType
 hvstsParameterTypes =
@@ -2187,13 +2195,17 @@
 ipolicySpindleRatio :: String
 ipolicySpindleRatio = ConstantUtils.ipolicySpindleRatio
 
+ipolicyMemoryRatio :: String
+ipolicyMemoryRatio = ConstantUtils.ipolicyMemoryRatio
+
 ispecsMinmaxKeys :: FrozenSet String
 ispecsMinmaxKeys = ConstantUtils.mkSet [ispecsMax, ispecsMin]
 
 ipolicyParameters :: FrozenSet String
 ipolicyParameters =
   ConstantUtils.mkSet [ConstantUtils.ipolicyVcpuRatio,
-                       ConstantUtils.ipolicySpindleRatio]
+                       ConstantUtils.ipolicySpindleRatio,
+                       ConstantUtils.ipolicyMemoryRatio]
 
 ipolicyAllKeys :: FrozenSet String
 ipolicyAllKeys =
@@ -4323,8 +4335,9 @@
                                      , (ispecSpindleUse, 1)
                                      ] :: Map String Int))
   , (ipolicyDts,          PyValueEx (ConstantUtils.toList diskTemplates))
-  , (ipolicyVcpuRatio,    PyValueEx (4.0 :: Double))
-  , (ipolicySpindleRatio, PyValueEx (32.0 :: Double))
+  , (ipolicyVcpuRatio,    PyValueEx ConstantUtils.ipolicyDefaultsVcpuRatio)
+  , (ipolicySpindleRatio, PyValueEx ConstantUtils.ipolicyDefaultsSpindleRatio)
+  , (ipolicyMemoryRatio,  PyValueEx ConstantUtils.ipolicyDefaultsMemoryRatio)
   ]
 
 masterPoolSizeDefault :: Int
@@ -4816,6 +4829,9 @@
 ndsSsconf :: String
 ndsSsconf = "ssconf"
 
+ndsHmac :: String
+ndsHmac = "hmac_key"
+
 ndsStartNodeDaemon :: String
 ndsStartNodeDaemon = "start_node_daemon"
 
@@ -4856,6 +4872,9 @@
 opcodeReasonSrcNoded :: String
 opcodeReasonSrcNoded = _opcodeReasonSrcDaemon ++ ":noded"
 
+opcodeReasonSrcMaintd :: String
+opcodeReasonSrcMaintd = _opcodeReasonSrcDaemon ++ ":maintd"
+
 opcodeReasonSrcOpcode :: String
 opcodeReasonSrcOpcode = "gnt:opcode"
 
@@ -4978,6 +4997,12 @@
 mondDefaultCategory :: String
 mondDefaultCategory = "default"
 
+-- * Maintenance daemon
+
+-- | Default wait time, in seconds, between maintenance rounds.
+maintdDefaultRoundDelay :: Int
+maintdDefaultRoundDelay = 300
+
 -- * Disk access modes
 
 diskUserspace :: String
@@ -5466,9 +5491,17 @@
 dataCollectorLv         :: String
 dataCollectorLv         = "lv"
 
+-- | Collector for the resident set size of kvm processes, i.e.,
+-- the number of pages the kvm process has in RAM.
+dataCollectorKvmRSS     :: String
+dataCollectorKvmRSS     = "kvm-inst-rss"
+
 dataCollectorInstStatus :: String
 dataCollectorInstStatus = "inst-status-xen"
 
+dataCollectorDiagnose :: String
+dataCollectorDiagnose = "diagnose"
+
 dataCollectorParameterInterval :: String
 dataCollectorParameterInterval = "interval"
 
@@ -5480,6 +5513,8 @@
                       , dataCollectorLv
                       , dataCollectorInstStatus
                       , dataCollectorXenCpuLoad
+                      , dataCollectorKvmRSS
+                      , dataCollectorDiagnose
                       ]
 
 dataCollectorStateActive :: String
@@ -5491,11 +5526,25 @@
 dataCollectorsIntervalName :: String
 dataCollectorsIntervalName = "data_collector_interval"
 
+dataCollectorDiagnoseDirectory :: String
+dataCollectorDiagnoseDirectory = sysconfdir ++ "/ganeti/node-diagnose-commands"
+
 -- * HTools tag prefixes
 
 exTagsPrefix :: String
 exTagsPrefix = Tags.exTagsPrefix
 
+-- * MaintD tag prefixes
+
+maintdPrefix :: String
+maintdPrefix = "maintd:"
+
+maintdSuccessTagPrefix :: String
+maintdSuccessTagPrefix = maintdPrefix ++ "repairready:"
+
+maintdFailureTagPrefix :: String
+maintdFailureTagPrefix = maintdPrefix ++ "repairfailed:"
+
 -- | The polling frequency to wait for a job status change
 cliWfjcFrequency :: Int
 cliWfjcFrequency = 20
@@ -5503,3 +5552,4 @@
 -- | Default 'WaitForJobChange' timeout in seconds
 defaultWfjcTimeout :: Int
 defaultWfjcTimeout = 60
+
diff --git a/src/Ganeti/Cpu/LoadParser.hs b/src/Ganeti/Cpu/LoadParser.hs
index 7be0759..e2ffa01 100644
--- a/src/Ganeti/Cpu/LoadParser.hs
+++ b/src/Ganeti/Cpu/LoadParser.hs
@@ -36,7 +36,10 @@
 -}
 module Ganeti.Cpu.LoadParser (cpustatParser) where
 
-import Control.Applicative ((<*>), (<*), (*>), (<$>), (<|>))
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative ((<|>))
 import qualified Data.Attoparsec.Text as A
 import qualified Data.Attoparsec.Combinator as AC
 import Data.Attoparsec.Text (Parser)
@@ -50,16 +53,16 @@
 oneCPUstatParser :: Parser CPUstat
 oneCPUstatParser =
   let nameP = stringP
-      userP = numberP
-      niceP = numberP
-      systemP = numberP
-      idleP = numberP
-      iowaitP = numberP
-      irqP = numberP
-      softirqP = numberP
-      stealP = numberP
-      guestP = numberP
-      guest_niceP = numberP
+      userP = integerP
+      niceP = integerP
+      systemP = integerP
+      idleP = integerP
+      iowaitP = integerP
+      irqP = integerP
+      softirqP = integerP
+      stealP = integerP
+      guestP = integerP
+      guest_niceP = integerP
   in
     CPUstat <$> nameP <*> userP <*> niceP <*> systemP <*> idleP <*> iowaitP
             <*> irqP <*> softirqP <*> stealP <*> guestP <*> guest_niceP
diff --git a/src/Ganeti/Cpu/Types.hs b/src/Ganeti/Cpu/Types.hs
index cc67e4d..5786435 100644
--- a/src/Ganeti/Cpu/Types.hs
+++ b/src/Ganeti/Cpu/Types.hs
@@ -37,6 +37,7 @@
 module Ganeti.Cpu.Types
   ( CPUstat(..)
   , CPUavgload(..)
+  , emptyCPUavgload
   ) where
 
 import Ganeti.THH
@@ -49,17 +50,25 @@
   , simpleField "cpu_total"  [t| Double |]
   ])
 
+-- | CPU activity of an idle node. This can be used as a default
+-- value for offline nodes.
+emptyCPUavgload :: CPUavgload
+emptyCPUavgload = CPUavgload { cavCpuNumber = 1
+                             , cavCpus = [ 0.0 ]
+                             , cavCpuTotal = 0.0
+                             }
+
 -- | This is the format of the data parsed by the input file.
 $(buildObject "CPUstat" "cs"
   [ simpleField "name"       [t| String |]
-  , simpleField "user"       [t| Int |]
-  , simpleField "nice"       [t| Int |]
-  , simpleField "system"     [t| Int |]
-  , simpleField "idle"       [t| Int |]
-  , simpleField "iowait"     [t| Int |]
-  , simpleField "irq"        [t| Int |]
-  , simpleField "softirq"    [t| Int |]
-  , simpleField "steal"      [t| Int |]
-  , simpleField "guest"      [t| Int |]
-  , simpleField "guest_nice" [t| Int |]
+  , simpleField "user"       [t| Integer |]
+  , simpleField "nice"       [t| Integer |]
+  , simpleField "system"     [t| Integer |]
+  , simpleField "idle"       [t| Integer |]
+  , simpleField "iowait"     [t| Integer |]
+  , simpleField "irq"        [t| Integer |]
+  , simpleField "softirq"    [t| Integer |]
+  , simpleField "steal"      [t| Integer |]
+  , simpleField "guest"      [t| Integer |]
+  , simpleField "guest_nice" [t| Integer |]
   ])
diff --git a/src/Ganeti/DataCollectors.hs b/src/Ganeti/DataCollectors.hs
index 33ad9cb..3c1146d 100644
--- a/src/Ganeti/DataCollectors.hs
+++ b/src/Ganeti/DataCollectors.hs
@@ -34,14 +34,18 @@
 
 module Ganeti.DataCollectors( collectors ) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import qualified Data.ByteString.UTF8 as UTF8
 import Data.Map (findWithDefault)
-import Data.Monoid (mempty)
 
 import qualified Ganeti.DataCollectors.CPUload as CPUload
+import qualified Ganeti.DataCollectors.Diagnose as Diagnose
 import qualified Ganeti.DataCollectors.Diskstats as Diskstats
 import qualified Ganeti.DataCollectors.Drbd as Drbd
 import qualified Ganeti.DataCollectors.InstStatus as InstStatus
+import qualified Ganeti.DataCollectors.KvmRSS as KvmRSS
 import qualified Ganeti.DataCollectors.Lv as Lv
 import qualified Ganeti.DataCollectors.XenCpuLoad as XenCpuLoad
 import Ganeti.DataCollectors.Types (DataCollector(..),ReportBuilder(..))
@@ -54,10 +58,12 @@
 collectors =
   [ cpuLoadCollector
   , xenCpuLoadCollector
+  , kvmRSSCollector
   , diskStatsCollector
   , drdbCollector
   , instStatusCollector
   , lvCollector
+  , diagnoseCollector
   ]
   where
     f .&&. g = \x y -> f x y && g x y
@@ -83,6 +89,9 @@
     lvCollector =
       DataCollector Lv.dcName Lv.dcCategory Lv.dcKind
         (StatelessR Lv.dcReport) Nothing activeConfig updateInterval
+    diagnoseCollector =
+      DataCollector Diagnose.dcName Diagnose.dcCategory Diagnose.dcKind
+        (StatelessR Diagnose.dcReport) Nothing activeConfig updateInterval
     cpuLoadCollector =
       DataCollector CPUload.dcName CPUload.dcCategory CPUload.dcKind
         (StatefulR CPUload.dcReport) (Just CPUload.dcUpdate) activeConfig
@@ -91,3 +100,6 @@
       DataCollector XenCpuLoad.dcName XenCpuLoad.dcCategory XenCpuLoad.dcKind
         (StatefulR XenCpuLoad.dcReport) (Just XenCpuLoad.dcUpdate) activeConfig
         updateInterval
+    kvmRSSCollector =
+      DataCollector KvmRSS.dcName KvmRSS.dcCategory KvmRSS.dcKind
+        (StatelessR KvmRSS.dcReport) Nothing activeConfig updateInterval
diff --git a/src/Ganeti/DataCollectors/CPUload.hs b/src/Ganeti/DataCollectors/CPUload.hs
index 65ac423..ca9376c 100644
--- a/src/Ganeti/DataCollectors/CPUload.hs
+++ b/src/Ganeti/DataCollectors/CPUload.hs
@@ -5,7 +5,7 @@
 
 {-
 
-Copyright (C) 2013 Google Inc.
+Copyright (C) 2013, 2016 Google Inc.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -45,6 +45,7 @@
 
 import Control.Arrow (first)
 import qualified Control.Exception as E
+import Control.Monad (liftM)
 import Data.Attoparsec.Text.Lazy as A
 import Data.Maybe (fromMaybe)
 import Data.Text.Lazy (pack, unpack)
@@ -71,8 +72,8 @@
 bufferSize = C.cpuavgloadBufferSize
 
 -- | The window size of the values that will export the average load.
-windowSize :: Integer
-windowSize = toInteger C.cpuavgloadWindowSize
+windowSizeInUSec :: Integer
+windowSizeInUSec = 1000000 * toInteger C.cpuavgloadWindowSize
 
 -- | The default setting for the maximum amount of not parsed character to
 -- print in case of error.
@@ -111,17 +112,17 @@
   in buildDCReport cpuLoadData
 
 -- | Data stored by the collector in mond's memory.
-type Buffer = Seq.Seq (ClockTime, [Int])
+type Buffer = Seq.Seq (ClockTime, [Integer])
 
 -- | Compute the load from a CPU.
-computeLoad :: CPUstat -> Int
+computeLoad :: CPUstat -> Integer
 computeLoad cpuData =
   csUser cpuData + csNice cpuData + csSystem cpuData
   + csIowait cpuData + csIrq cpuData + csSoftirq cpuData
   + csSteal cpuData + csGuest cpuData + csGuestNice cpuData
 
 -- | Reads and Computes the load for each CPU.
-dcCollectFromFile :: FilePath -> IO (ClockTime, [Int])
+dcCollectFromFile :: FilePath -> IO (ClockTime, [Integer])
 dcCollectFromFile inputFile = do
   contents <-
     ((E.try $ readFile inputFile) :: IO (Either IOError String)) >>=
@@ -149,10 +150,7 @@
 -- | Update a Map Entry.
 updateEntry :: Buffer -> Buffer -> Buffer
 updateEntry newBuffer mapEntry =
-  (Seq.><) newBuffer
-  (if Seq.length mapEntry < bufferSize
-    then mapEntry
-    else Seq.drop 1 mapEntry)
+  (Seq.><) newBuffer (Seq.take bufferSize mapEntry)
 
 -- | Updates the given Collector data.
 dcUpdate :: Maybe CollectorData -> IO CollectorData
@@ -178,7 +176,7 @@
             (timestampR, listR) = rightmost
             workInWindow = zipWith (-) listL listR
             timediff = timestampL - timestampR
-            overall = fromInteger (timediff * ticks) / 1000000 :: Double
+            overall = fromIntegral (timediff * ticks) / 1000000 :: Double
         if overall > 0
           then BT.Ok $ map (flip (/) overall . fromIntegral) workInWindow
           else BT.Bad $ "Time covered by data is not sufficient."
@@ -190,7 +188,8 @@
 buildJsonReport :: Buffer -> IO J.JSValue
 buildJsonReport v = do
   ticks <- getSysVar ClockTick
-  let res = computeAverage v windowSize ticks
+  now <- liftM clockTimeToUSec getClockTime
+  let res = computeAverage v (now - windowSizeInUSec) ticks
       showError s = J.showJSON $ GJ.containerFromList [("error", s)]
   return $ BT.genericResult showError (J.showJSON . formatData) res
 
diff --git a/src/Ganeti/DataCollectors/Diagnose.hs b/src/Ganeti/DataCollectors/Diagnose.hs
new file mode 100644
index 0000000..aaa5ac4
--- /dev/null
+++ b/src/Ganeti/DataCollectors/Diagnose.hs
@@ -0,0 +1,157 @@
+{-| Self-diagnose data collector
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.DataCollectors.Diagnose
+  ( dcName
+  , dcCategory
+  , dcKind
+  , dcReport
+  ) where
+
+import Control.Monad.Trans.Class (lift)
+import System.Directory (doesFileExist)
+import System.FilePath.Posix (isValid, takeFileName, (</>))
+import System.Posix.Files ( getFileStatus
+                          , fileOwner
+                          , fileGroup
+                          , fileMode
+                          , ownerModes
+                          , groupReadMode
+                          , groupExecuteMode
+                          , otherReadMode
+                          , otherExecuteMode
+                          , intersectFileModes
+                          , unionFileModes
+                          , ownerExecuteMode
+                          , isRegularFile
+                          , regularFileMode
+                          )
+import System.Process (readProcess)
+import Text.JSON (JSValue(..), toJSObject, toJSString, decode, Result(..))
+
+import Ganeti.BasicTypes (runResultT, ResultT(..), genericResult)
+import Ganeti.Confd.ClientFunctions (getDiagnoseCollectorFilename)
+import Ganeti.Constants (dataCollectorDiagnose, dataCollectorDiagnoseDirectory)
+import Ganeti.DataCollectors.Types ( DCCategory(..)
+                                   , DCKind(..)
+                                   , DCVersion(..)
+                                   , DCReport(..)
+                                   , buildReport
+                                   )
+
+-- | The name of this data collector.
+dcName :: String
+dcName = dataCollectorDiagnose
+
+-- | The category of this data collector.
+dcCategory :: Maybe DCCategory
+dcCategory = Just DCNode
+
+-- | The kind of this data collector.
+dcKind :: DCKind
+dcKind = DCKStatus
+
+-- | The version of this data collector.
+dcVersion :: DCVersion
+dcVersion = DCVerBuiltin
+
+-- | The version number for the data format of this data collector.
+dcFormatVersion :: Int
+dcFormatVersion = 1
+
+okWithDetails :: String -> JSValue
+okWithDetails details = JSObject $ toJSObject
+  [ ("status", JSString $ toJSString "Ok")
+  , ("details", JSString $ toJSString details)
+  ]
+
+
+fnToVal :: String -> IO JSValue
+fnToVal fn
+  | null fn = return $ okWithDetails
+      "No file specified for diagnose data collector"
+  | not $ isValid fn = return $ okWithDetails
+      "Invalid filename specified for diagnose data collector"
+  | takeFileName fn /= fn = return $ okWithDetails
+      "Filepaths cannot be specified for diagnose data collector"
+  | otherwise = do
+      let fp = dataCollectorDiagnoseDirectory </> fn
+      exists <- doesFileExist fp
+      if exists
+        then do
+          fs <- getFileStatus fp
+          let maxFileMode = foldl1 unionFileModes [ ownerModes
+                                                  , groupReadMode
+                                                  , groupExecuteMode
+                                                  , otherReadMode
+                                                  , otherExecuteMode
+                                                  , regularFileMode
+                                                  ]
+              isSubSetOf m1 m2 = m1 `intersectFileModes` m2 == m1
+          case () of _
+                       | fileOwner fs /= 0 -> return . okWithDetails $
+                         "File for diagnose data collector " ++
+                         "must be owned by root"
+                       | fileGroup fs /= 0 -> return . okWithDetails $
+                         "File for diagnose data collector " ++
+                         "must have group root"
+                       | not $ isRegularFile fs -> return . okWithDetails $
+                         "File for diagnose data collector " ++
+                         "must be a regular file"
+                       | not $ isSubSetOf (fileMode fs) maxFileMode ->
+                         return . okWithDetails $
+                           "File for diagnose data collector " ++
+                           "must have permissions 755 or stricter"
+                       | not $ isSubSetOf ownerExecuteMode (fileMode fs) ->
+                         return . okWithDetails $
+                           "File for diagnose data collector " ++
+                           "must be executable by owner"
+                       | otherwise -> do
+                         r <- fmap decode (readProcess fp [] "")
+                         case r of
+                           Ok val -> return val
+                           Error str -> return . okWithDetails $
+                             "Could not parse result: " ++ str
+        else return $ okWithDetails
+          "File specified for diagnose data collector does not exist"
+
+buildJsonReport :: IO JSValue
+buildJsonReport = fmap (genericResult okWithDetails id) . runResultT $ do
+  statusFnName <- getDiagnoseCollectorFilename Nothing Nothing
+  lift $ fnToVal statusFnName
+
+-- | The data exported by the data collector, taken from the default location.
+dcReport :: IO DCReport
+dcReport = buildJsonReport >>=
+  buildReport dcName dcVersion dcFormatVersion dcCategory dcKind
diff --git a/src/Ganeti/DataCollectors/KvmRSS.hs b/src/Ganeti/DataCollectors/KvmRSS.hs
new file mode 100644
index 0000000..3f26617
--- /dev/null
+++ b/src/Ganeti/DataCollectors/KvmRSS.hs
@@ -0,0 +1,119 @@
+{-| kvm resident set size collector
+
+It collects the resident set size (RSS) for all kvm
+processes managed by Ganeti, i.e., the number of pages
+the process has in RAM. The value is obtained
+by taking the corresponding value from /proc/$pid/statm.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.DataCollectors.KvmRSS
+  ( dcName
+  , dcVersion
+  , dcFormatVersion
+  , dcCategory
+  , dcKind
+  , dcReport
+  ) where
+
+import Control.Monad (liftM)
+import Data.Char (isSpace)
+import Data.Maybe (mapMaybe)
+import Network.BSD (getHostName)
+import System.FilePath ((</>))
+import qualified Text.JSON as J
+import Text.Printf (printf)
+
+import Ganeti.BasicTypes
+import Ganeti.Confd.ClientFunctions (getInstances)
+import qualified Ganeti.Constants as C
+import Ganeti.DataCollectors.Types
+import Ganeti.Objects
+import Ganeti.Path (kvmPidDir)
+
+-- | The name of this data collector for the resident set size (RSS).
+dcName :: String
+dcName = C.dataCollectorKvmRSS
+
+-- | The version number for the data format of this data collector.
+dcFormatVersion :: Int
+dcFormatVersion = 1
+
+-- | The version of this data collector.
+dcVersion :: DCVersion
+dcVersion = DCVerBuiltin
+
+-- | The category of this data collector.
+dcCategory :: Maybe DCCategory
+dcCategory = Nothing
+
+-- | The kind of this data collector.
+dcKind :: DCKind
+dcKind = DCKPerf
+
+-- | Parse the contents of a pid file.
+parsePid :: Monad m => String -> m Int
+parsePid s = case reads s of
+  [(pid, r)] | all isSpace r -> return pid
+  _ -> fail $ "Couldn't parse pid " ++ s
+
+-- | From the contents of a statm file, get the resident set size,
+-- in pages.
+parseRss :: Monad m => String -> m Int
+parseRss s =
+  let drop1 = dropWhile isSpace . dropWhile (not . isSpace) . dropWhile isSpace
+  in case reads (drop1 s) of
+    [(n, _)] -> return n
+    _ -> fail $ "Failed to parse memstat " ++ s
+
+-- | For an instance, collect the resident set size, if available.
+collectInstanceRSS :: String -> IO (Result (String, J.JSValue))
+collectInstanceRSS inst = runResultT $ do
+  piddir <- liftIO kvmPidDir
+  let pidfile = piddir </> inst
+  pidstring <- liftIO $ readFile pidfile
+  pid <- parsePid pidstring
+  let procfspath = printf "/proc/%d/statm" pid
+  memstat <- liftIO $ readFile procfspath
+  rss <- parseRss memstat
+  return (inst, J.showJSON rss)
+
+-- | The data exported by the data collector.
+dcReport :: IO DCReport
+dcReport = do
+  node <- getHostName
+  instances <- liftM (genericResult (const []) (mapMaybe instName . fst))
+               . runResultT $ getInstances node Nothing Nothing
+  reports <- liftM justOk $ mapM collectInstanceRSS instances
+  buildReport dcName dcVersion dcFormatVersion dcCategory dcKind
+           . J.JSObject $ J.toJSObject reports
diff --git a/src/Ganeti/DataCollectors/Types.hs b/src/Ganeti/DataCollectors/Types.hs
index 8b60be1..20386ce 100644
--- a/src/Ganeti/DataCollectors/Types.hs
+++ b/src/Ganeti/DataCollectors/Types.hs
@@ -68,7 +68,7 @@
 import Ganeti.Utils (getCurrentTimeUSec)
 
 -- | The possible classes a data collector can belong to.
-data DCCategory = DCInstance | DCStorage | DCDaemon | DCHypervisor
+data DCCategory = DCInstance | DCStorage | DCDaemon | DCHypervisor | DCNode
   deriving (Show, Eq, Read, Enum, Bounded)
 
 -- | Get the category name and return it as a string.
@@ -145,7 +145,7 @@
 
 -- | Type for the value field of the `CollectorMap` below.
 data CollectorData =
-  CPULoadData (Seq.Seq (ClockTime, [Int]))
+  CPULoadData (Seq.Seq (ClockTime, [Integer]))
   | InstanceCpuLoad (Map.Map String (Seq.Seq (ClockTime, Double)))
 
 instance NFData ClockTime where
diff --git a/src/Ganeti/DataCollectors/XenCpuLoad.hs b/src/Ganeti/DataCollectors/XenCpuLoad.hs
index 10c39cd..1526b57 100644
--- a/src/Ganeti/DataCollectors/XenCpuLoad.hs
+++ b/src/Ganeti/DataCollectors/XenCpuLoad.hs
@@ -42,7 +42,10 @@
   , dcUpdate
   ) where
 
-import Control.Applicative ((<$>), liftA2)
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (liftA2)
 import Control.Arrow ((***))
 import Control.Monad (liftM, when)
 import Control.Monad.IO.Class (liftIO)
@@ -143,7 +146,8 @@
                       combinedValues
       withoutOld = Map.filter
                      (liftA2 (&&) (not . Seq.null)
-                      $ (>) (fromIntegral $ C.xentopAverageThreshold * 1000000)
+                      $ (>) (fromIntegral
+                               $ 3 * C.xentopAverageThreshold * 1000000)
                         . (clockTimeToUSec now -) . clockTimeToUSec
                         . fst . flip Seq.index 0)
                      withinRange
diff --git a/src/Ganeti/Errors.hs b/src/Ganeti/Errors.hs
index 5d64892..1dccb93 100644
--- a/src/Ganeti/Errors.hs
+++ b/src/Ganeti/Errors.hs
@@ -122,13 +122,13 @@
   , ("FileStoragePathError", [excErrMsg])
   ])
 
-instance Error GanetiException where
-  strMsg = GenericError
-
 instance JSON GanetiException where
   showJSON = saveGanetiException
   readJSON = loadGanetiException
 
+instance FromString GanetiException where
+  mkFromString = GenericError
+
 -- | Error monad using 'GanetiException' type alias.
 type ErrorResult = GenericResult GanetiException
 
diff --git a/src/Ganeti/HTools/AlgorithmParams.hs b/src/Ganeti/HTools/AlgorithmParams.hs
index b93f437..8a53e69 100644
--- a/src/Ganeti/HTools/AlgorithmParams.hs
+++ b/src/Ganeti/HTools/AlgorithmParams.hs
@@ -41,11 +41,17 @@
   , fromCLIOptions
   ) where
 
+import qualified Data.Set as Set
+
 import qualified Ganeti.HTools.CLI as CLI
 import qualified Ganeti.HTools.Types as T
 
 data AlgorithmOptions = AlgorithmOptions
   { algDiskMoves :: Bool            -- ^ Whether disk moves are allowed
+  , algDiskMovesFactor :: Double    -- ^ Only allow disk moves whose gain
+                                    -- in cluster score is more than
+                                    -- algDiskMovesFactor times higher than
+                                    -- the gain in migration moves
   , algInstanceMoves :: Bool        -- ^ Whether instance moves are allowed
   , algRestrictedMigration :: Bool  -- ^ Whether migration is restricted
   , algIgnoreSoftErrors :: Bool     -- ^ Whether to always ignore soft errors
@@ -56,6 +62,8 @@
                                     -- like global N+1 redundancy
   , algCapacityIgnoreGroups :: [T.Gdx] -- ^ Groups to ignore in capacity checks
   , algRestrictToNodes :: Maybe [String] -- ^ nodes to restrict allocation to
+  , algAllowedNodes :: Maybe (Set.Set Int) -- ^ if given, do not perform any
+                                    -- operations involving other nodes
   , algAcceptExisting :: Bool       -- ^ accept existing violations in capacity
                                     -- checks
   }
@@ -64,6 +72,7 @@
 fromCLIOptions :: CLI.Options -> AlgorithmOptions
 fromCLIOptions opts = AlgorithmOptions
   { algDiskMoves = CLI.optDiskMoves opts
+  , algDiskMovesFactor = CLI.optAvoidDiskMoves opts
   , algInstanceMoves = CLI.optInstMoves opts
   , algRestrictedMigration = CLI.optRestrictedMigrate opts
   , algIgnoreSoftErrors = CLI.optIgnoreSoftErrors opts
@@ -73,6 +82,7 @@
   , algCapacity = CLI.optCapacity opts
   , algCapacityIgnoreGroups = []
   , algRestrictToNodes = CLI.optRestrictToNodes opts
+  , algAllowedNodes = Nothing
   , algAcceptExisting = CLI.optAcceptExisting opts
   }
 
diff --git a/src/Ganeti/HTools/Backend/IAlloc.hs b/src/Ganeti/HTools/Backend/IAlloc.hs
index 3a67c2d..e40c3d7 100644
--- a/src/Ganeti/HTools/Backend/IAlloc.hs
+++ b/src/Ganeti/HTools/Backend/IAlloc.hs
@@ -65,7 +65,7 @@
 import Ganeti.HTools.CLI
 import Ganeti.HTools.Loader
 import Ganeti.HTools.Types
-import Ganeti.JSON (maybeFromObj, JSRecord, tryFromObj, toArray, asObjectList, readEitherString, fromJResult, fromObj, fromObjWithDefault, asJSObject)
+import Ganeti.JSON (maybeFromObj, JSRecord, tryFromObj, toArray, asObjectList, readEitherString, fromJResult, fromObj, fromObjWithDefault, asJSObject, emptyContainer)
 import Ganeti.Types ( EvacMode(ChangePrimary, ChangeSecondary)
                     , adminStateFromRaw, AdminState(..))
 import Ganeti.Utils
@@ -157,6 +157,7 @@
   offline <- extract "offline"
   drained <- extract "drained"
   guuid   <- extract "group"
+  hvstate   <- extractDef emptyContainer "hv_state"
   vm_capable  <- annotateResult desc $ maybeFromObj a "vm_capable"
   let vm_capable' = fromMaybe True vm_capable
   gidx <- lookupGroup ktg n guuid
@@ -178,8 +179,9 @@
   dfree  <- lvextract 0 "free_disk"
   ctotal <- lvextract 0.0 "total_cpus"
   cnos <- lvextract 0 "reserved_cpus"
-  let node = flip Node.setNodeTags tags $
-             Node.create n mtotal mnode mfree dtotal dfree ctotal cnos
+  let node_mem = obtainNodeMemory hvstate mnode
+      node = flip Node.setNodeTags tags $
+             Node.create n mtotal node_mem mfree dtotal dfree ctotal cnos
              (not live || drained) sptotal spfree gidx excl_stor
   return (n, node)
 
diff --git a/src/Ganeti/HTools/Backend/Luxi.hs b/src/Ganeti/HTools/Backend/Luxi.hs
index 53b0794..639d74d 100644
--- a/src/Ganeti/HTools/Backend/Luxi.hs
+++ b/src/Ganeti/HTools/Backend/Luxi.hs
@@ -51,47 +51,14 @@
 import qualified Ganeti.HTools.Group as Group
 import qualified Ganeti.HTools.Node as Node
 import qualified Ganeti.HTools.Instance as Instance
-import Ganeti.JSON (fromObj, fromJVal, tryFromObj, arrayMaybeFromJVal)
+import Ganeti.JSON (fromJVal, tryFromObj, arrayMaybeFromJVal,
+                    getKeysFromContainer, Container)
+import Ganeti.Objects (PartialNicParams)
 
 {-# ANN module "HLint: ignore Eta reduce" #-}
 
 -- * Utility functions
 
--- | Get values behind \"data\" part of the result.
-getData :: (Monad m) => JSValue -> m JSValue
-getData (JSObject o) = fromObj (fromJSObject o) "data"
-getData x = fail $ "Invalid input, expected dict entry but got " ++ show x
-
--- | Converts a (status, value) into m value, if possible.
-parseQueryField :: (Monad m) => JSValue -> m (JSValue, JSValue)
-parseQueryField (JSArray [status, result]) = return (status, result)
-parseQueryField o =
-  fail $ "Invalid query field, expected (status, value) but got " ++ show o
-
--- | Parse a result row.
-parseQueryRow :: (Monad m) => JSValue -> m [(JSValue, JSValue)]
-parseQueryRow (JSArray arr) = mapM parseQueryField arr
-parseQueryRow o =
-  fail $ "Invalid query row result, expected array but got " ++ show o
-
--- | Parse an overall query result and get the [(status, value)] list
--- for each element queried.
-parseQueryResult :: (Monad m) => JSValue -> m [[(JSValue, JSValue)]]
-parseQueryResult (JSArray arr) = mapM parseQueryRow arr
-parseQueryResult o =
-  fail $ "Invalid query result, expected array but got " ++ show o
-
--- | Prepare resulting output as parsers expect it.
-extractArray :: (Monad m) => JSValue -> m [[(JSValue, JSValue)]]
-extractArray v =
-  getData v >>= parseQueryResult
-
--- | Testing result status for more verbose error message.
-fromJValWithStatus :: (Text.JSON.JSON a, Monad m) => (JSValue, JSValue) -> m a
-fromJValWithStatus (st, v) = do
-  st' <- fromJVal st
-  Qlang.checkRS st' v >>= fromJVal
-
 annotateConvert :: String -> String -> String -> Result a -> Result a
 annotateConvert otype oname oattr =
   annotateResult $ otype ++ " '" ++ oname ++
@@ -106,7 +73,7 @@
                -> (JSValue, JSValue) -- ^ The value we're trying to convert
                -> Result a           -- ^ The annotated result
 genericConvert otype oname oattr =
-  annotateConvert otype oname oattr . fromJValWithStatus
+  annotateConvert otype oname oattr . L.fromJValWithStatus
 
 convertArrayMaybe :: (Text.JSON.JSON a) =>
                   String             -- ^ The object type
@@ -128,7 +95,8 @@
      ["name", "mtotal", "mnode", "mfree", "dtotal", "dfree",
       "ctotal", "cnos", "offline", "drained", "vm_capable",
       "ndp/spindle_count", "group.uuid", "tags",
-      "ndp/exclusive_storage", "sptotal", "spfree", "ndp/cpu_speed"]
+      "ndp/exclusive_storage", "sptotal", "spfree", "ndp/cpu_speed",
+      "hv_state"]
      Qlang.EmptyFilter
 
 -- | The input data for instance query.
@@ -149,7 +117,7 @@
 queryGroupsMsg :: L.LuxiOp
 queryGroupsMsg =
   L.Query (Qlang.ItemTypeOpCode Qlang.QRGroup)
-     ["uuid", "name", "alloc_policy", "ipolicy", "tags"]
+     ["uuid", "name", "alloc_policy", "ipolicy", "tags", "networks"]
      Qlang.EmptyFilter
 
 -- | Wraper over 'callMethod' doing node query.
@@ -172,7 +140,7 @@
 getInstances :: NameAssoc
              -> JSValue
              -> Result [(String, Instance.Instance)]
-getInstances ktn arr = extractArray arr >>= mapM (parseInstance ktn)
+getInstances ktn arr = L.extractArray arr >>= mapM (parseInstance ktn)
 
 -- | Construct an instance from a JSON object.
 parseInstance :: NameAssoc
@@ -182,7 +150,7 @@
                   , status, pnode, snodes, tags, oram
                   , auto_balance, disk_template, su
                   , dsizes, dspindles, forthcoming ] = do
-  xname <- annotateResult "Parsing new instance" (fromJValWithStatus name)
+  xname <- annotateResult "Parsing new instance" (L.fromJValWithStatus name)
   let convert a = genericConvert "Instance" xname a
   xdisk <- convert "disk_usage" disk
   xmem <- case oram of -- FIXME: remove the "guessing"
@@ -212,15 +180,16 @@
 
 -- | Parse a node list in JSON format.
 getNodes :: NameAssoc -> JSValue -> Result [(String, Node.Node)]
-getNodes ktg arr = extractArray arr >>= mapM (parseNode ktg)
+getNodes ktg arr = L.extractArray arr >>= mapM (parseNode ktg)
 
 -- | Construct a node from a JSON object.
 parseNode :: NameAssoc -> [(JSValue, JSValue)] -> Result (String, Node.Node)
 parseNode ktg [ name, mtotal, mnode, mfree, dtotal, dfree
               , ctotal, cnos, offline, drained, vm_capable, spindles, g_uuid
-              , tags, excl_stor, sptotal, spfree, cpu_speed ]
+              , tags, excl_stor, sptotal, spfree, cpu_speed, hv_state ]
+
     = do
-  xname <- annotateResult "Parsing new node" (fromJValWithStatus name)
+  xname <- annotateResult "Parsing new node" (L.fromJValWithStatus name)
   let convert a = genericConvert "Node" xname a
   xoffline <- convert "offline" offline
   xdrained <- convert "drained" drained
@@ -249,9 +218,11 @@
       -- is the only supported disk template
   xctotal <- lvconvert 0.0 "ctotal" ctotal
   xcnos <- lvconvert 0 "cnos" cnos
-  let node = flip Node.setCpuSpeed xcpu_speed .
+  xhv_state <- convert "hv_state" hv_state
+  let node_mem = obtainNodeMemory xhv_state xmnode
+      node = flip Node.setCpuSpeed xcpu_speed .
              flip Node.setNodeTags xtags $
-             Node.create xname xmtotal xmnode xmfree xdtotal xdfree
+             Node.create xname xmtotal node_mem xmfree xdtotal xdfree
              xctotal xcnos (not live || xdrained) xsptotal xspfree
              xgdx xexcl_stor
   return (xname, node)
@@ -272,19 +243,20 @@
 
 -- | Parses the cluster groups.
 getGroups :: JSValue -> Result [(String, Group.Group)]
-getGroups jsv = extractArray jsv >>= mapM parseGroup
+getGroups jsv = L.extractArray jsv >>= mapM parseGroup
 
 -- | Parses a given group information.
 parseGroup :: [(JSValue, JSValue)] -> Result (String, Group.Group)
-parseGroup [uuid, name, apol, ipol, tags] = do
-  xname <- annotateResult "Parsing new group" (fromJValWithStatus name)
+parseGroup [uuid, name, apol, ipol, tags, nets] = do
+  xname <- annotateResult "Parsing new group" (L.fromJValWithStatus name)
   let convert a = genericConvert "Group" xname a
   xuuid <- convert "uuid" uuid
   xapol <- convert "alloc_policy" apol
   xipol <- convert "ipolicy" ipol
   xtags <- convert "tags" tags
-  -- TODO: parse networks to which this group is connected
-  return (xuuid, Group.create xname xuuid xapol [] xipol xtags)
+  xnets <- convert "networks" nets :: Result (Container PartialNicParams)
+  let xnetids = getKeysFromContainer xnets
+  return (xuuid, Group.create xname xuuid xapol xnetids xipol xtags)
 
 parseGroup v = fail ("Invalid group query result: " ++ show v)
 
diff --git a/src/Ganeti/HTools/Backend/MonD.hs b/src/Ganeti/HTools/Backend/MonD.hs
index 9944bd6..be420a5 100644
--- a/src/Ganeti/HTools/Backend/MonD.hs
+++ b/src/Ganeti/HTools/Backend/MonD.hs
@@ -41,6 +41,16 @@
 module Ganeti.HTools.Backend.MonD
   ( queryAllMonDDCs
   , pMonDData
+  , Report(..)
+  , DataCollector
+  , dName
+  , fromCurl
+  , mkReport
+  , totalCPUCollector
+  , xenCPUCollector
+  , kvmRSSCollector
+  , scaleMemoryWeight
+  , useInstanceRSSData
   ) where
 
 import Control.Monad
@@ -56,8 +66,9 @@
 import Ganeti.BasicTypes
 import qualified Ganeti.Constants as C
 import Ganeti.Cpu.Types
-import qualified Ganeti.DataCollectors.XenCpuLoad as XenCpuLoad
 import qualified Ganeti.DataCollectors.CPUload as CPUload
+import qualified Ganeti.DataCollectors.KvmRSS as KvmRSS
+import qualified Ganeti.DataCollectors.XenCpuLoad as XenCpuLoad
 import Ganeti.DataCollectors.Types ( DCReport, DCCategory
                                    , dcReportData, dcReportName
                                    , getCategoryName )
@@ -76,6 +87,7 @@
 -- | The actual data types for MonD's Data Collectors.
 data Report = CPUavgloadReport CPUavgload
             | InstanceCpuReport (Map.Map String Double)
+            | InstanceRSSReport (Map.Map String Double)
 
 -- | Type describing a data collector basic information.
 data DataCollector = DataCollector
@@ -188,14 +200,90 @@
                                 , dUse = useInstanceCpuData
                                 }
 
+-- * kvm instance RSS collector
+
+-- | Parse results of the kvm instance RSS data Collector
+mkKvmRSSReport :: DCReport -> Maybe Report
+mkKvmRSSReport =
+  liftM InstanceRSSReport . maybeParseMap . dcReportData
+
+-- | Conversion constant from htools' internal memory unit,
+-- which is MiB, to the RSS unit, which is reported in pages
+-- (of 4kiB each).
+pagesPerMiB :: Double
+pagesPerMiB = 256.0
+
+-- | Update cluster data based on per-instance RSS data.
+-- Also set the node's memory util pool correctly. Our unit
+-- of memory usage is pages; there are 256 pages per MiB
+-- of node memory not used by the node itself.
+useInstanceRSSData :: [(Node.Node, Report)]
+                   -> (Node.List, Instance.List)
+                   -> Result (Node.List, Instance.List)
+useInstanceRSSData reports (nl, il) = do
+  let toMap (InstanceRSSReport m) = Just m
+      toMap _                     = Nothing
+  let usage = Map.unions $ mapMaybe (toMap . snd) reports
+      missingData = (Set.fromList . map Instance.name $ IntMap.elems il)
+                    Set.\\ Map.keysSet usage
+  unless (Set.null missingData)
+    . Bad . (++) "No RSS information available for "
+    . show $ Set.elems missingData
+  let updateInstance inst =
+        let mem = Map.lookup (Instance.name inst) usage
+            dynU = Instance.util inst
+            dynU' = maybe dynU (\m -> dynU { memWeight = m }) mem
+        in inst { Instance.util = dynU' }
+  let il' = IntMap.map updateInstance il
+  let updateNode node =
+        let mem = sum
+                  . map (\ idx -> maybe 0 (memWeight . Instance.util)
+                                  $ IntMap.lookup idx il')
+                  $ Node.pList node
+            dynU = Node.utilLoad node
+            dynU' = dynU { memWeight = mem }
+            pool = Node.utilPool node
+            nodePages = (Node.tMem node - fromIntegral (Node.nMem node))
+                        * pagesPerMiB
+            pool' = pool { memWeight = nodePages }
+        in node { Node.utilLoad = dynU', Node.utilPool = pool' }
+  let nl' = IntMap.map updateNode nl
+  return (nl', il')
+
+-- | The data collector for per-instance RSS usage of kvm instances.
+kvmRSSCollector :: DataCollector
+kvmRSSCollector = DataCollector { dName = KvmRSS.dcName
+                                , dCategory = KvmRSS.dcCategory
+                                , dMkReport = mkKvmRSSReport
+                                , dUse = useInstanceRSSData
+                                }
+
+-- | Scale the importance of the memory weight in dynamic utilisation,
+-- by multiplying the usage with the given factor. Note that the underlying
+-- model for dynamic utilisation is that they are reported in arbitrary units.
+scaleMemoryWeight :: Double
+                  -> (Node.List, Instance.List)
+                  -> (Node.List, Instance.List)
+scaleMemoryWeight f (nl, il) =
+  let updateInst inst =
+        let dynU = Instance.util inst
+            dynU' = dynU { memWeight = f * memWeight dynU}
+        in inst { Instance.util = dynU' }
+      updateNode node =
+        let dynU = Node.utilLoad node
+            dynU' = dynU { memWeight = f * memWeight dynU}
+        in node { Node.utilLoad = dynU' }
+  in (IntMap.map updateNode nl, IntMap.map updateInst il)
+
 -- * Collector choice
 
 -- | The list of Data Collectors used by hail and hbal.
 collectors :: Options -> [DataCollector]
 collectors opts
   | optIgnoreDynu opts = []
-  | optMonDXen opts = [ xenCPUCollector ]
-  | otherwise = [ totalCPUCollector ]
+  | otherwise =
+      (if optMonDXen opts then [ xenCPUCollector ] else [ totalCPUCollector ] )
+      ++ [ kvmRSSCollector | optMonDKvmRSS opts ]
 
 -- * Querying infrastructure
 
diff --git a/src/Ganeti/HTools/Backend/Rapi.hs b/src/Ganeti/HTools/Backend/Rapi.hs
index 7d76751..218411c 100644
--- a/src/Ganeti/HTools/Backend/Rapi.hs
+++ b/src/Ganeti/HTools/Backend/Rapi.hs
@@ -53,7 +53,7 @@
 import Ganeti.BasicTypes
 import Ganeti.HTools.Loader
 import Ganeti.HTools.Types
-import Ganeti.JSON (loadJSArray, JSRecord, tryFromObj, fromJVal, maybeFromObj, fromJResult, tryArrayMaybeFromObj, readEitherString, fromObjWithDefault, asJSObject)
+import Ganeti.JSON (loadJSArray, JSRecord, tryFromObj, fromJVal, maybeFromObj, fromJResult, tryArrayMaybeFromObj, readEitherString, fromObjWithDefault, asJSObject, emptyContainer)
 import qualified Ganeti.HTools.Group as Group
 import qualified Ganeti.HTools.Node as Node
 import qualified Ganeti.HTools.Instance as Instance
@@ -186,8 +186,10 @@
   ctotal <- lvextract 0.0 "ctotal"
   cnos <- lvextract 0 "cnos"
   tags <- extract "tags"
-  let node = flip Node.setNodeTags tags $
-             Node.create name mtotal mnode mfree dtotal dfree ctotal cnos
+  hv_state <- extractDef emptyContainer "hv_state"
+  let node_mem = obtainNodeMemory hv_state mnode
+      node = flip Node.setNodeTags tags $
+             Node.create name mtotal node_mem mfree dtotal dfree ctotal cnos
              (not live || drained) sptotal spfree guuid' excl_stor
   return (name, node)
 
@@ -196,12 +198,13 @@
 parseGroup a = do
   name <- tryFromObj "Parsing new group" a "name"
   let extract s = tryFromObj ("Group '" ++ name ++ "'") a s
+  let extractDef s d = fromObjWithDefault a s d
   uuid <- extract "uuid"
   apol <- extract "alloc_policy"
   ipol <- extract "ipolicy"
   tags <- extract "tags"
-  -- TODO: parse networks to which this group is connected
-  return (uuid, Group.create name uuid apol [] ipol tags)
+  nets <- extractDef "networks" []
+  return (uuid, Group.create name uuid apol nets ipol tags)
 
 -- | Parse cluster data from the info resource.
 parseCluster :: JSObject JSValue -> Result ([String], IPolicy, String)
diff --git a/src/Ganeti/HTools/Backend/Text.hs b/src/Ganeti/HTools/Backend/Text.hs
index 5aaa784..4929f74 100644
--- a/src/Ganeti/HTools/Backend/Text.hs
+++ b/src/Ganeti/HTools/Backend/Text.hs
@@ -168,13 +168,14 @@
 -- | Generate policy data from a given policy object.
 serializeIPolicy :: String -> IPolicy -> String
 serializeIPolicy owner ipol =
-  let IPolicy minmax stdspec dts vcpu_ratio spindle_ratio = ipol
+  let IPolicy minmax stdspec dts vcpu_ratio spindle_ratio memory_ratio = ipol
       strings = [ owner
                 , serializeISpec stdspec
                 , serializeMultipleMinMaxISpecs minmax
                 , serializeDiskTemplates dts
                 , show vcpu_ratio
                 , show spindle_ratio
+                , show memory_ratio
                 ]
   in intercalate "|" strings
 
@@ -370,16 +371,21 @@
 -- | Loads an ipolicy from a field list.
 loadIPolicy :: [String] -> Result (String, IPolicy)
 loadIPolicy (owner:stdspec:minmaxspecs:dtemplates:
-             vcpu_ratio:spindle_ratio:_) = do
+             vcpu_ratio:spindle_ratio:memory_ratio:_) = do
   xstdspec <- loadISpec (owner ++ "/stdspec") (commaSplit stdspec)
   xminmaxspecs <- loadMultipleMinMaxISpecs owner $
                   sepSplit iSpecsSeparator minmaxspecs
   xdts <- mapM diskTemplateFromRaw $ commaSplit dtemplates
   xvcpu_ratio <- tryRead (owner ++ "/vcpu_ratio") vcpu_ratio
   xspindle_ratio <- tryRead (owner ++ "/spindle_ratio") spindle_ratio
+  xmemory_ratio <- tryRead (owner ++ "/memory_ratio") memory_ratio
   return (owner,
           IPolicy xminmaxspecs xstdspec
-                xdts xvcpu_ratio xspindle_ratio)
+                xdts xvcpu_ratio xspindle_ratio xmemory_ratio)
+loadIPolicy (owner:stdspec:minmaxspecs:dtemplates:
+             vcpu_ratio:spindle_ratio:_) =
+  loadIPolicy (owner:stdspec:minmaxspecs:dtemplates:
+               vcpu_ratio:spindle_ratio:["1.0"])
 loadIPolicy s = fail $ "Invalid ipolicy data: '" ++ show s ++ "'"
 
 loadOnePolicy :: (IPolicy, Group.List) -> String
diff --git a/src/Ganeti/HTools/CLI.hs b/src/Ganeti/HTools/CLI.hs
index 7ca25d9..110375e 100644
--- a/src/Ganeti/HTools/CLI.hs
+++ b/src/Ganeti/HTools/CLI.hs
@@ -8,7 +8,7 @@
 
 {-
 
-Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.
+Copyright (C) 2009, 2010, 2011, 2012, 2013, 2015 Google Inc.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -55,12 +55,15 @@
   -- * The options
   , oDataFile
   , oDiskMoves
+  , oAvoidDiskMoves
   , oDiskTemplate
   , oDryRun
   , oSpindleUse
   , oDynuFile
+  , oMemWeight
   , oMonD
   , oMonDDataFile
+  , oMonDKvmRSS
   , oMonDXen
   , oEvacMode
   , oMonDExitMissing
@@ -72,6 +75,7 @@
   , oForce
   , oFullEvacuation
   , oGroup
+  , oIdleDefault
   , oIAllocSrc
   , oIgnoreDyn
   , oIgnoreNonRedundant
@@ -143,11 +147,16 @@
 data Options = Options
   { optDataFile    :: Maybe FilePath -- ^ Path to the cluster data file
   , optDiskMoves   :: Bool           -- ^ Allow disk moves
+  , optAvoidDiskMoves :: Double      -- ^ Allow only disk moves improving
+                                     -- cluster score optAvoidDiskMoves times
+                                     -- more than migrations alone
   , optInstMoves   :: Bool           -- ^ Allow instance moves
   , optDiskTemplate :: Maybe DiskTemplate  -- ^ Override for the disk template
   , optSpindleUse  :: Maybe Int      -- ^ Override for the spindle usage
   , optDynuFile    :: Maybe FilePath -- ^ Optional file with dynamic use data
   , optIgnoreDynu  :: Bool           -- ^ Do not use dynamic use data
+  , optIdleDefault :: Bool           -- ^ Assume idle load for all not provided
+                                     -- dynamic utilisation data
   , optIgnoreSoftErrors :: Bool      -- ^ Ignore soft errors in balancing moves
   , optIndependentGroups :: Bool     -- ^ consider groups independently
   , optAcceptExisting :: Bool        -- ^ accept existing N+1 violations
@@ -156,8 +165,12 @@
                                      -- by MonDs
   , optMonDXen     :: Bool           -- ^ Should Xen-specific collectors be
                                      -- considered (only if MonD is queried)
+  , optMonDKvmRSS  :: Bool           -- ^ Should kvm RSS information be
+                                     -- considered (only if MonD is queried)
   , optMonDExitMissing :: Bool       -- ^ If the program should exit on missing
                                      -- MonD data
+  , optMemWeight   :: Double         -- ^ Rescale the weight of memory
+                                     -- utilisation
   , optEvacMode    :: Bool           -- ^ Enable evacuation mode
   , optRestrictedMigrate :: Bool     -- ^ Disallow replace-primary moves
   , optExInst      :: [String]       -- ^ Instances to be excluded
@@ -218,18 +231,22 @@
 defaultOptions  = Options
   { optDataFile    = Nothing
   , optDiskMoves   = True
+  , optAvoidDiskMoves = 1.0
   , optInstMoves   = True
   , optIndependentGroups = False
   , optAcceptExisting = False
   , optDiskTemplate = Nothing
   , optSpindleUse  = Nothing
   , optIgnoreDynu  = False
+  , optIdleDefault = False
   , optIgnoreSoftErrors = False
   , optDynuFile    = Nothing
   , optMonD        = False
   , optMonDFile = Nothing
   , optMonDXen     = False
+  , optMonDKvmRSS  = False
   , optMonDExitMissing = False
+  , optMemWeight   = 1.0
   , optEvacMode    = False
   , optRestrictedMigrate = False
   , optExInst      = []
@@ -340,6 +357,16 @@
    \ thus allowing only the 'cheap' failover/migrate operations",
    OptComplNone)
 
+oAvoidDiskMoves :: OptType
+oAvoidDiskMoves =
+  (Option "" ["avoid-disk-moves"]
+   (reqWithConversion (tryRead "disk moves avoiding factor")
+    (\f opts -> Ok opts { optAvoidDiskMoves = f }) "FACTOR")
+   "gain in cluster metrics on each balancing step including disk moves\
+   \ should be FACTOR times higher than the gain after migrations in order to\
+   \ admit disk move during the step",
+   OptComplFloat)
+
 oMonD :: OptType
 oMonD =
   (Option "" ["mond"]
@@ -363,6 +390,21 @@
     "also consider xen-specific collectors in MonD queries",
     OptComplNone)
 
+oMonDKvmRSS :: OptType
+oMonDKvmRSS =
+  (Option "" ["mond-kvm-rss"]
+    (NoArg (\ opts -> Ok opts { optMonDKvmRSS = True }))
+    "also consider resident-set-size data for kvm instances via MonD",
+    OptComplNone)
+
+oMemWeight :: OptType
+oMemWeight =
+  (Option "" ["mem-weight"]
+   (reqWithConversion (tryRead "memory weight factor")
+    (\ f opts -> Ok opts { optMemWeight = f }) "FACTOR")
+   "Rescale the weight of the memory utilization by the given factor",
+   OptComplFloat)
+
 oMonDExitMissing :: OptType
 oMonDExitMissing =
   (Option "" ["exit-on-missing-mond-data"]
@@ -420,6 +462,13 @@
    "Ignore any dynamic utilisation information",
    OptComplNone)
 
+oIdleDefault :: OptType
+oIdleDefault =
+  (Option "" ["idle-default"]
+   (NoArg (\ opts -> Ok opts {optIdleDefault = True}))
+   "Assume idleness for any non-available dynamic utilisation data",
+   OptComplNone)
+
 oIgnoreSoftErrors :: OptType
 oIgnoreSoftErrors =
   (Option "" ["ignore-soft-errors"]
diff --git a/src/Ganeti/HTools/Cluster.hs b/src/Ganeti/HTools/Cluster.hs
index 8e4327c..65746fd 100644
--- a/src/Ganeti/HTools/Cluster.hs
+++ b/src/Ganeti/HTools/Cluster.hs
@@ -82,11 +82,22 @@
   , findSplitInstances
   ) where
 
-import Control.Applicative ((<$>), liftA2)
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (liftA2)
 import Control.Arrow ((&&&))
 import Control.Monad (unless)
 import qualified Data.IntSet as IntSet
-import Data.List
+import qualified Data.Set as Set
+import Data.List ( nub
+                 , sortBy
+                 , foldl'
+                 , intersect
+                 , partition
+                 , (\\)
+                 , sort
+                 , intercalate)
 import Data.Maybe (fromJust, fromMaybe, isJust, isNothing)
 import Data.Ord (comparing)
 import Text.Printf (printf)
@@ -334,61 +345,65 @@
              upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
          in compareTables cur_tbl upd_tbl
 
--- | Given the status of the current secondary as a valid new node and
--- the current candidate target node, generate the possible moves for
--- a instance.
-possibleMoves :: MirrorType -- ^ The mirroring type of the instance
-              -> Bool       -- ^ Whether the secondary node is a valid new node
-              -> Bool       -- ^ Whether we can change the primary node
-              -> Bool       -- ^ Whether we alowed to move disks
-              -> (Bool, Bool) -- ^ Whether migration is restricted and whether
-                              -- the instance primary is offline
-              -> Ndx        -- ^ Target node candidate
-              -> [IMove]    -- ^ List of valid result moves
+-- | Generate all possible migration moves of an instance given some
+-- additional parameters
+migrationMoves :: MirrorType -- ^ The mirroring type of the instance
+               -> Bool       -- ^ Whether the secondary node is active
+               -> [Ndx]      -- ^ Target node candidate list
+               -> [IMove]    -- ^ List of valid result moves
+migrationMoves MirrorNone _ _ = []
+migrationMoves MirrorInternal False _ = []
+migrationMoves MirrorInternal True  _ = [Failover]
+migrationMoves MirrorExternal _ nodes_idx = map FailoverToAny nodes_idx
 
-possibleMoves MirrorNone _ _ _ _ _ = []
+-- | Generate all possible disk moves (complex instance moves consist of disk
+-- moves and maybe migrations) of an instance given some additional parameters
+diskMoves :: MirrorType   -- ^ The mirroring type of the instance
+          -> Bool         -- ^ Whether the secondary node is a valid new node
+          -> Bool         -- ^ Whether we can change the primary node
+          -> (Bool, Bool) -- ^ Whether migration is restricted and whether
+                          -- the instance primary is offline
+          -> [Ndx]        -- ^ Target node candidates list
+          -> [IMove]      -- ^ List of valid result moves
+diskMoves MirrorNone _ _ _ _ = []
+diskMoves MirrorExternal _ _ _ _ = []
+diskMoves MirrorInternal valid_sec inst_moves restr nodes_idx =
+  concatMap (intMirrSingleDiskMove valid_sec inst_moves restr) nodes_idx
+  where
+    intMirrSingleDiskMove _ False _ tdx =
+      [ReplaceSecondary tdx]
 
-possibleMoves MirrorExternal _ False _ _ _ = []
+    intMirrSingleDiskMove _ _ (True, False) tdx =
+      [ReplaceSecondary tdx]
 
-possibleMoves MirrorExternal _ True _ _ tdx =
-  [ FailoverToAny tdx ]
+    intMirrSingleDiskMove True True (False, _) tdx =
+      [ ReplaceSecondary tdx
+      , ReplaceAndFailover tdx
+      , ReplacePrimary tdx
+      , FailoverAndReplace tdx
+      ]
 
-possibleMoves MirrorInternal _ _ False _ _ = []
+    intMirrSingleDiskMove True True (True, True) tdx =
+      [ ReplaceSecondary tdx
+      , ReplaceAndFailover tdx
+      , FailoverAndReplace tdx
+      ]
 
-possibleMoves MirrorInternal _ False True _ tdx =
-  [ ReplaceSecondary tdx ]
+    intMirrSingleDiskMove False True _ tdx =
+      [ ReplaceSecondary tdx
+      , ReplaceAndFailover tdx
+      ]
 
-possibleMoves MirrorInternal _ _ True (True, False) tdx =
-  [ ReplaceSecondary tdx
-  ]
-
-possibleMoves MirrorInternal True True True (False, _) tdx =
-  [ ReplaceSecondary tdx
-  , ReplaceAndFailover tdx
-  , ReplacePrimary tdx
-  , FailoverAndReplace tdx
-  ]
-
-possibleMoves MirrorInternal True True True (True, True) tdx =
-  [ ReplaceSecondary tdx
-  , ReplaceAndFailover tdx
-  , FailoverAndReplace tdx
-  ]
-
-possibleMoves MirrorInternal False True True _ tdx =
-  [ ReplaceSecondary tdx
-  , ReplaceAndFailover tdx
-  ]
 
 -- | Compute the best move for a given instance.
 checkInstanceMove ::  AlgorithmOptions -- ^ Algorithmic options for balancing
                   -> [Ndx]             -- ^ Allowed target node indices
                   -> Table             -- ^ Original table
                   -> Instance.Instance -- ^ Instance to move
-                  -> Table             -- ^ Best new table for this instance
+                  -> (Table, Table)    -- ^ Pair of best new tables:
+                                       -- migrations only and with disk moves
 checkInstanceMove opts nodes_idx ini_tbl@(Table nl _ _ _) target =
   let force = algIgnoreSoftErrors opts
-      disk_moves = algDiskMoves opts
       inst_moves = algInstanceMoves opts
       rest_mig = algRestrictedMigration opts
       opdx = Instance.pNode target
@@ -397,19 +412,23 @@
       nodes = filter (`notElem` bad_nodes) nodes_idx
       mir_type = Instance.mirrorType target
       use_secondary = elem osdx nodes_idx && inst_moves
-      aft_failover = if mir_type == MirrorInternal && use_secondary
-                       -- if drbd and allowed to failover
-                       then checkSingleStep force ini_tbl target ini_tbl
-                              Failover
-                       else ini_tbl
       primary_drained = Node.offline
                         . flip Container.find nl
                         $ Instance.pNode target
-      all_moves = concatMap (possibleMoves mir_type use_secondary inst_moves
-                             disk_moves (rest_mig, primary_drained)) nodes
-    in
-      -- iterate over the possible nodes for this instance
-      foldl' (checkSingleStep force ini_tbl target) aft_failover all_moves
+
+      migrations = migrationMoves mir_type use_secondary nodes
+      disk_moves = diskMoves mir_type use_secondary inst_moves
+                   (rest_mig, primary_drained) nodes
+
+      -- iterate over the possible nodes and migrations for this instance
+      best_migr_tbl =
+        if inst_moves
+          then foldl' (checkSingleStep force ini_tbl target) ini_tbl migrations
+          else ini_tbl
+      -- iterate over the possible moves for this instance
+      best_tbl =
+        foldl' (checkSingleStep force ini_tbl target) best_migr_tbl disk_moves
+  in (best_migr_tbl, best_tbl)
 
 -- | Compute the best next move.
 checkMove :: AlgorithmOptions       -- ^ Algorithmic options for balancing
@@ -417,27 +436,32 @@
              -> Table               -- ^ The current solution
              -> [Instance.Instance] -- ^ List of instances still to move
              -> Table               -- ^ The new solution
-checkMove opts nodes_idx ini_tbl victims =
-  let Table _ _ _ ini_plc = ini_tbl
+checkMove opts nodes_idx ini_tbl@(Table _ _ ini_cv _) victims =
+  let disk_moves = algDiskMoves opts
+      disk_moves_f = algDiskMovesFactor opts
       -- we're using rwhnf from the Control.Parallel.Strategies
       -- package; we don't need to use rnf as that would force too
       -- much evaluation in single-threaded cases, and in
       -- multi-threaded case the weak head normal form is enough to
       -- spark the evaluation
-      tables = parMap rwhnf (checkInstanceMove opts nodes_idx ini_tbl)
-               victims
+      table_pairs = parMap rwhnf (checkInstanceMove opts nodes_idx ini_tbl)
+                    victims
+
       -- iterate over all instances, computing the best move
-      best_tbl = foldl' compareTables ini_tbl tables
-      Table _ _ _ best_plc = best_tbl
-  in if length best_plc == length ini_plc
-       then ini_tbl -- no advancement
-       else best_tbl
+      best_migr_tbl@(Table _ _ best_migr_cv _) =
+        foldl' compareTables ini_tbl $ map fst table_pairs
+      best_tbl@(Table _ _ best_cv _) =
+        foldl' compareTables ini_tbl $ map snd table_pairs
+  in if not disk_moves
+     || ini_cv - best_cv <= (ini_cv - best_migr_cv) * disk_moves_f
+       then best_migr_tbl
+       else best_tbl -- best including disk moves
 
 -- | Check if we are allowed to go deeper in the balancing.
 doNextBalance :: Table     -- ^ The starting table
               -> Int       -- ^ Remaining length
               -> Score     -- ^ Score at which to stop
-              -> Bool      -- ^ The resulting table and commands
+              -> Bool      -- ^ True if we can continue
 doNextBalance ini_tbl max_rounds min_score =
   let Table _ _ ini_cv ini_plc = ini_tbl
       ini_plc_len = length ini_plc
@@ -463,7 +487,13 @@
         reloc_inst = filter (\i -> Instance.movable i &&
                                    Instance.autoBalance i) all_inst'
         node_idx = map Node.idx online_nodes
-        fin_tbl = checkMove opts node_idx ini_tbl reloc_inst
+        allowed_node = maybe (const True) (flip Set.member)
+                         $ algAllowedNodes opts
+        good_nidx = filter allowed_node node_idx
+        allowed_inst = liftA2 (&&) (allowed_node . Instance.pNode)
+                         (liftA2 (||) allowed_node (< 0) . Instance.sNode)
+        good_reloc_inst = filter allowed_inst reloc_inst
+        fin_tbl = checkMove opts good_nidx ini_tbl good_reloc_inst
         (Table _ _ fin_cv _) = fin_tbl
     in
       if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
@@ -841,6 +871,39 @@
   of x:_ -> Just . snd $ x
      _ -> Nothing
 
+-- | For a failure determine the underlying resource that most likely
+-- causes this kind of failure. In particular, N+1 violations are most
+-- likely caused by lack of memory.
+underlyingCause :: FailMode -> FailMode
+underlyingCause FailN1 = FailMem
+underlyingCause x = x
+
+-- | Shrink a resource of an instance until the failure statistics for
+-- this resource changes. Note that it might no be possible to allocate
+-- an instance at this size; nevertheless there might be a need to change
+-- the resource to shrink on, e.g., if the current instance is too big on
+-- two resources.
+doShrink :: (Instance.Instance -> AllocSolution) -> Instance.Instance
+         -> FailMode -> Maybe Instance.Instance
+doShrink allocFn inst fm =
+  let physRes = underlyingCause fm
+      getCount = runListHead 0 snd . filter ((==) physRes . fst)
+                 . collapseFailures . map underlyingCause . asFailures
+      initialStat = getCount $ allocFn inst
+      hasChanged = ((/=) initialStat . getCount . fst)
+      -- as the list of possible shrinks can be quite long, and, moreover,
+      -- has some cost of computing it, our heuristics is to look into it
+      -- only for a limited range; only once the list is shorter, we do
+      -- binary search.
+      lookAhead = 50
+      heuristics xs = if null (drop lookAhead xs)
+                        then length xs `div` 2
+                        else lookAhead
+  in fmap snd
+     . monotoneFind heuristics hasChanged
+     . map (allocFn &&& id)
+     $ iterateOk (`Instance.shrinkByType` physRes) inst
+
 -- | Tiered allocation method.
 --
 -- This places instances on the cluster, and decreases the spec until
@@ -857,21 +920,20 @@
                                Nothing -> (False, Nothing)
                                Just n -> (n <= ixes_cnt,
                                             Just (n - ixes_cnt))
-          sortedErrs = map fst $ sortBy (comparing snd) errs
-          suffShrink = sufficesShrinking
-                         (fromMaybe emptyAllocSolution
-                          . flip (tryAlloc opts nl' il') allocnodes)
-                       newinst
-          bigSteps = filter isJust . map suffShrink . reverse $ sortedErrs
+          sortedErrs = nub . map (underlyingCause . fst)
+                        $ sortBy (flip $ comparing snd) errs
+          allocFn = fromMaybe emptyAllocSolution
+                      . flip (tryAlloc opts nl' il') allocnodes
+          suffShrink = sufficesShrinking allocFn newinst
+          bigSteps = filter isJust . map suffShrink $ drop 1 sortedErrs
           progress (Ok (_, _, _, newil', _)) (Ok (_, _, _, newil, _)) =
             length newil' > length newil
           progress _ _ = False
       in if stop then newsol else
-           let newsol' = case Instance.shrinkByType newinst . last
-                                $ sortedErrs of
-                 Bad _ -> newsol
-                 Ok newinst' -> tieredAlloc opts nl' il' newlimit
-                                newinst' allocnodes ixes' cstats'
+           let newsol' = case map (doShrink allocFn newinst) sortedErrs of
+                 Just newinst' : _ -> tieredAlloc opts nl' il' newlimit
+                                        newinst' allocnodes ixes' cstats'
+                 _ -> newsol
            in if progress newsol' newsol then newsol' else
                 case bigSteps of
                   Just newinst':_ -> tieredAlloc opts nl' il' newlimit
diff --git a/src/Ganeti/HTools/Cluster/AllocatePrimitives.hs b/src/Ganeti/HTools/Cluster/AllocatePrimitives.hs
index 3e90e02..f8e9aa9 100644
--- a/src/Ganeti/HTools/Cluster/AllocatePrimitives.hs
+++ b/src/Ganeti/HTools/Cluster/AllocatePrimitives.hs
@@ -39,14 +39,14 @@
 
 import Ganeti.HTools.AlgorithmParams (AlgorithmOptions(..))
 import Ganeti.HTools.Cluster.AllocationSolution (AllocElement)
-import Ganeti.HTools.Cluster.Metrics ( compCV, compCVfromStats
+import Ganeti.HTools.Cluster.Metrics ( ClusterStatistics, compCV
+                                     , compCVfromStats
                                      , updateClusterStatisticsTwice)
 import Ganeti.HTools.Cluster.Moves (setInstanceLocationScore)
 import qualified Ganeti.HTools.Container as Container
 import qualified Ganeti.HTools.Instance as Instance
 import qualified Ganeti.HTools.Node as Node
 import Ganeti.HTools.Types
-import Ganeti.Utils.Statistics
 
 -- | Tries to allocate an instance on one given node.
 allocateOnSingle :: AlgorithmOptions
@@ -65,7 +65,7 @@
 
 -- | Tries to allocate an instance on a given pair of nodes.
 allocateOnPair :: AlgorithmOptions
-               -> [Statistics]
+               -> ClusterStatistics
                -> Node.List -> Instance.Instance -> Ndx -> Ndx
                -> OpResult AllocElement
 allocateOnPair opts stats nl inst new_pdx new_sdx =
diff --git a/src/Ganeti/HTools/Cluster/Metrics.hs b/src/Ganeti/HTools/Cluster/Metrics.hs
index a1681ee..ff1662a 100644
--- a/src/Ganeti/HTools/Cluster/Metrics.hs
+++ b/src/Ganeti/HTools/Cluster/Metrics.hs
@@ -1,3 +1,5 @@
+{-# LANGUAGE TemplateHaskell #-}
+
 {-| Implementation of the cluster metric
 
 -}
@@ -33,7 +35,8 @@
 -}
 
 module Ganeti.HTools.Cluster.Metrics
-  ( compCV
+  ( ClusterStatistics
+  , compCV
   , compCVfromStats
   , compCVNodes
   , compClusterStatistics
@@ -42,172 +45,24 @@
   , printStats
   ) where
 
-import Control.Monad (guard)
-import Data.List (partition, transpose)
-import Data.Maybe (fromMaybe)
-import Text.Printf (printf)
-
 import qualified Ganeti.HTools.Container as Container
 import qualified Ganeti.HTools.Node as Node
-import qualified Ganeti.HTools.PeerMap as P
-import Ganeti.HTools.Types
-import Ganeti.Utils (printTable)
-import Ganeti.Utils.Statistics
+import qualified Ganeti.HTools.Cluster.MetricsComponents as M
+import Ganeti.HTools.Cluster.MetricsTH
 
--- | Coefficient for the total reserved memory in the cluster metric. We
--- use a (local) constant here, as it is also used in the computation of
--- the best possible cluster score.
-reservedMemRtotalCoeff :: Double
-reservedMemRtotalCoeff = 0.25
-
--- | The names and weights of the individual elements in the CV list, together
--- with their statistical accumulation function and a bit to decide whether it
--- is a statistics for online nodes.
-detailedCVInfoExt :: [((Double, String)
-                     , ([AggregateComponent] -> Statistics, Bool))]
-detailedCVInfoExt = [ ((0.5,  "free_mem_cv"), (getStdDevStatistics, True))
-                    , ((0.5,  "free_disk_cv"), (getStdDevStatistics, True))
-                    , ((1,  "n1_cnt"), (getSumStatistics, True))
-                    , ((1,  "reserved_mem_cv"), (getStdDevStatistics, True))
-                    , ((4,  "offline_all_cnt"), (getSumStatistics, False))
-                    , ((16, "offline_pri_cnt"), (getSumStatistics, False))
-                    , ( (0.5,  "vcpu_ratio_cv")
-                      , (getStdDevStatistics, True))
-                    , ((1,  "cpu_load_cv"), (getStdDevStatistics, True))
-                    , ((1,  "mem_load_cv"), (getStdDevStatistics, True))
-                    , ((1,  "disk_load_cv"), (getStdDevStatistics, True))
-                    , ((1,  "net_load_cv"), (getStdDevStatistics, True))
-                    , ((2,  "pri_tags_score"), (getSumStatistics, True))
-                    , ((0.5,  "spindles_cv"), (getStdDevStatistics, True))
-                    , ((0.5,  "free_mem_cv_forth"), (getStdDevStatistics, True))
-                    , ( (0.5,  "free_disk_cv_forth")
-                      , (getStdDevStatistics, True))
-                    , ( (0.5,  "vcpu_ratio_cv_forth")
-                      , (getStdDevStatistics, True))
-                    , ((0.5,  "spindles_cv_forth"), (getStdDevStatistics, True))
-                    , ((1,  "location_score"), (getSumStatistics, True))
-                    , ( (1,  "location_exclusion_score")
-                      , (getMapStatistics, True))
-                    , ( (reservedMemRtotalCoeff,  "reserved_mem_rtotal")
-                      , (getSumStatistics, True))
-                    ]
-
--- | Compute the lower bound of the cluster score, i.e., the sum of the minimal
--- values for all cluster score values that are not 0 on a perfectly balanced
--- cluster.
-optimalCVScore :: Node.List -> Double
-optimalCVScore nodelist = fromMaybe 0 $ do
-  let nodes = Container.elems nodelist
-  guard $ length nodes > 1
-  let nodeMems = map Node.tMem nodes
-      totalMem = sum nodeMems
-      totalMemOneLessNode = totalMem - maximum nodeMems
-  guard $ totalMemOneLessNode > 0
-  let totalDrbdMem = fromIntegral . sum $ map (P.sumElems . Node.peers) nodes
-      optimalUsage = totalDrbdMem / totalMem
-      optimalUsageOneLessNode = totalDrbdMem / totalMemOneLessNode
-      relativeReserved = optimalUsageOneLessNode - optimalUsage
-  return $ reservedMemRtotalCoeff * relativeReserved
-
--- | The names and weights of the individual elements in the CV list.
-detailedCVInfo :: [(Double, String)]
-detailedCVInfo = map fst detailedCVInfoExt
-
--- | Holds the weights used by 'compCVNodes' for each metric.
-detailedCVWeights :: [Double]
-detailedCVWeights = map fst detailedCVInfo
-
--- | The aggregation functions for the weights
-detailedCVAggregation :: [([AggregateComponent] -> Statistics, Bool)]
-detailedCVAggregation = map snd detailedCVInfoExt
-
--- | The bit vector describing which parts of the statistics are
--- for online nodes.
-detailedCVOnlineStatus :: [Bool]
-detailedCVOnlineStatus = map snd detailedCVAggregation
-
--- | Compute statistical measures of a single node.
-compDetailedCVNode  :: Node.Node -> [AggregateComponent]
-compDetailedCVNode node =
-  let mem = Node.pMem node
-      memF = Node.pMemForth node
-      dsk = Node.pDsk node
-      dskF = Node.pDskForth node
-      n1 = fromIntegral
-           $ if Node.failN1 node
-               then length (Node.sList node) + length (Node.pList node)
-               else 0
-      res = Node.pRem node
-      ipri = fromIntegral . length $ Node.pList node
-      isec = fromIntegral . length $ Node.sList node
-      ioff = ipri + isec
-      cpu = Node.pCpuEff node
-      cpuF = Node.pCpuEffForth node
-      DynUtil c1 m1 d1 nn1 = Node.utilLoad node
-      DynUtil c2 m2 d2 nn2 = Node.utilPool node
-      (c_load, m_load, d_load, n_load) = (c1/c2, m1/m2, d1/d2, nn1/nn2)
-      pri_tags = fromIntegral $ Node.conflictingPrimaries node
-      spindles = Node.instSpindles node / Node.hiSpindles node
-      spindlesF = Node.instSpindlesForth node / Node.hiSpindles node
-      location_score = fromIntegral $ Node.locationScore node
-      location_exclusion_score = Node.instanceMap node
-  in [ SimpleNumber mem, SimpleNumber dsk, SimpleNumber n1, SimpleNumber res
-     , SimpleNumber ioff, SimpleNumber ipri, SimpleNumber cpu
-     , SimpleNumber c_load, SimpleNumber m_load, SimpleNumber d_load
-     , SimpleNumber n_load
-     , SimpleNumber pri_tags, SimpleNumber spindles
-     , SimpleNumber memF, SimpleNumber dskF, SimpleNumber cpuF
-     , SimpleNumber spindlesF
-     , SimpleNumber location_score
-     , SpreadValues location_exclusion_score
-     , SimpleNumber res
-     ]
-
--- | Compute the statistics of a cluster.
-compClusterStatistics :: [Node.Node] -> [Statistics]
-compClusterStatistics all_nodes =
-  let (offline, nodes) = partition Node.offline all_nodes
-      offline_values = transpose (map compDetailedCVNode offline)
-                       ++ repeat []
-      -- transpose of an empty list is empty and not k times the empty list, as
-      -- would be the transpose of a 0 x k matrix
-      online_values = transpose $ map compDetailedCVNode nodes
-      aggregate (f, True) (onNodes, _) = f onNodes
-      aggregate (f, False) (_, offNodes) = f offNodes
-  in zipWith aggregate detailedCVAggregation
-       $ zip online_values offline_values
-
--- | Update a cluster statistics by replacing the contribution of one
--- node by that of another.
-updateClusterStatistics :: [Statistics]
-                           -> (Node.Node, Node.Node) -> [Statistics]
-updateClusterStatistics stats (old, new) =
-  let update = zip (compDetailedCVNode old) (compDetailedCVNode new)
-      online = not $ Node.offline old
-      updateStat forOnline stat upd = if forOnline == online
-                                        then updateStatistics stat upd
-                                        else stat
-  in zipWith3 updateStat detailedCVOnlineStatus stats update
+$(declareStatistics M.metricComponents)
 
 -- | Update a cluster statistics twice.
-updateClusterStatisticsTwice :: [Statistics]
+updateClusterStatisticsTwice :: ClusterStatistics
                                 -> (Node.Node, Node.Node)
                                 -> (Node.Node, Node.Node)
-                                -> [Statistics]
+                                -> ClusterStatistics
 updateClusterStatisticsTwice s a =
   updateClusterStatistics (updateClusterStatistics s a)
 
--- | Compute cluster statistics
-compDetailedCV :: [Node.Node] -> [Double]
-compDetailedCV = map getStatisticValue . compClusterStatistics
-
--- | Compute the cluster score from its statistics
-compCVfromStats :: [Statistics] -> Double
-compCVfromStats = sum . zipWith (*) detailedCVWeights . map getStatisticValue
-
--- | Compute the /total/ variance.
+-- | Compute the total cluster score given the nodes.
 compCVNodes :: [Node.Node] -> Double
-compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
+compCVNodes = compCVfromStats . compClusterStatistics
 
 -- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
 compCV :: Node.List -> Double
@@ -215,14 +70,5 @@
 
 -- | Shows statistics for a given node list.
 printStats :: String -> Node.List -> String
-printStats lp nl =
-  let dcvs = compDetailedCV $ Container.elems nl
-      (weights, names) = unzip detailedCVInfo
-      hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
-      header = [ "Field", "Value", "Weight" ]
-      formatted = map (\(w, h, val) ->
-                         [ h
-                         , printf "%.8f" val
-                         , printf "x%.2f" w
-                         ]) hd
-  in printTable lp header formatted $ False:repeat True
+printStats lp =
+  showClusterStatistics lp . compClusterStatistics . Container.elems
diff --git a/src/Ganeti/HTools/Cluster/MetricsComponents.hs b/src/Ganeti/HTools/Cluster/MetricsComponents.hs
new file mode 100644
index 0000000..85f20ee
--- /dev/null
+++ b/src/Ganeti/HTools/Cluster/MetricsComponents.hs
@@ -0,0 +1,171 @@
+{-# LANGUAGE TemplateHaskell #-}
+
+{-| Module describing cluster metrics components.
+
+    Metrics components are used to generate functions dealing with cluster
+    statistics.
+
+-}
+
+{-
+
+Copyright (C) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.HTools.Cluster.MetricsComponents
+  ( metricComponents
+  ) where
+
+
+import Control.Monad (guard)
+import Data.Maybe (fromMaybe)
+import Language.Haskell.TH
+
+import Ganeti.HTools.Cluster.MetricsTH (MetricComponent(..))
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Node as Node
+import qualified Ganeti.HTools.PeerMap as P
+import Ganeti.HTools.Types
+import Ganeti.Utils.Statistics
+
+-- | Type alias decreasing table size below
+type D = Double
+
+-- | List containing all currently enabled cluster metrics components
+metricComponents :: [MetricComponent]
+metricComponents =
+  [ stdDevComp "free_mem_cv"               [| 0.5  :: D |] True [| Node.pMem |]
+  , stdDevComp "free_disk_cv"              [| 0.5  :: D |] True [| Node.pDsk |]
+  , stdDevComp "vcpu_ratio_cv"             [| 0.5  :: D |] True
+    [| Node.pCpuEff |]
+  , sumComp    "spindles_cv"               [| 0.5  :: D |] True
+    [| \n -> Node.instSpindles n / Node.hiSpindles n |]
+  , sumComp    "fail_n1"                   [| 0.5  :: D |] True
+    [| \n -> if Node.failN1 n
+               then toDouble  $ length (Node.sList n) + length (Node.pList n)
+               else 0 |]
+  , stdDevComp "reserved_mem_cv"           [| 1    :: D |] True [| Node.pRem |]
+  , sumComp    "offline_all_cnt"           [| 4    :: D |] False
+    [| \n -> toDouble $ length (Node.pList n) + length (Node.sList n) |]
+  , sumComp    "offline_pri_cnt"           [| 16   :: D |] False
+    [| toDouble . length . Node.pList |]
+  , stdDevComp "cpu_load_cv"               [| 1    :: D |] True
+    [| \n -> let DynUtil c1 _ _ _ = Node.utilLoad n
+                 DynUtil c2 _ _ _ = Node.utilPool n
+             in c1/c2 |]
+  , stdDevComp "mem_load_cv"               [| 1    :: D |] True
+    [| \n -> let DynUtil _ m1 _ _ = Node.utilLoad n
+                 DynUtil _ m2 _ _ = Node.utilPool n
+             in m1/m2 |]
+  , stdDevComp "disk_load_cv"              [| 1    :: D |] True
+    [| \n -> let DynUtil _ _ d1 _ = Node.utilLoad n
+                 DynUtil _ _ d2 _ = Node.utilPool n
+             in d1/d2 |]
+  , stdDevComp "net_load_cv"               [| 1    :: D |] True
+    [| \n -> let DynUtil _ _ _ n1 = Node.utilLoad n
+                 DynUtil _ _ _ n2 = Node.utilPool n
+             in n1/n2 |]
+  , sumComp     "pri_tags_score"           [| 2    :: D |] True
+    [| toDouble . Node.conflictingPrimaries |]
+  , sumComp     "location_score"           [| 1    :: D |] True
+    [| toDouble . Node.locationScore |]
+  , mapComp     "location_exclusion_score" [| 0.5  :: D |] True
+    [| MapData . Node.instanceMap |]
+  , stdDevComp "free_mem_cv_forth"         [| 0.5  :: D |] True
+    [| Node.pMemForth    |]
+  , stdDevComp "free_disk_cv_forth"        [| 0.5  :: D |] True
+    [| Node.pDskForth    |]
+  , stdDevComp "vcpu_ratio_cv_forth"       [| 0.5  :: D |] True
+    [| Node.pCpuEffForth |]
+  , sumComp    "spindles_cv_forth"         [| 0.5  :: D |] True
+    [| \n -> Node.instSpindlesForth n / Node.hiSpindles n |]
+  , reservedMemRTotal
+  ]
+
+-- | Function to be used as a short MetricComponent constructor for SumStat.
+sumComp :: String -> ExpQ -> Bool -> ExpQ -> MetricComponent
+sumComp nm w on f = MetricComponent { name = nm
+                                    , weight = w
+                                    , fromNode = f
+                                    , fromNodeType = [t| Double |]
+                                    , statisticsType = [t| SumStat |]
+                                    , forOnlineNodes = on
+                                    , optimalValue = Nothing
+                                    }
+
+-- | Function to be used as a short MetricComponent constructor for StdDevStat.
+stdDevComp :: String -> ExpQ -> Bool -> ExpQ -> MetricComponent
+stdDevComp nm w on f = MetricComponent { name = nm
+                                       , weight = w
+                                       , fromNode = f
+                                       , fromNodeType = [t| Double |]
+                                       , statisticsType = [t| StdDevStat |]
+                                       , forOnlineNodes = on
+                                       , optimalValue = Nothing
+                                       }
+
+-- | Function to be used as a short MetricComponent constructor for MapStat.
+mapComp :: String -> ExpQ -> Bool -> ExpQ -> MetricComponent
+mapComp nm w on f = MetricComponent { name = nm
+                                    , weight = w
+                                    , fromNode = f
+                                    , fromNodeType = [t| MapData |]
+                                    , statisticsType = [t| MapStat |]
+                                    , forOnlineNodes = on
+                                    , optimalValue = Nothing
+                                    }
+
+-- | Weight of reservedMemRTotal component
+wReservedMemRTotal :: Double
+wReservedMemRTotal = 0.25
+
+reservedMemRTotal :: MetricComponent
+reservedMemRTotal = MetricComponent
+  { name = "reserved_mem_rtotal"
+  , weight = [| wReservedMemRTotal :: D |]
+  , fromNode =  [| Node.pRem |]
+  , fromNodeType = [t| Double |]
+  , statisticsType = [t| SumStat |]
+  , forOnlineNodes = True
+  , optimalValue = Just [| reservedMemRTotalOptValue |]
+  }
+
+-- | Computes theoretical optimal value for reservedMemRTotal component
+reservedMemRTotalOptValue :: Node.List -> Double
+reservedMemRTotalOptValue nodelist = fromMaybe 0 $ do
+  let nodes = Container.elems nodelist
+  guard $ length nodes > 1
+  let nodeMems = map Node.tMem nodes
+      totalMem = sum nodeMems
+      totalMemOneLessNode = totalMem - maximum nodeMems
+  guard $ totalMemOneLessNode > 0
+  let totalDrbdMem = fromIntegral . sum $ map (P.sumElems . Node.peers) nodes
+      optimalUsage = totalDrbdMem / totalMem
+      optimalUsageOneLessNode = totalDrbdMem / totalMemOneLessNode
+      relativeReserved = optimalUsageOneLessNode - optimalUsage
+  return $ wReservedMemRTotal * relativeReserved
diff --git a/src/Ganeti/HTools/Cluster/MetricsTH.hs b/src/Ganeti/HTools/Cluster/MetricsTH.hs
new file mode 100644
index 0000000..1e2265f
--- /dev/null
+++ b/src/Ganeti/HTools/Cluster/MetricsTH.hs
@@ -0,0 +1,263 @@
+{-# LANGUAGE TemplateHaskell #-}
+
+{-| Declaration of the datatypes and functions dealing with cluster metrics
+    generated by template haskell.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+
+module Ganeti.HTools.Cluster.MetricsTH
+  ( MetricComponent(..)
+  , declareStatistics
+  ) where
+
+import Data.List (partition)
+import Data.Maybe (mapMaybe)
+import Language.Haskell.TH
+import Text.Printf (printf)
+
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.Utils (printTable)
+import Ganeti.Utils.Statistics
+
+-- | Data type describing the metric component. The information provided by
+-- this data type is used to generate statistics data types and functions
+-- dealing with them
+data MetricComponent = MetricComponent
+  { name           :: String -- ^ The component name
+  , weight         :: Q Exp  -- ^ The component weight in the statistics sum
+  , fromNode       :: Q Exp  -- ^ Quasi quoted function obtaining spread value
+                             -- from a node given (Node.Node -> fromNodeType)
+  , fromNodeType   :: Q Type -- ^ Quasi quoted spread value type
+  , statisticsType :: Q Type -- ^ Quasi quoted statistics data type. Stat
+                             -- instance for fromNodeType and statisticsType
+                             -- should be defined
+  , forOnlineNodes :: Bool   -- ^ Whether this component should be calculated
+                             -- for online or offline nodes
+  , optimalValue   :: Maybe ExpQ  -- ^ Maybe quasi quoted function obtaining
+                                  -- optimal value of such component
+                                  -- (Node.List -> Double)
+  }
+
+-- | Declares all functions and data types implemented in template haskell
+declareStatistics :: [MetricComponent] -> Q [Dec]
+declareStatistics components = do
+  nodeValues              <- nodeValuesDecl components
+  getNodeValues           <- getNodeValuesDecl components
+  clusterStatistics       <- clusterStatisticsDecl components
+  compClusterStatistics   <- compClusterStatisticsDecl components
+  updateClusterStatistics <- updateClusterStatisticsDecl components
+  compCVfromStats         <- compCVfromStatsDecl components
+  showClusterStatistics   <- showClusterStatisticsDecl components
+  optimalCVScore          <- optimalCVScoreDecl components
+  return $ nodeValues ++ getNodeValues ++ clusterStatistics ++
+           compClusterStatistics ++ updateClusterStatistics ++
+           compCVfromStats ++ showClusterStatistics ++
+           optimalCVScore
+
+-- | Helper function constructing VarStrictTypeQ
+getVarStrictTypeQ :: (String, Q Type) -> VarStrictTypeQ
+getVarStrictTypeQ (n, t) = do
+  t' <- t
+  return (mkName n, NotStrict, t')
+
+-- | Function constructs NodeValues data type for metric components given.
+-- The data type is used to store all spread values of one Node.
+nodeValuesDecl :: [MetricComponent] -> Q [Dec]
+nodeValuesDecl components = do
+  let names = map (("nv_" ++ ) . name ) components
+      types = map fromNodeType components
+  strict_types <- mapM getVarStrictTypeQ $ zip names types
+  return [DataD [] (mkName "NodeValues") []
+         [RecC (mkName "NodeValues") strict_types] []]
+
+-- | Function constructs ClusterStatistics data type for metric components
+-- given. The data type is used to store all Statistics constructed from the
+-- [NodeValues].
+clusterStatisticsDecl :: [MetricComponent] -> Q [Dec]
+clusterStatisticsDecl components = do
+  let names = map (("cs_" ++ ) . name ) components
+      types = map statisticsType components
+  strict_types <- mapM getVarStrictTypeQ $ zip names types
+  return [DataD [] (mkName "ClusterStatistics") []
+         [RecC (mkName "ClusterStatistics") strict_types] []]
+
+-- | Generates (getNodeValues :: Node.Node -> NodeValues) declaration for
+-- metric components given. The function constructs NodeValues by calling
+-- fromNode function for each metrics component.
+getNodeValuesDecl :: [MetricComponent] -> Q [Dec]
+getNodeValuesDecl components = do
+  extract_functions <- mapM fromNode components
+  x <- newName "node"
+  node_t <- [t| Node.Node |]
+  let names = map (mkName . ("nv_" ++) . name) components
+      values = map (\f -> AppE f (VarE x)) extract_functions
+      body_exp = RecConE (mkName "NodeValues") $ zip names values
+      fname = mkName "getNodeValues"
+      nv_t = ConT $ mkName "NodeValues"
+      sig_d = SigD fname (ArrowT `AppT` node_t `AppT` nv_t)
+      fun_d = FunD fname [Clause [VarP x] (NormalB body_exp) []]
+  return [sig_d, fun_d]
+
+-- | Helper function passing two arguments to a function
+appTwice :: Q Exp -> Q Exp -> Q Exp -> Q Exp
+appTwice fun arg1 = appE $ appE fun arg1
+
+-- | Helper function constructing Q (Name, Exp)
+getQNameExp :: String -> Q Exp -> Q (Name, Exp)
+getQNameExp n e = do
+  e' <- e
+  return (mkName n, e')
+
+-- | Generates (compClusterStatistics :: [Node.Node] ->
+-- ClusterStatistics) declaration for metric components given. The function
+-- constructs ClusterStatistics by calling calculate function for each spread
+-- values list. Spread values lists are obtained by getNodeValues call.
+compClusterStatisticsDecl :: [MetricComponent] -> Q [Dec]
+compClusterStatisticsDecl components = do
+  nl_i <- newName "nl"
+  let splitted = appTwice [| partition |] [| Node.offline |] (varE nl_i)
+      (nl_off, nl_on) = (appE [| fst |] splitted, appE [| snd |] splitted)
+      (online, offline) = partition forOnlineNodes components
+      nv_f nm = varE . mkName $ "nv_" ++ nm
+      nvl_f = appTwice [| map |] (varE (mkName "getNodeValues"))
+      nv_field nm = appTwice [| map |] $ nv_f nm
+      cs_field nm nvl = appE [| calculate |] $ nv_field nm nvl
+      (online_names, offline_names)  = (map name online, map name offline)
+      offline_f = map (\nm -> getQNameExp ("cs_" ++ nm) .
+                              cs_field nm $ nvl_f nl_off) offline_names
+      online_f  = map (\nm -> getQNameExp ("cs_" ++ nm) .
+                              cs_field nm $ nvl_f nl_on ) online_names
+      body = recConE (mkName "ClusterStatistics") $ offline_f ++ online_f
+      cls_stat_t = conT $ mkName "ClusterStatistics"
+      fname = mkName "compClusterStatistics"
+  sig_d <- sigD fname ((arrowT `appT` [t| [Node.Node] |]) `appT` cls_stat_t)
+  fun_d <- funD fname [clause [varP nl_i] (normalB body) []]
+  return [sig_d, fun_d]
+
+-- | Generates (updateClusterStatistics :: ClusterStatistics ->
+-- (Node.Node, Node.Node) -> ClusterStatistics) declaration for metric
+-- components given. If the first node is online, the function calls update
+-- for each online-node field and keeps the offline-node fields; otherwise it
+-- returns the old ClusterStatistics unchanged. This replaces the contribution
+-- of the first node by the contribution of the second node.
+updateClusterStatisticsDecl :: [MetricComponent] -> Q [Dec]
+updateClusterStatisticsDecl components = do
+  old_s <- newName "old_s"
+  n  <- newName "n"
+  n' <- newName "n'"
+  let (online, offline) = partition forOnlineNodes components
+      pattern = [varP old_s, tupP [varP n, varP n']]
+      is_node_online = appE [| not . Node.offline |] $ varE n
+      get_nv nd = appE (varE $ mkName "getNodeValues") $ varE nd
+      nv_get_field nm nd = appE (varE . mkName $ "nv_" ++ nm) $ get_nv nd
+      cs_cur_field nm = appE (varE . mkName $ "cs_" ++ nm) $ varE old_s
+      update_field nm = appTwice (appE [| update |] $ cs_cur_field nm)
+                                  (nv_get_field nm n) (nv_get_field nm n')
+      (online_names, offline_names) = (map name online, map name offline)
+      offline_f = map (\nm -> getQNameExp ("cs_" ++ nm) $
+                                          cs_cur_field nm) offline_names
+      online_f  = map (\nm -> getQNameExp ("cs_" ++ nm) $
+                                          update_field nm) online_names
+      body = condE is_node_online
+             (recConE (mkName "ClusterStatistics") $ offline_f ++ online_f)
+             (varE old_s)
+      fname = mkName "updateClusterStatistics"
+      cs_t = conT $ mkName "ClusterStatistics"
+  sig_d <- sigD fname ((arrowT `appT` cs_t) `appT`
+                       ((arrowT `appT` [t| (Node.Node, Node.Node) |]) `appT`
+                         cs_t))
+  fun_d <- funD fname [clause pattern (normalB body) []]
+  return [sig_d, fun_d]
+
+-- | Generates (compCVfromStats :: ClusterStatistics -> Double) declaration
+-- for metric components given. The function computes the cluster score from
+-- the ClusterStatistics.
+compCVfromStatsDecl :: [MetricComponent] -> Q [Dec]
+compCVfromStatsDecl components = do
+  cs <- newName "cs"
+  let get_comp c = appE (varE . mkName $ "cs_" ++ name c) $ varE cs
+      get_val c = appE [| getValue |] $ get_comp c
+      term c = appTwice [| (*) :: Double -> Double -> Double |]
+                         (get_val c) (weight c)
+      stat = appE [| sum :: [Double] -> Double |] . listE $ map term components
+      fname = mkName "compCVfromStats"
+      cs_t = conT $ mkName "ClusterStatistics"
+  sig_d <- sigD fname ((arrowT `appT` cs_t) `appT` [t| Double |])
+  fun_d <- funD fname [clause [varP cs] (normalB stat) []]
+  return [sig_d, fun_d]
+
+-- | Generates (showClusterStatistics :: String -> ClusterStatistics ->
+-- String) declaration for metric components given. The function converts
+-- ClusterStatistics to a string containing a table obtained by printTable.
+showClusterStatisticsDecl :: [MetricComponent] -> Q [Dec]
+showClusterStatisticsDecl components = do
+  lp <- newName "lp"
+  cs <- newName "cs"
+  let get_comp c = appE (varE . mkName $ "cs_" ++ name c) $ varE cs
+      get_val c = appE [| getValue |] $ get_comp c
+      format w h val = listE [ h
+                             , appE [| printf "%.8f" |] val
+                             , appE [| printf "x%.2f"|] w
+                             ]
+      print_line c = format (weight c) (litE . StringL $ name c) (get_val c)
+      header = [| [ "Field", "Value", "Weight" ] |]
+      printed = listE $ map print_line components
+      result = appTwice (appTwice [| printTable |] (varE lp) header)
+                         printed [| False:repeat True |]
+      fname = mkName "showClusterStatistics"
+      cs_t = conT $ mkName "ClusterStatistics"
+  sig_d <- sigD fname ((arrowT `appT` [t| String |]) `appT`
+                       ((arrowT `appT` cs_t) `appT` [t| String |]))
+  fun_d <- funD fname [clause [varP lp, varP cs] (normalB result) []]
+  return [sig_d, fun_d]
+
+
+-- | Generates (optimalCVScore :: Node.List -> Double) declaration for metric
+-- components given. The function computes the lower bound of the cluster
+-- score, i.e., the sum of the minimal values for all cluster score values that
+-- are not 0 on a perfectly balanced cluster. Components whose optimal value
+-- is 0 have Nothing as their optimalValue field.
+optimalCVScoreDecl :: [MetricComponent] -> Q [Dec]
+optimalCVScoreDecl components = do
+  nl <- newName "nl"
+  let stat =
+        foldl (addVal nl) [| 0 :: Double |] $ mapMaybe optimalValue components
+      fname = mkName "optimalCVScore"
+  sig_d <- sigD fname ((arrowT `appT` [t| Node.List |]) `appT` [t| Double |])
+  fun_d <- funD fname [clause [varP nl] (normalB stat) []]
+  return [sig_d, fun_d]
+  where
+    addVal :: Name -> ExpQ -> ExpQ -> ExpQ
+    addVal nl cur f = appTwice [| (+) :: Double -> Double -> Double |]
+                               cur . appE f $ varE nl
diff --git a/src/Ganeti/HTools/Dedicated.hs b/src/Ganeti/HTools/Dedicated.hs
index 206513a..00413a6 100644
--- a/src/Ganeti/HTools/Dedicated.hs
+++ b/src/Ganeti/HTools/Dedicated.hs
@@ -44,7 +44,10 @@
   , runDedicatedAllocation
   ) where
 
-import Control.Applicative (liftA2, (<$>))
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (liftA2)
 import Control.Arrow ((&&&))
 import Control.Monad (unless, liftM, foldM, mplus)
 import qualified Data.Foldable as F
diff --git a/src/Ganeti/HTools/ExtLoader.hs b/src/Ganeti/HTools/ExtLoader.hs
index 56e2e80..b322cb3 100644
--- a/src/Ganeti/HTools/ExtLoader.hs
+++ b/src/Ganeti/HTools/ExtLoader.hs
@@ -122,8 +122,9 @@
   now <- getClockTime
 
   let ignoreDynU = optIgnoreDynu opts
+      startIdle = ignoreDynU || optIdleDefault opts
       eff_u = if ignoreDynU then [] else util_data
-      ldresult = input_data >>= (if ignoreDynU then clearDynU else return)
+      ldresult = input_data >>= (if startIdle then clearDynU else return)
                             >>= mergeData eff_u exTags selInsts exInsts now
   cdata <- exitIfBad "failed to load data, aborting" ldresult
   (cdata', ok) <- runWriterT $ if optMonD opts
diff --git a/src/Ganeti/HTools/Instance.hs b/src/Ganeti/HTools/Instance.hs
index 63b3024..33e40be 100644
--- a/src/Ganeti/HTools/Instance.hs
+++ b/src/Ganeti/HTools/Instance.hs
@@ -338,7 +338,9 @@
 
 -- | Checks if an instance is bigger than a given spec.
 instAboveISpec :: Instance -> T.ISpec -> Bool -> T.OpResult ()
-instAboveISpec = instCompareISpec LT
+instAboveISpec inst spec exclstore =
+  genericResult (const $ Bad T.FailTooSmall) Ok
+  $ instCompareISpec LT inst spec exclstore
 
 -- | Checks if an instance matches a min/max specs pair
 instMatchesMinMaxSpecs :: Instance -> T.MinMaxISpecs -> Bool -> T.OpResult ()
diff --git a/src/Ganeti/HTools/Loader.hs b/src/Ganeti/HTools/Loader.hs
index 50ffbc1..2294468 100644
--- a/src/Ganeti/HTools/Loader.hs
+++ b/src/Ganeti/HTools/Loader.hs
@@ -53,6 +53,7 @@
   , ClusterData(..)
   , isAllocationRequest
   , emptyCluster
+  , obtainNodeMemory
   , extractDesiredLocations
   , updateDesiredLocationTags
   ) where
@@ -76,8 +77,11 @@
 import qualified Ganeti.HTools.Tags as Tags
 import qualified Ganeti.HTools.Tags.Constants as TagsC
 import Ganeti.HTools.Types
+import qualified Ganeti.Types as T
+import qualified Ganeti.Objects as O
 import Ganeti.Utils
 import Ganeti.Types (EvacMode)
+import Ganeti.JSON
 
 -- * Types
 
@@ -417,3 +421,14 @@
 eitherLive :: (Monad m) => Bool -> a -> m a -> m a
 eitherLive True _ live_data = live_data
 eitherLive False def_data _ = return def_data
+
+-- | Obtains memory used by node: memory_dom0 for Xen (when non-zero) and
+-- memNode otherwise, because a live data collector exists only for Xen.
+obtainNodeMemory :: O.FilledHvState -> Int -> Int
+obtainNodeMemory hv_state memory_dom0 =
+  let getNM ((_, hvs):_) 0 = O.hvstateMemNode hvs  -- memory_dom0 is 0
+      getNM ((T.XenPvm, _):_) mem_dom0 = mem_dom0
+      getNM ((T.XenHvm, _):_) mem_dom0 = mem_dom0
+      getNM ((_, hvs):_) _ = O.hvstateMemNode hvs  -- first entry is not Xen
+      getNM _ mem_dom0 = mem_dom0  -- empty hv_state
+  in getNM (M.toList $ fromContainer hv_state) memory_dom0
diff --git a/src/Ganeti/HTools/Node.hs b/src/Ganeti/HTools/Node.hs
index 79993ad..6749568 100644
--- a/src/Ganeti/HTools/Node.hs
+++ b/src/Ganeti/HTools/Node.hs
@@ -43,6 +43,7 @@
   , create
   -- ** Finalization after data loading
   , buildPeers
+  , computePmem
   , setIdx
   , setAlias
   , setOffline
@@ -99,13 +100,15 @@
   , haveExclStorage
   ) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Monad (liftM, liftM2)
-import Control.Applicative ((<$>), (<*>))
 import qualified Data.Foldable as Foldable
 import Data.Function (on)
 import qualified Data.Graph as Graph
 import qualified Data.IntMap as IntMap
-import Data.List hiding (group)
+import Data.List (intercalate, foldl', delete, union, sortBy, groupBy)
 import qualified Data.Map as Map
 import Data.Ord (comparing)
 import qualified Data.Set as Set
@@ -302,6 +305,10 @@
 haveExclStorage nl =
   any exclStorage $ Container.elems nl
 
+-- | Conversion formula from fMem, tMem and nMem to pMem: fMem / (tMem - nMem).
+computePmem :: Int -> Double -> Int -> Double
+computePmem fmem tmem nmem = fromIntegral fmem / (tmem - fromIntegral nmem)
+
 -- * Initialization functions
 
 -- | Create a new node.
@@ -342,8 +349,8 @@
        , peers = P.empty
        , rMem = 0
        , rMemForth = 0
-       , pMem = fromIntegral mem_f_init / mem_t_init
-       , pMemForth = fromIntegral mem_f_init / mem_t_init
+       , pMem = computePmem mem_f_init mem_t_init mem_n_init
+       , pMemForth = computePmem mem_f_init mem_t_init mem_n_init
        , pDsk = if excl_stor
                 then computePDsk spindles_f_init $ fromIntegral spindles_t_init
                 else computePDsk dsk_f_init dsk_t_init
@@ -450,12 +457,22 @@
        , hiCpu = mCpuTohiCpu (T.iPolicyVcpuRatio pol) (tCpu node)
        , hiSpindles = computeHiSpindles (T.iPolicySpindleRatio pol)
                       (tSpindles node)
+       , pMem = computePmem (fMem node) (tMem node) (nMem node)
+       , pMemForth = computePmem (fMemForth node) (tMem node) (nMem node)
        }
 
 -- | Computes the maximum reserved memory for peers from a peer map.
 computeMaxRes :: P.PeerMap -> P.Elem
 computeMaxRes = P.maxElem
 
+-- | Calculates the lowest acceptable amount of free memory. With memory
+-- over-commitment (memory ratio > 1, tMem > nMem) this value is negative.
+fMemTreshold :: Node -> Int
+fMemTreshold t =
+  fMemTresholdHelper (T.iPolicyMemoryRatio $ iPolicy t) (tMem t) (nMem t)
+  where fMemTresholdHelper ratio tmem nmem =
+          truncate $ (1 - ratio) * (tmem - fromIntegral nmem)
+
 -- | Builds the peer map for a given node.
 buildPeers :: Node -> Instance.List -> Node
 buildPeers t il =
@@ -472,7 +489,7 @@
               (sList t)
       pmap = P.accumArray (+) mdata
       new_rmem = computeMaxRes pmap
-      new_failN1 = fMem t < new_rmem
+      new_failN1 = fMem t - new_rmem <= fMemTreshold t
       new_prem = fromIntegral new_rmem / tMem t
   in t { peers = pmap
        , failN1 = new_failN1
@@ -595,7 +612,7 @@
                                 (fMemForth node)
                                 (Instance.mem inst)
 
-          new_pMemForth = fromIntegral new_fMemForth / tMem node
+          new_pMemForth = computePmem new_fMemForth (tMem node) (nMem node)
 
       in node
            { pTags = addTags (pTags node) (Instance.exclTags inst)
@@ -728,7 +745,7 @@
             new_dsk_forth = incIf uses_disk (fDskForth n) (Instance.dsk inst)
             new_free_sp_forth = calcNewFreeSpindlesForth False n inst
             new_inst_sp_forth = calcSpindleUseForth False n inst
-            new_mp_forth = fromIntegral new_mem_forth / tMem n
+            new_mp_forth = computePmem new_mem_forth (tMem n) (nMem n)
             new_dp_forth = computeNewPDsk n new_free_sp_forth new_dsk_forth
             new_ucpu_forth = decIf i_online (uCpuForth n) (Instance.vcpus inst)
             new_rcpu_forth = fromIntegral new_ucpu_forth / tCpu n
@@ -759,9 +776,9 @@
                 new_dsk = incIf uses_disk (fDsk t) (Instance.dsk inst)
                 new_free_sp = calcNewFreeSpindles False t inst
                 new_inst_sp = calcSpindleUse False t inst
-                new_mp = fromIntegral new_mem / tMem t
+                new_mp = computePmem new_mem (tMem t) (nMem t)
                 new_dp = computeNewPDsk t new_free_sp new_dsk
-                new_failn1 = new_mem <= rMem t
+                new_failn1 = new_mem - rMem t <= fMemTreshold t
                 new_ucpu = decIf i_online (uCpu t) (Instance.vcpus inst)
                 new_rcpu = fromIntegral new_ucpu / tCpu t
                 new_load = utilLoad t `T.subUtil` Instance.util inst
@@ -830,7 +847,7 @@
                              then old_rmem
                              else computeMaxRes new_peers
                 new_prem = fromIntegral new_rmem / tMem t
-                new_failn1 = fMem t <= new_rmem
+                new_failn1 = fMem t - new_rmem <= fMemTreshold t
                 new_dp = computeNewPDsk t new_free_sp new_dsk
                 old_load = utilLoad t
                 new_load = old_load
@@ -870,7 +887,7 @@
       inst_tags = Instance.exclTags inst
 
       new_mem_forth = fMemForth t - Instance.mem inst
-      new_mp_forth = fromIntegral new_mem_forth / tMem t
+      new_mp_forth = computePmem new_mem_forth (tMem t) (nMem t)
       new_dsk_forth = decIf uses_disk (fDskForth t) (Instance.dsk inst)
       new_free_sp_forth = calcNewFreeSpindlesForth True t inst
       new_inst_sp_forth = calcSpindleUseForth True t inst
@@ -898,7 +915,7 @@
           }
 
       checkForthcomingViolation
-        | new_mem_forth <= 0                            = Bad T.FailMem
+        | new_mem_forth <= fMemTreshold t                = Bad T.FailMem
         | uses_disk && new_dsk_forth <= 0               = Bad T.FailDisk
         | uses_disk && new_dsk_forth < loDsk t          = Bad T.FailDisk
         | uses_disk && exclStorage t
@@ -921,19 +938,19 @@
                new_dsk = decIf uses_disk (fDsk t) (Instance.dsk inst)
                new_free_sp = calcNewFreeSpindles True t inst
                new_inst_sp = calcSpindleUse True t inst
-               new_failn1 = new_mem <= rMem t
+               new_failn1 = new_mem - rMem t <= fMemTreshold t
                new_ucpu = incIf i_online (uCpu t) (Instance.vcpus inst)
                new_pcpu = fromIntegral new_ucpu / tCpu t
                new_dp = computeNewPDsk t new_free_sp new_dsk
                new_load = utilLoad t `T.addUtil` Instance.util inst
 
                new_plist = iname:pList t
-               new_mp = fromIntegral new_mem / tMem t
+               new_mp = computePmem new_mem (tMem t) (nMem t)
 
                new_instance_map = addTags (instanceMap t)
                                 $ getLocationExclusionPairs t inst
       in case () of
-        _ | new_mem <= 0 -> Bad T.FailMem
+        _ | new_mem <= fMemTreshold t -> Bad T.FailMem
           | uses_disk && new_dsk <= 0 -> Bad T.FailDisk
           | strict && uses_disk && new_dsk < loDsk t -> Bad T.FailDisk
           | uses_disk && exclStorage t && new_free_sp < 0 -> Bad T.FailSpindles
@@ -1019,7 +1036,7 @@
         | new_dsk_forth < loDsk t                = Bad T.FailDisk
         | exclStorage t && new_free_sp_forth < 0 = Bad T.FailSpindles
         | new_inst_sp_forth > hiSpindles t       = Bad T.FailDisk
-        | secondary_needed_mem >= old_mem_forth  = Bad T.FailMem
+        | old_mem_forth - secondary_needed_mem <= fMemTreshold t = Bad T.FailMem
         -- TODO Check failN1 including forthcoming instances
         | otherwise                              = Ok ()
 
@@ -1035,7 +1052,7 @@
                new_inst_sp = calcSpindleUse True t inst
                new_rmem = max (rMem t) new_peem
                new_prem = fromIntegral new_rmem / tMem t
-               new_failn1 = old_mem <= new_rmem
+               new_failn1 = old_mem - new_rmem <= fMemTreshold t
                new_dp = computeNewPDsk t new_free_sp new_dsk
                old_load = utilLoad t
                new_load = old_load
@@ -1049,7 +1066,8 @@
           | strict && new_dsk < loDsk t -> Bad T.FailDisk
           | exclStorage t && new_free_sp < 0 -> Bad T.FailSpindles
           | strict && new_inst_sp > hiSpindles t -> Bad T.FailDisk
-          | strict && secondary_needed_mem >= old_mem -> Bad T.FailMem
+          | strict && old_mem - secondary_needed_mem <= fMemTreshold t
+                                                   -> Bad T.FailMem
           | strict && new_failn1 && not (failN1 t) -> Bad T.FailMem
 
           -- When strict also check forthcoming limits, but after normal checks
@@ -1277,6 +1295,8 @@
            , OpCodes.opSecondaryIp = Nothing
            , OpCodes.opgenericNdParams = Nothing
            , OpCodes.opPowered = Nothing
+           , OpCodes.opVerbose = False
+           , OpCodes.opDebug = False
            }
 
 -- | Generate OpCode for applying a OobCommand to the given nodes
diff --git a/src/Ganeti/HTools/Program/Harep.hs b/src/Ganeti/HTools/Program/Harep.hs
index 8ad7deb..87d9b53 100644
--- a/src/Ganeti/HTools/Program/Harep.hs
+++ b/src/Ganeti/HTools/Program/Harep.hs
@@ -42,10 +42,7 @@
 import Control.Exception (bracket)
 import Control.Lens (over)
 import Control.Monad
-import Data.Function
-import Data.List
 import Data.Maybe
-import Data.Ord
 import System.Time
 import qualified Data.Map as Map
 import qualified Text.JSON as J
@@ -58,21 +55,18 @@
 import Ganeti.Jobs
 import Ganeti.OpCodes
 import Ganeti.OpCodes.Lens (metaParamsL, opReasonL)
-import Ganeti.OpParams
 import Ganeti.Types
 import Ganeti.Utils
-import qualified Ganeti.Constants as C
 import qualified Ganeti.Luxi as L
 import qualified Ganeti.Path as Path
 
 import Ganeti.HTools.CLI
+import qualified Ganeti.HTools.Container as Container
 import Ganeti.HTools.Loader
 import Ganeti.HTools.ExtLoader
-import qualified Ganeti.HTools.Tags.Constants as Tags
+import Ganeti.HTools.Repair
 import Ganeti.HTools.Types
-import qualified Ganeti.HTools.Container as Container
 import qualified Ganeti.HTools.Instance as Instance
-import qualified Ganeti.HTools.Node as Node
 
 import Ganeti.Version (version)
 
@@ -101,135 +95,6 @@
   . setOpComment ("automated repairs by harep " ++ version)
   . wrapOpCode
 
-data InstanceData = InstanceData { arInstance :: Instance.Instance
-                                 , arState :: AutoRepairStatus
-                                 , tagsToRemove :: [String]
-                                 }
-                    deriving (Eq, Show)
-
--- | Parse a tag into an 'AutoRepairData' record.
---
--- @Nothing@ is returned if the tag is not an auto-repair tag, or if it's
--- malformed.
-parseInitTag :: String -> Maybe AutoRepairData
-parseInitTag tag =
-  let parsePending = do
-        subtag <- chompPrefix Tags.autoRepairTagPending tag
-        case sepSplit ':' subtag of
-          [rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs
-          _                       -> fail ("Invalid tag: " ++ show tag)
-
-      parseResult = do
-        subtag <- chompPrefix Tags.autoRepairTagResult tag
-        case sepSplit ':' subtag of
-          [rtype, uuid, ts, result, jobs] -> do
-            arData <- makeArData rtype uuid ts jobs
-            result' <- autoRepairResultFromRaw result
-            return arData { arResult = Just result' }
-          _                               -> fail ("Invalid tag: " ++ show tag)
-
-      makeArData rtype uuid ts jobs = do
-        rtype' <- autoRepairTypeFromRaw rtype
-        ts' <- tryRead "auto-repair time" ts
-        jobs' <- mapM makeJobIdS $ sepSplit '+' jobs
-        return AutoRepairData { arType = rtype'
-                              , arUuid = uuid
-                              , arTime = TOD ts' 0
-                              , arJobs = jobs'
-                              , arResult = Nothing
-                              , arTag = tag
-                              }
-  in
-   parsePending `mplus` parseResult
-
--- | Return the 'AutoRepairData' element of an 'AutoRepairStatus' type.
-getArData :: AutoRepairStatus -> Maybe AutoRepairData
-getArData status =
-  case status of
-    ArHealthy (Just d) -> Just d
-    ArFailedRepair  d  -> Just d
-    ArPendingRepair d  -> Just d
-    ArNeedsRepair   d  -> Just d
-    _                  -> Nothing
-
--- | Return a short name for each auto-repair status.
---
--- This is a more concise representation of the status, because the default
--- "Show" formatting includes all the accompanying auto-repair data.
-arStateName :: AutoRepairStatus -> String
-arStateName status =
-  case status of
-    ArHealthy _       -> "Healthy"
-    ArFailedRepair _  -> "Failure"
-    ArPendingRepair _ -> "Pending repair"
-    ArNeedsRepair _   -> "Needs repair"
-
--- | Return a new list of tags to remove that includes @arTag@ if present.
-delCurTag :: InstanceData -> [String]
-delCurTag instData =
-  let arData = getArData $ arState instData
-      rmTags = tagsToRemove instData
-  in
-   case arData of
-     Just d  -> arTag d : rmTags
-     Nothing -> rmTags
-
--- | Set the initial auto-repair state of an instance from its auto-repair tags.
---
--- The rules when there are multiple tags is:
---
---   * the earliest failure result always wins
---
---   * two or more pending repairs results in a fatal error
---
---   * a pending result from id X and a success result from id Y result in error
---     if Y is newer than X
---
---   * if there are no pending repairs, the newest success result wins,
---     otherwise the pending result is used.
-setInitialState :: Instance.Instance -> Result InstanceData
-setInitialState inst =
-  let arData = mapMaybe parseInitTag $ Instance.allTags inst
-      -- Group all the AutoRepairData records by id (i.e. by repair task), and
-      -- present them from oldest to newest.
-      arData' = sortBy (comparing arUuid) arData
-      arGroups = groupBy ((==) `on` arUuid) arData'
-      arGroups' = sortBy (comparing $ minimum . map arTime) arGroups
-  in
-   foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups'
-
--- | Update the initial status of an instance with new repair task tags.
---
--- This function gets called once per repair group in an instance's tag, and it
--- determines whether to set the status of the instance according to this new
--- group, or to keep the existing state. See the documentation for
--- 'setInitialState' for the rules to be followed when determining this.
-arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData
-arStatusCmp instData arData =
-  let curSt = arState instData
-      arData' = sortBy (comparing keyfn) arData
-      keyfn d = (arResult d, arTime d)
-      newData = last arData'
-      newSt = case arResult newData of
-                Just ArSuccess -> ArHealthy $ Just newData
-                Just ArEnoperm -> ArHealthy $ Just newData
-                Just ArFailure -> ArFailedRepair newData
-                Nothing        -> ArPendingRepair newData
-  in
-   case curSt of
-     ArFailedRepair _ -> Ok instData  -- Always keep the earliest failure.
-     ArHealthy _      -> Ok instData { arState = newSt
-                                     , tagsToRemove = delCurTag instData
-                                     }
-     ArPendingRepair d -> Bad (
-       "An unfinished repair was found in instance " ++
-       Instance.name (arInstance instData) ++ ": found tag " ++
-       show (arTag newData) ++ ", but older pending tag " ++
-       show (arTag d) ++ "exists.")
-
-     ArNeedsRepair _ -> Bad
-       "programming error: ArNeedsRepair found as an initial state"
-
 -- | Query jobs of a pending repair, returning the new instance data.
 processPending :: Options -> L.Client -> InstanceData -> IO InstanceData
 processPending opts client instData =
@@ -264,20 +129,6 @@
 
     _ -> return instData
 
--- | Update the tag of an 'AutoRepairData' record to match all the other fields.
-updateTag :: AutoRepairData -> AutoRepairData
-updateTag arData =
-  let ini = [autoRepairTypeToRaw $ arType arData,
-             arUuid arData,
-             clockTimeToString $ arTime arData]
-      end = [intercalate "+" . map (show . fromJobId) $ arJobs arData]
-      (pfx, middle) =
-         case arResult arData of
-          Nothing -> (Tags.autoRepairTagPending, [])
-          Just rs -> (Tags.autoRepairTagResult, [autoRepairResultToRaw rs])
-  in
-   arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) }
-
 -- | Apply and remove tags from an instance as indicated by 'InstanceData'.
 --
 -- If the /arState/ of the /InstanceData/ record has an associated
@@ -309,100 +160,6 @@
 
   return instData { tagsToRemove = [] }
 
--- | Detect brokenness with an instance and suggest repair type and jobs to run.
-detectBroken :: Node.List -> Instance.Instance
-             -> Maybe (AutoRepairType, [OpCode])
-detectBroken nl inst =
-  let disk = Instance.diskTemplate inst
-      iname = Instance.name inst
-      offPri = Node.offline $ Container.find (Instance.pNode inst) nl
-      offSec = Node.offline $ Container.find (Instance.sNode inst) nl
-  in
-   case disk of
-     DTDrbd8
-       | offPri && offSec ->
-         Just (
-           ArReinstall,
-           [ OpInstanceRecreateDisks { opInstanceName = iname
-                                     , opInstanceUuid = Nothing
-                                     , opRecreateDisksInfo = RecreateDisksAll
-                                     , opNodes = []
-                                       -- FIXME: there should be a better way to
-                                       -- specify opcode parameters than abusing
-                                       -- mkNonEmpty in this way (using the fact
-                                       -- that Maybe is used both for optional
-                                       -- fields, and to express failure).
-                                     , opNodeUuids = Nothing
-                                     , opIallocator = mkNonEmpty "hail"
-                                     }
-           , OpInstanceReinstall { opInstanceName = iname
-                                 , opInstanceUuid = Nothing
-                                 , opOsType = Nothing
-                                 , opTempOsParams = Nothing
-                                 , opOsparamsPrivate = Nothing
-                                 , opOsparamsSecret = Nothing
-                                 , opForceVariant = False
-                                 }
-           ])
-       | offPri ->
-         Just (
-           ArFailover,
-           [ OpInstanceFailover { opInstanceName = iname
-                                , opInstanceUuid = Nothing
-                                  -- FIXME: ditto, see above.
-                                , opShutdownTimeout = fromJust $ mkNonNegative
-                                                      C.defaultShutdownTimeout
-                                , opIgnoreConsistency = False
-                                , opTargetNode = Nothing
-                                , opTargetNodeUuid = Nothing
-                                , opIgnoreIpolicy = False
-                                , opIallocator = Nothing
-                                , opMigrationCleanup = False
-                                }
-           ])
-       | offSec ->
-         Just (
-           ArFixStorage,
-           [ OpInstanceReplaceDisks { opInstanceName = iname
-                                    , opInstanceUuid = Nothing
-                                    , opReplaceDisksMode = ReplaceNewSecondary
-                                    , opReplaceDisksList = []
-                                    , opRemoteNode = Nothing
-                                      -- FIXME: ditto, see above.
-                                    , opRemoteNodeUuid = Nothing
-                                    , opIallocator = mkNonEmpty "hail"
-                                    , opEarlyRelease = False
-                                    , opIgnoreIpolicy = False
-                                    }
-            ])
-       | otherwise -> Nothing
-
-     DTPlain
-       | offPri ->
-         Just (
-           ArReinstall,
-           [ OpInstanceRecreateDisks { opInstanceName = iname
-                                     , opInstanceUuid = Nothing
-                                     , opRecreateDisksInfo = RecreateDisksAll
-                                     , opNodes = []
-                                       -- FIXME: ditto, see above.
-                                     , opNodeUuids = Nothing
-                                     , opIallocator = mkNonEmpty "hail"
-                                     }
-           , OpInstanceReinstall { opInstanceName = iname
-                                 , opInstanceUuid = Nothing
-                                 , opOsType = Nothing
-                                 , opTempOsParams = Nothing
-                                 , opOsparamsPrivate = Nothing
-                                 , opOsparamsSecret = Nothing
-                                 , opForceVariant = False
-                                 }
-           ])
-       | otherwise -> Nothing
-
-     _ -> Nothing  -- Other cases are unimplemented for now: DTDiskless,
-                   -- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt.
-
 -- | Submit jobs, unless a dry-run is requested; in this case, just report
 -- the job that would be submitted.
 submitJobs' :: Options -> [[MetaOpCode]] -> L.Client -> IO (Result [JobId])
diff --git a/src/Ganeti/HTools/Program/Hbal.hs b/src/Ganeti/HTools/Program/Hbal.hs
index 084433a..68572dc 100644
--- a/src/Ganeti/HTools/Program/Hbal.hs
+++ b/src/Ganeti/HTools/Program/Hbal.hs
@@ -4,7 +4,7 @@
 
 {-
 
-Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.
+Copyright (C) 2009, 2010, 2011, 2012, 2013, 2015 Google Inc.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -50,6 +50,7 @@
 import Text.Printf (printf)
 
 import Ganeti.HTools.AlgorithmParams (AlgorithmOptions(..), fromCLIOptions)
+import Ganeti.HTools.Backend.MonD (scaleMemoryWeight)
 import qualified Ganeti.HTools.Container as Container
 import qualified Ganeti.HTools.Cluster as Cluster
 import qualified Ganeti.HTools.Cluster.Metrics as Metrics
@@ -101,15 +102,19 @@
     , oMinGain
     , oMinGainLim
     , oDiskMoves
+    , oAvoidDiskMoves
     , oSelInst
     , oInstMoves
     , oIgnoreSoftErrors
     , oDynuFile
+    , oIdleDefault
     , oIgnoreDyn 
     , oMonD
     , oMonDDataFile
     , oMonDExitMissing
     , oMonDXen
+    , oMonDKvmRSS
+    , oMemWeight
     , oExTags
     , oExInst
     , oSaveCluster
@@ -312,13 +317,14 @@
       showinsts = optShowInsts opts
       force = optIgnoreSoftErrors opts
 
-  ini_cdata@(ClusterData gl fixed_nl ilf ctags ipol) <- loadExternalData opts
+  ini_cdata@(ClusterData gl fixed_nl ilf' ctags ipol) <- loadExternalData opts
 
   when (verbose > 1) $ do
        putStrLn $ "Loaded cluster tags: " ++ intercalate "," ctags
        putStrLn $ "Loaded cluster ipolicy: " ++ show ipol
 
-  nlf <- setNodeStatus opts fixed_nl
+  nlf' <- setNodeStatus opts fixed_nl
+  let (nlf, ilf) = scaleMemoryWeight (optMemWeight opts) (nlf', ilf')
   checkCluster verbose nlf ilf
 
   maybeSaveData (optSaveCluster opts) "original" "before balancing" ini_cdata
diff --git a/src/Ganeti/HTools/Program/Hcheck.hs b/src/Ganeti/HTools/Program/Hcheck.hs
index a2251ff..1250ca2 100644
--- a/src/Ganeti/HTools/Program/Hcheck.hs
+++ b/src/Ganeti/HTools/Program/Hcheck.hs
@@ -53,8 +53,8 @@
 import qualified Ganeti.HTools.Group as Group
 import qualified Ganeti.HTools.Node as Node
 import qualified Ganeti.HTools.Instance as Instance
-
 import qualified Ganeti.HTools.Program.Hbal as Hbal
+import Ganeti.HTools.RedundancyLevel (redundancy)
 
 import Ganeti.Common
 import Ganeti.HTools.CLI
@@ -70,6 +70,7 @@
   return
     [ oDataFile
     , oDiskMoves
+    , oAvoidDiskMoves
     , oDynuFile
     , oIgnoreDyn
     , oEvacMode
@@ -111,7 +112,7 @@
 type GroupInfo = (Gdx, (Node.List, Instance.List))
 
 -- | A type alias for group stats.
-type GroupStats = ((Group.Group, Double), [Int])
+type GroupStats = ((Group.Group, Double, Int), [Int])
 
 -- | Prefix for machine readable names.
 htcPrefix :: String
@@ -130,10 +131,12 @@
 -- | Data showed per group.
 groupData :: Options -> [(String, String)]
 groupData opts = commonData opts ++ [("SCORE", "Group score")]
+                 ++ [("REDUNDANCY", "Group redundancy level")]
 
 -- | Data showed per cluster.
 clusterData :: Options -> [(String, String)]
 clusterData opts = commonData opts  ++
+              [ ("REDUNDANCY", "Cluster redundancy level") ] ++
               [ ("NEED_REBALANCE", "Cluster is not healthy") ]
 
 -- | Phase-specific prefix for machine readable version.
@@ -221,9 +224,9 @@
 extractGroupData False grp = Group.name grp
 
 -- | Prepare values for group.
-prepareGroupValues :: [Int] -> Double -> [String]
-prepareGroupValues stats score =
-  map show stats ++ [printf "%.8f" score]
+prepareGroupValues :: [Int] -> Double -> Int -> [String]
+prepareGroupValues stats score redundancyLevel =
+  map show stats ++ [printf "%.8f" score] ++ [show redundancyLevel]
 
 -- | Prepare values for cluster.
 prepareClusterValues :: Bool -> [Int] -> [Bool] -> [String]
@@ -232,15 +235,16 @@
 
 -- | Print all the statistics on a group level.
 printGroupStats :: Options -> Bool -> Phase -> GroupStats -> IO ()
-printGroupStats opts machineread phase ((grp, score), stats) = do
-  let values = prepareGroupValues stats score
+printGroupStats opts machineread phase
+                ((grp, score, redundancyLevel), stats) = do
+  let values = prepareGroupValues stats score redundancyLevel
       extradata = extractGroupData machineread grp
   printStats opts machineread (GroupLvl extradata) phase values
 
 -- | Print all the statistics on a cluster (global) level.
-printClusterStats :: Options -> Bool -> Phase -> [Int] -> Bool -> IO ()
-printClusterStats opts machineread phase stats needhbal = do
-  let values = prepareClusterValues machineread stats [needhbal]
+printClusterStats :: Options -> Bool -> Phase -> [Int] -> Bool -> Int -> IO ()
+printClusterStats opts machineread phase stats needhbal gRed = do
+  let values = prepareClusterValues machineread (stats ++ [gRed]) [needhbal]
   printStats opts machineread ClusterLvl phase values
 
 -- | Check if any of cluster metrics is non-zero.
@@ -263,13 +267,14 @@
       offline_pri = sum . map length $ map Node.pList offnl
       offline_sec = length $ map Node.sList offnl
       score = Metrics.compCV nl
+      redundancyLvl = redundancy (fromCLIOptions opts) nl il
       groupstats = [ n1violated
                    , conflicttags
                    , offline_pri
                    , offline_sec
                    ]
                    ++ [ gn1fail | optCapacity opts ]
-  in ((grp, score), groupstats)
+  in ((grp, score, redundancyLvl), groupstats)
 
 -- | Use Hbal's iterateDepth to simulate group rebalance.
 executeSimulation :: Options -> Cluster.Table -> Double
@@ -327,6 +332,7 @@
 
   let groupsstats = map (perGroupChecks opts gl) splitcluster
       clusterstats = map sum . transpose . map snd $ groupsstats
+      globalRedundancy = minimum $ map (\((_, _, r), _) -> r) groupsstats
       needrebalance = clusterNeedsRebalance clusterstats
 
   unless (verbose < 1 || machineread) .
@@ -339,6 +345,7 @@
   mapM_ (printGroupStats opts machineread Initial) groupsstats
 
   printClusterStats opts machineread Initial clusterstats needrebalance
+                    globalRedundancy
 
   let exitOK = nosimulation || not needrebalance
       simulate = not nosimulation && needrebalance
@@ -348,12 +355,14 @@
   when (simulate || machineread) $ do
     let newgroupstats = map (perGroupChecks opts gl) rebalancedcluster
         newclusterstats = map sum . transpose . map snd $ newgroupstats
+        newGlobalRedundancy = minimum $ map (\((_, _, r), _) -> r)
+                                            newgroupstats
         newneedrebalance = clusterNeedsRebalance clusterstats
 
     mapM_ (printGroupStats opts machineread Rebalanced) newgroupstats
 
     printClusterStats opts machineread Rebalanced newclusterstats
-                           newneedrebalance
+                           newneedrebalance newGlobalRedundancy
 
   printFinalHTC machineread
 
diff --git a/src/Ganeti/HTools/RedundancyLevel.hs b/src/Ganeti/HTools/RedundancyLevel.hs
new file mode 100644
index 0000000..ca77a10
--- /dev/null
+++ b/src/Ganeti/HTools/RedundancyLevel.hs
@@ -0,0 +1,76 @@
+{-| Implementation of the computation of the cluster redundancy level
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.HTools.RedundancyLevel
+  ( redundancy
+  ) where
+
+import Control.Applicative (liftA2)
+import Control.Arrow ((&&&))
+import Data.Function (on)
+import qualified Data.IntMap as IntMap
+import Data.List (sortBy)
+
+import Ganeti.BasicTypes (runListHead)
+import Ganeti.HTools.AlgorithmParams (AlgorithmOptions)
+import Ganeti.HTools.GlobalN1 (redundant)
+import qualified Ganeti.HTools.Cluster as Cluster
+import qualified Ganeti.HTools.Cluster.Metrics as Metrics
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Instance as Instance
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.Utils (iterateJust)
+
+-- | Estimate the level of redundancy of a node group given by its nodes
+-- and instances, by repeatedly offlining its biggest (by tMem) online node.
+redundancy :: AlgorithmOptions -> Node.List -> Instance.List -> Int
+redundancy _    nl _  | any (liftA2 (&&) Node.offline $ not . null . Node.pList)
+                        $ IntMap.elems nl = -1
+redundancy opts nl il | not $ redundant opts nl il = 0
+redundancy opts nl il =
+  let sortedNodes =  -- online nodes, biggest tMem first (ties: higher index)
+        sortBy (flip compare `on` ((Node.tMem . snd) &&& fst))
+        . filter (not . Node.offline . snd)
+        $ IntMap.toAscList nl
+   in case sortedNodes of
+     [] -> 0  -- no online node left
+     (indexBigNode, bigNode):_ ->
+       let bigNode' = bigNode { Node.offline = True }  -- simulate its loss
+           nl' = Container.add indexBigNode bigNode' nl
+           initialMetrics = Metrics.compCV nl'
+           initialTable = Cluster.Table nl' il initialMetrics []
+           Cluster.Table nl'' il' _ _ =
+             runListHead initialTable id . reverse
+             $ iterateJust (Cluster.tryBalance opts) initialTable
+       in 1 + redundancy opts nl'' il'
diff --git a/src/Ganeti/HTools/Repair.hs b/src/Ganeti/HTools/Repair.hs
new file mode 100644
index 0000000..4220635
--- /dev/null
+++ b/src/Ganeti/HTools/Repair.hs
@@ -0,0 +1,305 @@
+{-| Implementation of the auto-repair logic for Ganeti.
+
+-}
+
+{-
+
+Copyright (C) 2013, 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.HTools.Repair
+  ( InstanceData(..)
+  , parseInitTag
+  , getArData
+  , arStateName
+  , delCurTag
+  , setInitialState
+  , arStatusCmp
+  , updateTag
+  , detectBroken
+  ) where
+
+import Control.Monad (mplus, foldM)
+import Data.Function (on)
+import Data.List (sortBy, groupBy, intercalate)
+import Data.Maybe (mapMaybe, fromJust)
+import Data.Ord (comparing)
+import System.Time (ClockTime(TOD))
+
+import Ganeti.BasicTypes (GenericResult(..), Result)
+import qualified Ganeti.Constants as C
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Instance as Instance
+import qualified Ganeti.HTools.Node as Node
+import qualified Ganeti.HTools.Tags.Constants as Tags
+import Ganeti.HTools.Types
+import Ganeti.OpCodes (OpCode(..))
+import Ganeti.OpParams ( RecreateDisksInfo(RecreateDisksAll)
+                       , ReplaceDisksMode(ReplaceNewSecondary)
+                       )
+import Ganeti.Types (makeJobIdS, fromJobId, mkNonEmpty, mkNonNegative)
+import Ganeti.Utils (chompPrefix, sepSplit, tryRead, clockTimeToString)
+
+-- | Description of an instance annotated with repair-related information.
+data InstanceData = InstanceData { arInstance :: Instance.Instance
+                                 , arState :: AutoRepairStatus
+                                 , tagsToRemove :: [String]
+                                 }
+                    deriving (Eq, Show)
+
+
+-- | Parse a tag into an 'AutoRepairData' record.
+--
+-- @Nothing@ is returned if the tag is not an auto-repair tag, or if it's
+-- malformed.
+parseInitTag :: String -> Maybe AutoRepairData
+parseInitTag tag =
+  let parsePending = do
+        subtag <- chompPrefix Tags.autoRepairTagPending tag
+        case sepSplit ':' subtag of
+          [rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs
+          _                       -> fail ("Invalid tag: " ++ show tag)
+
+      parseResult = do
+        subtag <- chompPrefix Tags.autoRepairTagResult tag
+        case sepSplit ':' subtag of
+          [rtype, uuid, ts, result, jobs] -> do
+            arData <- makeArData rtype uuid ts jobs
+            result' <- autoRepairResultFromRaw result
+            return arData { arResult = Just result' }
+          _                               -> fail ("Invalid tag: " ++ show tag)
+
+      makeArData rtype uuid ts jobs = do
+        rtype' <- autoRepairTypeFromRaw rtype
+        ts' <- tryRead "auto-repair time" ts
+        jobs' <- mapM makeJobIdS $ sepSplit '+' jobs
+        return AutoRepairData { arType = rtype'
+                              , arUuid = uuid
+                              , arTime = TOD ts' 0
+                              , arJobs = jobs'
+                              , arResult = Nothing
+                              , arTag = tag
+                              }
+  in
+   parsePending `mplus` parseResult
+
+-- | Extract the 'AutoRepairData' payload of an 'AutoRepairStatus', if any.
+--
+-- Only a healthy status may come without repair data, in which case the
+-- result is @Nothing@; every other status yields the record it carries.
+getArData :: AutoRepairStatus -> Maybe AutoRepairData
+getArData (ArHealthy mData)        = mData
+getArData (ArFailedRepair arData)  = Just arData
+getArData (ArPendingRepair arData) = Just arData
+getArData (ArNeedsRepair arData)   = Just arData
+
+-- | Give the short, human-readable name of an auto-repair status.
+--
+-- The default "Show" formatting of a status also prints all of the
+-- accompanying auto-repair data; this function yields only a concise
+-- label that depends on the constructor alone. The @{}@ patterns below
+-- match a constructor while ignoring whatever fields it carries.
+arStateName :: AutoRepairStatus -> String
+arStateName ArHealthy{}       = "Healthy"
+arStateName ArFailedRepair{}  = "Failure"
+arStateName ArPendingRepair{} = "Pending repair"
+arStateName ArNeedsRepair{}   = "Needs repair"
+
+-- | Compute the tags to remove for an instance, including the current one.
+--
+-- The result is the instance's existing 'tagsToRemove' list; if its
+-- current state carries auto-repair data (see 'getArData'), the @arTag@
+-- of that data is prepended to the list.
+delCurTag :: InstanceData -> [String]
+delCurTag instData = maybe pending (: pending) curTag
+  where pending = tagsToRemove instData
+        curTag = fmap arTag . getArData $ arState instData
+
+-- | Set the initial auto-repair state of an instance from its auto-repair tags.
+--
+-- The rules when there are multiple tags are:
+--
+--   * the earliest failure result always wins
+--
+--   * two or more pending repairs result in a fatal error
+--
+--   * a pending result from id X and a success result from id Y result in error
+--     if Y is newer than X
+--
+--   * if there are no pending repairs, the newest success result wins,
+--     otherwise the pending result is used.
+setInitialState :: Instance.Instance -> Result InstanceData
+setInitialState inst =
+  let arData = mapMaybe parseInitTag $ Instance.allTags inst
+      -- Group all the AutoRepairData records by id (i.e. by repair task), and
+      -- order the groups by their earliest timestamp, oldest group first.
+      arData' = sortBy (comparing arUuid) arData
+      arGroups = groupBy ((==) `on` arUuid) arData'
+      arGroups' = sortBy (comparing $ minimum . map arTime) arGroups
+  in
+   foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups'
+
+-- | Update the initial status of an instance with new repair task tags.
+--
+-- This function gets called once per repair group in an instance's tags, and
+-- it determines whether to set the status of the instance according to this
+-- new group, or to keep the existing state. See the documentation for
+-- 'setInitialState' for the rules to be followed when determining this.
+arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData
+arStatusCmp instData arData =
+  let curSt = arState instData
+      arData' = sortBy (comparing keyfn) arData
+      keyfn d = (arResult d, arTime d)
+      newData = last arData'  -- greatest (result, time) entry of the group
+      newSt = case arResult newData of
+                Just ArSuccess -> ArHealthy $ Just newData
+                Just ArEnoperm -> ArHealthy $ Just newData
+                Just ArFailure -> ArFailedRepair newData
+                Nothing        -> ArPendingRepair newData
+  in
+   case curSt of
+     ArFailedRepair _ -> Ok instData  -- Always keep the earliest failure.
+     ArHealthy _      -> Ok instData { arState = newSt
+                                     , tagsToRemove = delCurTag instData
+                                     }
+     ArPendingRepair d -> Bad (
+       "An unfinished repair was found in instance " ++
+       Instance.name (arInstance instData) ++ ": found tag " ++
+       show (arTag newData) ++ ", but older pending tag " ++
+       show (arTag d) ++ " exists.")
+
+     ArNeedsRepair _ -> Bad
+       "programming error: ArNeedsRepair found as an initial state"
+
+-- | Update the tag of an 'AutoRepairData' record to match all the other fields.
+updateTag :: AutoRepairData -> AutoRepairData
+updateTag arData =
+  let ini = [autoRepairTypeToRaw $ arType arData,
+             arUuid arData,
+             clockTimeToString $ arTime arData]
+      end = [intercalate "+" . map (show . fromJobId) $ arJobs arData]
+      (pfx, middle) =
+         case arResult arData of
+          Nothing -> (Tags.autoRepairTagPending, [])
+          Just rs -> (Tags.autoRepairTagResult, [autoRepairResultToRaw rs])
+  in
+   arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) }
+
+-- | Detect brokenness with an instance and suggest repair type and jobs to run.
+detectBroken :: Node.List -> Instance.Instance
+             -> Maybe (AutoRepairType, [OpCode])
+detectBroken nl inst =
+  let disk = Instance.diskTemplate inst
+      iname = Instance.name inst
+      offPri = Node.offline $ Container.find (Instance.pNode inst) nl
+      offSec = Node.offline $ Container.find (Instance.sNode inst) nl
+  in
+   case disk of
+     DTDrbd8
+       | offPri && offSec ->
+         Just (
+           ArReinstall,
+           [ OpInstanceRecreateDisks { opInstanceName = iname
+                                     , opInstanceUuid = Nothing
+                                     , opRecreateDisksInfo = RecreateDisksAll
+                                     , opNodes = []
+                                       -- FIXME: there should be a better way to
+                                       -- specify opcode parameters than abusing
+                                       -- mkNonEmpty in this way (using the fact
+                                       -- that Maybe is used both for optional
+                                       -- fields, and to express failure).
+                                     , opNodeUuids = Nothing
+                                     , opIallocator = mkNonEmpty "hail"
+                                     }
+           , OpInstanceReinstall { opInstanceName = iname
+                                 , opInstanceUuid = Nothing
+                                 , opOsType = Nothing
+                                 , opTempOsParams = Nothing
+                                 , opOsparamsPrivate = Nothing
+                                 , opOsparamsSecret = Nothing
+                                 , opForceVariant = False
+                                 }
+           ])
+       | offPri ->
+         Just (
+           ArFailover,
+           [ OpInstanceFailover { opInstanceName = iname
+                                , opInstanceUuid = Nothing
+                                  -- FIXME: ditto, see above.
+                                , opShutdownTimeout = fromJust $ mkNonNegative
+                                                      C.defaultShutdownTimeout
+                                , opIgnoreConsistency = False
+                                , opTargetNode = Nothing
+                                , opTargetNodeUuid = Nothing
+                                , opIgnoreIpolicy = False
+                                , opIallocator = Nothing
+                                , opMigrationCleanup = False
+                                }
+           ])
+       | offSec ->
+         Just (
+           ArFixStorage,
+           [ OpInstanceReplaceDisks { opInstanceName = iname
+                                    , opInstanceUuid = Nothing
+                                    , opReplaceDisksMode = ReplaceNewSecondary
+                                    , opReplaceDisksList = []
+                                    , opRemoteNode = Nothing
+                                      -- FIXME: ditto, see above.
+                                    , opRemoteNodeUuid = Nothing
+                                    , opIallocator = mkNonEmpty "hail"
+                                    , opEarlyRelease = False
+                                    , opIgnoreIpolicy = False
+                                    }
+            ])
+       | otherwise -> Nothing
+
+     DTPlain
+       | offPri ->
+         Just (
+           ArReinstall,
+           [ OpInstanceRecreateDisks { opInstanceName = iname
+                                     , opInstanceUuid = Nothing
+                                     , opRecreateDisksInfo = RecreateDisksAll
+                                     , opNodes = []
+                                       -- FIXME: ditto, see above.
+                                     , opNodeUuids = Nothing
+                                     , opIallocator = mkNonEmpty "hail"
+                                     }
+           , OpInstanceReinstall { opInstanceName = iname
+                                 , opInstanceUuid = Nothing
+                                 , opOsType = Nothing
+                                 , opTempOsParams = Nothing
+                                 , opOsparamsPrivate = Nothing
+                                 , opOsparamsSecret = Nothing
+                                 , opForceVariant = False
+                                 }
+           ])
+       | otherwise -> Nothing
+
+     _ -> Nothing  -- Other cases are unimplemented for now: DTDiskless,
+                   -- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt.
diff --git a/src/Ganeti/HTools/Types.hs b/src/Ganeti/HTools/Types.hs
index a1fb765..d683e1b 100644
--- a/src/Ganeti/HTools/Types.hs
+++ b/src/Ganeti/HTools/Types.hs
@@ -254,6 +254,9 @@
       THH.simpleField ConstantUtils.ipolicyVcpuRatio [t| Double |]
   , THH.renameField "SpindleRatio" $
       THH.simpleField ConstantUtils.ipolicySpindleRatio [t| Double |]
+  , THH.renameField "MemoryRatio" .
+      THH.defaultField [| ConstantUtils.ipolicyDefaultsMemoryRatio |] $
+      THH.simpleField ConstantUtils.ipolicyMemoryRatio [t| Double |]
   ])
 
 -- | Converts an ISpec type to a RSpec one.
@@ -275,6 +278,7 @@
           , iPolicyDiskTemplates = [minBound..maxBound]
           , iPolicyVcpuRatio = ConstantUtils.ipolicyDefaultsVcpuRatio
           , iPolicySpindleRatio = ConstantUtils.ipolicyDefaultsSpindleRatio
+          , iPolicyMemoryRatio = ConstantUtils.ipolicyDefaultsMemoryRatio
           }
 
 -- | The dynamic resource specs of a machine (i.e. load or load
@@ -358,6 +362,8 @@
               | FailDisk -- ^ Failed due to not enough disk
               | FailCPU  -- ^ Failed due to not enough CPU capacity
               | FailN1   -- ^ Failed due to not passing N1 checks
+              | FailTooSmall -- ^ Failed due to the instance being smaller
+                             -- than allowed
               | FailTags -- ^ Failed due to tag exclusion
               | FailMig  -- ^ Failed due to migration restrictions
               | FailDiskCount -- ^ Failed due to wrong number of disks
@@ -377,10 +383,11 @@
 -- will instead raise an exception.
 type OpResult = GenericResult FailMode
 
--- | 'Error' instance for 'FailMode' designed to catch unintended
+-- | 'FromString' instance for 'FailMode' designed to catch unintended
 -- use as a general monad.
-instance Error FailMode where
-  strMsg v = error $ "Programming error: OpResult used as generic monad" ++ v
+instance FromString FailMode where
+  mkFromString v = error $ "Programming error: OpResult used as generic monad"
+                           ++ v
 
 -- | Conversion from 'OpResult' to 'Result'.
 opToResult :: OpResult a -> Result a
diff --git a/src/Ganeti/Hs2Py/OpDoc.hs b/src/Ganeti/Hs2Py/OpDoc.hs
index aee68db..f5f832d 100644
--- a/src/Ganeti/Hs2Py/OpDoc.hs
+++ b/src/Ganeti/Hs2Py/OpDoc.hs
@@ -168,6 +168,10 @@
 opRestrictedCommand =
   "Runs a restricted command on node(s)."
 
+opRepairCommand :: String
+opRepairCommand =
+  "Runs a repair command on a given node."
+
 opNodeRemove :: String
 opNodeRemove =
   "Remove a node.\n\
diff --git a/src/Ganeti/Hypervisor/Xen/XmParser.hs b/src/Ganeti/Hypervisor/Xen/XmParser.hs
index 00f1133..97a2edd 100644
--- a/src/Ganeti/Hypervisor/Xen/XmParser.hs
+++ b/src/Ganeti/Hypervisor/Xen/XmParser.hs
@@ -71,7 +71,7 @@
           doubleP = LCDouble <$> A.rational <* A.skipSpace <* A.endOfInput
           innerDoubleP = LCDouble <$> A.rational
           stringP = LCString . unpack <$> A.takeWhile1 (not . (\c -> isSpace c
-            || c `elem` "()"))
+            || c `elem` ("()" :: String)))
           wspace = AC.many1 A.space
           rparen = A.skipSpace *> A.char ')'
           finalP =   listConfigP <* rparen
@@ -163,5 +163,5 @@
 uptimeLineParser = do
   name <- A.takeTill isSpace <* A.skipSpace
   idNum <- A.decimal <* A.skipSpace
-  uptime <- A.takeTill (`elem` "\n\r") <* A.skipSpace
+  uptime <- A.takeTill (`elem` ("\n\r" :: String)) <* A.skipSpace
   return . UptimeInfo (unpack name) idNum $ unpack uptime
diff --git a/src/Ganeti/JQScheduler.hs b/src/Ganeti/JQScheduler.hs
index df6fefc..4c594fa 100644
--- a/src/Ganeti/JQScheduler.hs
+++ b/src/Ganeti/JQScheduler.hs
@@ -48,16 +48,29 @@
   , configChangeNeedsRescheduling
   ) where
 
-import Control.Applicative (liftA2, (<$>))
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (liftA2)
 import Control.Arrow
 import Control.Concurrent
 import Control.Exception
-import Control.Monad
+import Control.Monad ( when
+                     , mfilter
+                     , liftM
+                     , void
+                     , unless
+                     , forever
+                     , forM_)
 import Control.Monad.IO.Class
 import Data.Function (on)
-import Data.Functor ((<$))
 import Data.IORef (IORef, atomicModifyIORef, newIORef, readIORef)
-import Data.List
+import Data.List ( find
+                 , deleteFirstsBy
+                 , sortBy
+                 , intercalate
+                 , partition
+                 , insertBy)
 import Data.Maybe
 import qualified Data.Map as Map
 import Data.Ord (comparing)
@@ -132,10 +145,6 @@
 unreadJob :: QueuedJob -> JobWithStat
 unreadJob job = JobWithStat {jJob=job, jStat=nullFStat, jINotify=Nothing}
 
--- | Reload interval for polling the running jobs for updates in microseconds.
-watchInterval :: Int
-watchInterval = C.luxidJobqueuePollInterval * 1000000 
-
 -- | Read a cluster parameter from the configuration, using a default if the
 -- configuration is not available.
 getConfigValue :: (Cluster -> a) -> a -> JQStatus -> IO a
@@ -499,7 +508,7 @@
 -- | Time-based watcher for updating the job queue.
 onTimeWatcher :: JQStatus -> IO ()
 onTimeWatcher qstate = forever $ do
-  threadDelay watchInterval
+  threadDelaySeconds C.luxidJobqueuePollInterval
   logDebug "Job queue watcher timer fired"
   updateStatusAndScheduleSomeJobs qstate
   logDebug "Job queue watcher cycle finished"
diff --git a/src/Ganeti/JQueue.hs b/src/Ganeti/JQueue.hs
index 5c3b8f5..736fce6 100644
--- a/src/Ganeti/JQueue.hs
+++ b/src/Ganeti/JQueue.hs
@@ -82,21 +82,30 @@
     , QueuedJob(..)
     ) where
 
-import Control.Applicative (liftA2, (<|>), (<$>))
+import Prelude ()
+import Ganeti.Prelude hiding (id, log)
+
+import Control.Applicative (liftA2, (<|>))
 import Control.Arrow (first, second)
 import Control.Concurrent (forkIO, threadDelay)
 import Control.Exception
 import Control.Lens (over)
-import Control.Monad
+import Control.Monad ( filterM
+                     , liftM
+                     , foldM
+                     , void
+                     , mfilter
+                     , when
+                     , mzero
+                     , unless
+                     , msum)
 import Control.Monad.IO.Class
 import Control.Monad.Trans (lift)
 import Control.Monad.Trans.Maybe
-import Data.Functor ((<$))
-import Data.List
+import Data.List (stripPrefix, sortBy, isPrefixOf)
 import Data.Maybe
 import Data.Ord (comparing)
 -- workaround what seems to be a bug in ghc 7.4's TH shadowing code
-import Prelude hiding (id, log)
 import System.Directory
 import System.FilePath
 import System.IO.Error (isDoesNotExistError)
@@ -483,7 +492,7 @@
   mapM_ (replicateJob rootdir mastercandidates)
 
 -- | Writes a job to a file and replicates it to master candidates.
-writeAndReplicateJob :: (Error e)
+writeAndReplicateJob :: (FromString e)
                      => ConfigData -> FilePath -> QueuedJob
                      -> ResultT e IO [(Node, ERpcError ())]
 writeAndReplicateJob cfg rootdir job = do
diff --git a/src/Ganeti/JSON.hs b/src/Ganeti/JSON.hs
index 770da55..86323ba 100644
--- a/src/Ganeti/JSON.hs
+++ b/src/Ganeti/JSON.hs
@@ -62,6 +62,7 @@
   , lookupContainer
   , alterContainerL
   , readContainer
+  , getKeysFromContainer
   , mkUsedKeys
   , allUsedKeys
   , DictObject(..)
@@ -85,7 +86,7 @@
 
 import Control.Applicative
 import Control.DeepSeq
-import Control.Monad.Error.Class
+import Control.Monad.Error.Class (MonadError(..))
 import Control.Monad.Writer
 import qualified Data.ByteString as BS
 import qualified Data.ByteString.UTF8 as UTF8
@@ -148,8 +149,8 @@
 fromJResult _ (J.Ok x) = return x
 
 -- | Converts a JSON Result into a MonadError value.
-fromJResultE :: (Error e, MonadError e m) => String -> J.Result a -> m a
-fromJResultE s (J.Error x) = throwError . strMsg $ s ++ ": " ++ x
+fromJResultE :: (FromString e, MonadError e m) => String -> J.Result a -> m a
+fromJResultE s (J.Error x) = throwError . mkFromString $ s ++ ": " ++ x
 fromJResultE _ (J.Ok x) = return x
 
 -- | Tries to read a string from a JSON value.
@@ -247,10 +248,10 @@
     J.Ok x -> return x
 
 -- | Small wrapper over 'readJSON' for 'MonadError'.
-fromJValE :: (Error e, MonadError e m, J.JSON a) => J.JSValue -> m a
+fromJValE :: (FromString e, MonadError e m, J.JSON a) => J.JSValue -> m a
 fromJValE v =
   case J.readJSON v of
-    J.Error s -> throwError . strMsg $
+    J.Error s -> throwError . mkFromString $
                   "Cannot convert value '" ++ show (pp_value v) ++
                   "', error: " ++ s
     J.Ok x -> return x
@@ -338,6 +339,10 @@
 -- | Type alias for string keys.
 type Container = GenericContainer BS.ByteString
 
+-- | Returns all string keys from a container.
+getKeysFromContainer :: (Container a) -> [String]
+getKeysFromContainer = map UTF8.toString . Map.keys . fromContainer
+
 instance HasStringRepr BS.ByteString where
   fromStringRepr = return . UTF8.fromString
   toStringRepr = UTF8.toString
diff --git a/src/Ganeti/Jobs.hs b/src/Ganeti/Jobs.hs
index 01c2ac8..e31d74e 100644
--- a/src/Ganeti/Jobs.hs
+++ b/src/Ganeti/Jobs.hs
@@ -38,19 +38,24 @@
   , execWithCancel
   , execJobsWait
   , execJobsWaitOk
+  , execJobsWaitOkJid
   , waitForJobs
+  , forceFailover
   ) where
 
-import Control.Concurrent (threadDelay)
 import Control.Exception (bracket)
+import Control.Monad (void, forM)
+import Data.Functor.Identity (runIdentity)
 import Data.List
 import Data.Tuple
 import Data.IORef
 import System.Exit
 import System.Posix.Process
 import System.Posix.Signals
+import qualified Text.JSON as J
 
 import Ganeti.BasicTypes
+import qualified Ganeti.Constants as C
 import Ganeti.Errors
 import qualified Ganeti.Luxi as L
 import Ganeti.OpCodes
@@ -147,26 +152,36 @@
       callback jids'
       waitForJobs jids' client
 
+-- | Wait for one job until it is finished, using the WaitForJobChange
+-- luxi command. Return the JobId and the final job status.
+waitForJob :: L.Client -> L.JobId -> ResultT String IO (L.JobId, JobStatus)
+waitForJob c jid = waitForJob' J.JSNull 0 where
+  waitForJob' prevJob prevLog = do
+    rval <- mkResultT' $ L.callMethod (L.WaitForJobChange jid ["status"]
+                                       prevJob (J.showJSON prevLog)
+                                       C.luxiWfjcTimeout) c
+    let parsed = J.readJSON rval
+                 :: (J.Result ( [JobStatus]
+                              , [ (Int, J.JSValue, J.JSValue, J.JSValue)]))
+    (status, logs) <- case parsed of
+      J.Ok ([s], ls) -> return (s, ls)
+      J.Ok (s, _) -> fail $ "Expected precisely one job status, got " ++ show s
+      J.Error x -> fail $ show x
+    let pLog =  maximum $ prevLog : map (\(cnt, _, _, _) -> cnt) logs
+    if status > JOB_STATUS_RUNNING
+      then return (jid, status)
+      else waitForJob' (J.showJSON [status]) pLog
+
+
 -- | Polls a set of jobs at an increasing interval until all are finished one
 -- way or another.
 waitForJobs :: [L.JobId] -> L.Client -> IO (Result [(L.JobId, JobStatus)])
-waitForJobs jids client = waitForJobs' 500000 15000000
-  where
-    waitForJobs' delay maxdelay = do
-      -- TODO: this should use WaitForJobChange once it's available in Haskell
-      -- land, instead of a fixed schedule of sleeping intervals.
-      threadDelay delay
-      sts <- L.queryJobsStatus client jids
-      case sts of
-        Bad e -> return . Bad $ "Checking job status: " ++ formatError e
-        Ok sts' -> if any (<= JOB_STATUS_RUNNING) sts' then
-                     waitForJobs' (min (delay * 2) maxdelay) maxdelay
-                   else
-                     return . Ok $ zip jids sts'
+waitForJobs jids = runResultT . forM jids . waitForJob
 
--- | Execute jobs and return @Ok@ only if all of them succeeded.
-execJobsWaitOk :: [[MetaOpCode]] -> L.Client -> IO (Result ())
-execJobsWaitOk opcodes client = do
+-- | Execute jobs and return @Ok@ only if all of them succeeded; in
+-- this case, also return the list of Job IDs.
+execJobsWaitOkJid :: [[MetaOpCode]] -> L.Client -> IO (Result [JobId])
+execJobsWaitOkJid opcodes client = do
   let nullog = const (return () :: IO ())
       failed = filter ((/=) JOB_STATUS_SUCCESS . snd)
       fmtfail (i, s) = show (fromJobId i) ++ "=>" ++ jobStatusToRaw s
@@ -174,7 +189,28 @@
   case sts of
     Bad e -> return $ Bad e
     Ok sts' -> return (if null $ failed sts' then
-                         Ok ()
+                         Ok $ map fst sts'
                        else
                          Bad ("The following jobs failed: " ++
                               (intercalate ", " . map fmtfail $ failed sts')))
+
+-- | Execute jobs and return @Ok@ only if all of them succeeded.
+execJobsWaitOk :: [[MetaOpCode]] -> L.Client -> IO (Result ())
+execJobsWaitOk opcodes =
+  fmap void . execJobsWaitOkJid opcodes
+
+-- | Change migration opcodes into failovers; keep other opcodes unchanged.
+forceFailover :: OpCode -> OpCode
+forceFailover op@(OpInstanceMigrate {}) =
+  let timeout = runIdentity $ mkNonNegative C.defaultShutdownTimeout
+  in OpInstanceFailover { opInstanceName = opInstanceName op
+                        , opInstanceUuid = opInstanceUuid op
+                        , opShutdownTimeout = timeout
+                        , opIgnoreConsistency = True
+                        , opTargetNode = opTargetNode op
+                        , opTargetNodeUuid = opTargetNodeUuid op
+                        , opIgnoreIpolicy = opIgnoreIpolicy op
+                        , opMigrationCleanup = opMigrationCleanup op
+                        , opIallocator = opIallocator op
+                        }
+forceFailover op = op
diff --git a/src/Ganeti/Kvmd.hs b/src/Ganeti/Kvmd.hs
index 4979396..597298b 100644
--- a/src/Ganeti/Kvmd.hs
+++ b/src/Ganeti/Kvmd.hs
@@ -59,13 +59,13 @@
 
 module Ganeti.Kvmd where
 
-import Prelude hiding (rem)
+import Prelude ()
+import Ganeti.Prelude hiding (rem)
 
-import Control.Applicative ((<$>))
 import Control.Exception (try)
 import Control.Concurrent
 import Control.Monad (unless, when)
-import Data.List
+import Data.List (isPrefixOf, isInfixOf)
 import Data.Set (Set)
 import qualified Data.Set as Set (delete, empty, insert, member)
 import System.Directory
diff --git a/src/Ganeti/Lens.hs b/src/Ganeti/Lens.hs
index c7951e6..ca4719d 100644
--- a/src/Ganeti/Lens.hs
+++ b/src/Ganeti/Lens.hs
@@ -44,7 +44,10 @@
   , atSet
   ) where
 
-import Control.Applicative ((<$>), WrappedMonad(..))
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (WrappedMonad(..))
 import Control.Lens
 import Control.Monad
 import Data.Functor.Compose (Compose(..))
diff --git a/src/Ganeti/Locking/Allocation.hs b/src/Ganeti/Locking/Allocation.hs
index d1caa2a..4a681b4 100644
--- a/src/Ganeti/Locking/Allocation.hs
+++ b/src/Ganeti/Locking/Allocation.hs
@@ -50,9 +50,12 @@
   , freeLocks
   ) where
 
-import Control.Applicative (liftA2, (<$>), (<*>), pure)
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (liftA2)
 import Control.Arrow (second, (***))
-import Control.Monad
+import Control.Monad (unless, guard, foldM, when)
 import Data.Foldable (for_, find)
 import Data.List (foldl')
 import qualified Data.Map as M
diff --git a/src/Ganeti/Locking/Locks.hs b/src/Ganeti/Locking/Locks.hs
index e5bf524..1401b4f 100644
--- a/src/Ganeti/Locking/Locks.hs
+++ b/src/Ganeti/Locking/Locks.hs
@@ -44,7 +44,9 @@
   , lockLevel
   ) where
 
-import Control.Applicative ((<$>), (<*>), pure)
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Monad ((>=>), liftM)
 import Data.List (stripPrefix)
 import System.Posix.Types (ProcessID)
diff --git a/src/Ganeti/Logging.hs b/src/Ganeti/Logging.hs
index cf5a3fd..a1f42d6 100644
--- a/src/Ganeti/Logging.hs
+++ b/src/Ganeti/Logging.hs
@@ -60,15 +60,16 @@
   , isDebugMode
   ) where
 
-import Control.Applicative ((<$>))
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Monad
-import Control.Monad.Error (Error(..), MonadError(..), catchError)
+import Control.Monad.Error.Class (MonadError(..))
 import Control.Monad.Reader
 import qualified Control.Monad.RWS.Strict as RWSS
 import qualified Control.Monad.State.Strict as SS
 import Control.Monad.Trans.Identity
 import Control.Monad.Trans.Maybe
-import Data.Monoid
 import System.Log.Logger
 import System.Log.Handler.Simple
 import System.Log.Handler.Syslog
@@ -76,7 +77,7 @@
 import System.Log.Formatter
 import System.IO
 
-import Ganeti.BasicTypes (ResultT(..))
+import Ganeti.BasicTypes (ResultT(..), FromString(..))
 import Ganeti.THH
 import qualified Ganeti.ConstantUtils as ConstantUtils
 
@@ -168,7 +169,7 @@
 instance (MonadLog m, Monoid w) => MonadLog (RWSS.RWST r w s m) where
   logAt p = lift . logAt p
 
-instance (MonadLog m, Error e) => MonadLog (ResultT e m) where
+instance (MonadLog m, FromString e) => MonadLog (ResultT e m) where
   logAt p = lift . logAt p
 
 -- | Log at debug level.
diff --git a/src/Ganeti/Luxi.hs b/src/Ganeti/Luxi.hs
index f763eee..831e859 100644
--- a/src/Ganeti/Luxi.hs
+++ b/src/Ganeti/Luxi.hs
@@ -60,6 +60,8 @@
   , recvMsgExt
   , sendMsg
   , allLuxiCalls
+  , extractArray
+  , fromJValWithStatus
   ) where
 
 import Control.Applicative (optional, liftA, (<|>))
@@ -71,7 +73,7 @@
 import Ganeti.BasicTypes
 import Ganeti.Constants
 import Ganeti.Errors
-import Ganeti.JSON (fromJResult, fromJVal, Tuple5(..), MaybeForJSON(..), TimeAsDoubleJSON(..))
+import Ganeti.JSON (fromJResult, fromJVal, fromObj, Tuple5(..), MaybeForJSON(..), TimeAsDoubleJSON(..))
 import Ganeti.UDSServer
 import Ganeti.Objects
 import Ganeti.OpParams (pTagsObject)
@@ -381,3 +383,41 @@
                                          LuxiError "Missing job status field"
                                     else Ok (map head vals)
                        J.Error x -> Bad $ LuxiError x
+
+-- * Utility functions
+
+-- | Get values behind \"data\" part of the result.
+getData :: (Monad m) => JSValue -> m JSValue
+getData (JSObject o) = fromObj (fromJSObject o) "data"
+getData x = fail $ "Invalid input, expected dict entry but got " ++ show x
+
+-- | Converts a (status, value) into m value, if possible.
+parseQueryField :: (Monad m) => JSValue -> m (JSValue, JSValue)
+parseQueryField (JSArray [status, result]) = return (status, result)
+parseQueryField o =
+  fail $ "Invalid query field, expected (status, value) but got " ++ show o
+
+-- | Parse a result row.
+parseQueryRow :: (Monad m) => JSValue -> m [(JSValue, JSValue)]
+parseQueryRow (JSArray arr) = mapM parseQueryField arr
+parseQueryRow o =
+  fail $ "Invalid query row result, expected array but got " ++ show o
+
+-- | Parse an overall query result and get the [(status, value)] list
+-- for each element queried.
+parseQueryResult :: (Monad m) => JSValue -> m [[(JSValue, JSValue)]]
+parseQueryResult (JSArray arr) = mapM parseQueryRow arr
+parseQueryResult o =
+  fail $ "Invalid query result, expected array but got " ++ show o
+
+-- | Prepare resulting output as parsers expect it.
+extractArray :: (Monad m) => JSValue -> m [[(JSValue, JSValue)]]
+extractArray v =
+  getData v >>= parseQueryResult
+
+-- | Testing result status for more verbose error message.
+fromJValWithStatus :: (J.JSON a, Monad m) => (JSValue, JSValue) -> m a
+fromJValWithStatus (st, v) = do
+  st' <- fromJVal st
+  Qlang.checkRS st' v >>= fromJVal
+
diff --git a/src/Ganeti/MaintD/Autorepairs.hs b/src/Ganeti/MaintD/Autorepairs.hs
new file mode 100644
index 0000000..ce86d06
--- /dev/null
+++ b/src/Ganeti/MaintD/Autorepairs.hs
@@ -0,0 +1,236 @@
+{-| Auto-repair task of the maintenance daemon.
+
+This module implements the non-pure parts of harep-style
+repairs carried out by the maintenance daemon.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.Autorepairs
+  ( harepTasks
+  ) where
+
+import Control.Arrow (second, (***))
+import Control.Monad (forM)
+import Control.Exception (bracket)
+import Data.Maybe (isJust, fromJust)
+import qualified Data.Set as Set
+import System.IO.Error (tryIOError)
+import System.Time (getClockTime)
+
+import Ganeti.BasicTypes
+import Ganeti.Errors (formatError)
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Instance as Instance
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.HTools.Repair
+import Ganeti.HTools.Types
+import Ganeti.JQueue (currentTimestamp)
+import Ganeti.Jobs (execJobsWaitOkJid, submitJobs)
+import Ganeti.Logging.Lifted
+import qualified Ganeti.Luxi as L
+import Ganeti.MaintD.Utils (annotateOpCode)
+import Ganeti.OpCodes (OpCode(..))
+import qualified Ganeti.Path as Path
+import Ganeti.Types (JobId, JobStatus(..), TagKind(..), mkNonNegative)
+import Ganeti.Utils (newUUID, logAndBad)
+
+-- | Apply and remove tags from an instance indicated by `InstanceData`.
+commitChange :: L.Client -- ^ luxi client used to run the tagging jobs
+             -> InstanceData -- ^ instance, its state and the tags to remove
+             -> ResultT String IO (InstanceData, [JobId])
+commitChange client instData = do
+  now <- liftIO currentTimestamp
+  let arData = getArData $ arState instData
+      iname = Instance.name $ arInstance instData
+      rmTags = tagsToRemove instData
+  addJobs <- if isJust arData
+               then do
+                 let tag = arTag $ fromJust arData
+                 logDebug $ "Adding tag " ++ tag ++ " to " ++ iname
+                 mkResultT $ execJobsWaitOkJid
+                               [[ annotateOpCode "harep state tagging" now
+                                   . OpTagsSet TagKindInstance [tag]
+                                   $ Just iname ]]
+                               client
+               else return []
+  rmJobs <- if null rmTags
+              then return []
+              else do
+                logDebug $ "Removing tags " ++ show rmTags ++ " from " ++ iname
+                mkResultT $ execJobsWaitOkJid
+                              [[ annotateOpCode "harep state tag removal" now
+                                . OpTagsDel TagKindInstance rmTags
+                                $ Just iname ]]
+                              client
+  return (instData { tagsToRemove = [] }, addJobs ++ rmJobs)
+
+-- | Query jobs of a pending repair, returning the new instance data.
+processPending :: L.Client -- ^ luxi client used for querying and tagging
+               -> InstanceData -- ^ instance to check; returned as is if idle
+               -> IO (Result (InstanceData, [JobId]))
+processPending client instData = runResultT $ case arState instData of
+  (ArPendingRepair arData) -> do
+    sts <- liftIO . L.queryJobsStatus client $ arJobs arData
+    time <- liftIO getClockTime
+    case sts of
+      Bad e -> mkResultT . logAndBad
+                 $ "Could not check job status: " ++ formatError e
+      Ok sts' ->
+        if any (<= JOB_STATUS_RUNNING) sts' then
+          return (instData, [])
+        else do
+          let iname = Instance.name $ arInstance instData
+              srcSt = arStateName $ arState instData
+              arState' =
+                if all (== JOB_STATUS_SUCCESS) sts' then
+                  ArHealthy . Just
+                    . updateTag $ arData { arResult = Just ArSuccess
+                                         , arTime = time }
+                else
+                  ArFailedRepair . updateTag
+                    $ arData { arResult = Just ArFailure, arTime = time }
+              destSt = arStateName arState'
+              instData' = instData { arState = arState'
+                                   , tagsToRemove = delCurTag instData
+                                   }
+          logInfo $ "Moving " ++ iname ++ " from " ++ show srcSt ++ " to "
+                    ++ show destSt
+          commitChange client instData'
+  _ -> return (instData, [])
+
+-- | Perform the suggested repair on an instance if its policy allows it
+-- and return the nodes touched and the list of submitted jobs.
+doRepair :: L.Client
+         -> InstanceData
+         -> (AutoRepairType, [OpCode])
+         -> IO (Result ([Idx], [JobId]))
+doRepair client instData (rtype, opcodes) = runResultT $ do
+  let inst = arInstance instData
+      ipol = Instance.arPolicy inst
+      iname = Instance.name inst
+  case ipol of
+    ArEnabled maxtype -> do
+      uuid <- liftIO newUUID
+      time <- liftIO getClockTime
+      if rtype > maxtype then do
+        let arState' = ArNeedsRepair (
+              updateTag $ AutoRepairData rtype uuid time [] (Just ArEnoperm) "")
+            instData' = instData { arState = arState'
+                                 , tagsToRemove = delCurTag instData
+                                 }
+        logInfo $ "Not performing repair of type " ++ show rtype ++ " on "
+                  ++ iname ++ " because only repairs up to " ++ show maxtype
+                  ++ " are allowed"
+        (_, jobs) <- commitChange client instData'
+        return ([], jobs)
+      else do
+        now <- liftIO currentTimestamp
+        logInfo $ "Executing " ++ show rtype ++ " repair on " ++ iname
+        -- As in harep, we delay the actual repair, to allow the tagging
+        -- to happen first; again this is only about speeding up the harep
+        -- round, not about correctness.
+        let opcodes' = OpTestDelay { opDelayDuration = 10
+                                   , opDelayOnMaster = True
+                                   , opDelayOnNodes = []
+                                   , opDelayOnNodeUuids = Nothing
+                                   , opDelayRepeat = fromJust $ mkNonNegative 0
+                                   , opDelayInterruptible = False
+                                   , opDelayNoLocks = False
+                                   } : opcodes
+        jids <- liftIO $ submitJobs
+                           [ map (annotateOpCode "harep-style repair" now)
+                             opcodes'] client
+        case jids of
+          Bad e -> mkResultT . logAndBad $ "Failure submitting repair jobs: "
+                                           ++ e
+          Ok jids' -> do
+            let arState' = ArPendingRepair (
+                  updateTag $ AutoRepairData rtype uuid time jids' Nothing "")
+                instData' = instData { arState = arState'
+                                     , tagsToRemove = delCurTag instData
+                                     }
+            (_, tagjobs) <- commitChange client instData'
+            let nodes = filter (>= 0) [Instance.pNode inst, Instance.sNode inst]
+            return (nodes, jids' ++ tagjobs)
+    otherSt -> do
+      logDebug $ "Not repairing " ++ iname ++ " because it is in state "
+                 ++ show otherSt
+      return ([], [])
+
+-- | Harep-like repair tasks: update pending repairs, then repair broken ones.
+harepTasks :: (Node.List, Instance.List) -- ^ Current cluster configuration
+           -> Set.Set Int -- ^ Node indices on which actions may be taken
+           -> ResultT String IO (Set.Set Int, [JobId])
+              -- ^ untouched nodes and jobs submitted
+harepTasks (nl, il) nidxs = do
+  logDebug $ "harep tasks on nodes " ++ show (Set.toList nidxs)
+  iniData <- mkResultT . return . mapM setInitialState $ Container.elems il
+
+  -- First step: check all pending repairs, see if they are completed.
+  luxiSocket <- liftIO Path.defaultQuerySocket
+  either_iData <- liftIO . tryIOError
+                  . bracket (L.getLuxiClient luxiSocket) L.closeClient
+                  $  forM iniData . processPending
+  (iData', jobs) <- mkResultT $ case either_iData of
+                      Left e -> logAndBad $ "Error while harep status update: "
+                                              ++ show e
+                      Right r ->
+                        if any isBad r
+                          then logAndBad $ "Bad harep processing pending: "
+                                            ++ show (justBad r)
+                          else return . Ok . second concat . unzip $ justOk r
+
+  -- Second step: detect any problems.
+  let repairs = map (detectBroken nl . arInstance) iData'
+
+  -- Third step: create repair jobs for broken instances that are in ArHealthy.
+  let repairIfHealthy c i = case arState i of
+                              ArHealthy _ -> doRepair c i
+                              _           -> const . return $ Ok ([], [])
+      maybeRepair c (i, r) = maybe (return $ Ok ([], []))
+                               (repairIfHealthy c i) r
+  either_repairJobs <- liftIO . tryIOError
+                       . bracket (L.getLuxiClient luxiSocket) L.closeClient
+                       $ forM (zip iData' repairs) . maybeRepair
+
+  (ntouched, jobs') <- mkResultT $ case either_repairJobs of
+                         Left e -> logAndBad $ "Error while attempting repair: "
+                                                 ++ show e
+                         Right r ->
+                           if any isBad r
+                             then logAndBad $ "Error submitting repair jobs: "
+                                                ++ show (justBad r)
+                             else return . Ok . (concat *** concat) . unzip
+                                    $ justOk r
+
+  return (nidxs Set.\\ Set.fromList ntouched, jobs ++ jobs' )
diff --git a/src/Ganeti/MaintD/Balance.hs b/src/Ganeti/MaintD/Balance.hs
new file mode 100644
index 0000000..d48fb5d
--- /dev/null
+++ b/src/Ganeti/MaintD/Balance.hs
@@ -0,0 +1,347 @@
+{-| Balancing task of the maintenance daemon.
+
+This module carries out the automated balancing done by the
+maintenance daemon. The actual balancing algorithm is imported
+from htools.
+
+-}
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.Balance
+  ( balanceTask
+  ) where
+
+import Control.Arrow ((***), (&&&))
+import Control.Exception.Lifted (bracket)
+import Control.Monad (liftM, unless, when)
+import Control.Monad.IO.Class (liftIO)
+import Data.IORef (IORef)
+import qualified Data.IntMap as IntMap
+import qualified Data.Set as Set
+import qualified Data.Map as Map
+import Data.Maybe (mapMaybe, isJust)
+import qualified Data.Traversable as Traversable
+import System.IO.Error (tryIOError)
+import Text.Printf (printf)
+
+import Ganeti.BasicTypes ( ResultT, mkResultT, mkResultT'
+                         , GenericResult(..), Result)
+import Ganeti.Cpu.Types (emptyCPUavgload, CPUavgload(..))
+import Ganeti.HTools.AlgorithmParams (AlgorithmOptions(..), defaultOptions)
+import qualified Ganeti.HTools.Backend.MonD as MonD
+import qualified Ganeti.HTools.Cluster as Cluster
+import qualified Ganeti.HTools.Cluster.Metrics as Metrics
+import qualified Ganeti.HTools.Cluster.Utils as ClusterUtils
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Instance as Instance
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.JQueue (currentTimestamp)
+import Ganeti.JQueue.Objects (Timestamp)
+import Ganeti.Jobs (submitJobs)
+import Ganeti.HTools.Types ( zeroUtil, DynUtil(cpuWeight), addUtil, subUtil
+                           , MoveJob, iPolicyMemoryRatio)
+import Ganeti.Logging.Lifted (logDebug)
+import Ganeti.MaintD.MemoryState ( MemoryState, getEvacuated
+                                 , addEvacuated, rmEvacuated)
+import Ganeti.MaintD.Utils (annotateOpCode)
+import qualified Ganeti.Luxi as L
+import Ganeti.OpCodes (MetaOpCode)
+import qualified Ganeti.Path as Path
+import qualified Ganeti.Query.Language as Qlang
+import Ganeti.Types (JobId)
+import Ganeti.Utils (logAndBad)
+
+-- * Collection of dynamic load data
+
+data AllReports = AllReports { rTotal :: MonD.Report -- ^ node CPU load
+                             , rIndividual :: MonD.Report -- ^ instance CPU
+                             , rMem :: MonD.Report -- ^ instance RSS
+                             }
+
+-- | The reports of an idle node, carrying no per-instance data at all.
+-- Serves as the default value for nodes marked as offline.
+emptyReports :: AllReports
+emptyReports = AllReports { rTotal = MonD.CPUavgloadReport emptyCPUavgload
+                          , rIndividual = MonD.InstanceCpuReport Map.empty
+                          , rMem = MonD.InstanceRSSReport Map.empty }
+
+-- | Query a node unless it is offline and return all
+-- CPU and memory reports. For offline nodes return the empty report.
+queryNode :: Node.Node -> ResultT String IO AllReports
+queryNode node = do
+  let getReport dc = mkResultT
+                     . liftM (maybe (Bad $ "Failed collecting "
+                                           ++ MonD.dName dc
+                                           ++ " from " ++ Node.name node) Ok
+                              . MonD.mkReport dc)
+                     $ MonD.fromCurl dc node
+  if Node.offline node
+    then return emptyReports
+    else do
+      total <- getReport MonD.totalCPUCollector
+      xeninstances <- getReport MonD.xenCPUCollector
+      rssinstances <- getReport MonD.kvmRSSCollector
+      return $ AllReports total xeninstances rssinstances
+
+-- | Get a map with the live load data for all nodes; for offline nodes
+-- the empty report is used.
+queryLoad :: Node.List -> ResultT String IO (Container.Container AllReports)
+queryLoad = Traversable.mapM queryNode
+
+-- | Ask luxid about the hypervisors used. As, at the moment, we only
+-- have specialised CPU collectors for xen, we're only interested in which
+-- instances run under the Xen hypervisor.
+getXenInstances :: ResultT String IO (Set.Set String)
+getXenInstances = do
+  let query = L.Query (Qlang.ItemTypeOpCode Qlang.QRInstance)
+              ["name", "hypervisor"] Qlang.EmptyFilter
+  luxiSocket <- liftIO Path.defaultQuerySocket
+  raw <- bracket (mkResultT . liftM (either (Bad . show) Ok)
+                   . tryIOError $ L.getLuxiClient luxiSocket)
+                 (liftIO . L.closeClient)
+                 $ mkResultT' . L.callMethod query
+  answer <- L.extractArray raw >>= mapM (mapM L.fromJValWithStatus)
+  let getXen [name, hv] | hv `elem` ["xen-pvm", "xen-hvm"] = [name]
+      getXen _ = []
+  return $ Set.fromList (answer >>= getXen)
+
+-- | Look up an instance, by name, in the per-instance CPU report, if any.
+findInstanceLoad :: String -> AllReports -> Maybe Double
+findInstanceLoad name r = case rIndividual r of
+  MonD.InstanceCpuReport m -> Map.lookup name m
+  _ -> Nothing
+
+-- | Update the CPU load of one instance based on the reports.
+-- Fail if instance CPU load is not (yet) available. However, do
+-- accept missing load data for instances on offline nodes, as well
+-- as old load data for recently migrated instances.
+updateCPUInstance :: Node.List -- ^ all nodes
+                  -> Container.Container AllReports -- ^ reports, by node index
+                  -> Set.Set String -- ^ names of the Xen instances
+                  -> [String] -- ^ names of instances recorded as evacuated
+                  -> Instance.Instance -- ^ the instance to update
+                  -> Result Instance.Instance
+updateCPUInstance nl reports xeninsts evacuated inst =
+  let name = Instance.name inst
+      nidx = Instance.pNode inst
+  in if name `Set.member` xeninsts
+    then let onNodeLoad = findInstanceLoad name (Container.find nidx reports)
+             allLoads = mapMaybe (findInstanceLoad name)
+                          $ Container.elems reports
+         in case () of
+           _ | Just load <- onNodeLoad ->
+                 return $ inst { Instance.util = zeroUtil { cpuWeight = load } }
+           _ | (load:_) <- allLoads ->
+                 return $ inst { Instance.util = zeroUtil { cpuWeight = load } }
+           _ | Node.offline $ Container.find nidx nl ->
+                 return $ inst { Instance.util = zeroUtil }
+           _ | Instance.name inst `elem` evacuated ->
+                 return $ inst { Instance.util = zeroUtil }
+           _ -> fail $ "Xen CPU data unavailable for " ++ name
+    else let rep = rTotal $ Container.find nidx reports
+         in case rep of MonD.CPUavgloadReport (CPUavgload _ _ ndload) ->
+                          let w = ndload * fromIntegral (Instance.vcpus inst)
+                                  / (fromIntegral . Node.uCpu
+                                       $ Container.find nidx nl)
+                          in return $ inst { Instance.util =
+                                                zeroUtil { cpuWeight = w }}
+                        _ -> fail $ "CPU data unavailable for node of " ++ name
+
+-- | Update CPU usage data based on the collected reports. That is, get the
+-- CPU usage of all instances from the reports and also update the nodes
+-- accordingly, by adding each instance's load change to its primary node.
+updateCPULoad :: (Node.List, Instance.List)
+              -> Container.Container AllReports
+              -> Set.Set String
+              -> [ String ]
+              -> Result (Node.List, Instance.List)
+updateCPULoad (nl, il) reports xeninsts evacuated = do
+  il' <- Traversable.mapM (updateCPUInstance nl reports xeninsts evacuated) il
+  let addNodeUtil n delta = n { Node.utilLoad = addUtil (Node.utilLoad n) delta
+                              , Node.utilLoadForth =
+                                  addUtil (Node.utilLoadForth n) delta
+                              }
+  let updateNodeUtil nnl inst_old inst_new =
+        let delta = subUtil (Instance.util inst_new) $ Instance.util inst_old
+            nidx = Instance.pNode inst_old
+            n = Container.find nidx nnl
+            n' = addNodeUtil n delta
+        in Container.add nidx n' nnl
+  let nl' = foldl (\nnl i -> updateNodeUtil nnl (Container.find i il)
+                               $ Container.find i il') nl $ Container.keys il
+  return (nl', il')
+
+-- | For an instance, given by name, verify if an individual load report is
+-- available again.
+cleanUpEvacuation :: IORef MemoryState -- ^ state listing evacuated instances
+                  -> Instance.List -- ^ current instances
+                  -> Container.Container AllReports -- ^ reports, by node index
+                  -> String -- ^ name of the instance to check
+                  -> IO ()
+cleanUpEvacuation memstate il reports name = do
+  let insts = filter ((==) name . Instance.name) $ Container.elems il
+  case insts of
+    [] -> do
+            logDebug $ "Instance " ++ name ++ " no longer on the cluster"
+            rmEvacuated memstate name
+    inst:_ -> do
+                 let nidx = Instance.pNode inst
+                 when (isJust . findInstanceLoad name
+                         $ Container.find nidx reports) $ do
+                   logDebug $ "Load data for " ++ name ++ " available again"
+                   rmEvacuated memstate name
+
+-- * Balancing
+
+-- | Turn an instance move into the annotated opcodes of a submittable job.
+moveToJob :: Timestamp -> (Node.List, Instance.List) -> MoveJob -> [MetaOpCode]
+moveToJob now (nl, il) (_, idx, move, _) =
+  map (annotateOpCode "auto-balancing the cluster" now)
+    $ Cluster.iMoveToJob nl il idx move
+
+-- | Iteratively improve a cluster by repeatedly applying 'Cluster.tryBalance'.
+iterateBalance :: AlgorithmOptions
+               -> Cluster.Table -- ^ the starting table
+               -> [MoveJob]     -- ^ commands so far, newest first
+               -> [MoveJob]     -- ^ resulting commands, newest first
+iterateBalance opts ini_tbl cmds =
+  let Cluster.Table ini_nl ini_il _ _ = ini_tbl
+      m_next_tbl = Cluster.tryBalance opts ini_tbl
+  in case m_next_tbl of
+    Just next_tbl@(Cluster.Table _ _ _ plc@(curplc:_)) ->
+      let (idx, _, _, move, _) = curplc
+          plc_len = length plc
+          (_, cs) = Cluster.printSolutionLine ini_nl ini_il 1 1 curplc plc_len
+          afn = Cluster.involvedNodes ini_il curplc
+          cmds' = (afn, idx, move, cs):cmds
+      in iterateBalance opts next_tbl cmds'
+    _ -> cmds
+
+-- | Name of the instance of a move job, if its primary node is offline.
+evacuatedInsts :: (Node.List, Instance.List)
+               -> MoveJob
+               -> [String]
+evacuatedInsts (nl, il) (_, idx, _, _) =
+  [Instance.name inst | Node.offline node]
+  where inst = Container.find idx il
+        node = Container.find (Instance.pNode inst) nl
+
+-- | Balance a single group, restricted to the allowed nodes and
+-- minimal gain. Only the first group of resulting moves is submitted.
+balanceGroup :: IORef MemoryState -- ^ state recording evacuated instances
+             -> Set.Set String -- ^ names of the Xen instances
+             -> L.Client -- ^ luxi client used to submit the jobs
+             -> Set.Set Int -- ^ node indices on which actions may be taken
+             -> Double -- ^ minimal gain for a balancing step
+             -> (Int,  (Node.List, Instance.List)) -- ^ group index and data
+             -> ResultT String IO [JobId] -- ^ jobs submitted
+balanceGroup memstate xens client allowedNodes threshold (gidx, (nl, il)) = do
+  logDebug $ printf "Balancing group %d, %d nodes, %d instances." gidx
+               (Container.size nl) (Container.size il)
+  let ini_cv = Metrics.compCV nl
+      ini_tbl = Cluster.Table nl il ini_cv []
+      opts = defaultOptions { algAllowedNodes = Just allowedNodes
+                            , algMinGain = threshold
+                            , algMinGainLimit = 10 * threshold
+                            }
+      cmds = iterateBalance opts ini_tbl []
+      tasks = take 1 $ Cluster.splitJobs cmds
+  logDebug $ "First task group: " ++ show tasks
+  now <- liftIO currentTimestamp
+  let jobs = tasks >>= map (moveToJob now (nl, il))
+      evacs = filter (`Set.member` xens)
+              (concat tasks >>= evacuatedInsts (nl, il))
+  if null jobs
+    then return []
+    else do
+      unless (null evacs) $ do
+        logDebug $ "Evacuation of instances " ++ show evacs
+        liftIO $ addEvacuated memstate evacs
+      jids <- liftIO $ submitJobs jobs client
+      case jids of
+        Bad e -> mkResultT . logAndBad
+                   $ "Failure submitting balancing jobs: " ++ e
+        Ok jids' -> return jids'
+
+-- * Memory balancing
+
+-- | Derive the weight given to dynamic memory utilization from the
+-- memory-over-commitment ratio: zero up to a ratio of 1, then growing
+-- linearly. This heuristic is likely to change once more experience
+-- with memory over-committed clusters is gained.
+weightFromMemRatio :: Double -> Double
+weightFromMemRatio f = max 0.0 (f - 1) * 5.0
+
+-- | Apply the memory data to the cluster data.
+useMemData :: Double -- ^ memory over-commitment ratio
+           -> Container.Container AllReports -- ^ reports, by node index
+           -> (Node.List, Instance.List) -- ^ cluster data to update
+           -> ResultT String IO (Node.List, Instance.List)
+useMemData ratio allreports (nl, il) = do
+  logDebug "Taking dynamic memory data into account"
+  let memoryReports =
+        map (flip Container.find nl *** rMem) $ IntMap.toList allreports
+  mkResultT . return . liftM (MonD.scaleMemoryWeight (weightFromMemRatio ratio))
+    $ MonD.useInstanceRSSData memoryReports (nl, il)
+
+-- * Interface function
+
+-- | Carry out all the needed balancing, based on live CPU data (and memory
+-- data if memory is over-committed), only touching the available nodes. Only
+-- carry out balancing steps where the gain is above the threshold.
+balanceTask :: IORef MemoryState
+            -> (Node.List, Instance.List) -- ^ current cluster configuration
+            -> Set.Set Int -- ^ node indices on which actions may be taken
+            -> Double -- ^ threshold for improvement
+            -> ResultT String IO [JobId] -- ^ jobs submitted
+balanceTask memstate (nl, il) okNodes threshold = do
+  logDebug "Collecting dynamic load values"
+  evacuated <- getEvacuated memstate
+  logDebug $ "Not expecting load data from: " ++ show evacuated
+  reports <- queryLoad nl
+  xenInstances <- getXenInstances
+  (nl', il') <- mkResultT . return
+                  $ updateCPULoad (nl, il) reports xenInstances evacuated
+  liftIO $ mapM_ (cleanUpEvacuation memstate il reports) evacuated
+  let memoryOvercommitment =
+        maximum . (0.0:) . map (iPolicyMemoryRatio .Node.iPolicy)
+        $ IntMap.elems nl
+  logDebug $ "Memory over-commitment ratio is " ++ show memoryOvercommitment
+  (nl'', il'') <- if memoryOvercommitment > 1.0
+                    then useMemData memoryOvercommitment reports (nl', il')
+                    else return (nl', il')
+  logDebug . (++) "Dynamic node load: " . show
+    . map (Node.name &&& Node.utilLoad) $ Container.elems nl''
+  let ngroups = ClusterUtils.splitCluster nl'' il''
+  luxiSocket <- liftIO Path.defaultQuerySocket
+  bracket (liftIO $ L.getLuxiClient luxiSocket) (liftIO . L.closeClient) $ \c ->
+    liftM concat $ mapM (balanceGroup memstate xenInstances c okNodes threshold)
+                        ngroups
diff --git a/src/Ganeti/MaintD/CleanupIncidents.hs b/src/Ganeti/MaintD/CleanupIncidents.hs
new file mode 100644
index 0000000..f8aaf92
--- /dev/null
+++ b/src/Ganeti/MaintD/CleanupIncidents.hs
@@ -0,0 +1,87 @@
+{-| Incident clean up in the maintenance daemon.
+
+This module implements the clean up of events that are finished,
+and acknowledged as such by the user.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.CleanupIncidents
+  ( cleanupIncidents
+  ) where
+
+import Control.Arrow ((&&&))
+import Control.Monad (unless)
+import Control.Monad.IO.Class (liftIO)
+import qualified Data.ByteString.UTF8 as UTF8
+import Data.IORef (IORef)
+
+import Ganeti.BasicTypes (ResultT, mkResultT)
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.Logging.Lifted
+import Ganeti.MaintD.MemoryState (MemoryState, getIncidents, rmIncident)
+import Ganeti.Objects.Maintenance (Incident(..), RepairStatus(..))
+import Ganeti.Utils (logAndBad)
+
+-- | Remove a single incident, provided the corresponding tag
+-- is no longer present or no node with the incident's name exists any more.
+cleanupIncident :: IORef MemoryState
+                -> Node.List
+                -> Incident
+                -> ResultT String IO ()
+cleanupIncident memstate nl incident = do
+  let location = incidentNode incident
+      uuid = incidentUuid incident
+      tag = incidentTag incident
+      nodes = filter ((==) location . Node.name) $ Container.elems nl
+  case nodes of
+    [] -> do
+            logInfo $ "No node any more with name " ++ location
+                       ++ "; will forget event " ++ UTF8.toString uuid
+            liftIO . rmIncident memstate $ UTF8.toString uuid
+    [nd] -> unless (tag `elem` Node.nTags nd) $ do
+              logInfo $ "Tag " ++ tag ++ " removed on " ++ location
+                        ++ "; will forget event " ++ UTF8.toString uuid
+              liftIO . rmIncident memstate $ UTF8.toString uuid
+    _ -> mkResultT . logAndBad
+           $ "Found More than one node with name " ++ location
+
+-- | Forget every recorded incident whose repair status is past pending,
+-- as long as the node tag belonging to that incident is gone as well.
+cleanupIncidents :: IORef MemoryState -> Node.List -> ResultT String IO ()
+cleanupIncidents memstate nl = do
+  incidents <- getIncidents memstate
+  let finalized = [ i | i <- incidents, incidentRepairStatus i > RSPending ]
+  logDebug $ "Finalized incidents "
+             ++ show (map (incidentNode &&& incidentUuid) finalized)
+  mapM_ (cleanupIncident memstate nl) finalized
diff --git a/src/Ganeti/MaintD/CollectIncidents.hs b/src/Ganeti/MaintD/CollectIncidents.hs
new file mode 100644
index 0000000..ba31569
--- /dev/null
+++ b/src/Ganeti/MaintD/CollectIncidents.hs
@@ -0,0 +1,130 @@
+{-| Discovery of incidents by the maintenance daemon.
+
+This module implements the querying of all monitoring
+daemons for the value of the node-status data collector.
+Any new incident gets registered.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.CollectIncidents
+  ( collectIncidents
+  ) where
+
+import Control.Applicative (liftA2)
+import Control.Monad (unless)
+import Control.Monad.IO.Class (liftIO)
+import qualified Data.ByteString.UTF8 as UTF8
+import Data.IORef (IORef)
+import Network.Curl
+import System.Time (getClockTime)
+import qualified Text.JSON as J
+
+import Ganeti.BasicTypes (ResultT)
+import qualified Ganeti.Constants as C
+import qualified Ganeti.DataCollectors.Diagnose as D
+import Ganeti.DataCollectors.Types (getCategoryName)
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.Logging.Lifted
+import Ganeti.MaintD.MemoryState (MemoryState, getIncidents, updateIncident)
+import Ganeti.Objects.Maintenance
+import Ganeti.Utils (newUUID)
+
+-- | Query a node, unless it is offline, and return the payload of the
+-- report, if available. Offline nodes, contact failures and replies
+-- that are not valid JSON all yield Nothing (the latter two are logged).
+queryStatus :: Node.Node -> IO (Maybe J.JSValue)
+queryStatus node = do
+  let name = Node.name node
+  let url = name ++ ":" ++ show C.defaultMondPort
+            ++ "/1/report/" ++ maybe "default" getCategoryName D.dcCategory
+            ++ "/" ++ D.dcName
+  if Node.offline node
+    then do
+      logDebug $ "Not asking " ++ name ++ "; it is offline"
+      return Nothing
+    else do
+      (code, body) <- liftIO $ curlGetString url []
+      case (code, J.decode body) of
+        (CurlOK, J.Ok r) -> return $ Just r
+        (CurlOK, _) -> do
+          logWarning $ "Unparsable report from " ++ name
+          return Nothing
+        _ -> do
+          logWarning $ "Failed to contact " ++ name
+          return Nothing
+
+-- | Query one node and register a new incident if its report asks for one.
+updateNode :: IORef MemoryState -> Node.Node -> ResultT String IO ()
+updateNode memstate node = do
+  let name = Node.name node
+  logDebug $ "Inspecting " ++ name
+  report <- liftIO $ queryStatus node
+  case report of
+    Just (J.JSObject obj)
+      | Just orig@(J.JSObject origobj) <- lookup "data" $ J.fromJSObject obj,
+        Just s <- lookup "status" $ J.fromJSObject origobj,
+        J.Ok state <- J.readJSON s,
+        state /= RANoop -> do -- only reports requesting an action matter
+          let origs = J.encode orig
+          logDebug $ "Relevant event on " ++ name ++ ": " ++ origs
+          incidents <- getIncidents memstate
+          unless (any (liftA2 (&&) -- same node and same report: known
+                        ((==) name . incidentNode)
+                        ((==) orig . incidentOriginal)) incidents) $ do
+            logInfo $ "Registering new incident on " ++ name ++ ": " ++ origs
+            uuid <- liftIO newUUID
+            now <- liftIO getClockTime
+            let tag = C.maintdSuccessTagPrefix ++ uuid
+                incident = Incident { incidentOriginal = orig
+                                    , incidentAction = state
+                                    , incidentRepairStatus = RSNoted
+                                    , incidentJobs = []
+                                    , incidentNode = name
+                                    , incidentTag = tag
+                                    , incidentUuid = UTF8.fromString uuid
+                                    , incidentCtime = now
+                                    , incidentMtime = now
+                                    , incidentSerial = 1
+                                    }
+            liftIO $ updateIncident memstate incident
+    _ -> return () -- no report, or none asking for an action
+
+
+-- | Ask the MonD of every node for an update on the node-status.
+collectIncidents :: IORef MemoryState -> Node.List -> ResultT String IO ()
+collectIncidents memstate nl = do
+  -- refresh the memory state in any case, even if nothing is observed
+  _ <- getIncidents memstate
+  logDebug "Querying all nodes for incidents"
+  mapM_ (updateNode memstate) (Container.elems nl)
diff --git a/src/Ganeti/MaintD/FailIncident.hs b/src/Ganeti/MaintD/FailIncident.hs
new file mode 100644
index 0000000..917cb78
--- /dev/null
+++ b/src/Ganeti/MaintD/FailIncident.hs
@@ -0,0 +1,93 @@
+{-| Incident failing in the maintenance daemon
+
+This module implements the treatment of an incident, once
+a job failed.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.FailIncident
+ ( failIncident
+ ) where
+
+import Control.Exception.Lifted (bracket)
+import Control.Lens.Setter (over)
+import Control.Monad (liftM, when)
+import Control.Monad.IO.Class (liftIO)
+import qualified Data.ByteString.UTF8 as UTF8
+import Data.IORef (IORef)
+import System.IO.Error (tryIOError)
+
+import Ganeti.BasicTypes (ResultT, mkResultT, GenericResult(..))
+import qualified Ganeti.Constants as C
+import Ganeti.JQueue (currentTimestamp)
+import Ganeti.Jobs (execJobsWaitOkJid)
+import Ganeti.Logging.Lifted
+import qualified Ganeti.Luxi as L
+import Ganeti.MaintD.MemoryState (MemoryState, getIncidents, updateIncident)
+import Ganeti.MaintD.Utils (annotateOpCode)
+import Ganeti.Objects.Lens (incidentJobsL)
+import Ganeti.Objects.Maintenance (Incident(..), RepairStatus(..))
+import Ganeti.OpCodes (OpCode(..))
+import qualified Ganeti.Path as Path
+import Ganeti.Types (JobId, fromJobId, TagKind(..))
+
+-- | Set the failure tag on the node and record the incident as failed.
+markAsFailed :: IORef MemoryState -> Incident -> ResultT String IO ()
+markAsFailed memstate incident = do
+  let uuid = incidentUuid incident
+      newtag = C.maintdFailureTagPrefix ++ UTF8.toString uuid
+  logInfo $ "Marking incident " ++ UTF8.toString uuid ++ " as failed"
+  now <- liftIO currentTimestamp
+  luxiSocket <- liftIO Path.defaultQuerySocket
+  jids <- bracket (mkResultT . liftM (either (Bad . show) Ok)
+                   . tryIOError $ L.getLuxiClient luxiSocket) -- IOError as Bad
+                  (liftIO . L.closeClient) -- always release the client
+                  (mkResultT . execJobsWaitOkJid
+                     [[ annotateOpCode "marking incident handling as failed" now
+                        . OpTagsSet TagKindNode [ newtag ]
+                        . Just $ incidentNode incident ]])
+  let incident' = over incidentJobsL (++ jids) -- remember the tagging jobs
+                    $ incident { incidentRepairStatus = RSFailed
+                               , incidentTag = newtag
+                               }
+  liftIO $ updateIncident memstate incident'
+
+-- | Fail every incident the given job belongs to, tagging each one
+-- accordingly; a job without incident is only logged.
+failIncident :: IORef MemoryState -> JobId -> ResultT String IO ()
+failIncident memstate jid = do
+  known <- getIncidents memstate
+  let owners = [ i | i <- known, jid `elem` incidentJobs i ]
+  when (null owners) . logInfo
+    $ "Job " ++ show (fromJobId jid) ++ " does not belong to an incident"
+  mapM_ (markAsFailed memstate) owners
diff --git a/src/Ganeti/MaintD/HandleIncidents.hs b/src/Ganeti/MaintD/HandleIncidents.hs
new file mode 100644
index 0000000..90831d0
--- /dev/null
+++ b/src/Ganeti/MaintD/HandleIncidents.hs
@@ -0,0 +1,300 @@
+{-| Incident handling in the maintenance daemon.
+
+This module implements the submission of actions for ongoing
+repair events reported by the node-status data collector.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.HandleIncidents
+  ( handleIncidents
+  ) where
+
+import Control.Arrow ((&&&))
+import Control.Exception.Lifted (bracket)
+import Control.Lens.Setter (over)
+import Control.Monad (foldM)
+import Control.Monad.IO.Class (liftIO)
+import qualified Data.ByteString.UTF8 as UTF8
+import Data.Function (on)
+import Data.IORef (IORef)
+import qualified Data.Map as Map
+import qualified Data.Set as Set
+import qualified Text.JSON as J
+
+import Ganeti.BasicTypes ( GenericResult(..), ResultT, mkResultT, Down(..))
+import qualified Ganeti.Constants as C
+import Ganeti.HTools.AlgorithmParams (AlgorithmOptions(..), defaultOptions)
+import Ganeti.HTools.Cluster.Evacuate (tryNodeEvac, EvacSolution(..))
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Group as Group
+import qualified Ganeti.HTools.Instance as Instance
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.HTools.Types (Idx)
+import Ganeti.JQueue (currentTimestamp)
+import Ganeti.Jobs (execJobsWaitOkJid, submitJobs, forceFailover)
+import Ganeti.Logging.Lifted
+import qualified Ganeti.Luxi as L
+import Ganeti.MaintD.MemoryState ( MemoryState, getIncidents, rmIncident
+                                 , updateIncident, appendJobs)
+import Ganeti.MaintD.Utils (annotateOpCode, getRepairCommand)
+import Ganeti.Objects.Lens (incidentJobsL)
+import Ganeti.Objects.Maintenance ( RepairStatus(..), RepairAction(..)
+                                  , Incident(..))
+import Ganeti.OpCodes (OpCode(..), MetaOpCode)
+import qualified Ganeti.Path as Path
+import Ganeti.Types ( cTimeOf, uuidOf, mkNonEmpty, fromJobId
+                    , EvacMode(..), TagKind(..))
+import Ganeti.Utils (maxBy, logAndBad)
+
+-- | Of two incidents pick the more severe one; among equally severe
+-- ones the older (by creation timestamp) wins.
+moreSevereIncident :: Incident -> Incident -> Incident
+moreSevereIncident = maxBy (compare `on` (incidentAction &&& (Down . cTimeOf)))
+
+-- | Group incidents by node, keeping for each node only the one with
+-- the most severe action.
+rankIncidents :: [Incident] -> Map.Map String Incident
+rankIncidents = foldl keepWorst Map.empty
+  where keepWorst m i = Map.insertWith moreSevereIncident (incidentNode i) i m
+
+-- | Generate the single-opcode job that sets the drained flag on a node.
+drainJob :: String -> ResultT String IO [ MetaOpCode ]
+drainJob name = do
+  name' <- mkNonEmpty name
+  now <- liftIO currentTimestamp
+  return $ map (annotateOpCode ("Draining " ++ name) now)
+    [ OpNodeSetParams { opNodeName = name'
+                      , opNodeUuid = Nothing
+                      , opForce = True
+                      , opHvState = Nothing
+                      , opDiskState = Nothing
+                      , opMasterCandidate = Nothing
+                      , opOffline = Nothing
+                      , opDrained = Just True -- the requested change
+                      , opAutoPromote = False
+                      , opMasterCapable = Nothing
+                      , opVmCapable = Nothing
+                      , opSecondaryIp = Nothing
+                      , opgenericNdParams = Nothing
+                      , opPowered = Nothing
+                      , opVerbose = False
+                      , opDebug = False
+                      } ]
+
+-- | Submit and register the next job (drain, move or tag) of an evacuation.
+handleEvacuation :: L.Client -- ^ Luxi client to use
+                 -> IORef MemoryState -- ^ memory state of the daemon
+                 -> (Group.List, Node.List, Instance.List) -- ^ cluster
+                 -> Idx -- ^ index of the node to evacuate
+                 -> Bool -- ^ whether to try migrations
+                 -> Set.Set Int -- ^ allowed nodes for evacuation
+                 -> Incident -- ^ the incident
+                 -> ResultT String IO (Set.Set Int) -- ^ nodes still available
+handleEvacuation client memst (gl, nl, il) ndx migrate freenodes incident = do
+  let node = Container.find ndx nl
+      name = Node.name node
+      fNdNames = map (Node.name . flip Container.find nl) $ Set.elems freenodes
+      evacOpts = defaultOptions { algEvacMode = True
+                                , algIgnoreSoftErrors = True
+                                , algRestrictToNodes = Just fNdNames
+                                }
+      evacFun = tryNodeEvac evacOpts gl nl il
+      migrateFun = if migrate then id else forceFailover
+      annotateFun = annotateOpCode $ "Evacuating " ++ name
+      pendingIncident = incident { incidentRepairStatus = RSPending }
+      updateJobs jids_r = case jids_r of
+        Ok jids -> do
+          let incident' = over incidentJobsL (++ jids) pendingIncident
+          liftIO $ updateIncident memst incident'
+          liftIO $ appendJobs memst jids
+          logDebug $ "Jobs submitted: " ++ show (map fromJobId jids)
+        Bad e -> mkResultT . logAndBad
+                   $ "Failure evacuating " ++ name ++ ": " ++ e
+      logInstName i = logInfo $ "Evacuating instance "
+                                  ++ Instance.name (Container.find i il)
+                                  ++ " from " ++ name
+      execSol sol = do
+        now <- liftIO currentTimestamp
+        let jobs = map (map (annotateFun now . migrateFun)) $ esOpCodes sol
+        jids <- liftIO $ submitJobs jobs client
+        updateJobs jids
+        let touched = esMoved sol >>= \(_, _, nidxs) -> nidxs
+        return $ freenodes Set.\\ Set.fromList touched -- drop used nodes
+  logDebug $ "Handling evacuation of " ++ name
+  case () of _ | not $ Node.offline node -> do -- not offline: drain first
+                   logDebug $ "Draining node " ++ name
+                   job <- drainJob name
+                   jids <- liftIO $ submitJobs [job] client
+                   updateJobs jids
+                   return freenodes
+               | i:_ <- Node.pList node -> do -- move one primary instance
+                   logInstName i
+                   (_, _, sol) <- mkResultT . return $ evacFun ChangePrimary [i]
+                   execSol sol
+               | i:_ <- Node.sList node -> do -- move one secondary instance
+                   logInstName i
+                   (_, _, sol) <- mkResultT . return
+                                    $ evacFun ChangeSecondary [i]
+                   execSol sol
+               | otherwise -> do -- node is empty: set the incident tag
+                   logDebug $ "Finished evacuation of " ++ name
+                   now <- liftIO currentTimestamp
+                   jids <- mkResultT $ execJobsWaitOkJid
+                            [[ annotateFun now
+                               . OpTagsSet TagKindNode [ incidentTag incident ]
+                               $ Just name]] client
+                   let incident' = over incidentJobsL (++ jids)
+                                     $ incident { incidentRepairStatus =
+                                                    RSCompleted }
+                   liftIO $ updateIncident memst incident'
+                   liftIO $ appendJobs memst jids
+                   return freenodes
+
+-- | Submit the next action for a live-repair incident and record its jobs.
+handleLiveRepairs :: L.Client -- ^ Luxi client to use
+                 -> IORef MemoryState -- ^ memory state of the daemon
+                 -> Idx -- ^ the node to handle the event on
+                 -> Set.Set Int -- ^ unaffected nodes
+                 -> Incident -- ^ the incident
+                 -> ResultT String IO (Set.Set Int) -- ^ nodes still available
+handleLiveRepairs client memst ndx freenodes incident = do
+  let maybeCmd = getRepairCommand incident
+      uuid = incidentUuid incident
+      name = incidentNode incident
+  now <- liftIO currentTimestamp
+  logDebug $ "Handling requested command " ++ show maybeCmd ++ " on " ++ name
+  case () of
+    _ | null $ incidentJobs incident, -- nothing submitted so far
+        Just cmd <- maybeCmd,
+        cmd /= "" -> do
+            logDebug "Submitting repair command job"
+            name' <- mkNonEmpty name
+            cmd' <- mkNonEmpty cmd
+            orig' <- mkNonEmpty . J.encode $ incidentOriginal incident
+            jids_r <- liftIO $ submitJobs
+                        [[ annotateOpCode "repair command requested by node" now
+                           OpRepairCommand { opNodeName = name'
+                                           , opRepairCommand = cmd'
+                                           , opInput = Just orig'
+                                           } ]] client
+            case jids_r of
+              Ok jids -> do
+                let incident' = over incidentJobsL (++ jids) incident
+                liftIO $ updateIncident memst incident'
+                liftIO $ appendJobs memst jids
+                logDebug $ "Jobs submitted: " ++ show (map fromJobId jids)
+              Bad e -> mkResultT . logAndBad
+                   $ "Failure requesting command " ++ cmd ++ " on " ++ name
+                     ++ ": " ++ e
+      | null $ incidentJobs incident -> do -- no usable command given
+            logInfo $ "Marking incident " ++ UTF8.toString uuid ++ " as failed;"
+                      ++ " command for live repair not specified"
+            let newtag = C.maintdFailureTagPrefix ++ UTF8.toString uuid
+            jids <- mkResultT $ execJobsWaitOkJid
+                      [[ annotateOpCode "marking incident as ill specified" now
+                         . OpTagsSet TagKindNode [ newtag ]
+                         $ Just name ]] client
+            let incident' = over incidentJobsL (++ jids)
+                              $ incident { incidentRepairStatus = RSFailed
+                                         , incidentTag = newtag
+                                         }
+            liftIO $ updateIncident memst incident'
+            liftIO $ appendJobs memst jids
+      | otherwise -> do -- a job was submitted earlier
+            logDebug "Command execution has succeeded"
+            jids <- mkResultT $ execJobsWaitOkJid
+                      [[ annotateOpCode "repair command requested by node" now
+                         . OpTagsSet TagKindNode [ incidentTag incident ]
+                         $ Just name ]] client
+            let incident' = over incidentJobsL (++ jids)
+                            $ incident { incidentRepairStatus = RSCompleted }
+            liftIO $ updateIncident memst incident'
+            liftIO $ appendJobs memst jids
+  return $ Set.delete ndx freenodes -- this node is affected now
+
+
+-- | Submit the next actions for a single incident, given the unaffected nodes;
+-- register all submitted jobs and return the new set of unaffected nodes.
+handleIncident :: L.Client
+               -> IORef MemoryState
+               -> (Group.List, Node.List, Instance.List)
+               -> Set.Set Int
+               -> (String, Incident)
+               -> ResultT String IO (Set.Set Int)
+handleIncident client memstate (gl, nl, il) freeNodes (name, incident) = do
+  ndx <- case Container.keys $ Container.filter ((==) name . Node.name) nl of
+           [ndx] -> return ndx
+           [] -> do -- node vanished: drop the incident, then fail
+             logWarning $ "Node " ++ name ++ " no longer in the cluster;"
+                          ++ " clearing incident " ++ show incident
+             liftIO . rmIncident memstate $ uuidOf incident
+             fail $ "node " ++ name ++ " left the cluster"
+           ndxs -> do -- name not unique: fail, but keep the incident
+             logWarning $ "Ambiguous node name " ++ name
+                          ++ "; could refer to indices " ++ show ndxs
+             fail $ "ambiguous name " ++ name
+  case incidentAction incident of
+    RANoop -> do -- nothing to repair: just forget the incident
+      logDebug $ "Nothing to do for " ++ show incident
+      liftIO . rmIncident memstate $ uuidOf incident
+      return freeNodes
+    RALiveRepair ->
+      handleLiveRepairs client memstate ndx freeNodes incident
+    RAEvacuate -> -- evacuation that may use migrations
+      handleEvacuation client memstate (gl, nl, il) ndx True freeNodes incident
+    RAEvacuateFailover -> -- evacuation without trying migrations
+      handleEvacuation client memstate (gl, nl, il) ndx False freeNodes incident
+
+-- | Submit the jobs necessary for the next maintenance step
+-- for each pending maintenance, i.e., the most radical maintenance
+-- for each node. Return the set of node indices unaffected by these
+-- operations. Also, for each job submitted, register it directly.
+handleIncidents :: IORef MemoryState
+                -> (Group.List, Node.List, Instance.List)
+                -> ResultT String IO (Set.Set Int)
+handleIncidents memstate (gl, nl, il) = do
+  incidents <- getIncidents memstate
+  let activeIncidents = filter ((<= RSPending) . incidentRepairStatus) incidents
+      incidentsToHandle = rankIncidents activeIncidents -- one per node
+      incidentNodes = Set.fromList . Container.keys
+        $ Container.filter ((`Map.member` incidentsToHandle) . Node.name) nl
+      freeNodes = Set.fromList (Container.keys nl) Set.\\ incidentNodes
+  if null activeIncidents -- nothing to do: do not open a Luxi client
+    then return freeNodes
+    else do
+      luxiSocket <- liftIO Path.defaultQuerySocket
+      bracket (liftIO $ L.getLuxiClient luxiSocket)
+              (liftIO . L.closeClient) -- always release the client
+              $ \ client -> -- thread the set of free nodes through
+                foldM (handleIncident client memstate (gl, nl, il)) freeNodes
+                  $ Map.assocs incidentsToHandle
diff --git a/src/Ganeti/MaintD/MemoryState.hs b/src/Ganeti/MaintD/MemoryState.hs
new file mode 100644
index 0000000..ce0c94a
--- /dev/null
+++ b/src/Ganeti/MaintD/MemoryState.hs
@@ -0,0 +1,153 @@
+{-# LANGUAGE TemplateHaskell #-}
+
+{-| Memory copy of the state of the maintenance daemon.
+
+While the authoritative state of the maintenance daemon is
+stored in the configuration, the daemon keeps a copy of some
+values at run time, so that they can easily be exposed over
+HTTP.
+
+This module also provides functions for the mirrored information
+to update both, the authoritative state and the in-memory copy.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.MemoryState
+  ( MemoryState(..)
+  , emptyMemoryState
+  , getJobs
+  , clearJobs
+  , appendJobs
+  , getEvacuated
+  , addEvacuated
+  , rmEvacuated
+  , getIncidents
+  , updateIncident
+  , rmIncident
+  ) where
+
+import Control.Monad.IO.Class (liftIO)
+import Data.IORef (IORef)
+
+import Ganeti.BasicTypes (ResultT, withErrorT)
+import Ganeti.Lens (makeCustomLenses)
+import Ganeti.Objects.Maintenance (Incident)
+import Ganeti.Types (JobId, uuidOf)
+import Ganeti.Utils (ordNub)
+import Ganeti.Utils.IORef (atomicModifyWithLens_)
+import Ganeti.WConfd.Client ( runNewWConfdClient, maintenanceJobs, runModifyRpc
+                            , clearMaintdJobs, appendMaintdJobs
+                            , maintenanceEvacuated, addMaintdEvacuated
+                            , rmMaintdEvacuated
+                            , maintenanceIncidents, updateMaintdIncident
+                            , rmMaintdIncident )
+
+-- | In-memory copy of parts of the state of the maintenance
+-- daemon.
+data MemoryState = MemoryState
+  { msJobs :: [ JobId ] -- ^ copy of the list of active jobs
+  , msEvacuated :: [ String ] -- ^ copy of the recently evacuated instances
+  , msIncidents :: [ Incident ] -- ^ copy of the list of incidents
+  }
+
+$(makeCustomLenses ''MemoryState)
+
+-- | Initial state of the in-memory copy. All parts will be updated
+-- before use; after one round at the latest this copy is up to date.
+emptyMemoryState :: MemoryState
+emptyMemoryState = MemoryState { msJobs = []
+                               , msEvacuated = []
+                               , msIncidents = []
+                               }
+
+-- | Fetch the job list from the authoritative copy, mirror it into the
+-- in-memory copy, and return it.
+getJobs :: IORef MemoryState -> ResultT String IO [JobId]
+getJobs memstate = do
+  current <- withErrorT show (runNewWConfdClient maintenanceJobs)
+  liftIO $ atomicModifyWithLens_ memstate msJobsL (const current)
+  return current
+
+-- | Empty the list of active jobs: authoritative copy first, then memory.
+clearJobs :: IORef MemoryState -> IO ()
+clearJobs memstate = do
+  runModifyRpc clearMaintdJobs
+  atomicModifyWithLens_ memstate msJobsL (\_ -> [])
+
+-- | Add jobs to the list of active jobs, unless already present.
+appendJobs :: IORef MemoryState -> [JobId] -> IO ()
+appendJobs memstate jobs = do
+  runModifyRpc (appendMaintdJobs jobs)
+  atomicModifyWithLens_ memstate msJobsL $ \known -> ordNub (known ++ jobs)
+
+-- | Fetch the recently evacuated instances from the authoritative copy,
+-- mirror them into the in-memory state, and return them.
+getEvacuated :: IORef MemoryState -> ResultT String IO [String]
+getEvacuated memstate = do
+  current <- withErrorT show (runNewWConfdClient maintenanceEvacuated)
+  liftIO $ atomicModifyWithLens_ memstate msEvacuatedL (const current)
+  return current
+
+-- | Record additional names as recently evacuated instances.
+addEvacuated :: IORef MemoryState -> [String] -> IO ()
+addEvacuated memstate names = do
+  runModifyRpc (addMaintdEvacuated names)
+  atomicModifyWithLens_ memstate msEvacuatedL $ \old -> ordNub (old ++ names)
+
+-- | Drop one name from the list of recently evacuated instances.
+rmEvacuated :: IORef MemoryState -> String -> IO ()
+rmEvacuated memstate name = do
+  runModifyRpc (rmMaintdEvacuated name)
+  atomicModifyWithLens_ memstate msEvacuatedL (filter (name /=))
+
+-- | Fetch the list of incidents from the authoritative copy, mirror it
+-- into the in-memory state, and return it.
+getIncidents :: IORef MemoryState -> ResultT String IO [Incident]
+getIncidents memstate = do
+  current <- withErrorT show (runNewWConfdClient maintenanceIncidents)
+  liftIO $ atomicModifyWithLens_ memstate msIncidentsL (const current)
+  return current
+
+-- | Update an incident; in memory it replaces the entry with its UUID.
+updateIncident :: IORef MemoryState -> Incident -> IO ()
+updateIncident memstate incident = do
+  runModifyRpc (updateMaintdIncident incident)
+  atomicModifyWithLens_ memstate msIncidentsL $ \known ->
+    incident : [ i | i <- known, uuidOf i /= uuidOf incident ]
+
+-- | Remove the incident with the given UUID.
+rmIncident :: IORef MemoryState -> String -> IO ()
+rmIncident memstate uuid = do
+  runModifyRpc (rmMaintdIncident uuid)
+  atomicModifyWithLens_ memstate msIncidentsL
+    $ filter (\i -> uuidOf i /= uuid)
diff --git a/src/Ganeti/MaintD/Server.hs b/src/Ganeti/MaintD/Server.hs
new file mode 100644
index 0000000..b88b23e
--- /dev/null
+++ b/src/Ganeti/MaintD/Server.hs
@@ -0,0 +1,215 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+{-| Implementation of the Ganeti maintenance server.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.Server
+  ( options
+  , main
+  , checkMain
+  , prepMain
+  ) where
+
+import Control.Applicative ((<|>))
+import Control.Concurrent (forkIO)
+import Control.Exception.Lifted (bracket)
+import Control.Monad (forever, void, unless, when, liftM)
+import Control.Monad.IO.Class (liftIO)
+import Data.IORef (IORef, newIORef, readIORef)
+import qualified Data.Set as Set
+import Snap.Core (Snap, method, Method(GET), ifTop, dir, route)
+import Snap.Http.Server (httpServe)
+import Snap.Http.Server.Config (Config)
+import System.IO.Error (tryIOError)
+import System.Time (getClockTime)
+import qualified Text.JSON as J
+
+import Ganeti.BasicTypes ( GenericResult(..), ResultT, runResultT, mkResultT
+                         , mkResultTEither, withErrorT, isBad, isOk)
+import qualified Ganeti.Constants as C
+import Ganeti.Daemon ( OptType, CheckFn, PrepFn, MainFn, oDebug
+                     , oNoVoting, oYesDoIt, oPort, oBindAddress, oNoDaemonize)
+import Ganeti.Daemon.Utils (handleMasterVerificationOptions)
+import qualified Ganeti.HTools.Backend.Luxi as Luxi
+import Ganeti.HTools.Loader (ClusterData(..), mergeData, checkData)
+import Ganeti.Jobs (waitForJobs)
+import Ganeti.Logging.Lifted
+import qualified Ganeti.Luxi as L
+import Ganeti.MaintD.Autorepairs (harepTasks)
+import Ganeti.MaintD.Balance (balanceTask)
+import Ganeti.MaintD.CleanupIncidents (cleanupIncidents)
+import Ganeti.MaintD.CollectIncidents (collectIncidents)
+import Ganeti.MaintD.FailIncident (failIncident)
+import Ganeti.MaintD.HandleIncidents (handleIncidents)
+import Ganeti.MaintD.MemoryState
+import qualified Ganeti.Path as Path
+import Ganeti.Runtime (GanetiDaemon(GanetiMaintd))
+import Ganeti.Types (JobId(..), JobStatus(..))
+import Ganeti.Utils (threadDelaySeconds, partitionM)
+import Ganeti.Utils.Http (httpConfFromOpts, plainJSON, error404)
+import Ganeti.WConfd.Client ( runNewWConfdClient, maintenanceRoundDelay
+                            , maintenanceBalancing)
+
+-- | Command-line options accepted by the maintenance daemon.
+options :: [OptType]
+options =
+  [ oNoDaemonize
+  , oDebug
+  , oPort C.defaultMaintdPort
+  , oBindAddress
+  , oNoVoting
+  , oYesDoIt
+  ]
+
+-- | Type alias for checkMain results.
+type CheckResult = ()
+
+-- | Type alias for prepMain results: the Snap HTTP server configuration.
+type PrepResult = Config Snap ()
+
+-- | Load cluster data
+--
+-- For now only the static data is obtained, by querying luxid; as soon
+-- as maintd supports load-based balancing as well, the load data has
+-- to be requested from the MonDs in addition.
+loadClusterData :: ResultT String IO ClusterData
+loadClusterData = do
+  now <- liftIO getClockTime
+  socket <- liftIO Path.defaultQuerySocket
+  loaded <- liftIO (tryIOError (Luxi.loadData socket))
+  input_data <- mkResultT $ case loaded of
+                  Right r -> return r
+                  Left e -> do
+                    let msg = show e
+                    logNotice $ "Couldn't read data from luxid: " ++ msg
+                    return $ Bad msg
+  cdata <- mkResultT (return (mergeData [] [] [] [] now input_data))
+  let (msgs, nl) = checkData (cdNodes cdata) (cdInstances cdata)
+  unless (null msgs) . logDebug $ "Cluster data inconsistencies: " ++ show msgs
+  return cdata { cdNodes = nl }
+
+-- | Perform one round of maintenance: wait for the configured delay,
+-- reconcile the jobs of the previous round, then handle incidents, run
+-- the auto-repair tasks and, if enabled, balance the unaffected nodes.
+maintenance :: IORef MemoryState -> ResultT String IO ()
+maintenance memstate = do
+  delay <- withErrorT show $ runNewWConfdClient maintenanceRoundDelay
+  liftIO $ threadDelaySeconds delay
+  oldjobs <- getJobs memstate
+  logDebug $ "Jobs submitted in the last round: "
+             ++ show (map fromJobId oldjobs)
+  luxiSocket <- liftIO Path.defaultQuerySocket
+
+  -- Filter out any jobs in the maintenance list which can't be parsed by luxi
+  -- anymore. This can happen if the job file is corrupted, missing or archived.
+  -- We have to query one job at a time, as luxi returns a single error if any
+  -- job in the query list can't be read/parsed.
+  (okjobs, badjobs) <- bracket
+       (mkResultTEither . tryIOError $ L.getLuxiClient luxiSocket)
+       (liftIO . L.closeClient)
+       $  mkResultT . liftM Ok
+       . (\c -> partitionM (\j -> liftM isOk $ L.queryJobsStatus c [j]) oldjobs)
+
+  unless (null badjobs) $ do
+    logInfo . (++) "Unparsable jobs (marking as failed): "
+        . show $ map fromJobId badjobs
+    mapM_ (failIncident memstate) badjobs
+
+  jobresults <- bracket
+      (mkResultTEither . tryIOError $ L.getLuxiClient luxiSocket)
+      (liftIO . L.closeClient)
+      $ mkResultT . (\c -> waitForJobs okjobs c)
+
+  let failedjobs = map fst $ filter ((/=) JOB_STATUS_SUCCESS . snd) jobresults
+  unless (null failedjobs) $ do
+    logInfo . (++) "Failed jobs: " . show $ map fromJobId failedjobs
+    mapM_ (failIncident memstate) failedjobs
+  unless (null oldjobs) . liftIO $ clearJobs memstate
+  logDebug "New round of maintenance started"
+  cData <- loadClusterData
+  let il = cdInstances cData
+      nl = cdNodes cData
+      gl = cdGroups cData
+  cleanupIncidents memstate nl
+  collectIncidents memstate nl
+  nidxs <- handleIncidents memstate (gl, nl, il)
+  (nidxs', jobs) <- harepTasks (nl, il) nidxs
+  unless (null jobs) . liftIO $ appendJobs memstate jobs
+  logDebug $ "Nodes unaffected by harep " ++ show (Set.toList nidxs')
+             ++ ", jobs submitted " ++ show (map fromJobId jobs)
+  (bal, thresh) <- withErrorT show $ runNewWConfdClient maintenanceBalancing
+  when (bal && not (Set.null nidxs')) $ do
+    logDebug $ "Will balance unaffected nodes, threshold " ++ show thresh
+    -- Balance only the nodes left untouched by harep, i.e. nidxs'.
+    jobs' <- balanceTask memstate (nl, il) nidxs' thresh
+    logDebug $ "Balancing jobs submitted: " ++ show (map fromJobId jobs')
+    unless (null jobs') . liftIO $ appendJobs memstate jobs'
+
+-- | Serve the part of the current memory state picked by the selector
+-- as JSON.
+exposeState :: J.JSON a => (MemoryState -> a) -> IORef MemoryState -> Snap ()
+exposeState selector ref =
+  liftIO (readIORef ref) >>= plainJSON . selector
+
+-- | The information to serve via HTTP: API versions and memory state.
+httpInterface :: IORef MemoryState -> Snap ()
+httpInterface memstate =
+  ifTop (method GET $ plainJSON [1 :: Int])
+  <|> dir "1" (ifTop (plainJSON J.JSNull)
+               <|> route [ ("jobs", exposeState msJobs memstate)
+                         , ("evacuated", exposeState msEvacuated memstate)
+                         , ("status", exposeState msIncidents memstate)
+                         ])
+  <|> error404
+
+-- | Check function for maintd.
+checkMain :: CheckFn CheckResult
+checkMain = handleMasterVerificationOptions
+
+-- | Prepare function for maintd.
+prepMain :: PrepFn CheckResult PrepResult
+prepMain opts _ = httpConfFromOpts GanetiMaintd opts
+
+-- | Main function: fork the maintenance loop, then serve state over HTTP.
+main :: MainFn CheckResult PrepResult
+main _ _ httpConf = do
+  memstate <- newIORef emptyMemoryState
+  void . forkIO . forever $ do
+    res <- runResultT $ maintenance memstate
+    (if isBad res then logInfo else logDebug)
+       $ "Maintenance round result is " ++ show res
+    when (isBad res) $ do
+      logDebug "Backing off after a round with internal errors"
+      threadDelaySeconds C.maintdDefaultRoundDelay
+  httpServe httpConf $ httpInterface memstate
diff --git a/src/Ganeti/MaintD/Utils.hs b/src/Ganeti/MaintD/Utils.hs
new file mode 100644
index 0000000..b74d2de
--- /dev/null
+++ b/src/Ganeti/MaintD/Utils.hs
@@ -0,0 +1,64 @@
+{-| Utility functions for the maintenance daemon.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.Utils
+  ( annotateOpCode
+  , getRepairCommand
+  ) where
+
+import Control.Lens.Setter (over)
+import qualified Text.JSON as J
+
+import qualified Ganeti.Constants as C
+import Ganeti.JQueue (reasonTrailTimestamp)
+import Ganeti.JQueue.Objects (Timestamp)
+import Ganeti.Objects.Maintenance (Incident(..))
+import Ganeti.OpCodes (OpCode, MetaOpCode, wrapOpCode)
+import Ganeti.OpCodes.Lens (metaParamsL, opReasonL)
+
+-- | Wrap an 'OpCode' into a 'MetaOpCode', adding an indication
+-- that the 'OpCode' was submitted by the maintenance daemon.
+annotateOpCode :: String -> Timestamp -> OpCode -> MetaOpCode
+annotateOpCode reason ts =
+  over (metaParamsL . opReasonL)
+    (++ [(C.opcodeReasonSrcMaintd, reason, reasonTrailTimestamp ts)])
+  . wrapOpCode
+
+-- | Extract the repair command name of a live-repair incident, if any.
+getRepairCommand :: Incident -> Maybe String
+getRepairCommand incident = case incidentOriginal incident of
+  J.JSObject obj
+    | Just (J.JSString cmd) <- lookup "command" (J.fromJSObject obj)
+      -> Just (J.fromJSString cmd)
+  _ -> Nothing
diff --git a/src/Ganeti/Metad/ConfigCore.hs b/src/Ganeti/Metad/ConfigCore.hs
index 7211c7e..5821baa 100644
--- a/src/Ganeti/Metad/ConfigCore.hs
+++ b/src/Ganeti/Metad/ConfigCore.hs
@@ -35,7 +35,9 @@
 -}
 module Ganeti.Metad.ConfigCore where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Concurrent.MVar.Lifted
 import Control.Monad.Base
 import Control.Monad.IO.Class
diff --git a/src/Ganeti/Monitoring/Server.hs b/src/Ganeti/Monitoring/Server.hs
index da78b00..668779b 100644
--- a/src/Ganeti/Monitoring/Server.hs
+++ b/src/Ganeti/Monitoring/Server.hs
@@ -41,19 +41,20 @@
   , DataCollector(..)
   ) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Applicative
 import Control.DeepSeq (force)
 import Control.Exception.Base (evaluate)
-import Control.Monad
+import Control.Monad (void, forever, liftM, foldM, foldM_, mzero)
 import Control.Monad.IO.Class
-import Data.ByteString.Char8 (pack, unpack)
+import Data.ByteString.Char8 (unpack)
 import qualified Data.ByteString.UTF8 as UTF8
 import Data.Maybe (fromMaybe)
 import Data.List (find)
-import Data.Monoid (mempty)
 import qualified Data.Map as Map
 import qualified Data.PSQueue as Queue
-import Network.BSD (getServicePortNumber)
 import Snap.Core
 import Snap.Http.Server
 import qualified Text.JSON as J
@@ -71,7 +72,8 @@
 import qualified Ganeti.Constants as C
 import qualified Ganeti.ConstantUtils as CU
 import Ganeti.Runtime
-import Ganeti.Utils (getCurrentTimeUSec, withDefaultOnIOError)
+import Ganeti.Utils (getCurrentTimeUSec)
+import Ganeti.Utils.Http (httpConfFromOpts, error404, plainJSON)
 
 -- * Types and constants definitions
 
@@ -87,17 +89,6 @@
 latestAPIVersion :: Int
 latestAPIVersion = C.mondLatestApiVersion
 
--- * Configuration handling
-
--- | The default configuration for the HTTP server.
-defaultHttpConf :: FilePath -> FilePath -> Config Snap ()
-defaultHttpConf accessLog errorLog =
-  setAccessLog (ConfigFileLog accessLog) .
-  setCompression False .
-  setErrorLog (ConfigFileLog errorLog) $
-  setVerbose False
-  emptyConfig
-
 -- * Helper functions
 
 -- | Check function for the monitoring agent.
@@ -106,28 +97,18 @@
 
 -- | Prepare function for monitoring agent.
 prepMain :: PrepFn CheckResult PrepResult
-prepMain opts _ = do
-  accessLog <- daemonsExtraLogFile GanetiMond AccessLog
-  errorLog <- daemonsExtraLogFile GanetiMond ErrorLog
-  defaultPort <- withDefaultOnIOError C.defaultMondPort
-                 . liftM fromIntegral
-                 $ getServicePortNumber C.mond
-  return .
-    setPort
-      (maybe defaultPort fromIntegral (optPort opts)) .
-    maybe id (setBind . pack) (optBindAddress opts)
-    $ defaultHttpConf accessLog errorLog
+prepMain opts _ = httpConfFromOpts GanetiMond opts
 
 -- * Query answers
 
 -- | Reply to the supported API version numbers query.
 versionQ :: Snap ()
-versionQ = writeBS . pack $ J.encode [latestAPIVersion]
+versionQ = plainJSON [latestAPIVersion]
 
 -- | Version 1 of the monitoring HTTP API.
 version1Api :: MVar CollectorMap -> MVar ConfigAccess -> Snap ()
 version1Api mvar mvarConfig =
-  let returnNull = writeBS . pack $ J.encode J.JSNull :: Snap ()
+  let returnNull = plainJSON J.JSNull
   in ifTop returnNull <|>
      route
        [ ("list", listHandler mvarConfig)
@@ -171,7 +152,7 @@
 listHandler :: MVar ConfigAccess -> Snap ()
 listHandler mvarConfig = dir "collectors" $ do
   collectors' <- liftIO $ activeCollectors mvarConfig
-  writeBS . pack . J.encode $ map dcListItem collectors'
+  plainJSON $ map dcListItem collectors'
 
 -- | Handler for returning data collector reports.
 reportHandler :: MVar CollectorMap -> MVar ConfigAccess -> Snap ()
@@ -187,7 +168,7 @@
 allReports mvar mvarConfig = do
   collectors' <- liftIO $ activeCollectors mvarConfig
   reports <- mapM (liftIO . getReport mvar) collectors'
-  writeBS . pack . J.encode $ reports
+  plainJSON reports
 
 -- | Takes the CollectorMap and a DataCollector and returns the report for this
 -- collector.
@@ -213,6 +194,7 @@
 catFromName "storage"    = BT.Ok $ Just DCStorage
 catFromName "daemon"     = BT.Ok $ Just DCDaemon
 catFromName "hypervisor" = BT.Ok $ Just DCHypervisor
+catFromName "node"       = BT.Ok $ Just DCNode
 catFromName "default"    = BT.Ok Nothing
 catFromName _            = BT.Bad "No such category"
 
@@ -221,11 +203,6 @@
   modifyResponse $ setResponseStatus 404 "Not found"
   writeBS "Unable to produce a report for the requested resource"
 
-error404 :: Snap ()
-error404 = do
-  modifyResponse $ setResponseStatus 404 "Not found"
-  writeBS "Resource not found"
-
 -- | Return the report of one collector.
 oneReport :: MVar CollectorMap -> MVar ConfigAccess -> Snap ()
 oneReport mvar mvarConfig = do
@@ -243,7 +220,7 @@
       Just col -> return col
       Nothing -> fail "Unable to find the requested collector"
   dcr <- liftIO $ getReport mvar collector
-  writeBS . pack . J.encode $ dcr
+  plainJSON dcr
 
 -- | The function implementing the HTTP API of the monitoring agent.
 monitoringApi :: MVar CollectorMap -> MVar ConfigAccess -> Snap ()
diff --git a/src/Ganeti/Network.hs b/src/Ganeti/Network.hs
index 1cb6aa1..b557369 100644
--- a/src/Ganeti/Network.hs
+++ b/src/Ganeti/Network.hs
@@ -55,7 +55,7 @@
   ) where
 
 import Control.Monad
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError)
 import Control.Monad.State
 import Data.Bits ((.&.))
 import Data.Function (on)
@@ -98,7 +98,7 @@
 netIpv4NumHosts = ipv4NumHosts . ip4netMask . networkNetwork
 
 -- | Creates a new bit array pool of the appropriate size
-newPoolArray :: (MonadError e m, Error e) => Network -> m BA.BitArray
+newPoolArray :: (MonadError e m, FromString e) => Network -> m BA.BitArray
 newPoolArray net = do
   let numhosts = netIpv4NumHosts net
   when (numhosts > ipv4NetworkMaxNumHosts) . failError $
@@ -112,15 +112,15 @@
   return $ BA.zeroes (fromInteger numhosts)
 
 -- | Creates a new bit array pool of the appropriate size
-newPool :: (MonadError e m, Error e) => Network -> m AddressPool
+newPool :: (MonadError e m, FromString e) => Network -> m AddressPool
 newPool = liftM AddressPool . newPoolArray
 
 -- | A helper function that creates a bit array pool, of it's missing.
-orNewPool :: (MonadError e m, Error e)
+orNewPool :: (MonadError e m, FromString e)
           => Network -> Maybe AddressPool -> m AddressPool
 orNewPool net = maybe (newPool net) return
 
-withPool :: (MonadError e m, Error e)
+withPool :: (MonadError e m, FromString e)
          => PoolPart -> (Network -> BA.BitArray -> m (a, BA.BitArray))
          -> StateT Network m a
 withPool part f = StateT $ \n -> mapMOf2 (poolLens part) (f' n) n
@@ -129,7 +129,7 @@
              . mapMOf2 addressPoolIso (f net)
              <=< orNewPool net
 
-withPool_ :: (MonadError e m, Error e)
+withPool_ :: (MonadError e m, FromString e)
           => PoolPart -> (Network -> BA.BitArray -> m BA.BitArray)
           -> Network -> m Network
 withPool_ part f = execStateT $ withPool part ((liftM ((,) ()) .) . f)
@@ -137,12 +137,12 @@
 readPool :: PoolPart -> Network -> Maybe BA.BitArray
 readPool = view . poolArrayLens
 
-readPoolE :: (MonadError e m, Error e)
+readPoolE :: (MonadError e m, FromString e)
           => PoolPart -> Network -> m BA.BitArray
 readPoolE part net =
   liftM apReservations $ orNewPool net ((view . poolLens) part net)
 
-readAllE :: (MonadError e m, Error e)
+readAllE :: (MonadError e m, FromString e)
          => Network -> m BA.BitArray
 readAllE net = do
   let toRes = liftM apReservations . orNewPool net
@@ -180,7 +180,7 @@
 
 -- | Returns an address index wrt a network.
 -- Fails if the address isn't in the network range.
-addrIndex :: (MonadError e m, Error e) => Ip4Address -> Network -> m Int
+addrIndex :: (MonadError e m, FromString e) => Ip4Address -> Network -> m Int
 addrIndex addr net = do
   let n = networkNetwork net
       i = on (-) ip4AddressToNumber addr (ip4BaseAddr n)
@@ -190,7 +190,7 @@
 
 -- | Returns an address of a given index wrt a network.
 -- Fails if the index isn't in the network range.
-addrAt :: (MonadError e m, Error e) => Int -> Network -> m Ip4Address
+addrAt :: (MonadError e m, FromString e) => Int -> Network -> m Ip4Address
 addrAt i net | (i' < 0) || (i' >= ipv4NumHosts (ip4netMask n)) =
     failError $ "Requested index " ++ show i
                 ++ " outside the range of network '" ++ show net ++ "'"
@@ -202,13 +202,13 @@
 
 -- | Checks if a given address is reserved.
 -- Fails if the address isn't in the network range.
-isReserved :: (MonadError e m, Error e) =>
+isReserved :: (MonadError e m, FromString e) =>
               PoolPart -> Ip4Address -> Network -> m Bool
 isReserved part addr net =
   (BA.!) `liftM` readPoolE part net `ap` addrIndex addr net
 
 -- | Marks an address as used.
-reserve :: (MonadError e m, Error e) =>
+reserve :: (MonadError e m, FromString e) =>
            PoolPart -> Ip4Address -> Network -> m Network
 reserve part addr =
     withPool_ part $ \net ba -> do
@@ -220,7 +220,7 @@
       BA.setAt idx True ba
 
 -- | Marks an address as unused.
-release :: (MonadError e m, Error e) =>
+release :: (MonadError e m, FromString e) =>
            PoolPart -> Ip4Address -> Network -> m Network
 release part addr =
     withPool_ part $ \net ba -> do
@@ -233,7 +233,7 @@
 
 -- | Get the first free address in the network
 -- that satisfies a given predicate.
-findFree :: (MonadError e m, Error e)
+findFree :: (MonadError e m, FromString e)
          => (Ip4Address -> Bool) -> Network -> m (Maybe Ip4Address)
 findFree p net = readAllE net >>= BA.foldr f (return Nothing)
   where
diff --git a/src/Ganeti/Objects.hs b/src/Ganeti/Objects.hs
index 59abc5c..065aaa8 100644
--- a/src/Ganeti/Objects.hs
+++ b/src/Ganeti/Objects.hs
@@ -103,16 +103,22 @@
   , module Ganeti.PartialParams
   , module Ganeti.Objects.Disk
   , module Ganeti.Objects.Instance
-  ) where
+  , module Ganeti.Objects.Maintenance
+  , FilledHvStateParams(..)
+  , PartialHvStateParams(..)
+  , allHvStateParamFields
+  , FilledHvState
+  , PartialHvState ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Arrow (first)
 import Control.Monad.State
 import qualified Data.ByteString.UTF8 as UTF8
 import Data.List (foldl', intercalate)
 import Data.Maybe
 import qualified Data.Map as Map
-import Data.Monoid
 import Data.Ord (comparing)
 import Data.Ratio (numerator, denominator)
 import Data.Tuple (swap)
@@ -127,8 +133,10 @@
 import Ganeti.JSON (DictObject(..), Container, emptyContainer, GenericContainer)
 import Ganeti.Objects.BitArray (BitArray)
 import Ganeti.Objects.Disk
+import Ganeti.Objects.Maintenance
 import Ganeti.Objects.Nic
 import Ganeti.Objects.Instance
+import Ganeti.Objects.HvState
 import Ganeti.Query.Language
 import Ganeti.PartialParams
 import Ganeti.Types
@@ -318,6 +326,8 @@
     simpleField "std" [t| PartialISpecParams |]
   , optionalField . renameField "SpindleRatioP" $
     simpleField "spindle-ratio" [t| Double |]
+  , optionalField . renameField "MemoryRatioP" $
+    simpleField "memory-ratio" [t| Double |]
   , optionalField . renameField "VcpuRatioP" $
     simpleField "vcpu-ratio" [t| Double |]
   , optionalField . renameField "DiskTemplatesP" $
@@ -331,6 +341,8 @@
     simpleField ConstantUtils.ispecsMinmax [t| [MinMaxISpecs] |]
   , renameField "StdSpec" $ simpleField "std" [t| FilledISpecParams |]
   , simpleField "spindle-ratio"  [t| Double |]
+  , defaultField [| ConstantUtils.ipolicyDefaultsMemoryRatio |] $
+    simpleField "memory-ratio"  [t| Double |]
   , simpleField "vcpu-ratio"     [t| Double |]
   , simpleField "disk-templates" [t| [DiskTemplate] |]
   ])
@@ -341,17 +353,20 @@
             (FilledIPolicy { ipolicyMinMaxISpecs  = fminmax
                            , ipolicyStdSpec       = fstd
                            , ipolicySpindleRatio  = fspindleRatio
+                           , ipolicyMemoryRatio   = fmemoryRatio
                            , ipolicyVcpuRatio     = fvcpuRatio
                            , ipolicyDiskTemplates = fdiskTemplates})
             (PartialIPolicy { ipolicyMinMaxISpecsP  = pminmax
                             , ipolicyStdSpecP       = pstd
                             , ipolicySpindleRatioP  = pspindleRatio
+                            , ipolicyMemoryRatioP   = pmemoryRatio
                             , ipolicyVcpuRatioP     = pvcpuRatio
                             , ipolicyDiskTemplatesP = pdiskTemplates}) =
     FilledIPolicy
                 { ipolicyMinMaxISpecs  = fromMaybe fminmax pminmax
                 , ipolicyStdSpec       = maybe fstd (fillParams fstd) pstd
                 , ipolicySpindleRatio  = fromMaybe fspindleRatio pspindleRatio
+                , ipolicyMemoryRatio   = fromMaybe fmemoryRatio pmemoryRatio
                 , ipolicyVcpuRatio     = fromMaybe fvcpuRatio pvcpuRatio
                 , ipolicyDiskTemplates = fromMaybe fdiskTemplates
                                          pdiskTemplates
@@ -359,22 +374,31 @@
   toPartial (FilledIPolicy { ipolicyMinMaxISpecs  = fminmax
                            , ipolicyStdSpec       = fstd
                            , ipolicySpindleRatio  = fspindleRatio
+                           , ipolicyMemoryRatio   = fmemoryRatio
                            , ipolicyVcpuRatio     = fvcpuRatio
                            , ipolicyDiskTemplates = fdiskTemplates}) =
     PartialIPolicy
                 { ipolicyMinMaxISpecsP  = Just fminmax
                 , ipolicyStdSpecP       = Just $ toPartial fstd
                 , ipolicySpindleRatioP  = Just fspindleRatio
+                , ipolicyMemoryRatioP   = Just fmemoryRatio
                 , ipolicyVcpuRatioP     = Just fvcpuRatio
                 , ipolicyDiskTemplatesP = Just fdiskTemplates
                 }
   toFilled (PartialIPolicy { ipolicyMinMaxISpecsP  = pminmax
                            , ipolicyStdSpecP       = pstd
                            , ipolicySpindleRatioP  = pspindleRatio
+                           , ipolicyMemoryRatioP   = pmemoryRatio
                            , ipolicyVcpuRatioP     = pvcpuRatio
                            , ipolicyDiskTemplatesP = pdiskTemplates}) =
-    FilledIPolicy <$> pminmax <*> (toFilled =<< pstd) <*>  pspindleRatio
-                  <*> pvcpuRatio <*> pdiskTemplates
+    FilledIPolicy <$> pminmax <*> (toFilled =<< pstd) <*> pspindleRatio
+                  <*> pmemoryRatio <*> pvcpuRatio <*> pdiskTemplates
+
+-- | Disk state parameters.
+--
+-- As according to the documentation this option is unused by Ganeti,
+-- the content is just a 'JSValue'.
+type DiskState = Container JSValue
 
 -- * Node definitions
 
@@ -389,32 +413,20 @@
   , simpleField "cpu_speed"     [t| Double |]
   ])
 
--- | Disk state parameters.
---
--- As according to the documentation this option is unused by Ganeti,
--- the content is just a 'JSValue'.
-type DiskState = Container JSValue
-
--- | Hypervisor state parameters.
---
--- As according to the documentation this option is unused by Ganeti,
--- the content is just a 'JSValue'.
-type HypervisorState = Container JSValue
-
 $(buildObject "Node" "node" $
-  [ simpleField "name"             [t| String |]
-  , simpleField "primary_ip"       [t| String |]
-  , simpleField "secondary_ip"     [t| String |]
-  , simpleField "master_candidate" [t| Bool   |]
-  , simpleField "offline"          [t| Bool   |]
-  , simpleField "drained"          [t| Bool   |]
-  , simpleField "group"            [t| String |]
-  , simpleField "master_capable"   [t| Bool   |]
-  , simpleField "vm_capable"       [t| Bool   |]
-  , simpleField "ndparams"         [t| PartialNDParams |]
-  , simpleField "powered"          [t| Bool   |]
+  [ simpleField "name"              [t| String          |]
+  , simpleField "primary_ip"        [t| String          |]
+  , simpleField "secondary_ip"      [t| String          |]
+  , simpleField "master_candidate"  [t| Bool            |]
+  , simpleField "offline"           [t| Bool            |]
+  , simpleField "drained"           [t| Bool            |]
+  , simpleField "group"             [t| String          |]
+  , simpleField "master_capable"    [t| Bool            |]
+  , simpleField "vm_capable"        [t| Bool            |]
+  , simpleField "ndparams"          [t| PartialNDParams |]
+  , simpleField "powered"           [t| Bool            |]
   , notSerializeDefaultField [| emptyContainer |] $
-    simpleField "hv_state_static"   [t| HypervisorState |]
+    simpleField "hv_state_static"   [t| PartialHvState  |]
   , notSerializeDefaultField [| emptyContainer |] $
     simpleField "disk_state_static" [t| DiskState       |]
   ]
@@ -445,15 +457,15 @@
 type Networks = Container PartialNicParams
 
 $(buildObject "NodeGroup" "group" $
-  [ simpleField "name"         [t| String |]
+  [ simpleField "name"              [t| String |]
   , defaultField [| [] |] $ simpleField "members" [t| [String] |]
-  , simpleField "ndparams"     [t| PartialNDParams |]
-  , simpleField "alloc_policy" [t| AllocPolicy     |]
-  , simpleField "ipolicy"      [t| PartialIPolicy  |]
-  , simpleField "diskparams"   [t| GroupDiskParams |]
-  , simpleField "networks"     [t| Networks        |]
+  , simpleField "ndparams"          [t| PartialNDParams |]
+  , simpleField "alloc_policy"      [t| AllocPolicy     |]
+  , simpleField "ipolicy"           [t| PartialIPolicy  |]
+  , simpleField "diskparams"        [t| GroupDiskParams |]
+  , simpleField "networks"          [t| Networks        |]
   , notSerializeDefaultField [| emptyContainer |] $
-    simpleField "hv_state_static"   [t| HypervisorState |]
+    simpleField "hv_state_static"   [t| PartialHvState  |]
   , notSerializeDefaultField [| emptyContainer |] $
     simpleField "disk_state_static" [t| DiskState       |]
   ]
@@ -664,10 +676,10 @@
   , simpleField "primary_ip_family"              [t| IpFamily                |]
   , simpleField "prealloc_wipe_disks"            [t| Bool                    |]
   , simpleField "ipolicy"                        [t| FilledIPolicy           |]
-  , defaultField [| emptyContainer |] $
-    simpleField "hv_state_static"                [t| HypervisorState        |]
-  , defaultField [| emptyContainer |] $
-    simpleField "disk_state_static"              [t| DiskState              |]
+  , notSerializeDefaultField [| emptyContainer |] $
+    simpleField "hv_state_static"                [t| FilledHvState           |]
+  , notSerializeDefaultField [| emptyContainer |] $
+    simpleField "disk_state_static"              [t| DiskState               |]
   , simpleField "enabled_disk_templates"         [t| [DiskTemplate]          |]
   , simpleField "candidate_certs"                [t| CandidateCertificates   |]
   , simpleField "max_running_jobs"               [t| Int                     |]
@@ -678,6 +690,8 @@
   , simpleField "compression_tools"              [t| [String]                |]
   , simpleField "enabled_user_shutdown"          [t| Bool                    |]
   , simpleField "data_collectors"         [t| Container DataCollectorConfig  |]
+  , defaultField [| [] |] $ simpleField
+      "diagnose_data_collector_filename"         [t| String                  |]
   , simpleField "ssh_key_type"                   [t| SshKeyType              |]
   , simpleField "ssh_key_bits"                   [t| Int                     |]
  ]
@@ -711,6 +725,7 @@
   , simpleField "networks"   [t| Container Network   |]
   , simpleField "disks"      [t| Container Disk      |]
   , simpleField "filters"    [t| Container FilterRule |]
+  , simpleField "maintenance" [t| MaintenanceData    |]
   ]
   ++ timeStampFields
   ++ serialFields)
diff --git a/src/Ganeti/Objects/BitArray.hs b/src/Ganeti/Objects/BitArray.hs
index 7932fb2..62b45c4 100644
--- a/src/Ganeti/Objects/BitArray.hs
+++ b/src/Ganeti/Objects/BitArray.hs
@@ -58,7 +58,7 @@
 import Prelude hiding (foldr)
 
 import Control.Monad
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError)
 import qualified Data.IntSet as IS
 import qualified Text.JSON as J
 
@@ -116,7 +116,7 @@
 -- | Sets or removes an element from a bit array.
 
 -- | Sets a given bit in an array. Fails if the index is out of bounds.
-setAt :: (MonadError e m, Error e) => Int -> Bool -> BitArray -> m BitArray
+setAt :: (MonadError e m, FromString e) => Int -> Bool -> BitArray -> m BitArray
 setAt i False (BitArray s bits) =
   return $ BitArray s (IS.delete i bits)
 setAt i True (BitArray s bits) | (i >= 0) && (i < s) =
diff --git a/src/Ganeti/Objects/Disk.hs b/src/Ganeti/Objects/Disk.hs
index a03ba23..f6b3cbb 100644
--- a/src/Ganeti/Objects/Disk.hs
+++ b/src/Ganeti/Objects/Disk.hs
@@ -36,7 +36,9 @@
 
 module Ganeti.Objects.Disk where
 
-import Control.Applicative ((<*>), (<$>))
+import Prelude ()
+import Ganeti.Prelude
+
 import qualified Data.ByteString.UTF8 as UTF8
 import Data.Char (isAsciiLower, isAsciiUpper, isDigit)
 import Data.List (isPrefixOf, isInfixOf)
diff --git a/src/Ganeti/Objects/HvState.hs b/src/Ganeti/Objects/HvState.hs
new file mode 100644
index 0000000..de2599f
--- /dev/null
+++ b/src/Ganeti/Objects/HvState.hs
@@ -0,0 +1,60 @@
+{-# LANGUAGE TemplateHaskell, FunctionalDependencies #-}
+
+{-| Implementation of the Ganeti HvState config object.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.Objects.HvState
+  ( FilledHvStateParams(..)
+  , PartialHvStateParams(..)
+  , allHvStateParamFields
+  , FilledHvState
+  , PartialHvState ) where
+
+import Ganeti.THH
+import Ganeti.JSON
+import Ganeti.Types
+
+$(buildParam "HvState" "hvstate"
+  [ simpleField "cpu_node"  [t| Int |]
+  , simpleField "cpu_total" [t| Int |]
+  , simpleField "mem_hv"    [t| Int |]
+  , simpleField "mem_node"  [t| Int |]
+  , simpleField "mem_total" [t| Int |]
+  ])
+
+-- | Static filled hypervisor state (hvtype to hvstate mapping)
+type FilledHvState = GenericContainer Hypervisor FilledHvStateParams
+
+-- | Static partial hypervisor state (hvtype to hvstate mapping)
+type PartialHvState = GenericContainer Hypervisor PartialHvStateParams
diff --git a/src/Ganeti/Objects/Instance.hs b/src/Ganeti/Objects/Instance.hs
index fb35f65..a946b4e 100644
--- a/src/Ganeti/Objects/Instance.hs
+++ b/src/Ganeti/Objects/Instance.hs
@@ -40,7 +40,9 @@
 module Ganeti.Objects.Instance where
 
 import qualified Data.ByteString.UTF8 as UTF8
-import Data.Monoid
+
+import Prelude ()
+import Ganeti.Prelude
 
 import Ganeti.JSON (emptyContainer)
 import Ganeti.Objects.Nic
diff --git a/src/Ganeti/Objects/Lens.hs b/src/Ganeti/Objects/Lens.hs
index e838bfd..3f27981 100644
--- a/src/Ganeti/Objects/Lens.hs
+++ b/src/Ganeti/Objects/Lens.hs
@@ -157,6 +157,14 @@
 instance TagsObjectL Cluster where
   tagsL = clusterTagsL
 
+$(makeCustomLenses ''MaintenanceData)
+
+instance TimeStampObjectL MaintenanceData where
+  mTimeL = maintMtimeL
+
+instance SerialNoObjectL MaintenanceData where
+  serialL = maintSerialL
+
 $(makeCustomLenses ''ConfigData)
 
 instance SerialNoObjectL ConfigData where
@@ -164,3 +172,5 @@
 
 instance TimeStampObjectL ConfigData where
   mTimeL = configMtimeL
+
+$(makeCustomLenses ''Incident)
diff --git a/src/Ganeti/Objects/Maintenance.hs b/src/Ganeti/Objects/Maintenance.hs
new file mode 100644
index 0000000..ea6e709
--- /dev/null
+++ b/src/Ganeti/Objects/Maintenance.hs
@@ -0,0 +1,115 @@
+{-# LANGUAGE TemplateHaskell #-}
+
+{-| Implementation of the Ganeti configuration for the maintenance daemon.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.Objects.Maintenance
+  ( MaintenanceData(..)
+  , RepairAction(..)
+  , RepairStatus(..)
+  , Incident(..)
+  ) where
+
+import qualified Data.ByteString.UTF8 as UTF8
+import qualified Text.JSON as J
+
+import qualified Ganeti.Constants as C
+import Ganeti.THH
+import Ganeti.THH.Field
+import Ganeti.Types
+
+-- | Action to be taken for a certain repair event. Note
+-- that the order is important, as we rely on values higher
+-- in the derived order to be more intrusive actions.
+$(declareLADT ''String "RepairAction"
+    [ ("RANoop", "Ok")
+    , ("RALiveRepair", "live-repair")
+    , ("RAEvacuate", "evacuate")
+    , ("RAEvacuateFailover", "evacuate-failover")
+    ])
+$(makeJSONInstance ''RepairAction)
+
+-- | Progress made on the particular repair event. Again we rely
+-- on the order in that everything larger than `RSPending` is finalized
+-- in the sense that no further jobs will be submitted.
+$(declareLADT ''String "RepairStatus"
+   [ ("RSNoted", "noted")
+   , ("RSPending", "pending")
+   , ("RSCanceled", "canceled")
+   , ("RSFailed", "failed")
+   , ("RSCompleted", "completed")
+   ])
+$(makeJSONInstance ''RepairStatus)
+
+$(buildObject "Incident" "incident" $
+   [ simpleField "original" [t| J.JSValue |]
+   , simpleField "action" [t| RepairAction |]
+   , defaultField [| [] |] $ simpleField "jobs" [t| [ JobId ] |]
+   , simpleField "node" [t| String |]
+   , simpleField "repair-status" [t| RepairStatus |]
+   , simpleField "tag" [t| String |]
+   ]
+   ++ uuidFields
+   ++ timeStampFields
+   ++ serialFields)
+
+instance SerialNoObject Incident where
+  serialOf = incidentSerial
+
+instance TimeStampObject Incident where
+  cTimeOf = incidentCtime
+  mTimeOf = incidentMtime
+
+instance UuidObject Incident where
+  uuidOf = UTF8.toString . incidentUuid
+
+$(buildObject "MaintenanceData" "maint" $
+  [ defaultField [| C.maintdDefaultRoundDelay |]
+    $ simpleField "roundDelay" [t| Int |]
+  , defaultField [| [] |] $ simpleField "jobs" [t| [ JobId ] |]
+  , defaultField [| False |] $ simpleField "balance" [t| Bool |]
+  , defaultField [| 0.1 :: Double |]
+    $ simpleField "balanceThreshold" [t| Double |]
+  , defaultField [| [] |] $ simpleField "evacuated" [t| [ String ] |]
+  , defaultField [| [] |] $ simpleField "incidents" [t| [ Incident ] |]
+  ]
+  ++ timeStampFields
+  ++ serialFields)
+
+instance SerialNoObject MaintenanceData where
+  serialOf = maintSerial
+
+instance TimeStampObject MaintenanceData where
+  cTimeOf = maintCtime
+  mTimeOf = maintMtime
diff --git a/src/Ganeti/OpCodes.hs b/src/Ganeti/OpCodes.hs
index 3ff87b1..8b0dc91 100644
--- a/src/Ganeti/OpCodes.hs
+++ b/src/Ganeti/OpCodes.hs
@@ -58,7 +58,9 @@
   , setOpPriority
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Data.List (intercalate)
 import Data.Map (Map)
 import qualified Text.JSON
@@ -262,6 +264,10 @@
      , pEnabledUserShutdown
      , pEnabledDataCollectors
      , pDataCollectorInterval
+     , pDiagnoseDataCollectorFilename
+     , pMaintdRoundDelay
+     , pMaintdEnableBalancing
+     , pMaintdBalancingThreshold
      ],
      [])
   , ("OpClusterRedistConf",
@@ -330,11 +336,21 @@
      , pRestrictedCommand
      ],
      [])
+  , ("OpRepairCommand",
+     [t| String |],
+     OpDoc.opRepairCommand,
+     [ pNodeName
+     , pRepairCommand
+     , pInput
+     ],
+     [])
   , ("OpNodeRemove",
      [t| () |],
       OpDoc.opNodeRemove,
      [ pNodeName
      , pNodeUuid
+     , pVerbose
+     , pDebug
      ],
      "node_name")
   , ("OpNodeAdd",
@@ -351,6 +367,8 @@
      , pVmCapable
      , pNdParams
      , pNodeSetup
+     , pVerbose
+     , pDebug
      ],
      "node_name")
   , ("OpNodeQueryvols",
@@ -408,6 +426,8 @@
      , pSecondaryIp
      , pNdParams
      , pPowered
+     , pVerbose
+     , pDebug
      ],
      "node_name")
   , ("OpNodePowercycle",
diff --git a/src/Ganeti/OpParams.hs b/src/Ganeti/OpParams.hs
index 3c8be20..83fd2c4 100644
--- a/src/Ganeti/OpParams.hs
+++ b/src/Ganeti/OpParams.hs
@@ -244,6 +244,8 @@
   , pZeroingTimeoutPerMiB
   , pTagSearchPattern
   , pRestrictedCommand
+  , pRepairCommand
+  , pInput
   , pReplaceDisksMode
   , pReplaceDisksList
   , pAllowFailover
@@ -297,7 +299,11 @@
   , pEnabledUserShutdown
   , pAdminStateSource
   , pEnabledDataCollectors
+  , pMaintdRoundDelay
+  , pMaintdEnableBalancing
+  , pMaintdBalancingThreshold
   , pDataCollectorInterval
+  , pDiagnoseDataCollectorFilename
   , pNodeSslCerts
   , pSshKeyBits
   , pSshKeyType
@@ -905,12 +911,12 @@
 pRequiredNodes :: Field
 pRequiredNodes =
   withDoc "Required list of node names" .
-  renameField "ReqNodes " $ simpleField "nodes" [t| [NonEmptyString] |]
+  renameField "ReqNodes" $ simpleField "nodes" [t| [NonEmptyString] |]
 
 pRequiredNodeUuids :: Field
 pRequiredNodeUuids =
   withDoc "Required list of node UUIDs" .
-  renameField "ReqNodeUuids " . optionalField $
+  renameField "ReqNodeUuids" . optionalField $
   simpleField "node_uuids" [t| [NonEmptyString] |]
 
 pRestrictedCommand :: Field
@@ -919,6 +925,17 @@
   renameField "RestrictedCommand" $
   simpleField "command" [t| NonEmptyString |]
 
+pRepairCommand :: Field
+pRepairCommand =
+  withDoc "Repair command name" .
+  renameField "RepairCommand" $
+  simpleField "command" [t| NonEmptyString |]
+
+pInput :: Field
+pInput =
+  withDoc "Input to be redirected to stdin of repair script" .
+  optionalField $ simpleField "input" [t| NonEmptyString |]
+
 pNodeName :: Field
 pNodeName =
   withDoc "A required node name (for single-node LUs)" $
@@ -1521,7 +1538,7 @@
 pDiskIndex :: Field
 pDiskIndex =
   withDoc "Disk index for e.g. grow disk" .
-  renameField "DiskIndex " $ simpleField "disk" [t| DiskIndex |]
+  renameField "DiskIndex" $ simpleField "disk" [t| DiskIndex |]
 
 pDiskChgAmount :: Field
 pDiskChgAmount =
@@ -1742,7 +1759,7 @@
 pIAllocatorInstances :: Field
 pIAllocatorInstances =
   withDoc "IAllocator instances field" .
-  renameField "IAllocatorInstances " .
+  renameField "IAllocatorInstances" .
   optionalField $
   simpleField "instances" [t| [NonEmptyString] |]
 
@@ -1891,6 +1908,29 @@
   optionalField $
   simpleField C.dataCollectorsIntervalName [t| GenericContainer String Int |]
 
+pDiagnoseDataCollectorFilename :: Field
+pDiagnoseDataCollectorFilename =
+  withDoc "Sets the filename of the script diagnose data collector should run" $
+  optionalStringField "diagnose_data_collector_filename"
+
+pMaintdRoundDelay :: Field
+pMaintdRoundDelay =
+  withDoc "Minimal delay between rounds of the maintenance daemon"
+  . optionalField
+  $ simpleField "maint_round_delay" [t| Int |]
+
+pMaintdEnableBalancing :: Field
+pMaintdEnableBalancing =
+  withDoc "Whether the maintenance daemon should also keep the cluster balanced"
+  . optionalField
+  $ simpleField "maint_balance" [t| Bool |]
+
+pMaintdBalancingThreshold :: Field
+pMaintdBalancingThreshold =
+  withDoc "Minimal gain per balancing step by the maintenance daemon"
+  . optionalField
+  $ simpleField "maint_balance_threshold" [t| Double |]
+
 pNodeSslCerts :: Field
 pNodeSslCerts =
   withDoc "Whether to renew node SSL certificates" .
diff --git a/src/Ganeti/Parsers.hs b/src/Ganeti/Parsers.hs
index 10b0e41..7cb037e 100644
--- a/src/Ganeti/Parsers.hs
+++ b/src/Ganeti/Parsers.hs
@@ -37,7 +37,9 @@
 -}
 module Ganeti.Parsers where
 
-import Control.Applicative ((*>))
+import Prelude ()
+import Ganeti.Prelude
+
 import qualified Data.Attoparsec.Text as A
 import Data.Attoparsec.Text (Parser)
 import Data.Text (unpack)
@@ -54,6 +56,10 @@
 numberP :: Parser Int
 numberP = skipSpaces *> A.decimal
 
+-- | A parser recognizing an 'Integer' number preceded by spaces.
+integerP :: Parser Integer
+integerP = skipSpaces *> A.decimal
+
 -- | A parser recognizing a word preceded by spaces, and closed by a space.
 stringP :: Parser String
 stringP = skipSpaces *> fmap unpack (A.takeWhile $ not . A.isHorizontalSpace)
diff --git a/src/Ganeti/Path.hs b/src/Ganeti/Path.hs
index 2b52d85..8c02dea 100644
--- a/src/Ganeti/Path.hs
+++ b/src/Ganeti/Path.hs
@@ -58,6 +58,7 @@
   , instanceReasonDir
   , getInstReasonFilename
   , jqueueExecutorPy
+  , kvmPidDir
   ) where
 
 import System.FilePath
@@ -190,3 +191,7 @@
 jqueueExecutorPy :: IO FilePath
 jqueueExecutorPy = return $ versionedsharedir
                             </> "ganeti" </> "jqueue" </> "exec.py"
+
+-- | The path to the directory where kvm stores the pid files.
+kvmPidDir :: IO FilePath
+kvmPidDir = runDir `pjoin` "kvm-hypervisor" `pjoin` "pid"
diff --git a/src/Ganeti/Prelude.hs b/src/Ganeti/Prelude.hs
new file mode 100644
index 0000000..8114b9f
--- /dev/null
+++ b/src/Ganeti/Prelude.hs
@@ -0,0 +1,194 @@
+{-# LANGUAGE NoImplicitPrelude, CPP #-}
+
+{-| Export Prelude as in base 4.8.0
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.Prelude (
+
+    -- * Standard types, classes and related functions
+
+    -- ** Basic data types
+    Bool(False, True),
+    (&&), (||), not, otherwise,
+
+    Maybe(Nothing, Just),
+    maybe,
+
+    Either(Left, Right),
+    either,
+
+    Ordering(LT, EQ, GT),
+    Char, String,
+
+    -- *** Tuples
+    fst, snd, curry, uncurry,
+
+    -- ** Basic type classes
+    Eq((==), (/=)),
+    Ord(compare, (<), (<=), (>=), (>), max, min),
+    Enum(succ, pred, toEnum, fromEnum, enumFrom, enumFromThen,
+         enumFromTo, enumFromThenTo),
+    Bounded(minBound, maxBound),
+
+    -- ** Numbers
+
+    -- *** Numeric types
+    Int, Integer, Float, Double,
+    Rational, Word,
+
+    -- *** Numeric type classes
+    Num((+), (-), (*), negate, abs, signum, fromInteger),
+    Real(toRational),
+    Integral(quot, rem, div, mod, quotRem, divMod, toInteger),
+    Fractional((/), recip, fromRational),
+    Floating(pi, exp, log, sqrt, (**), logBase, sin, cos, tan,
+             asin, acos, atan, sinh, cosh, tanh, asinh, acosh, atanh),
+    RealFrac(properFraction, truncate, round, ceiling, floor),
+    RealFloat(floatRadix, floatDigits, floatRange, decodeFloat,
+              encodeFloat, exponent, significand, scaleFloat, isNaN,
+              isInfinite, isDenormalized, isIEEE, isNegativeZero, atan2),
+
+    -- *** Numeric functions
+    subtract, even, odd, gcd, lcm, (^), (^^),
+    fromIntegral, realToFrac,
+
+    -- ** Monoids
+    Monoid(mempty, mappend, mconcat),
+
+    -- ** Monads and functors
+    Functor(fmap, (<$)), (<$>),
+    Applicative(pure, (<*>), (*>), (<*)),
+    Monad((>>=), (>>), return, fail),
+    mapM_, sequence_, (=<<),
+
+#if MIN_VERSION_base(4,8,0)
+    -- ** Folds and traversals
+    Foldable(elem,      -- :: (Foldable t, Eq a) => a -> t a -> Bool
+             -- fold,   -- :: Monoid m => t m -> m
+             foldMap,   -- :: Monoid m => (a -> m) -> t a -> m
+             foldr,     -- :: (a -> b -> b) -> b -> t a -> b
+             -- foldr', -- :: (a -> b -> b) -> b -> t a -> b
+             foldl,     -- :: (b -> a -> b) -> b -> t a -> b
+             -- foldl', -- :: (b -> a -> b) -> b -> t a -> b
+             foldr1,    -- :: (a -> a -> a) -> t a -> a
+             foldl1,    -- :: (a -> a -> a) -> t a -> a
+             maximum,   -- :: (Foldable t, Ord a) => t a -> a
+             minimum,   -- :: (Foldable t, Ord a) => t a -> a
+             product,   -- :: (Foldable t, Num a) => t a -> a
+             sum),      -- :: Num a => t a -> a
+             -- toList) -- :: Foldable t => t a -> [a]
+#else
+    Foldable(foldMap,
+             foldr,
+             foldl,
+             foldr1,
+             foldl1),
+    elem,
+    maximum,
+    minimum,
+    product,
+    sum,
+#endif
+
+    Traversable(traverse, sequenceA, mapM, sequence),
+
+    -- ** Miscellaneous functions
+    id, const, (.), flip, ($), until,
+    asTypeOf, error, undefined,
+    seq, ($!),
+
+    -- * List operations
+    map, (++), filter,
+    head, last, tail, init, null, length, (!!),
+    reverse,
+    -- *** Special folds
+    and, or, any, all,
+    concat, concatMap,
+    -- ** Building lists
+    -- *** Scans
+    scanl, scanl1, scanr, scanr1,
+    -- *** Infinite lists
+    iterate, repeat, replicate, cycle,
+    -- ** Sublists
+    take, drop, splitAt, takeWhile, dropWhile, span, break,
+    -- ** Searching lists
+    notElem, lookup,
+    -- ** Zipping and unzipping lists
+    zip, zip3, zipWith, zipWith3, unzip, unzip3,
+    -- ** Functions on strings
+    lines, words, unlines, unwords,
+
+    -- * Converting to and from @String@
+    -- ** Converting to @String@
+    ShowS,
+    Show(showsPrec, showList, show),
+    shows,
+    showChar, showString, showParen,
+    -- ** Converting from @String@
+    ReadS,
+    Read(readsPrec, readList),
+    reads, readParen, read, lex,
+
+    -- * Basic Input and output
+    IO,
+    -- ** Simple I\/O operations
+    -- All I/O functions defined here are character oriented.  The
+    -- treatment of the newline character will vary on different systems.
+    -- For example, two characters of input, return and linefeed, may
+    -- read as a single newline character.  These functions cannot be
+    -- used portably for binary I/O.
+    -- *** Output functions
+    putChar,
+    putStr, putStrLn, print,
+    -- *** Input functions
+    getChar,
+    getLine, getContents, interact,
+    -- *** Files
+    FilePath,
+    readFile, writeFile, appendFile, readIO, readLn,
+    -- ** Exception handling in the I\/O monad
+    IOError, ioError, userError,
+
+  ) where
+
+#if MIN_VERSION_base(4,8,0)
+import Prelude
+#else
+import Prelude hiding   ( elem, maximum, minimum, product, sum )
+import Data.Foldable    ( Foldable(..), elem, maximum, minimum, product, sum )
+import Data.Traversable ( Traversable(..) )
+import Control.Applicative
+import Data.Monoid
+import Data.Word
+#endif
diff --git a/src/Ganeti/Query/Exec.hs b/src/Ganeti/Query/Exec.hs
index 124f7f3..79889ff 100644
--- a/src/Ganeti/Query/Exec.hs
+++ b/src/Ganeti/Query/Exec.hs
@@ -60,12 +60,14 @@
   , forkJobProcess
   ) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Concurrent (rtsSupportsBoundThreads)
 import Control.Concurrent.Lifted (threadDelay)
 import Control.Exception (finally)
 import Control.Monad
-import Control.Monad.Error
-import Data.Functor
+import Control.Monad.Error.Class (MonadError(..))
 import qualified Data.Map as M
 import Data.Maybe (listToMaybe, mapMaybe)
 import System.Directory (getDirectoryContents)
@@ -103,7 +105,7 @@
                               }
 
 -- Returns the list of all open file descriptors of the current process.
-listOpenFds :: (Error e) => ResultT e IO [Fd]
+listOpenFds :: (FromString e) => ResultT e IO [Fd]
 listOpenFds = liftM filterReadable
                 $ liftIO (getDirectoryContents "/proc/self/fd") `orElse`
                   liftIO (getDirectoryContents "/dev/fd") `orElse`
@@ -224,7 +226,7 @@
 
 -- | Forks the job process and starts processing of the given job.
 -- Returns the livelock of the job and its process ID.
-forkJobProcess :: (Error e, Show e)
+forkJobProcess :: (FromString e, Show e)
                => QueuedJob -- ^ a job to process
                -> FilePath  -- ^ the daemons own livelock file
                -> (FilePath -> ResultT e IO ())
diff --git a/src/Ganeti/Query/Filter.hs b/src/Ganeti/Query/Filter.hs
index 64eab37..0d36ff2 100644
--- a/src/Ganeti/Query/Filter.hs
+++ b/src/Ganeti/Query/Filter.hs
@@ -66,13 +66,14 @@
   , FilterOp(..)
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Monad (liftM, mzero)
 import Control.Monad.Trans.Maybe (MaybeT, runMaybeT)
 import Control.Monad.Trans.Class (lift)
 import qualified Data.Map as Map
 import Data.Maybe
-import Data.Traversable (traverse)
 import Text.JSON (JSValue(..), fromJSString)
 import Text.JSON.Pretty (pp_value)
 import qualified Text.Regex.PCRE as PCRE
@@ -136,7 +137,7 @@
 -- | A type synonim for a rank-2 comparator function. This is used so
 -- that we can pass the usual '<=', '>', '==' functions to 'binOpFilter'
 -- and for them to be used in multiple contexts.
-type Comparator = (Eq a, Ord a) => a -> a -> Bool
+type Comparator = forall a . (Eq a, Ord a) => a -> a -> Bool
 
 -- | Equality checker.
 --
@@ -183,10 +184,10 @@
 -- note: the next two implementations are the same, but we have to
 -- repeat them due to the encapsulation done by FilterValue
 containsFilter (QuotedString val) lst = do
-  lst' <- fromJVal lst
+  lst' <- fromJVal lst :: ErrorResult [String]
   return $! val `elem` lst'
 containsFilter (NumericValue val) lst = do
-  lst' <- fromJVal lst
+  lst' <- fromJVal lst :: ErrorResult [Integer]
   return $! val `elem` lst'
 
 
diff --git a/src/Ganeti/Query/Group.hs b/src/Ganeti/Query/Group.hs
index 45bd81a..26fc881 100644
--- a/src/Ganeti/Query/Group.hs
+++ b/src/Ganeti/Query/Group.hs
@@ -55,6 +55,8 @@
   , (FieldDefinition "custom_ipolicy" "CustomInstancePolicy" QFTOther
        "Custom instance policy limitations",
      FieldSimple (rsNormal . groupIpolicy), QffNormal)
+  , (FieldDefinition "networks" "Networks" QFTOther "Node group networks",
+     FieldSimple (rsNormal . groupNetworks), QffNormal)
   , (FieldDefinition "custom_ndparams" "CustomNDParams" QFTOther
        "Custom node parameters",
      FieldSimple (rsNormal . groupNdparams), QffNormal)
@@ -83,6 +85,11 @@
        "List of primary instances",
      FieldConfig (\cfg -> rsNormal . niceSort . mapMaybe instName . fst .
                           getGroupInstances cfg . uuidOf), QffNormal)
+  , (FieldDefinition "hv_state" "HypervisorState" QFTOther
+       "Custom static hypervisor state",
+     FieldSimple (rsNormal . groupHvStateStatic), QffNormal)
+  , (FieldDefinition "disk_state" "DiskState" QFTOther "Disk state",
+     FieldSimple (rsNormal . groupDiskStateStatic), QffNormal)
   ] ++
   map buildNdParamField allNDParamFields ++
   timeStampFields ++
diff --git a/src/Ganeti/Query/Language.hs b/src/Ganeti/Query/Language.hs
index 882a9da..3c6919f 100644
--- a/src/Ganeti/Query/Language.hs
+++ b/src/Ganeti/Query/Language.hs
@@ -65,10 +65,11 @@
     , checkRS
     ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.DeepSeq
 import Data.Foldable
-import Data.Traversable (Traversable)
 import Data.Ratio (numerator, denominator)
 import Text.JSON.Pretty (pp_value)
 import Text.JSON.Types
@@ -94,7 +95,8 @@
 
 -- | No-op 'NFData' instance for 'ResultStatus', since it's a single
 -- constructor data-type.
-instance NFData ResultStatus
+instance NFData ResultStatus where
+  rnf x = seq x ()
 
 -- | Check that ResultStatus is success or fail with descriptive
 -- message.
diff --git a/src/Ganeti/Query/Node.hs b/src/Ganeti/Query/Node.hs
index 9d36c74..f431ade 100644
--- a/src/Ganeti/Query/Node.hs
+++ b/src/Ganeti/Query/Node.hs
@@ -38,8 +38,10 @@
   , collectLiveData
   ) where
 
-import Control.Applicative
-import Data.List
+import Prelude ()
+import Ganeti.Prelude
+
+import Data.List (intercalate)
 import Data.Maybe
 import qualified Text.JSON as J
 
@@ -243,13 +245,14 @@
   , (FieldDefinition "powered" "Powered" QFTBool
        "Whether node is thought to be powered on",
      FieldConfig getNodePower, QffNormal)
-  -- FIXME: the two fields below are incomplete in Python, part of the
-  -- non-implemented node resource model; they are declared just for
-  -- parity, but are not functional
-  , (FieldDefinition "hv_state" "HypervisorState" QFTOther "Hypervisor state",
-     FieldSimple (const rsUnavail), QffNormal)
+  , (FieldDefinition "hv_state" "HypervisorState" QFTOther
+       "Static hypervisor state for default hypervisor only",
+     FieldConfig $ (rsNormal .) . getFilledHvStateParams, QffNormal)
+  , (FieldDefinition "custom_hv_state" "CustomHypervisorState" QFTOther
+       "Custom static hypervisor state",
+     FieldSimple $ rsNormal . nodeHvStateStatic, QffNormal)
   , (FieldDefinition "disk_state" "DiskState" QFTOther "Disk state",
-     FieldSimple (const rsUnavail), QffNormal)
+     FieldSimple $ rsNormal . nodeDiskStateStatic, QffNormal)
   ] ++
   map nodeLiveFieldBuilder nodeLiveFieldsDefs ++
   map buildNdParamField allNDParamFields ++
diff --git a/src/Ganeti/Query/Server.hs b/src/Ganeti/Query/Server.hs
index c942803..aefe129 100644
--- a/src/Ganeti/Query/Server.hs
+++ b/src/Ganeti/Query/Server.hs
@@ -40,13 +40,15 @@
   , prepMain
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Concurrent
 import Control.Exception
 import Control.Lens ((.~))
 import Control.Monad (forever, when, mzero, guard, zipWithM, liftM, void)
 import Control.Monad.Base (MonadBase, liftBase)
-import Control.Monad.Error (MonadError)
+import Control.Monad.Error.Class (MonadError)
 import Control.Monad.IO.Class
 import Control.Monad.Trans (lift)
 import Control.Monad.Trans.Maybe
@@ -271,6 +273,18 @@
             , ("data_collector_interval",
                showJSON . fmap dataCollectorInterval
                         $ clusterDataCollectors cluster)
+            , ("diagnose_data_collector_filename",
+               showJSON $ clusterDiagnoseDataCollectorFilename cluster)
+            , ("maint_round_delay",
+               showJSON . maintRoundDelay $ configMaintenance  cdata)
+            , ("maint_balance",
+               showJSON . maintBalance $ configMaintenance cdata)
+            , ("maint_balance_threshold",
+               showJSON . maintBalanceThreshold $ configMaintenance cdata)
+            , ("hv_state",
+               showJSON $ clusterHvStateStatic cluster)
+            , ("disk_state",
+               showJSON $ clusterDiskStateStatic cluster)
             , ("modify_ssh_setup",
                showJSON $ clusterModifySshSetup cluster)
             , ("ssh_key_type", showJSON $ clusterSshKeyType cluster)
diff --git a/src/Ganeti/Runtime.hs b/src/Ganeti/Runtime.hs
index 01f3885..8cf497f 100644
--- a/src/Ganeti/Runtime.hs
+++ b/src/Ganeti/Runtime.hs
@@ -52,7 +52,6 @@
   ) where
 
 import Control.Monad
-import Control.Monad.Error
 import qualified Data.Map as M
 import System.Exit
 import System.FilePath
@@ -75,6 +74,7 @@
                   | GanetiWConfd
                   | GanetiKvmd
                   | GanetiLuxid
+                  | GanetiMaintd
                   | GanetiMond
                     deriving (Show, Enum, Bounded, Eq, Ord)
 
@@ -103,6 +103,7 @@
 daemonName GanetiWConfd  = "ganeti-wconfd"
 daemonName GanetiKvmd    = "ganeti-kvmd"
 daemonName GanetiLuxid   = "ganeti-luxid"
+daemonName GanetiMaintd  = "ganeti-maintd"
 daemonName GanetiMond    = "ganeti-mond"
 
 -- | Returns whether the daemon only runs on the master node.
@@ -115,6 +116,7 @@
 daemonOnlyOnMaster GanetiWConfd  = True
 daemonOnlyOnMaster GanetiKvmd    = False
 daemonOnlyOnMaster GanetiLuxid   = True
+daemonOnlyOnMaster GanetiMaintd  = True
 daemonOnlyOnMaster GanetiMond    = False
 
 -- | Returns the log file base for a daemon.
@@ -127,6 +129,7 @@
 daemonLogBase GanetiWConfd  = "wconf-daemon"
 daemonLogBase GanetiKvmd    = "kvm-daemon"
 daemonLogBase GanetiLuxid   = "luxi-daemon"
+daemonLogBase GanetiMaintd  = "maintenance-daemon"
 daemonLogBase GanetiMond    = "monitoring-daemon"
 
 -- | Returns the configured user name for a daemon.
@@ -139,6 +142,7 @@
 daemonUser GanetiWConfd  = AutoConf.wconfdUser
 daemonUser GanetiKvmd    = AutoConf.kvmdUser
 daemonUser GanetiLuxid   = AutoConf.luxidUser
+daemonUser GanetiMaintd  = AutoConf.mondUser
 daemonUser GanetiMond    = AutoConf.mondUser
 
 -- | Returns the configured group for a daemon.
@@ -151,6 +155,7 @@
 daemonGroup (DaemonGroup GanetiWConfd)  = AutoConf.wconfdGroup
 daemonGroup (DaemonGroup GanetiLuxid)   = AutoConf.luxidGroup
 daemonGroup (DaemonGroup GanetiKvmd)    = AutoConf.kvmdGroup
+daemonGroup (DaemonGroup GanetiMaintd)  = AutoConf.mondGroup
 daemonGroup (DaemonGroup GanetiMond)    = AutoConf.mondGroup
 daemonGroup (ExtraGroup  DaemonsGroup)  = AutoConf.daemonsGroup
 daemonGroup (ExtraGroup  AdminGroup)    = AutoConf.adminGroup
@@ -189,7 +194,7 @@
             map ExtraGroup  [minBound..maxBound]
 
 -- | Computes the group/user maps.
-getEnts :: (Error e) => ResultT e IO RuntimeEnts
+getEnts :: (FromString e) => ResultT e IO RuntimeEnts
 getEnts = do
   let userOf = liftM userID . liftIO . getUserEntryForName . daemonUser
   let groupOf = liftM groupID . liftIO . getGroupEntryForName . daemonGroup
diff --git a/src/Ganeti/Ssconf.hs b/src/Ganeti/Ssconf.hs
index 99ad3e5..e3fc864 100644
--- a/src/Ganeti/Ssconf.hs
+++ b/src/Ganeti/Ssconf.hs
@@ -54,8 +54,10 @@
   , emptySSConf
   ) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Arrow ((&&&))
-import Control.Applicative ((<$>))
 import Control.Exception
 import Control.Monad (forM, liftM)
 import qualified Data.Map as M
diff --git a/src/Ganeti/Storage/Diskstats/Parser.hs b/src/Ganeti/Storage/Diskstats/Parser.hs
index 64d3885..6f64b04 100644
--- a/src/Ganeti/Storage/Diskstats/Parser.hs
+++ b/src/Ganeti/Storage/Diskstats/Parser.hs
@@ -36,7 +36,9 @@
 -}
 module Ganeti.Storage.Diskstats.Parser (diskstatsParser) where
 
-import Control.Applicative ((<*>), (<*), (<$>))
+import Prelude ()
+import Ganeti.Prelude
+
 import qualified Data.Attoparsec.Text as A
 import qualified Data.Attoparsec.Combinator as AC
 import Data.Attoparsec.Text (Parser)
diff --git a/src/Ganeti/Storage/Drbd/Parser.hs b/src/Ganeti/Storage/Drbd/Parser.hs
index c9c8dce..8dee72c 100644
--- a/src/Ganeti/Storage/Drbd/Parser.hs
+++ b/src/Ganeti/Storage/Drbd/Parser.hs
@@ -36,7 +36,10 @@
 -}
 module Ganeti.Storage.Drbd.Parser (drbdStatusParser, commaIntParser) where
 
-import Control.Applicative ((<*>), (*>), (<*), (<$>), (<|>), pure)
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative ((<|>))
 import qualified Data.Attoparsec.Text as A
 import qualified Data.Attoparsec.Combinator as AC
 import Data.Attoparsec.Text (Parser)
diff --git a/src/Ganeti/Storage/Lvm/LVParser.hs b/src/Ganeti/Storage/Lvm/LVParser.hs
index 470c41a..cf31431 100644
--- a/src/Ganeti/Storage/Lvm/LVParser.hs
+++ b/src/Ganeti/Storage/Lvm/LVParser.hs
@@ -37,7 +37,9 @@
 -}
 module Ganeti.Storage.Lvm.LVParser (lvParser, lvCommand, lvParams) where
 
-import Control.Applicative ((<*>), (*>), (<*), (<$>))
+import Prelude ()
+import Ganeti.Prelude
+
 import qualified Data.Attoparsec.Text as A
 import qualified Data.Attoparsec.Combinator as AC
 import Data.Attoparsec.Text (Parser)
diff --git a/src/Ganeti/THH.hs b/src/Ganeti/THH.hs
index 7ae4c9f..4bc7e88 100644
--- a/src/Ganeti/THH.hs
+++ b/src/Ganeti/THH.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE ParallelListComp, TemplateHaskell #-}
+{-# LANGUAGE ParallelListComp, TemplateHaskell, RankNTypes #-}
 
 {-| TemplateHaskell helper for Ganeti Haskell code.
 
@@ -77,11 +77,14 @@
                   , ssconfConstructorName
                   ) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Arrow ((&&&), second)
 import Control.Applicative
 import Control.Lens.Type (Lens, Lens')
 import Control.Lens (lens, set, element)
-import Control.Monad
+import Control.Monad (liftM, replicateM, when, unless)
 import Control.Monad.Base () -- Needed to prevent spurious GHC linking errors.
 import Control.Monad.Writer (tell)
 import qualified Control.Monad.Trans as MT
@@ -90,10 +93,9 @@
   -- See issue #683 and https://ghc.haskell.org/trac/ghc/ticket/4899
 import Data.Char
 import Data.Function (on)
-import Data.List
+import Data.List (intercalate, groupBy, stripPrefix, sort, nub)
 import Data.Maybe
 import qualified Data.Map as M
-import Data.Monoid
 import qualified Data.Set as S
 import qualified Data.Text as T
 import Language.Haskell.TH
@@ -486,7 +488,7 @@
 genFromRaw :: Name -> Name -> Name -> [(String, Either String Name)] -> Q [Dec]
 genFromRaw traw fname tname constructors = do
   -- signature of form (Monad m) => String -> m $name
-  sigt <- [t| (Monad m) => $(conT traw) -> m $(conT tname) |]
+  sigt <- [t| forall m. (Monad m) => $(conT traw) -> m $(conT tname) |]
   -- clauses for a guarded pattern
   let varp = mkName "s"
       varpe = varE varp
@@ -1201,8 +1203,13 @@
               -> Q [Dec]
 genDictObject save_fn load_fn sname fields = do
   let name = mkName sname
+      -- newName fails in ghc 7.10 when used on keywords
+      newName' "data" = newName "data_ghcBug10599"
+      newName' "instance" = newName "instance_ghcBug10599"
+      newName' "type" = newName "type_ghcBug10599"
+      newName' s = newName s
   -- toDict
-  fnames <- mapM (newName . fieldVariable) fields
+  fnames <- mapM (newName' . fieldVariable) fields
   let pat = conP name (map varP fnames)
       tdexp = [| concat $(listE $ zipWith save_fn fnames fields) |]
   tdclause <- clause [pat] (normalB tdexp) []
diff --git a/src/Ganeti/THH/HsRPC.hs b/src/Ganeti/THH/HsRPC.hs
index 7822912..8bcdb4d 100644
--- a/src/Ganeti/THH/HsRPC.hs
+++ b/src/Ganeti/THH/HsRPC.hs
@@ -43,11 +43,13 @@
   , mkRpcCalls
   ) where
 
-import Control.Applicative
-import Control.Monad
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Monad (liftM)
 import Control.Monad.Base
-import Control.Monad.Error
-import Control.Monad.Reader
+import Control.Monad.Error.Class (MonadError)
+import Control.Monad.Reader (ReaderT, runReaderT, ask)
 import Control.Monad.Trans.Control
 import Language.Haskell.TH
 import qualified Text.JSON as J
diff --git a/src/Ganeti/THH/PyRPC.hs b/src/Ganeti/THH/PyRPC.hs
index eee1554..81e9223 100644
--- a/src/Ganeti/THH/PyRPC.hs
+++ b/src/Ganeti/THH/PyRPC.hs
@@ -40,9 +40,11 @@
   , genPyUDSRpcStubStr
   ) where
 
-import Control.Monad
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Monad (liftM, zipWithM)
 import Data.Char (toLower, toUpper)
-import Data.Functor
 import Data.Maybe (fromMaybe)
 import Language.Haskell.TH
 import Language.Haskell.TH.Syntax (liftString)
diff --git a/src/Ganeti/THH/PyType.hs b/src/Ganeti/THH/PyType.hs
index 5a3941c..efcbc32 100644
--- a/src/Ganeti/THH/PyType.hs
+++ b/src/Ganeti/THH/PyType.hs
@@ -39,8 +39,10 @@
   , pyOptionalType
   ) where
 
-import Control.Applicative
-import Control.Monad
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Monad (ap, liftM)
 import Data.List (intercalate)
 import Language.Haskell.TH
 import Language.Haskell.TH.Syntax (Lift(..))
diff --git a/src/Ganeti/THH/RPC.hs b/src/Ganeti/THH/RPC.hs
index fa4b84c..25388df 100644
--- a/src/Ganeti/THH/RPC.hs
+++ b/src/Ganeti/THH/RPC.hs
@@ -42,10 +42,12 @@
   , mkRpcM
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Arrow ((&&&))
 import Control.Monad
-import Control.Monad.Error.Class
+import Control.Monad.Error.Class (MonadError(..))
 import Data.Map (Map)
 import qualified Data.Map as Map
 import Language.Haskell.TH
@@ -78,12 +80,12 @@
              , US.hExec          = liftToHandler . exec
              }
   where
-    orError :: (MonadError e m, Error e) => Maybe a -> e -> m a
+    orError :: (MonadError e m, FromString e) => Maybe a -> e -> m a
     orError m e = maybe (throwError e) return m
 
     exec (Request m as) = do
       (RpcFn f) <- orError (Map.lookup m fs)
-                           (strMsg $ "No such method: " ++ m)
+                           (mkFromString $ "No such method: " ++ m)
       i <- fromJResultE "RPC input" . J.readJSON $ as
       o <- f i -- lift $ f i
       return $ J.showJSON o
diff --git a/src/Ganeti/Types.hs b/src/Ganeti/Types.hs
index 318127e..8da06d4 100644
--- a/src/Ganeti/Types.hs
+++ b/src/Ganeti/Types.hs
@@ -190,7 +190,9 @@
   , TagsObject(..)
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Monad (liftM)
 import qualified Text.JSON as JSON
 import Text.JSON (JSON, readJSON, showJSON)
diff --git a/src/Ganeti/UDSServer.hs b/src/Ganeti/UDSServer.hs
index c259475..7008d08 100644
--- a/src/Ganeti/UDSServer.hs
+++ b/src/Ganeti/UDSServer.hs
@@ -70,7 +70,9 @@
   , listener
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Concurrent.Lifted (fork, yield)
 import Control.Monad.Base
 import Control.Monad.Trans.Control
@@ -79,7 +81,7 @@
 import qualified Data.ByteString as B
 import qualified Data.ByteString.UTF8 as UTF8
 import Data.IORef
-import Data.List
+import Data.List (isInfixOf)
 import Data.Word (Word8)
 import qualified Network.Socket as S
 import System.Directory (removeFile)
diff --git a/src/Ganeti/Utils.hs b/src/Ganeti/Utils.hs
index 7ec9f84..42a8db9 100644
--- a/src/Ganeti/Utils.hs
+++ b/src/Ganeti/Utils.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE FlexibleContexts, ScopedTypeVariables #-}
+{-# LANGUAGE FlexibleContexts, ScopedTypeVariables, CPP #-}
 
 {-| Utility functions. -}
 
@@ -58,6 +58,7 @@
   , exitWhen
   , exitUnless
   , logWarningIfBad
+  , logAndBad
   , rStripSpace
   , newUUID
   , isUUID
@@ -96,20 +97,36 @@
   , ensurePermissions
   , ordNub
   , isSubsequenceOf
+  , maxBy
+  , threadDelaySeconds
+  , monotoneFind
+  , iterateJust
+  , partitionM
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Concurrent
 import Control.Exception (try, bracket)
 import Control.Monad
-import Control.Monad.Error
 import qualified Data.Attoparsec.ByteString as A
 import qualified Data.ByteString.UTF8 as UTF8
 import Data.Char (toUpper, isAlphaNum, isDigit, isSpace)
 import qualified Data.Either as E
 import Data.Function (on)
 import Data.IORef
-import Data.List
+#if MIN_VERSION_base(4,8,0)
+import Data.List hiding (isSubsequenceOf)
+#else
+import Data.List ( intercalate
+                 , find
+                 , foldl'
+                 , transpose
+                 , sortBy
+                 , isPrefixOf
+                 , maximumBy)
+#endif
 import qualified Data.Map as M
 import Data.Maybe (fromMaybe)
 import qualified Data.Set as S
@@ -189,6 +206,10 @@
                  then '\'':v ++ "'"
                  else v
 
+-- | Delay the calling thread for the given number of seconds.
+threadDelaySeconds :: Int -> IO ()
+threadDelaySeconds = threadDelay . (*) 1000000
+
 -- * Mathematical functions
 
 -- Simple and slow statistical functions, please replace with better
@@ -350,6 +371,12 @@
   return defVal
 logWarningIfBad _ _ (Ok v) = return v
 
+-- | Log a message and return a Bad result.
+logAndBad :: String -> IO (Result a)
+logAndBad msg = do
+  logNotice msg
+  return $ Bad msg
+
 -- | Try an IO interaction, log errors and unfold as a 'Result'.
 tryAndLogIOError :: IO a -> String -> (a -> Result b) -> IO (Result b)
 tryAndLogIOError io msg okfn =
@@ -818,3 +845,42 @@
 isSubsequenceOf _     []                   = False
 isSubsequenceOf a@(x:a') (y:b) | x == y    = isSubsequenceOf a' b
                                | otherwise = isSubsequenceOf a b
+
+-- | Compute the maximum of two elements by a given order.
+-- As opposed to using `maximumBy`, this function is guaranteed
+-- to be total, as the signature enforces a non-empty list of
+-- arguments.
+maxBy :: (a -> a -> Ordering) -> a -> a -> a
+maxBy ord a b = maximumBy ord [a, b]
+
+-- | Given a predicate that is monotone on a list, find the
+-- first list entry where it holds, if any. Use the monotonicity
+-- property to evaluate the property at as few places as possible,
+-- guided by the heuristics provided.
+monotoneFind :: ([a] -> Int) -> (a -> Bool) -> [a] -> Maybe a
+monotoneFind heuristics p xs =
+  let count = heuristics xs
+  in case () of
+    _ | x:xs' <- drop count xs
+        -> if p x
+             then (`mplus` Just x) . monotoneFind heuristics p
+                  $ take count xs
+             else monotoneFind heuristics p xs'
+    _ | x:xs' <- xs
+        -> if p x
+             then Just x
+             else monotoneFind heuristics p xs'
+    _ -> Nothing
+
+-- | Iterate a function as long as it returns Just values, collecting
+-- all the Justs that were obtained.
+iterateJust :: (a -> Maybe a) -> a -> [a]
+iterateJust f a = a : maybe [] (iterateJust f) (f a)
+
+-- | A version of partition with a monadic predicate
+-- Implementation taken from David Fox's Extras package.
+partitionM :: (Monad m) => (a -> m Bool) -> [a] -> m ([a], [a])
+partitionM p xs = foldM f ([], []) xs
+  where f (a, b) x = do
+        pv <- p x
+        return $ if pv then (x : a, b) else (a, x : b)
diff --git a/src/Ganeti/Utils/Atomic.hs b/src/Ganeti/Utils/Atomic.hs
index 7f4d2df..ae7bf81 100644
--- a/src/Ganeti/Utils/Atomic.hs
+++ b/src/Ganeti/Utils/Atomic.hs
@@ -43,7 +43,7 @@
 import qualified Control.Exception.Lifted as L
 import Control.Monad
 import Control.Monad.Base (MonadBase(..))
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError)
 import Control.Monad.Trans.Control
 import System.FilePath.Posix (takeDirectory, takeBaseName)
 import System.IO
@@ -91,12 +91,12 @@
 -- | Opens a file in a R/W mode, locks it (blocking if needed) and runs
 -- a given action while the file is locked. Releases the lock and
 -- closes the file afterwards.
-withLockedFile :: (MonadError e m, Error e, MonadBaseControl IO m)
+withLockedFile :: (MonadError e m, FromString e, MonadBaseControl IO m)
                => FilePath -> (Fd -> m a) -> m a
 withLockedFile path =
     L.bracket (openAndLock path) (liftBase . closeFd)
   where
-    openAndLock :: (MonadError e m, Error e, MonadBaseControl IO m)
+    openAndLock :: (MonadError e m, FromString e, MonadBaseControl IO m)
                 => FilePath -> m Fd
     openAndLock p = liftBase $ do
       fd <- openFd p ReadWrite Nothing defaultFileFlags
diff --git a/src/Ganeti/Utils/Http.hs b/src/Ganeti/Utils/Http.hs
new file mode 100644
index 0000000..901d401
--- /dev/null
+++ b/src/Ganeti/Utils/Http.hs
@@ -0,0 +1,102 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+{-| Utils for HTTP servers
+
+-}
+
+{-
+
+Copyright (C) 2013 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.Utils.Http
+  ( httpConfFromOpts
+  , error404
+  , plainJSON
+  ) where
+
+import Control.Monad (liftM)
+import Data.ByteString.Char8 (pack)
+import Data.Map ((!))
+import Data.Maybe (fromMaybe)
+import Network.BSD (getServicePortNumber)
+import qualified Network.Socket as Socket
+import Snap.Core (Snap, writeBS, modifyResponse, setResponseStatus)
+import Snap.Http.Server.Config ( Config, ConfigLog(ConfigFileLog), emptyConfig
+                               , setAccessLog, setErrorLog, setCompression
+                               , setVerbose, setPort, setBind )
+import qualified Text.JSON as J
+
+import Ganeti.BasicTypes (GenericResult(..))
+import qualified Ganeti.Constants as C
+import Ganeti.Daemon (DaemonOptions(..))
+import Ganeti.Runtime ( GanetiDaemon, daemonName
+                      , daemonsExtraLogFile, ExtraLogReason(..))
+import qualified Ganeti.Ssconf as Ssconf
+import Ganeti.Utils (withDefaultOnIOError)
+
+-- * Configuration handling
+
+-- | The default configuration for the HTTP server.
+defaultHttpConf :: FilePath -> FilePath -> Config Snap ()
+defaultHttpConf accessLog errorLog =
+  setAccessLog (ConfigFileLog accessLog) .
+  setCompression False .
+  setErrorLog (ConfigFileLog errorLog) $
+  setVerbose False
+  emptyConfig
+
+-- | Get the HTTP Configuration from the daemon options.
+httpConfFromOpts :: GanetiDaemon -> DaemonOptions -> IO (Config Snap ())
+httpConfFromOpts daemon opts = do
+  accessLog <- daemonsExtraLogFile daemon AccessLog
+  errorLog <- daemonsExtraLogFile daemon ErrorLog
+  let name = daemonName daemon
+      standardPort = snd $ C.daemonsPorts ! name
+  defaultPort <- withDefaultOnIOError standardPort
+                 . liftM fromIntegral
+                 $ getServicePortNumber name
+  defaultFamily <- Ssconf.getPrimaryIPFamily Nothing
+  let defaultBind = if defaultFamily == Ok Socket.AF_INET6 then "::" else "*"
+  return .
+    setPort (maybe defaultPort fromIntegral (optPort opts)) .
+    setBind (pack . fromMaybe defaultBind $ optBindAddress opts)
+    $ defaultHttpConf accessLog errorLog
+
+
+-- * Standard answers
+
+-- | Resource not found error
+error404 :: Snap ()
+error404 = do
+  modifyResponse $ setResponseStatus 404 "Not found"
+  writeBS "Resource not found"
+
+-- | Return the JSON encoding of an object
+plainJSON :: J.JSON a => a -> Snap ()
+plainJSON = writeBS . pack . J.encode
diff --git a/src/Ganeti/Utils/IORef.hs b/src/Ganeti/Utils/IORef.hs
index 488d2e8..a220e3e 100644
--- a/src/Ganeti/Utils/IORef.hs
+++ b/src/Ganeti/Utils/IORef.hs
@@ -34,6 +34,7 @@
 
 module Ganeti.Utils.IORef
   ( atomicModifyWithLens
+  , atomicModifyWithLens_
   , atomicModifyIORefErr
   , atomicModifyIORefErrLog
   ) where
@@ -53,6 +54,11 @@
                      => IORef a -> Lens a a b c -> (b -> (r, c)) -> m r
 atomicModifyWithLens ref l f = atomicModifyIORef ref (swap . traverseOf l f)
 
+-- | Atomically modify an 'IORef', not reading any value.
+atomicModifyWithLens_ :: (MonadBase IO m)
+                      => IORef a -> Lens a a b c -> (b -> c) -> m ()
+atomicModifyWithLens_ ref l f = atomicModifyWithLens ref l $ (,) () . f
+
 -- | Atomically modifies an 'IORef' using a function that can possibly fail.
 -- If it fails, the value of the 'IORef' is preserved.
 atomicModifyIORefErr :: (MonadBase IO m)
diff --git a/src/Ganeti/Utils/Livelock.hs b/src/Ganeti/Utils/Livelock.hs
index 8bbb37f..905cd88 100644
--- a/src/Ganeti/Utils/Livelock.hs
+++ b/src/Ganeti/Utils/Livelock.hs
@@ -41,7 +41,7 @@
 
 import qualified Control.Exception as E
 import Control.Monad
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError)
 import System.Directory (doesFileExist, getDirectoryContents)
 import System.FilePath.Posix ((</>))
 import System.IO
@@ -59,7 +59,7 @@
 -- | Appends the current time to the given prefix, creates
 -- the lockfile in the appropriate directory, and locks it.
 -- Returns its full path and the file's file descriptor.
-mkLivelockFile :: (Error e, MonadError e m, MonadIO m)
+mkLivelockFile :: (FromString e, MonadError e m, MonadIO m)
                => FilePath -> m (Fd, Livelock)
 mkLivelockFile prefix = do
   (TOD secs _) <- liftIO getClockTime
diff --git a/src/Ganeti/Utils/Monad.hs b/src/Ganeti/Utils/Monad.hs
index cd09a0d..cecaaf4 100644
--- a/src/Ganeti/Utils/Monad.hs
+++ b/src/Ganeti/Utils/Monad.hs
@@ -44,7 +44,7 @@
   ) where
 
 import Control.Monad
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError(..))
 import Control.Monad.Trans.Maybe
 
 -- | Retries the given action up to @n@ times.
diff --git a/src/Ganeti/Utils/MultiMap.hs b/src/Ganeti/Utils/MultiMap.hs
index 0f97e26..6f46e1d 100644
--- a/src/Ganeti/Utils/MultiMap.hs
+++ b/src/Ganeti/Utils/MultiMap.hs
@@ -54,13 +54,13 @@
   , values
   ) where
 
-import Prelude hiding (lookup, null, elem)
+import Prelude ()
+import Ganeti.Prelude hiding (lookup, null, elem)
 
 import Control.Monad
 import qualified Data.Foldable as F
 import qualified Data.Map as M
 import Data.Maybe (fromMaybe, isJust)
-import Data.Monoid
 import qualified Data.Set as S
 import qualified Text.JSON as J
 
diff --git a/src/Ganeti/Utils/Random.hs b/src/Ganeti/Utils/Random.hs
index 500e00d..bdccd4e 100644
--- a/src/Ganeti/Utils/Random.hs
+++ b/src/Ganeti/Utils/Random.hs
@@ -38,7 +38,9 @@
   , delayRandom
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Concurrent (threadDelay)
 import Control.Monad
 import Control.Monad.State
diff --git a/src/Ganeti/Utils/Statistics.hs b/src/Ganeti/Utils/Statistics.hs
index 7057973..ff91d93 100644
--- a/src/Ganeti/Utils/Statistics.hs
+++ b/src/Ganeti/Utils/Statistics.hs
@@ -1,5 +1,4 @@
-{-# LANGUAGE BangPatterns #-}
-
+{-# LANGUAGE BangPatterns, MultiParamTypeClasses, FunctionalDependencies#-}
 
 {-| Utility functions for statistical accumulation. -}
 
@@ -34,100 +33,105 @@
 -}
 
 module Ganeti.Utils.Statistics
-  ( Statistics
+  ( Stat
+  , SumStat(..)
+  , StdDevStat(..)
   , TagTagMap
-  , AggregateComponent(..)
-  , getSumStatistics
-  , getStdDevStatistics
-  , getMapStatistics
-  , getStatisticValue
-  , updateStatistics
+  , MapData(..)
+  , MapStat(..)
+  , update
+  , calculate
+  , getValue
+  , toDouble
   ) where
 
 import qualified Data.Foldable as Foldable
 import Data.List (foldl')
 import qualified Data.Map as Map
 
+-- | Typeclass describing the necessary statistical accumulation functions.
+-- Types defining an instance of Stat behave as if the given statistics were
+-- computed on the list of values, but they allow a potentially more efficient
+-- update of a given value. c is the statistical accumulation data type itself,
+-- while s is the type of the spread values used to calculate the statistics.
+-- s is defined as a type dependent on c in order to prevent ambiguity.
+class (Show c) => Stat s c | c -> s where
+  -- | Calculate a statistics from the spread values list.
+  calculate :: [s] -> c
+  -- | In a given statistics replace one value by another. This will only give
+  -- meaningful results, if the original value was actually part of
+  -- the statistics.
+  update :: c -> s -> s -> c
+  -- | Obtain the value of a statistics.
+  getValue :: c -> Double
+
+-- | Type of statistical accumulations representing simple sum of values
+data SumStat = SumStat Double deriving Show
+-- | Type of statistical accumulations representing values standard deviation
+data StdDevStat = StdDevStat Double Double Double deriving Show
+                  -- count, sum, and not the sum of squares---instead the
+                  -- computed variance for better precision.
+-- | Type of statistical accumulations representing the number of instances
+-- for each tag pair. See also the TagTagMap documentation.
+data MapStat = MapStat TagTagMap deriving Show
+
+instance Stat Double SumStat where
+  calculate xs =
+    let addComponent s x =
+          let !s' = s + x
+          in s'
+        st = foldl' addComponent 0 xs
+    in SumStat st
+  update (SumStat s) x x' =
+    SumStat $ s + x' - x
+  getValue (SumStat s) = s
+
+instance Stat Double StdDevStat where
+  calculate xs =
+    let addComponent (n, s) x =
+          let !n' = n + 1
+              !s' = s + x
+          in (n', s')
+        (nt, st) = foldl' addComponent (0, 0) xs
+        mean = st / nt
+        center x = x - mean
+        nvar = foldl' (\v x -> let d = center x in v + d * d) 0 xs
+    in StdDevStat nt st (nvar / nt)
+  update (StdDevStat n s var) x x' =
+    let !ds = x' - x
+        !dss = x' * x' - x * x
+        !dnnvar = (n * dss - 2 * s * ds) - ds * ds
+        !s' = s + ds
+        !var' = max 0 $ var + dnnvar / (n * n)
+    in StdDevStat n s' var'
+  getValue (StdDevStat _ _ var) = sqrt var
+
 -- | Type to store the number of instances for each exclusion and location
 -- pair. This is necessary to calculate second component of location score.
 type TagTagMap = Map.Map (String, String) Int
 
--- | Abstract type of statistical accumulations. They behave as if the given
--- statistics were computed on the list of values, but they allow a potentially
--- more efficient update of a given value.
-data Statistics = SumStatistics Double
-                | StdDevStatistics Double Double Double
-                  -- count, sum, and not the sum of squares---instead the
-                  -- computed variance for better precission.
-                | MapStatistics TagTagMap deriving Show
+-- | Data type used to store spread values of type TagTagMap. This data type
+-- is introduced only to define an instance of Stat for TagTagMap.
+data MapData = MapData TagTagMap
 
--- | Abstract type of per-node statistics measures. The SimpleNumber is used
--- to construct SumStatistics and StdDevStatistics while SpreadValues is used
--- to construct MapStatistics.
-data AggregateComponent = SimpleNumber Double
-                        | SpreadValues TagTagMap
--- Each function below depends on the contents of AggregateComponent but it's
--- necessary to define each function as a function processing both
--- SimpleNumber and SpreadValues instances (see Metrics.hs). That's why
--- pattern matches for invalid type defined as functions which change nothing.
+-- | Helper function unpacking [MapData] spread values list.
+mapTmpToMap :: [MapData] -> [TagTagMap]
+mapTmpToMap (MapData m : xs) = m : mapTmpToMap xs
+mapTmpToMap _ = []
 
--- | Get a statistics that sums up the values.
-getSumStatistics :: [AggregateComponent] -> Statistics
-getSumStatistics xs =
-  let addComponent s (SimpleNumber x) =
-        let !s' = s + x
-        in s'
-      addComponent s _ = s
-      st = foldl' addComponent 0 xs
-  in SumStatistics st
+instance Stat MapData MapStat where
+  calculate xs =
+    let addComponent m x =
+          let !m' = Map.unionWith (+) m x
+          in m'
+        mt = foldl' addComponent Map.empty (mapTmpToMap xs)
+    in MapStat mt
+  update (MapStat m) (MapData x) (MapData x') =
+    let nm = Map.unionWith (+) (Map.unionWith (-) m x) x'
+    in MapStat nm
+  getValue (MapStat m) = fromIntegral $ Foldable.sum m - Map.size m
 
--- | Get a statistics for the standard deviation.
-getStdDevStatistics :: [AggregateComponent] -> Statistics
-getStdDevStatistics xs =
-  let addComponent (n, s) (SimpleNumber x) =
-        let !n' = n + 1
-            !s' = s + x
-        in (n', s')
-      addComponent (n, s) _ = (n, s)
-      (nt, st) = foldl' addComponent (0, 0) xs
-      mean = st / nt
-      center (SimpleNumber x) = x - mean
-      center _ = 0
-      nvar = foldl' (\v x -> let d = center x in v + d * d) 0 xs
-  in StdDevStatistics nt st (nvar / nt)
-
--- | Get a statistics for the standard deviation.
-getMapStatistics :: [AggregateComponent] -> Statistics
-getMapStatistics xs =
-  let addComponent m (SpreadValues x) =
-        let !m' = Map.unionWith (+) m x
-        in m'
-      addComponent m _ = m
-      mt = foldl' addComponent Map.empty xs
-  in MapStatistics mt
-
--- | Obtain the value of a statistics.
-getStatisticValue :: Statistics -> Double
-getStatisticValue (SumStatistics s) = s
-getStatisticValue (StdDevStatistics _ _ var) = sqrt var
-getStatisticValue (MapStatistics m) = fromIntegral $ Foldable.sum m - Map.size m
--- Function above calculates sum (N_i - 1) over each map entry.
-
--- | In a given statistics replace on value by another. This
--- will only give meaningful results, if the original value
--- was actually part of the statistics.
-updateStatistics :: Statistics -> (AggregateComponent, AggregateComponent) ->
-                    Statistics
-updateStatistics (SumStatistics s) (SimpleNumber x, SimpleNumber y) =
-  SumStatistics $ s + (y - x)
-updateStatistics (StdDevStatistics n s var) (SimpleNumber x, SimpleNumber y) =
-  let !ds = y - x
-      !dss = y * y - x * x
-      !dnnvar = (n * dss - 2 * s * ds) - ds * ds
-      !s' = s + ds
-      !var' = max 0 $ var + dnnvar / (n * n)
-  in StdDevStatistics n s' var'
-updateStatistics (MapStatistics m) (SpreadValues x, SpreadValues y) =
-  let nm = Map.unionWith (+) (Map.unionWith (-) m x) y
-  in MapStatistics nm
-updateStatistics s _ = s
+-- | Converts Integral types to Double. It's useful when there is not enough
+-- type information in the expression to call fromIntegral directly.
+toDouble :: (Integral a) => a -> Double
+toDouble = fromIntegral
diff --git a/src/Ganeti/Utils/UniStd.hs b/src/Ganeti/Utils/UniStd.hs
index c3453d9..6f301f2 100644
--- a/src/Ganeti/Utils/UniStd.hs
+++ b/src/Ganeti/Utils/UniStd.hs
@@ -54,7 +54,7 @@
 -- Because of a bug in GHC 7.6.3 (at least), calling 'hIsClosed' on a handle
 -- to get the file descriptor leaks memory. Therefore we open a given file
 -- just to sync it and close it again.
-fsyncFile :: (Error e) => FilePath -> ResultT e IO ()
+fsyncFile :: (FromString e) => FilePath -> ResultT e IO ()
 fsyncFile path = liftIO
   $ bracket (openFd path ReadOnly Nothing defaultFileFlags) closeFd callfsync
   where
diff --git a/src/Ganeti/Utils/Validate.hs b/src/Ganeti/Utils/Validate.hs
index 421f0c1..cab6b90 100644
--- a/src/Ganeti/Utils/Validate.hs
+++ b/src/Ganeti/Utils/Validate.hs
@@ -51,16 +51,20 @@
   , validate'
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Arrow
 import Control.Monad
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError(..))
 import Control.Monad.Writer
 import qualified Data.Foldable as F
 import Data.Functor.Identity
 import Data.List (intercalate)
 import Data.Sequence
 
+import Ganeti.BasicTypes (FromString(..))
+
 -- | Monad for running validation checks.
 newtype ValidationMonadT m a =
   ValidationMonad { runValidationMonad :: WriterT (Seq String) m a }
@@ -100,19 +104,19 @@
 
 -- | A helper function for throwing an exception if a list of errors
 -- is non-empty.
-throwIfErrors :: (MonadError e m, Error e) => (a, [String]) -> m a
+throwIfErrors :: (MonadError e m, FromString e) => (a, [String]) -> m a
 throwIfErrors (x, []) = return x
-throwIfErrors (_, es) = throwError (strMsg $ "Validation errors: "
-                                             ++ intercalate "; " es)
+throwIfErrors (_, es) = throwError (mkFromString $ "Validation errors: "
+                                                   ++ intercalate "; " es)
 
 -- | Runs a validation action and if there are errors, combine them
 -- into an exception.
-evalValidate :: (MonadError e m, Error e) => ValidationMonad a -> m a
+evalValidate :: (MonadError e m, FromString e) => ValidationMonad a -> m a
 evalValidate = throwIfErrors . runValidate
 
 -- | Runs a validation action and if there are errors, combine them
 -- into an exception.
-evalValidateT :: (MonadError e m, Error e) => ValidationMonadT m a -> m a
+evalValidateT :: (MonadError e m, FromString e) => ValidationMonadT m a -> m a
 evalValidateT k = runValidateT k >>= throwIfErrors
 
 -- | A typeclass for objects that can be validated.
diff --git a/src/Ganeti/WConfd/Client.hs b/src/Ganeti/WConfd/Client.hs
index a477907..12bd69b 100644
--- a/src/Ganeti/WConfd/Client.hs
+++ b/src/Ganeti/WConfd/Client.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE TemplateHaskell #-}
+{-# LANGUAGE TemplateHaskell, FlexibleContexts #-}
 
 {-| The Ganeti WConfd client functions.
 
@@ -38,14 +38,22 @@
 
 module Ganeti.WConfd.Client where
 
+import Control.Concurrent (threadDelay)
 import Control.Exception.Lifted (bracket)
+import Control.Monad (unless)
+import Control.Monad.Base
+import Control.Monad.Error.Class (MonadError)
+import Control.Monad.Trans.Control (MonadBaseControl)
 
-import Ganeti.THH.HsRPC
+import Ganeti.BasicTypes (runResultT, GenericResult(..))
 import Ganeti.Constants
+import Ganeti.Errors (GanetiException)
 import Ganeti.JSON (unMaybeForJSON)
 import Ganeti.Locking.Locks (ClientId)
 import Ganeti.Objects (ConfigData)
-import Ganeti.UDSServer (ConnectConfig(..), Client, connectClient)
+import qualified Ganeti.Path as Path
+import Ganeti.THH.HsRPC
+import Ganeti.UDSServer (ConnectConfig(..), Client, connectClient, closeClient)
 import Ganeti.WConfd.Core (exportedFunctions)
 
 -- * Generated client functions
@@ -65,6 +73,15 @@
 getWConfdClient :: FilePath -> IO Client
 getWConfdClient = connectClient wconfdConnectConfig wconfdDefCtmo
 
+-- | Run an Rpc with a fresh client.
+runNewWConfdClient :: ( MonadBase IO m, MonadBaseControl IO m
+                      ,  MonadError GanetiException m )
+                   => RpcClientMonad a -> m a
+runNewWConfdClient request =
+  bracket (liftBase (Path.defaultWConfdSocket >>= getWConfdClient))
+          (liftBase . closeClient)
+    $ runRpcClient request
+
 -- * Helper functions for getting a remote lock
 
 -- | Calls the `lockConfig` RPC until the lock is obtained.
@@ -86,3 +103,14 @@
 withLockedConfig c shared =
   -- Unlock config even if something throws.
   bracket (waitLockConfig c shared) (const $ unlockConfig c)
+
+
+-- * Other functions
+
+-- | Try an RPC until no errors occur and the result is true.
+runModifyRpc :: RpcClientMonad Bool -> IO ()
+runModifyRpc action = do
+  res <- runResultT $ runNewWConfdClient action
+  unless (res == Ok True) $ do
+    threadDelay 100000 -- sleep 0.1 seconds
+    runModifyRpc action
diff --git a/src/Ganeti/WConfd/ConfigModifications.hs b/src/Ganeti/WConfd/ConfigModifications.hs
index 46686d4..2fdea73 100644
--- a/src/Ganeti/WConfd/ConfigModifications.hs
+++ b/src/Ganeti/WConfd/ConfigModifications.hs
@@ -1,4 +1,5 @@
-{-# LANGUAGE TemplateHaskell, NoMonomorphismRestriction, FlexibleContexts #-}
+{-# LANGUAGE TemplateHaskell, NoMonomorphismRestriction, FlexibleContexts,
+    RankNTypes #-}
 
 {-|  The WConfd functions for direct configuration manipulation
 
@@ -39,18 +40,21 @@
 
 module Ganeti.WConfd.ConfigModifications where
 
-import Control.Applicative ((<$>))
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Lens (_2)
 import Control.Lens.Getter ((^.))
-import Control.Lens.Setter ((.~), (%~))
-import qualified Data.ByteString.UTF8 as UTF8
+import Control.Lens.Setter (Setter, (.~), (%~), (+~), over)
 import Control.Lens.Traversal (mapMOf)
-import Control.Monad (unless, when, forM_, foldM, liftM2)
-import Control.Monad.Error (throwError, MonadError)
+import Control.Lens.Type (Simple)
+import Control.Monad (unless, when, forM_, foldM, liftM, liftM2)
+import Control.Monad.Error.Class (throwError, MonadError)
 import Control.Monad.IO.Class (liftIO)
 import Control.Monad.Trans.State (StateT, get, put, modify,
                                   runStateT, execStateT)
-import Data.Foldable (fold, foldMap)
+import qualified Data.ByteString.UTF8 as UTF8
+import Data.Foldable (fold)
 import Data.List (elemIndex)
 import Data.Maybe (isJust, maybeToList, fromMaybe, fromJust)
 import Language.Haskell.TH (Name)
@@ -68,7 +72,8 @@
 import Ganeti.Logging.Lifted (logDebug, logInfo)
 import Ganeti.Objects
 import Ganeti.Objects.Lens
-import Ganeti.Types (AdminState, AdminStateSource)
+import Ganeti.Types (AdminState, AdminStateSource, JobId)
+import Ganeti.Utils (ordNub)
 import Ganeti.WConfd.ConfigState (ConfigState, csConfigData, csConfigDataL)
 import Ganeti.WConfd.Monad (WConfdMonad, modifyConfigWithLock
                            , modifyConfigAndReturnWithLock)
@@ -117,7 +122,7 @@
 
       instKeys = keysFromC . configInstances . csConfigData $ cs
       nodeKeys = keysFromC . configNodes . csConfigData $ cs
-      
+
       instValues = map uuidOf . valuesFromC
                  . configInstances . csConfigData $ cs
       nodeValues = map uuidOf . valuesFromC . configNodes . csConfigData $ cs
@@ -672,6 +677,74 @@
     . T.releaseDRBDMinors . UTF8.fromString $ uuidOf disk
   return . MaybeForJSON $ fmap (_2 %~ TimeAsDoubleJSON) r
 
+-- | Modify a particular value and bump the serial number of the hosting
+-- structure. Arguments are a setter to focus on the part
+-- of the configuration that gets serial-bumped, and a modification
+-- of that part. The function will do the change, bump the serial and
+-- update the mtime in the WConfdMonad, temporarily acquiring the
+-- configuration lock. Return True if that succeeded and False if the
+-- configuration lock was not available; no change is done in that case.
+changeAndBump :: (SerialNoObjectL a, TimeStampObjectL a)
+              => Simple Setter ConfigState a
+              -> (a -> a)
+              -> WConfdMonad Bool
+changeAndBump focus change = do
+  now <- liftIO getClockTime
+  let operation = over focus $ (serialL +~ 1) . (mTimeL .~ now) . change
+  liftM isJust $ modifyConfigWithLock
+    (\_ cs -> return . operation $ cs)
+    (return ())
+
+-- | Change the maintenance part of the configuration and bump its serial.
+changeAndBumpMaint :: (MaintenanceData -> MaintenanceData) -> WConfdMonad Bool
+changeAndBumpMaint = changeAndBump $ csConfigDataL . configMaintenanceL
+
+-- | Set the maintenance interval.
+setMaintdRoundDelay :: Int -> WConfdMonad Bool
+setMaintdRoundDelay delay = changeAndBumpMaint $ maintRoundDelayL .~ delay
+
+-- | Clear the list of current maintenance jobs.
+clearMaintdJobs :: WConfdMonad Bool
+clearMaintdJobs = changeAndBumpMaint $ maintJobsL .~ []
+
+-- | Append new jobs to the list of current maintenance jobs, if
+-- not already present.
+appendMaintdJobs :: [JobId] -> WConfdMonad Bool
+appendMaintdJobs jobs = changeAndBumpMaint . over maintJobsL
+                          $ ordNub . (++ jobs)
+
+-- | Set the autobalance flag.
+setMaintdBalance :: Bool -> WConfdMonad Bool
+setMaintdBalance value = changeAndBumpMaint $ maintBalanceL .~ value
+
+-- | Set the auto-balance threshold.
+setMaintdBalanceThreshold :: Double -> WConfdMonad Bool
+setMaintdBalanceThreshold value = changeAndBumpMaint
+                                    $ maintBalanceThresholdL .~ value
+
+-- | Add names to the list of recently evacuated instances.
+addMaintdEvacuated :: [String] -> WConfdMonad Bool
+addMaintdEvacuated names = changeAndBumpMaint . over maintEvacuatedL
+                            $ ordNub . (++ names)
+
+-- | Remove a name from the list of recently evacuated instances.
+rmMaintdEvacuated :: String -> WConfdMonad Bool
+rmMaintdEvacuated name = changeAndBumpMaint . over maintEvacuatedL
+                          $ filter (/= name)
+
+-- | Update an incident in the list of known incidents; if the incident,
+-- as identified by the UUID, is not present, it is added.
+updateMaintdIncident :: Incident -> WConfdMonad Bool
+updateMaintdIncident incident =
+  changeAndBumpMaint . over maintIncidentsL
+    $ (incident :) . filter ((/= uuidOf incident) . uuidOf)
+
+-- | Remove an incident from the list of known incidents.
+rmMaintdIncident :: String -> WConfdMonad Bool
+rmMaintdIncident uuid =
+  changeAndBumpMaint . over maintIncidentsL
+    $ filter ((/= uuid) . uuidOf)
+
 -- * The list of functions exported to RPC.
 
 exportedFunctions :: [Name]
@@ -691,4 +764,13 @@
                     , 'updateNetwork
                     , 'updateNode
                     , 'updateNodeGroup
+                    , 'setMaintdRoundDelay
+                    , 'clearMaintdJobs
+                    , 'appendMaintdJobs
+                    , 'setMaintdBalance
+                    , 'setMaintdBalanceThreshold
+                    , 'addMaintdEvacuated
+                    , 'rmMaintdEvacuated
+                    , 'updateMaintdIncident
+                    , 'rmMaintdIncident
                     ]
diff --git a/src/Ganeti/WConfd/ConfigState.hs b/src/Ganeti/WConfd/ConfigState.hs
index fa6e754..b41fda1 100644
--- a/src/Ganeti/WConfd/ConfigState.hs
+++ b/src/Ganeti/WConfd/ConfigState.hs
@@ -43,7 +43,9 @@
   , needsFullDist
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Data.Function (on)
 import System.Time (ClockTime(..))
 
diff --git a/src/Ganeti/WConfd/ConfigVerify.hs b/src/Ganeti/WConfd/ConfigVerify.hs
index 246b627..118d775 100644
--- a/src/Ganeti/WConfd/ConfigVerify.hs
+++ b/src/Ganeti/WConfd/ConfigVerify.hs
@@ -39,7 +39,8 @@
   , verifyConfigErr
   ) where
 
-import Control.Monad.Error
+import Control.Monad (forM_)
+import Control.Monad.Error.Class (MonadError(..))
 import qualified Data.ByteString.UTF8 as UTF8
 import qualified Data.Foldable as F
 import qualified Data.Map as M
diff --git a/src/Ganeti/WConfd/ConfigWriter.hs b/src/Ganeti/WConfd/ConfigWriter.hs
index 8ffbc13..ccd562b 100644
--- a/src/Ganeti/WConfd/ConfigWriter.hs
+++ b/src/Ganeti/WConfd/ConfigWriter.hs
@@ -43,10 +43,14 @@
   , distSSConfAsyncTask
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Monad ((>=>), liftM, unless)
 import Control.Monad.Base
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError)
 import qualified Control.Monad.State.Strict as S
+import Control.Monad.Trans.Class (lift)
 import Control.Monad.Trans.Control
 import Data.Monoid
 import qualified Data.Set as Set
diff --git a/src/Ganeti/WConfd/Core.hs b/src/Ganeti/WConfd/Core.hs
index 73dba45..88ecafa 100644
--- a/src/Ganeti/WConfd/Core.hs
+++ b/src/Ganeti/WConfd/Core.hs
@@ -61,8 +61,13 @@
                             , lockLevel, LockLevel
                             , ClientType(ClientOther), ClientId(..) )
 import qualified Ganeti.Locking.Waiting as LW
-import Ganeti.Objects (ConfigData, DRBDSecret, LogicalVolume, Ip4Address)
+import Ganeti.Objects ( ConfigData, DRBDSecret, LogicalVolume, Ip4Address
+                      , configMaintenance, maintRoundDelay, maintJobs
+                      , maintBalance, maintBalanceThreshold, maintEvacuated
+                      , Incident, maintIncidents
+                      )
 import Ganeti.Objects.Lens (configClusterL, clusterMasterNodeL)
+import Ganeti.Types (JobId)
 import Ganeti.WConfd.ConfigState (csConfigDataL)
 import qualified Ganeti.WConfd.ConfigVerify as V
 import Ganeti.WConfd.DeathDetection (cleanupLocks)
@@ -165,6 +170,30 @@
 flushConfigGroup :: String -> WConfdMonad ()
 flushConfigGroup = forceConfigStateDistribution . ToGroups . S.singleton
 
+-- *** Access to individual parts of the configuration
+
+-- | Get the configurable value of the maintenance interval
+maintenanceRoundDelay :: WConfdMonad Int
+maintenanceRoundDelay = liftM ( maintRoundDelay . configMaintenance )
+                              CW.readConfig
+
+-- | Get the list of jobs in the state of the maintenance daemon.
+maintenanceJobs :: WConfdMonad [JobId]
+maintenanceJobs = liftM ( maintJobs . configMaintenance ) CW.readConfig
+
+-- | Get the information related to balancing for the maintenance daemon.
+maintenanceBalancing :: WConfdMonad (Bool, Double)
+maintenanceBalancing = liftM ((maintBalance &&& maintBalanceThreshold)
+                              . configMaintenance) CW.readConfig
+
+-- | Get the list of recently evacuated instances.
+maintenanceEvacuated :: WConfdMonad [String]
+maintenanceEvacuated = liftM (maintEvacuated . configMaintenance) CW.readConfig
+
+-- | Get the list of current incidents.
+maintenanceIncidents :: WConfdMonad [Incident]
+maintenanceIncidents = liftM (maintIncidents . configMaintenance) CW.readConfig
+
 -- ** Temporary reservations related functions
 
 dropAllReservations :: ClientId -> WConfdMonad ()
@@ -396,6 +425,11 @@
                     , 'writeConfigAndUnlock
                     , 'flushConfig
                     , 'flushConfigGroup
+                    , 'maintenanceRoundDelay
+                    , 'maintenanceJobs
+                    , 'maintenanceBalancing
+                    , 'maintenanceEvacuated
+                    , 'maintenanceIncidents
                     -- temporary reservations (common)
                     , 'dropAllReservations
                     -- DRBD
diff --git a/src/Ganeti/WConfd/Monad.hs b/src/Ganeti/WConfd/Monad.hs
index fe78e31..b37ab9e 100644
--- a/src/Ganeti/WConfd/Monad.hs
+++ b/src/Ganeti/WConfd/Monad.hs
@@ -69,19 +69,20 @@
   , DistributionTarget(..)
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Arrow ((&&&), second)
 import Control.Concurrent (forkIO, myThreadId)
 import Control.Exception.Lifted (bracket)
 import Control.Monad
 import Control.Monad.Base
-import Control.Monad.Error
 import Control.Monad.Reader
 import Control.Monad.State
 import Control.Monad.Trans.Control
 import Data.Functor.Identity
 import Data.IORef.Lifted
-import Data.Monoid (Any(..), Monoid(..))
+import Data.Monoid (Any(..))
 import qualified Data.Set as S
 import Data.Tuple (swap)
 import System.Posix.Process (getProcessID)
diff --git a/src/Ganeti/WConfd/Persistent.hs b/src/Ganeti/WConfd/Persistent.hs
index 48b8330..dc0bc63 100644
--- a/src/Ganeti/WConfd/Persistent.hs
+++ b/src/Ganeti/WConfd/Persistent.hs
@@ -46,7 +46,7 @@
   , persistentTempRes
   ) where
 
-import Control.Monad.Error
+import Control.Monad.Error.Class (catchError)
 import System.Directory (doesFileExist)
 import qualified Text.JSON as J
 
diff --git a/src/Ganeti/WConfd/Server.hs b/src/Ganeti/WConfd/Server.hs
index b226d09..1c2ef83 100644
--- a/src/Ganeti/WConfd/Server.hs
+++ b/src/Ganeti/WConfd/Server.hs
@@ -43,7 +43,6 @@
 import Control.Concurrent (forkIO)
 import Control.Exception
 import Control.Monad
-import Control.Monad.Error
 
 import Ganeti.BasicTypes
 import qualified Ganeti.Constants as C
@@ -88,8 +87,8 @@
   conf_file <- Path.clusterConfFile
 
   dh <- toErrorBase
-        . withErrorT (strMsg . ("Initialization of the daemon failed" ++)
-                             . formatError) $ do
+        . withErrorT (mkFromString . ("Initialization of the daemon failed" ++)
+                                   . formatError) $ do
     ents <- getEnts
     (cdata, cstat) <- loadConfigFromFile conf_file
     verifyConfigErr cdata
diff --git a/src/Ganeti/WConfd/TempRes.hs b/src/Ganeti/WConfd/TempRes.hs
index 565fae2..9c0220d 100644
--- a/src/Ganeti/WConfd/TempRes.hs
+++ b/src/Ganeti/WConfd/TempRes.hs
@@ -73,9 +73,11 @@
   , reserved
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Lens.At
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError(..))
 import Control.Monad.State
 import Control.Monad.Trans.Maybe
 import qualified Data.ByteString as BS
diff --git a/src/ganeti-maintd.hs b/src/ganeti-maintd.hs
new file mode 100644
index 0000000..caa76fc
--- /dev/null
+++ b/src/ganeti-maintd.hs
@@ -0,0 +1,47 @@
+{-| Ganeti maintenance agent daemon
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Main (main) where
+
+import Ganeti.Daemon
+import Ganeti.Runtime
+import qualified Ganeti.MaintD.Server as S
+
+-- | Main function.
+main :: IO ()
+main =
+  genericMain GanetiMaintd S.options
+    S.checkMain
+    S.prepMain
+    S.main
diff --git a/test/data/cluster_config_2.16.json b/test/data/cluster_config_2.16.json
new file mode 100644
index 0000000..76e9b4f
--- /dev/null
+++ b/test/data/cluster_config_2.16.json
@@ -0,0 +1,658 @@
+{
+  "cluster": {
+    "beparams": {
+      "default": {
+        "always_failover": false,
+        "auto_balance": true,
+        "maxmem": 128,
+        "minmem": 128,
+        "spindle_use": 1,
+        "vcpus": 1
+      }
+    },
+    "blacklisted_os": [],
+    "candidate_certs": {},
+    "candidate_pool_size": 10,
+    "cluster_name": "cluster.name.example.com",
+    "compression_tools": [
+      "gzip",
+      "gzip-fast",
+      "gzip-slow"
+    ],
+    "ctime": 1343869045.6048839,
+    "data_collectors": {
+      "cpu-avg-load": {
+        "active": true,
+        "interval": 5000000.0
+      },
+      "diskstats": {
+        "active": true,
+        "interval": 5000000.0
+      },
+      "drbd": {
+        "active": true,
+        "interval": 5000000.0
+      },
+      "inst-status-xen": {
+        "active": true,
+        "interval": 5000000.0
+      },
+      "lv": {
+        "active": true,
+        "interval": 5000000.0
+      },
+      "xen-cpu-avg-load": {
+        "active": true,
+        "interval": 5000000.0
+      }
+    },
+    "default_iallocator": "hail",
+    "default_iallocator_params": {},
+    "disk_state_static": {},
+    "diskparams": {
+      "blockdev": {},
+      "diskless": {},
+      "drbd": {
+        "c-delay-target": 1,
+        "c-fill-target": 200,
+        "c-max-rate": 2048,
+        "c-min-rate": 1024,
+        "c-plan-ahead": 1,
+        "data-stripes": 2,
+        "disk-barriers": "bf",
+        "disk-custom": "",
+        "dynamic-resync": false,
+        "meta-barriers": true,
+        "meta-stripes": 2,
+        "metavg": "xenvg",
+        "net-custom": "",
+        "protocol": "C",
+        "resync-rate": 1024
+      },
+      "ext": {
+        "access": "kernelspace"
+      },
+      "file": {},
+      "gluster": {
+        "access": "kernelspace",
+        "host": "127.0.0.1",
+        "port": 24007,
+        "volume": "gv0"
+      },
+      "plain": {
+        "stripes": 2
+      },
+      "rbd": {
+        "access": "kernelspace",
+        "pool": "rbd"
+      },
+      "sharedfile": {}
+    },
+    "drbd_usermode_helper": "/bin/true",
+    "enabled_disk_templates": [
+      "drbd",
+      "plain",
+      "file",
+      "sharedfile"
+    ],
+    "enabled_hypervisors": [
+      "xen-pvm"
+    ],
+    "enabled_user_shutdown": false,
+    "file_storage_dir": "",
+    "gluster_storage_dir": "",
+    "hidden_os": [],
+    "highest_used_port": 32105,
+    "hv_state_static": {
+      "xen-pvm": {
+        "cpu_node": 1,
+        "cpu_total": 1,
+        "mem_hv": 0,
+        "mem_node": 0,
+        "mem_total": 0
+      }
+    },
+    "hvparams": {
+      "chroot": {
+        "init_script": "/ganeti-chroot"
+      },
+      "fake": {
+        "migration_mode": "live"
+      },
+      "kvm": {
+        "acpi": true,
+        "boot_order": "disk",
+        "cdrom2_image_path": "",
+        "cdrom_disk_type": "",
+        "cdrom_image_path": "",
+        "cpu_cores": 0,
+        "cpu_mask": "all",
+        "cpu_sockets": 0,
+        "cpu_threads": 0,
+        "cpu_type": "",
+        "disk_aio": "threads",
+        "disk_cache": "default",
+        "disk_type": "paravirtual",
+        "floppy_image_path": "",
+        "initrd_path": "",
+        "kernel_args": "ro",
+        "kernel_path": "/boot/vmlinuz-kvmU",
+        "keymap": "",
+        "kvm_extra": "",
+        "kvm_flag": "",
+        "kvm_path": "/usr/bin/kvm",
+        "machine_version": "",
+        "mem_path": "",
+        "migration_bandwidth": 4,
+        "migration_caps": "",
+        "migration_downtime": 30,
+        "migration_mode": "live",
+        "migration_port": 4041,
+        "nic_type": "paravirtual",
+        "reboot_behavior": "reboot",
+        "root_path": "/dev/vda1",
+        "security_domain": "",
+        "security_model": "none",
+        "serial_console": true,
+        "serial_speed": 38400,
+        "soundhw": "",
+        "spice_bind": "",
+        "spice_image_compression": "",
+        "spice_ip_version": 0,
+        "spice_jpeg_wan_compression": "",
+        "spice_password_file": "",
+        "spice_playback_compression": true,
+        "spice_streaming_video": "",
+        "spice_tls_ciphers": "HIGH:-DES:-3DES:-EXPORT:-ADH",
+        "spice_use_tls": false,
+        "spice_use_vdagent": true,
+        "spice_zlib_glz_wan_compression": "",
+        "usb_devices": "",
+        "usb_mouse": "",
+        "use_chroot": false,
+        "use_localtime": false,
+        "user_shutdown": false,
+        "vga": "",
+        "vhost_net": false,
+        "virtio_net_queues": 1,
+        "vnc_bind_address": "",
+        "vnc_password_file": "",
+        "vnc_tls": false,
+        "vnc_x509_path": "",
+        "vnc_x509_verify": false,
+        "vnet_hdr": true
+      },
+      "lxc": {
+        "cpu_mask": "",
+        "devices": "c 1:3 rw,c 1:5 rw,c 1:7 rw,c 1:8 rw,c 1:9 rw,c 1:10 rw,c 5:0 rw,c 5:1 rw,c 5:2 rw,c 136:* rw",
+        "drop_capabilities": "mac_override,sys_boot,sys_module,sys_time,sys_admin",
+        "extra_cgroups": "",
+        "extra_config": "",
+        "lxc_cgroup_use": "",
+        "lxc_devices": "c 1:3 rw,c 1:5 rw,c 1:7 rw,c 1:8 rw,c 1:9 rw,c 1:10 rw,c 5:0 rw,c 5:1 rw,c 5:2 rw,c 136:* rw",
+        "lxc_drop_capabilities": "mac_override,sys_boot,sys_module,sys_time",
+        "lxc_extra_config": "",
+        "lxc_startup_wait": 30,
+        "lxc_tty": 6,
+        "num_ttys": 6,
+        "startup_timeout": 30
+      },
+      "xen-hvm": {
+        "acpi": true,
+        "blockdev_prefix": "hd",
+        "boot_order": "cd",
+        "cdrom_image_path": "",
+        "cpu_cap": 0,
+        "cpu_mask": "all",
+        "cpu_weight": 256,
+        "cpuid": "",
+        "device_model": "/usr/lib/xen/bin/qemu-dm",
+        "disk_type": "paravirtual",
+        "kernel_path": "/usr/lib/xen/boot/hvmloader",
+        "migration_mode": "non-live",
+        "migration_port": 8082,
+        "nic_type": "rtl8139",
+        "pae": true,
+        "pci_pass": "",
+        "reboot_behavior": "reboot",
+        "soundhw": "",
+        "use_localtime": false,
+        "vif_script": "",
+        "vif_type": "ioemu",
+        "viridian": false,
+        "vnc_bind_address": "0.0.0.0",
+        "vnc_password_file": "/your/vnc-cluster-password",
+        "xen_cmd": "xm"
+      },
+      "xen-pvm": {
+        "blockdev_prefix": "sd",
+        "bootloader_args": "",
+        "bootloader_path": "",
+        "cpu_cap": 0,
+        "cpu_mask": "all",
+        "cpu_weight": 256,
+        "cpuid": "",
+        "initrd_path": "",
+        "kernel_args": "ro",
+        "kernel_path": "/boot/vmlinuz-xenU",
+        "migration_mode": "live",
+        "migration_port": 8082,
+        "reboot_behavior": "reboot",
+        "root_path": "/dev/xvda1",
+        "soundhw": "",
+        "use_bootloader": false,
+        "vif_script": "",
+        "xen_cmd": "xm"
+      }
+    },
+    "install_image": "",
+    "instance_communication_network": "",
+    "ipolicy": {
+      "disk-templates": [
+        "drbd",
+        "plain",
+        "sharedfile",
+        "file"
+      ],
+      "minmax": [
+        {
+          "max": {
+            "cpu-count": 8,
+            "disk-count": 16,
+            "disk-size": 1048576,
+            "memory-size": 32768,
+            "nic-count": 8,
+            "spindle-use": 12
+          },
+          "min": {
+            "cpu-count": 1,
+            "disk-count": 1,
+            "disk-size": 1024,
+            "memory-size": 128,
+            "nic-count": 1,
+            "spindle-use": 1
+          }
+        }
+      ],
+      "spindle-ratio": 32.0,
+      "std": {
+        "cpu-count": 1,
+        "disk-count": 1,
+        "disk-size": 1024,
+        "memory-size": 128,
+        "nic-count": 1,
+        "spindle-use": 1
+      },
+      "vcpu-ratio": 1.0
+    },
+    "mac_prefix": "aa:bb:cc",
+    "maintain_node_health": false,
+    "master_ip": "192.0.2.87",
+    "master_netdev": "eth0",
+    "master_netmask": 32,
+    "master_node": "9a12d554-75c0-4cb1-8064-103365145db0",
+    "max_running_jobs": 20,
+    "max_tracked_jobs": 25,
+    "modify_etc_hosts": true,
+    "modify_ssh_setup": true,
+    "mtime": 1361964122.7947099,
+    "ndparams": {
+      "cpu_speed": 1.0,
+      "exclusive_storage": false,
+      "oob_program": "",
+      "ovs": false,
+      "ovs_link": "",
+      "ovs_name": "switch1",
+      "spindle_count": 1,
+      "ssh_port": 22
+    },
+    "nicparams": {
+      "default": {
+        "link": "br974",
+        "mode": "bridged",
+        "vlan": ""
+      }
+    },
+    "os_hvp": {
+      "TEMP-Ganeti-QA-OS": {
+        "xen-hvm": {
+          "acpi": false,
+          "pae": true
+        },
+        "xen-pvm": {
+          "root_path": "/dev/sda5"
+        }
+      }
+    },
+    "osparams": {},
+    "osparams_private_cluster": {},
+    "prealloc_wipe_disks": false,
+    "primary_ip_family": 2,
+    "reserved_lvs": [],
+    "rsahostkeypub": "YOURKEY",
+    "serial_no": 3189,
+    "shared_file_storage_dir": "/srv/ganeti/shared-file-storage",
+    "ssh_key_bits": 1024,
+    "ssh_key_type": "dsa",
+    "tags": [
+      "mytag"
+    ],
+    "tcpudp_port_pool": [
+      32104,
+      32105,
+      32101,
+      32102,
+      32103
+    ],
+    "uid_pool": [],
+    "use_external_mip_script": false,
+    "uuid": "dddf8c12-f2d8-4718-a35b-7804daf12a3f",
+    "volume_group_name": "xenvg",
+    "zeroing_image": ""
+  },
+  "ctime": 1343869045.6055231,
+  "disks": {
+    "150bd154-8e23-44d1-b762-5065ae5a507b": {
+      "ctime": 1354038435.343601,
+      "dev_type": "plain",
+      "iv_name": "disk/0",
+      "logical_id": [
+        "xenvg",
+        "b27a576a-13f7-4f07-885c-63fcad4fdfcc.disk0"
+      ],
+      "mode": "rw",
+      "mtime": 1354038435.343601,
+      "nodes": [
+        "2ae3d962-2dad-44f2-bdb1-85f77107f907"
+      ],
+      "params": {},
+      "serial_no": 1,
+      "size": 1280,
+      "uuid": "150bd154-8e23-44d1-b762-5065ae5a507b"
+    },
+    "77ced3a5-6756-49ae-8d1f-274e27664c05": {
+      "children": [
+        {
+          "ctime": 1421677173.7280669,
+          "dev_type": "plain",
+          "logical_id": [
+            "xenvg",
+            "5c390722-6a7a-4bb4-9cef-98d896a8e6b1.disk0_data"
+          ],
+          "mtime": 1421677173.7280591,
+          "nodes": [
+            "9a12d554-75c0-4cb1-8064-103365145db0",
+            "41f9c238-173c-4120-9e41-04ad379b647a"
+          ],
+          "params": {},
+          "serial_no": 1,
+          "size": 1024
+        },
+        {
+          "ctime": 1421677173.728096,
+          "dev_type": "plain",
+          "logical_id": [
+            "xenvg",
+            "5c390722-6a7a-4bb4-9cef-98d896a8e6b1.disk0_meta"
+          ],
+          "mtime": 1421677173.7280879,
+          "nodes": [
+            "9a12d554-75c0-4cb1-8064-103365145db0",
+            "41f9c238-173c-4120-9e41-04ad379b647a"
+          ],
+          "params": {},
+          "serial_no": 1,
+          "size": 128
+        }
+      ],
+      "ctime": 1363620258.6089759,
+      "dev_type": "drbd",
+      "iv_name": "disk/0",
+      "logical_id": [
+        "9a12d554-75c0-4cb1-8064-103365145db0",
+        "41f9c238-173c-4120-9e41-04ad379b647a",
+        32100,
+        0,
+        0,
+        "d3c3fd475fcbaf5fd177fb245ac43b71247ada38"
+      ],
+      "mode": "rw",
+      "mtime": 1363620258.6089759,
+      "nodes": [
+        "9a12d554-75c0-4cb1-8064-103365145db0",
+        "41f9c238-173c-4120-9e41-04ad379b647a"
+      ],
+      "params": {},
+      "serial_no": 1,
+      "size": 1024,
+      "uuid": "77ced3a5-6756-49ae-8d1f-274e27664c05"
+    },
+    "79acf611-be58-4334-9fe4-4f2b73ae8abb": {
+      "ctime": 1355186880.4511809,
+      "dev_type": "plain",
+      "iv_name": "disk/0",
+      "logical_id": [
+        "xenvg",
+        "3e559cd7-1024-4294-a923-a9fd13182b2f.disk0"
+      ],
+      "mode": "rw",
+      "mtime": 1355186880.4511809,
+      "nodes": [
+        "41f9c238-173c-4120-9e41-04ad379b647a"
+      ],
+      "params": {},
+      "serial_no": 1,
+      "size": 102400,
+      "uuid": "79acf611-be58-4334-9fe4-4f2b73ae8abb"
+    }
+  },
+  "filters": {},
+  "instances": {
+    "4e091bdc-e205-4ed7-8a47-0c9130a6619f": {
+      "admin_state": "up",
+      "admin_state_source": "admin",
+      "beparams": {},
+      "ctime": 1354038435.343601,
+      "disks": [
+        "150bd154-8e23-44d1-b762-5065ae5a507b"
+      ],
+      "disks_active": true,
+      "hvparams": {},
+      "hypervisor": "xen-pvm",
+      "mtime": 1354224585.700732,
+      "name": "instance3.example.com",
+      "nics": [
+        {
+          "mac": "aa:bb:cc:5e:5c:75",
+          "nicparams": {},
+          "uuid": "1ab090c1-e017-406c-afb4-fc285cb43e31"
+        }
+      ],
+      "os": "debian-image",
+      "osparams": {},
+      "osparams_private": {},
+      "primary_node": "2ae3d962-2dad-44f2-bdb1-85f77107f907",
+      "serial_no": 4,
+      "tags": [],
+      "uuid": "4e091bdc-e205-4ed7-8a47-0c9130a6619f"
+    },
+    "6c078d22-3eb6-4780-857d-81772e09eef1": {
+      "admin_state": "up",
+      "admin_state_source": "admin",
+      "beparams": {},
+      "ctime": 1363620258.6089759,
+      "disks": [
+        "77ced3a5-6756-49ae-8d1f-274e27664c05"
+      ],
+      "disks_active": true,
+      "hvparams": {},
+      "hypervisor": "xen-pvm",
+      "mtime": 1363620320.8749011,
+      "name": "instance1.example.com",
+      "nics": [
+        {
+          "mac": "aa:bb:cc:b2:6e:0b",
+          "nicparams": {},
+          "uuid": "2c953d72-fac4-4aa9-a225-4131bb271791"
+        }
+      ],
+      "os": "busybox",
+      "osparams": {},
+      "osparams_private": {},
+      "primary_node": "9a12d554-75c0-4cb1-8064-103365145db0",
+      "serial_no": 2,
+      "uuid": "6c078d22-3eb6-4780-857d-81772e09eef1"
+    },
+    "8fde9f6d-e1f1-4850-9e9c-154966f622f5": {
+      "admin_state": "up",
+      "admin_state_source": "admin",
+      "beparams": {},
+      "ctime": 1355186880.4511809,
+      "disks": [
+        "79acf611-be58-4334-9fe4-4f2b73ae8abb"
+      ],
+      "disks_active": true,
+      "hvparams": {},
+      "hypervisor": "xen-pvm",
+      "mtime": 1355186898.307642,
+      "name": "instance2.example.com",
+      "nics": [
+        {
+          "mac": "aa:bb:cc:56:83:fb",
+          "nicparams": {},
+          "uuid": "1cf95562-e676-4fd0-8214-e8b84a2f7bd1"
+        }
+      ],
+      "os": "debian-image",
+      "osparams": {},
+      "osparams_private": {},
+      "primary_node": "41f9c238-173c-4120-9e41-04ad379b647a",
+      "serial_no": 2,
+      "tags": [],
+      "uuid": "8fde9f6d-e1f1-4850-9e9c-154966f622f5"
+    }
+  },
+  "mtime": 1421677173.729104,
+  "networks": {
+    "99f0128a-1c84-44da-90b9-9581ea00c075": {
+      "ext_reservations": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+      "name": "a network",
+      "network": "203.0.113.0/24",
+      "reservations": "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
+      "serial_no": 1,
+      "uuid": "99f0128a-1c84-44da-90b9-9581ea00c075"
+    }
+  },
+  "nodegroups": {
+    "5244a46d-7506-4e14-922d-02b58153dde1": {
+      "alloc_policy": "preferred",
+      "diskparams": {},
+      "ipolicy": {},
+      "mtime": 1361963775.5750091,
+      "name": "default",
+      "ndparams": {},
+      "networks": {},
+      "serial_no": 125,
+      "tags": [],
+      "uuid": "5244a46d-7506-4e14-922d-02b58153dde1"
+    },
+    "6c0a8916-b719-45ad-95dd-82192b1e473f": {
+      "alloc_policy": "preferred",
+      "diskparams": {},
+      "ipolicy": {
+        "disk-templates": [
+          "plain"
+        ],
+        "minmax": [
+          {
+            "max": {
+              "cpu-count": 8,
+              "disk-count": 16,
+              "disk-size": 1048576,
+              "memory-size": 32768,
+              "nic-count": 18,
+              "spindle-use": 14
+            },
+            "min": {
+              "cpu-count": 2,
+              "disk-count": 2,
+              "disk-size": 1024,
+              "memory-size": 128,
+              "nic-count": 1,
+              "spindle-use": 1
+            }
+          }
+        ],
+        "spindle-ratio": 5.2000000000000002,
+        "vcpu-ratio": 3.1400000000000001
+      },
+      "mtime": 1361963775.5750091,
+      "name": "another",
+      "ndparams": {
+        "exclusive_storage": true
+      },
+      "networks": {},
+      "serial_no": 125,
+      "tags": [],
+      "uuid": "6c0a8916-b719-45ad-95dd-82192b1e473f"
+    }
+  },
+  "nodes": {
+    "2ae3d962-2dad-44f2-bdb1-85f77107f907": {
+      "ctime": 1343869045.6048839,
+      "drained": false,
+      "group": "5244a46d-7506-4e14-922d-02b58153dde1",
+      "master_candidate": true,
+      "master_capable": true,
+      "mtime": 1358348755.779906,
+      "name": "node2.example.com",
+      "ndparams": {},
+      "offline": false,
+      "powered": true,
+      "primary_ip": "192.0.2.83",
+      "secondary_ip": "198.51.100.83",
+      "serial_no": 6,
+      "tags": [],
+      "uuid": "2ae3d962-2dad-44f2-bdb1-85f77107f907",
+      "vm_capable": true
+    },
+    "41f9c238-173c-4120-9e41-04ad379b647a": {
+      "ctime": 1343869205.9348071,
+      "drained": false,
+      "group": "5244a46d-7506-4e14-922d-02b58153dde1",
+      "master_candidate": true,
+      "master_capable": true,
+      "mtime": 1353019704.8853681,
+      "name": "node3.example.com",
+      "ndparams": {},
+      "offline": false,
+      "powered": true,
+      "primary_ip": "192.0.2.84",
+      "secondary_ip": "198.51.100.84",
+      "serial_no": 2,
+      "tags": [],
+      "uuid": "41f9c238-173c-4120-9e41-04ad379b647a",
+      "vm_capable": true
+    },
+    "9a12d554-75c0-4cb1-8064-103365145db0": {
+      "ctime": 1349722460.022264,
+      "drained": false,
+      "group": "5244a46d-7506-4e14-922d-02b58153dde1",
+      "master_candidate": true,
+      "master_capable": true,
+      "mtime": 1359986533.3533289,
+      "name": "node1.example.com",
+      "ndparams": {},
+      "offline": false,
+      "powered": true,
+      "primary_ip": "192.0.2.82",
+      "secondary_ip": "198.51.100.82",
+      "serial_no": 197,
+      "tags": [],
+      "uuid": "9a12d554-75c0-4cb1-8064-103365145db0",
+      "vm_capable": true
+    }
+  },
+  "serial_no": 7627,
+  "version": 2160000
+}
diff --git a/test/data/cluster_config_2.17.json b/test/data/cluster_config_2.17.json
new file mode 100644
index 0000000..65204b9
--- /dev/null
+++ b/test/data/cluster_config_2.17.json
@@ -0,0 +1,669 @@
+{
+  "cluster": {
+    "beparams": {
+      "default": {
+        "always_failover": false,
+        "auto_balance": true,
+        "maxmem": 128,
+        "minmem": 128,
+        "spindle_use": 1,
+        "vcpus": 1
+      }
+    },
+    "blacklisted_os": [],
+    "candidate_certs": {},
+    "candidate_pool_size": 10,
+    "cluster_name": "cluster.name.example.com",
+    "compression_tools": [
+      "gzip",
+      "gzip-fast",
+      "gzip-slow"
+    ],
+    "ctime": 1343869045.6048839,
+    "data_collectors": {
+      "cpu-avg-load": {
+        "active": true,
+        "interval": 5000000.0
+      },
+      "diagnose": {
+        "active": true,
+        "interval": 5000000.0
+      },
+      "diskstats": {
+        "active": true,
+        "interval": 5000000.0
+      },
+      "drbd": {
+        "active": true,
+        "interval": 5000000.0
+      },
+      "inst-status-xen": {
+        "active": true,
+        "interval": 5000000.0
+      },
+      "kvm-inst-rss": {
+        "active": true,
+        "interval": 5000000.0
+      },
+      "lv": {
+        "active": true,
+        "interval": 5000000.0
+      },
+      "xen-cpu-avg-load": {
+        "active": true,
+        "interval": 5000000.0
+      }
+    },
+    "default_iallocator": "hail",
+    "default_iallocator_params": {},
+    "diagnose_data_collector_filename": "",
+    "disk_state_static": {},
+    "diskparams": {
+      "blockdev": {},
+      "diskless": {},
+      "drbd": {
+        "c-delay-target": 1,
+        "c-fill-target": 200,
+        "c-max-rate": 2048,
+        "c-min-rate": 1024,
+        "c-plan-ahead": 1,
+        "data-stripes": 2,
+        "disk-barriers": "bf",
+        "disk-custom": "",
+        "dynamic-resync": false,
+        "meta-barriers": true,
+        "meta-stripes": 2,
+        "metavg": "xenvg",
+        "net-custom": "",
+        "protocol": "C",
+        "resync-rate": 1024
+      },
+      "ext": {
+        "access": "kernelspace"
+      },
+      "file": {},
+      "gluster": {
+        "access": "kernelspace",
+        "host": "127.0.0.1",
+        "port": 24007,
+        "volume": "gv0"
+      },
+      "plain": {
+        "stripes": 2
+      },
+      "rbd": {
+        "access": "kernelspace",
+        "pool": "rbd"
+      },
+      "sharedfile": {}
+    },
+    "drbd_usermode_helper": "/bin/true",
+    "enabled_disk_templates": [
+      "drbd",
+      "plain",
+      "file",
+      "sharedfile"
+    ],
+    "enabled_hypervisors": [
+      "xen-pvm"
+    ],
+    "enabled_user_shutdown": false,
+    "file_storage_dir": "",
+    "gluster_storage_dir": "",
+    "hidden_os": [],
+    "highest_used_port": 32105,
+    "hv_state_static": {
+      "xen-pvm": {
+        "cpu_node": 1,
+        "cpu_total": 1,
+        "mem_hv": 0,
+        "mem_node": 0,
+        "mem_total": 0
+      }
+    },
+    "hvparams": {
+      "chroot": {
+        "init_script": "/ganeti-chroot"
+      },
+      "fake": {
+        "migration_mode": "live"
+      },
+      "kvm": {
+        "acpi": true,
+        "boot_order": "disk",
+        "cdrom2_image_path": "",
+        "cdrom_disk_type": "",
+        "cdrom_image_path": "",
+        "cpu_cores": 0,
+        "cpu_mask": "all",
+        "cpu_sockets": 0,
+        "cpu_threads": 0,
+        "cpu_type": "",
+        "disk_aio": "threads",
+        "disk_cache": "default",
+        "disk_type": "paravirtual",
+        "floppy_image_path": "",
+        "initrd_path": "",
+        "kernel_args": "ro",
+        "kernel_path": "/boot/vmlinuz-kvmU",
+        "keymap": "",
+        "kvm_extra": "",
+        "kvm_flag": "",
+        "kvm_path": "/usr/bin/kvm",
+        "machine_version": "",
+        "mem_path": "",
+        "migration_bandwidth": 4,
+        "migration_caps": "",
+        "migration_downtime": 30,
+        "migration_mode": "live",
+        "migration_port": 4041,
+        "nic_type": "paravirtual",
+        "reboot_behavior": "reboot",
+        "root_path": "/dev/vda1",
+        "security_domain": "",
+        "security_model": "none",
+        "serial_console": true,
+        "serial_speed": 38400,
+        "soundhw": "",
+        "spice_bind": "",
+        "spice_image_compression": "",
+        "spice_ip_version": 0,
+        "spice_jpeg_wan_compression": "",
+        "spice_password_file": "",
+        "spice_playback_compression": true,
+        "spice_streaming_video": "",
+        "spice_tls_ciphers": "HIGH:-DES:-3DES:-EXPORT:-ADH",
+        "spice_use_tls": false,
+        "spice_use_vdagent": true,
+        "spice_zlib_glz_wan_compression": "",
+        "usb_devices": "",
+        "usb_mouse": "",
+        "use_chroot": false,
+        "use_localtime": false,
+        "user_shutdown": false,
+        "vga": "",
+        "vhost_net": false,
+        "virtio_net_queues": 1,
+        "vnc_bind_address": "",
+        "vnc_password_file": "",
+        "vnc_tls": false,
+        "vnc_x509_path": "",
+        "vnc_x509_verify": false,
+        "vnet_hdr": true
+      },
+      "lxc": {
+        "cpu_mask": "",
+        "devices": "c 1:3 rw,c 1:5 rw,c 1:7 rw,c 1:8 rw,c 1:9 rw,c 1:10 rw,c 5:0 rw,c 5:1 rw,c 5:2 rw,c 136:* rw",
+        "drop_capabilities": "mac_override,sys_boot,sys_module,sys_time,sys_admin",
+        "extra_cgroups": "",
+        "extra_config": "",
+        "lxc_cgroup_use": "",
+        "lxc_devices": "c 1:3 rw,c 1:5 rw,c 1:7 rw,c 1:8 rw,c 1:9 rw,c 1:10 rw,c 5:0 rw,c 5:1 rw,c 5:2 rw,c 136:* rw",
+        "lxc_drop_capabilities": "mac_override,sys_boot,sys_module,sys_time",
+        "lxc_extra_config": "",
+        "lxc_startup_wait": 30,
+        "lxc_tty": 6,
+        "num_ttys": 6,
+        "startup_timeout": 30
+      },
+      "xen-hvm": {
+        "acpi": true,
+        "blockdev_prefix": "hd",
+        "boot_order": "cd",
+        "cdrom_image_path": "",
+        "cpu_cap": 0,
+        "cpu_mask": "all",
+        "cpu_weight": 256,
+        "cpuid": "",
+        "device_model": "/usr/lib/xen/bin/qemu-dm",
+        "disk_type": "paravirtual",
+        "kernel_path": "/usr/lib/xen/boot/hvmloader",
+        "migration_mode": "non-live",
+        "migration_port": 8082,
+        "nic_type": "rtl8139",
+        "pae": true,
+        "pci_pass": "",
+        "reboot_behavior": "reboot",
+        "soundhw": "",
+        "use_localtime": false,
+        "vif_script": "",
+        "vif_type": "ioemu",
+        "viridian": false,
+        "vnc_bind_address": "0.0.0.0",
+        "vnc_password_file": "/your/vnc-cluster-password",
+        "xen_cmd": "xm"
+      },
+      "xen-pvm": {
+        "blockdev_prefix": "sd",
+        "bootloader_args": "",
+        "bootloader_path": "",
+        "cpu_cap": 0,
+        "cpu_mask": "all",
+        "cpu_weight": 256,
+        "cpuid": "",
+        "initrd_path": "",
+        "kernel_args": "ro",
+        "kernel_path": "/boot/vmlinuz-xenU",
+        "migration_mode": "live",
+        "migration_port": 8082,
+        "reboot_behavior": "reboot",
+        "root_path": "/dev/xvda1",
+        "soundhw": "",
+        "use_bootloader": false,
+        "vif_script": "",
+        "xen_cmd": "xm"
+      }
+    },
+    "install_image": "",
+    "instance_communication_network": "",
+    "ipolicy": {
+      "disk-templates": [
+        "drbd",
+        "plain",
+        "sharedfile",
+        "file"
+      ],
+      "minmax": [
+        {
+          "max": {
+            "cpu-count": 8,
+            "disk-count": 16,
+            "disk-size": 1048576,
+            "memory-size": 32768,
+            "nic-count": 8,
+            "spindle-use": 12
+          },
+          "min": {
+            "cpu-count": 1,
+            "disk-count": 1,
+            "disk-size": 1024,
+            "memory-size": 128,
+            "nic-count": 1,
+            "spindle-use": 1
+          }
+        }
+      ],
+      "spindle-ratio": 32.0,
+      "std": {
+        "cpu-count": 1,
+        "disk-count": 1,
+        "disk-size": 1024,
+        "memory-size": 128,
+        "nic-count": 1,
+        "spindle-use": 1
+      },
+      "vcpu-ratio": 1.0,
+      "memory-ratio": 1.7
+    },
+    "mac_prefix": "aa:bb:cc",
+    "maintain_node_health": false,
+    "master_ip": "192.0.2.87",
+    "master_netdev": "eth0",
+    "master_netmask": 32,
+    "master_node": "9a12d554-75c0-4cb1-8064-103365145db0",
+    "max_running_jobs": 20,
+    "max_tracked_jobs": 25,
+    "modify_etc_hosts": true,
+    "modify_ssh_setup": true,
+    "mtime": 1361964122.7947099,
+    "ndparams": {
+      "cpu_speed": 1.0,
+      "exclusive_storage": false,
+      "oob_program": "",
+      "ovs": false,
+      "ovs_link": "",
+      "ovs_name": "switch1",
+      "spindle_count": 1,
+      "ssh_port": 22
+    },
+    "nicparams": {
+      "default": {
+        "link": "br974",
+        "mode": "bridged",
+        "vlan": ""
+      }
+    },
+    "os_hvp": {
+      "TEMP-Ganeti-QA-OS": {
+        "xen-hvm": {
+          "acpi": false,
+          "pae": true
+        },
+        "xen-pvm": {
+          "root_path": "/dev/sda5"
+        }
+      }
+    },
+    "osparams": {},
+    "osparams_private_cluster": {},
+    "prealloc_wipe_disks": false,
+    "primary_ip_family": 2,
+    "reserved_lvs": [],
+    "rsahostkeypub": "YOURKEY",
+    "serial_no": 3189,
+    "shared_file_storage_dir": "/srv/ganeti/shared-file-storage",
+    "ssh_key_bits": 1024,
+    "ssh_key_type": "dsa",
+    "tags": [
+      "mytag"
+    ],
+    "tcpudp_port_pool": [
+      32104,
+      32105,
+      32101,
+      32102,
+      32103
+    ],
+    "uid_pool": [],
+    "use_external_mip_script": false,
+    "uuid": "dddf8c12-f2d8-4718-a35b-7804daf12a3f",
+    "volume_group_name": "xenvg",
+    "zeroing_image": ""
+  },
+  "ctime": 1343869045.6055231,
+  "disks": {
+    "150bd154-8e23-44d1-b762-5065ae5a507b": {
+      "ctime": 1354038435.343601,
+      "dev_type": "plain",
+      "iv_name": "disk/0",
+      "logical_id": [
+        "xenvg",
+        "b27a576a-13f7-4f07-885c-63fcad4fdfcc.disk0"
+      ],
+      "mode": "rw",
+      "mtime": 1354038435.343601,
+      "nodes": [
+        "2ae3d962-2dad-44f2-bdb1-85f77107f907"
+      ],
+      "params": {},
+      "serial_no": 1,
+      "size": 1280,
+      "uuid": "150bd154-8e23-44d1-b762-5065ae5a507b"
+    },
+    "77ced3a5-6756-49ae-8d1f-274e27664c05": {
+      "children": [
+        {
+          "ctime": 1421677173.7280669,
+          "dev_type": "plain",
+          "logical_id": [
+            "xenvg",
+            "5c390722-6a7a-4bb4-9cef-98d896a8e6b1.disk0_data"
+          ],
+          "mtime": 1421677173.7280591,
+          "nodes": [
+            "9a12d554-75c0-4cb1-8064-103365145db0",
+            "41f9c238-173c-4120-9e41-04ad379b647a"
+          ],
+          "params": {},
+          "serial_no": 1,
+          "size": 1024
+        },
+        {
+          "ctime": 1421677173.728096,
+          "dev_type": "plain",
+          "logical_id": [
+            "xenvg",
+            "5c390722-6a7a-4bb4-9cef-98d896a8e6b1.disk0_meta"
+          ],
+          "mtime": 1421677173.7280879,
+          "nodes": [
+            "9a12d554-75c0-4cb1-8064-103365145db0",
+            "41f9c238-173c-4120-9e41-04ad379b647a"
+          ],
+          "params": {},
+          "serial_no": 1,
+          "size": 128
+        }
+      ],
+      "ctime": 1363620258.6089759,
+      "dev_type": "drbd",
+      "iv_name": "disk/0",
+      "logical_id": [
+        "9a12d554-75c0-4cb1-8064-103365145db0",
+        "41f9c238-173c-4120-9e41-04ad379b647a",
+        32100,
+        0,
+        0,
+        "d3c3fd475fcbaf5fd177fb245ac43b71247ada38"
+      ],
+      "mode": "rw",
+      "mtime": 1363620258.6089759,
+      "nodes": [
+        "9a12d554-75c0-4cb1-8064-103365145db0",
+        "41f9c238-173c-4120-9e41-04ad379b647a"
+      ],
+      "params": {},
+      "serial_no": 1,
+      "size": 1024,
+      "uuid": "77ced3a5-6756-49ae-8d1f-274e27664c05"
+    },
+    "79acf611-be58-4334-9fe4-4f2b73ae8abb": {
+      "ctime": 1355186880.4511809,
+      "dev_type": "plain",
+      "iv_name": "disk/0",
+      "logical_id": [
+        "xenvg",
+        "3e559cd7-1024-4294-a923-a9fd13182b2f.disk0"
+      ],
+      "mode": "rw",
+      "mtime": 1355186880.4511809,
+      "nodes": [
+        "41f9c238-173c-4120-9e41-04ad379b647a"
+      ],
+      "params": {},
+      "serial_no": 1,
+      "size": 102400,
+      "uuid": "79acf611-be58-4334-9fe4-4f2b73ae8abb"
+    }
+  },
+  "filters": {},
+  "instances": {
+    "4e091bdc-e205-4ed7-8a47-0c9130a6619f": {
+      "admin_state": "up",
+      "admin_state_source": "admin",
+      "beparams": {},
+      "ctime": 1354038435.343601,
+      "disks": [
+        "150bd154-8e23-44d1-b762-5065ae5a507b"
+      ],
+      "disks_active": true,
+      "hvparams": {},
+      "hypervisor": "xen-pvm",
+      "mtime": 1354224585.700732,
+      "name": "instance3.example.com",
+      "nics": [
+        {
+          "mac": "aa:bb:cc:5e:5c:75",
+          "nicparams": {},
+          "uuid": "1ab090c1-e017-406c-afb4-fc285cb43e31"
+        }
+      ],
+      "os": "debian-image",
+      "osparams": {},
+      "osparams_private": {},
+      "primary_node": "2ae3d962-2dad-44f2-bdb1-85f77107f907",
+      "serial_no": 4,
+      "tags": [],
+      "uuid": "4e091bdc-e205-4ed7-8a47-0c9130a6619f"
+    },
+    "6c078d22-3eb6-4780-857d-81772e09eef1": {
+      "admin_state": "up",
+      "admin_state_source": "admin",
+      "beparams": {},
+      "ctime": 1363620258.6089759,
+      "disks": [
+        "77ced3a5-6756-49ae-8d1f-274e27664c05"
+      ],
+      "disks_active": true,
+      "hvparams": {},
+      "hypervisor": "xen-pvm",
+      "mtime": 1363620320.8749011,
+      "name": "instance1.example.com",
+      "nics": [
+        {
+          "mac": "aa:bb:cc:b2:6e:0b",
+          "nicparams": {},
+          "uuid": "2c953d72-fac4-4aa9-a225-4131bb271791"
+        }
+      ],
+      "os": "busybox",
+      "osparams": {},
+      "osparams_private": {},
+      "primary_node": "9a12d554-75c0-4cb1-8064-103365145db0",
+      "serial_no": 2,
+      "uuid": "6c078d22-3eb6-4780-857d-81772e09eef1"
+    },
+    "8fde9f6d-e1f1-4850-9e9c-154966f622f5": {
+      "admin_state": "up",
+      "admin_state_source": "admin",
+      "beparams": {},
+      "ctime": 1355186880.4511809,
+      "disks": [
+        "79acf611-be58-4334-9fe4-4f2b73ae8abb"
+      ],
+      "disks_active": true,
+      "hvparams": {},
+      "hypervisor": "xen-pvm",
+      "mtime": 1355186898.307642,
+      "name": "instance2.example.com",
+      "nics": [
+        {
+          "mac": "aa:bb:cc:56:83:fb",
+          "nicparams": {},
+          "uuid": "1cf95562-e676-4fd0-8214-e8b84a2f7bd1"
+        }
+      ],
+      "os": "debian-image",
+      "osparams": {},
+      "osparams_private": {},
+      "primary_node": "41f9c238-173c-4120-9e41-04ad379b647a",
+      "serial_no": 2,
+      "tags": [],
+      "uuid": "8fde9f6d-e1f1-4850-9e9c-154966f622f5"
+    }
+  },
+  "maintenance": {},
+  "mtime": 1421677173.729104,
+  "networks": {
+    "99f0128a-1c84-44da-90b9-9581ea00c075": {
+      "ext_reservations": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+      "name": "a network",
+      "network": "203.0.113.0/24",
+      "reservations": "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
+      "serial_no": 1,
+      "uuid": "99f0128a-1c84-44da-90b9-9581ea00c075"
+    }
+  },
+  "nodegroups": {
+    "5244a46d-7506-4e14-922d-02b58153dde1": {
+      "alloc_policy": "preferred",
+      "diskparams": {},
+      "ipolicy": {},
+      "mtime": 1361963775.5750091,
+      "name": "default",
+      "ndparams": {},
+      "networks": {},
+      "serial_no": 125,
+      "tags": [],
+      "uuid": "5244a46d-7506-4e14-922d-02b58153dde1"
+    },
+    "6c0a8916-b719-45ad-95dd-82192b1e473f": {
+      "alloc_policy": "preferred",
+      "diskparams": {},
+      "ipolicy": {
+        "disk-templates": [
+          "plain"
+        ],
+        "minmax": [
+          {
+            "max": {
+              "cpu-count": 8,
+              "disk-count": 16,
+              "disk-size": 1048576,
+              "memory-size": 32768,
+              "nic-count": 18,
+              "spindle-use": 14
+            },
+            "min": {
+              "cpu-count": 2,
+              "disk-count": 2,
+              "disk-size": 1024,
+              "memory-size": 128,
+              "nic-count": 1,
+              "spindle-use": 1
+            }
+          }
+        ],
+        "spindle-ratio": 5.2000000000000002,
+        "vcpu-ratio": 3.1400000000000001
+      },
+      "mtime": 1361963775.5750091,
+      "name": "another",
+      "ndparams": {
+        "exclusive_storage": true
+      },
+      "networks": {},
+      "serial_no": 125,
+      "tags": [],
+      "uuid": "6c0a8916-b719-45ad-95dd-82192b1e473f"
+    }
+  },
+  "nodes": {
+    "2ae3d962-2dad-44f2-bdb1-85f77107f907": {
+      "ctime": 1343869045.6048839,
+      "drained": false,
+      "group": "5244a46d-7506-4e14-922d-02b58153dde1",
+      "master_candidate": true,
+      "master_capable": true,
+      "mtime": 1358348755.779906,
+      "name": "node2.example.com",
+      "ndparams": {},
+      "offline": false,
+      "powered": true,
+      "primary_ip": "192.0.2.83",
+      "secondary_ip": "198.51.100.83",
+      "serial_no": 6,
+      "tags": [],
+      "uuid": "2ae3d962-2dad-44f2-bdb1-85f77107f907",
+      "vm_capable": true
+    },
+    "41f9c238-173c-4120-9e41-04ad379b647a": {
+      "ctime": 1343869205.9348071,
+      "drained": false,
+      "group": "5244a46d-7506-4e14-922d-02b58153dde1",
+      "master_candidate": true,
+      "master_capable": true,
+      "mtime": 1353019704.8853681,
+      "name": "node3.example.com",
+      "ndparams": {},
+      "offline": false,
+      "powered": true,
+      "primary_ip": "192.0.2.84",
+      "secondary_ip": "198.51.100.84",
+      "serial_no": 2,
+      "tags": [],
+      "uuid": "41f9c238-173c-4120-9e41-04ad379b647a",
+      "vm_capable": true
+    },
+    "9a12d554-75c0-4cb1-8064-103365145db0": {
+      "ctime": 1349722460.022264,
+      "drained": false,
+      "group": "5244a46d-7506-4e14-922d-02b58153dde1",
+      "master_candidate": true,
+      "master_capable": true,
+      "mtime": 1359986533.3533289,
+      "name": "node1.example.com",
+      "ndparams": {},
+      "offline": false,
+      "powered": true,
+      "primary_ip": "192.0.2.82",
+      "secondary_ip": "198.51.100.82",
+      "serial_no": 197,
+      "tags": [],
+      "uuid": "9a12d554-75c0-4cb1-8064-103365145db0",
+      "vm_capable": true
+    }
+  },
+  "serial_no": 7627,
+  "version": 2170000
+}
diff --git a/test/data/htools/dyn1.json b/test/data/htools/dyn1.json
new file mode 100644
index 0000000..ab0e89d
--- /dev/null
+++ b/test/data/htools/dyn1.json
@@ -0,0 +1,38 @@
+[
+  {
+    "node": "node-01-000",
+    "reports": [
+      {
+        "category": null,
+        "data": {
+          "inst-00": 0.1,
+          "inst-01": 0.1,
+          "inst-02": 0.1,
+          "inst-03": 0.1
+        },
+        "format_version": 1,
+        "kind": 0,
+        "name": "xen-cpu-avg-load",
+        "timestamp": 1444910125282702000,
+        "version": "B"
+      }
+    ]
+  },
+  {
+    "node": "node-01-001",
+    "reports": [
+      {
+        "category": null,
+        "data": {
+          "inst-10": 2.0,
+          "inst-11": 2.0
+        },
+        "format_version": 1,
+        "kind": 0,
+        "name": "xen-cpu-avg-load",
+        "timestamp": 1444910125282702000,
+        "version": "B"
+      }
+    ]
+  }
+]
diff --git a/test/data/htools/dyn2.json b/test/data/htools/dyn2.json
new file mode 100644
index 0000000..3a8e2b4
--- /dev/null
+++ b/test/data/htools/dyn2.json
@@ -0,0 +1,64 @@
+[
+  {
+    "node": "node-01-000",
+    "reports": [
+      {
+        "category": null,
+        "data": {
+          "cpu_number": 32,
+          "cpu_total": 0.1,
+          "cpus": []
+        },
+        "format_version": 1,
+        "kind": 0,
+        "name": "cpu-avg-load",
+        "timestamp": 1444910125282702000,
+        "version": "B"
+      },
+      {
+        "category": null,
+        "data": {
+          "inst-00": 256,
+          "inst-01": 256,
+          "inst-02": 256,
+          "inst-03": 256
+        },
+        "format_version": 1,
+        "kind": 0,
+        "name": "kvm-inst-rss",
+        "timestamp": 1444910125282702000,
+        "version": "B"
+      }
+    ]
+  },
+  {
+    "node": "node-01-001",
+    "reports": [
+      {
+        "category": null,
+        "data": {
+          "cpu_number": 32,
+          "cpu_total": 0.3,
+          "cpus": []
+        },
+        "format_version": 1,
+        "kind": 0,
+        "name": "cpu-avg-load",
+        "timestamp": 1444910125282702000,
+        "version": "B"
+      },
+      {
+        "category": null,
+        "data": {
+          "inst-10": 65536,
+          "inst-11": 65536
+        },
+        "format_version": 1,
+        "kind": 0,
+        "name": "kvm-inst-rss",
+        "timestamp": 1444910125282702000,
+        "version": "B"
+      }
+    ]
+  }
+]
diff --git a/test/data/htools/dyn3.json b/test/data/htools/dyn3.json
new file mode 100644
index 0000000..e40c276
--- /dev/null
+++ b/test/data/htools/dyn3.json
@@ -0,0 +1,64 @@
+[
+  {
+    "node": "node-01-000",
+    "reports": [
+      {
+        "category": null,
+        "data": {
+          "cpu_number": 32,
+          "cpu_total": 2.0,
+          "cpus": []
+        },
+        "format_version": 1,
+        "kind": 0,
+        "name": "cpu-avg-load",
+        "timestamp": 1444910125282702000,
+        "version": "B"
+      },
+      {
+        "category": null,
+        "data": {
+          "inst-00": 256,
+          "inst-01": 256,
+          "inst-02": 256,
+          "inst-03": 256
+        },
+        "format_version": 1,
+        "kind": 0,
+        "name": "kvm-inst-rss",
+        "timestamp": 1444910125282702000,
+        "version": "B"
+      }
+    ]
+  },
+  {
+    "node": "node-01-001",
+    "reports": [
+      {
+        "category": null,
+        "data": {
+          "cpu_number": 32,
+          "cpu_total": 0.1,
+          "cpus": []
+        },
+        "format_version": 1,
+        "kind": 0,
+        "name": "cpu-avg-load",
+        "timestamp": 1444910125282702000,
+        "version": "B"
+      },
+      {
+        "category": null,
+        "data": {
+          "inst-10": 65536,
+          "inst-11": 65536
+        },
+        "format_version": 1,
+        "kind": 0,
+        "name": "kvm-inst-rss",
+        "timestamp": 1444910125282702000,
+        "version": "B"
+      }
+    ]
+  }
+]
diff --git a/test/data/htools/hail-alloc-memory-over-commitment.json b/test/data/htools/hail-alloc-memory-over-commitment.json
new file mode 100644
index 0000000..58c3b5d
--- /dev/null
+++ b/test/data/htools/hail-alloc-memory-over-commitment.json
@@ -0,0 +1,204 @@
+{
+  "cluster_tags": [
+    "htools:desiredlocation:power",
+    "htools:nlocation:power"
+  ],
+  "nodegroups": {
+    "uuid-group-1": {
+      "ipolicy": {
+        "std": {
+          "nic-count": 1,
+          "disk-size": 1024,
+          "disk-count": 1,
+          "memory-size": 128,
+          "cpu-count": 1,
+          "spindle-use": 1
+        },
+        "minmax": [
+          {
+            "min": {
+              "nic-count": 1,
+              "disk-size": 128,
+              "disk-count": 1,
+              "memory-size": 128,
+              "cpu-count": 1,
+              "spindle-use": 1
+            },
+            "max": {
+              "nic-count": 8,
+              "disk-size": 1048576,
+              "disk-count": 16,
+              "memory-size": 32768,
+              "cpu-count": 8,
+              "spindle-use": 8
+            }
+          }
+        ],
+        "vcpu-ratio": 4.0,
+        "disk-templates": [
+          "sharedfile",
+          "diskless",
+          "plain",
+          "blockdev",
+          "drbd",
+          "file",
+          "rbd"
+        ],
+        "spindle-ratio": 32.0,
+        "memory-ratio": 2
+      },
+      "networks": [],
+      "alloc_policy": "preferred",
+      "tags": [],
+      "name": "default"
+    }
+  },
+  "cluster_name": "cluster",
+  "instances": {
+    "instance1": {
+      "disks": [
+        {
+          "spindles": 1,
+          "mode": "rw",
+          "size": 51200
+        }
+      ],
+      "disk_space_total": 51200,
+      "hypervisor": "xen-pvm",
+      "tags": [
+        "test:test"
+      ],
+      "nics": [
+        {
+          "ip": null,
+          "mac": "aa:00:00:10:d2:01",
+          "link": "xen-br0",
+          "mode": "bridged",
+          "bridge": "xen-br0"
+        }
+      ],
+      "vcpus": 1,
+      "spindle_use": 1,
+      "admin_state": "up",
+      "admin_state_source": "admin",
+      "disk_template": "drbd",
+      "memory": 1024,
+      "nodes": [
+        "node1",
+        "node2"
+      ],
+      "os": "instance-debootstrap"
+    },
+    "instance2": {
+      "disks": [
+        {
+          "spindles": 1,
+          "mode": "rw",
+          "size": 51200
+        }
+      ],
+      "disk_space_total": 51200,
+      "hypervisor": "xen-pvm",
+      "tags": [
+        "test:test"
+      ],
+      "nics": [
+        {
+          "ip": null,
+          "mac": "aa:00:00:10:d2:01",
+          "link": "xen-br0",
+          "mode": "bridged",
+          "bridge": "xen-br0"
+        }
+      ],
+      "vcpus": 1,
+      "spindle_use": 1,
+      "admin_state": "up",
+      "admin_state_source": "admin",
+      "disk_template": "drbd",
+      "memory": 1024,
+      "nodes": [
+        "node2",
+        "node1"
+      ],
+      "os": "instance-debootstrap"
+    }
+  },
+  "nodes": {
+    "node1": {
+      "total_disk": 307200,
+      "total_cpus": 4,
+      "group": "uuid-group-1",
+      "i_pri_up_memory": 0,
+      "tags": [
+        "power:a"
+      ],
+      "master_candidate": true,
+      "free_memory": 256,
+      "ndparams": {
+        "spindle_count": 1,
+        "oob_program": null,
+        "exclusive_storage": false
+      },
+      "reserved_cpus": 1,
+      "master_capable": true,
+      "free_disk": 307200,
+      "drained": false,
+      "total_memory": 1280,
+      "i_pri_memory": 0,
+      "reserved_memory": 0,
+      "free_spindles": 12,
+      "total_spindles": 12,
+      "vm_capable": true,
+      "offline": false
+    },
+    "node2": {
+      "total_disk": 307200,
+      "total_cpus": 4,
+      "group": "uuid-group-1",
+      "i_pri_up_memory": 0,
+      "tags": [
+        "power:b"
+      ],
+      "master_candidate": true,
+      "free_memory": 256,
+      "ndparams": {
+        "spindle_count": 1,
+        "oob_program": null,
+        "exclusive_storage": false
+      },
+      "reserved_cpus": 1,
+      "master_capable": true,
+      "free_disk": 307200,
+      "drained": false,
+      "total_memory": 1280,
+      "i_pri_memory": 0,
+      "reserved_memory": 0,
+      "free_spindles": 12,
+      "total_spindles": 12,
+      "vm_capable": true,
+      "offline": false
+    }
+  },
+  "request": {
+    "disk_space_total": 0,
+    "disk_template": "drbd",
+    "disks": [
+      {
+        "size": 1024
+      }
+    ],
+    "hypervisor": "xen-pvm",
+    "memory": 256,
+    "name": "instance-new",
+    "nics": [],
+    "os": "instance-debootstrap",
+    "required_nodes": 2,
+    "spindle_use": 1,
+    "tags": [
+      "power:a"
+    ],
+    "type": "allocate",
+    "vcpus": 1
+  }
+}
diff --git a/test/data/htools/hbal-avoid-disk-moves.data b/test/data/htools/hbal-avoid-disk-moves.data
new file mode 100644
index 0000000..41dac29
--- /dev/null
+++ b/test/data/htools/hbal-avoid-disk-moves.data
@@ -0,0 +1,12 @@
+group-01|fake-uuid-01|preferred||
+
+node-01|16384|0|14336|409600|306600|16|N|fake-uuid-01|1|power:a
+node-02|16384|0|16384|409600|357800|16|N|fake-uuid-01|1|power:b
+node-03|16384|0|16384|409600|357800|16|N|fake-uuid-01|1|power:a
+node-04|16384|0|16384|409600|409600|16|N|fake-uuid-01|1|power:b
+
+inst1|1024|51200|1|running|Y|node-01|node-02|drbd|power:a|1
+inst2|1024|51200|1|running|Y|node-01|node-03|drbd|power:a|1
+
+htools:nlocation:power
+htools:desiredlocation:power
diff --git a/test/data/htools/hbal-dyn2.data b/test/data/htools/hbal-dyn2.data
new file mode 100644
index 0000000..619a5d9
--- /dev/null
+++ b/test/data/htools/hbal-dyn2.data
@@ -0,0 +1,15 @@
+group-01|fake-uuid-01|preferred||
+
+node-01-000|552|0|40|3100|3100|32|M|fake-uuid-01|1
+node-01-001|552|0|40|3100|3100|32|N|fake-uuid-01|1
+
+inst-00|128|0|1|running|Y|node-01-000||ext||1
+inst-01|128|0|1|running|Y|node-01-000||ext||1
+inst-02|128|0|1|running|Y|node-01-000||ext||1
+inst-03|128|0|1|running|Y|node-01-000||ext||1
+inst-10|256|0|2|running|Y|node-01-001||ext||1
+inst-11|256|0|2|running|Y|node-01-001||ext||1
+
+
+|128,1,1024,1,1,1|128,1,1024,1,1,1;32768,8,1048576,16,8,12|diskless,file,sharedfile,plain,blockdev,drbd,rbd,ext|4.0|32.0|6.0
+group-01|128,1,1024,1,1,1|128,1,1024,1,1,1;32768,8,1048576,16,8,12|diskless,file,sharedfile,plain,blockdev,drbd,rbd,ext|4.0|32.0|6.0
diff --git a/test/data/htools/hbal-memory-over-commitment-2.data b/test/data/htools/hbal-memory-over-commitment-2.data
new file mode 100644
index 0000000..d9cd6ea
--- /dev/null
+++ b/test/data/htools/hbal-memory-over-commitment-2.data
@@ -0,0 +1,13 @@
+group-01|fake-uuid-01|preferred||
+
+node-01|1024|0|0|409600|256000|16|N|fake-uuid-01|1|power:a
+node-02|1280|0|128|409600|256000|16|N|fake-uuid-01|1|power:b
+
+inst1|1024|51200|1|running|Y|node-01|node-02|drbd|power:a|1
+inst2|1024|51200|1|running|Y|node-02|node-01|drbd|power:b|1
+inst3|128|51200|1|running|Y|node-02|node-01|drbd|power:a|1
+
+htools:desiredlocation:power
+htools:nlocation:power
+
+group-01|128,1,1024,1,1,1|128,1,1024,1,1,1;32768,8,1048576,16,8,12|diskless,file,sharedfile,plain,blockdev,drbd,rbd,ext|4.0|32.0|2.0
diff --git a/test/data/htools/hbal-memory-over-commitment.data b/test/data/htools/hbal-memory-over-commitment.data
new file mode 100644
index 0000000..c307d8f
--- /dev/null
+++ b/test/data/htools/hbal-memory-over-commitment.data
@@ -0,0 +1,13 @@
+group-01|fake-uuid-01|preferred||
+
+node-01|1024|0|0|409600|256000|16|N|fake-uuid-01|1|power:a
+node-02|1280|0|128|409600|256000|16|N|fake-uuid-01|1|power:b
+
+inst1|1024|51200|1|running|Y|node-01|node-02|drbd|power:a|1
+inst2|1024|51200|1|running|Y|node-02|node-01|drbd|power:b|1
+inst3|128|51200|1|running|Y|node-02|node-01|drbd|power:a|1
+
+htools:desiredlocation:power
+htools:nlocation:power
+
+group-01|128,1,1024,1,1,1|128,1,1024,1,1,1;32768,8,1048576,16,8,12|diskless,file,sharedfile,plain,blockdev,drbd,rbd,ext|4.0|32.0|1.0
diff --git a/test/hs/Test/Ganeti/BasicTypes.hs b/test/hs/Test/Ganeti/BasicTypes.hs
index f29d16f..e9ed399 100644
--- a/test/hs/Test/Ganeti/BasicTypes.hs
+++ b/test/hs/Test/Ganeti/BasicTypes.hs
@@ -37,10 +37,12 @@
 
 module Test.Ganeti.BasicTypes (testBasicTypes) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Test.QuickCheck hiding (Result)
 import Test.QuickCheck.Function
 
-import Control.Applicative
 import Control.Monad
 
 import Test.Ganeti.TestHelper
diff --git a/test/hs/Test/Ganeti/Confd/Types.hs b/test/hs/Test/Ganeti/Confd/Types.hs
index 3bc7167..6e7cb29 100644
--- a/test/hs/Test/Ganeti/Confd/Types.hs
+++ b/test/hs/Test/Ganeti/Confd/Types.hs
@@ -42,7 +42,9 @@
   , ConfdReqQ(..)
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Test.QuickCheck
 import Test.HUnit
 import qualified Text.JSON as J
diff --git a/test/hs/Test/Ganeti/HTools/Instance.hs b/test/hs/Test/Ganeti/HTools/Instance.hs
index dcd4b79..84a5f5c 100644
--- a/test/hs/Test/Ganeti/HTools/Instance.hs
+++ b/test/hs/Test/Ganeti/HTools/Instance.hs
@@ -44,8 +44,10 @@
   , Instance.Instance(..)
   ) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Arrow ((&&&))
-import Control.Applicative ((<$>))
 import Control.Monad (liftM)
 import Test.QuickCheck hiding (Result)
 
diff --git a/test/hs/Test/Ganeti/HTools/Node.hs b/test/hs/Test/Ganeti/HTools/Node.hs
index e7f46e2..24d942d 100644
--- a/test/hs/Test/Ganeti/HTools/Node.hs
+++ b/test/hs/Test/Ganeti/HTools/Node.hs
@@ -128,8 +128,11 @@
       let node' = node { Node.offline = False
                        , Node.fMem = fmem
                        , Node.fMemForth = fmem
-                       , Node.pMem = fromIntegral fmem / Node.tMem node
-                       , Node.pMemForth = fromIntegral fmem / Node.tMem node
+                       , Node.pMem = Node.computePmem fmem (Node.tMem node)
+                                                      (Node.nMem node)
+                       , Node.pMemForth = Node.computePmem fmem
+                                                           (Node.tMem node)
+                                                           (Node.nMem node)
                        , Node.rMem = 0
                        , Node.rMemForth = 0
                        , Node.pRem = 0
diff --git a/test/hs/Test/Ganeti/HTools/Types.hs b/test/hs/Test/Ganeti/HTools/Types.hs
index 7708b0a..4136308 100644
--- a/test/hs/Test/Ganeti/HTools/Types.hs
+++ b/test/hs/Test/Ganeti/HTools/Types.hs
@@ -45,10 +45,12 @@
   , nullIPolicy
   ) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Test.QuickCheck hiding (Result)
 import Test.HUnit
 
-import Control.Applicative
 import Control.Monad (replicateM)
 
 import Test.Ganeti.TestHelper
@@ -146,11 +148,13 @@
     dts  <- genUniquesList num_tmpl arbitrary
     vcpu_ratio <- choose (1.0, maxVcpuRatio)
     spindle_ratio <- choose (1.0, maxSpindleRatio)
+    memory_ratio <- choose (1.0, maxMemoryRatio)
     return Types.IPolicy { Types.iPolicyMinMaxISpecs = iminmax
                          , Types.iPolicyStdSpec = istd
                          , Types.iPolicyDiskTemplates = dts
                          , Types.iPolicyVcpuRatio = vcpu_ratio
                          , Types.iPolicySpindleRatio = spindle_ratio
+                         , Types.iPolicyMemoryRatio = memory_ratio
                          }
 
 -- * Test cases
diff --git a/test/hs/Test/Ganeti/JQScheduler.hs b/test/hs/Test/Ganeti/JQScheduler.hs
index 77eb2ac..04a6287 100644
--- a/test/hs/Test/Ganeti/JQScheduler.hs
+++ b/test/hs/Test/Ganeti/JQScheduler.hs
@@ -37,7 +37,9 @@
 
 module Test.Ganeti.JQScheduler (testJQScheduler) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Lens ((&), (.~), _2)
 import qualified Data.ByteString.UTF8 as UTF8
 import Data.List (inits)
@@ -45,7 +47,6 @@
 import qualified Data.Map as Map
 import Data.Set (Set, difference)
 import qualified Data.Set as Set
-import Data.Traversable (traverse)
 import Text.JSON (JSValue(..))
 import Test.HUnit
 import Test.QuickCheck
diff --git a/test/hs/Test/Ganeti/JQueue/Objects.hs b/test/hs/Test/Ganeti/JQueue/Objects.hs
index 13e0f0f..6d56a5d 100644
--- a/test/hs/Test/Ganeti/JQueue/Objects.hs
+++ b/test/hs/Test/Ganeti/JQueue/Objects.hs
@@ -39,7 +39,9 @@
   , genJobId
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Test.QuickCheck as QuickCheck
 import Text.JSON
 
diff --git a/test/hs/Test/Ganeti/Locking/Allocation.hs b/test/hs/Test/Ganeti/Locking/Allocation.hs
index a4ce21b..498d149 100644
--- a/test/hs/Test/Ganeti/Locking/Allocation.hs
+++ b/test/hs/Test/Ganeti/Locking/Allocation.hs
@@ -42,7 +42,9 @@
   , requestSucceeded
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import qualified Data.Foldable as F
 import qualified Data.Map as M
 import Data.Maybe (fromMaybe)
diff --git a/test/hs/Test/Ganeti/Locking/Locks.hs b/test/hs/Test/Ganeti/Locking/Locks.hs
index 732779f..1c992ff 100644
--- a/test/hs/Test/Ganeti/Locking/Locks.hs
+++ b/test/hs/Test/Ganeti/Locking/Locks.hs
@@ -37,7 +37,10 @@
 
 module Test.Ganeti.Locking.Locks (testLocking_Locks) where
 
-import Control.Applicative ((<$>), (<*>), liftA2)
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (liftA2)
 import Control.Monad (liftM)
 import System.Posix.Types (CPid)
 
diff --git a/test/hs/Test/Ganeti/Locking/Waiting.hs b/test/hs/Test/Ganeti/Locking/Waiting.hs
index ee1a6b0..1b06225 100644
--- a/test/hs/Test/Ganeti/Locking/Waiting.hs
+++ b/test/hs/Test/Ganeti/Locking/Waiting.hs
@@ -37,7 +37,10 @@
 
 module Test.Ganeti.Locking.Waiting (testLocking_Waiting) where
 
-import Control.Applicative ((<$>), (<*>), liftA2)
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (liftA2)
 import Control.Monad (liftM)
 import qualified Data.Map as M
 import qualified Data.Set as S
diff --git a/test/hs/Test/Ganeti/Luxi.hs b/test/hs/Test/Ganeti/Luxi.hs
index c269b8c..47736f4 100644
--- a/test/hs/Test/Ganeti/Luxi.hs
+++ b/test/hs/Test/Ganeti/Luxi.hs
@@ -37,12 +37,14 @@
 
 module Test.Ganeti.Luxi (testLuxi) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Test.HUnit
 import Test.QuickCheck
 import Test.QuickCheck.Monadic (monadicIO, run, stop)
 
 import Data.List
-import Control.Applicative
 import Control.Concurrent (forkIO)
 import Control.Exception (bracket)
 import qualified Text.JSON as J
diff --git a/test/hs/Test/Ganeti/Objects.hs b/test/hs/Test/Ganeti/Objects.hs
index 76543da..ea17bc0 100644
--- a/test/hs/Test/Ganeti/Objects.hs
+++ b/test/hs/Test/Ganeti/Objects.hs
@@ -49,11 +49,13 @@
   , genBitStringMaxLen
   ) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Test.QuickCheck
 import qualified Test.HUnit as HUnit
 
-import Control.Applicative
-import Control.Monad
+import Control.Monad (liftM, when)
 import qualified Data.ByteString as BS
 import qualified Data.ByteString.UTF8 as UTF8
 import Data.Char
@@ -91,6 +93,29 @@
     return GenericContainer {
       fromContainer = Map.fromList $ zip names configs }
 
+-- Note: currently only the memory node value is used
+instance Arbitrary PartialHvStateParams where
+  arbitrary = PartialHvStateParams <$> pure Nothing <*> pure Nothing
+              <*> pure Nothing <*> genMaybe (fromPositive <$> arbitrary)
+              <*> pure Nothing
+
+instance Arbitrary PartialHvState where
+  arbitrary = do
+    hv_params <- arbitrary
+    return GenericContainer {
+      fromContainer = Map.fromList [ hv_params ] }
+
+-- Note: currently only the memory node value is used
+instance Arbitrary FilledHvStateParams where
+  arbitrary = FilledHvStateParams <$> pure 0 <*> pure 0 <*> pure 0
+              <*> (fromPositive <$> arbitrary) <*> pure 0
+
+instance Arbitrary FilledHvState where
+  arbitrary = do
+    hv_params <- arbitrary
+    return GenericContainer {
+      fromContainer = Map.fromList [ hv_params ] }
+
 instance Arbitrary BS.ByteString where
   arbitrary = fmap UTF8.fromString arbitrary
 
@@ -389,6 +414,35 @@
     , pure ECDSA
     ]
 
+instance Arbitrary RepairStatus where
+  arbitrary = elements [ RSNoted, RSPending, RSCanceled, RSFailed, RSCompleted ]
+
+instance Arbitrary RepairAction where
+  arbitrary = elements [ RANoop, RALiveRepair, RAEvacuate, RAEvacuateFailover ]
+
+instance Arbitrary Incident where
+  arbitrary = Incident <$> pure (J.JSObject $ J.toJSObject [])
+                       <*> arbitrary
+                       <*> arbitrary
+                       <*> arbitrary
+                       <*> arbitrary
+                       <*> arbitrary
+                       <*> arbitrary
+                       <*> arbitrary
+                       <*> arbitrary
+                       <*> arbitrary
+
+instance Arbitrary MaintenanceData where
+  arbitrary = MaintenanceData <$> (fromPositive <$> arbitrary)
+                              <*> arbitrary
+                              <*> arbitrary
+                              <*> arbitrary
+                              <*> arbitrary
+                              <*> arbitrary
+                              <*> arbitrary
+                              <*> arbitrary
+                              <*> arbitrary
+
 -- | Generates a network instance with minimum netmasks of /24. Generating
 -- bigger networks slows down the tests, because long bit strings are generated
 -- for the reservations.
@@ -445,6 +499,7 @@
       networks = GenericContainer Map.empty
       disks = GenericContainer Map.empty
       filters = GenericContainer Map.empty
+  maintenance <- arbitrary
   let contgroups = GenericContainer $ Map.singleton (UTF8.fromString guuid) grp
   serial <- arbitrary
   -- timestamp fields
@@ -452,7 +507,7 @@
   mtime <- arbitrary
   cluster <- resize 8 arbitrary
   let c = ConfigData version cluster contnodes contgroups continsts networks
-            disks filters ctime mtime serial
+            disks filters ctime maintenance mtime serial
   return c
 
 -- | FIXME: make an even simpler base version of creating a cluster.
diff --git a/test/hs/Test/Ganeti/OpCodes.hs b/test/hs/Test/Ganeti/OpCodes.hs
index 959d803..603c394 100644
--- a/test/hs/Test/Ganeti/OpCodes.hs
+++ b/test/hs/Test/Ganeti/OpCodes.hs
@@ -40,11 +40,13 @@
   , OpCodes.OpCode(..)
   ) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Test.HUnit as HUnit
 import Test.QuickCheck as QuickCheck
 
-import Control.Applicative
-import Control.Monad
+import Control.Monad (when)
 import Data.Char
 import Data.List
 import qualified Data.Map as Map
@@ -256,6 +258,10 @@
           <*> arbitrary                    -- enabled_user_shutdown
           <*> genMaybe arbitraryDataCollector   -- enabled_data_collectors
           <*> arbitraryDataCollectorInterval   -- data_collector_interval
+          <*> genMaybe genName             -- diagnose_data_collector_filename
+          <*> genMaybe (fromPositive <$> arbitrary) -- maintd round interval
+          <*> genMaybe arbitrary           -- enable maintd balancing
+          <*> genMaybe arbitrary           -- maintd balancing threshold
       "OP_CLUSTER_REDIST_CONF" -> pure OpCodes.OpClusterRedistConf
       "OP_CLUSTER_ACTIVATE_MASTER_IP" ->
         pure OpCodes.OpClusterActivateMasterIp
@@ -271,12 +277,13 @@
           arbitrary <*> arbitrary <*> arbitrary <*>
           (arbitrary `suchThat` (>0))
       "OP_NODE_REMOVE" ->
-        OpCodes.OpNodeRemove <$> genNodeNameNE <*> return Nothing
+        OpCodes.OpNodeRemove <$> genNodeNameNE <*> return Nothing <*>
+          arbitrary <*> arbitrary
       "OP_NODE_ADD" ->
         OpCodes.OpNodeAdd <$> genNodeNameNE <*> emptyMUD <*> emptyMUD <*>
           genMaybe genNameNE <*> genMaybe genNameNE <*> arbitrary <*>
           genMaybe genNameNE <*> arbitrary <*> arbitrary <*> emptyMUD <*>
-          arbitrary
+          arbitrary <*> arbitrary <*> arbitrary
       "OP_NODE_QUERYVOLS" ->
         OpCodes.OpNodeQueryvols <$> genNamesNE <*> genNodeNamesNE
       "OP_NODE_QUERY_STORAGE" ->
@@ -292,7 +299,8 @@
         OpCodes.OpNodeSetParams <$> genNodeNameNE <*> return Nothing <*>
           arbitrary <*> emptyMUD <*> emptyMUD <*> arbitrary <*> arbitrary <*>
           arbitrary <*> arbitrary <*> arbitrary <*> arbitrary <*>
-          genMaybe genNameNE <*> emptyMUD <*> arbitrary
+          genMaybe genNameNE <*> emptyMUD <*> arbitrary <*> arbitrary <*>
+          arbitrary
       "OP_NODE_POWERCYCLE" ->
         OpCodes.OpNodePowercycle <$> genNodeNameNE <*> return Nothing <*>
           arbitrary
@@ -517,6 +525,9 @@
       "OP_RESTRICTED_COMMAND" ->
         OpCodes.OpRestrictedCommand <$> arbitrary <*> genNodeNamesNE <*>
           return Nothing <*> genNameNE
+      "OP_REPAIR_COMMAND" ->
+        OpCodes.OpRepairCommand <$> genNodeNameNE <*> genNameNE <*>
+          genMaybe genPrintableAsciiStringNE
       _ -> fail $ "Undefined arbitrary for opcode " ++ op_id
 
 instance Arbitrary OpCodes.CommonOpParams where
diff --git a/test/hs/Test/Ganeti/Query/Language.hs b/test/hs/Test/Ganeti/Query/Language.hs
index 9556bc3..677990a 100644
--- a/test/hs/Test/Ganeti/Query/Language.hs
+++ b/test/hs/Test/Ganeti/Query/Language.hs
@@ -41,10 +41,12 @@
   , genJSValue
   ) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Test.HUnit (Assertion, assertEqual)
 import Test.QuickCheck
 
-import Control.Applicative
 import Control.Arrow (second)
 import Text.JSON
 
diff --git a/test/hs/Test/Ganeti/Rpc.hs b/test/hs/Test/Ganeti/Rpc.hs
index 8205cc1..bdb83ac 100644
--- a/test/hs/Test/Ganeti/Rpc.hs
+++ b/test/hs/Test/Ganeti/Rpc.hs
@@ -37,10 +37,12 @@
 
 module Test.Ganeti.Rpc (testRpc) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Test.QuickCheck
 import Test.QuickCheck.Monadic (monadicIO, run, stop)
 
-import Control.Applicative
 import qualified Data.Map as Map
 
 import Test.Ganeti.TestHelper
diff --git a/test/hs/Test/Ganeti/Runtime.hs b/test/hs/Test/Ganeti/Runtime.hs
index ee48e0e..3e49dd6 100644
--- a/test/hs/Test/Ganeti/Runtime.hs
+++ b/test/hs/Test/Ganeti/Runtime.hs
@@ -97,6 +97,7 @@
               \         constants.KVMD_USER,\n\
               \         constants.LUXID_USER,\n\
               \         constants.MOND_USER,\n\
+              \         constants.MOND_USER,\n\
               \        ]\n\
               \groups = [constants.MASTERD_GROUP,\n\
               \          constants.METAD_GROUP,\n\
@@ -107,6 +108,7 @@
               \          constants.KVMD_GROUP,\n\
               \          constants.LUXID_GROUP,\n\
               \          constants.MOND_GROUP,\n\
+              \          constants.MOND_GROUP,\n\
               \          constants.DAEMONS_GROUP,\n\
               \          constants.ADMIN_GROUP,\n\
               \         ]\n\
diff --git a/test/hs/Test/Ganeti/SlotMap.hs b/test/hs/Test/Ganeti/SlotMap.hs
index 295240d..7897c72 100644
--- a/test/hs/Test/Ganeti/SlotMap.hs
+++ b/test/hs/Test/Ganeti/SlotMap.hs
@@ -42,16 +42,15 @@
   , overfullKeys
   ) where
 
-import Prelude hiding (all)
+import Prelude ()
+import Ganeti.Prelude hiding (all)
 
-import Control.Applicative
 import Control.Monad
 import Data.Foldable (all)
 import qualified Data.Map as Map
 import Data.Map (Map, member, keys, keysSet)
 import Data.Set (Set, size, union)
 import qualified Data.Set as Set
-import Data.Traversable (traverse)
 import Test.HUnit
 import Test.QuickCheck
 
diff --git a/test/hs/Test/Ganeti/Storage/Diskstats/Parser.hs b/test/hs/Test/Ganeti/Storage/Diskstats/Parser.hs
index 8193ae9..4a63b02 100644
--- a/test/hs/Test/Ganeti/Storage/Diskstats/Parser.hs
+++ b/test/hs/Test/Ganeti/Storage/Diskstats/Parser.hs
@@ -35,13 +35,15 @@
 
 module Test.Ganeti.Storage.Diskstats.Parser (testBlock_Diskstats_Parser) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Test.QuickCheck as QuickCheck hiding (Result)
 import Test.HUnit
 
 import Test.Ganeti.TestHelper
 import Test.Ganeti.TestCommon
 
-import Control.Applicative ((<*>), (<$>))
 import qualified Data.Attoparsec.Text as A
 import Data.Text (pack)
 import Text.Printf
diff --git a/test/hs/Test/Ganeti/Storage/Lvm/LVParser.hs b/test/hs/Test/Ganeti/Storage/Lvm/LVParser.hs
index 9a00799..bb1ec64 100644
--- a/test/hs/Test/Ganeti/Storage/Lvm/LVParser.hs
+++ b/test/hs/Test/Ganeti/Storage/Lvm/LVParser.hs
@@ -35,13 +35,15 @@
 
 module Test.Ganeti.Storage.Lvm.LVParser (testStorage_Lvm_LVParser) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Test.QuickCheck as QuickCheck hiding (Result)
 import Test.HUnit
 
 import Test.Ganeti.TestHelper
 import Test.Ganeti.TestCommon
 
-import Control.Applicative ((<$>), (<*>))
 import Data.List (intercalate)
 
 import Ganeti.Storage.Lvm.LVParser
diff --git a/test/hs/Test/Ganeti/TestCommon.hs b/test/hs/Test/Ganeti/TestCommon.hs
index bcd8421..43595df 100644
--- a/test/hs/Test/Ganeti/TestCommon.hs
+++ b/test/hs/Test/Ganeti/TestCommon.hs
@@ -41,6 +41,7 @@
   , maxCpu
   , maxSpindles
   , maxVcpuRatio
+  , maxMemoryRatio
   , maxSpindleRatio
   , maxNodes
   , maxOpCodes
@@ -92,9 +93,11 @@
   , counterexample
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Control.Exception (catchJust)
-import Control.Monad
+import Control.Monad (guard, liftM, foldM)
 import Data.Attoparsec.Text (Parser, parseOnly)
 import Data.List
 import qualified Data.Map as M
@@ -154,6 +157,10 @@
 maxSpindleRatio :: Double
 maxSpindleRatio = 1024.0
 
+-- | Max memory ratio (random value).
+maxMemoryRatio :: Double
+maxMemoryRatio = 1024.0
+
 -- | Max nodes, used just to limit arbitrary instances for smaller
 -- opcode definitions (e.g. list of nodes in OpTestDelay).
 maxNodes :: Int
diff --git a/test/hs/Test/Ganeti/TestHTools.hs b/test/hs/Test/Ganeti/TestHTools.hs
index e2ec6a5..92fef8d 100644
--- a/test/hs/Test/Ganeti/TestHTools.hs
+++ b/test/hs/Test/Ganeti/TestHTools.hs
@@ -94,6 +94,9 @@
   , Types.iPolicyVcpuRatio = maxVcpuRatio -- somewhat random value, high
                                           -- enough to not impact us
   , Types.iPolicySpindleRatio = maxSpindleRatio
+  , Types.iPolicyMemoryRatio = 1 -- because several tests would become
+                                 -- meaningless in case of memory
+                                 -- over-commitment
   }
 
 -- | Default group definition.
diff --git a/test/hs/Test/Ganeti/TestHelper.hs b/test/hs/Test/Ganeti/TestHelper.hs
index 399ad58..01be610 100644
--- a/test/hs/Test/Ganeti/TestHelper.hs
+++ b/test/hs/Test/Ganeti/TestHelper.hs
@@ -39,7 +39,9 @@
   , genArbitrary
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import Data.List (stripPrefix, isPrefixOf)
 import Data.Maybe (fromMaybe)
 import Test.Framework
diff --git a/test/hs/Test/Ganeti/Types.hs b/test/hs/Test/Ganeti/Types.hs
index 12f957a..5ce6dae 100644
--- a/test/hs/Test/Ganeti/Types.hs
+++ b/test/hs/Test/Ganeti/Types.hs
@@ -47,7 +47,9 @@
   , genReasonTrail
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import System.Time (ClockTime(..))
 
 import Test.QuickCheck as QuickCheck hiding (Result)
diff --git a/test/hs/Test/Ganeti/Utils.hs b/test/hs/Test/Ganeti/Utils.hs
index af1c5b6..c65db11 100644
--- a/test/hs/Test/Ganeti/Utils.hs
+++ b/test/hs/Test/Ganeti/Utils.hs
@@ -37,13 +37,19 @@
 
 module Test.Ganeti.Utils (testUtils) where
 
+import Prelude ()
+import Ganeti.Prelude
+
 import Test.QuickCheck hiding (Result)
 import Test.HUnit
 
-import Control.Applicative ((<$>), (<*>))
 import Data.Char (isSpace)
 import qualified Data.Either as Either
+#if MIN_VERSION_base(4,8,0)
+import Data.List hiding (isSubsequenceOf)
+#else
 import Data.List
+#endif
 import Data.Maybe (listToMaybe)
 import qualified Data.Set as S
 import System.Time
diff --git a/test/hs/Test/Ganeti/Utils/MultiMap.hs b/test/hs/Test/Ganeti/Utils/MultiMap.hs
index 3656841..02dfc46 100644
--- a/test/hs/Test/Ganeti/Utils/MultiMap.hs
+++ b/test/hs/Test/Ganeti/Utils/MultiMap.hs
@@ -39,7 +39,9 @@
   ( testUtils_MultiMap
   ) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
 import qualified Data.Set as S
 import qualified Data.Map as M
 
diff --git a/test/hs/Test/Ganeti/Utils/Statistics.hs b/test/hs/Test/Ganeti/Utils/Statistics.hs
index f39546b..573769c 100644
--- a/test/hs/Test/Ganeti/Utils/Statistics.hs
+++ b/test/hs/Test/Ganeti/Utils/Statistics.hs
@@ -55,9 +55,7 @@
   let original = xs ++ [a] ++ ys
       modified = xs ++ [b] ++ ys
       with_update =
-        getStatisticValue
-        $ updateStatistics (getStdDevStatistics $ map SimpleNumber original)
-                           (SimpleNumber a, SimpleNumber b)
+        getValue $ update (calculate original :: StdDevStat) a b
       direct = stdDev modified
   in counterexample ("Value computed by update " ++ show with_update
                      ++ " differs too much from correct value " ++ show direct)
diff --git a/test/hs/Test/Ganeti/WConfd/TempRes.hs b/test/hs/Test/Ganeti/WConfd/TempRes.hs
index 768804c..8b8745b 100644
--- a/test/hs/Test/Ganeti/WConfd/TempRes.hs
+++ b/test/hs/Test/Ganeti/WConfd/TempRes.hs
@@ -37,7 +37,8 @@
 
 module Test.Ganeti.WConfd.TempRes (testWConfd_TempRes) where
 
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
 
 import Test.QuickCheck
 
diff --git a/test/hs/htest.hs b/test/hs/htest.hs
index 86d193e..ca83366 100644
--- a/test/hs/htest.hs
+++ b/test/hs/htest.hs
@@ -34,7 +34,9 @@
 
 module Main(main) where
 
-import Data.Monoid (mappend)
+import Prelude ()
+import Ganeti.Prelude
+
 import Test.Framework
 import System.Environment (getArgs)
 import System.Log.Logger
diff --git a/test/hs/shelltests/htools-balancing.test b/test/hs/shelltests/htools-balancing.test
index 383cb8e..95f82c2 100644
--- a/test/hs/shelltests/htools-balancing.test
+++ b/test/hs/shelltests/htools-balancing.test
@@ -133,11 +133,60 @@
 >>>/Solution length=1/
 >>>=0
 
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn.data -C
+>>>/gnt-instance migrate -f -n node-01-001 inst-0./
+>>>=0
+
 # ...but the --ignore-dynu option should be honored
 ./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn.data --ignore-dynu
 >>>/Cluster is already well balanced/
 >>>=0
 
+# Assuming idle default also gives 0 utilisation
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn.data --idle-default
+>>>/Cluster is already well balanced/
+>>>=0
+
+# Heavy CPU load can even push instances onto the more
+# crowded node
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn.data --idle-default --mond --mond-xen --mond-data=$TESTDATA_DIR/dyn1.json -C
+>>>/gnt-instance migrate -f -n node-01-000 inst-1./
+>>>=0
+
+# ...but with the default assumption that disk/net/mem are fully used, the
+# move is in the other direction.
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn.data --mond --mond-xen --mond-data=$TESTDATA_DIR/dyn1.json
+>>>/Solution length=1/
+>>>=0
+
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn.data --mond --mond-xen --mond-data=$TESTDATA_DIR/dyn1.json -C
+>>>/gnt-instance migrate -f -n node-01-001 inst-0./
+>>>=0
+
+# Still, --ignore-dynu overrides everything
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn.data --idle-default --mond --mond-xen --mond-data=$TESTDATA_DIR/dyn1.json --ignore-dynu
+>>>/Cluster is already well balanced/
+>>>=0
+
+# On an overcommitted cluster with a small amount of memory, taking memory
+# usage into account can make a difference.
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn2.data --idle-default --mond --mond-data=$TESTDATA_DIR/dyn2.json
+>>>/Solution length=0/
+>>>=0
+
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn2.data --idle-default --mond --mond-data=$TESTDATA_DIR/dyn2.json --mond-kvm-rss -C
+>>>/gnt-instance migrate -f -n node-01-000 inst-1./
+>>>=0
+
+# Depending on weight, instances can move in either direction
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn2.data --idle-default --mond --mond-data=$TESTDATA_DIR/dyn3.json --mond-kvm-rss -C --mem-weight=0.5
+>>>/ 1\. inst-0./
+>>>=0
+
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn2.data --idle-default --mond --mond-data=$TESTDATA_DIR/dyn3.json --mond-kvm-rss -C --mem-weight=3.0
+>>>/ 1\. inst-1./
+>>>=0
+
 # Test CPU speed is taken into account
 ./test/hs/hbal -t$TESTDATA_DIR/hbal-cpu-speed.data --ignore-dynu
 >>>/inst[12] node-slow:node-fast => node-fast:node-slow/
diff --git a/test/hs/shelltests/htools-hail.test b/test/hs/shelltests/htools-hail.test
index 4725f84..d40b67f 100644
--- a/test/hs/shelltests/htools-hail.test
+++ b/test/hs/shelltests/htools-hail.test
@@ -44,7 +44,7 @@
 >>>= 0
 
 ./test/hs/hail $TESTDATA_DIR/hail-alloc-invalid-twodisks.json
->>> /"success":false,.*FailDisk: 1/
+>>> /"success":false,.*FailTooSmall: 1/
 >>>= 0
 
 # check that hail honors network requirements
@@ -84,12 +84,12 @@
 >>>= 0
 
 ./test/hs/hail $T/hail-alloc-invalid-twodisks.json.excl-stor
->>> /"success":false,.*FailDisk: 1"/
+>>> /"success":false,.*FailTooSmall: 1"/
 >>>= 0
 
 # Same tests with exclusive storage enabled, but no spindles info in instances
 ./test/hs/hail $T/hail-alloc-drbd.json.fail-excl-stor
->>> /"success":false,.*FailSpindles: 12"/
+>>> /"success":false,.*FailTooSmall: 12"/
 >>>= 0
 
 ./test/hs/hail $T/hail-reloc-drbd.json.fail-excl-stor
@@ -101,11 +101,11 @@
 >>>= 0
 
 ./test/hs/hail $T/hail-change-group.json.fail-excl-stor
->>> /"success":true,"info":"Request successful: 1 instances failed to move and 0 were moved successfully",.*FailSpindles: 2"/
+>>> /"success":true,"info":"Request successful: 1 instances failed to move and 0 were moved successfully",.*FailTooSmall: 2"/
 >>>= 0
 
 ./test/hs/hail $T/hail-alloc-twodisks.json.fail-excl-stor
->>> /"success":false,.*FailSpindles: 1"/
+>>> /"success":false,.*FailTooSmall: 1"/
 >>>= 0
 
 # check that hail correctly parses admin state
@@ -165,7 +165,7 @@
 >>>= 0
 
 ./test/hs/hail $T/hail-alloc-spindles.json.excl-stor
->>> /"success":true,"info":"Request successful: Selected group: group1,.*FailSpindles: 2",.*"result":\["node1"\]/
+>>> /"success":true,"info":"Request successful: Selected group: group1,.*FailTooSmall: 2",.*"result":\["node1"\]/
 >>>= 0
 
 # Check that --ignore-soft-errors works and ignores tag errors
@@ -230,6 +230,11 @@
 >>> /successes 2, failures 0.*"result":"node-2-2"/
 >>>= 0
 
+# Memory over-commitment test
+./test/hs/hail $TESTDATA_DIR/hail-alloc-memory-over-commitment.json
+>>> /"success":true.*/
+>>>= 0
+
 # Check that hail account location tags
 ./test/hs/hail $TESTDATA_DIR/hail-alloc-nlocation.json
 >>> /"success":true,.*,"result":\["node3","node2"\]/
diff --git a/test/hs/shelltests/htools-hbal.test b/test/hs/shelltests/htools-hbal.test
index b7b29d8..f7ce274 100644
--- a/test/hs/shelltests/htools-hbal.test
+++ b/test/hs/shelltests/htools-hbal.test
@@ -92,3 +92,19 @@
  node-02    0
  node-03    1/
 >>>= 0
+
+./test/hs/hbal -t $TESTDATA_DIR/hbal-avoid-disk-moves.data --avoid-disk-moves=1.2
+>>>/Solution length=1/
+>>>= 0
+
+./test/hs/hbal -t $TESTDATA_DIR/hbal-avoid-disk-moves.data --avoid-disk-moves=5
+>>>/Solution length=2/
+>>>= 0
+
+./test/hs/hbal -t $TESTDATA_DIR/hbal-memory-over-commitment.data
+>>>/No solution found/
+>>>= 0
+
+./test/hs/hbal -t $TESTDATA_DIR/hbal-memory-over-commitment-2.data
+>>>/Solution length=1/
+>>>= 0
diff --git a/test/hs/shelltests/htools-hcheck.test b/test/hs/shelltests/htools-hcheck.test
index d5bd0dd..854e045 100644
--- a/test/hs/shelltests/htools-hcheck.test
+++ b/test/hs/shelltests/htools-hcheck.test
@@ -22,3 +22,6 @@
 >>>/Cluster is not healthy: False/
 >>>= 0
 
+./test/hs/hcheck -t $TESTDATA_DIR/hsqueeze-underutilized.data --machine-readable
+>>>/HCHECK_INIT_GROUP_0_REDUNDANCY=4/
+>>>= 0
diff --git a/test/hs/shelltests/htools-hspace.test b/test/hs/shelltests/htools-hspace.test
index 80ad64f..0dba25a 100644
--- a/test/hs/shelltests/htools-hspace.test
+++ b/test/hs/shelltests/htools-hspace.test
@@ -26,7 +26,7 @@
 >>>=0
 
 # Mixed cluster, half with exclusive storage
-./test/hs/hspace --machine-readable -t $TESTDATA_DIR/hspace-tiered-mixed.data --no-capacity-checks > $T/capacity && sh -c ". $T/capacity && test \"\${HTS_TSPEC}\" = '131072,1048576,4,12=2 131072,1048576,4,10=2 129984,1048320,4,10=2' && test \"\${HTS_ALLOC_INSTANCES}\" = 6 && test \"\${HTS_TRL_SPN_FREE}\" = 0 && test \"\${HTS_FIN_SPN_FREE}\" = 18"
+./test/hs/hspace --machine-readable -t $TESTDATA_DIR/hspace-tiered-mixed.data --no-capacity-checks > $T/capacity && sh -c ". $T/capacity && echo \"\${HTS_TSPEC}\" | grep -q '131072,1048576,4,12=2 .*129984,1048320,4,10=2' && test \"\${HTS_ALLOC_INSTANCES}\" = 6 && test \"\${HTS_TRL_SPN_FREE}\" = 0 && test \"\${HTS_FIN_SPN_FREE}\" = 18"
 >>>=0
 
 # Verify that instance policy for disks is adhered to
diff --git a/test/py/cfgupgrade_unittest.py b/test/py/cfgupgrade_unittest.py
index a6dec64..132575a 100755
--- a/test/py/cfgupgrade_unittest.py
+++ b/test/py/cfgupgrade_unittest.py
@@ -56,7 +56,7 @@
     "version": constants.CONFIG_VERSION,
     "cluster": {
       "master_node": "node1-uuid",
-      "ipolicy": None,
+      "ipolicy": {},
       "default_iallocator_params": {},
       "diskparams": {},
       "ndparams": {},
@@ -67,13 +67,16 @@
       "compression_tools": constants.IEC_DEFAULT_TOOLS,
       "enabled_user_shutdown": False,
       "data_collectors": {
+        "diagnose": { "active": True, "interval": 5000000 },
         "diskstats": { "active": True, "interval": 5000000 },
         "drbd": { "active": True, "interval": 5000000 },
+        "kvm-inst-rss": { "active": True, "interval": 5000000 },
         "lv": { "active": True, "interval": 5000000 },
         "inst-status-xen": { "active": True, "interval": 5000000 },
         "cpu-avg-load": { "active": True, "interval": 5000000 },
         "xen-cpu-avg-load": { "active": True, "interval": 5000000 },
       },
+      "diagnose_data_collector_filename": "",
       "ssh_key_type": "dsa",
       "ssh_key_bits": 1024,
     },
@@ -81,6 +84,7 @@
     "disks": {},
     "networks": {},
     "filters": {},
+    "maintenance": {},
     "nodegroups": {},
     "nodes": {
       "node1-uuid": {
@@ -435,6 +439,19 @@
   def testUpgradeFullConfigFrom_2_15(self):
     self._TestUpgradeFromFile("cluster_config_2.15.json", False)
 
+  def testUpgradeFullConfigFrom_2_16(self):
+    self._TestUpgradeFromFile("cluster_config_2.16.json", False)
+
+  def testUpgradeFullConfigFrom_2_17(self):
+    self._TestUpgradeFromFile("cluster_config_2.17.json", False)
+
+  def test_2_17_to_2_16_downgrade(self):
+    self._TestUpgradeFromFile("cluster_config_2.17.json", False)
+    _RunUpgrade(self.tmpdir, False, True, downgrade=True)
+    oldconf = self._LoadConfig()
+    newconf = self._LoadTestDataConfig("cluster_config_2.16.json")
+    self.assertEqual(oldconf, newconf)
+
   def testUpgradeCurrent(self):
     self._TestSimpleUpgrade(constants.CONFIG_VERSION, False)
 
@@ -452,7 +469,7 @@
   def testDowngradeFullConfig(self):
     """Test for upgrade + downgrade combination."""
     # This test can work only with the previous version of a configuration!
-    oldconfname = "cluster_config_2.15.json"
+    oldconfname = "cluster_config_2.16.json"
     self._TestUpgradeFromFile(oldconfname, False)
     _RunUpgrade(self.tmpdir, False, True, downgrade=True)
     oldconf = self._LoadTestDataConfig(oldconfname)
diff --git a/test/py/cmdlib/cluster_unittest.py b/test/py/cmdlib/cluster_unittest.py
index d8f3185..225c40d 100644
--- a/test/py/cmdlib/cluster_unittest.py
+++ b/test/py/cmdlib/cluster_unittest.py
@@ -2136,7 +2136,9 @@
   def setUp(self):
     super(TestLUClusterVerifyGroupUpdateNodeInfo, self).setUp()
     self.nimg = verify.LUClusterVerifyGroup.NodeImage(uuid=self.master_uuid)
-    self.valid_hvresult = {constants.NV_HVINFO: {"memory_free": 1024}}
+    self.valid_hvresult = {constants.NV_HVINFO: {"memory_free":  1024,
+                                                 "memory_total": 4096,
+                                                 "memory_dom0":  3072}}
 
   @withLockedLU
   def testInvalidHvNodeResult(self, lu):
@@ -2148,7 +2150,9 @@
   @withLockedLU
   def testInvalidMemoryFreeHvNodeResult(self, lu):
     lu._UpdateNodeInfo(self.master,
-                       {constants.NV_HVINFO: {"memory_free": "abc"}},
+                       {constants.NV_HVINFO: {"memory_free":  'abc',
+                                              "memory_total": 1024,
+                                              "memory_dom0":  2048}},
                        self.nimg, None)
     self.mcpu.assertLogContainsRegex(
       "node returned invalid nodeinfo, check hypervisor")
diff --git a/test/py/daemon-util_unittest.bash b/test/py/daemon-util_unittest.bash
index 1437713..84fd6f3 100755
--- a/test/py/daemon-util_unittest.bash
+++ b/test/py/daemon-util_unittest.bash
@@ -45,8 +45,8 @@
 STOPDAEMONS_LIST="kvmd luxid rapi wconfd confd noded"
 
 if grep -q '^ENABLE_MOND = True' lib/_constants.py; then
-  DAEMONS_LIST="$DAEMONS_LIST mond"
-  STOPDAEMONS_LIST="mond $STOPDAEMONS_LIST"
+  DAEMONS_LIST="$DAEMONS_LIST mond maintd"
+  STOPDAEMONS_LIST="maintd mond $STOPDAEMONS_LIST"
 fi
 
 STOPDAEMONS_LIST="metad $STOPDAEMONS_LIST"
diff --git a/test/py/docs_unittest.py b/test/py/docs_unittest.py
index b9aa6a0..6802d48 100755
--- a/test/py/docs_unittest.py
+++ b/test/py/docs_unittest.py
@@ -77,6 +77,7 @@
 
   # Very sensitive in nature
   opcodes.OpRestrictedCommand,
+  opcodes.OpRepairCommand,
   opcodes.OpClusterRenewCrypto,
 
   # Helper opcodes (e.g. submitted by LUs)
diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ganeti.backend_unittest.py
index 897fcba..e737dad 100755
--- a/test/py/ganeti.backend_unittest.py
+++ b/test/py/ganeti.backend_unittest.py
@@ -32,6 +32,7 @@
 
 import collections
 import copy
+import time
 import mock
 import os
 import shutil
@@ -424,7 +425,7 @@
   return "Executing command '%s' failed" % cmd
 
 
-class TestRunRestrictedCmd(unittest.TestCase):
+class TestRunConstrainedCmd(unittest.TestCase):
   def setUp(self):
     self.tmpdir = tempfile.mkdtemp()
 
@@ -436,10 +437,10 @@
     sleep_fn = testutils.CallCounter(_SleepForRestrictedCmd)
     self.assertFalse(os.path.exists(lockfile))
     self.assertRaises(backend.RPCFail,
-                      backend.RunRestrictedCmd, "test",
+                      backend.RunConstrainedCmd, "test",
                       _lock_timeout=NotImplemented,
-                      _lock_file=lockfile,
-                      _path=NotImplemented,
+                      lock_file=lockfile,
+                      path=NotImplemented,
                       _sleep_fn=sleep_fn,
                       _prepare_fn=NotImplemented,
                       _runcmd_fn=NotImplemented,
@@ -452,14 +453,14 @@
 
     result = False
     try:
-      backend.RunRestrictedCmd("test22717",
-                               _lock_timeout=0.1,
-                               _lock_file=lockfile,
-                               _path=NotImplemented,
-                               _sleep_fn=sleep_fn,
-                               _prepare_fn=NotImplemented,
-                               _runcmd_fn=NotImplemented,
-                               _enabled=True)
+      backend.RunConstrainedCmd("test22717",
+                                _lock_timeout=0.1,
+                                lock_file=lockfile,
+                                path=NotImplemented,
+                                _sleep_fn=sleep_fn,
+                                _prepare_fn=NotImplemented,
+                                _runcmd_fn=NotImplemented,
+                                _enabled=True)
     except backend.RPCFail, err:
       assert str(err) == _GenericRestrictedCmdError("test22717"), \
              "Did not fail with generic error message"
@@ -491,11 +492,11 @@
     prepare_fn = testutils.CallCounter(self._PrepareRaisingException)
 
     try:
-      backend.RunRestrictedCmd("test23122",
-                               _lock_timeout=1.0, _lock_file=lockfile,
-                               _path=NotImplemented, _runcmd_fn=NotImplemented,
-                               _sleep_fn=sleep_fn, _prepare_fn=prepare_fn,
-                               _enabled=True)
+      backend.RunConstrainedCmd("test23122",
+                                _lock_timeout=1.0, lock_file=lockfile,
+                                path=NotImplemented, _runcmd_fn=NotImplemented,
+                                _sleep_fn=sleep_fn, _prepare_fn=prepare_fn,
+                                _enabled=True)
     except backend.RPCFail, err:
       self.assertEqual(str(err), _GenericRestrictedCmdError("test23122"))
     else:
@@ -516,11 +517,11 @@
     prepare_fn = testutils.CallCounter(self._PrepareFails)
 
     try:
-      backend.RunRestrictedCmd("test29327",
-                               _lock_timeout=1.0, _lock_file=lockfile,
-                               _path=NotImplemented, _runcmd_fn=NotImplemented,
-                               _sleep_fn=sleep_fn, _prepare_fn=prepare_fn,
-                               _enabled=True)
+      backend.RunConstrainedCmd("test29327",
+                                _lock_timeout=1.0, lock_file=lockfile,
+                                path=NotImplemented, _runcmd_fn=NotImplemented,
+                                _sleep_fn=sleep_fn, _prepare_fn=prepare_fn,
+                                _enabled=True)
     except backend.RPCFail, err:
       self.assertEqual(str(err), _GenericRestrictedCmdError("test29327"))
     else:
@@ -533,11 +534,11 @@
   def _SuccessfulPrepare(path, cmd):
     return (True, utils.PathJoin(path, cmd))
 
-  def testRunCmdFails(self):
+  def testRunConstrainedCmdFails(self):
     lockfile = utils.PathJoin(self.tmpdir, "lock")
 
     def fn(args, env=NotImplemented, reset_env=NotImplemented,
-           postfork_fn=NotImplemented):
+           postfork_fn=NotImplemented, input_fd=NotImplemented):
       self.assertEqual(args, [utils.PathJoin(self.tmpdir, "test3079")])
       self.assertEqual(env, {})
       self.assertTrue(reset_env)
@@ -567,11 +568,11 @@
     runcmd_fn = testutils.CallCounter(fn)
 
     try:
-      backend.RunRestrictedCmd("test3079",
-                               _lock_timeout=1.0, _lock_file=lockfile,
-                               _path=self.tmpdir, _runcmd_fn=runcmd_fn,
-                               _sleep_fn=sleep_fn, _prepare_fn=prepare_fn,
-                               _enabled=True)
+      backend.RunConstrainedCmd("test3079",
+                                _lock_timeout=1.0, lock_file=lockfile,
+                                path=self.tmpdir, _runcmd_fn=runcmd_fn,
+                                _sleep_fn=sleep_fn, _prepare_fn=prepare_fn,
+                                _enabled=True)
     except backend.RPCFail, err:
       self.assertTrue(str(err).startswith("Restricted command 'test3079'"
                                           " failed:"))
@@ -584,11 +585,11 @@
     self.assertEqual(prepare_fn.Count(), 1)
     self.assertEqual(runcmd_fn.Count(), 1)
 
-  def testRunCmdSucceeds(self):
+  def testRunConstrainedCmdSucceeds(self):
     lockfile = utils.PathJoin(self.tmpdir, "lock")
 
     def fn(args, env=NotImplemented, reset_env=NotImplemented,
-           postfork_fn=NotImplemented):
+           postfork_fn=NotImplemented, input_fd=NotImplemented):
       self.assertEqual(args, [utils.PathJoin(self.tmpdir, "test5667")])
       self.assertEqual(env, {})
       self.assertTrue(reset_env)
@@ -605,12 +606,12 @@
     prepare_fn = testutils.CallCounter(self._SuccessfulPrepare)
     runcmd_fn = testutils.CallCounter(fn)
 
-    result = backend.RunRestrictedCmd("test5667",
-                                      _lock_timeout=1.0, _lock_file=lockfile,
-                                      _path=self.tmpdir, _runcmd_fn=runcmd_fn,
-                                      _sleep_fn=sleep_fn,
-                                      _prepare_fn=prepare_fn,
-                                      _enabled=True)
+    result = backend.RunConstrainedCmd("test5667",
+                                       _lock_timeout=1.0, lock_file=lockfile,
+                                       path=self.tmpdir, _runcmd_fn=runcmd_fn,
+                                       _sleep_fn=sleep_fn,
+                                       _prepare_fn=prepare_fn,
+                                       _enabled=True)
     self.assertEqual(result, "stdout14463")
 
     self.assertEqual(sleep_fn.Count(), 0)
@@ -619,14 +620,14 @@
 
   def testCommandsDisabled(self):
     try:
-      backend.RunRestrictedCmd("test",
-                               _lock_timeout=NotImplemented,
-                               _lock_file=NotImplemented,
-                               _path=NotImplemented,
-                               _sleep_fn=NotImplemented,
-                               _prepare_fn=NotImplemented,
-                               _runcmd_fn=NotImplemented,
-                               _enabled=False)
+      backend.RunConstrainedCmd("test",
+                                _lock_timeout=NotImplemented,
+                                lock_file=NotImplemented,
+                                path=NotImplemented,
+                                _sleep_fn=NotImplemented,
+                                _prepare_fn=NotImplemented,
+                                _runcmd_fn=NotImplemented,
+                                _enabled=False)
     except backend.RPCFail, err:
       self.assertEqual(str(err),
                        "Restricted commands disabled at configure time")
@@ -1033,6 +1034,11 @@
     self._ssh_replace_name_by_uuid_mock.side_effect = \
       self._ssh_file_manager.ReplaceNameByUuid
 
+    self._time_sleep_patcher = testutils \
+        .patch_object(time, "sleep")
+    self._time_sleep_mock = \
+        self._time_sleep_patcher.start()
+
     self.noded_cert_file = testutils.TestDataFilename("cert1.pem")
 
     self._SetupTestData()
@@ -1045,6 +1051,7 @@
     self._ssh_remove_public_key_patcher.stop()
     self._ssh_query_pub_key_file_patcher.stop()
     self._ssh_replace_name_by_uuid_patcher.stop()
+    self._time_sleep_patcher.stop()
     self._TearDownTestData()
 
   def _SetupTestData(self, number_of_nodes=15, number_of_pot_mcs=5,
@@ -1102,10 +1109,9 @@
                      key_file=self._pub_key_file)
 
     backend._GenerateNodeSshKey(
-        test_node_uuid, test_node_name,
+        test_node_name,
         self._ssh_file_manager.GetSshPortMap(self._SSH_PORT),
         "rsa", 2048,
-        pub_key_file=self._pub_key_file,
         ssconf_store=self._ssconf_mock,
         noded_cert_file=self.noded_cert_file,
         run_cmd_fn=self._run_cmd_mock)
@@ -1959,6 +1965,127 @@
     self.assertTrue([error_msg for (node, error_msg) in error_msgs
                      if node == node_name])
 
+  def _MockReadRemoteSshPubKey(self, pub_key_file, node, cluster_name, port,
+                               ask_key, strict_host_check):
+    return self._ssh_file_manager.GetKeyOfNode(self._master_node)
+
+
+  def _MockReadLocalSshPubKeys(self, key_types, suffix=""):
+    return [self._ssh_file_manager.GetKeyOfNode(self._master_node)]
+
+  def _setUpRenewCrypto(self):
+    """Preparations only needed for the renew-crypto unittests."""
+    self.tmpdir = tempfile.mkdtemp()
+    self._dsa_keyfile = os.path.join(self.tmpdir, "id_dsa.pub")
+    self._rsa_keyfile = os.path.join(self.tmpdir, "id_rsa.pub")
+
+    self._ssh_get_all_user_files_patcher = testutils \
+        .patch_object(ssh, "GetAllUserFiles")
+    self._ssh_get_all_user_files_mock = \
+        self._ssh_get_all_user_files_patcher.start()
+    self._ssh_get_all_user_files_mock.return_value = (None,
+        {constants.SSHK_DSA: (None, self._dsa_keyfile),
+         constants.SSHK_RSA: (None, self._rsa_keyfile)})
+
+    self._ssh_read_remote_ssh_pub_key_patcher = testutils \
+        .patch_object(ssh, "ReadRemoteSshPubKey")
+    self._ssh_read_remote_ssh_pub_key_mock = \
+        self._ssh_read_remote_ssh_pub_key_patcher.start()
+    self._ssh_read_remote_ssh_pub_key_mock.side_effect = \
+        self._MockReadRemoteSshPubKey
+
+    self._ssh_read_local_ssh_pub_keys_patcher = testutils \
+        .patch_object(ssh, "ReadLocalSshPubKeys")
+    self._ssh_read_local_ssh_pub_keys_mock = \
+        self._ssh_read_local_ssh_pub_keys_patcher.start()
+    self._ssh_read_local_ssh_pub_keys_mock.side_effect = \
+        self._MockReadLocalSshPubKeys
+
+    self._ssh_replace_ssh_keys_patcher = testutils \
+        .patch_object(ssh, "ReplaceSshKeys")
+    self._ssh_replace_ssh_keys_mock = \
+        self._ssh_replace_ssh_keys_patcher.start()
+
+  def _tearDownRenewCrypto(self):
+    self._ssh_get_all_user_files_patcher.stop()
+    self._ssh_read_remote_ssh_pub_key_patcher.stop()
+    self._ssh_read_local_ssh_pub_keys_patcher.stop()
+    self._ssh_replace_ssh_keys_patcher.stop()
+
+  def testRenewCrypto(self):
+    self._setUpRenewCrypto()
+
+    node_uuids = self._ssh_file_manager.GetAllNodeUuids()
+    node_names = self._ssh_file_manager.GetAllNodeNames()
+
+    old_ssh_file_manager = copy.deepcopy(self._ssh_file_manager)
+
+    backend.RenewSshKeys(node_uuids, node_names,
+                         self._master_candidate_uuids,
+                         self._potential_master_candidates,
+                         constants.SSHK_DSA, constants.SSHK_DSA,
+                         constants.SSH_DEFAULT_KEY_BITS,
+                         ganeti_pub_keys_file=self._pub_key_file,
+                         ssconf_store=self._ssconf_mock,
+                         noded_cert_file=self.noded_cert_file,
+                         run_cmd_fn=self._run_cmd_mock)
+
+    self._tearDownRenewCrypto()
+
+    self.assertEqual(set(old_ssh_file_manager.GetAllNodeNames()),
+                     set(self._ssh_file_manager.GetAllNodeNames()))
+
+    for node_name in self._ssh_file_manager.GetAllNodeNames():
+      self.assertNotEqual(self._ssh_file_manager.GetKeyOfNode(node_name),
+                          old_ssh_file_manager.GetKeyOfNode(node_name))
+
+
+class TestRemoveSshKeyFromPublicKeyFile(testutils.GanetiTestCase):
+
+  def setUp(self):
+    testutils.GanetiTestCase.setUp(self)
+    self._ssconf_mock = mock.Mock()
+    self._ssconf_mock.GetNodeList = mock.Mock()
+    self._tmpdir = tempfile.mkdtemp()
+    self._pub_keys_file = os.path.join(self._tmpdir, "pub_keys_file")
+
+  def testValidRemoval(self):
+    key = "myKey"
+    name = "myName"
+    ssh.AddPublicKey(name, key, key_file=self._pub_keys_file)
+    self._ssconf_mock.GetNodeList.return_value = \
+        ["myOtherNode1", "myOtherNode2"]
+
+    backend.RemoveSshKeyFromPublicKeyFile(
+        name, pub_key_file=self._pub_keys_file,
+        ssconf_store=self._ssconf_mock)
+
+    result = ssh.QueryPubKeyFile([name], key_file=self._pub_keys_file)
+    self.assertEqual({}, result)
+
+  def testStillClusterNode(self):
+    """Tests the safety check to only remove keys of obsolete nodes."""
+    key = "myKey"
+    name = "myName"
+    ssh.AddPublicKey(name, key, key_file=self._pub_keys_file)
+    self._ssconf_mock.GetNodeList.return_value = ["myName", "myOtherNode"]
+
+    self.assertRaises(
+        errors.SshUpdateError,
+        backend.RemoveSshKeyFromPublicKeyFile,
+        name, pub_key_file=self._pub_keys_file,
+        ssconf_store=self._ssconf_mock)
+
+  def testNoKey(self):
+    name = "myName"
+    # 'clear' file to make sure it exists.
+    ssh.ClearPubKeyFile(key_file=self._pub_keys_file)
+    self._ssconf_mock.GetNodeList.return_value = ["myOtherNode"]
+
+    backend.RemoveSshKeyFromPublicKeyFile(
+        name, pub_key_file=self._pub_keys_file,
+        ssconf_store=self._ssconf_mock)
+
 
 class TestVerifySshSetup(testutils.GanetiTestCase):
 
diff --git a/test/py/ganeti.client.gnt_cluster_unittest.py b/test/py/ganeti.client.gnt_cluster_unittest.py
index 38bda23..2c827a7 100755
--- a/test/py/ganeti.client.gnt_cluster_unittest.py
+++ b/test/py/ganeti.client.gnt_cluster_unittest.py
@@ -409,7 +409,7 @@
     self._setUpFakeKeys()
 
     self._ssh_read_remote_ssh_pub_keys_patcher = testutils \
-      .patch_object(ssh, "ReadRemoteSshPubKeys")
+      .patch_object(ssh, "ReadRemoteSshPubKey")
     self._ssh_read_remote_ssh_pub_keys_mock = \
       self._ssh_read_remote_ssh_pub_keys_patcher.start()
     self._ssh_read_remote_ssh_pub_keys_mock.return_value = self._SOME_KEY_DICT
diff --git a/test/py/ganeti.masterd.iallocator_unittest.py b/test/py/ganeti.masterd.iallocator_unittest.py
index d92e572..5fb18a8 100755
--- a/test/py/ganeti.masterd.iallocator_unittest.py
+++ b/test/py/ganeti.masterd.iallocator_unittest.py
@@ -101,6 +101,8 @@
 class _FakeConfigWithNdParams:
   def GetNdParams(self, _):
     return None
+  def GetFilledHvStateParams(self, _):
+    return None
 
 
 class TestComputeBasicNodeData(unittest.TestCase):
@@ -112,6 +114,7 @@
     self.assertEqual(self.fn({}), {})
 
   def testSimple(self):
+    self.maxDiff = None
     node1 = objects.Node(name="node1",
                          primary_ip="192.0.2.1",
                          secondary_ip="192.0.2.2",
@@ -151,6 +154,7 @@
         "master_capable": True,
         "vm_capable": False,
         "ndparams": None,
+        "hv_state": None,
         },
       "node2": {
         "tags": [],
@@ -163,6 +167,7 @@
         "master_capable": False,
         "vm_capable": True,
         "ndparams": None,
+        "hv_state": None,
         },
       })
 
diff --git a/test/py/ganeti.ssh_unittest.py b/test/py/ganeti.ssh_unittest.py
index 265adec..661245b 100755
--- a/test/py/ganeti.ssh_unittest.py
+++ b/test/py/ganeti.ssh_unittest.py
@@ -488,36 +488,203 @@
     self.assertTrue(os.path.exists(self.priv_filename + suffix + ".pub"))
 
 
-class TestDetermineKeyBits():
+class TestDetermineKeyBits(testutils.GanetiTestCase):
   def testCompleteness(self):
-    self.assertEquals(constants.SSHK_ALL, ssh.SSH_KEY_VALID_BITS.keys())
+    self.assertEquals(constants.SSHK_ALL,
+                      frozenset(ssh.SSH_KEY_VALID_BITS.keys()))
 
   def testAdoptDefault(self):
-    self.assertEquals(2048, DetermineKeyBits("rsa", None, None, None))
-    self.assertEquals(1024, DetermineKeyBits("dsa", None, None, None))
+    self.assertEquals(2048, ssh.DetermineKeyBits("rsa", None, None, None))
+    self.assertEquals(1024, ssh.DetermineKeyBits("dsa", None, None, None))
 
   def testAdoptOldKeySize(self):
-    self.assertEquals(4098, DetermineKeyBits("rsa", None, "rsa", 4098))
-    self.assertEquals(2048, DetermineKeyBits("rsa", None, "dsa", 1024))
+    self.assertEquals(4098, ssh.DetermineKeyBits("rsa", None, "rsa", 4098))
+    self.assertEquals(2048, ssh.DetermineKeyBits("rsa", None, "dsa", 1024))
 
   def testDsaSpecificValues(self):
-    self.assertRaises(errors.OpPrereqError, DetermineKeyBits, "dsa", 2048,
+    self.assertRaises(errors.OpPrereqError, ssh.DetermineKeyBits, "dsa", 2048,
                       None, None)
-    self.assertRaises(errors.OpPrereqError, DetermineKeyBits, "dsa", 512,
+    self.assertRaises(errors.OpPrereqError, ssh.DetermineKeyBits, "dsa", 512,
                       None, None)
-    self.assertEquals(1024, DetermineKeyBits("dsa", None, None, None))
+    self.assertEquals(1024, ssh.DetermineKeyBits("dsa", None, None, None))
 
   def testEcdsaSpecificValues(self):
-    self.assertRaises(errors.OpPrereqError, DetermineKeyBits, "ecdsa", 2048,
+    self.assertRaises(errors.OpPrereqError, ssh.DetermineKeyBits, "ecdsa", 2048,
                       None, None)
     for b in [256, 384, 521]:
-      self.assertEquals(b, DetermineKeyBits("ecdsa", b, None, None))
+      self.assertEquals(b, ssh.DetermineKeyBits("ecdsa", b, None, None))
 
   def testRsaSpecificValues(self):
-    self.assertRaises(errors.OpPrereqError, DetermineKeyBits, "dsa", 766,
+    self.assertRaises(errors.OpPrereqError, ssh.DetermineKeyBits, "dsa", 766,
                       None, None)
     for b in [768, 769, 2048, 2049, 4096]:
-      self.assertEquals(b, DetermineKeyBits("rsa", b, None, None))
+      self.assertEquals(b, ssh.DetermineKeyBits("rsa", b, None, None))
+
+
+class TestManageLocalSshPubKeys(testutils.GanetiTestCase):
+  """Test class for several methods handling local SSH keys.
+
+  Methods covered are:
+  - GetSshKeyFilenames
+  - GetSshPubKeyFilename
+  - ReplaceSshKeys
+  - ReadLocalSshPubKeys
+
+  These methods are covered in one test, because the preparations for
+  their tests are identical and thus can be reused.
+
+  """
+  VISIBILITY_PRIVATE = "private"
+  VISIBILITY_PUBLIC = "public"
+  VISIBILITIES = frozenset([VISIBILITY_PRIVATE, VISIBILITY_PUBLIC])
+
+  def _GenerateKey(self, key_id, visibility):
+    assert visibility in self.VISIBILITIES
+    return "I am the %s %s SSH key." % (visibility, key_id)
+
+  def _GetKeyPath(self, key_file_basename):
+     return os.path.join(self.tmpdir, key_file_basename)
+
+  def _SetUpKeys(self):
+    """Creates a fake SSH key for each type and with/without suffix."""
+    self._key_file_dict = {}
+    for key_type in constants.SSHK_ALL:
+      for suffix in ["", self._suffix]:
+        pub_key_filename = "id_%s%s.pub" % (key_type, suffix)
+        priv_key_filename = "id_%s%s" % (key_type, suffix)
+
+        pub_key_path = self._GetKeyPath(pub_key_filename)
+        priv_key_path = self._GetKeyPath(priv_key_filename)
+
+        utils.WriteFile(
+            priv_key_path,
+            data=self._GenerateKey(key_type + suffix, self.VISIBILITY_PRIVATE))
+
+        utils.WriteFile(
+            pub_key_path,
+            data=self._GenerateKey(key_type + suffix, self.VISIBILITY_PUBLIC))
+
+        # Fill key dict only for non-suffix keys
+        # (as this is how it will be in the code)
+        if not suffix:
+          self._key_file_dict[key_type] = \
+            (priv_key_path, pub_key_path)
+
+  def setUp(self):
+    testutils.GanetiTestCase.setUp(self)
+    self.tmpdir = tempfile.mkdtemp()
+    self._suffix = "_suffix"
+    self._SetUpKeys()
+
+  def tearDown(self):
+    shutil.rmtree(self.tmpdir)
+
+  @testutils.patch_object(ssh, "GetAllUserFiles")
+  def testReadAllPublicKeyFiles(self, mock_getalluserfiles):
+    mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+    keys = ssh.ReadLocalSshPubKeys([], suffix="")
+
+    self.assertEqual(len(constants.SSHK_ALL), len(keys))
+    for key_type in constants.SSHK_ALL:
+      self.assertTrue(
+          self._GenerateKey(key_type, self.VISIBILITY_PUBLIC) in keys)
+
+  @testutils.patch_object(ssh, "GetAllUserFiles")
+  def testReadOnePublicKeyFile(self, mock_getalluserfiles):
+    mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+    keys = ssh.ReadLocalSshPubKeys([constants.SSHK_DSA], suffix="")
+
+    self.assertEqual(1, len(keys))
+    self.assertEqual(
+        self._GenerateKey(constants.SSHK_DSA, self.VISIBILITY_PUBLIC),
+        keys[0])
+
+  @testutils.patch_object(ssh, "GetAllUserFiles")
+  def testReadPublicKeyFilesWithSuffix(self, mock_getalluserfiles):
+    key_types = [constants.SSHK_DSA, constants.SSHK_ECDSA]
+
+    mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+    keys = ssh.ReadLocalSshPubKeys(key_types, suffix=self._suffix)
+
+    self.assertEqual(2, len(keys))
+    for key_id in [key_type + self._suffix for key_type in key_types]:
+      self.assertTrue(
+          self._GenerateKey(key_id, self.VISIBILITY_PUBLIC) in keys)
+
+  @testutils.patch_object(ssh, "GetAllUserFiles")
+  def testGetSshKeyFilenames(self, mock_getalluserfiles):
+    mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+    priv, pub = ssh.GetSshKeyFilenames(constants.SSHK_DSA)
+
+    self.assertEqual("id_dsa", os.path.basename(priv))
+    self.assertNotEqual("id_dsa", priv)
+    self.assertEqual("id_dsa.pub", os.path.basename(pub))
+    self.assertNotEqual("id_dsa.pub", pub)
+
+  @testutils.patch_object(ssh, "GetAllUserFiles")
+  def testGetSshKeyFilenamesWithSuffix(self, mock_getalluserfiles):
+    mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+    priv, pub = ssh.GetSshKeyFilenames(constants.SSHK_RSA, suffix=self._suffix)
+
+    self.assertEqual("id_rsa_suffix", os.path.basename(priv))
+    self.assertNotEqual("id_rsa_suffix", priv)
+    self.assertEqual("id_rsa_suffix.pub", os.path.basename(pub))
+    self.assertNotEqual("id_rsa_suffix.pub", pub)
+
+  @testutils.patch_object(ssh, "GetAllUserFiles")
+  def testGetPubSshKeyFilename(self, mock_getalluserfiles):
+    mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+    pub = ssh.GetSshPubKeyFilename(constants.SSHK_DSA)
+    pub_suffix = ssh.GetSshPubKeyFilename(
+        constants.SSHK_DSA, suffix=self._suffix)
+
+    self.assertEqual("id_dsa.pub", os.path.basename(pub))
+    self.assertNotEqual("id_dsa.pub", pub)
+    self.assertEqual("id_dsa_suffix.pub", os.path.basename(pub_suffix))
+    self.assertNotEqual("id_dsa_suffix.pub", pub_suffix)
+
+  @testutils.patch_object(ssh, "GetAllUserFiles")
+  def testReplaceSshKeys(self, mock_getalluserfiles):
+    """Replace SSH keys without suffixes.
+
+    Note: usually it does not really make sense to replace the DSA key
+    by the RSA key. This is just to test the function without suffixes.
+
+    """
+    mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+    ssh.ReplaceSshKeys(constants.SSHK_RSA, constants.SSHK_DSA)
+
+    priv_key = utils.ReadFile(self._key_file_dict[constants.SSHK_DSA][0])
+    pub_key = utils.ReadFile(self._key_file_dict[constants.SSHK_DSA][1])
+
+    self.assertEqual("I am the private rsa SSH key.", priv_key)
+    self.assertEqual("I am the public rsa SSH key.", pub_key)
+
+  @testutils.patch_object(ssh, "GetAllUserFiles")
+  def testReplaceSshKeysBySuffixedKeys(self, mock_getalluserfiles):
+    """Replace SSH keys with keys from suffixed files.
+
+    Note: here the DSA key is replaced by the DSA key from the suffixed
+    files. This is just to test the function with a source key suffix.
+
+    """
+    mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+    ssh.ReplaceSshKeys(constants.SSHK_DSA, constants.SSHK_DSA,
+                       src_key_suffix=self._suffix)
+
+    priv_key = utils.ReadFile(self._key_file_dict[constants.SSHK_DSA][0])
+    pub_key = utils.ReadFile(self._key_file_dict[constants.SSHK_DSA][1])
+
+    self.assertEqual("I am the private dsa_suffix SSH key.", priv_key)
+    self.assertEqual("I am the public dsa_suffix SSH key.", pub_key)
 
 
 if __name__ == "__main__":
diff --git a/test/py/ganeti.utils.log_unittest.py b/test/py/ganeti.utils.log_unittest.py
index a5d98e9..c568b96 100755
--- a/test/py/ganeti.utils.log_unittest.py
+++ b/test/py/ganeti.utils.log_unittest.py
@@ -204,70 +204,5 @@
     self.assertTrue(utils.ReadFile(logfile2).endswith("This is a test\n"))
 
 
-class TestSetupToolLogging(unittest.TestCase):
-  def test(self):
-    error_name = logging.getLevelName(logging.ERROR)
-    warn_name = logging.getLevelName(logging.WARNING)
-    info_name = logging.getLevelName(logging.INFO)
-    debug_name = logging.getLevelName(logging.DEBUG)
-
-    for debug in [False, True]:
-      for verbose in [False, True]:
-        logger = logging.Logger("TestLogger")
-        buf = StringIO()
-
-        utils.SetupToolLogging(debug, verbose, _root_logger=logger, _stream=buf)
-
-        logger.error("level=error")
-        logger.warning("level=warning")
-        logger.info("level=info")
-        logger.debug("level=debug")
-
-        lines = buf.getvalue().splitlines()
-
-        self.assertTrue(compat.all(line.count(":") == 3 for line in lines))
-
-        messages = [line.split(":", 3)[-1].strip() for line in lines]
-
-        if debug:
-          self.assertEqual(messages, [
-            "%s level=error" % error_name,
-            "%s level=warning" % warn_name,
-            "%s level=info" % info_name,
-            "%s level=debug" % debug_name,
-            ])
-        elif verbose:
-          self.assertEqual(messages, [
-            "%s level=error" % error_name,
-            "%s level=warning" % warn_name,
-            "%s level=info" % info_name,
-            ])
-        else:
-          self.assertEqual(messages, [
-            "level=error",
-            "level=warning",
-            ])
-
-  def testThreadName(self):
-    thread_name = threading.currentThread().getName()
-
-    for enable_threadname in [False, True]:
-      logger = logging.Logger("TestLogger")
-      buf = StringIO()
-
-      utils.SetupToolLogging(True, True, threadname=enable_threadname,
-                             _root_logger=logger, _stream=buf)
-
-      logger.debug("test134042376")
-
-      lines = buf.getvalue().splitlines()
-      self.assertEqual(len(lines), 1)
-
-      if enable_threadname:
-        self.assertTrue((" %s " % thread_name) in lines[0])
-      else:
-        self.assertTrue(thread_name not in lines[0])
-
-
 if __name__ == "__main__":
   testutils.GanetiTestProgram()
diff --git a/test/py/ganeti.utils.retry_unittest.py b/test/py/ganeti.utils.retry_unittest.py
index f8c5daa..93638cd 100755
--- a/test/py/ganeti.utils.retry_unittest.py
+++ b/test/py/ganeti.utils.retry_unittest.py
@@ -30,6 +30,8 @@
 
 """Script for testing ganeti.utils.retry"""
 
+import mock
+import time
 import unittest
 
 from ganeti import constants
@@ -205,5 +207,74 @@
     self.assertEqual(self.called, 3)
 
 
+class TestRetryByNumberOfTimes(testutils.GanetiTestCase):
+
+  def setUp(self):
+    testutils.GanetiTestCase.setUp(self)
+
+  def testSuccessOnFirst(self):
+    test_fn = mock.Mock()
+    utils.RetryByNumberOfTimes(5, 0, Exception, test_fn)
+    test_fn.assert_called_once()
+
+  def testSuccessOnFirstWithArgs(self):
+    test_fn = mock.Mock()
+    utils.RetryByNumberOfTimes(5, 0, Exception, test_fn,
+        "arg1", "arg2", kwarg1_key="kwarg1_value", kwarg2_key="kwarg2_value")
+    test_fn.assert_called_with(
+        "arg1", "arg2", kwarg1_key="kwarg1_value", kwarg2_key="kwarg2_value")
+
+  def testSuccessAtSomePoint(self):
+    self.succeed_after_try = 2
+    self.num_try = 0
+    self.max_tries = 5
+
+    def test_fn():
+      self.num_try +=1
+      if self.num_try <= self.succeed_after_try:
+        raise errors.OpExecError("I fail!")
+      else:
+        return "I succeed."
+
+    utils.RetryByNumberOfTimes(self.max_tries, 0, Exception, test_fn)
+
+  def testFailAllTries(self):
+    self.max_tries = 5
+
+    def test_fn():
+      raise errors.OpExecError("I fail!")
+
+    self.assertRaises(Exception, utils.RetryByNumberOfTimes, self.max_tries,
+                      0, Exception, test_fn)
+
+  @testutils.patch_object(time, "sleep")
+  def testBackoffZero(self, mock_sleep):
+    self.max_tries = 5
+
+    def test_fn():
+      raise errors.OpExecError("I fail!")
+
+    self.assertRaises(Exception, utils.RetryByNumberOfTimes, self.max_tries,
+                      backoff=0, exception_class=Exception, fn=test_fn)
+    for call in mock_sleep.mock_calls:
+      self.assertEqual(mock.call(0), call)
+
+  @testutils.patch_object(time, "sleep")
+  def testBackoffPositive(self, mock_sleep):
+    self.max_tries = 5
+
+    def test_fn():
+      raise errors.OpExecError("I fail!")
+
+    backoff = 3
+    self.assertRaises(Exception, utils.RetryByNumberOfTimes, self.max_tries,
+                      backoff=backoff, exception_class=Exception, fn=test_fn)
+
+    expected_calls = [3, 6, 12, 24, 48]
+    for call_idx in range(len(mock_sleep.mock_calls)):
+      self.assertEqual(mock.call(expected_calls[call_idx]),
+                       mock_sleep.mock_calls[call_idx])
+
+
 if __name__ == "__main__":
   testutils.GanetiTestProgram()
diff --git a/test/py/testutils/config_mock.py b/test/py/testutils/config_mock.py
index 1d70798..b79ec29 100644
--- a/test/py/testutils/config_mock.py
+++ b/test/py/testutils/config_mock.py
@@ -64,6 +64,7 @@
 
 
 # pylint: disable=R0904
+# pylint: disable=W0102
 class ConfigMock(config.ConfigWriter):
   """A mocked cluster configuration with added methods for easy customization.
 
@@ -109,7 +110,7 @@
                       ndparams=None,
                       diskparams=None,
                       ipolicy=None,
-                      hv_state_static=None,
+                      hv_state_static={},
                       disk_state_static=None,
                       alloc_policy=None,
                       networks=None):
@@ -160,7 +161,7 @@
                  ndparams=None,
                  powered=True,
                  hv_state=None,
-                 hv_state_static=None,
+                 hv_state_static={},
                  disk_state=None,
                  disk_state_static=None):
     """Add a new L{objects.Node} to the cluster configuration
diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py
index a38304d..e700e47 100644
--- a/test/py/testutils_ssh.py
+++ b/test/py/testutils_ssh.py
@@ -183,6 +183,14 @@
     """
     return self._all_node_data.keys()
 
+  def GetAllNodeUuids(self):
+    """Returns all node UUIDs of the cluster.
+
+    @rtype: list of str
+    @returns: list of all node UUIDs
+    """
+    return [node.uuid for node in self._all_node_data.values()]
+
   def GetAllPotentialMasterCandidateNodeNames(self):
     return [name for name, node_info
             in self._all_node_data.items()
@@ -281,12 +289,25 @@
   def GetAuthorizedKeysOfNode(self, node):
     """Returns the authorized keys of the given node.
 
+    @type node: string
+    @param node: name of the node
     @rtype: list of str
     @returns: a list of authorized keys that are stored on that node
 
     """
     return self._authorized_keys[node]
 
+  def GetKeyOfNode(self, node):
+    """Returns the SSH key of the given node.
+
+    @type node: string
+    @param node: name of the node; used as the index into C{_all_node_data}
+    @rtype: string
+    @returns: the C{key} attribute stored for that node
+
+    """
+    return self._all_node_data[node].key
+
   def SetOrAddNode(self, name, uuid, key, pot_mc, mc, master):
     """Adds a new node to the state of the file manager.
 
@@ -508,8 +529,47 @@
     if constants.SSHS_SSH_PUBLIC_KEYS in data:
       instructions_pub = data[constants.SSHS_SSH_PUBLIC_KEYS]
       self._HandlePublicKeys(instructions_pub, node)
+    if constants.SSHS_GENERATE in data:
+      instructions_generate = data[constants.SSHS_GENERATE]
+      self._GenerateNewKey(instructions_generate, node)
   # pylint: enable=W0613
 
+  def _GenerateNewKey(self, instructions_generate, node):
+    """Generates a new key for the given node.
+
+    Note that this is a very rudimentary generation of a new key. The key is
+    always generated with the same pattern, starting with 'new_key'. That
+    means if you run it twice, it will actually produce the same key. However,
+    for what we want to test, this is sufficient.
+    The 'suffix' instruction is also ignored; the key is directly overridden.
+    This works so far, but simplifies the tests a bit. It might be extended
+    in case it becomes necessary.
+
+    @type instructions_generate: tuple of (string, integer, string)
+    @param instructions_generate: an instructions tuple for generating a new
+        SSH key. This has to comply with the C{_DATA_CHECK} description in
+        C{ssh_update.py}.
+    @type node: string
+    @param node: name of node
+    """
+    (key_type, key_bits, suffix) = instructions_generate
+    assert key_type in constants.SSHK_ALL
+    assert key_bits > 0
+    assert isinstance(suffix, str)  # type-checked only; not used below
+
+    new_key = "new_key_%s" % node  # depends only on the node name
+    old_node_data = self._all_node_data[node]
+
+    new_node_data = self._NodeInfo(
+        uuid=old_node_data.uuid,
+        key=new_key,
+        is_potential_master_candidate=old_node_data
+          .is_potential_master_candidate,
+        is_master_candidate=old_node_data.is_master_candidate,
+        is_master=old_node_data.is_master)
+
+    self._all_node_data[node] = new_node_data
+
   def _EnsureAuthKeyFile(self, file_node_name):
     if file_node_name not in self._authorized_keys:
       self._authorized_keys[file_node_name] = set()
diff --git a/tools/cluster-merge b/tools/cluster-merge
index 926b705..8af20df 100755
--- a/tools/cluster-merge
+++ b/tools/cluster-merge
@@ -807,7 +807,9 @@
 
   (options, args) = parser.parse_args()
 
-  utils.SetupToolLogging(options.debug, options.verbose)
+  utils.SetupToolLogging(
+      options.debug, options.verbose,
+      toolname=os.path.splitext(os.path.basename(__file__))[0])
 
   if not args:
     parser.error("No clusters specified")
diff --git a/tools/move-instance b/tools/move-instance
index 8913f62..32474d5 100755
--- a/tools/move-instance
+++ b/tools/move-instance
@@ -1033,7 +1033,10 @@
   """
   (parser, options, args) = ParseOptions()
 
-  utils.SetupToolLogging(options.debug, options.verbose, threadname=True)
+  utils.SetupToolLogging(
+      options.debug, options.verbose, threadname=True,
+      toolname=os.path.splitext(os.path.basename(__file__))[0],
+      logfile=None)
 
   (src_cluster_name, dest_cluster_name, instance_names) = \
     CheckOptions(parser, options, args)
diff --git a/tools/ovfconverter b/tools/ovfconverter
index ba437c7..f13a3a9 100755
--- a/tools/ovfconverter
+++ b/tools/ovfconverter
@@ -177,7 +177,9 @@
   """
   (mode, input_path, options) = ParseOptions()
 
-  utils.SetupToolLogging(options.debug, options.verbose)
+  utils.SetupToolLogging(
+      options.debug, options.verbose,
+      toolname=os.path.splitext(os.path.basename(__file__))[0])
 
   logging.info("Chosen %s mode, reading the %s file", mode, input_path)
   assert mode in (IMPORT_MODE, EXPORT_MODE)
diff --git a/tools/post-upgrade b/tools/post-upgrade
index 4d673e0..41ca528 100644
--- a/tools/post-upgrade
+++ b/tools/post-upgrade
@@ -63,7 +63,8 @@
 
   if utils.version.IsBefore(version, 2, 13, 0):
     result = utils.RunCmd(["gnt-cluster", "renew-crypto",
-                           "--new-ssh-keys", "--no-ssh-key-check", "-f", "-d"])
+                           "--new-ssh-keys", "--no-ssh-key-check",
+                           "--verbose", "-f", "-d"])
 
     if result.failed:
       cli.ToStderr("Failed to create SSH keys: %s; Output %s" %
