Merge branch 'stable-2.16' into stable-2.17
* stable-2.16
Improve error reporting in _VerifyClientCertificates
Simplify some inscrutable map/map/ifilter/zip code
Avoid overuse of operator in watcher *.py
Sprinkle some more list comprehensions
Replace map/partial with list comprehension
Replace uses of map/lambda with more Pythonic code
Replace map(operator.attrgetter, ...) uses
Fix typos in gnt-cluster man page
Hide errors for expected inotify failures in unittest
Add gnt-instance rename --force option
Improve documentation for gnt-instance failover
Allow master failover to ignore offline nodes
Fix LogicalVolume code to work with older /sbin/lvs
Shorten verifyMasterVote failure message
Adding a confirmation before gnt-node --offline no
Removed unnecessary dependency from rpc in cli
Refactor cli exception to its appropriate module
Clean-up of code and fix of pylint warnings
Use fork instead of spawnv in the watcher
Make 'make pep8' happy
Manually fix merge conflicts in src/Ganeti/Utils.py
Signed-off-by: Brian Foley <bpfoley@google.com>
Reviewed-by: Viktor Bachraty <vbachraty@google.com>
diff --git a/.gitignore b/.gitignore
index e653ffc..0d19ea9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -167,6 +167,7 @@
/src/ganeti-kvmd
/src/ganeti-luxid
/src/ganeti-metad
+/src/ganeti-maintd
/src/ganeti-mond
/src/rpc-test
diff --git a/Makefile.am b/Makefile.am
index 75fb5d8..cb81b6a 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -151,6 +151,7 @@
src/Ganeti/JQueue \
src/Ganeti/Locking \
src/Ganeti/Logging \
+ src/Ganeti/MaintD \
src/Ganeti/Monitoring \
src/Ganeti/Metad \
src/Ganeti/Objects \
@@ -302,6 +303,8 @@
$(addsuffix /*.py[co],$(DIRS)) \
$(addsuffix /*.hi,$(HS_DIRS)) \
$(addsuffix /*.o,$(HS_DIRS)) \
+ $(addsuffix /*.dyn_hi,$(HS_DIRS)) \
+ $(addsuffix /*.dyn_o,$(HS_DIRS)) \
$(addsuffix /*.$(HTEST_SUFFIX)_hi,$(HS_DIRS)) \
$(addsuffix /*.$(HTEST_SUFFIX)_o,$(HS_DIRS)) \
$(HASKELL_PACKAGE_VERSIONS_FILE) \
@@ -352,6 +355,7 @@
src/ganeti-confd \
src/ganeti-wconfd \
src/ganeti-luxid \
+ src/ganeti-maintd \
src/ganeti-metad \
src/ganeti-mond \
.hpc/*.mix src/*.tix test/hs/*.tix *.tix \
@@ -371,7 +375,7 @@
HS_GENERATED_FILES = $(HS_PROGS) src/hluxid src/ganeti-luxid \
src/hconfd src/ganeti-confd
if ENABLE_MOND
-HS_GENERATED_FILES += src/ganeti-mond
+HS_GENERATED_FILES += src/ganeti-mond src/ganeti-maintd
endif
if ENABLE_METADATA
HS_GENERATED_FILES += src/ganeti-metad
@@ -410,6 +414,7 @@
doc/examples/systemd/ganeti-kvmd.service \
doc/examples/systemd/ganeti-luxid.service \
doc/examples/systemd/ganeti-metad.service \
+ doc/examples/systemd/ganeti-maintd.service \
doc/examples/systemd/ganeti-mond.service \
doc/examples/systemd/ganeti-noded.service \
doc/examples/systemd/ganeti-rapi.service \
@@ -660,6 +665,7 @@
doc/design-2.14.rst \
doc/design-2.15.rst \
doc/design-2.16.rst \
+ doc/design-2.17.rst \
doc/design-allocation-efficiency.rst \
doc/design-autorepair.rst \
doc/design-bulk-create.rst \
@@ -691,11 +697,15 @@
doc/design-location.rst \
doc/design-linuxha.rst \
doc/design-lu-generated-jobs.rst \
+ doc/design-macvtap.rst \
+ doc/design-memory-over-commitment.rst \
+ doc/design-migration-speed-hbal.rst \
doc/design-monitoring-agent.rst \
doc/design-move-instance-improvements.rst \
doc/design-multi-reloc.rst \
doc/design-multi-storage-htools.rst \
doc/design-multi-version-tests.rst \
+ doc/design-n-m-redundancy.rst \
doc/design-network.rst \
doc/design-network2.rst \
doc/design-node-add.rst \
@@ -769,7 +779,7 @@
src/hs2py \
src/rpc-test
if ENABLE_MOND
-HS_COMPILE_PROGS += src/ganeti-mond
+HS_COMPILE_PROGS += src/ganeti-mond src/ganeti-maintd
endif
if ENABLE_METADATA
HS_COMPILE_PROGS += src/ganeti-metad
@@ -885,6 +895,7 @@
$(patsubst src.%,--exclude Test.%,$(subst /,.,$(patsubst %.hs,%, $(HS_LIB_SRCS))))
HS_LIB_SRCS = \
+ src/Ganeti/Prelude.hs \
src/Ganeti/BasicTypes.hs \
src/Ganeti/Codec.hs \
src/Ganeti/Common.hs \
@@ -906,10 +917,12 @@
src/Ganeti/DataCollectors.hs \
src/Ganeti/DataCollectors/CLI.hs \
src/Ganeti/DataCollectors/CPUload.hs \
+ src/Ganeti/DataCollectors/Diagnose.hs \
src/Ganeti/DataCollectors/Diskstats.hs \
src/Ganeti/DataCollectors/Drbd.hs \
src/Ganeti/DataCollectors/InstStatus.hs \
src/Ganeti/DataCollectors/InstStatusTypes.hs \
+ src/Ganeti/DataCollectors/KvmRSS.hs \
src/Ganeti/DataCollectors/Lv.hs \
src/Ganeti/DataCollectors/Program.hs \
src/Ganeti/DataCollectors/Types.hs \
@@ -929,6 +942,8 @@
src/Ganeti/HTools/Cluster/AllocationSolution.hs \
src/Ganeti/HTools/Cluster/Evacuate.hs \
src/Ganeti/HTools/Cluster/Metrics.hs \
+ src/Ganeti/HTools/Cluster/MetricsComponents.hs \
+ src/Ganeti/HTools/Cluster/MetricsTH.hs \
src/Ganeti/HTools/Cluster/Moves.hs \
src/Ganeti/HTools/Cluster/Utils.hs \
src/Ganeti/HTools/Container.hs \
@@ -952,6 +967,8 @@
src/Ganeti/HTools/Program/Hsqueeze.hs \
src/Ganeti/HTools/Program/Hroller.hs \
src/Ganeti/HTools/Program/Main.hs \
+ src/Ganeti/HTools/RedundancyLevel.hs \
+ src/Ganeti/HTools/Repair.hs \
src/Ganeti/HTools/Tags.hs \
src/Ganeti/HTools/Tags/Constants.hs \
src/Ganeti/HTools/Types.hs \
@@ -981,12 +998,23 @@
src/Ganeti/Logging/Lifted.hs \
src/Ganeti/Logging/WriterLog.hs \
src/Ganeti/Luxi.hs \
+ src/Ganeti/MaintD/Autorepairs.hs \
+ src/Ganeti/MaintD/Balance.hs \
+ src/Ganeti/MaintD/CleanupIncidents.hs \
+ src/Ganeti/MaintD/CollectIncidents.hs \
+ src/Ganeti/MaintD/FailIncident.hs \
+ src/Ganeti/MaintD/HandleIncidents.hs \
+ src/Ganeti/MaintD/MemoryState.hs \
+ src/Ganeti/MaintD/Server.hs \
+ src/Ganeti/MaintD/Utils.hs \
src/Ganeti/Network.hs \
src/Ganeti/Objects.hs \
src/Ganeti/Objects/BitArray.hs \
src/Ganeti/Objects/Disk.hs \
src/Ganeti/Objects/Instance.hs \
+ src/Ganeti/Objects/HvState.hs \
src/Ganeti/Objects/Lens.hs \
+ src/Ganeti/Objects/Maintenance.hs \
src/Ganeti/Objects/Nic.hs \
src/Ganeti/OpCodes.hs \
src/Ganeti/OpCodes/Lens.hs \
@@ -1034,6 +1062,7 @@
src/Ganeti/Utils.hs \
src/Ganeti/Utils/Atomic.hs \
src/Ganeti/Utils/AsyncWorker.hs \
+ src/Ganeti/Utils/Http.hs \
src/Ganeti/Utils/IORef.hs \
src/Ganeti/Utils/Livelock.hs \
src/Ganeti/Utils/Monad.hs \
@@ -1503,7 +1532,7 @@
cp -f $< $@
if ENABLE_MOND
-nodist_sbin_SCRIPTS += src/ganeti-mond
+nodist_sbin_SCRIPTS += src/ganeti-mond src/ganeti-maintd
endif
if ENABLE_METADATA
@@ -1614,6 +1643,7 @@
daemons/ganeti-cleaner.in \
$(pkglib_python_scripts) \
devel/build_chroot \
+ devel/cert_digest.py \
devel/upload \
devel/webserver \
tools/kvm-ifup.in \
@@ -1714,6 +1744,9 @@
test/autotools/autotools-check-news.test \
test/data/htools/clean-nonzero-score.data \
test/data/htools/common-suffix.data \
+ test/data/htools/dyn1.json \
+ test/data/htools/dyn2.json \
+ test/data/htools/dyn3.json \
test/data/htools/empty-cluster.data \
test/data/htools/hail-alloc-dedicated-1.json \
test/data/htools/hail-alloc-desired-location.json \
@@ -1728,23 +1761,28 @@
test/data/htools/hail-alloc-secondary.json \
test/data/htools/hail-alloc-spindles.json \
test/data/htools/hail-alloc-twodisks.json \
+ test/data/htools/hail-alloc-memory-over-commitment.json \
test/data/htools/hail-change-group.json \
test/data/htools/hail-invalid-reloc.json \
test/data/htools/hail-node-evac.json \
test/data/htools/hail-reloc-drbd.json \
test/data/htools/hail-reloc-drbd-crowded.json \
+ test/data/htools/hbal-avoid-disk-moves.data \
test/data/htools/hbal-cpu-speed.data \
test/data/htools/hbal-desiredlocation-1.data \
test/data/htools/hbal-desiredlocation-2.data \
test/data/htools/hbal-desiredlocation-3.data \
test/data/htools/hbal-desiredlocation-4.data \
test/data/htools/hbal-dyn.data \
+ test/data/htools/hbal-dyn2.data \
test/data/htools/hbal-evac.data \
test/data/htools/hbal-excl-tags.data \
test/data/htools/hbal-forth.data \
test/data/htools/hbal-location-1.data \
test/data/htools/hbal-location-exclusion.data \
test/data/htools/hbal-location-2.data \
+ test/data/htools/hbal-memory-over-commitment.data \
+ test/data/htools/hbal-memory-over-commitment-2.data \
test/data/htools/hbal-migration-1.data \
test/data/htools/hbal-migration-2.data \
test/data/htools/hbal-migration-3.data \
@@ -1837,6 +1875,8 @@
test/data/cluster_config_2.13.json \
test/data/cluster_config_2.14.json \
test/data/cluster_config_2.15.json \
+ test/data/cluster_config_2.16.json \
+ test/data/cluster_config_2.17.json \
test/data/instance-minor-pairing.txt \
test/data/instance-disks.txt \
test/data/ip-addr-show-dummy0.txt \
diff --git a/NEWS b/NEWS
index f22825e..944fd0d 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,43 @@
====
+Version 2.17.0 beta1
+--------------------
+
+*(Released Mon, 22 Feb 2016)*
+
+Incompatible/important changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- The IAllocator protocol has been extended by a new ``hv_state`` parameter.
+ This new parameter is used to estimate the amount of memory utilized by
+ the node. It replaces ``reserved_mem`` on hypervisors other than ``xen-pvm``
+ and ``xen-hvm`` because ``reserved_mem`` was reported incorrectly on them.
+ If this ``hv_state`` parameter is not present in an iallocator input, the
+ old ``reserved_mem`` will be used.
+- Tools now log into a separate log file ``tools.log``. Also, each log
+ message of tools is now properly labelled with the name of the tool
+ that submitted the message.
+- The options ``--debug`` and ``--verbose`` of ``gnt-cluster
+ renew-crypto`` and ``gnt-node {add,remove,modify}`` now (also) control the
+ log level of the SSH calls to all nodes.
+
+New features
+~~~~~~~~~~~~
+
+- There is a new daemon, the :doc:`Ganeti Maintenance Daemon <design-repaird>`,
+ that coordinates all maintenance operations on a cluster, i.e. rebalancing,
+ activate disks, ERROR_down handling and node repair actions.
+- ``htools`` support memory over-commitment now. Look at
+ :doc:`Memory Over Commitment <design-memory-over-commitment>` for the
+ details.
+- ``hbal`` has a new option ``--avoid-disk-moves *factor*`` that allows disk
+ moves only if the gain in the cluster metrics is ``*factor*`` times higher
+ than with no disk moves.
+- ``hcheck`` reports the level of redundancy for each node group as a new output
+ parameter, see :doc:`N+M Redundancy <design-n-m-redundancy>`.
+
+
Version 2.16.0 rc1
------------------
diff --git a/README b/README
index 4327d89..6f3e88b 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-Ganeti 2.16
+Ganeti 2.17
===========
For installation instructions, read the INSTALL and the doc/install.rst
diff --git a/configure.ac b/configure.ac
index 4d57798..e9be40a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,8 +1,8 @@
# Configure script for Ganeti
m4_define([gnt_version_major], [2])
-m4_define([gnt_version_minor], [16])
+m4_define([gnt_version_minor], [17])
m4_define([gnt_version_revision], [0])
-m4_define([gnt_version_suffix], [~rc1])
+m4_define([gnt_version_suffix], [~beta1])
m4_define([gnt_version_full],
m4_format([%d.%d.%d%s],
gnt_version_major, gnt_version_minor,
diff --git a/daemons/daemon-util.in b/daemons/daemon-util.in
index 6af85c2..0cdbbe5 100644
--- a/daemons/daemon-util.in
+++ b/daemons/daemon-util.in
@@ -56,7 +56,7 @@
}
if _mond_enabled; then
- DAEMONS+=( ganeti-mond )
+ DAEMONS+=( ganeti-mond ganeti-maintd)
fi
# The full list of all daemons we know about
@@ -111,6 +111,9 @@
metad)
echo "@GNTMETADUSER@:@GNTMETADGROUP@"
;;
+ maintd)
+ echo "@GNTMONDUSER@:@GNTMONDGROUP@"
+ ;;
*)
echo "root:@GNTDAEMONSGROUP@"
;;
diff --git a/devel/build_chroot b/devel/build_chroot
index b6a6379..d1160f6 100755
--- a/devel/build_chroot
+++ b/devel/build_chroot
@@ -42,10 +42,13 @@
SHA1_LIST='
cabal-install-1.18.0.2.tar.gz 2d1f7a48d17b1e02a1e67584a889b2ff4176a773
cabal-install-1.22.4.0.tar.gz b98eea96d321cdeed83a201c192dac116e786ec2
+cabal-install-1.22.6.0.tar.gz d474b0eef6944af1abef92419cea13cee50993f3
ghc-7.6.3-i386-unknown-linux.tar.bz2 f042b4171a2d4745137f2e425e6949c185f8ea14
ghc-7.6.3-x86_64-unknown-linux.tar.bz2 46ec3f3352ff57fba0dcbc8d9c20f7bcb6924b77
ghc-7.8.4-i386-unknown-linux-deb7.tar.bz2 4f523f854c37a43b738359506a89a37a9fa9fc5f
ghc-7.8.4-x86_64-unknown-linux-deb7.tar.bz2 3f68321b064e5c1ffcb05838b85bcc00aa2315b4
+ghc-7.10.2-i386-unknown-linux-deb7.tar.bz2 c759ab9af566f5c3c9b75b702615f1d0c2f999fd
+ghc-7.10.2-x86_64-unknown-linux-deb7.tar.bz2 f028e4a07995353a47286478fc8644f66defa227
'
# export all variables needed in the schroot
@@ -407,6 +410,136 @@
'hlint>=1.9.12'
;;
+ jessie-ghc710)
+
+ GHC_VERSION="7.10.2"
+ GHC_VARIANT="-deb7"
+ CABAL_INSTALL_VERSION="1.22.6.0"
+ # the version of the Cabal library below must match the version used by
+ # CABAL_INSTALL_VERSION, see the dependencies of cabal-install
+ CABAL_LIB_VERSION=">=1.22.2 && <1.23"
+ export GHC_VERSION GHC_VARIANT CABAL_INSTALL_VERSION
+
+ in_chroot -- \
+ $APT_INSTALL \
+ autoconf automake \
+ zlib1g-dev \
+ libgmp3-dev \
+ libcurl4-openssl-dev \
+ libpcre3-dev \
+ happy \
+ hlint hscolour pandoc \
+ shelltestrunner \
+ graphviz qemu-utils \
+ python-docutils \
+ python-simplejson \
+ python-pyparsing \
+ python-pyinotify \
+ python-pycurl \
+ python-ipaddr \
+ python-yaml \
+ python-paramiko \
+ git \
+ git-email \
+ vim
+
+ in_chroot -- \
+ $APT_INSTALL python-setuptools python-dev build-essential
+
+ in_chroot -- \
+ easy_install \
+ logilab-astng==0.24.1 \
+ logilab-common==0.58.3 \
+ mock==1.0.1 \
+ pylint==0.26.0
+
+ in_chroot -- \
+ easy_install \
+ sphinx==1.1.3 \
+ pep8==1.3.3 \
+ coverage==3.4 \
+ bitarray==0.8.0
+
+ install_ghc
+
+ install_cabal
+
+ in_chroot -- \
+ cabal update
+
+ in_chroot -- \
+ cabal install --global \
+ HUnit-1.2.5.2 \
+ PSQueue-1.1 \
+ StateVar-1.1.0.0 \
+ ansi-terminal-0.6.2.1 \
+ ansi-wl-pprint-0.6.7.2 \
+ base-orphans-0.4.1 \
+ base64-bytestring-1.0.0.1 \
+ blaze-builder-0.4.0.1 \
+ bytestring-builder-0.10.6.0.0 \
+ bytestring-mmap-0.2.2 \
+ curl-1.3.8 \
+ enumerator-0.4.20 \
+ extensible-exceptions-0.1.1.4 \
+ hashable-1.2.3.3 \
+ case-insensitive-1.2.0.4 \
+ hinotify-0.3.7 \
+ hostname-1.0 \
+ hslogger-1.2.9 \
+ monads-tf-0.1.0.2 \
+ MonadCatchIO-transformers-0.3.1.3 \
+ nats-1 \
+ parallel-3.2.0.6 \
+ prelude-extras-0.4 \
+ primitive-0.6 \
+ reflection-2 \
+ regex-base-0.93.2 \
+ regex-pcre-0.94.4 \
+ regex-posix-0.95.2 \
+ scientific-0.3.3.8 \
+ attoparsec-0.12.1.6 \
+ attoparsec-enumerator-0.3.4 \
+ streaming-commons-0.1.12.1 \
+ blaze-builder-enumerator-0.2.1.0 \
+ syb-0.5.1 \
+ json-0.9.1 \
+ tagged-0.8.1 \
+ tf-random-0.5 \
+ QuickCheck-2.7.6 \
+ Crypto-4.2.5.1 \
+ transformers-compat-0.4.0.4 \
+ distributive-0.4.4 \
+ exceptions-0.8.0.2 \
+ temporary-1.2.0.3 \
+ transformers-base-0.4.4 \
+ monad-control-1.0.0.4 \
+ lifted-base-0.2.3.6 \
+ unix-compat-0.4.1.4 \
+ unordered-containers-0.2.5.1 \
+ semigroups-0.16.2.2 \
+ bifunctors-5 \
+ utf8-string-0.3.8 \
+ vector-0.11.0.0 \
+ void-0.7 \
+ contravariant-1.3.2 \
+ comonad-4.2.7.2 \
+ profunctors-5.1.1 \
+ semigroupoids-5.0.0.2 \
+ free-4.12.1 \
+ adjunctions-4.2.1 \
+ kan-extensions-4.2.2 \
+ lens-4.12.3 \
+ xml-1.3.14 \
+ test-framework-0.8.1.1 \
+ test-framework-hunit-0.3.0.1 \
+ test-framework-quickcheck2-0.3.0.3 \
+ zlib-bindings-0.1.1.5 \
+ zlib-enum-0.2.3.1 \
+ snap-core-0.9.7.2 \
+ snap-server-0.9.5.1 \
+;;
+
jessie-ghc78)
GHC_VERSION="7.8.4"
@@ -560,7 +693,8 @@
test-framework-0.8.0.3 \
test-framework-hunit-0.3.0.1 \
test-framework-quickcheck2-0.3.0.2 \
- 'transformers>=0.3.0.0'
+ 'transformers>=0.3.0.0' \
+ zlib-0.5.4.2
;;
*)
diff --git a/devel/cert_digest.py b/devel/cert_digest.py
new file mode 100755
index 0000000..683fbd3
--- /dev/null
+++ b/devel/cert_digest.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python
+
+# Copyright (C) 2015 Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# This is a test script to ease debugging of SSL problems. It can be
+# applied on any of Ganeti's SSL certificates (for example client.pem
+# and server.pem) and will output a digest.
+
+import sys
+import OpenSSL
+
+
+def usage():
+ print "%s filename" % sys.argv[0]
+ print
+ print "'filename' must be a filename of an SSL certificate in PEM format."
+
+
+if __name__ == "__main__":
+
+ if len(sys.argv) < 2:
+ usage()
+
+ cert_fd = open(sys.argv[1], "r")
+ cert_plain = cert_fd.read()
+
+ print "Certificate:"
+ print cert_plain
+
+ cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
+ cert_plain)
+
+ print "Digest:"
+ print cert.digest("sha1")
+
diff --git a/doc/design-2.17.rst b/doc/design-2.17.rst
new file mode 100644
index 0000000..bd1414f
--- /dev/null
+++ b/doc/design-2.17.rst
@@ -0,0 +1,10 @@
+==================
+Ganeti 2.17 design
+==================
+
+The following designs' implementations were completed in Ganeti 2.17.
+
+- :doc:`design-memory-over-commitment`
+- :doc:`design-migration-speed-hbal`
+- :doc:`design-n-m-redundancy`
+- :doc:`design-repaird`
\ No newline at end of file
diff --git a/doc/design-draft.rst b/doc/design-draft.rst
index b2ce6a2..e7c47a3 100644
--- a/doc/design-draft.rst
+++ b/doc/design-draft.rst
@@ -2,7 +2,7 @@
Design document drafts
======================
-.. Last updated for Ganeti 2.16
+.. Last updated for Ganeti 2.17
.. toctree::
:maxdepth: 2
@@ -24,7 +24,7 @@
design-network2.rst
design-configlock.rst
design-multi-storage-htools.rst
- design-repaird.rst
+ design-macvtap.rst
design-scsi-kvm.rst
design-disks.rst
diff --git a/doc/design-macvtap.rst b/doc/design-macvtap.rst
new file mode 100644
index 0000000..1440ab9
--- /dev/null
+++ b/doc/design-macvtap.rst
@@ -0,0 +1,266 @@
+===============
+MacVTap support
+===============
+
+.. contents:: :depth: 3
+
+This is a design document detailing the implementation of `MacVTap`
+support in Ganeti. The initial implementation targets the KVM
+hypervisor, but it is intended to be ported to the XEN hypervisor as
+well.
+
+Current state and shortcomings
+==============================
+
+Currently, Ganeti provides a number of options for networking a virtual
+machine, that are the ``bridged``, ``routed``, and ``openvswitch``
+modes. ``MacVTap``, is another virtual network interface in Linux, that
+is not supported by Ganeti and that could be added to the currently
+supported solutions. It is an interface that acts as a regular TUN/TAP
+device, and thus it is transparently supported by QEMU. Because of its
+design, it can greatly simplify Ganeti setups using bridged instances.
+
+In brief, the MacVTap interface is based on the ``MacVLan`` Linux
+driver, which basically allows a single physical interface to be
+associated with multiple IPs and MAC addresses. It is meant to replace
+the combination of the TUN/TAP and bridge drivers with a more
+lightweight setup that doesn't require any extra configuration on the
+host. MacVTap driver is supposed to be more efficient than using a
+regular bridge. Unlike bridges, it doesn't need to do STP or to
+discover/learn MAC addresses of other connected devices on a given
+domain, as it knows every MAC address it can receive. In fact, it
+introduces a bridge-like behavior for virtual machines but without the
+need to have a real bridge setup on the host. Instead, each virtual
+interface extends an existing network device by attaching directly to
+it, having its own MAC address, and providing a separate virtual
+interface to be used by the userspace processes. The MacVTap MAC address
+is used on the external network and the guest OS cannot spoof or change
+that address.
+
+Background
+==========
+
+This section provides some extra information for the MacVTap interface,
+that we took into account for the rest of this design document.
+
+MacVTap modes of operation
+--------------------------
+
+A MacVTap device can operate in one of four modes, just like the MacVLan
+driver does. These modes determine how the tap endpoints communicate
+between each other providing various levels of isolation between them.
+Those modes are the following:
+
+* `VEPA (Virtual Ethernet Port Aggregator) mode`: The default mode that
+ is compatible with virtualization-enabled switches. The communication
+ between endpoints on the same lower device, happens through the
+ external switch.
+
+* `Bridge mode`: It works almost like a traditional bridge, connecting
+ all endpoints directly to each other.
+
+* `Private mode`: An endpoint in this mode can never communicate to any
+ other endpoint on the same lower device.
+
+* `Passthru mode`: This mode was added later to work on some limitations
+ on MacVLans (more details here_).
+
+MacVTap internals
+-----------------
+
+The creation of a MacVTap device is *not* done by opening the
+`/dev/net/tun` device and issuing a corresponding `ioctl()` to register
+a network device as happens in tap devices. Instead, there are two ways
+to create a MacVTap device. The first one is using the `rtnetlink(7)`
+interface directly, just like the `libvirt` or the `iproute2` utilities
+do, and the second one is to use the high-level `ip-link` command. Since
+creating a MacVTap interface programmatically using the netlink protocol
+is a bit more complicated than creating a normal TUN/TAP device, we
+propose using the ip-link tool for the MacVTap handling, which is
+much simpler and straightforward in use, and also fulfills all our
+needs. Additionally, since Ganeti already depends on `iproute2` being
+installed in the system, this does not introduce an extra dependency.
+
+The following example, creates a MacVTap device using the `ip-link`
+tool, named `macvtap0`, operating in `bridge` mode, and which is using
+`eth0` as its lower device:
+
+::
+
+ ip link add link eth0 name macvtap0 address 1a:36:1b:aa:b3:77 type macvtap mode bridge
+
+Once a MacVTap interface is created, an actual character device appears
+under `/dev`, called ``/dev/tapXX``, where ``XX`` is the interface index
+of the device.
+
+Proposed changes
+================
+
+In order to be able to create instances using the MacVTap device driver,
+we propose some modifications that affect the ``nicparams`` slot of the
+Ganeti's configuration ``NIC`` object, and also the code part regarding
+to the KVM hypervisor, as detailed in the following sections.
+
+Configuration changes
+---------------------
+
+The nicparams ``mode`` attribute will be extended to support the
+``macvtap`` mode. When using the MacVTap mode, the ``link`` attribute
+will specify the network device where the MacVTap interfaces will be
+attached to, the *lower device*. Note that the lower device should
+exist; otherwise the operation will fail. If no link is specified, the
+cluster-wide default NIC `link` param will be used instead.
+
+We propose the MacVTap mode to be configurable, and so the nicparams
+object will be extended with an extra slot named ``mvtap_mode``. This
+parameter will only be used if the network mode is set to MacVTap since
+it does not make sense in other modes, similarly to the `vlan` slot of
+the `openvswitch` mode.
+
+Below there is a snippet of some of the ``gnt-network`` commands'
+output:
+
+Network connection
+~~~~~~~~~~~~~~~~~~
+
+::
+
+ gnt-network connect -N mode=macvtap,link=eth0,mvtap_mode=bridge vtap-net vtap_group
+
+Network listing
+~~~~~~~~~~~~~~~
+
+::
+
+ gnt-network list
+
+ Network Subnet Gateway MacPrefix GroupList
+ br-net 10.48.1.0/24 10.48.1.254 - default (bridged, br0, , )
+ vtap-net 192.168.100.0/24 192.168.100.1 - vtap_group (macvtap, eth0, , bridge)
+
+Network information
+~~~~~~~~~~~~~~~~~~~
+
+::
+
+ gnt-network info
+
+ Network name: vtap-net
+ UUID: 4f139b48-3f08-46b1-911f-d37de7e12dcf
+ Serial number: 1
+ Subnet: 192.168.100.0/28
+ Gateway: 192.168.100.1
+ IPv6 Subnet: 2001:db8:2ffc::/64
+ IPv6 Gateway: 2001:db8:2ffc::1
+ Mac Prefix: None
+ size: 16
+ free: 10 (62.50%)
+ usage map:
+ 0 XXXXX..........X 63
+ (X) used (.) free
+ externally reserved IPs:
+ 192.168.100.0, 192.168.100.1, 192.168.100.15
+ connected to node groups:
+ vtap_group (mode:macvtap link:eth0 vlan: mvtap_mode:bridge)
+ used by 2 instances:
+ inst1.example.com: 0:192.168.100.2
+ inst2.example.com: 0:192.168.100.3
+
+
+Hypervisor changes
+------------------
+
+A new method will be introduced in the KVM's `netdev.py` module, named
+``OpenVTap``, similar to the ``OpenTap`` method, that will be
+responsible for creating a MacVTap device using the `ip-link` command,
+and returning its file descriptor. The ``OpenVtap`` method will receive
+as arguments the network's `link`, the mode of the MacVTap device
+(``mvtap_mode``), and also the ``interface name`` of the device to be
+created, otherwise we will not be able to retrieve it, and so opening
+the created device.
+
+Since we want the names among the MacVTap devices to be unique on the
+same node, we will make use of the existing ``_GenerateKvmTapName``
+method to generate device names but with some modifications, to be
+adapted to our needs. This method is actually a wrapper over the
+``GenerateTapName`` method which currently is being used to generate TAP
+interface names for NICs meant to be used in instance communication
+using the ``gnt.com`` prefix. We propose extending this method to
+generate names for the MacVTap interface too, using the ``vtap`` prefix.
+To do so, we could add an extra boolean argument in that method, named
+`inst_comm`, to differentiate the two cases, so that the method will
+return the appropriate name depending on its usage. This argument will
+be optional and defaulted to `True`, to not affect the existing API.
+
+Currently, the `OpenTap` method handles the `vhost-net`, `mq`, and the
+`vnet_hdr` features. The `vhost-net` feature will be normally supported
+for the MacVTap devices too, and so is the `multiqueue` feature, which
+can be enabled using the `numrxqueues` and `numtxqueues` parameters of
+the `ip-link` command. The only drawback seems to be the `vnet_hdr`
+feature modification. For a MacVTap device this flag is enabled by
+default, and it can not be disabled if a user requests to.
+
+A last hypervisor change will be the introduction of a new method named
+``_RemoveStaleMacvtapDevs`` that will remove any remaining MacVTap
+devices, and which is detailed in the following section.
+
+Tools changes
+-------------
+
+Some of the Ganeti tools should also be extended to support MacVTap
+devices. Those are the ``kvm-ifup`` and ``net-common`` scripts. These
+modifications will include a new method named ``setup_macvtap`` that
+will simply change the device status to `UP` just before an instance is
+started:
+
+::
+
+ ip link set $INTERFACE up
+
+As mentioned in the `Background` section, MacVTap devices are
+persistent. So, we have to manually delete the MacVTap device after an
+instance shutdown. To do so, we propose creating a ``kvm-ifdown``
+script, that will be invoked after an instance shutdown in order to
+remove the relevant MacVTap devices. The ``kvm-ifdown`` script should
+explicitly call the following commands and currently will be functional
+for MacVTap NICs only:
+
+::
+
+ ip link set $INTERFACE down
+ ip link delete $INTERFACE
+
+To be able to call the `kvm-ifdown` script we should extend the KVM's
+``_ConfigureNIC`` method with an extra argument that is the name of the
+script to be invoked, instead of calling by default the `kvm-ifup`
+script, as it currently happens.
+
+The invocation of the `kvm-ifdown` script will be made through a
+separate method that we will create, named ``_RemoveStaleMacvtapDevs``.
+This method will read the NIC runtime files of an instance and will
+remove any devices using the MacVTap interface. This method will be
+included in the ``CleanupInstance`` method in order to cover all the
+cases where an instance using MacVTap NICs needs to be cleaned up.
+
+Besides the instance shutdown, there are a couple of cases where the
+MacVTap NICs will need to be cleaned up too. In case of an internal
+instance shutdown, where the ``kvmd`` is not enabled, the instance will
+be in ``ERROR_DOWN`` state. In that case, when the instance is started
+either by the `ganeti-watcher` or by the admin, the ``CleanupInstance``
+method, and consequently the `kvm-ifdown` script, will not be called and
+so the MacVTap NICs will have to manually be deleted. Otherwise starting
+the instance will result in more than one MacVTap device using the same
+MAC address. An instance migration is another case where deleting an
+instance will keep stale MacVTap devices on the source node. In order
+to solve those potential issues, we will explicitly call the
+``_RemoveStaleMacvtapDevs`` method after a successful instance migration
+on the source node, and also before creating a new device for a NIC that
+is using the MacVTap interface to remove any stale devices.
+
+.. _here: http://thread.gmane.org/gmane.comp.emulators.kvm.devel/61824/)
+
+.. vim: set textwidth=72 :
+.. Local Variables:
+.. mode: rst
+.. fill-column: 72
+.. End:
diff --git a/doc/design-memory-over-commitment.rst b/doc/design-memory-over-commitment.rst
new file mode 100644
index 0000000..281a6ef
--- /dev/null
+++ b/doc/design-memory-over-commitment.rst
@@ -0,0 +1,181 @@
+======================
+Memory Over Commitment
+======================
+
+.. contents:: :depth: 4
+
+This document describes the proposed changes to support memory
+overcommitment in Ganeti.
+
+Background
+==========
+
+Memory is a non-preemptable resource, and thus cannot be shared, e.g.,
+in a round-robin fashion. Therefore, Ganeti is very careful to make
+sure there is always enough physical memory for the memory promised
+to the instances. In fact, even in an N+1 redundant way: should one
+node fail, its instances can be relocated to other nodes while still
+having enough physical memory for the memory promised to all instances.
+
+Overview over the current memory model
+--------------------------------------
+
+To make decisions, ``htools`` query the following parameters from Ganeti.
+
+- The amount of memory used by each instance. This is the state-of-record
+ backend parameter ``maxmem`` for that instance (maybe inherited from
+  group-level or cluster-level backend parameters). It tells the hypervisor
+ the maximal amount of memory that instance may use.
+
+- The state-of-world parameters for the node memory. They are collected
+ live and are hypervisor specific. The following parameters are collected.
+
+ - memory_total: the total memory size on the node
+
+ - memory_free: the available memory on the node for instances
+
+ - memory_dom0: the memory used by the node itself, if available
+
+ For Xen, the amount of total and free memory are obtained by parsing
+ the output of Xen ``info`` command (e.g., ``xm info``). The dom0
+ memory is obtained by looking in the output of the ``list`` command
+ for ``Domain-0``.
+
+  For the ``kvm`` hypervisor, all these parameters are obtained by
+  reading ``/proc/meminfo``, where the entries ``MemTotal`` and
+ ``Active`` are considered the values for ``memory_total`` and
+ ``memory_dom0``, respectively. The value for ``memory_free`` is
+ taken as the sum of the entries ``MemFree``, ``Buffers``, and ``Cached``.
+
+
+Current state and shortcomings
+==============================
+
+While the current model of never over committing memory serves well
+to provide reliability guarantees to instances, it does not suit well
+situations were the actual use of memory in the instances is spiky. Consider
+a scenario where instances only touch a small portion of their memory most
+of the time, but occasionally use a large amount of memory. Then, at any moment,
+a large fraction of the memory used for the instances sits around without
+being actively used. By swapping out the not actively used memory, resources
+can be used more efficiently.
+
+Proposed changes
+================
+
+We propose to support over commitment of memory if desired by the
+administrator. Memory will change from being a hard constraint to
+being a question of policy. The default will be not to over commit
+memory.
+
+Extension of the policy by a new parameter
+------------------------------------------
+
+The instance policy is extended by a new real-number field ``memory-ratio``.
+Policies on groups inherit this parameter from the cluster wide policy in the
+same way as all other parameters of the instance policy.
+
+When a cluster is upgraded from an earlier version not containing
+``memory-ratio``, the value ``1.0`` is inserted for this new field in
+the cluster-level ``ipolicy``; in this way, the status quo of not over
+committing memory is preserved via upgrades. The ``gnt-cluster
+modify`` and ``gnt-group modify`` commands are extended to allow
+setting of the ``memory-ratio``.
+
+The ``htools`` text format is extended to also contain this new
+ipolicy parameter. It is added as an optional entry at the end of the
+parameter list of an ipolicy line, to remain backwards compatible.
+If the parameter is missing, the value ``1.0`` is assumed.
+
+Changes to the memory reporting on non ``xen-hvm`` and ``xen-pvm``
+------------------------------------------------------------------
+
+For all hypervisors ``memory_dom0`` corresponds to the amount of memory used
+by Ganeti itself and all other non-hypervisor processes running on this node.
+The amount of memory currently reported for ``memory_dom0`` on hypervisors
+other than ``xen-hvm`` and ``xen-pvm``, however, includes the amount of active
+memory of the hypervisor processes. This is in conflict with the underlying
+assumption that ``memory_dom0`` memory is not available for instances.
+
+Therefore, for hypervisors other than ``xen-pvm`` and ``xen-hvm`` we will use
+a new state-of-record hypervisor parameter called ``mem_node`` in htools
+instead of the reported ``memory_dom0``. As a hypervisor state parameter, it is
+run-time tunable and inheritable at group and cluster levels. If this parameter
+is not present, a default value of ``1024M`` will be used, which is a
+conservative estimate of the amount of memory used by Ganeti on a medium-sized
+cluster. The reason for using a state-of-record value is to have a stable
+amount of reserved memory, irrespective of the current activity of Ganeti.
+
+Currently, hypervisor state parameters are partly implemented but not used
+by Ganeti.
+
+Changes to the memory policy
+----------------------------
+
+The memory policy will be changed in that we assume that one byte
+of physical node memory can hold ``memory-ratio`` bytes of instance
+memory, but still only one byte of Ganeti memory. Of course, in practice
+this has to be backed by swap space; it is the administrator's responsibility
+to ensure that each node has swap of at
+least ``(memory-ratio - 1.0) * (memory_total - memory_dom0)``. Ganeti
+will warn if the amount of swap space is not big enough.
+
+
+The new memory policy will be as follows.
+
+- The difference between the total memory of a node and its dom0
+ memory will be considered the amount of *available memory*.
+
+- The amount of *used memory* will be (as is now) the sum of
+  the memory of all instances and the reserved memory.
+
+- The *relative memory usage* is the fraction of used and available
+ memory. Note that the relative usage can be bigger than ``1.0``.
+
+- The memory-related constraint for instance placement is that
+ afterwards the relative memory usage be at most the
+ memory-ratio. Again, if the ratio of the memory of the real
+ instances on the node to available memory is bigger than the
+ memory-ratio this is considered a hard violation, otherwise
+ it is considered a soft violation.
+
+- The definition of N+1 redundancy (including
+ :doc:`design-shared-storage-redundancy`) is kept literally as is.
+ Note, however, that the meaning does change, as the definition depends
+ on the notion of allowed moves, which is changed by this proposal.
+
+
+Changes to cluster verify
+-------------------------
+
+The only place where the Ganeti core handles memory is
+when ``gnt-cluster verify`` verifies N+1 redundancy. This code will be changed
+to follow the new memory model.
+
+Additionally, ``gnt-cluster verify`` will warn if the sum of available memory
+and swap space is not at least as big as the used memory.
+
+Changes to ``htools``
+---------------------
+
+The underlying model of the cluster will be changed in accordance with
+the suggested change of the memory policy. As all higher-level ``htools``
+operations go through only the primitives of adding/moving an instance
+if possible, and inspecting the cluster metrics, changing the base
+model will make all ``htools`` compliant with the new memory model.
+
+Balancing
+---------
+
+The cluster metric components will not be changed. Note the standard
+deviation of relative memory usage is already one of the components.
+For dynamic (load-based) balancing, the amount of not immediately
+discardable memory will serve as an indication of memory activity;
+as usual, the measure will be the standard deviation of the relative
+value (i.e., the ratio of non-discardable memory to available
+memory). The weighting for this metric component will have to be
+determined by experimentation and will depend on the memory ratio;
+for a memory ratio of ``1.0`` the weight will be ``0.0``, as memory
+need not be taken into account if no over-commitment is in place.
+For memory ratios bigger than ``1.0``, the weight will be positive
+and grow with the ratio.
diff --git a/doc/design-migration-speed-hbal.rst b/doc/design-migration-speed-hbal.rst
new file mode 100644
index 0000000..a0dcfe0
--- /dev/null
+++ b/doc/design-migration-speed-hbal.rst
@@ -0,0 +1,28 @@
+==================================
+Migration speed accounting in Hbal
+==================================
+
+.. contents:: :depth: 2
+
+Hbal usually performs a complex sequence of moves during cluster balancing in
+order to achieve a locally optimal cluster state. Unfortunately, each move may
+take a significant amount of time. Thus, during the sequence of moves the
+situation on the cluster may change (e.g., because a new instance is added or
+because instance or node parameters change) and the desired moves can become
+unprofitable.
+
+Usually disk moves become a bottleneck and require a significant amount of time.
+:ref:`Instance move improvements <move-performance>` considers
+disk move speed in more detail. Currently, ``hbal`` has a ``--no-disk-moves``
+option preventing disk moves during cluster balancing in order to perform fast
+(but of course non-optimal) balancing. It may be useful, but ideally we need to
+find a balance between optimal configuration and time to reach this
+configuration.
+
+Avoiding insignificant disk moves
+=================================
+
+Allowing only profitable enough disk moves may become a first step to reach
+a compromise between move speed and optimal scoring. This can be implemented
+by introducing ``--avoid-disk-moves *FACTOR*`` option which will admit disk
+moves only if the gain in the cluster metrics is *FACTOR* times
+higher than the gain achievable by non disk moves.
diff --git a/doc/design-move-instance-improvements.rst b/doc/design-move-instance-improvements.rst
index c64b4bf..6948fd8 100644
--- a/doc/design-move-instance-improvements.rst
+++ b/doc/design-move-instance-improvements.rst
@@ -31,6 +31,8 @@
aspects of the problem, they do not exclude each other and will be presented
independently.
+.. _move-performance:
+
The performance of Ganeti moves
===============================
diff --git a/doc/design-n-m-redundancy.rst b/doc/design-n-m-redundancy.rst
new file mode 100644
index 0000000..4536f4c
--- /dev/null
+++ b/doc/design-n-m-redundancy.rst
@@ -0,0 +1,77 @@
+===========================
+Checking for N+M redundancy
+===========================
+
+.. contents:: :depth: 4
+
+This document describes how the level of redundancy is estimated
+in Ganeti.
+
+
+Current state and shortcomings
+==============================
+
+Ganeti keeps the cluster N+1 redundant, also taking into account
+:doc:`design-shared-storage-redundancy`. In other words, Ganeti
+tries to keep the cluster in a state, where after failure of a single
+node, no matter which one, all instances can be started immediately.
+However, e.g., for planning
+maintenance, it is sometimes desirable to know how many node
+losses the cluster can recover from. This is also useful information
+when operating big clusters and expecting long times for hardware repair.
+
+
+Proposed changes
+================
+
+Higher redundancy as a sequential concept
+-----------------------------------------
+
+The intuitive meaning of an N+M redundant cluster is that M nodes can
+fail without instances being lost. However, when DRBD is used, the failure
+of just 2 nodes can already cause the complete loss of an instance.
+Therefore, the best we can hope for is to be able to recover from M
+sequential failures.
+This intuition that a cluster is N+M redundant, if M nodes can fail one-by-one,
+leaving enough time for a rebalance in between, without losing instances, is
+formalized in the next definition.
+
+Definition of N+M redundancy
+----------------------------
+
+We keep the definition of :doc:`design-shared-storage-redundancy`. Moreover,
+for M a non-negative integer, we define a cluster to be N+(M+2) redundant,
+if after draining any node the standard rebalancing procedure (as, e.g.,
+provided by `hbal`) will fully evacuate that node and result in an N+(M+1)
+redundant cluster.
+
+Independence of Groups
+----------------------
+
+Immediately from the definition, we see that the redundancy level, i.e.,
+the maximal M such that the cluster is N+M redundant, can be computed
+in a group-by-group manner: the standard balancing algorithm will never
+move instances between node groups. The redundancy level of the cluster
+is then the minimum of the redundancy level of the independent groups.
+
+Estimation of the redundancy level
+----------------------------------
+
+The definition of N+M redundancy requires considering M failures in
+arbitrary order, thus considering super-exponentially many cases for
+large M. As, however, balancing moves instances anyway, the redundancy
+level mainly depends on the amount of node resources available to the
+instances in a node group. So we can get a good approximation of the
+redundancy level of a node group by only considering draining the largest
+node in that group. This is how Ganeti will estimate the redundancy level.
+
+Modifications to existing tools
+-------------------------------
+
+As redundancy levels higher than N+1 are mainly about planning capacity,
+the level of redundancy only needs to be computed on demand. Hence, we
+keep the tool changes minimal.
+
+- ``hcheck`` will report the level of redundancy for each node group as
+ a new output parameter
+
+The rest of Ganeti will not be changed.
diff --git a/doc/design-node-security.rst b/doc/design-node-security.rst
index 1215277..f4f10aa 100644
--- a/doc/design-node-security.rst
+++ b/doc/design-node-security.rst
@@ -129,48 +129,19 @@
access and a compromised normal node, one can make this node a master
candidate and then still have the power to compromise the whole cluster.
-To mitigate this issue, we propose the following changes:
+Various options have been explored to mitigate this, with no feasible
+solution so far. We generally advise to not expose RAPI to the Internet.
+For more details on making Ganeti secure, see :doc:`security`.
-- Add a flag ``master_capability_rapi_modifiable`` to the cluster
- configuration which indicates whether or not it should be possible
- to modify the ``master_capable`` flag of nodes via RAPI. The flag is
- set to ``False`` by default and can itself only be changed on the
- commandline. In this design doc, we refer to the flag as the
- "rapi flag" from here on.
-- Only if the ``master_capabability_rapi_modifiable`` switch is set to
- ``True``, it is possible to modify the master-capability flag of
- nodes.
-
-With this setup, there are the following definitions of "potential
-master candidates" depending on the rapi flag:
-
-- If the rapi flag is set to ``True``, all cluster nodes are potential
- master candidates, because as described above, all of them can
- eventually be made master candidates via RAPI and thus security-wise,
- we haven't won anything above the current SSH handling.
-- If the rapi flag is set to ``False``, only the master capable nodes
- are considered potential master candidates, as it is not possible to
- make them master candidates via RAPI at all.
-
-Note that when the rapi flag is changed, the state of the
-``ganeti_pub_keys`` file on all nodes has to be updated accordingly.
-This should be done in the client script ``gnt_cluster`` before the
-RPC call to update the configuration is made, because this way, if
-someone would try to perform that RPC call on master to trick it into
-thinking that the flag is enabled, this would not help as the content of
-the ``ganeti_pub_keys`` file is a crucial part in the design of the
-distribution of the SSH keys.
-
-Note: One could think of always allowing to disable the master-capability
-via RAPI and just restrict the enabling of it, thus making it possible
-to RAPI-"freeze" the nodes' master-capability state once it disabled.
-However, we think these are rather confusing semantics of the involved
-flags and thus we go with proposed design.
-
-Note that this change will break RAPI compatibility, at least if the
-rapi flag is not explicitely set to ``True``. We made this choice to
-have the more secure option as default, because otherwise it is
-unlikely to be widely used.
+Alternatively, there was the idea of adding a flag to the cluster config
+that would 'freeze' the ``master_capable`` state of nodes. This turned
+out to be infeasible, as promoting a node from not ``master_capable``
+to ``master_capable`` would mean to add the node's key to the
+``ganeti_pub_keys`` file. Due to security reasons, this needed to be
+done in the client (similar to when adding a node). That would have
+meant that it would no longer be possible to set this flag via RAPI. As
+setting this flag via RAPI is a feature our users depend on and that
+has been available in the past, we refrain from breaking this feature.
Cluster initialization
diff --git a/doc/design-repaird.rst b/doc/design-repaird.rst
index 6dad3e7..67fe45b 100644
--- a/doc/design-repaird.rst
+++ b/doc/design-repaird.rst
@@ -189,7 +189,7 @@
Returns a list of all non-cleared incidents. Each incident is reported
as a JSON object with at least the following information.
-- ``id`` The unique identifier assigned to the event.
+- ``uuid`` The unique identifier assigned to the event.
- ``node`` The UUID of the node on which the event was observed.
diff --git a/doc/examples/ganeti.cron.in b/doc/examples/ganeti.cron.in
index eedb58b..ad5c79c 100644
--- a/doc/examples/ganeti.cron.in
+++ b/doc/examples/ganeti.cron.in
@@ -3,8 +3,11 @@
# On reboot, continue a Ganeti upgrade, if one was in progress
@reboot root [ -x @SBINDIR@/gnt-cluster ] && @SBINDIR@/gnt-cluster upgrade --resume
-# Restart failed instances (every 5 minutes)
-*/5 * * * * root [ -x @SBINDIR@/ganeti-watcher ] && @SBINDIR@/ganeti-watcher
+# Restart failed instances (in non-strict mode every 5 minutes)
+5-25/5,35-55/5 * * * * root [ -x @SBINDIR@/ganeti-watcher ] && @SBINDIR@/ganeti-watcher --no-strict
+
+# Restart failed instances (in strict mode every 30 minutes)
+*/30 * * * * root [ -x @SBINDIR@/ganeti-watcher ] && @SBINDIR@/ganeti-watcher
# Clean job archive (at 01:45 AM)
45 1 * * * @GNTMASTERUSER@ [ -x @SBINDIR@/ganeti-cleaner ] && @SBINDIR@/ganeti-cleaner master
diff --git a/doc/examples/ganeti.default b/doc/examples/ganeti.default
index 49b7d8a..f0649a2 100644
--- a/doc/examples/ganeti.default
+++ b/doc/examples/ganeti.default
@@ -5,3 +5,4 @@
MOND_ARGS=""
WCONFD_ARGS=""
LUXID_ARGS=""
+MAINTD_ARGS=""
diff --git a/doc/examples/ganeti.default-debug b/doc/examples/ganeti.default-debug
index 00dece4..249f3fa 100644
--- a/doc/examples/ganeti.default-debug
+++ b/doc/examples/ganeti.default-debug
@@ -5,3 +5,4 @@
MOND_ARGS="-d"
WCONFD_ARGS="-d"
LUXID_ARGS="-d"
+MAINTD_ARGS="-d"
diff --git a/doc/examples/systemd/ganeti-maintd.service.in b/doc/examples/systemd/ganeti-maintd.service.in
new file mode 100644
index 0000000..f7e906e
--- /dev/null
+++ b/doc/examples/systemd/ganeti-maintd.service.in
@@ -0,0 +1,18 @@
+[Unit]
+Description = Ganeti maintenance daemon (maintd)
+Documentation = man:ganeti-maintd(8)
+Requires = ganeti-common.service
+After = ganeti-common.service
+PartOf = ganeti-master.target
+ConditionPathExists = @LOCALSTATEDIR@/lib/ganeti/config.data
+
+[Service]
+Type = simple
+User = @GNTMONDUSER@
+Group = @GNTMONDGROUP@
+ExecStart = @SBINDIR@/ganeti-maintd -f
+Restart = on-failure
+SuccessExitStatus = 0 11
+
+[Install]
+WantedBy = ganeti-master.target ganeti.target
diff --git a/doc/hooks.rst b/doc/hooks.rst
index de794bb..667906b 100644
--- a/doc/hooks.rst
+++ b/doc/hooks.rst
@@ -1,7 +1,7 @@
Ganeti customisation using hooks
================================
-Documents Ganeti version 2.16
+Documents Ganeti version 2.17
.. contents::
@@ -9,7 +9,8 @@
------------
In order to allow customisation of operations, Ganeti runs scripts in
-sub-directories of ``@SYSCONFDIR@/ganeti/hooks``. These sub-directories
+sub-directories of ``@SYSCONFDIR@/ganeti/hooks`` (that is usually
+``/etc/ganeti/hooks``). These sub-directories
are named ``$hook-$phase.d``, where ``$phase`` is either ``pre`` or
``post`` and ``$hook`` matches the directory name given for a hook (e.g.
``cluster-verify-post.d`` or ``node-add-pre.d``).
@@ -17,6 +18,10 @@
This is similar to the ``/etc/network/`` structure present in Debian
for network interface handling.
+Note that Ganeti does not create its ``hooks`` directory by default.
+If you want to use hooks scripts, create it on all nodes. This applies
+also to all sub directories such as ``node-add-pre.d``.
+
Organisation
------------
@@ -31,6 +36,11 @@
Note that, even though we call them scripts, we are actually talking
about any executable.
+The filenames of the scripts need to match the regular expression
+``^[a-zA-Z0-9_-]+$``. This means in particular, that scripts having
+a filename extension (such as ``myhook.sh``) are silently ignored
+by Ganeti.
+
*pre* scripts
~~~~~~~~~~~~~
diff --git a/doc/iallocator.rst b/doc/iallocator.rst
index 406f52a..5e59857 100644
--- a/doc/iallocator.rst
+++ b/doc/iallocator.rst
@@ -1,7 +1,7 @@
Ganeti automatic instance allocation
====================================
-Documents Ganeti version 2.16
+Documents Ganeti version 2.17
.. contents::
diff --git a/doc/index.rst b/doc/index.rst
index 225c88f..a8b3fba 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -82,6 +82,7 @@
design-2.14.rst
design-2.15.rst
design-2.16.rst
+ design-2.17.rst
Draft designs
-------------
@@ -118,11 +119,14 @@
design-location.rst
design-linuxha.rst
design-lu-generated-jobs.rst
+ design-memory-over-commitment.rst
+ design-migration-speed-hbal.rst
design-monitoring-agent.rst
design-move-instance-improvements.rst
design-multi-reloc.rst
design-multi-version-tests.rst
design-network.rst
+ design-n-m-redundancy.rst
design-node-add.rst
design-node-security.rst
design-oob.rst
@@ -137,6 +141,7 @@
design-query2.rst
design-query-splitting.rst
design-reason-trail.rst
+ design-repaird.rst
design-restricted-commands.rst
design-shared-storage.rst
design-shared-storage-redundancy.rst
diff --git a/doc/rapi.rst b/doc/rapi.rst
index dc7784a..d6cab78 100644
--- a/doc/rapi.rst
+++ b/doc/rapi.rst
@@ -232,7 +232,8 @@
constants.ISPECS_STD,
constants.IPOLICY_DTS,
constants.IPOLICY_VCPU_RATIO,
- constants.IPOLICY_SPINDLE_RATIO])
+ constants.IPOLICY_SPINDLE_RATIO,
+ constants.IPOLICY_MEMORY_RATIO])
.. pyassert::
@@ -280,6 +281,8 @@
Maximum ratio of virtual to physical CPUs (`float`)
:pyeval:`constants.IPOLICY_SPINDLE_RATIO`
Maximum ratio of instances to their node's ``spindle_count`` (`float`)
+:pyeval:`constants.IPOLICY_MEMORY_RATIO`
+ Maximum ratio of memory overcommitment (`float`)
Usage examples
--------------
diff --git a/doc/security.rst b/doc/security.rst
index ea86a8a..ff3ac0d 100644
--- a/doc/security.rst
+++ b/doc/security.rst
@@ -1,7 +1,7 @@
Security in Ganeti
==================
-Documents Ganeti version 2.16
+Documents Ganeti version 2.17
Ganeti was developed to run on internal, trusted systems. As such, the
security model is all-or-nothing.
diff --git a/doc/virtual-cluster.rst b/doc/virtual-cluster.rst
index 7213a80..e4614fd 100644
--- a/doc/virtual-cluster.rst
+++ b/doc/virtual-cluster.rst
@@ -1,7 +1,7 @@
Virtual cluster support
=======================
-Documents Ganeti version 2.16
+Documents Ganeti version 2.17
.. contents::
diff --git a/lib/backend.py b/lib/backend.py
index 2c2448b..58c8b3a 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -1463,7 +1463,9 @@
pub_key_file=pathutils.SSH_PUB_KEYS,
ssconf_store=None,
noded_cert_file=pathutils.NODED_CERT_FILE,
- run_cmd_fn=ssh.RunSshCmdWithStdin):
+ run_cmd_fn=ssh.RunSshCmdWithStdin,
+ ssh_update_debug=False,
+ ssh_update_verbose=False):
"""Distributes a node's public SSH key across the cluster.
Note that this function should only be executed on the master node, which
@@ -1499,7 +1501,9 @@
pub_key_file=pub_key_file,
ssconf_store=ssconf_store,
noded_cert_file=noded_cert_file,
- run_cmd_fn=run_cmd_fn)
+ run_cmd_fn=run_cmd_fn,
+ ssh_update_debug=ssh_update_debug,
+ ssh_update_verbose=ssh_update_verbose)
# Node info named tuple specifically for the use with AddNodeSshKeyBulk
@@ -1517,7 +1521,9 @@
pub_key_file=pathutils.SSH_PUB_KEYS,
ssconf_store=None,
noded_cert_file=pathutils.NODED_CERT_FILE,
- run_cmd_fn=ssh.RunSshCmdWithStdin):
+ run_cmd_fn=ssh.RunSshCmdWithStdin,
+ ssh_update_debug=False,
+ ssh_update_verbose=False):
"""Distributes a node's public SSH key across the cluster.
Note that this function should only be executed on the master node, which
@@ -1595,13 +1601,14 @@
(constants.SSHS_OVERRIDE, all_keys)
try:
+ backoff = 5 # seconds
utils.RetryByNumberOfTimes(
- constants.SSHS_MAX_RETRIES,
+ constants.SSHS_MAX_RETRIES, backoff,
errors.SshUpdateError,
run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
ssh_port_map.get(node_info.name), node_data,
- debug=False, verbose=False, use_cluster_key=False,
- ask_key=False, strict_host_check=False)
+ debug=ssh_update_debug, verbose=ssh_update_verbose,
+ use_cluster_key=False, ask_key=False, strict_host_check=False)
except errors.SshUpdateError as e:
# Clean up the master's public key file if adding key fails
if node_info.to_public_keys:
@@ -1641,13 +1648,13 @@
if node in potential_master_candidates:
logging.debug("Updating SSH key files of node '%s'.", node)
try:
+ backoff = 5 # seconds
utils.RetryByNumberOfTimes(
- constants.SSHS_MAX_RETRIES,
- errors.SshUpdateError,
+ constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
ssh_port_map.get(node), pot_mc_data,
- debug=False, verbose=False, use_cluster_key=False,
- ask_key=False, strict_host_check=False)
+ debug=ssh_update_debug, verbose=ssh_update_verbose,
+ use_cluster_key=False, ask_key=False, strict_host_check=False)
except errors.SshUpdateError as last_exception:
error_msg = ("When adding the key of node '%s', updating SSH key"
" files of node '%s' failed after %s retries."
@@ -1663,12 +1670,15 @@
if to_authorized_keys:
run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
ssh_port_map.get(node), base_data,
- debug=False, verbose=False, use_cluster_key=False,
- ask_key=False, strict_host_check=False)
+ debug=ssh_update_debug, verbose=ssh_update_verbose,
+ use_cluster_key=False, ask_key=False,
+ strict_host_check=False)
return node_errors
+# TODO: will be fixed with pending patch series.
+# pylint: disable=R0913
def RemoveNodeSshKey(node_uuid, node_name,
master_candidate_uuids,
potential_master_candidates,
@@ -1682,7 +1692,9 @@
ssconf_store=None,
noded_cert_file=pathutils.NODED_CERT_FILE,
readd=False,
- run_cmd_fn=ssh.RunSshCmdWithStdin):
+ run_cmd_fn=ssh.RunSshCmdWithStdin,
+ ssh_update_debug=False,
+ ssh_update_verbose=False):
"""Removes the node's SSH keys from the key files and distributes those.
Note that at least one of the flags C{from_authorized_keys},
@@ -1736,7 +1748,9 @@
ssconf_store=ssconf_store,
noded_cert_file=noded_cert_file,
readd=readd,
- run_cmd_fn=run_cmd_fn)
+ run_cmd_fn=run_cmd_fn,
+ ssh_update_debug=ssh_update_debug,
+ ssh_update_verbose=ssh_update_verbose)
# Node info named tuple specifically for the use with RemoveNodeSshKeyBulk
@@ -1759,7 +1773,9 @@
ssconf_store=None,
noded_cert_file=pathutils.NODED_CERT_FILE,
readd=False,
- run_cmd_fn=ssh.RunSshCmdWithStdin):
+ run_cmd_fn=ssh.RunSshCmdWithStdin,
+ ssh_update_debug=False,
+ ssh_update_verbose=False):
"""Removes the node's SSH keys from the key files and distributes those.
Note that at least one of the flags C{from_authorized_keys},
@@ -1902,13 +1918,13 @@
logging.debug("Updating key setup of potential master candidate node"
" %s.", node)
try:
+ backoff = 5 # seconds
utils.RetryByNumberOfTimes(
- constants.SSHS_MAX_RETRIES,
- errors.SshUpdateError,
+ constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
ssh_port, pot_mc_data,
- debug=False, verbose=False, use_cluster_key=False,
- ask_key=False, strict_host_check=False)
+ debug=ssh_update_debug, verbose=ssh_update_verbose,
+ use_cluster_key=False, ask_key=False, strict_host_check=False)
except errors.SshUpdateError as last_exception:
error_msg = error_msg_final % (
node_info.name, node, last_exception)
@@ -1919,13 +1935,13 @@
if from_authorized_keys:
logging.debug("Updating key setup of normal node %s.", node)
try:
+ backoff = 5 # seconds
utils.RetryByNumberOfTimes(
- constants.SSHS_MAX_RETRIES,
- errors.SshUpdateError,
+ constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
ssh_port, base_data,
- debug=False, verbose=False, use_cluster_key=False,
- ask_key=False, strict_host_check=False)
+ debug=ssh_update_debug, verbose=ssh_update_verbose,
+ use_cluster_key=False, ask_key=False, strict_host_check=False)
except errors.SshUpdateError as last_exception:
error_msg = error_msg_final % (
node_info.name, node, last_exception)
@@ -1973,13 +1989,14 @@
logging.debug("Updating SSH key setup of target node '%s'.",
node_info.name)
try:
+ backoff = 5 # seconds
utils.RetryByNumberOfTimes(
- constants.SSHS_MAX_RETRIES,
+ constants.SSHS_MAX_RETRIES, backoff,
errors.SshUpdateError,
run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
ssh_port, data,
- debug=False, verbose=False, use_cluster_key=False,
- ask_key=False, strict_host_check=False)
+ debug=ssh_update_debug, verbose=ssh_update_verbose,
+ use_cluster_key=False, ask_key=False, strict_host_check=False)
except errors.SshUpdateError as last_exception:
result_msgs.append(
(node_info.name,
@@ -1992,18 +2009,52 @@
ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
return result_msgs
+# pylint: enable=R0913
-def _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map, ssh_key_type,
- ssh_key_bits, pub_key_file=pathutils.SSH_PUB_KEYS,
+def RemoveSshKeyFromPublicKeyFile(node_name,
+ pub_key_file=pathutils.SSH_PUB_KEYS,
+ ssconf_store=None):
+ """Removes a SSH key from the master's public key file.
+
+ This is an operation that is only used to clean up after failed operations
+ (for example failed hooks before adding a node). To avoid abuse of this
+ function (and the matching RPC call), we add a safety check to make sure
+ that only stray keys can be removed that belong to nodes that are not
+ in the cluster (anymore).
+
+ @type node_name: string
+ @param node_name: the name of the node whose key is removed
+
+ """
+ if not ssconf_store:
+ ssconf_store = ssconf.SimpleStore()
+
+ node_list = ssconf_store.GetNodeList()
+
+ if node_name in node_list:
+ raise errors.SshUpdateError("Cannot remove key of node '%s',"
+ " because it still belongs to the cluster."
+ % node_name)
+
+ keys_by_name = ssh.QueryPubKeyFile([node_name], key_file=pub_key_file)
+ if not keys_by_name or node_name not in keys_by_name:
+ logging.info("The node '%s' whose key is supposed to be removed does not"
+ " have an entry in the public key file. Hence, there is"
+ " nothing left to do.", node_name)
+
+ ssh.RemovePublicKey(node_name, key_file=pub_key_file)
+
+
+def _GenerateNodeSshKey(node_name, ssh_port_map, ssh_key_type, ssh_key_bits,
ssconf_store=None,
noded_cert_file=pathutils.NODED_CERT_FILE,
run_cmd_fn=ssh.RunSshCmdWithStdin,
- suffix=""):
+ suffix="",
+ ssh_update_debug=False,
+ ssh_update_verbose=False):
"""Generates the root SSH key pair on the node.
- @type node_uuid: str
- @param node_uuid: UUID of the node whose key is removed
@type node_name: str
@param node_name: name of the node whose key is remove
@type ssh_port_map: dict of str to int
@@ -2017,12 +2068,6 @@
if not ssconf_store:
ssconf_store = ssconf.SimpleStore()
- keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
- if not keys_by_uuid or node_uuid not in keys_by_uuid:
- raise errors.SshUpdateError("Node %s (UUID: %s) whose key is requested to"
- " be regenerated is not registered in the"
- " public keys file." % (node_name, node_uuid))
-
data = {}
_InitSshUpdateData(data, noded_cert_file, ssconf_store)
cluster_name = data[constants.SSHS_CLUSTER_NAME]
@@ -2030,8 +2075,8 @@
run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
ssh_port_map.get(node_name), data,
- debug=False, verbose=False, use_cluster_key=False,
- ask_key=False, strict_host_check=False)
+ debug=ssh_update_debug, verbose=ssh_update_verbose,
+ use_cluster_key=False, ask_key=False, strict_host_check=False)
def _GetMasterNodeUUID(node_uuid_name_map, master_node_name):
@@ -2055,58 +2100,15 @@
return old_master_keys_by_uuid
-def _GetNewMasterKey(root_keyfiles, master_node_uuid):
- new_master_keys = []
- for (_, (_, public_key_file)) in root_keyfiles.items():
- public_key_dir = os.path.dirname(public_key_file)
- public_key_file_tmp_filename = \
- os.path.splitext(os.path.basename(public_key_file))[0] \
- + constants.SSHS_MASTER_SUFFIX + ".pub"
- public_key_path_tmp = os.path.join(public_key_dir,
- public_key_file_tmp_filename)
- if os.path.exists(public_key_path_tmp):
- # for some key types, there might not be any keys
- key = utils.ReadFile(public_key_path_tmp)
- new_master_keys.append(key)
- if not new_master_keys:
- raise errors.SshUpdateError("Cannot find any type of temporary SSH key.")
- return {master_node_uuid: new_master_keys}
-
-
-def _ReplaceMasterKeyOnMaster(root_keyfiles):
- number_of_moves = 0
- for (_, (private_key_file, public_key_file)) in root_keyfiles.items():
- key_dir = os.path.dirname(public_key_file)
- private_key_file_tmp = \
- os.path.basename(private_key_file) + constants.SSHS_MASTER_SUFFIX
- public_key_file_tmp = private_key_file_tmp + ".pub"
- private_key_path_tmp = os.path.join(key_dir,
- private_key_file_tmp)
- public_key_path_tmp = os.path.join(key_dir,
- public_key_file_tmp)
- if os.path.exists(public_key_file):
- utils.CreateBackup(public_key_file)
- utils.RemoveFile(public_key_file)
- if os.path.exists(private_key_file):
- utils.CreateBackup(private_key_file)
- utils.RemoveFile(private_key_file)
- if os.path.exists(public_key_path_tmp) and \
- os.path.exists(private_key_path_tmp):
- # for some key types, there might not be any keys
- shutil.move(public_key_path_tmp, public_key_file)
- shutil.move(private_key_path_tmp, private_key_file)
- number_of_moves += 1
- if not number_of_moves:
- raise errors.SshUpdateError("Could not move at least one master SSH key.")
-
-
def RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
potential_master_candidates, old_key_type, new_key_type,
new_key_bits,
ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS,
ssconf_store=None,
noded_cert_file=pathutils.NODED_CERT_FILE,
- run_cmd_fn=ssh.RunSshCmdWithStdin):
+ run_cmd_fn=ssh.RunSshCmdWithStdin,
+ ssh_update_debug=False,
+ ssh_update_verbose=False):
"""Renews all SSH keys and updates authorized_keys and ganeti_pub_keys.
@type node_uuids: list of str
@@ -2144,11 +2146,9 @@
raise errors.ProgrammerError("List of nodes UUIDs and node names"
" does not match in length.")
- (_, root_keyfiles) = \
- ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
- (_, old_pub_keyfile) = root_keyfiles[old_key_type]
- (_, new_pub_keyfile) = root_keyfiles[new_key_type]
- old_master_key = utils.ReadFile(old_pub_keyfile)
+ old_pub_keyfile = ssh.GetSshPubKeyFilename(old_key_type)
+ new_pub_keyfile = ssh.GetSshPubKeyFilename(new_key_type)
+ old_master_key = ssh.ReadLocalSshPubKeys([old_key_type])
node_uuid_name_map = zip(node_uuids, node_names)
@@ -2179,20 +2179,13 @@
node_list.append((node_uuid, node_name, master_candidate,
potential_master_candidate))
- keys_by_uuid = ssh.QueryPubKeyFile([node_uuid],
- key_file=ganeti_pub_keys_file)
- if not keys_by_uuid:
- raise errors.SshUpdateError("No public key of node %s (UUID %s) found,"
- " not generating a new key."
- % (node_name, node_uuid))
-
if master_candidate:
logging.debug("Fetching old SSH key from node '%s'.", node_name)
- old_pub_key = ssh.ReadRemoteSshPubKeys(old_pub_keyfile,
- node_name, cluster_name,
- ssh_port_map[node_name],
- False, # ask_key
- False) # key_check
+ old_pub_key = ssh.ReadRemoteSshPubKey(old_pub_keyfile,
+ node_name, cluster_name,
+ ssh_port_map[node_name],
+ False, # ask_key
+ False) # key_check
if old_pub_key != old_master_key:
# If we are already in a multi-key setup (that is past Ganeti 2.12),
# we can safely remove the old key of the node. Otherwise, we cannot
@@ -2216,7 +2209,13 @@
node_info_to_remove,
master_candidate_uuids,
potential_master_candidates,
- master_uuid=master_node_uuid)
+ master_uuid=master_node_uuid,
+ pub_key_file=ganeti_pub_keys_file,
+ ssconf_store=ssconf_store,
+ noded_cert_file=noded_cert_file,
+ run_cmd_fn=run_cmd_fn,
+ ssh_update_debug=ssh_update_debug,
+ ssh_update_verbose=ssh_update_verbose)
if node_errors:
all_node_errors = all_node_errors + node_errors
@@ -2224,19 +2223,20 @@
in node_list:
logging.debug("Generating new SSH key for node '%s'.", node_name)
- _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map, new_key_type,
- new_key_bits, pub_key_file=ganeti_pub_keys_file,
+ _GenerateNodeSshKey(node_name, ssh_port_map, new_key_type, new_key_bits,
ssconf_store=ssconf_store,
noded_cert_file=noded_cert_file,
- run_cmd_fn=run_cmd_fn)
+ run_cmd_fn=run_cmd_fn,
+ ssh_update_verbose=ssh_update_verbose,
+ ssh_update_debug=ssh_update_debug)
try:
logging.debug("Fetching newly created SSH key from node '%s'.", node_name)
- pub_key = ssh.ReadRemoteSshPubKeys(new_pub_keyfile,
- node_name, cluster_name,
- ssh_port_map[node_name],
- False, # ask_key
- False) # key_check
+ pub_key = ssh.ReadRemoteSshPubKey(new_pub_keyfile,
+ node_name, cluster_name,
+ ssh_port_map[node_name],
+ False, # ask_key
+ False) # key_check
except:
raise errors.SshUpdateError("Could not fetch key of node %s"
" (UUID %s)" % (node_name, node_uuid))
@@ -2256,7 +2256,9 @@
node_keys_to_add, potential_master_candidates,
pub_key_file=ganeti_pub_keys_file, ssconf_store=ssconf_store,
noded_cert_file=noded_cert_file,
- run_cmd_fn=run_cmd_fn)
+ run_cmd_fn=run_cmd_fn,
+ ssh_update_debug=ssh_update_debug,
+ ssh_update_verbose=ssh_update_verbose)
if node_errors:
all_node_errors = all_node_errors + node_errors
@@ -2268,19 +2270,21 @@
# Generate a new master key with a suffix, don't touch the old one for now
logging.debug("Generate new ssh key of master.")
- _GenerateNodeSshKey(master_node_uuid, master_node_name, ssh_port_map,
+ _GenerateNodeSshKey(master_node_name, ssh_port_map,
new_key_type, new_key_bits,
- pub_key_file=ganeti_pub_keys_file,
ssconf_store=ssconf_store,
noded_cert_file=noded_cert_file,
run_cmd_fn=run_cmd_fn,
- suffix=constants.SSHS_MASTER_SUFFIX)
+ suffix=constants.SSHS_MASTER_SUFFIX,
+ ssh_update_debug=ssh_update_debug,
+ ssh_update_verbose=ssh_update_verbose)
# Read newly created master key
- new_master_key_dict = _GetNewMasterKey(root_keyfiles, master_node_uuid)
+ new_master_keys = ssh.ReadLocalSshPubKeys(
+ [new_key_type], suffix=constants.SSHS_MASTER_SUFFIX)
# Replace master key in the master nodes' public key file
ssh.RemovePublicKey(master_node_uuid, key_file=ganeti_pub_keys_file)
- for pub_key in new_master_key_dict[master_node_uuid]:
+ for pub_key in new_master_keys:
ssh.AddPublicKey(master_node_uuid, pub_key, key_file=ganeti_pub_keys_file)
# Add new master key to all node's public and authorized keys
@@ -2290,12 +2294,15 @@
to_authorized_keys=True, to_public_keys=True,
get_public_keys=False, pub_key_file=ganeti_pub_keys_file,
ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
- run_cmd_fn=run_cmd_fn)
+ run_cmd_fn=run_cmd_fn,
+ ssh_update_debug=ssh_update_debug,
+ ssh_update_verbose=ssh_update_verbose)
if node_errors:
all_node_errors = all_node_errors + node_errors
# Remove the old key file and rename the new key to the non-temporary filename
- _ReplaceMasterKeyOnMaster(root_keyfiles)
+ ssh.ReplaceSshKeys(new_key_type, new_key_type,
+ src_key_suffix=constants.SSHS_MASTER_SUFFIX)
# Remove old key from authorized keys
(auth_key_file, _) = \
@@ -2310,7 +2317,13 @@
potential_master_candidates,
keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
from_public_keys=False, clear_authorized_keys=False,
- clear_public_keys=False)
+ clear_public_keys=False,
+ pub_key_file=ganeti_pub_keys_file,
+ ssconf_store=ssconf_store,
+ noded_cert_file=noded_cert_file,
+ run_cmd_fn=run_cmd_fn,
+ ssh_update_debug=ssh_update_debug,
+ ssh_update_verbose=ssh_update_verbose)
if node_errors:
all_node_errors = all_node_errors + node_errors
@@ -5775,18 +5788,25 @@
return _verify_cmd(path, cmd)
-def RunRestrictedCmd(cmd,
- _lock_timeout=_RCMD_LOCK_TIMEOUT,
- _lock_file=pathutils.RESTRICTED_COMMANDS_LOCK_FILE,
- _path=pathutils.RESTRICTED_COMMANDS_DIR,
- _sleep_fn=time.sleep,
- _prepare_fn=_PrepareRestrictedCmd,
- _runcmd_fn=utils.RunCmd,
- _enabled=constants.ENABLE_RESTRICTED_COMMANDS):
- """Executes a restricted command after performing strict tests.
+def RunConstrainedCmd(cmd,
+ lock_file,
+ path,
+ inp=None,
+ _lock_timeout=_RCMD_LOCK_TIMEOUT,
+ _sleep_fn=time.sleep,
+ _prepare_fn=_PrepareRestrictedCmd,
+ _runcmd_fn=utils.RunCmd,
+ _enabled=constants.ENABLE_RESTRICTED_COMMANDS):
+ """Executes a command after performing strict tests.
@type cmd: string
@param cmd: Command name
+ @type lock_file: string
+ @param lock_file: path to the lock file
+ @type path: string
+ @param path: path to the directory in which the command is present
+ @type inp: string
+ @param inp: Input to be passed to the command
@rtype: string
@return: Command output
@raise RPCFail: In case of an error
@@ -5801,14 +5821,24 @@
try:
cmdresult = None
try:
- lock = utils.FileLock.Open(_lock_file)
+ lock = utils.FileLock.Open(lock_file)
lock.Exclusive(blocking=True, timeout=_lock_timeout)
- (status, value) = _prepare_fn(_path, cmd)
+ (status, value) = _prepare_fn(path, cmd)
if status:
+ if inp:
+ input_fd = tempfile.TemporaryFile()
+ input_fd.write(inp)
+ input_fd.flush()
+ input_fd.seek(0)
+ else:
+ input_fd = None
cmdresult = _runcmd_fn([value], env={}, reset_env=True,
- postfork_fn=lambda _: lock.Unlock())
+ postfork_fn=lambda _: lock.Unlock(),
+ input_fd=input_fd)
+ if input_fd:
+ input_fd.close()
else:
logging.error(value)
except Exception: # pylint: disable=W0703
diff --git a/lib/bootstrap.py b/lib/bootstrap.py
index 0afb68b..8eb0b4c 100644
--- a/lib/bootstrap.py
+++ b/lib/bootstrap.py
@@ -867,6 +867,7 @@
default_nodegroup.uuid: default_nodegroup,
}
now = time.time()
+ maintenance = objects.Maintenance(serial_no=1, ctime=now, mtime=now)
config_data = objects.ConfigData(version=version,
cluster=cluster_config,
nodegroups=nodegroups,
@@ -875,6 +876,7 @@
networks={},
disks={},
filters={},
+ maintenance=maintenance,
serial_no=1,
ctime=now, mtime=now)
utils.WriteFile(cfg_file,
@@ -934,6 +936,8 @@
constants.NDS_CLUSTER_NAME: cluster_name,
constants.NDS_NODE_DAEMON_CERTIFICATE:
utils.ReadFile(pathutils.NODED_CERT_FILE),
+ constants.NDS_HMAC:
+ utils.ReadFile(pathutils.CONFD_HMAC_KEY),
constants.NDS_SSCONF: ssconf.SimpleStore().ReadAll(),
constants.NDS_START_NODE_DAEMON: True,
constants.NDS_NODE_NAME: node,
diff --git a/lib/cli.py b/lib/cli.py
index 67ea375..73c9b96 100644
--- a/lib/cli.py
+++ b/lib/cli.py
@@ -2877,6 +2877,7 @@
ipolicy_disk_templates=None,
ipolicy_vcpu_ratio=None,
ipolicy_spindle_ratio=None,
+ ipolicy_memory_ratio=None,
group_ipolicy=False,
allowed_values=None,
fill_all=False):
@@ -2914,6 +2915,8 @@
ipolicy_out[constants.IPOLICY_VCPU_RATIO] = ipolicy_vcpu_ratio
if ipolicy_spindle_ratio is not None:
ipolicy_out[constants.IPOLICY_SPINDLE_RATIO] = ipolicy_spindle_ratio
+ if ipolicy_memory_ratio is not None:
+ ipolicy_out[constants.IPOLICY_MEMORY_RATIO] = ipolicy_memory_ratio
assert not (frozenset(ipolicy_out.keys()) - constants.IPOLICY_ALL_KEYS)
diff --git a/lib/cli_opts.py b/lib/cli_opts.py
index 3e4fd4c..c81355d 100644
--- a/lib/cli_opts.py
+++ b/lib/cli_opts.py
@@ -82,6 +82,7 @@
"DST_NODE_OPT",
"EARLY_RELEASE_OPT",
"ENABLED_DATA_COLLECTORS_OPT",
+ "DIAGNOSE_DATA_COLLECTOR_FILENAME_OPT",
"ENABLED_DISK_TEMPLATES_OPT",
"ENABLED_HV_OPT",
"ENABLED_USER_SHUTDOWN_OPT",
@@ -123,6 +124,7 @@
"IGNORE_SOFT_ERRORS_OPT",
"IGNORE_SIZE_OPT",
"INCLUDEDEFAULTS_OPT",
+ "INPUT_OPT",
"INSTALL_IMAGE_OPT",
"INSTANCE_COMMUNICATION_NETWORK_OPT",
"INSTANCE_COMMUNICATION_OPT",
@@ -134,8 +136,12 @@
"IPOLICY_STD_SPECS_OPT",
"IPOLICY_STD_SPECS_STR",
"IPOLICY_VCPU_RATIO",
+ "IPOLICY_MEMORY_RATIO",
"LONG_SLEEP_OPT",
"MAC_PREFIX_OPT",
+ "MAINT_BALANCE_OPT",
+ "MAINT_BALANCE_THRESHOLD_OPT",
+ "MAINT_INTERVAL_OPT",
"MAINTAIN_NODE_HEALTH_OPT",
"MASTER_NETDEV_OPT",
"MASTER_NETMASK_OPT",
@@ -807,6 +813,13 @@
help=("The maximum allowed instances to"
" spindle ratio"))
+IPOLICY_MEMORY_RATIO = cli_option("--ipolicy-memory-ratio",
+ dest="ipolicy_memory_ratio",
+ type="maybefloat", default=None,
+ help=("The maximum allowed used memory to"
+                                        " physical memory ratio (in terms of"
+ " memory overcommitment)"))
+
HYPERVISOR_OPT = cli_option("-H", "--hypervisor-parameters", dest="hypervisor",
help="Hypervisor and hypervisor options, in the"
" format hypervisor:option=value,option=value,...",
@@ -1100,6 +1113,21 @@
help="Comma-separated list of compression tools which are"
" allowed to be used by Ganeti in various operations")
+MAINT_INTERVAL_OPT = \
+ cli_option("--maintenance-interval", dest="maint_round_delay", type="int",
+ default=None, help="Minimal time in seconds, the maintenance"
+ " daemon waits between rounds")
+
+MAINT_BALANCE_OPT = \
+ cli_option("--auto-balance-cluster", dest="maint_balance", type="bool",
+ default=None, metavar=_YORNO, help="Whether the maintenance"
+ " daemon should balance the cluster")
+
+MAINT_BALANCE_THRESHOLD_OPT = \
+ cli_option("--auto-balance-threshold", dest="maint_balance_threshold",
+ type="float", default=None, metavar="CLUSTERSCORE",
+ help="Minimal gain for an auto-balancing step to be taken")
+
VG_NAME_OPT = cli_option("--vg-name", dest="vg_name",
help=("Enables LVM and specifies the volume group"
" name (cluster-wide) for disk allocation"
@@ -1588,6 +1616,17 @@
"in the format collector=bool, where collector is one of %s."
% ", ".join(constants.DATA_COLLECTOR_NAMES))
+DIAGNOSE_DATA_COLLECTOR_FILENAME_OPT = \
+ cli_option("--diagnose-data-collector-filename",
+ dest="diagnose_data_collector_filename",
+             help=("Sets the file name of the script"
+                   " the diagnose data collector should run."
+                   " If this value is an empty string, the collector"
+                   " will return a success value"
+                   " without running anything"),
+ type="string")
+
+
VERIFY_CLUTTER_OPT = cli_option(
"--verify-ssh-clutter", default=False, dest="verify_clutter",
help="Verify that Ganeti did not clutter"
@@ -1597,6 +1636,11 @@
"--long-sleep", default=False, dest="long_sleep",
help="Allow long shutdowns when backing up instances", action="store_true")
+INPUT_OPT = cli_option("--input", dest="input", default=None,
+ help=("input to be passed as stdin"
+ " to the repair command"),
+ type="string")
+
SSH_KEY_TYPE_OPT = \
cli_option("--ssh-key-type", default=None,
choices=list(constants.SSHK_ALL), dest="ssh_key_type",
@@ -1654,6 +1698,7 @@
IPOLICY_DISK_TEMPLATES,
IPOLICY_VCPU_RATIO,
IPOLICY_SPINDLE_RATIO,
+ IPOLICY_MEMORY_RATIO,
]
# instance policy split specs options
diff --git a/lib/client/gnt_cluster.py b/lib/client/gnt_cluster.py
index f834d2b..2cc8328 100644
--- a/lib/client/gnt_cluster.py
+++ b/lib/client/gnt_cluster.py
@@ -59,6 +59,7 @@
from ganeti import ssh
from ganeti import uidpool
from ganeti import utils
+from ganeti import wconfd
from ganeti.client import base
@@ -94,6 +95,10 @@
"--data-collector-interval", default={}, type="keyval",
help="Set collection intervals in seconds of data collectors.")
+STRICT_OPT = cli_option("--no-strict", default=False,
+ dest="no_strict", action="store_true",
+ help="Do not run group verify in strict mode")
+
_EPO_PING_INTERVAL = 30 # 30 seconds between pings
_EPO_PING_TIMEOUT = 1 # 1 second
_EPO_REACHABLE_TIMEOUT = 15 * 60 # 15 minutes
@@ -245,6 +250,7 @@
ipolicy_disk_templates=opts.ipolicy_disk_templates,
ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
+ ipolicy_memory_ratio=opts.ipolicy_memory_ratio,
fill_all=True)
if opts.candidate_pool_size is None:
@@ -798,7 +804,8 @@
"""
cl = GetClient()
- op = opcodes.OpClusterVerifyDisks(group_name=opts.nodegroup)
+ op = opcodes.OpClusterVerifyDisks(group_name=opts.nodegroup,
+ is_strict=not opts.no_strict)
result = SubmitOpCode(op, cl=cl, opts=opts)
@@ -1211,7 +1218,9 @@
node_certificates=new_node_cert or new_cluster_cert,
renew_ssh_keys=new_ssh_keys,
ssh_key_type=ssh_key_type,
- ssh_key_bits=ssh_key_bits)
+ ssh_key_bits=ssh_key_bits,
+ verbose=verbose,
+ debug=debug)
SubmitOpCode(renew_op, cl=cl)
ToStdout("All requested certificates and keys have been replaced."
@@ -1268,10 +1277,10 @@
# get the key files of all non-master nodes
for node in nonmaster_nodes:
- pub_key = ssh.ReadRemoteSshPubKeys(pub_key_filename, node, cluster_name,
- ssh_port_map[node],
- options.ssh_key_check,
- options.ssh_key_check)
+ pub_key = ssh.ReadRemoteSshPubKey(pub_key_filename, node, cluster_name,
+ ssh_port_map[node],
+ options.ssh_key_check,
+ options.ssh_key_check)
ssh.AddPublicKey(node_uuid_map[node], pub_key, key_file=pub_key_file)
@@ -1391,6 +1400,7 @@
opts.ipolicy_disk_templates is not None or
opts.ipolicy_vcpu_ratio is not None or
opts.ipolicy_spindle_ratio is not None or
+ opts.ipolicy_memory_ratio is not None or
opts.modify_etc_hosts is not None or
opts.file_storage_dir is not None or
opts.install_image is not None or
@@ -1400,7 +1410,11 @@
opts.compression_tools is not None or
opts.shared_file_storage_dir is not None or
opts.enabled_user_shutdown is not None or
+ opts.maint_round_delay is not None or
+ opts.maint_balance is not None or
+ opts.maint_balance_threshold is not None or
opts.data_collector_interval or
+ opts.diagnose_data_collector_filename is not None or
opts.enabled_data_collectors):
ToStderr("Please give at least one of the parameters.")
return 1
@@ -1444,6 +1458,7 @@
ipolicy_disk_templates=opts.ipolicy_disk_templates,
ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
+ ipolicy_memory_ratio=opts.ipolicy_memory_ratio,
)
mnh = opts.maintain_node_health
@@ -1545,8 +1560,12 @@
shared_file_storage_dir=opts.shared_file_storage_dir,
compression_tools=compression_tools,
enabled_user_shutdown=opts.enabled_user_shutdown,
+ maint_round_delay=opts.maint_round_delay,
+ maint_balance=opts.maint_balance,
+ maint_balance_threshold=opts.maint_balance_threshold,
enabled_data_collectors=enabled_data_collectors,
data_collector_interval=data_collector_interval,
+ diagnose_data_collector_filename=opts.diagnose_data_collector_filename
)
return base.GetResult(None, opts, SubmitOrSend(op, opts))
@@ -1941,6 +1960,21 @@
return _off_fn(opts, node_list, inst_map)
+def RemoveRepair(opts, args):
+  """Unconditionally remove a repair event
+
+ @param opts: the command line options selected by the user (ignored)
+ @type args: list
+ @param args: one element, the uuid of the event to remove
+ @rtype: int
+ @return: the desired exit code
+
+ """
+ uuid = args[0]
+ wconfd.Client().RmMaintdIncident(uuid)
+ return 0
+
+
def _GetCreateCommand(info):
buf = StringIO()
buf.write("gnt-cluster init")
@@ -2493,7 +2527,7 @@
VERIFY_CLUTTER_OPT],
"", "Does a check on the cluster configuration"),
"verify-disks": (
- VerifyDisks, ARGS_NONE, [PRIORITY_OPT, NODEGROUP_OPT],
+ VerifyDisks, ARGS_NONE, [PRIORITY_OPT, NODEGROUP_OPT, STRICT_OPT],
"", "Does a check on the cluster disk status"),
"repair-disk-sizes": (
RepairDiskSizes, ARGS_MANY_INSTANCES, [DRY_RUN_OPT, PRIORITY_OPT],
@@ -2560,7 +2594,9 @@
INSTANCE_POLICY_OPTS +
[GLOBAL_FILEDIR_OPT, GLOBAL_SHARED_FILEDIR_OPT, ZEROING_IMAGE_OPT,
COMPRESSION_TOOLS_OPT] +
- [ENABLED_DATA_COLLECTORS_OPT, DATA_COLLECTOR_INTERVAL_OPT],
+ [ENABLED_DATA_COLLECTORS_OPT, DATA_COLLECTOR_INTERVAL_OPT,
+ DIAGNOSE_DATA_COLLECTOR_FILENAME_OPT,
+ MAINT_INTERVAL_OPT, MAINT_BALANCE_OPT, MAINT_BALANCE_THRESHOLD_OPT],
"[opts...]",
"Alters the parameters of the cluster"),
"renew-crypto": (
@@ -2590,6 +2626,9 @@
"upgrade": (
UpgradeGanetiCommand, ARGS_NONE, [TO_OPT, RESUME_OPT], "",
"Upgrade (or downgrade) to a new Ganeti version"),
+ "remove-repair": (
+ RemoveRepair, [ArgUnknown()], [], "<uuid>",
+ "Remove a repair event from the list of pending events"),
}
diff --git a/lib/client/gnt_group.py b/lib/client/gnt_group.py
index 5f44001..8b3c9e5 100644
--- a/lib/client/gnt_group.py
+++ b/lib/client/gnt_group.py
@@ -63,6 +63,7 @@
minmax_ispecs=opts.ipolicy_bounds_specs,
ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
+ ipolicy_memory_ratio=opts.ipolicy_memory_ratio,
ipolicy_disk_templates=opts.ipolicy_disk_templates,
group_ipolicy=True)
@@ -170,7 +171,8 @@
allmods = [opts.ndparams, opts.alloc_policy, opts.diskparams, opts.hv_state,
opts.disk_state, opts.ipolicy_bounds_specs,
opts.ipolicy_vcpu_ratio, opts.ipolicy_spindle_ratio,
- opts.diskparams, opts.ipolicy_disk_templates]
+ opts.ipolicy_memory_ratio, opts.diskparams,
+ opts.ipolicy_disk_templates]
if allmods.count(None) == len(allmods):
ToStderr("Please give at least one of the parameters.")
return 1
@@ -190,6 +192,7 @@
ipolicy_disk_templates=opts.ipolicy_disk_templates,
ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
+ ipolicy_memory_ratio=opts.ipolicy_memory_ratio,
group_ipolicy=True,
allowed_values=[constants.VALUE_DEFAULT])
diff --git a/lib/client/gnt_node.py b/lib/client/gnt_node.py
index 90ffcab..bac95e9 100644
--- a/lib/client/gnt_node.py
+++ b/lib/client/gnt_node.py
@@ -251,9 +251,9 @@
strict_host_check=options.ssh_key_check)
(_, pub_keyfile) = root_keyfiles[ssh_key_type]
- pub_key = ssh.ReadRemoteSshPubKeys(pub_keyfile, node, cluster_name, ssh_port,
- options.ssh_key_check,
- options.ssh_key_check)
+ pub_key = ssh.ReadRemoteSshPubKey(pub_keyfile, node, cluster_name, ssh_port,
+ options.ssh_key_check,
+ options.ssh_key_check)
# Unfortunately, we have to add the key with the node name rather than
# the node's UUID here, because at this point, we do not have a UUID yet.
# The entry will be corrected in noded later.
@@ -358,7 +358,9 @@
master_capable=opts.master_capable,
disk_state=disk_state,
hv_state=hv_state,
- node_setup=modify_ssh_setup)
+ node_setup=modify_ssh_setup,
+ verbose=opts.verbose,
+ debug=opts.debug > 0)
SubmitOpCode(op, opts=opts)
@@ -661,7 +663,9 @@
@return: the desired exit code
"""
- op = opcodes.OpNodeRemove(node_name=args[0])
+ op = opcodes.OpNodeRemove(node_name=args[0],
+ debug=opts.debug > 0,
+ verbose=opts.verbose)
SubmitOpCode(op, opts=opts)
return 0
@@ -1014,7 +1018,9 @@
auto_promote=opts.auto_promote,
powered=opts.node_powered,
hv_state=hv_state,
- disk_state=disk_state)
+ disk_state=disk_state,
+ verbose=opts.verbose,
+ debug=opts.debug > 0)
# even if here we process the result, we allow submit only
result = SubmitOrSend(op, opts)
@@ -1067,6 +1073,19 @@
return exit_code
+def RepairCommand(opts, args):
+ cl = GetClient()
+ if opts.input:
+ inp = opts.input.decode('string_escape')
+ else:
+ inp = None
+ op = opcodes.OpRepairCommand(command=args[0], node_name=args[1],
+ input=inp)
+ result = SubmitOrSend(op, opts, cl=cl)
+ print result
+ return constants.EXIT_SUCCESS
+
+
class ReplyStatus(object):
"""Class holding a reply status for synchronous confd clients.
@@ -1161,7 +1180,7 @@
CAPAB_MASTER_OPT, CAPAB_VM_OPT, NODE_PARAMS_OPT, HV_STATE_OPT,
DISK_STATE_OPT],
"[-s ip] [--readd] [--no-ssh-key-check] [--force-join]"
- " [--no-node-setup] [--verbose] [--network] <node_name>",
+ " [--no-node-setup] [--verbose] [--network] [--debug] <node_name>",
"Add a node to the cluster"),
"evacuate": (
EvacuateNode, ARGS_ONE_NODE,
@@ -1207,7 +1226,7 @@
[MC_OPT, DRAINED_OPT, OFFLINE_OPT,
CAPAB_MASTER_OPT, CAPAB_VM_OPT, SECONDARY_IP_OPT,
AUTO_PROMOTE_OPT, DRY_RUN_OPT, PRIORITY_OPT, NODE_PARAMS_OPT,
- NODE_POWERED_OPT, HV_STATE_OPT, DISK_STATE_OPT],
+ NODE_POWERED_OPT, HV_STATE_OPT, DISK_STATE_OPT, VERBOSE_OPT],
"<node_name>", "Alters the parameters of a node"),
"powercycle": (
PowercycleNode, ARGS_ONE_NODE,
@@ -1224,8 +1243,8 @@
"on|off|cycle|status [nodes...]",
"Change power state of node by calling out-of-band helper."),
"remove": (
- RemoveNode, ARGS_ONE_NODE, [DRY_RUN_OPT, PRIORITY_OPT],
- "<node_name>", "Removes a node from the cluster"),
+ RemoveNode, ARGS_ONE_NODE, [DRY_RUN_OPT, PRIORITY_OPT, VERBOSE_OPT],
+ "[--verbose] [--debug] <node_name>", "Removes a node from the cluster"),
"volumes": (
ListVolumes, [ArgNode()],
[NOHDR_OPT, SEP_OPT, USEUNITS_OPT, FIELDS_OPT, PRIORITY_OPT],
@@ -1276,6 +1295,10 @@
[SYNC_OPT, PRIORITY_OPT] + SUBMIT_OPTS + [SHOW_MACHINE_OPT, NODEGROUP_OPT],
"<command> <node_name> [<node_name>...]",
"Executes a restricted command on node(s)"),
+ "repair-command": (
+ RepairCommand, [ArgUnknown(min=1, max=1), ArgNode(min=1, max=1)],
+ [SUBMIT_OPT, INPUT_OPT], "{--input <input>} <command> <node_name>",
+ "Executes a repair command on a node"),
}
#: dictionary with aliases for commands
diff --git a/lib/cmdlib/__init__.py b/lib/cmdlib/__init__.py
index 5fd9b8d..08d9616 100644
--- a/lib/cmdlib/__init__.py
+++ b/lib/cmdlib/__init__.py
@@ -126,7 +126,8 @@
from ganeti.cmdlib.misc import \
LUOobCommand, \
LUExtStorageDiagnose, \
- LURestrictedCommand
+ LURestrictedCommand, \
+ LURepairCommand
from ganeti.cmdlib.test import \
LUTestOsParams, \
LUTestDelay, \
diff --git a/lib/cmdlib/base.py b/lib/cmdlib/base.py
index 57eb8d5..1e8b2d9 100644
--- a/lib/cmdlib/base.py
+++ b/lib/cmdlib/base.py
@@ -438,6 +438,30 @@
# pylint: disable=W0613,R0201
return lu_result
+ def HooksAbortCallBack(self, phase, feedback_fn, exception):
+ """Called when the hooks get aborted by an exception.
+
+    This method is called every time a hooks phase is aborted by an exception.
+ This exception is most likely of type C{errors.HooksAbort}. However, we
+ keep the design of this function broad enough to handle any kind of
+ exception.
+
+ The intended purpose of this call back is to run any action that is
+ necessary to bring the cluster back to a clean state from the point
+ in time before calling the hook.
+
+ @type phase: string
+ @param phase: one of L{constants.HOOKS_PHASE_POST} or
+ L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
+ @type feedback_fn: callable
+    @param feedback_fn: function used to send feedback back to the caller
+ @type exception: Exception
+ @param exception: The exception that was raised during the execution of
+ hooks.
+
+ """
+ pass
+
def _ExpandAndLockInstance(self, allow_forthcoming=False):
"""Helper function to expand and lock an instance.
diff --git a/lib/cmdlib/cluster/__init__.py b/lib/cmdlib/cluster/__init__.py
index ab701cd..8182910 100644
--- a/lib/cmdlib/cluster/__init__.py
+++ b/lib/cmdlib/cluster/__init__.py
@@ -191,7 +191,9 @@
potential_master_candidates,
cluster_info.ssh_key_type, # Old key type
self.ssh_key_type, # New key type
- self.ssh_key_bits) # New key bits
+ self.ssh_key_bits, # New key bits
+ self.op.debug,
+ self.op.verbose)
result[master_uuid].Raise("Could not renew the SSH keys of all nodes")
# After the keys have been successfully swapped, time to commit the change
@@ -1479,6 +1481,20 @@
feedback_fn("Cluster LVM configuration already in desired"
" state, not changing")
+ def _SetDiagnoseDataCollectorFilename(self, feedback_fn):
+ """Determines and sets the filename of the script
+ diagnose data collector should run.
+
+ """
+ if self.op.diagnose_data_collector_filename is not None:
+ fn = self.op.diagnose_data_collector_filename
+ if fn != self.cfg.GetDiagnoseDataCollectorFilename():
+ self.cfg.SetDiagnoseDataCollectorFilename(fn)
+ else:
+ feedback_fn("Diagnose data collector filename"
+ " configuration already in desired"
+ " state, not changing")
+
def _SetFileStorageDir(self, feedback_fn):
"""Set the file storage directory.
@@ -1646,6 +1662,7 @@
self._SetSharedFileStorageDir(feedback_fn)
self.cfg.Update(self.cluster, feedback_fn)
self._SetDrbdHelper(feedback_fn)
+ self._SetDiagnoseDataCollectorFilename(feedback_fn)
# re-read the fresh configuration again
self.cluster = self.cfg.GetClusterInfo()
@@ -1824,6 +1841,15 @@
if self.op.compression_tools is not None:
self.cfg.SetCompressionTools(self.op.compression_tools)
+ if self.op.maint_round_delay is not None:
+ self.cfg.SetMaintdRoundDelay(self.op.maint_round_delay)
+
+ if self.op.maint_balance is not None:
+ self.cfg.SetMaintdBalance(self.op.maint_balance)
+
+ if self.op.maint_balance_threshold is not None:
+ self.cfg.SetMaintdBalanceThreshold(self.op.maint_balance_threshold)
+
network_name = self.op.instance_communication_network
if network_name is not None:
return self._ModifyInstanceCommunicationNetwork(self.cfg,
diff --git a/lib/cmdlib/cluster/verify.py b/lib/cmdlib/cluster/verify.py
index c32820a..8c68039 100644
--- a/lib/cmdlib/cluster/verify.py
+++ b/lib/cmdlib/cluster/verify.py
@@ -259,8 +259,10 @@
return ResultWithJobs([])
else:
# Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
- return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)]
- for group in group_names])
+ return ResultWithJobs(
+ [[opcodes.OpGroupVerifyDisks(group_name=group,
+ is_strict=self.op.is_strict)]
+ for group in group_names])
class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
@@ -388,6 +390,8 @@
@ivar sbp: dictionary of {primary-node: list of instances} for all
instances for which this node is secondary (config)
@ivar mfree: free memory, as reported by hypervisor (runtime)
+ @ivar mtotal: total memory, as reported by hypervisor (runtime)
+ @ivar mdom0: domain0 memory, as reported by hypervisor (runtime)
@ivar dfree: free disk, as reported by the node (runtime)
@ivar offline: the offline status (config)
@type rpc_fail: boolean
@@ -419,6 +423,8 @@
self.sinst = []
self.sbp = {}
self.mfree = 0
+ self.mtotal = 0
+ self.mdom0 = 0
self.dfree = 0
self.offline = offline
self.vm_capable = vm_capable
@@ -985,6 +991,10 @@
"""
cluster_info = self.cfg.GetClusterInfo()
+ ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster_info,
+ self.group_info)
+ memory_ratio = ipolicy[constants.IPOLICY_MEMORY_RATIO]
+
for node_uuid, n_img in node_image.items():
# This code checks that every node which is now listed as
# secondary has enough memory to host all instances it is
@@ -994,8 +1004,9 @@
# WARNING: we currently take into account down instances as well
# as up ones, considering that even if they're down someone
# might want to start them even in the event of a node failure.
+ node_cfg = self.all_node_info[node_uuid]
if n_img.offline or \
- self.all_node_info[node_uuid].group != self.group_uuid:
+ node_cfg.group != self.group_uuid:
# we're skipping nodes marked offline and nodes in other groups from
# the N+1 warning, since most likely we don't have good memory
# information from them; we already list instances living on such
@@ -1008,7 +1019,13 @@
bep = cluster_info.FillBE(all_insts[inst_uuid])
if bep[constants.BE_AUTO_BALANCE]:
needed_mem += bep[constants.BE_MINMEM]
- test = n_img.mfree < needed_mem
+ mnode = n_img.mdom0
+ (hv, hv_state) = self.cfg.GetFilledHvStateParams(node_cfg).items()[0]
+ if hv != constants.HT_XEN_PVM and hv != constants.HT_XEN_HVM:
+ mnode = hv_state["mem_node"]
+ # minimum allowed free memory (it's negative due to over-commitment)
+ mem_treshold = (n_img.mtotal - mnode) * (memory_ratio - 1)
+ test = n_img.mfree - needed_mem < mem_treshold
self._ErrorIf(test, constants.CV_ENODEN1,
self.cfg.GetNodeName(node_uuid),
"not enough memory to accomodate instance failovers"
@@ -1601,12 +1618,16 @@
"""
# try to read free memory (from the hypervisor)
hv_info = nresult.get(constants.NV_HVINFO, None)
- test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
+ test = not isinstance(hv_info, dict) or "memory_free" not in hv_info \
+ or "memory_total" not in hv_info \
+ or "memory_dom0" not in hv_info
self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
"rpc call to node failed (hvinfo)")
if not test:
try:
nimg.mfree = int(hv_info["memory_free"])
+ nimg.mtotal = int(hv_info["memory_total"])
+ nimg.mdom0 = int(hv_info["memory_dom0"])
except (ValueError, TypeError):
self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
"node returned invalid nodeinfo, check hypervisor")
diff --git a/lib/cmdlib/common.py b/lib/cmdlib/common.py
index 33142ef..6ee86b9 100644
--- a/lib/cmdlib/common.py
+++ b/lib/cmdlib/common.py
@@ -482,7 +482,9 @@
potential_master_candidates,
True, # add node's key to all node's 'authorized_keys'
True, # all nodes are potential master candidates
- False) # do not update the node's public keys
+ False, # do not update the node's public keys
+ lu.op.debug,
+ lu.op.verbose)
ssh_result[master_node].Raise(
"Could not update the SSH setup of node '%s' after promotion"
" (UUID: %s)." % (node.name, node.uuid))
diff --git a/lib/cmdlib/group.py b/lib/cmdlib/group.py
index 91f8752..2cf3483 100644
--- a/lib/cmdlib/group.py
+++ b/lib/cmdlib/group.py
@@ -851,6 +851,13 @@
self.dont_collate_locks[locking.LEVEL_NODEGROUP] = True
self.dont_collate_locks[locking.LEVEL_NODE] = True
+ # If run in strict mode, require locks for all nodes in the node group
+ # so we can verify all the disks. In non-strict mode, just verify the
+ # nodes that are available for locking.
+ if not self.op.is_strict:
+ self.opportunistic_locks[locking.LEVEL_NODE] = True
+ self.opportunistic_locks[locking.LEVEL_INSTANCE] = True
+
def DeclareLocks(self, level):
if level == locking.LEVEL_INSTANCE:
assert not self.needed_locks[locking.LEVEL_INSTANCE]
@@ -893,8 +900,9 @@
assert self.group_uuid in owned_groups
- # Check if locked instances are still correct
- CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_inst_names)
+ if self.op.is_strict:
+ # Check if locked instances are still correct
+ CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_inst_names)
# Get instance information
self.instances = dict(self.cfg.GetMultiInstanceInfoByName(owned_inst_names))
@@ -937,6 +945,7 @@
def _VerifyDrbdStates(self, node_errors, offline_disk_instance_names):
node_to_inst = {}
+ owned_node_uuids = set(self.owned_locks(locking.LEVEL_NODE))
for inst in self.instances.values():
disks = self.cfg.GetInstanceDisks(inst.uuid)
if not (inst.disks_active and
@@ -944,8 +953,10 @@
continue
secondary_nodes = self.cfg.GetInstanceSecondaryNodes(inst.uuid)
- for node_uuid in itertools.chain([inst.primary_node],
- secondary_nodes):
+ for node_uuid in itertools.chain([inst.primary_node], secondary_nodes):
+ if not node_uuid in owned_node_uuids:
+ logging.info("Node %s is not locked, skipping check.", node_uuid)
+ continue
node_to_inst.setdefault(node_uuid, []).append(inst)
for (node_uuid, insts) in node_to_inst.items():
diff --git a/lib/cmdlib/misc.py b/lib/cmdlib/misc.py
index 62bff52..d0bad88 100644
--- a/lib/cmdlib/misc.py
+++ b/lib/cmdlib/misc.py
@@ -40,7 +40,11 @@
from ganeti import query
from ganeti import utils
from ganeti.cmdlib.base import NoHooksLU, QueryBase
-from ganeti.cmdlib.common import GetWantedNodes, SupportsOob
+from ganeti.cmdlib.common import (
+ GetWantedNodes,
+ SupportsOob,
+ ExpandNodeUuidAndName
+)
class LUOobCommand(NoHooksLU):
@@ -418,3 +422,35 @@
result.append((True, nres.payload))
return result
+
+
+class LURepairCommand(NoHooksLU):
+ """Logical unit for executing repair commands.
+
+ """
+ REQ_BGL = False
+
+ def ExpandNames(self):
+ self.node_uuid, _ = ExpandNodeUuidAndName(self.cfg, None, self.op.node_name)
+
+ self.needed_locks = {
+ locking.LEVEL_NODE: self.node_uuid,
+ }
+ self.share_locks = {
+ locking.LEVEL_NODE: False,
+ }
+
+ def CheckPrereq(self):
+ """Check prerequisites.
+
+ """
+
+ def Exec(self, feedback_fn):
+ """Execute restricted command and return output.
+
+ """
+ owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
+ assert self.node_uuid in owned_nodes
+ return self.rpc.call_repair_command(self.op.node_name,
+ self.op.command,
+ self.op.input).data[1]
diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py
index 210fd97..d1eae5e 100644
--- a/lib/cmdlib/node.py
+++ b/lib/cmdlib/node.py
@@ -151,6 +151,24 @@
def PreparePostHookNodes(self, post_hook_node_uuids):
return post_hook_node_uuids + [self.new_node.uuid]
+ def HooksAbortCallBack(self, phase, feedback_fn, exception):
+ """Cleans up if the hooks fail.
+
+ This function runs actions that are necessary to bring the cluster into a
+ clean state again. This is necessary if for example the hooks of this
+ operation failed and leave the node in an inconsistent state.
+
+ """
+ if phase == constants.HOOKS_PHASE_PRE:
+ feedback_fn("Pre operation hook failed. Rolling back preparations.")
+
+ master_node = self.cfg.GetMasterNodeInfo().name
+ remove_result = self.rpc.call_node_ssh_key_remove_light(
+ [master_node],
+ self.op.node_name)
+ remove_result[master_node].Raise(
+ "Error removing SSH key of node '%s'." % self.op.node_name)
+
def CheckPrereq(self):
"""Check prerequisites.
@@ -358,7 +376,9 @@
True, # from public keys
False, # clear authorized keys
True, # clear public keys
- True) # it's a readd
+ True, # it's a readd
+ self.op.debug,
+ self.op.verbose)
remove_result[master_node].Raise(
"Could not remove SSH keys of node %s before readding,"
" (UUID: %s)." % (new_node_name, new_node_uuid))
@@ -368,7 +388,7 @@
[master_node], new_node_uuid, new_node_name,
potential_master_candidates,
is_master_candidate, is_potential_master_candidate,
- is_potential_master_candidate)
+ is_potential_master_candidate, self.op.debug, self.op.verbose)
result[master_node].Raise("Could not update the node's SSH setup.")
WarnAboutFailedSshUpdates(result, master_node, feedback_fn)
@@ -874,7 +894,9 @@
False, # currently, all nodes are potential master candidates
False, # do not clear node's 'authorized_keys'
False, # do not clear node's 'ganeti_pub_keys'
- False) # no readd
+ False, # no readd
+ self.op.debug,
+ self.op.verbose)
ssh_result[master_node].Raise(
"Could not adjust the SSH setup after demoting node '%s'"
" (UUID: %s)." % (node.name, node.uuid))
@@ -1574,7 +1596,9 @@
potential_master_candidate, # from_public_keys
True, # clear node's 'authorized_keys'
True, # clear node's 'ganeti_public_keys'
- False) # no readd
+ False, # no readd
+ self.op.debug,
+ self.op.verbose)
result[master_node].Raise(
"Could not remove the SSH key of node '%s' (UUID: %s)." %
(self.op.node_name, self.node.uuid))
diff --git a/lib/config/__init__.py b/lib/config/__init__.py
index 095fb88..16b6ee1 100644
--- a/lib/config/__init__.py
+++ b/lib/config/__init__.py
@@ -225,6 +225,30 @@
"""
return self._UnlockedGetNdParams(node)
+ def _UnlockedGetFilledHvStateParams(self, node):
+ cfg = self._ConfigData()
+ cluster_hv_state = cfg.cluster.hv_state_static
+ def_hv = self._UnlockedGetHypervisorType()
+ cluster_fv = constants.HVST_DEFAULTS if def_hv not in cluster_hv_state \
+ else cluster_hv_state[def_hv]
+ group_hv_state = self._UnlockedGetNodeGroup(node.group).hv_state_static
+ group_fv = cluster_fv if def_hv not in group_hv_state else \
+ objects.FillDict(cluster_fv, group_hv_state[def_hv])
+ node_fv = group_fv if def_hv not in node.hv_state_static else \
+ objects.FillDict(group_fv, node.hv_state_static[def_hv])
+ return {def_hv: node_fv}
+
+ @ConfigSync(shared=1)
+ def GetFilledHvStateParams(self, node):
+ """Get the node params populated with cluster defaults.
+
+ @type node: L{objects.Node}
+ @param node: The node we want to know the params for
+ @return: A dict with the filled in node hv_state params for the default hv
+
+ """
+ return self._UnlockedGetFilledHvStateParams(node)
+
@ConfigSync(shared=1)
def GetNdGroupParams(self, nodegroup):
"""Get the node groups params populated with cluster defaults.
@@ -1267,12 +1291,18 @@
"""
return self._ConfigData().cluster.gluster_storage_dir
+ def _UnlockedGetHypervisorType(self):
+ """Get the hypervisor type for this cluster.
+
+ """
+ return self._ConfigData().cluster.enabled_hypervisors[0]
+
@ConfigSync(shared=1)
def GetHypervisorType(self):
"""Get the hypervisor type for this cluster.
"""
- return self._ConfigData().cluster.enabled_hypervisors[0]
+ return self._UnlockedGetHypervisorType()
@ConfigSync(shared=1)
def GetRsaHostKey(self):
@@ -2949,6 +2979,21 @@
self._ConfigData().cluster.serial_no += 1
@ConfigSync(shared=1)
+ def GetDiagnoseDataCollectorFilename(self):
+ """Return the diagnose data collector filename
+
+ """
+ return self._ConfigData().cluster.diagnose_data_collector_filename
+
+ @ConfigSync()
+ def SetDiagnoseDataCollectorFilename(self, fn):
+ """Set the diagnose data collector filename.
+
+ """
+ self._ConfigData().cluster.diagnose_data_collector_filename = fn
+ self._ConfigData().cluster.serial_no += 1
+
+ @ConfigSync(shared=1)
def GetDRBDHelper(self):
"""Return DRBD usermode helper.
@@ -3377,6 +3422,21 @@
if disk_uuid in inst_info.disks:
return inst_uuid
+ def SetMaintdRoundDelay(self, delay):
+ """Set the minimal time the maintenance daemon should wait between rounds"""
+ utils.SimpleRetry(True, self._wconfd.SetMaintdRoundDelay, 0.1, 30,
+ args=[delay])
+
+ def SetMaintdBalance(self, flag):
+ """Enable/disable auto-balancing by the maintenance daemon"""
+ utils.SimpleRetry(True, self._wconfd.SetMaintdBalance, 0.1, 30,
+ args=[flag])
+
+ def SetMaintdBalanceThreshold(self, score):
+ """Set the minimal score improvement per move for balancing steps"""
+ utils.SimpleRetry(True, self._wconfd.SetMaintdBalanceThreshold, 0.1, 30,
+ args=[score])
+
class DetachedConfig(ConfigWriter):
"""Read-only snapshot of the config."""
diff --git a/lib/masterd/iallocator.py b/lib/masterd/iallocator.py
index ed6b358..631acff 100644
--- a/lib/masterd/iallocator.py
+++ b/lib/masterd/iallocator.py
@@ -572,6 +572,7 @@
"master_capable": ninfo.master_capable,
"vm_capable": ninfo.vm_capable,
"ndparams": cfg.GetNdParams(ninfo),
+ "hv_state": cfg.GetFilledHvStateParams(ninfo)
})
for ninfo in node_cfg.values())
diff --git a/lib/mcpu.py b/lib/mcpu.py
index bdcc6af..41021ef 100644
--- a/lib/mcpu.py
+++ b/lib/mcpu.py
@@ -483,7 +483,16 @@
lu.CheckPrereq()
hm = self.BuildHooksManager(lu)
- h_results = hm.RunPhase(constants.HOOKS_PHASE_PRE)
+ try:
+ h_results = hm.RunPhase(constants.HOOKS_PHASE_PRE)
+ except Exception, err: # pylint: disable=W0703
+ # This gives the LU a chance of cleaning up in case of a hooks failure.
+ # The type of exception is deliberately broad to be able to react to
+ # any kind of failure.
+ lu.HooksAbortCallBack(constants.HOOKS_PHASE_PRE, self.Log, err)
+ # We re-raise the exception to not alter the behavior of LU handling
+ # otherwise.
+ raise err
lu.HooksCallBack(constants.HOOKS_PHASE_PRE, h_results,
self.Log, None)
diff --git a/lib/objects.py b/lib/objects.py
index e91719e..f53f846 100644
--- a/lib/objects.py
+++ b/lib/objects.py
@@ -63,7 +63,7 @@
__all__ = ["ConfigObject", "ConfigData", "NIC", "Disk", "Instance",
"OS", "Node", "NodeGroup", "Cluster", "FillDict", "Network",
- "Filter"]
+ "Filter", "Maintenance"]
_TIMESTAMPS = ["ctime", "mtime"]
_UUID = ["uuid"]
@@ -416,6 +416,7 @@
"networks",
"disks",
"filters",
+ "maintenance",
"serial_no",
] + _TIMESTAMPS
@@ -428,6 +429,7 @@
"""
mydict = super(ConfigData, self).ToDict(_with_private=_with_private)
mydict["cluster"] = mydict["cluster"].ToDict()
+ mydict["maintenance"] = mydict["maintenance"].ToDict()
for key in ("nodes", "instances", "nodegroups", "networks", "disks",
"filters"):
mydict[key] = outils.ContainerToDicts(mydict[key])
@@ -449,6 +451,7 @@
obj.networks = outils.ContainerFromDicts(obj.networks, dict, Network)
obj.disks = outils.ContainerFromDicts(obj.disks, dict, Disk)
obj.filters = outils.ContainerFromDicts(obj.filters, dict, Filter)
+ obj.maintenance = Maintenance.FromDict(obj.maintenance)
return obj
def DisksOfType(self, dev_type):
@@ -491,6 +494,9 @@
disk.UpgradeConfig()
if self.filters is None:
self.filters = {}
+ if self.maintenance is None:
+ self.maintenance = Maintenance.FromDict({})
+ self.maintenance.UpgradeConfig()
def _UpgradeEnabledDiskTemplates(self):
"""Upgrade the cluster's enabled disk templates by inspecting the currently
@@ -549,6 +555,20 @@
"predicates", "action", "reason_trail"] + _UUID
+class Maintenance(ConfigObject):
+ """Config object representing the state of the maintenance daemon"""
+ __slots__ = ["roundDelay", "jobs", "evacuated", "balance", "balanceThreshold",
+ "incidents", "serial_no"] + _TIMESTAMPS
+
+ def UpgradeConfig(self):
+ if self.serial_no is None:
+ self.serial_no = 1
+ if self.mtime is None:
+ self.mtime = time.time()
+ if self.ctime is None:
+ self.ctime = time.time()
+
+
class Disk(ConfigObject):
"""Config object representing a block device."""
__slots__ = [
@@ -1493,6 +1513,11 @@
if self.powered is None:
self.powered = True
+ if self.hv_state_static is None:
+ self.hv_state_static = {}
+ if self.disk_state_static is None:
+ self.disk_state_static = {}
+
def ToDict(self, _with_private=False):
"""Custom function for serializing.
@@ -1590,6 +1615,11 @@
if self.ipolicy is None:
self.ipolicy = MakeEmptyIPolicy()
+ if self.hv_state_static is None:
+ self.hv_state_static = {}
+ if self.disk_state_static is None:
+ self.disk_state_static = {}
+
if self.networks is None:
self.networks = {}
@@ -1675,6 +1705,7 @@
"compression_tools",
"enabled_user_shutdown",
"data_collectors",
+ "diagnose_data_collector_filename",
"ssh_key_type",
"ssh_key_bits",
] + _TIMESTAMPS + _UUID
diff --git a/lib/pathutils.py b/lib/pathutils.py
index 77a1cc4..78e321a 100644
--- a/lib/pathutils.py
+++ b/lib/pathutils.py
@@ -123,6 +123,7 @@
HOOKS_BASE_DIR = CONF_DIR + "/hooks"
FILE_STORAGE_PATHS_FILE = CONF_DIR + "/file-storage-paths"
RESTRICTED_COMMANDS_DIR = CONF_DIR + "/restricted-commands"
+REPAIR_COMMANDS_DIR = CONF_DIR + "/node-repair-commands"
#: Node daemon certificate path
NODED_CERT_FILE = DATA_DIR + "/server.pem"
@@ -134,6 +135,9 @@
#: Locked in exclusive mode while noded verifies a remote command
RESTRICTED_COMMANDS_LOCK_FILE = LOCK_DIR + "/ganeti-restricted-commands.lock"
+#: Locked in exclusive mode while noded verifies a repair command
+REPAIR_COMMANDS_LOCK_FILE = LOCK_DIR + "/ganeti-repair-commands.lock"
+
#: Lock file for watcher, locked in shared mode by watcher; lock in exclusive
# mode to block watcher (see L{cli._RunWhileDaemonsStoppedHelper.Call}
WATCHER_LOCK_FILE = LOCK_DIR + "/ganeti-watcher.lock"
@@ -190,3 +194,4 @@
LOG_WATCHER = GetLogFilename("watcher")
LOG_COMMANDS = GetLogFilename("commands")
LOG_BURNIN = GetLogFilename("burnin")
+LOG_TOOLS = GetLogFilename("tools")
diff --git a/lib/query.py b/lib/query.py
index dfeccf5..86c72b6 100644
--- a/lib/query.py
+++ b/lib/query.py
@@ -1309,32 +1309,6 @@
return _FS_UNAVAIL
-def _GetNodeHvState(_, node):
- """Converts node's hypervisor state for query result.
-
- """
- hv_state = node.hv_state
-
- if hv_state is None:
- return _FS_UNAVAIL
-
- return dict((name, value.ToDict()) for (name, value) in hv_state.items())
-
-
-def _GetNodeDiskState(_, node):
- """Converts node's disk state for query result.
-
- """
- disk_state = node.disk_state
-
- if disk_state is None:
- return _FS_UNAVAIL
-
- return dict((disk_kind, dict((name, value.ToDict())
- for (name, value) in kind_state.items()))
- for (disk_kind, kind_state) in disk_state.items())
-
-
def _BuildNodeFields():
"""Builds list of fields for node queries.
@@ -1361,10 +1335,16 @@
(_MakeField("custom_ndparams", "CustomNodeParameters", QFT_OTHER,
"Custom node parameters"),
NQ_GROUP, 0, _GetItemAttr("ndparams")),
- (_MakeField("hv_state", "HypervisorState", QFT_OTHER, "Hypervisor state"),
- NQ_CONFIG, 0, _GetNodeHvState),
+ # FIXME: The code below returns custom hv_state instead of the filled one.
+ # Anyway, this functionality is unlikely to be used.
+ (_MakeField("hv_state", "HypervisorState", QFT_OTHER,
+ "Static hypervisor state for default hypervisor only"),
+ NQ_CONFIG, 0, _GetItemAttr("hv_state_static")),
+ (_MakeField("custom_hv_state", "CustomHypervisorState", QFT_OTHER,
+ "Custom static hypervisor state"),
+ NQ_CONFIG, 0, _GetItemAttr("hv_state_static")),
(_MakeField("disk_state", "DiskState", QFT_OTHER, "Disk state"),
- NQ_CONFIG, 0, _GetNodeDiskState),
+ NQ_CONFIG, 0, _GetItemAttr("disk_state_static")),
]
fields.extend(_BuildNDFields(False))
@@ -2451,6 +2431,9 @@
(_MakeField("ipolicy", "InstancePolicy", QFT_OTHER,
"Instance policy limitations (merged)"),
GQ_CONFIG, 0, lambda ctx, _: ctx.group_ipolicy),
+ (_MakeField("networks", "Networks", QFT_OTHER,
+ "Node group networks"),
+ GQ_CONFIG, 0, _GetItemAttr("networks")),
(_MakeField("custom_ipolicy", "CustomInstancePolicy", QFT_OTHER,
"Custom instance policy limitations"),
GQ_CONFIG, 0, _GetItemAttr("ipolicy")),
@@ -2466,6 +2449,11 @@
(_MakeField("custom_diskparams", "CustomDiskParameters", QFT_OTHER,
"Custom disk parameters"),
GQ_CONFIG, 0, _GetItemAttr("diskparams")),
+ (_MakeField("hv_state", "HypervisorState", QFT_OTHER,
+ "Custom static hypervisor state"),
+ GQ_CONFIG, 0, _GetItemAttr("hv_state_static")),
+ (_MakeField("disk_state", "DiskState", QFT_OTHER, "Disk state"),
+ GQ_CONFIG, 0, _GetItemAttr("disk_state_static")),
])
# ND parameters
@@ -2778,6 +2766,11 @@
(_MakeField("master_node", "Master", QFT_TEXT, "Master node name"),
CQ_CONFIG, QFF_HOSTNAME,
lambda ctx, cluster: _GetNodeName(ctx, None, cluster.master_node)),
+ (_MakeField("hv_state", "HypervisorState", QFT_OTHER,
+ "Custom static hypervisor state"),
+ CQ_CONFIG, 0, _GetItemAttr("hv_state_static")),
+ (_MakeField("disk_state", "DiskState", QFT_OTHER, "Disk state"),
+ CQ_CONFIG, 0, _GetItemAttr("disk_state_static")),
]
# Simple fields
diff --git a/lib/rapi/rlib2.py b/lib/rapi/rlib2.py
index 14c12ac..8514fcb 100644
--- a/lib/rapi/rlib2.py
+++ b/lib/rapi/rlib2.py
@@ -93,7 +93,7 @@
N_FIELDS = ["name", "offline", "master_candidate", "drained",
"dtotal", "dfree", "sptotal", "spfree",
- "mtotal", "mnode", "mfree",
+ "mtotal", "mnode", "mfree", "hv_state",
"pinst_cnt", "sinst_cnt",
"ctotal", "cnos", "cnodes", "csockets",
"pip", "sip", "role",
@@ -121,7 +121,7 @@
"diskparams",
"custom_diskparams",
"ndparams",
- "custom_ndparams",
+ "custom_ndparams"
] + _COMMON_FIELDS
FILTER_RULE_FIELDS = [
diff --git a/lib/rpc_defs.py b/lib/rpc_defs.py
index 71fa231..48f2ecb 100644
--- a/lib/rpc_defs.py
+++ b/lib/rpc_defs.py
@@ -543,7 +543,9 @@
("to_public_keys", None, "Whether the node's key should be added"
" to all nodes' public key file"),
("get_public_keys", None, "Whether the node should get the other nodes'"
- " public keys")],
+ " public keys"),
+ ("debug", None, "Set loglevel of ssh calls to 'debug'."),
+ ("verbose", None, "Set loglevel of ssh calls to 'verbose'.")],
None, None, "Distribute a new node's public SSH key on the cluster."),
("node_ssh_key_remove", MULTI, None, constants.RPC_TMO_FAST, [
("node_uuid", None, "UUID of the node whose key is removed"),
@@ -559,7 +561,9 @@
("clear_public_keys", None,
"If the 'ganeti_pub_keys' file of the node should be cleared."),
("readd", None,
- "Whether this is a readd operation.")],
+ "Whether this is a readd operation."),
+ ("debug", None, "Set loglevel of ssh calls to 'debug'."),
+ ("verbose", None, "Set loglevel of ssh calls to 'verbose'.")],
None, None, "Remove a node's SSH key from the other nodes' key files."),
("node_ssh_keys_renew", MULTI, None, constants.RPC_TMO_4HRS, [
("node_uuids", None, "UUIDs of the nodes whose key is renewed"),
@@ -568,8 +572,13 @@
("potential_master_candidates", None, "Potential master candidates"),
("old_key_type", None, "The type of key previously used"),
("new_key_type", None, "The type of key to generate"),
- ("new_key_bits", None, "The length of the key to generate")],
+ ("new_key_bits", None, "The length of the key to generate"),
+ ("debug", None, "Set logging of SSH update tool to 'debug'."),
+ ("verbose", None, "Set logging of SSH update tool to 'info'.")],
None, None, "Renew all SSH key pairs of all nodes nodes."),
+ ("node_ssh_key_remove_light", MULTI, None, constants.RPC_TMO_FAST, [
+ ("node_name", None, "Name of the node whose key is removed")],
+ None, None, "Remove a node's SSH key from the master's public key file."),
]
_MISC_CALLS = [
@@ -593,6 +602,10 @@
("restricted_command", MULTI, None, constants.RPC_TMO_SLOW, [
("cmd", None, "Command name"),
], None, None, "Runs restricted command"),
+ ("repair_command", SINGLE, None, constants.RPC_TMO_SLOW, [
+ ("cmd", None, "Command name"),
+ ("inp", None, "Input to be passed as stdin"),
+ ], None, None, "Runs repair command"),
("run_oob", SINGLE, None, constants.RPC_TMO_NORMAL, [
("oob_program", None, None),
("command", None, None),
diff --git a/lib/server/noded.py b/lib/server/noded.py
index a5e05dd..1397fbd 100644
--- a/lib/server/noded.py
+++ b/lib/server/noded.py
@@ -932,12 +932,15 @@
"""
(node_uuid, node_name, potential_master_candidates,
- to_authorized_keys, to_public_keys, get_public_keys) = params
+ to_authorized_keys, to_public_keys, get_public_keys,
+ debug, verbose) = params
return backend.AddNodeSshKey(node_uuid, node_name,
potential_master_candidates,
to_authorized_keys=to_authorized_keys,
to_public_keys=to_public_keys,
- get_public_keys=get_public_keys)
+ get_public_keys=get_public_keys,
+ ssh_update_debug=debug,
+ ssh_update_verbose=verbose)
@staticmethod
def perspective_node_ssh_keys_renew(params):
@@ -946,10 +949,12 @@
"""
(node_uuids, node_names, master_candidate_uuids,
potential_master_candidates, old_key_type, new_key_type,
- new_key_bits) = params
+ new_key_bits, debug, verbose) = params
return backend.RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
potential_master_candidates, old_key_type,
- new_key_type, new_key_bits)
+ new_key_type, new_key_bits,
+ ssh_update_debug=debug,
+ ssh_update_verbose=verbose)
@staticmethod
def perspective_node_ssh_key_remove(params):
@@ -959,7 +964,7 @@
(node_uuid, node_name,
master_candidate_uuids, potential_master_candidates,
from_authorized_keys, from_public_keys, clear_authorized_keys,
- clear_public_keys, readd) = params
+ clear_public_keys, readd, debug, verbose) = params
return backend.RemoveNodeSshKey(node_uuid, node_name,
master_candidate_uuids,
potential_master_candidates,
@@ -967,7 +972,17 @@
from_public_keys=from_public_keys,
clear_authorized_keys=clear_authorized_keys,
clear_public_keys=clear_public_keys,
- readd=readd)
+ readd=readd,
+ ssh_update_debug=debug,
+ ssh_update_verbose=verbose)
+
+ @staticmethod
+ def perspective_node_ssh_key_remove_light(params):
+ """Removes a node's SSH key from the master's public key file.
+
+ """
+ (node_name, ) = params
+ return backend.RemoveSshKeyFromPublicKeyFile(node_name)
# cluster --------------------------
@@ -1024,7 +1039,23 @@
"""
(cmd, ) = params
- return backend.RunRestrictedCmd(cmd)
+ return backend.RunConstrainedCmd(
+ cmd,
+ lock_file=pathutils.RESTRICTED_COMMANDS_LOCK_FILE,
+ path=pathutils.RESTRICTED_COMMANDS_DIR)
+
+ @staticmethod
+ def perspective_repair_command(params):
+ """ Run a repair command.
+
+ """
+ (cmd, inp, ) = params
+
+ return backend.RunConstrainedCmd(
+ cmd,
+ lock_file=pathutils.REPAIR_COMMANDS_LOCK_FILE,
+ path=pathutils.REPAIR_COMMANDS_DIR,
+ inp=inp)
@staticmethod
def perspective_write_ssconf_files(params):
diff --git a/lib/ssh.py b/lib/ssh.py
index a8fe86d..0fb592b 100644
--- a/lib/ssh.py
+++ b/lib/ssh.py
@@ -35,6 +35,7 @@
import logging
import os
+import shutil
import tempfile
from collections import namedtuple
@@ -1073,8 +1074,8 @@
(result.cmd, result.fail_reason))
-def ReadRemoteSshPubKeys(pub_key_file, node, cluster_name, port, ask_key,
- strict_host_check):
+def ReadRemoteSshPubKey(pub_key_file, node, cluster_name, port, ask_key,
+ strict_host_check):
"""Fetches a public SSH key from a node via SSH.
@type pub_key_file: string
@@ -1100,6 +1101,153 @@
return result.stdout
+def GetSshKeyFilenames(key_type, suffix=""):
+ """Get filenames of the SSH key pair of the given type.
+
+ @type key_type: string
+ @param key_type: type of SSH key, must be element of C{constants.SSHK_ALL}
+ @type suffix: string
+ @param suffix: optional suffix for the key filenames
+ @rtype: tuple of (string, string)
+ @returns: a tuple containing the name of the private key file and the
+ public key file.
+
+ """
+ if key_type not in constants.SSHK_ALL:
+ raise errors.SshUpdateError("Unsupported key type '%s'. Supported key types"
+ " are: %s." % (key_type, constants.SSHK_ALL))
+ (_, root_keyfiles) = \
+ GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
+ if not key_type in root_keyfiles.keys():
+ raise errors.SshUpdateError("No keyfile for key type '%s' available."
+ % key_type)
+
+ key_filenames = root_keyfiles[key_type]
+ if suffix:
+ key_filenames = [_ComputeKeyFilePathWithSuffix(key_filename, suffix)
+ for key_filename in key_filenames]
+
+ return key_filenames
+
+
+def GetSshPubKeyFilename(key_type, suffix=""):
+ """Get filename of the public SSH key of the given type.
+
+ @type key_type: string
+ @param key_type: type of SSH key, must be element of C{constants.SSHK_ALL}
+ @type suffix: string
+ @param suffix: optional suffix for the key filenames
+ @rtype: string
+ @returns: file name of the public key file
+
+ """
+ return GetSshKeyFilenames(key_type, suffix=suffix)[1]
+
+
+def _ComputeKeyFilePathWithSuffix(key_filepath, suffix):
+ """Converts the given key filename to a key filename with a suffix.
+
+ @type key_filepath: string
+ @param key_filepath: path of the key file
+ @type suffix: string
+ @param suffix: suffix to be appended to the basename of the file
+
+ """
+ path = os.path.dirname(key_filepath)
+ ext = os.path.splitext(os.path.basename(key_filepath))[1]
+ basename = os.path.splitext(os.path.basename(key_filepath))[0]
+ return os.path.join(path, basename + suffix + ext)
+
+
+def ReplaceSshKeys(src_key_type, dest_key_type,
+ src_key_suffix="", dest_key_suffix=""):
+ """Replaces an SSH key pair by another SSH key pair.
+
+ Note that both parts, the private and the public key, are replaced.
+
+ @type src_key_type: string
+ @param src_key_type: key type of key pair that is replacing the other
+ key pair
+ @type dest_key_type: string
+ @param dest_key_type: key type of the key pair that is being replaced
+ by the source key pair
+ @type src_key_suffix: string
+ @param src_key_suffix: optional suffix of the key files of the source
+ key pair
+ @type dest_key_suffix: string
+ @param dest_key_suffix: optional suffix of the key files of the
+ destination key pair
+
+ """
+ (src_priv_filename, src_pub_filename) = GetSshKeyFilenames(
+ src_key_type, suffix=src_key_suffix)
+ (dest_priv_filename, dest_pub_filename) = GetSshKeyFilenames(
+ dest_key_type, suffix=dest_key_suffix)
+
+ if not (os.path.exists(src_priv_filename) and
+ os.path.exists(src_pub_filename)):
+ raise errors.SshUpdateError(
+ "At least one of the source key files is missing: %s",
+ ", ".join([src_priv_filename, src_pub_filename]))
+
+ for dest_file in [dest_priv_filename, dest_pub_filename]:
+ if os.path.exists(dest_file):
+ utils.CreateBackup(dest_file)
+ utils.RemoveFile(dest_file)
+
+ shutil.move(src_priv_filename, dest_priv_filename)
+ shutil.move(src_pub_filename, dest_pub_filename)
+
+
+def ReadLocalSshPubKeys(key_types, suffix=""):
+ """Reads the local root user SSH keys.
+
+ @type key_types: list of string
+ @param key_types: types of SSH keys. Must be subset of constants.SSHK_ALL. If
+ 'None' or [], all available keys are returned.
+ @type suffix: string
+ @param suffix: optional suffix to be attached to key names when reading
+ them. Used for temporary key files.
+ @rtype: list of string
+ @return: list of public keys
+
+ """
+ fetch_key_types = []
+ if key_types:
+ fetch_key_types += key_types
+ else:
+ fetch_key_types = constants.SSHK_ALL
+
+ (_, root_keyfiles) = \
+ GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
+
+ result_keys = []
+ for (public_key_type, (_, public_key_file)) in root_keyfiles.items():
+
+ if public_key_type not in fetch_key_types:
+ continue
+
+ public_key_dir = os.path.dirname(public_key_file)
+ public_key_filename = ""
+ if suffix:
+ public_key_filename = \
+ os.path.splitext(os.path.basename(public_key_file))[0] \
+ + suffix + ".pub"
+ else:
+ public_key_filename = public_key_file
+ public_key_path = os.path.join(public_key_dir,
+ public_key_filename)
+
+ if not os.path.exists(public_key_path):
+ raise errors.SshUpdateError("Cannot find SSH public key of type '%s'."
+ % public_key_type)
+ else:
+ key = utils.ReadFile(public_key_path)
+ result_keys.append(key)
+
+ return result_keys
+
+
# Update gnt-cluster.rst when changing which combinations are valid.
KeyBitInfo = namedtuple('KeyBitInfo', ['default', 'validation_fn'])
SSH_KEY_VALID_BITS = {
diff --git a/lib/tools/cfgupgrade.py b/lib/tools/cfgupgrade.py
index 14e2e20..59ab1e1 100644
--- a/lib/tools/cfgupgrade.py
+++ b/lib/tools/cfgupgrade.py
@@ -59,11 +59,11 @@
#: Target major version we will upgrade to
TARGET_MAJOR = 2
#: Target minor version we will upgrade to
-TARGET_MINOR = 16
+TARGET_MINOR = 17
#: Target major version for downgrade
DOWNGRADE_MAJOR = 2
#: Target minor version for downgrade
-DOWNGRADE_MINOR = 15
+DOWNGRADE_MINOR = 16
# map of legacy device types
# (mapping differing old LD_* constants to new DT_* constants)
@@ -183,8 +183,8 @@
self._Downgrade(config_major, config_minor, config_version,
config_revision)
- # Upgrade from 2.{0..15} to 2.16
- elif config_major == 2 and config_minor in range(0, 16):
+ # Upgrade from 2.{0..n-1} to 2.n
+ elif config_major == 2 and config_minor in range(0, TARGET_MINOR):
if config_revision != 0:
logging.warning("Config revision is %s, not 0", config_revision)
if not self.UpgradeAll():
@@ -340,6 +340,8 @@
cluster["data_collectors"].get(
name, dict(active=True,
interval=constants.MOND_TIME_INTERVAL * 1e6))
+ if "diagnose_data_collector_filename" not in cluster:
+ cluster["diagnose_data_collector_filename"] = ""
# These parameters are set to pre-2.16 default values, which
# differ from post-2.16 default values
@@ -696,6 +698,14 @@
else:
disk["nodes"] = []
+ @OrFail("Upgrading maintenance data")
+ def UpgradeMaintenance(self):
+ # pylint can't infer config_data type
+ # pylint: disable=E1103
+ maintenance = self.config_data.get("maintenance", None)
+ if maintenance is None:
+ self.config_data["maintenance"] = {}
+
def UpgradeAll(self):
self.config_data["version"] = version.BuildVersion(TARGET_MAJOR,
TARGET_MINOR, 0)
@@ -711,48 +721,33 @@
self.UpgradeInstanceIndices,
self.UpgradeFilters,
self.UpgradeDiskNodes,
- self.UpgradeDiskTemplate]
+ self.UpgradeDiskTemplate,
+ self.UpgradeMaintenance]
for s in steps:
s()
return not self.errors
# DOWNGRADE ------------------------------------------------------------
- @OrFail("Removing SSH parameters")
- def DowngradeSshKeyParams(self):
- """Removes the SSH key type and bits parameters from the config.
-
- Also fails if these have been changed from values appropriate in lower
- Ganeti versions.
-
- """
- # pylint: disable=E1103
- # Because config_data is a dictionary which has the get method.
- cluster = self.config_data.get("cluster", None)
- if cluster is None:
- raise Error("Can't find the cluster entry in the configuration")
-
- def _FetchAndDelete(key):
- val = cluster.get(key, None)
- if key in cluster:
- del cluster[key]
- return val
-
- ssh_key_type = _FetchAndDelete("ssh_key_type")
- _FetchAndDelete("ssh_key_bits")
-
- if ssh_key_type is not None and ssh_key_type != "dsa":
- raise Error("The current Ganeti setup is using non-DSA SSH keys, and"
- " versions below 2.16 do not support these. To downgrade,"
- " please perform a gnt-cluster renew-crypto using the "
- " --new-ssh-keys and --ssh-key-type=dsa options, generating"
- " DSA keys that older versions can also use.")
-
def DowngradeAll(self):
+ if "maintenance" in self.config_data:
+ del self.config_data["maintenance"]
+ if "cluster" in self.config_data:
+ cluster = self.config_data["cluster"]
+ if "diagnose_data_collector_filename" in cluster:
+ del cluster["diagnose_data_collector_filename"]
+ if "data_collectors" in cluster:
+ if constants.DATA_COLLECTOR_DIAGNOSE in cluster["data_collectors"]:
+ del cluster["data_collectors"][constants.DATA_COLLECTOR_DIAGNOSE]
+ if constants.DATA_COLLECTOR_KVM_R_S_S in cluster["data_collectors"]:
+ del cluster["data_collectors"][constants.DATA_COLLECTOR_KVM_R_S_S]
+ if "ipolicy" in cluster:
+ ipolicy = cluster["ipolicy"]
+ if "memory-ratio" in ipolicy:
+ del ipolicy["memory-ratio"]
self.config_data["version"] = version.BuildVersion(DOWNGRADE_MAJOR,
DOWNGRADE_MINOR, 0)
- self.DowngradeSshKeyParams()
return not self.errors
def _ComposePaths(self):
diff --git a/lib/tools/common.py b/lib/tools/common.py
index 60fe169..d8f1588 100644
--- a/lib/tools/common.py
+++ b/lib/tools/common.py
@@ -182,6 +182,19 @@
return name
+def VerifyHmac(data, error_fn):
+ """Verifies the presence of the hmac secret.
+
+ @type data: dict
+
+ """
+ hmac = data.get(constants.NDS_HMAC)
+ if not hmac:
+ raise error_fn("Hmac key must be provided")
+
+ return hmac
+
+
def LoadData(raw, data_check):
"""Parses and verifies input data.
diff --git a/lib/tools/ensure_dirs.py b/lib/tools/ensure_dirs.py
index 0a197ba..66b37e8 100644
--- a/lib/tools/ensure_dirs.py
+++ b/lib/tools/ensure_dirs.py
@@ -250,7 +250,9 @@
"""
(opts, args) = ParseOptions()
- utils.SetupToolLogging(opts.debug, opts.verbose)
+ utils.SetupToolLogging(
+ opts.debug, opts.verbose,
+ toolname=os.path.splitext(os.path.basename(__file__))[0])
if args:
logging.error("No arguments are expected")
diff --git a/lib/tools/node_cleanup.py b/lib/tools/node_cleanup.py
index 7a9ff81..1324db8 100644
--- a/lib/tools/node_cleanup.py
+++ b/lib/tools/node_cleanup.py
@@ -80,7 +80,9 @@
"""
opts = ParseOptions()
- utils.SetupToolLogging(opts.debug, opts.verbose)
+ utils.SetupToolLogging(
+ opts.debug, opts.verbose,
+ toolname=os.path.splitext(os.path.basename(__file__))[0])
try:
# List of files to delete. Contains tuples consisting of the absolute path
diff --git a/lib/tools/node_daemon_setup.py b/lib/tools/node_daemon_setup.py
index e45e2e0..c971d15 100644
--- a/lib/tools/node_daemon_setup.py
+++ b/lib/tools/node_daemon_setup.py
@@ -51,6 +51,7 @@
_DATA_CHECK = ht.TStrictDict(False, True, {
constants.NDS_CLUSTER_NAME: ht.TNonEmptyString,
constants.NDS_NODE_DAEMON_CERTIFICATE: ht.TNonEmptyString,
+ constants.NDS_HMAC: ht.TNonEmptyString,
constants.NDS_SSCONF: ht.TDictOf(ht.TNonEmptyString, ht.TString),
constants.NDS_START_NODE_DAEMON: ht.TBool,
constants.NDS_NODE_NAME: ht.TString,
@@ -117,7 +118,9 @@
"""
opts = ParseOptions()
- utils.SetupToolLogging(opts.debug, opts.verbose)
+ utils.SetupToolLogging(
+ opts.debug, opts.verbose,
+ toolname=os.path.splitext(os.path.basename(__file__))[0])
try:
getent = runtime.GetEnts()
@@ -127,11 +130,18 @@
cluster_name = common.VerifyClusterName(data, SetupError,
constants.NDS_CLUSTER_NAME)
cert_pem = common.VerifyCertificateStrong(data, SetupError)
+ hmac_key = common.VerifyHmac(data, SetupError)
ssdata = VerifySsconf(data, cluster_name)
logging.info("Writing ssconf files ...")
ssconf.WriteSsconfFiles(ssdata, dry_run=opts.dry_run)
+ logging.info("Writing hmac.key ...")
+ utils.WriteFile(pathutils.CONFD_HMAC_KEY, data=hmac_key,
+ mode=pathutils.NODED_CERT_MODE,
+ uid=getent.masterd_uid, gid=getent.masterd_gid,
+ dry_run=opts.dry_run)
+
logging.info("Writing node daemon certificate ...")
utils.WriteFile(pathutils.NODED_CERT_FILE, data=cert_pem,
mode=pathutils.NODED_CERT_MODE,
diff --git a/lib/tools/prepare_node_join.py b/lib/tools/prepare_node_join.py
index fa45a58..0a0e2c8 100644
--- a/lib/tools/prepare_node_join.py
+++ b/lib/tools/prepare_node_join.py
@@ -195,7 +195,9 @@
"""
opts = ParseOptions()
- utils.SetupToolLogging(opts.debug, opts.verbose)
+ utils.SetupToolLogging(
+ opts.debug, opts.verbose,
+ toolname=os.path.splitext(os.path.basename(__file__))[0])
try:
data = common.LoadData(sys.stdin.read(), _DATA_CHECK)
diff --git a/lib/tools/ssh_update.py b/lib/tools/ssh_update.py
index b37972e..23f5077 100644
--- a/lib/tools/ssh_update.py
+++ b/lib/tools/ssh_update.py
@@ -210,7 +210,9 @@
"""
opts = ParseOptions()
- utils.SetupToolLogging(opts.debug, opts.verbose)
+ utils.SetupToolLogging(
+ opts.debug, opts.verbose,
+ toolname=os.path.splitext(os.path.basename(__file__))[0])
try:
data = common.LoadData(sys.stdin.read(), _DATA_CHECK)
diff --git a/lib/tools/ssl_update.py b/lib/tools/ssl_update.py
index 56e8d6a..05be975 100644
--- a/lib/tools/ssl_update.py
+++ b/lib/tools/ssl_update.py
@@ -114,7 +114,9 @@
"""
opts = ParseOptions()
- utils.SetupToolLogging(opts.debug, opts.verbose)
+ utils.SetupToolLogging(
+ opts.debug, opts.verbose,
+ toolname=os.path.splitext(os.path.basename(__file__))[0])
try:
data = common.LoadData(sys.stdin.read(), _DATA_CHECK)
diff --git a/lib/utils/log.py b/lib/utils/log.py
index 3703221..903d993 100644
--- a/lib/utils/log.py
+++ b/lib/utils/log.py
@@ -34,10 +34,10 @@
import os.path
import logging
import logging.handlers
-from cStringIO import StringIO
from ganeti import constants
from ganeti import compat
+from ganeti import pathutils
class _ReopenableLogHandler(logging.handlers.BaseRotatingHandler):
@@ -188,7 +188,8 @@
def SetupLogging(logfile, program, debug=0, stderr_logging=False,
multithreaded=False, syslog=constants.SYSLOG_USAGE,
- console_logging=False, root_logger=None):
+ console_logging=False, root_logger=None,
+ verbose=True):
"""Configures the logging module.
@type logfile: str
@@ -212,6 +213,8 @@
the system console if logging fails
@type root_logger: logging.Logger
@param root_logger: Root logger to use (for unittests)
+ @type verbose: boolean
+ @param verbose: whether to log at 'info' level already (logfile logging only)
@raise EnvironmentError: if we can't open the log file and
syslog/stderr logging is disabled
@rtype: callable
@@ -252,7 +255,7 @@
syslog_handler.setLevel(logging.INFO)
root_logger.addHandler(syslog_handler)
- if syslog != constants.SYSLOG_ONLY:
+ if syslog != constants.SYSLOG_ONLY and logfile:
# this can fail, if the logging directories are not setup or we have
# a permisssion problem; in this case, it's best to log but ignore
# the error if stderr_logging is True, and if false we re-raise the
@@ -267,8 +270,10 @@
logfile_handler.setFormatter(formatter)
if debug:
logfile_handler.setLevel(logging.DEBUG)
- else:
+ elif verbose:
logfile_handler.setLevel(logging.INFO)
+ else:
+ logfile_handler.setLevel(logging.WARN)
root_logger.addHandler(logfile_handler)
reopen_handlers.append(logfile_handler)
except EnvironmentError:
@@ -282,45 +287,37 @@
def SetupToolLogging(debug, verbose, threadname=False,
- _root_logger=None, _stream=None):
+ toolname=None, logfile=pathutils.LOG_TOOLS):
"""Configures the logging module for tools.
- All log messages are sent to stderr.
+ All log messages are sent to the tools.log logfile.
+ @type toolname: string
+ @param toolname: name of the tool that's logging
@type debug: boolean
@param debug: Disable log message filtering
@type verbose: boolean
@param verbose: Enable verbose log messages
@type threadname: boolean
@param threadname: Whether to include thread name in output
+ @type logfile: string
+ @param logfile: the path of the log file to use, use "None"
+ for tools which don't necessarily run on Ganeti nodes (and
+ thus don't have the Ganeti log directory).
"""
- if _root_logger is None:
- root_logger = logging.getLogger("")
- else:
- root_logger = _root_logger
+ if not toolname:
+ toolname = "unspecified_tool"
- fmt = StringIO()
- fmt.write("%(asctime)s:")
-
- if threadname:
- fmt.write(" %(threadName)s")
-
- if debug or verbose:
- fmt.write(" %(levelname)s")
-
- fmt.write(" %(message)s")
-
- formatter = logging.Formatter(fmt.getvalue())
-
- stderr_handler = logging.StreamHandler(_stream)
- stderr_handler.setFormatter(formatter)
+ # 'SetupLogging' takes a quite unintuitive 'debug' option that
+ # is '0' for 'log higher than debug level' and '1' for
+ # 'log at NOTSET' level. Hence this conversion.
+ debug_int = 0
if debug:
- stderr_handler.setLevel(logging.NOTSET)
- elif verbose:
- stderr_handler.setLevel(logging.INFO)
- else:
- stderr_handler.setLevel(logging.WARNING)
+ debug_int = 1
- root_logger.setLevel(logging.NOTSET)
- root_logger.addHandler(stderr_handler)
+ SetupLogging(logfile,
+ program=toolname,
+ debug=debug_int,
+ multithreaded=threadname,
+ verbose=verbose)
diff --git a/lib/utils/process.py b/lib/utils/process.py
index 268ff54..5933929 100644
--- a/lib/utils/process.py
+++ b/lib/utils/process.py
@@ -185,7 +185,8 @@
@type noclose_fds: list
@param noclose_fds: list of additional (fd >=3) file descriptors to leave
open for the child process
- @type input_fd: C{file}-like object or numeric file descriptor
+ @type input_fd: C{file}-like object containing an actual file descriptor
+ or numeric file descriptor
@param input_fd: File descriptor for process' standard input
@type postfork_fn: Callable receiving PID as parameter
@param postfork_fn: Callback run after fork but before timeout
@@ -526,7 +527,8 @@
@type noclose_fds: list
@param noclose_fds: list of additional (fd >=3) file descriptors to leave
open for the child process
- @type input_fd: C{file}-like object or numeric file descriptor
+ @type input_fd: C{file}-like object containing an actual file descriptor
+ or numeric file descriptor
@param input_fd: File descriptor for process' standard input
@type postfork_fn: Callable receiving PID as parameter
@param postfork_fn: Function run after fork but before timeout
diff --git a/lib/utils/retry.py b/lib/utils/retry.py
index 8079303..895cc0e 100644
--- a/lib/utils/retry.py
+++ b/lib/utils/retry.py
@@ -253,7 +253,8 @@
wait_fn=inc_tries, _time_fn=get_tries)
-def RetryByNumberOfTimes(max_retries, exception_class, fn, *args, **kwargs):
+def RetryByNumberOfTimes(max_retries, backoff, exception_class, fn, *args,
+ **kwargs):
"""Retries calling a function up to the specified number of times.
@type max_retries: integer
@@ -264,9 +265,23 @@
@type fn: callable
@param fn: Function to be called (up to the specified maximum number of
retries.
+ @type backoff: int
+ @param backoff: this enables and configures the back off behavior after
+ failed tries. If value is '0', there will be no delay between failed
+ tries. If the value is a positive integer, it is interpreted as the
+ base length of the back off delay (in seconds). That means there will be a
+ delay between failed tries of the length specified in this parameter. With
+ each next retry, the delay is increased by the factor of two. For example,
+ if the value is '2', the first delay is 2 seconds, the second 4 seconds,
+ the third 8 seconds (until the max_retries are hit or the function call
+ succeeds).
"""
+ if backoff < 0:
+ raise exception_class("Backoff must be a non-negative integer.")
+
last_exception = None
+ delay = backoff
for i in range(max_retries):
try:
fn(*args, **kwargs)
@@ -274,6 +289,8 @@
except errors.OpExecError as e:
logging.error("Error after retry no. %s: %s.", i, e)
last_exception = e
+ time.sleep(delay)
+ delay *= 2
else:
if last_exception:
raise exception_class("Error after %s retries. Last exception: %s."
diff --git a/lib/watcher/__init__.py b/lib/watcher/__init__.py
index b3ced47..5a557c8 100644
--- a/lib/watcher/__init__.py
+++ b/lib/watcher/__init__.py
@@ -345,12 +345,36 @@
return compat.any(nodes[node_name].offline for node_name in instance.snodes)
-def _VerifyDisks(cl, uuid, nodes, instances):
+def _GetPendingVerifyDisks(cl, uuid):
+ """Checks if there are any currently running or pending group verify jobs and
+ if so, returns their id.
+
+ """
+ qfilter = qlang.MakeSimpleFilter("status",
+ frozenset([constants.JOB_STATUS_RUNNING,
+ constants.JOB_STATUS_QUEUED,
+ constants.JOB_STATUS_WAITING]))
+ qresult = cl.Query(constants.QR_JOB, ["id", "summary"], qfilter)
+
+ ids = [jobid for ((_, jobid), (_, (job, ))) in qresult.data
+ if job == ("GROUP_VERIFY_DISKS(%s)" % uuid)]
+ return ids
+
+
+def _VerifyDisks(cl, uuid, nodes, instances, is_strict):
"""Run a per-group "gnt-cluster verify-disks".
"""
+
+ existing_jobs = _GetPendingVerifyDisks(cl, uuid)
+ if existing_jobs:
+ logging.info("There are verify disks jobs already pending (%s), skipping "
+ "VerifyDisks step for %s.",
+ utils.CommaJoin(existing_jobs), uuid)
+ return
+
op = opcodes.OpGroupVerifyDisks(
- group_name=uuid, priority=constants.OP_PRIO_LOW)
+ group_name=uuid, priority=constants.OP_PRIO_LOW, is_strict=is_strict)
op.reason = [(constants.OPCODE_REASON_SRC_WATCHER,
"Verifying disks of group %s" % uuid,
utils.EpochNano())]
@@ -477,6 +501,9 @@
help="Don't wait for child processes")
parser.add_option("--no-verify-disks", dest="no_verify_disks", default=False,
action="store_true", help="Do not verify disk status")
+ parser.add_option("--no-strict", dest="no_strict",
+ default=False, action="store_true",
+ help="Do not run group verify in strict mode")
parser.add_option("--rapi-ip", dest="rapi_ip",
default=constants.IP4_ADDRESS_LOCALHOST,
help="Use this IP to talk to RAPI.")
@@ -704,6 +731,7 @@
# we are on master now
utils.EnsureDaemon(constants.RAPI)
utils.EnsureDaemon(constants.WCONFD)
+ utils.EnsureDaemon(constants.MAINTD)
# If RAPI isn't responding to queries, try one restart
logging.debug("Attempting to talk to remote API on %s",
@@ -843,7 +871,7 @@
logging.debug("Using state file %s", state_path)
- # Global watcher
+ # Group watcher file lock
statefile = state.OpenStateFile(state_path) # pylint: disable=E0602
if not statefile:
return constants.EXIT_FAILURE
@@ -866,26 +894,28 @@
started = _CheckInstances(client, notepad, instances, locks)
_CheckDisks(client, notepad, nodes, instances, started)
-
- # Check if the nodegroup only has ext storage type
- only_ext = compat.all(i.disk_template == constants.DT_EXT
- for i in instances.values())
-
- # We skip current NodeGroup verification if there are only external storage
- # devices. Currently we provide an interface for external storage provider
- # for disk verification implementations, however current ExtStorageDevice
- # does not provide an API for this yet.
- #
- # This check needs to be revisited if ES_ACTION_VERIFY on ExtStorageDevice
- # is implemented.
- if not opts.no_verify_disks and not only_ext:
- _VerifyDisks(client, group_uuid, nodes, instances)
except Exception, err:
logging.info("Not updating status file due to failure: %s", err)
raise
else:
# Save changes for next run
notepad.Save(state_path)
+ notepad.Close()
+
+ # Check if the nodegroup only has ext storage type
+ only_ext = compat.all(i.disk_template == constants.DT_EXT
+ for i in instances.values())
+
+ # We skip current NodeGroup verification if there are only external storage
+ # devices. Currently we provide an interface for external storage provider
+ # for disk verification implementations, however current ExtStorageDevice
+ # does not provide an API for this yet.
+ #
+ # This check needs to be revisited if ES_ACTION_VERIFY on ExtStorageDevice
+ # is implemented.
+ if not opts.no_verify_disks and not only_ext:
+ is_strict = not opts.no_strict
+ _VerifyDisks(client, group_uuid, nodes, instances, is_strict=is_strict)
return constants.EXIT_SUCCESS
diff --git a/lib/watcher/state.py b/lib/watcher/state.py
index 5c51b5b..b8ff4ef 100644
--- a/lib/watcher/state.py
+++ b/lib/watcher/state.py
@@ -111,7 +111,7 @@
self._orig_data = serializer.Dump(self._data)
def Save(self, filename):
- """Save state to file, then unlock and close it.
+ """Save state to file.
"""
assert self.statefile
diff --git a/man/ganeti-maintd.rst b/man/ganeti-maintd.rst
new file mode 100644
index 0000000..d04fa6a
--- /dev/null
+++ b/man/ganeti-maintd.rst
@@ -0,0 +1,101 @@
+ganeti-maintd(8) Ganeti | Version @GANETI_VERSION@
+==================================================
+
+Name
+----
+
+ganeti-maintd - Ganeti maintenance daemon
+
+Synopsis
+--------
+**ganeti-maintd** [-f] [-d] [-p *PORT*] [-b *ADDRESS*] [--no-voting --yes-do-it]
+
+DESCRIPTION
+-----------
+
+**ganeti-maintd** is the daemon carrying out regular maintenance
+of the cluster.
+
+For testing purposes, you can give the ``-f`` option and the
+program won't detach from the running terminal.
+
+Debug-level messages can be activated by giving the ``-d`` option.
+
+The **ganeti-maintd** daemon listens to port 1816 TCP, on all interfaces,
+by default. The port can be overridden by an entry in the services database
+or by passing the ``-p`` option.
+The ``-b`` option can be used to specify the address to bind to
+(defaults to ``0.0.0.0``).
+
+The daemon will refuse to start if it cannot verify that the majority
+of cluster nodes believes that it is running on the master node. To
+allow failover in a two-node cluster, this can be overridden by the
+``--no-voting`` option. In this case, the ``--yes-do-it`` option has
+to be given as well.
+
+Operation
+~~~~~~~~~
+
+The maintenance daemon will carry out precisely the same jobs that
+**harep**\(1) would do if continuously run. In particular, it can
+be controlled by the same set of opt-in tags.
+
+Communication
+~~~~~~~~~~~~~
+
+The daemon will expose its internal state via HTTP. The answer is
+encoded in JSON format and is specific to the particular request.
+
+``/``
++++++
+The root resource. It will return the list of supported protocol
+versions. At the moment, only version ``1`` is supported.
+
+``1/status``
+++++++++++++
+
+List of all currently ongoing incidents. This is a list of JSON
+objects, each containing at least the following information.
+
+- ``uuid`` The unique identifier assigned to the event.
+
+- ``node`` The UUID of the node on which the event was observed.
+
+- ``original`` The very JSON object reported by self-diagnose data collector.
+
+- ``repair-status`` A string describing the progress made on this event so
+ far. It is one of the following.
+
+ + ``noted`` The event has been observed, but no action has been taken yet
+
+ + ``pending`` At least one job has been submitted in reaction to the event
+ and none of the submitted jobs has failed so far.
+
+ + ``canceled`` The event has been canceled, i.e., ordered to be ignored, but
+ is still observed.
+
+ + ``failed`` At least one of the submitted jobs has failed. To avoid further
+ damage, the repair daemon will not take any further action for this event.
+
+ + ``completed`` All Ganeti actions associated with this event have been
+ completed successfully, including tagging the node.
+
+- ``jobs`` The list of the numbers of ganeti jobs submitted in response to
+ this event.
+
+- ``tag`` A string that is the tag that either has been added to the node, or,
+ if the repair event is not yet finalized, will be added in case of success.
+
+
+``/1/jobs``
++++++++++++
+The list of jobs the daemon will wait for to finish, before starting
+the next round of maintenance.
+
+``/1/evacuated``
+++++++++++++++++
+The list of instance names the daemon does not expect to have load
+data available because they have been recently evacuated from an
+offline (or drained) node. Currently, this affects only Xen instances,
+as for other hypervisors the overall CPU load on the node is taken
+as balancing measure.
diff --git a/man/ganeti-watcher.rst b/man/ganeti-watcher.rst
index 539ba2e..533428e 100644
--- a/man/ganeti-watcher.rst
+++ b/man/ganeti-watcher.rst
@@ -10,7 +10,7 @@
--------
**ganeti-watcher** [\--debug] [\--job-age=*age* ] [\--ignore-pause]
-[\--rapi-ip=*IP*] [\--no-verify-disks]
+[\--rapi-ip=*IP*] [\--no-verify-disks] [\--no-strict]
DESCRIPTION
-----------
@@ -30,6 +30,11 @@
The ``--debug`` option will increase the verbosity of the watcher
and also activate logging to the standard error.
+The ``--no-strict`` option runs the group verify disks job in a
+non-strict mode. This only verifies those disks whose node locks could
+be acquired in a best-effort attempt and will skip nodes that are
+recognized as busy with other jobs.
+
The ``--rapi-ip`` option needs to be set if the RAPI daemon was
started with a particular IP (using the ``-b`` option). The two
options need to be exactly the same to ensure that the watcher
diff --git a/man/ganeti.rst b/man/ganeti.rst
index d3b37e8..b68ad08 100644
--- a/man/ganeti.rst
+++ b/man/ganeti.rst
@@ -179,8 +179,8 @@
discovered or set manually. Only used for estimating how many VCPUs
are left for instances
-Note that currently this option is unused by Ganeti; values will be
-recorded but will not influence the Ganeti operation.
+Note that currently only ``mem_node`` is used by Ganeti; other values
+will be recorded but will not influence the Ganeti operation.
Disk State Parameters
diff --git a/man/gnt-cluster.rst b/man/gnt-cluster.rst
index 5c055ee..9d247e0 100644
--- a/man/gnt-cluster.rst
+++ b/man/gnt-cluster.rst
@@ -198,6 +198,7 @@
| [\--ipolicy-disk-templates *template* [,*template*...]]
| [\--ipolicy-spindle-ratio *ratio*]
| [\--ipolicy-vcpu-ratio *ratio*]
+| [\--ipolicy-memory-ratio *ratio*]
| [\--disk-state *diskstate*]
| [\--hypervisor-state *hvstate*]
| [\--drbd-usermode-helper *helper*]
@@ -587,6 +588,7 @@
- ``--ipolicy-spindle-ratio`` limits the instances-spindles ratio
- ``--ipolicy-vcpu-ratio`` limits the vcpu-cpu ratio
+- ``--ipolicy-memory-ratio`` limits the memory over-commitment ratio
All the instance policy elements can be overridden at group level. Group
level overrides can be removed by specifying ``default`` as the value of
@@ -743,6 +745,10 @@
| [\--user-shutdown {yes \| no}]
| [\--enabled-data-collectors *collectors*]
| [\--data-collector-interval *intervals*]
+| [\--maintenance-interval *seconds*]
+| [\--auto-balance-cluster {yes \| no }]
+| [\--auto-balance-threshold *score* ]
+| [\--diagnose-data-collector-filename *filename*]
Modify the options for the cluster.
@@ -814,6 +820,21 @@
and number of seconds specifying the interval at which the collector
shall be collected.
+The ``--diagnose-data-collector-filename`` option specifies the filename
+of the script the diagnose data collector should run. If this value is an
+empty string, the data collector will return success without running
+anything. The default value is the empty string.
+
+The ``--maintenance-interval`` option specifies the minimal waiting
+time by the maintenance daemon between maintenance rounds.
+The ``--auto-balance-cluster`` option tells the maintenance daemon
+whether to also keep the cluster in a balanced fashion. If so, it
+will carry out moves, provided the gain in the cluster score for
+that move is at least the value specified by ``--auto-balance-threshold``
+in absolute terms, unless the cluster score is at least 10 times that
+value, in which case all beneficial steps will be done if auto-balancing
+is enabled.
+
See **gnt-cluster init** for a description of ``--install-image`` and
``--zeroing-image``.
@@ -863,6 +884,16 @@
See **ganeti**\(7) for a description of ``--submit`` and other common
options.
+REMOVE-REPAIR
+~~~~~~~~~~~~~
+
+**remove-repair** *uuid*
+
+Unconditionally remove the specified repair event from the list of repair
+events tracked by the maintenance daemon. Note that if the node still reports
+the same breakage, a new event for this breakage will be created at next
+node querying by the daemon.
+
RENAME
~~~~~~
@@ -889,6 +920,7 @@
| [\--new-ssh-keys] [\--no-ssh-key-check]
| [\--new-cluster-domain-secret] [\--cluster-domain-secret *filename*]
| [\--ssh-key-type *type*] | [\--ssh-key-bits *bits*]
+| [\--verbose] | [\--debug]
This command will stop all Ganeti daemons in the cluster and start
them again once the new certificates and keys are replicated. The
@@ -934,6 +966,11 @@
properties of the disk types used. They are described in more detail
in the ``init`` option description.
+The options ``--verbose`` and ``--debug`` increase the log level
+of underlying ssh calls to all nodes. If running ``renew-crypto``
+causes any problems, use them and inspect the ``tools.log`` file
+for any unusual output.
+
REPAIR-DISK-SIZES
~~~~~~~~~~~~~~~~~
@@ -1039,7 +1076,7 @@
VERIFY-DISKS
~~~~~~~~~~~~
-**verify-disks** [\--node-group *nodegroup*]
+**verify-disks** [\--node-group *nodegroup*] [\--no-strict]
The command checks which instances have degraded DRBD disks and
activates the disks of those instances.
@@ -1047,6 +1084,11 @@
With ``--node-group``, restrict the verification to those nodes and
instances that live in the named group.
+The ``--no-strict`` option runs the group verify disks job in a
+non-strict mode. This only verifies those disks whose node locks could
+be acquired in a best-effort attempt and will skip nodes that are
+recognized as busy with other jobs.
+
This command is run from the **ganeti-watcher** tool, which also
has a different, complementary algorithm for doing this check.
Together, these two should ensure that DRBD disks are kept
diff --git a/man/gnt-group.rst b/man/gnt-group.rst
index 1c313b2..7864687 100644
--- a/man/gnt-group.rst
+++ b/man/gnt-group.rst
@@ -31,6 +31,7 @@
| [\--ipolicy-disk-templates *template* [,*template*...]]
| [\--ipolicy-spindle-ratio *ratio*]
| [\--ipolicy-vcpu-ratio *ratio*]
+| [\--ipolicy-memory-ratio *ratio*]
| [\--disk-state *diskstate*]
| [\--hypervisor-state *hvstate*]
| {*group*}
@@ -103,6 +104,7 @@
| [\--ipolicy-disk-templates *template* [,*template*...]]
| [\--ipolicy-spindle-ratio *ratio*]
| [\--ipolicy-vcpu-ratio *ratio*]
+| [\--ipolicy-memory-ratio *ratio*]
| {*group*}
Modifies some parameters from the node group.
diff --git a/man/gnt-node.rst b/man/gnt-node.rst
index bf3fff3..65eb6a3 100644
--- a/man/gnt-node.rst
+++ b/man/gnt-node.rst
@@ -30,6 +30,7 @@
| [\--disk-state *diskstate*]
| [\--hypervisor-state *hvstate*]
| [\--no-node-setup]
+| [\--verbose] | [\--debug]
| {*nodename*}
Adds the given node to the cluster.
@@ -87,6 +88,10 @@
running, the ``node-cleanup`` tool can be run on the machine to be added
to clean remains of the previous cluster from the node.
+The options ``--verbose`` and ``--debug`` control the log level of the
+operation, in particular the one of the underlying SSH calls that
+Ganeti makes when adding a node.
+
Example::
# gnt-node add node5.example.com
@@ -339,6 +344,7 @@
| [\--node-powered=``yes|no``]
| [\--hypervisor-state *hvstate*]
| [\--disk-state *diskstate*]
+| [\--verbose] [\--debug]
| {*node*}
This command changes the role of the node. Each options takes
@@ -372,6 +378,11 @@
``--force`` is needed as well, and the target node for the first change
must be the master.
+The options ``--verbose`` and ``--debug`` control the log level of the
+operation, in particular the one of the underlying SSH calls that
+Ganeti makes when modifying some parameters a node (e.g. promoting
+or demoting a node to or from 'master candidate' status).
+
See **ganeti**\(7) for a description of ``--submit`` and other common
options.
@@ -383,11 +394,16 @@
REMOVE
~~~~~~
-**remove** {*nodename*}
+**remove** [\--verbose] [\--debug] {*nodename*}
Removes a node from the cluster. Instances must be removed or
migrated to another cluster before.
+The options ``--verbose`` and ``--debug`` control the log level of the
+operation, in particular the one of the underlying SSH calls that
+Ganeti makes when removing a node.
+
+
Example::
# gnt-node remove node5.example.com
@@ -652,6 +668,23 @@
output lines. ``--sync`` forces the opcode to acquire the node lock(s)
in exclusive mode.
+REPAIR-COMMAND
+~~~~~~~~~~~~~~~~~~
+
+| **repair-command** { --input *input* } *command* *node*
+
+Executes a repair command. Repair commands reside in
+``@SYSCONFDIR@/ganeti/node-repair-commands`` on a node, either as a regular
+file or as a symlink. The directory must be owned by root and not be
+world- or group-writable. If a command fails verification or otherwise
+fails to start, the node daemon log must be consulted for more detailed
+information.
+
+Example for running a command::
+
+ # gnt-node repair-command --input "input string" \
+ mycommand node.example.com
+
Tags
~~~~
diff --git a/man/hbal.rst b/man/hbal.rst
index ec2e3d1..9910de1 100644
--- a/man/hbal.rst
+++ b/man/hbal.rst
@@ -28,12 +28,15 @@
**[ -g *delta* ]** **[ \--min-gain-limit *threshold* ]**
**[ -O *name...* ]**
**[ \--no-disk-moves ]**
+**[ \--avoid-disk-moves *factor* ]**
**[ \--no-instance-moves ]**
**[ -U *util-file* ]**
+**[ \--idle-default ]**
**[ \--ignore-dynu ]**
**[ \--ignore-soft-errors ]**
**[ \--mond *yes|no* ]**
**[ \--mond-xen ]**
+**[ \--mond-kvm-rss ]**
**[ \--exit-on-missing-mond-data ]**
**[ \--evac-mode ]**
**[ \--restricted-migration ]**
@@ -363,6 +366,12 @@
a much quicker balancing, but of course the improvements are
limited. It is up to the user to decide when to use one or another.
+\--avoid-disk-moves=*factor*
+ This parameter keeps hbal from making disk moves that are not
+ profitable enough. During each balancing step it will admit a disk
+ move only if the gain in the cluster metrics is *factor* times higher
+ than the gain achievable without disk moves.
+
\--no-instance-moves
This parameter prevents hbal from using instance moves
(i.e. "gnt-instance migrate/failover") operations. This will only use
@@ -414,6 +423,13 @@
metrics and thus the influence of the dynamic utilisation will be
practically insignificant.
+\--idle-default
+ If given, all dynamic utilisation information not provided explicitly
+ by the ``-U`` option or by the MonDs, if ``--mond`` is given, will be
+ assumed to be 0. Note that without this option the default assumption
+ about utilization will apply for the unspecified resources, which is 1.0,
+ i.e., full load, for every instance.
+
\--ignore-dynu
If given, all dynamic utilisation information will be ignored by
assuming it to be 0. This option will take precedence over any data
@@ -448,6 +464,14 @@
If given, also query Xen-specific collectors from MonD, provided
that monitoring daemons are queried at all.
+\--mond-kvm-rss
+ If given, also query the resident set size for kvm instances, provided
+ that monitoring daemons are queried at all.
+
+\--mem-weight=*factor*
+ Scale the weight of the dynamic memory utilization in the cluster metrics
+ by the given factor.
+
\--exit-on-missing-mond-data
If given, abort if the data obtainable from querying MonDs is incomplete.
The default behavior is to continue with a best guess based on the static
diff --git a/man/htools.rst b/man/htools.rst
index f1ff44b..cdf3c8d 100644
--- a/man/htools.rst
+++ b/man/htools.rst
@@ -224,6 +224,7 @@
- disk templates
- vcpu ratio
- spindle ratio
+ - memory ratio (optional)
\--mond=*yes|no*
If given the program will query all MonDs to fetch data from the
diff --git a/qa/qa_cluster.py b/qa/qa_cluster.py
index 2199d00..9105018 100644
--- a/qa/qa_cluster.py
+++ b/qa/qa_cluster.py
@@ -1450,14 +1450,6 @@
nodes = qa_config.AcquireManyNodes(n)
live_instances.append(cf(nodes))
- # 2.16 only - prior to performing a downgrade, we have to make sure that the
- # SSH keys used are such that the lower version can still use them,
- # regardless of cluster defaults.
- if constants.VERSION_MINOR != 16:
- raise qa_error.Error("Please remove the key type downgrade code in 2.17")
- AssertCommand(["gnt-cluster", "renew-crypto", "--no-ssh-key-check", "-f",
- "--new-ssh-keys", "--ssh-key-type=dsa"])
-
AssertRedirectedCommand(["gnt-cluster", "upgrade", "--to", other_version])
AssertRedirectedCommand(["gnt-cluster", "verify"])
diff --git a/qa/qa_node.py b/qa/qa_node.py
index 1ed6bbe..55af8b8 100644
--- a/qa/qa_node.py
+++ b/qa/qa_node.py
@@ -93,6 +93,31 @@
if node != master:
NodeAdd(node, readd=False)
+ for node in qa_config.get("nodes"):
+ def GetNonStartDaemons():
+ cmd = utils.ShellQuoteArgs(["ps", "-Ao", "comm"])
+ prcs = AssertCommand(cmd, node=node)[1]
+
+ non_start_daemons = []
+
+ def AddIfNotStarted(daemon):
+ if daemon not in prcs:
+ non_start_daemons.append(daemon)
+
+ AddIfNotStarted('ganeti-noded')
+ if constants.ENABLE_MOND:
+ AddIfNotStarted('ganeti-mond')
+ if node == master:
+ AddIfNotStarted('ganeti-wconfd')
+ AddIfNotStarted('ganeti-rapi')
+ AddIfNotStarted('ganeti-luxid')
+ AddIfNotStarted('ganeti-maintd')
+ return non_start_daemons
+
+ nsd = GetNonStartDaemons()
+ for daemon in nsd:
+ raise qa_error.Error(daemon + ' is not running at %s' % node.primary)
+
def MarkNodeAddedAll():
"""Mark all nodes as added.
diff --git a/qa/qa_rapi.py b/qa/qa_rapi.py
index 9282587..18142f6 100644
--- a/qa/qa_rapi.py
+++ b/qa/qa_rapi.py
@@ -728,7 +728,8 @@
# Identifying the node - RAPI provides these itself
IDENTIFIERS = ["node_name", "node_uuid"]
# As the name states, these can be set but not retrieved yet
- NOT_EXPOSED_YET = ["hv_state", "disk_state", "auto_promote"]
+ NOT_EXPOSED_YET = ["hv_state", "disk_state", "auto_promote",
+ "debug", "verbose"]
_DoGetPutTests("/2/nodes/%s" % node.primary,
"/2/nodes/%s/modify" % node.primary,
diff --git a/src/Ganeti/BasicTypes.hs b/src/Ganeti/BasicTypes.hs
index 15a26a3..caec414 100644
--- a/src/Ganeti/BasicTypes.hs
+++ b/src/Ganeti/BasicTypes.hs
@@ -8,7 +8,7 @@
{-
-Copyright (C) 2009, 2010, 2011, 2012 Google Inc.
+Copyright (C) 2009, 2010, 2011, 2012, 2015 Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -42,6 +42,8 @@
, Result
, ResultT(..)
, mkResultT
+ , mkResultT'
+ , mkResultTEither
, withError
, withErrorT
, toError
@@ -50,6 +52,7 @@
, tryError
, Error(..) -- re-export from Control.Monad.Error
, MonadIO(..) -- re-export from Control.Monad.IO.Class
+ , FromString(..)
, isOk
, isBad
, justOk
@@ -75,8 +78,12 @@
, compareNameComponent
, ListSet(..)
, emptyListSet
+ , Down(..)
) where
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Applicative
import Control.Exception (try)
import Control.Monad
@@ -85,13 +92,15 @@
import Control.Monad.Trans
import Control.Monad.Trans.Control
import Data.Function
-import Data.List
+import Data.List (find, isPrefixOf)
import Data.Maybe
-import Data.Monoid
import Data.Set (Set)
import qualified Data.Set as Set (empty)
import Text.JSON (JSON)
import qualified Text.JSON as JSON (readJSON, showJSON)
+#if MIN_VERSION_base(4,6,0)
+import Data.Ord
+#endif
-- Remove after we require >= 1.8.58
-- See: https://github.com/ndmitchell/hlint/issues/24
@@ -112,26 +121,42 @@
-- | Type alias for a string Result.
type Result = GenericResult String
+-- | Type class for things that can be built from strings.
+class FromString a where
+ mkFromString :: String -> a
+
+-- | Trivial 'String' instance; requires FlexibleInstances extension
+-- though.
+instance FromString [Char] where
+ mkFromString = id
+
+instance FromString IOError where
+ mkFromString = userError
+
-- | 'Monad' instance for 'GenericResult'.
-instance (Error a) => Monad (GenericResult a) where
+instance (FromString a) => Monad (GenericResult a) where
(>>=) (Bad x) _ = Bad x
(>>=) (Ok x) fn = fn x
return = Ok
- fail = Bad . strMsg
+ fail = Bad . mkFromString
instance Functor (GenericResult a) where
fmap _ (Bad msg) = Bad msg
fmap fn (Ok val) = Ok (fn val)
-instance (Error a, Monoid a) => MonadPlus (GenericResult a) where
- mzero = Bad $ strMsg "zero Result when used as MonadPlus"
+instance (FromString a, Monoid a) => Alternative (GenericResult a) where
+ empty = Bad $ mkFromString "zero Result when used as empty"
-- for mplus, when we 'add' two Bad values, we concatenate their
-- error descriptions
- (Bad x) `mplus` (Bad y) = Bad (x `mappend` strMsg "; " `mappend` y)
- (Bad _) `mplus` x = x
- x@(Ok _) `mplus` _ = x
+ (Bad x) <|> (Bad y) = Bad (x `mappend` mkFromString "; " `mappend` y)
+ (Bad _) <|> x = x
+ x@(Ok _) <|> _ = x
-instance (Error a) => MonadError a (GenericResult a) where
+instance (FromString a, Monoid a) => MonadPlus (GenericResult a) where
+ mzero = empty
+ mplus = (<|>)
+
+instance (FromString a) => MonadError a (GenericResult a) where
throwError = Bad
{-# INLINE throwError #-}
catchError x h = genericResult h (const x) x
@@ -143,10 +168,6 @@
_ <*> (Bad x) = Bad x
(Ok f) <*> (Ok x) = Ok $ f x
-instance (Error a, Monoid a) => Alternative (GenericResult a) where
- empty = mzero
- (<|>) = mplus
-
-- | This is a monad transformation for Result. It's implementation is
-- based on the implementations of MaybeT and ErrorT.
--
@@ -154,7 +175,6 @@
-- If 'mplus' combines two failing operations, errors of both of them
-- are combined.
newtype ResultT a m b = ResultT {runResultT :: m (GenericResult a b)}
- deriving (Functor)
-- | Eliminates a 'ResultT' value given appropriate continuations
elimResultT :: (Monad m)
@@ -168,16 +188,19 @@
result (Bad e) = l e
{-# INLINE elimResultT #-}
-instance (Applicative m, Monad m, Error a) => Applicative (ResultT a m) where
+instance (Monad m) => Functor (ResultT a m) where
+ fmap f = ResultT . liftM (fmap f) . runResultT
+
+instance (Monad m, FromString a) => Applicative (ResultT a m) where
pure = return
(<*>) = ap
-instance (Monad m, Error a) => Monad (ResultT a m) where
- fail err = ResultT (return . Bad $ strMsg err)
+instance (Monad m, FromString a) => Monad (ResultT a m) where
+ fail err = ResultT (return . Bad $ mkFromString err)
return = lift . return
(>>=) = flip (elimResultT throwError)
-instance (Monad m, Error a) => MonadError a (ResultT a m) where
+instance (Monad m, FromString a) => MonadError a (ResultT a m) where
throwError = ResultT . return . Bad
catchError = catchErrorT
@@ -185,24 +208,24 @@
lift = ResultT . liftM Ok
-- | The instance catches any 'IOError' using 'try' and converts it into an
--- error message using 'strMsg'.
+-- error message using 'mkFromString'.
--
-- This way, monadic code within 'ResultT' that uses solely 'liftIO' to
-- include 'IO' actions ensures that all IO exceptions are handled.
--
-- Other exceptions (see instances of 'Exception') are not currently handled.
-- This might be revised in the future.
-instance (MonadIO m, Error a) => MonadIO (ResultT a m) where
+instance (MonadIO m, FromString a) => MonadIO (ResultT a m) where
liftIO = ResultT . liftIO
. liftM (either (failError . show) return)
. (try :: IO a -> IO (Either IOError a))
-instance (MonadBase IO m, Error a) => MonadBase IO (ResultT a m) where
+instance (MonadBase IO m, FromString a) => MonadBase IO (ResultT a m) where
liftBase = ResultT . liftBase
. liftM (either (failError . show) return)
. (try :: IO a -> IO (Either IOError a))
-instance (Error a) => MonadTransControl (ResultT a) where
+instance (FromString a) => MonadTransControl (ResultT a) where
#if MIN_VERSION_monad_control(1,0,0)
-- Needs Undecidable instances
type StT (ResultT a) b = GenericResult a b
@@ -216,7 +239,7 @@
{-# INLINE liftWith #-}
{-# INLINE restoreT #-}
-instance (Error a, MonadBaseControl IO m)
+instance (FromString a, MonadBaseControl IO m)
=> MonadBaseControl IO (ResultT a m) where
#if MIN_VERSION_monad_control(1,0,0)
-- Needs Undecidable instances
@@ -233,17 +256,18 @@
{-# INLINE liftBaseWith #-}
{-# INLINE restoreM #-}
-instance (Monad m, Error a, Monoid a) => MonadPlus (ResultT a m) where
- mzero = ResultT $ return mzero
+instance (Monad m, FromString a, Monoid a)
+ => Alternative (ResultT a m) where
+ empty = ResultT $ return mzero
-- Ensure that 'y' isn't run if 'x' contains a value. This makes it a bit
-- more complicated than 'mplus' of 'GenericResult'.
- mplus x y = elimResultT combine return x
+ x <|> y = elimResultT combine return x
where combine x' = ResultT $ liftM (mplus (Bad x')) (runResultT y)
-instance (Alternative m, Monad m, Error a, Monoid a)
- => Alternative (ResultT a m) where
- empty = mzero
- (<|>) = mplus
+instance (Monad m, FromString a, Monoid a)
+ => MonadPlus (ResultT a m) where
+ mzero = empty
+ mplus = (<|>)
-- | Changes the error message of a result value, if present.
-- Note that since 'GenericResult' is also a 'MonadError', this function
@@ -253,7 +277,7 @@
withError f = genericResult (throwError . f) return
-- | Changes the error message of a @ResultT@ value, if present.
-withErrorT :: (Monad m, Error e)
+withErrorT :: (Monad m, FromString e)
=> (e' -> e) -> ResultT e' m a -> ResultT e m a
withErrorT f = ResultT . liftM (withError f) . runResultT
@@ -269,10 +293,10 @@
toErrorBase = (toError =<<) . liftBase . runResultT
{-# INLINE toErrorBase #-}
--- | An alias for @withError strMsg@, which is often used to lift a pure error
--- to a monad stack. See also 'annotateResult'.
-toErrorStr :: (MonadError e m, Error e) => Result a -> m a
-toErrorStr = withError strMsg
+-- | An alias for @withError mkFromString@, which is often
+-- used to lift a pure error to a monad stack. See also 'annotateResult'.
+toErrorStr :: (MonadError e m, FromString e) => Result a -> m a
+toErrorStr = withError mkFromString
-- | Run a given computation and if an error occurs, return it as `Left` of
-- `Either`.
@@ -289,9 +313,19 @@
-- should be handled by the given action.
--
-- See also 'toErrorStr'.
-mkResultT :: (Monad m, Error e) => m (Result a) -> ResultT e m a
+mkResultT :: (Monad m, FromString e) => m (Result a) -> ResultT e m a
mkResultT = ResultT . liftM toErrorStr
+-- | Generalisation of mkResultT accepting any showable failures.
+mkResultT' :: (Monad m, FromString e, Show s)
+ => m (GenericResult s a) -> ResultT e m a
+mkResultT' = mkResultT . liftM (genericResult (Bad . show) Ok)
+
+-- | Variant of mkResultT accepting 'Either' values with showable failures.
+mkResultTEither :: (Monad m, FromString e, Show s)
+ => m (Either s a) -> ResultT e m a
+mkResultTEither = mkResultT . liftM (either (Bad . show) Ok)
+
-- | Simple checker for whether a 'GenericResult' is OK.
isOk :: GenericResult a b -> Bool
isOk (Ok _) = True
@@ -329,32 +363,33 @@
-- 'MonadError'. Since 'Result' is an instance of 'MonadError' itself,
-- it's a generalization of type @String -> Result a -> Result a@.
-- See also 'toErrorStr'.
-annotateResult :: (MonadError e m, Error e) => String -> Result a -> m a
+annotateResult :: (MonadError e m, FromString e) => String -> Result a -> m a
annotateResult owner = toErrorStr . annotateError owner
-- | Annotate an error with an ownership information inside a 'MonadError'.
-- See also 'annotateResult'.
-annotateError :: (MonadError e m, Error e, Monoid e) => String -> m a -> m a
+annotateError :: (MonadError e m, FromString e, Monoid e)
+ => String -> m a -> m a
annotateError owner =
- flip catchError (throwError . mappend (strMsg $ owner ++ ": "))
+ flip catchError (throwError . mappend (mkFromString $ owner ++ ": "))
{-# INLINE annotateError #-}
-- | Throws a 'String' message as an error in a 'MonadError'.
-- This is a generalization of 'Bad'.
-- It's similar to 'fail', but works within a 'MonadError', avoiding the
-- unsafe nature of 'fail'.
-failError :: (MonadError e m, Error e) => String -> m a
-failError = throwError . strMsg
+failError :: (MonadError e m, FromString e) => String -> m a
+failError = throwError . mkFromString
-- | A synonym for @flip@ 'catchErrorT'.
-handleErrorT :: (Monad m, Error e)
+handleErrorT :: (Monad m, FromString e)
=> (e' -> ResultT e m a) -> ResultT e' m a -> ResultT e m a
handleErrorT handler = elimResultT handler return
{-# INLINE handleErrorT #-}
-- | Catches an error in a @ResultT@ value. This is similar to 'catchError',
-- but in addition allows to change the error type.
-catchErrorT :: (Monad m, Error e)
+catchErrorT :: (Monad m, FromString e)
=> ResultT e' m a -> (e' -> ResultT e m a) -> ResultT e m a
catchErrorT = flip handleErrorT
{-# INLINE catchErrorT #-}
@@ -471,3 +506,52 @@
emptyListSet :: ListSet a
emptyListSet = ListSet Set.empty
+
+#if MIN_VERSION_base(4,6,0)
+-- Down already defined in Data.Ord
+#else
+-- Copyright : (c) The University of Glasgow 2005
+-- License : BSD-style
+
+newtype Down a = Down a deriving (Eq, Show, Read)
+
+instance Ord a => Ord (Down a) where
+ compare (Down x) (Down y) = y `compare` x
+
+{- License text of the above code fragment:
+
+The Glasgow Haskell Compiler License
+
+Copyright 2004, The University Court of the University of Glasgow.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+- Neither name of the University nor the names of its contributors may be
+used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY COURT OF THE UNIVERSITY OF
+GLASGOW AND THE CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+UNIVERSITY COURT OF THE UNIVERSITY OF GLASGOW OR THE CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGE.
+
+-}
+
+#endif
diff --git a/src/Ganeti/Codec.hs b/src/Ganeti/Codec.hs
index 404c70b..6f54c0d 100644
--- a/src/Ganeti/Codec.hs
+++ b/src/Ganeti/Codec.hs
@@ -37,12 +37,17 @@
, decompressZlib
) where
+import Prelude ()
+import Ganeti.Prelude
+
import Codec.Compression.Zlib
import qualified Codec.Compression.Zlib.Internal as I
-import Control.Monad.Error
+import Control.Monad (liftM)
+import Control.Monad.Error.Class (MonadError(..))
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Lazy.Internal as BL
-import Data.Monoid (mempty)
+
+import Ganeti.BasicTypes
-- | Compresses a lazy bytestring.
@@ -52,11 +57,12 @@
-- | Decompresses a lazy bytestring, throwing decoding errors using
-- 'throwError'.
-decompressZlib :: (MonadError e m, Error e) => BL.ByteString -> m BL.ByteString
+decompressZlib :: (MonadError e m, FromString e)
+ => BL.ByteString -> m BL.ByteString
decompressZlib = I.foldDecompressStream
(liftM . BL.chunk)
(return mempty)
- (const $ throwError . strMsg . ("Zlib: " ++))
+ (const $ throwError . mkFromString . ("Zlib: " ++))
. I.decompressWithErrors
I.zlibFormat
I.defaultDecompressParams
diff --git a/src/Ganeti/Confd/Client.hs b/src/Ganeti/Confd/Client.hs
index 49ab5fd..ae77090 100644
--- a/src/Ganeti/Confd/Client.hs
+++ b/src/Ganeti/Confd/Client.hs
@@ -82,7 +82,7 @@
hmac = hmacKey client
jobs = map (queryOneServer semaphore answer crType cQuery hmac) dest
watchdog reqAnswers = do
- threadDelay $ 1000000 * C.confdClientExpireTimeout
+ threadDelaySeconds C.confdClientExpireTimeout
_ <- swapMVar reqAnswers 0
putMVar semaphore ()
waitForResult reqAnswers = do
diff --git a/src/Ganeti/Confd/ClientFunctions.hs b/src/Ganeti/Confd/ClientFunctions.hs
index 3213669..a119d99 100644
--- a/src/Ganeti/Confd/ClientFunctions.hs
+++ b/src/Ganeti/Confd/ClientFunctions.hs
@@ -35,6 +35,7 @@
module Ganeti.Confd.ClientFunctions
( getInstances
, getInstanceDisks
+ , getDiagnoseCollectorFilename
) where
import Control.Monad (liftM)
@@ -89,3 +90,15 @@
getInstanceDisks node srvAddr srvPort =
liftM (uncurry (++)) (getInstances node srvAddr srvPort) >>=
mapM (\i -> liftM ((,) i) (getDisks i srvAddr srvPort))
+
+-- | Get the name of the diagnose collector.
+getDiagnoseCollectorFilename
+ :: Maybe String -> Maybe Int -> BT.ResultT String IO String
+getDiagnoseCollectorFilename srvAddr srvPort = do
+ client <- liftIO $ getConfdClient srvAddr srvPort
+ reply <- liftIO . query client ReqConfigQuery
+ $ PlainQuery "/cluster/diagnose_data_collector_filename"
+ case fmap (J.readJSON . confdReplyAnswer) reply of
+ Just (J.Ok filename) -> return filename
+ Just (J.Error msg) -> fail msg
+ Nothing -> fail "No answer from the Confd server"
diff --git a/src/Ganeti/Confd/Server.hs b/src/Ganeti/Confd/Server.hs
index b32eb70..a2ec0a9 100644
--- a/src/Ganeti/Confd/Server.hs
+++ b/src/Ganeti/Confd/Server.hs
@@ -40,7 +40,9 @@
, prepMain
) where
-import Control.Applicative((<$>))
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Concurrent
import Control.Monad (forever, liftM)
import Data.IORef
diff --git a/src/Ganeti/Confd/Utils.hs b/src/Ganeti/Confd/Utils.hs
index afb8e4f..ba5585f 100644
--- a/src/Ganeti/Confd/Utils.hs
+++ b/src/Ganeti/Confd/Utils.hs
@@ -47,7 +47,9 @@
import qualified Data.Attoparsec.Text as P
-import Control.Applicative ((*>))
+import Prelude ()
+import Ganeti.Prelude
+
import qualified Data.ByteString as B
import Data.Text (pack)
import qualified Text.JSON as J
diff --git a/src/Ganeti/Config.hs b/src/Ganeti/Config.hs
index 4d0e5a0..5687b54 100644
--- a/src/Ganeti/Config.hs
+++ b/src/Ganeti/Config.hs
@@ -68,6 +68,7 @@
, getInstDisksFromObj
, getDrbdMinorsForDisk
, getDrbdMinorsForInstance
+ , getFilledHvStateParams
, getFilledInstHvParams
, getFilledInstBeParams
, getFilledInstOsParams
@@ -82,10 +83,11 @@
, instNodes
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Arrow ((&&&))
-import Control.Monad
-import Control.Monad.State
+import Control.Monad (liftM)
import qualified Data.ByteString as BS
import qualified Data.ByteString.UTF8 as UTF8
import qualified Data.Foldable as F
@@ -99,6 +101,7 @@
import Ganeti.BasicTypes
import qualified Ganeti.Constants as C
+import qualified Ganeti.ConstantUtils as CU
import Ganeti.Errors
import Ganeti.JSON (fromJResult, fromContainer, GenericContainer(..))
import Ganeti.Objects
@@ -364,6 +367,36 @@
ginsts = map (getNodeInstances cfg) gnodes in
(concatMap fst ginsts, concatMap snd ginsts)
+-- | Default 'FilledHvStateParams', used when no value is configured.
+defaultHvStateParams :: FilledHvStateParams
+defaultHvStateParams = FilledHvStateParams
+ { hvstateCpuNode = CU.hvstDefaultCpuNode
+ , hvstateCpuTotal = CU.hvstDefaultCpuTotal
+ , hvstateMemHv = CU.hvstDefaultMemoryHv
+ , hvstateMemNode = CU.hvstDefaultMemoryNode
+ , hvstateMemTotal = CU.hvstDefaultMemoryTotal
+ }
+
+-- | Retrieves the node's static hypervisor state parameters, missing values
+-- filled with group's parameters, missing group parameters are filled
+-- with cluster's parameters. Currently, returns hvstate parameters only for
+-- the default hypervisor.
+getFilledHvStateParams :: ConfigData -> Node -> FilledHvState
+getFilledHvStateParams cfg n =
+ let cluster_hv_state =
+ fromContainer . clusterHvStateStatic $ configCluster cfg
+ def_hv = getDefaultHypervisor cfg
+ cluster_fv = fromMaybe defaultHvStateParams $ M.lookup def_hv
+ cluster_hv_state
+ group_fv = case getGroupOfNode cfg n >>=
+ M.lookup def_hv . fromContainer . groupHvStateStatic of
+ Just pv -> fillParams cluster_fv pv
+ Nothing -> cluster_fv
+ node_fv = case M.lookup def_hv . fromContainer $ nodeHvStateStatic n of
+ Just pv -> fillParams group_fv pv
+ Nothing -> group_fv
+ in GenericContainer $ M.fromList [(def_hv, node_fv)]
+
-- | Retrieves the instance hypervisor params, missing values filled with
-- cluster defaults.
getFilledInstHvParams :: [String] -> ConfigData -> Instance -> HvParams
diff --git a/src/Ganeti/ConstantUtils.hs b/src/Ganeti/ConstantUtils.hs
index 6a61cf2..dc966d6 100644
--- a/src/Ganeti/ConstantUtils.hs
+++ b/src/Ganeti/ConstantUtils.hs
@@ -37,8 +37,10 @@
-}
module Ganeti.ConstantUtils where
+import Prelude ()
+import Ganeti.Prelude
+
import Data.Char (ord)
-import Data.Monoid (Monoid(..))
import Data.Set (Set)
import qualified Data.Set as Set (difference, fromList, toList, union)
@@ -204,8 +206,31 @@
ipolicySpindleRatio :: String
ipolicySpindleRatio = "spindle-ratio"
+ipolicyMemoryRatio :: String
+ipolicyMemoryRatio = "memory-ratio"
+
ipolicyDefaultsVcpuRatio :: Double
ipolicyDefaultsVcpuRatio = 4.0
ipolicyDefaultsSpindleRatio :: Double
ipolicyDefaultsSpindleRatio = 32.0
+
+ipolicyDefaultsMemoryRatio :: Double
+ipolicyDefaultsMemoryRatio = 1.0
+
+-- * Hypervisor state default parameters
+
+hvstDefaultCpuNode :: Int
+hvstDefaultCpuNode = 1
+
+hvstDefaultCpuTotal :: Int
+hvstDefaultCpuTotal = 1
+
+hvstDefaultMemoryHv :: Int
+hvstDefaultMemoryHv = 1024
+
+hvstDefaultMemoryTotal :: Int
+hvstDefaultMemoryTotal = 1024
+
+hvstDefaultMemoryNode :: Int
+hvstDefaultMemoryNode = 4096
diff --git a/src/Ganeti/Constants.hs b/src/Ganeti/Constants.hs
index 420ccb6..13bff2e 100644
--- a/src/Ganeti/Constants.hs
+++ b/src/Ganeti/Constants.hs
@@ -367,6 +367,9 @@
mond :: String
mond = Runtime.daemonName GanetiMond
+maintd :: String
+maintd = Runtime.daemonName GanetiMaintd
+
noded :: String
noded = Runtime.daemonName GanetiNoded
@@ -398,6 +401,9 @@
defaultMondPort :: Int
defaultMondPort = 1815
+defaultMaintdPort :: Int
+defaultMaintdPort = 1816
+
defaultMetadPort :: Int
defaultMetadPort = 80
@@ -413,6 +419,7 @@
[ (confd, (Udp, defaultConfdPort))
, (metad, (Tcp, defaultMetadPort))
, (mond, (Tcp, defaultMondPort))
+ , (maintd, (Tcp, defaultMaintdPort))
, (noded, (Tcp, defaultNodedPort))
, (rapi, (Tcp, defaultRapiPort))
, (ssh, (Tcp, 22))
@@ -2028,11 +2035,12 @@
hvstDefaults :: Map String Int
hvstDefaults =
Map.fromList
- [(hvstCpuNode, 1),
- (hvstCpuTotal, 1),
- (hvstMemoryHv, 0),
- (hvstMemoryTotal, 0),
- (hvstMemoryNode, 0)]
+ [ (hvstCpuNode , ConstantUtils.hvstDefaultCpuNode )
+ , (hvstCpuTotal , ConstantUtils.hvstDefaultCpuTotal )
+ , (hvstMemoryHv , ConstantUtils.hvstDefaultMemoryHv )
+ , (hvstMemoryTotal, ConstantUtils.hvstDefaultMemoryTotal)
+ , (hvstMemoryNode , ConstantUtils.hvstDefaultMemoryNode )
+ ]
hvstsParameterTypes :: Map String VType
hvstsParameterTypes =
@@ -2187,13 +2195,17 @@
ipolicySpindleRatio :: String
ipolicySpindleRatio = ConstantUtils.ipolicySpindleRatio
+ipolicyMemoryRatio :: String
+ipolicyMemoryRatio = ConstantUtils.ipolicyMemoryRatio
+
ispecsMinmaxKeys :: FrozenSet String
ispecsMinmaxKeys = ConstantUtils.mkSet [ispecsMax, ispecsMin]
ipolicyParameters :: FrozenSet String
ipolicyParameters =
ConstantUtils.mkSet [ConstantUtils.ipolicyVcpuRatio,
- ConstantUtils.ipolicySpindleRatio]
+ ConstantUtils.ipolicySpindleRatio,
+ ConstantUtils.ipolicyMemoryRatio]
ipolicyAllKeys :: FrozenSet String
ipolicyAllKeys =
@@ -4323,8 +4335,9 @@
, (ispecSpindleUse, 1)
] :: Map String Int))
, (ipolicyDts, PyValueEx (ConstantUtils.toList diskTemplates))
- , (ipolicyVcpuRatio, PyValueEx (4.0 :: Double))
- , (ipolicySpindleRatio, PyValueEx (32.0 :: Double))
+ , (ipolicyVcpuRatio, PyValueEx ConstantUtils.ipolicyDefaultsVcpuRatio)
+ , (ipolicySpindleRatio, PyValueEx ConstantUtils.ipolicyDefaultsSpindleRatio)
+ , (ipolicyMemoryRatio, PyValueEx ConstantUtils.ipolicyDefaultsMemoryRatio)
]
masterPoolSizeDefault :: Int
@@ -4816,6 +4829,9 @@
ndsSsconf :: String
ndsSsconf = "ssconf"
+ndsHmac :: String
+ndsHmac = "hmac_key"
+
ndsStartNodeDaemon :: String
ndsStartNodeDaemon = "start_node_daemon"
@@ -4856,6 +4872,9 @@
opcodeReasonSrcNoded :: String
opcodeReasonSrcNoded = _opcodeReasonSrcDaemon ++ ":noded"
+opcodeReasonSrcMaintd :: String
+opcodeReasonSrcMaintd = _opcodeReasonSrcDaemon ++ ":maintd"
+
opcodeReasonSrcOpcode :: String
opcodeReasonSrcOpcode = "gnt:opcode"
@@ -4978,6 +4997,12 @@
mondDefaultCategory :: String
mondDefaultCategory = "default"
+-- * Maintenance daemon
+
+-- | Default wait time in seconds between maintenance rounds.
+maintdDefaultRoundDelay :: Int
+maintdDefaultRoundDelay = 300
+
-- * Disk access modes
diskUserspace :: String
@@ -5466,9 +5491,17 @@
dataCollectorLv :: String
dataCollectorLv = "lv"
+-- | Collector for the resident set size of kvm processes, i.e.,
+-- the number of pages the kvm process has in RAM.
+dataCollectorKvmRSS :: String
+dataCollectorKvmRSS = "kvm-inst-rss"
+
dataCollectorInstStatus :: String
dataCollectorInstStatus = "inst-status-xen"
+dataCollectorDiagnose :: String
+dataCollectorDiagnose = "diagnose"
+
dataCollectorParameterInterval :: String
dataCollectorParameterInterval = "interval"
@@ -5480,6 +5513,8 @@
, dataCollectorLv
, dataCollectorInstStatus
, dataCollectorXenCpuLoad
+ , dataCollectorKvmRSS
+ , dataCollectorDiagnose
]
dataCollectorStateActive :: String
@@ -5491,11 +5526,25 @@
dataCollectorsIntervalName :: String
dataCollectorsIntervalName = "data_collector_interval"
+dataCollectorDiagnoseDirectory :: String
+dataCollectorDiagnoseDirectory = sysconfdir ++ "/ganeti/node-diagnose-commands"
+
-- * HTools tag prefixes
exTagsPrefix :: String
exTagsPrefix = Tags.exTagsPrefix
+-- * MaintD tag prefixes
+
+maintdPrefix :: String
+maintdPrefix = "maintd:"
+
+maintdSuccessTagPrefix :: String
+maintdSuccessTagPrefix = maintdPrefix ++ "repairready:"
+
+maintdFailureTagPrefix :: String
+maintdFailureTagPrefix = maintdPrefix ++ "repairfailed:"
+
-- | The polling frequency to wait for a job status change
cliWfjcFrequency :: Int
cliWfjcFrequency = 20
@@ -5503,3 +5552,4 @@
-- | Default 'WaitForJobChange' timeout in seconds
defaultWfjcTimeout :: Int
defaultWfjcTimeout = 60
+
diff --git a/src/Ganeti/Cpu/LoadParser.hs b/src/Ganeti/Cpu/LoadParser.hs
index 7be0759..e2ffa01 100644
--- a/src/Ganeti/Cpu/LoadParser.hs
+++ b/src/Ganeti/Cpu/LoadParser.hs
@@ -36,7 +36,10 @@
-}
module Ganeti.Cpu.LoadParser (cpustatParser) where
-import Control.Applicative ((<*>), (<*), (*>), (<$>), (<|>))
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative ((<|>))
import qualified Data.Attoparsec.Text as A
import qualified Data.Attoparsec.Combinator as AC
import Data.Attoparsec.Text (Parser)
@@ -50,16 +53,16 @@
oneCPUstatParser :: Parser CPUstat
oneCPUstatParser =
let nameP = stringP
- userP = numberP
- niceP = numberP
- systemP = numberP
- idleP = numberP
- iowaitP = numberP
- irqP = numberP
- softirqP = numberP
- stealP = numberP
- guestP = numberP
- guest_niceP = numberP
+ userP = integerP
+ niceP = integerP
+ systemP = integerP
+ idleP = integerP
+ iowaitP = integerP
+ irqP = integerP
+ softirqP = integerP
+ stealP = integerP
+ guestP = integerP
+ guest_niceP = integerP
in
CPUstat <$> nameP <*> userP <*> niceP <*> systemP <*> idleP <*> iowaitP
<*> irqP <*> softirqP <*> stealP <*> guestP <*> guest_niceP
diff --git a/src/Ganeti/Cpu/Types.hs b/src/Ganeti/Cpu/Types.hs
index cc67e4d..5786435 100644
--- a/src/Ganeti/Cpu/Types.hs
+++ b/src/Ganeti/Cpu/Types.hs
@@ -37,6 +37,7 @@
module Ganeti.Cpu.Types
( CPUstat(..)
, CPUavgload(..)
+ , emptyCPUavgload
) where
import Ganeti.THH
@@ -49,17 +50,25 @@
, simpleField "cpu_total" [t| Double |]
])
+-- | CPU activity of an idle node. This can be used as a default
+-- value for offline nodes.
+emptyCPUavgload :: CPUavgload
+emptyCPUavgload = CPUavgload { cavCpuNumber = 1
+ , cavCpus = [ 0.0 ]
+ , cavCpuTotal = 0.0
+ }
+
-- | This is the format of the data parsed by the input file.
$(buildObject "CPUstat" "cs"
[ simpleField "name" [t| String |]
- , simpleField "user" [t| Int |]
- , simpleField "nice" [t| Int |]
- , simpleField "system" [t| Int |]
- , simpleField "idle" [t| Int |]
- , simpleField "iowait" [t| Int |]
- , simpleField "irq" [t| Int |]
- , simpleField "softirq" [t| Int |]
- , simpleField "steal" [t| Int |]
- , simpleField "guest" [t| Int |]
- , simpleField "guest_nice" [t| Int |]
+ , simpleField "user" [t| Integer |]
+ , simpleField "nice" [t| Integer |]
+ , simpleField "system" [t| Integer |]
+ , simpleField "idle" [t| Integer |]
+ , simpleField "iowait" [t| Integer |]
+ , simpleField "irq" [t| Integer |]
+ , simpleField "softirq" [t| Integer |]
+ , simpleField "steal" [t| Integer |]
+ , simpleField "guest" [t| Integer |]
+ , simpleField "guest_nice" [t| Integer |]
])
diff --git a/src/Ganeti/DataCollectors.hs b/src/Ganeti/DataCollectors.hs
index 33ad9cb..3c1146d 100644
--- a/src/Ganeti/DataCollectors.hs
+++ b/src/Ganeti/DataCollectors.hs
@@ -34,14 +34,18 @@
module Ganeti.DataCollectors( collectors ) where
+import Prelude ()
+import Ganeti.Prelude
+
import qualified Data.ByteString.UTF8 as UTF8
import Data.Map (findWithDefault)
-import Data.Monoid (mempty)
import qualified Ganeti.DataCollectors.CPUload as CPUload
+import qualified Ganeti.DataCollectors.Diagnose as Diagnose
import qualified Ganeti.DataCollectors.Diskstats as Diskstats
import qualified Ganeti.DataCollectors.Drbd as Drbd
import qualified Ganeti.DataCollectors.InstStatus as InstStatus
+import qualified Ganeti.DataCollectors.KvmRSS as KvmRSS
import qualified Ganeti.DataCollectors.Lv as Lv
import qualified Ganeti.DataCollectors.XenCpuLoad as XenCpuLoad
import Ganeti.DataCollectors.Types (DataCollector(..),ReportBuilder(..))
@@ -54,10 +58,12 @@
collectors =
[ cpuLoadCollector
, xenCpuLoadCollector
+ , kvmRSSCollector
, diskStatsCollector
, drdbCollector
, instStatusCollector
, lvCollector
+ , diagnoseCollector
]
where
f .&&. g = \x y -> f x y && g x y
@@ -83,6 +89,9 @@
lvCollector =
DataCollector Lv.dcName Lv.dcCategory Lv.dcKind
(StatelessR Lv.dcReport) Nothing activeConfig updateInterval
+ diagnoseCollector =
+ DataCollector Diagnose.dcName Diagnose.dcCategory Diagnose.dcKind
+ (StatelessR Diagnose.dcReport) Nothing activeConfig updateInterval
cpuLoadCollector =
DataCollector CPUload.dcName CPUload.dcCategory CPUload.dcKind
(StatefulR CPUload.dcReport) (Just CPUload.dcUpdate) activeConfig
@@ -91,3 +100,6 @@
DataCollector XenCpuLoad.dcName XenCpuLoad.dcCategory XenCpuLoad.dcKind
(StatefulR XenCpuLoad.dcReport) (Just XenCpuLoad.dcUpdate) activeConfig
updateInterval
+ kvmRSSCollector =
+ DataCollector KvmRSS.dcName KvmRSS.dcCategory KvmRSS.dcKind
+ (StatelessR KvmRSS.dcReport) Nothing activeConfig updateInterval
diff --git a/src/Ganeti/DataCollectors/CPUload.hs b/src/Ganeti/DataCollectors/CPUload.hs
index 65ac423..ca9376c 100644
--- a/src/Ganeti/DataCollectors/CPUload.hs
+++ b/src/Ganeti/DataCollectors/CPUload.hs
@@ -5,7 +5,7 @@
{-
-Copyright (C) 2013 Google Inc.
+Copyright (C) 2013, 2016 Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -45,6 +45,7 @@
import Control.Arrow (first)
import qualified Control.Exception as E
+import Control.Monad (liftM)
import Data.Attoparsec.Text.Lazy as A
import Data.Maybe (fromMaybe)
import Data.Text.Lazy (pack, unpack)
@@ -71,8 +72,8 @@
bufferSize = C.cpuavgloadBufferSize
-- | The window size of the values that will export the average load.
-windowSize :: Integer
-windowSize = toInteger C.cpuavgloadWindowSize
+windowSizeInUSec :: Integer
+windowSizeInUSec = 1000000 * toInteger C.cpuavgloadWindowSize
-- | The default setting for the maximum amount of not parsed character to
-- print in case of error.
@@ -111,17 +112,17 @@
in buildDCReport cpuLoadData
-- | Data stored by the collector in mond's memory.
-type Buffer = Seq.Seq (ClockTime, [Int])
+type Buffer = Seq.Seq (ClockTime, [Integer])
-- | Compute the load from a CPU.
-computeLoad :: CPUstat -> Int
+computeLoad :: CPUstat -> Integer
computeLoad cpuData =
csUser cpuData + csNice cpuData + csSystem cpuData
+ csIowait cpuData + csIrq cpuData + csSoftirq cpuData
+ csSteal cpuData + csGuest cpuData + csGuestNice cpuData
-- | Reads and Computes the load for each CPU.
-dcCollectFromFile :: FilePath -> IO (ClockTime, [Int])
+dcCollectFromFile :: FilePath -> IO (ClockTime, [Integer])
dcCollectFromFile inputFile = do
contents <-
((E.try $ readFile inputFile) :: IO (Either IOError String)) >>=
@@ -149,10 +150,7 @@
-- | Update a Map Entry.
updateEntry :: Buffer -> Buffer -> Buffer
updateEntry newBuffer mapEntry =
- (Seq.><) newBuffer
- (if Seq.length mapEntry < bufferSize
- then mapEntry
- else Seq.drop 1 mapEntry)
+ (Seq.><) newBuffer (Seq.take bufferSize mapEntry)
-- | Updates the given Collector data.
dcUpdate :: Maybe CollectorData -> IO CollectorData
@@ -178,7 +176,7 @@
(timestampR, listR) = rightmost
workInWindow = zipWith (-) listL listR
timediff = timestampL - timestampR
- overall = fromInteger (timediff * ticks) / 1000000 :: Double
+ overall = fromIntegral (timediff * ticks) / 1000000 :: Double
if overall > 0
then BT.Ok $ map (flip (/) overall . fromIntegral) workInWindow
else BT.Bad $ "Time covered by data is not sufficient."
@@ -190,7 +188,8 @@
buildJsonReport :: Buffer -> IO J.JSValue
buildJsonReport v = do
ticks <- getSysVar ClockTick
- let res = computeAverage v windowSize ticks
+ now <- liftM clockTimeToUSec getClockTime
+ let res = computeAverage v (now - windowSizeInUSec) ticks
showError s = J.showJSON $ GJ.containerFromList [("error", s)]
return $ BT.genericResult showError (J.showJSON . formatData) res
diff --git a/src/Ganeti/DataCollectors/Diagnose.hs b/src/Ganeti/DataCollectors/Diagnose.hs
new file mode 100644
index 0000000..aaa5ac4
--- /dev/null
+++ b/src/Ganeti/DataCollectors/Diagnose.hs
@@ -0,0 +1,157 @@
+{-| Self-diagnose data collector
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.DataCollectors.Diagnose
+ ( dcName
+ , dcCategory
+ , dcKind
+ , dcReport
+ ) where
+
+import Control.Monad.Trans.Class (lift)
+import System.Directory (doesFileExist)
+import System.FilePath.Posix (isValid, takeFileName, (</>))
+import System.Posix.Files ( getFileStatus
+ , fileOwner
+ , fileGroup
+ , fileMode
+ , ownerModes
+ , groupReadMode
+ , groupExecuteMode
+ , otherReadMode
+ , otherExecuteMode
+ , intersectFileModes
+ , unionFileModes
+ , ownerExecuteMode
+ , isRegularFile
+ , regularFileMode
+ )
+import System.Process (readProcess)
+import Text.JSON (JSValue(..), toJSObject, toJSString, decode, Result(..))
+
+import Ganeti.BasicTypes (runResultT, ResultT(..), genericResult)
+import Ganeti.Confd.ClientFunctions (getDiagnoseCollectorFilename)
+import Ganeti.Constants (dataCollectorDiagnose, dataCollectorDiagnoseDirectory)
+import Ganeti.DataCollectors.Types ( DCCategory(..)
+ , DCKind(..)
+ , DCVersion(..)
+ , DCReport(..)
+ , buildReport
+ )
+
+-- | The name of this data collector.
+dcName :: String
+dcName = dataCollectorDiagnose
+
+-- | The category of this data collector.
+dcCategory :: Maybe DCCategory
+dcCategory = Just DCNode
+
+-- | The kind of this data collector.
+dcKind :: DCKind
+dcKind = DCKStatus
+
+-- | The version of this data collector.
+dcVersion :: DCVersion
+dcVersion = DCVerBuiltin
+
+-- | The version number for the data format of this data collector.
+dcFormatVersion :: Int
+dcFormatVersion = 1
+
+okWithDetails :: String -> JSValue
+okWithDetails details = JSObject $ toJSObject
+ [ ("status", JSString $ toJSString "Ok")
+ , ("details", JSString $ toJSString details)
+ ]
+
+
+fnToVal :: String -> IO JSValue
+fnToVal fn
+ | null fn = return $ okWithDetails
+ "No file specified for diagnose data collector"
+ | not $ isValid fn = return $ okWithDetails
+ "Invalid filename specified for diagnose data collector"
+ | takeFileName fn /= fn = return $ okWithDetails
+ "Filepaths cannot be specified for diagnose data collector"
+ | otherwise = do
+ let fp = dataCollectorDiagnoseDirectory </> fn
+ exists <- doesFileExist fp
+ if exists
+ then do
+ fs <- getFileStatus fp
+ let maxFileMode = foldl1 unionFileModes [ ownerModes
+ , groupReadMode
+ , groupExecuteMode
+ , otherReadMode
+ , otherExecuteMode
+ , regularFileMode
+ ]
+ isSubSetOf m1 m2 = m1 `intersectFileModes` m2 == m1
+ case () of _
+ | fileOwner fs /= 0 -> return . okWithDetails $
+ "File for diagnose data collector " ++
+ "must be owned by root"
+ | fileGroup fs /= 0 -> return . okWithDetails $
+ "File for diagnose data collector " ++
+ "must have group root"
+ | not $ isRegularFile fs -> return . okWithDetails $
+ "File for diagnose data collector " ++
+ "must be a regular file"
+ | not $ isSubSetOf (fileMode fs) maxFileMode ->
+ return . okWithDetails $
+ "File for diagnose data collector " ++
+ "must have permissions 755 or stricter"
+ | not $ isSubSetOf ownerExecuteMode (fileMode fs) ->
+ return . okWithDetails $
+ "File for diagnose data collector " ++
+ "must be executable by owner"
+ | otherwise -> do
+ r <- fmap decode (readProcess fp [] "")
+ case r of
+ Ok val -> return val
+ Error str -> return . okWithDetails $
+ "Could not parse result: " ++ str
+ else return $ okWithDetails
+ "File specified for diagnose data collector does not exist"
+
+buildJsonReport :: IO JSValue
+buildJsonReport = fmap (genericResult okWithDetails id) . runResultT $ do
+ statusFnName <- getDiagnoseCollectorFilename Nothing Nothing
+ lift $ fnToVal statusFnName
+
+-- | The data exported by the data collector, taken from the default location.
+dcReport :: IO DCReport
+dcReport = buildJsonReport >>=
+ buildReport dcName dcVersion dcFormatVersion dcCategory dcKind
diff --git a/src/Ganeti/DataCollectors/KvmRSS.hs b/src/Ganeti/DataCollectors/KvmRSS.hs
new file mode 100644
index 0000000..3f26617
--- /dev/null
+++ b/src/Ganeti/DataCollectors/KvmRSS.hs
@@ -0,0 +1,119 @@
+{-| kvm resident set size collector
+
+It collects the resident set size (RSS) for all kvm
+processes managed by Ganeti, i.e., the number of pages
+the process has in RAM. The value is obtained
+by taking the corresponding value from /proc/$pid/statm.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.DataCollectors.KvmRSS
+ ( dcName
+ , dcVersion
+ , dcFormatVersion
+ , dcCategory
+ , dcKind
+ , dcReport
+ ) where
+
+import Control.Monad (liftM)
+import Data.Char (isSpace)
+import Data.Maybe (mapMaybe)
+import Network.BSD (getHostName)
+import System.FilePath ((</>))
+import qualified Text.JSON as J
+import Text.Printf (printf)
+
+import Ganeti.BasicTypes
+import Ganeti.Confd.ClientFunctions (getInstances)
+import qualified Ganeti.Constants as C
+import Ganeti.DataCollectors.Types
+import Ganeti.Objects
+import Ganeti.Path (kvmPidDir)
+
+-- | The name of this data collector for the resident set size (RSS).
+dcName :: String
+dcName = C.dataCollectorKvmRSS
+
+-- | The version number for the data format of this data collector.
+dcFormatVersion :: Int
+dcFormatVersion = 1
+
+-- | The version of this data collector.
+dcVersion :: DCVersion
+dcVersion = DCVerBuiltin
+
+-- | The category of this data collector.
+dcCategory :: Maybe DCCategory
+dcCategory = Nothing
+
+-- | The kind of this data collector.
+dcKind :: DCKind
+dcKind = DCKPerf
+
+-- | Parse the contents of a pid file.
+parsePid :: Monad m => String -> m Int
+parsePid s = case reads s of
+ [(pid, r)] | all isSpace r -> return pid
+ _ -> fail $ "Couldn't parse pid " ++ s
+
+-- | From the contents of a memstat file get the resident set size,
+-- in pages.
+parseRss :: Monad m => String -> m Int
+parseRss s =
+ let drop1 = dropWhile isSpace . dropWhile (not . isSpace) . dropWhile isSpace
+ in case reads (drop1 s) of
+ [(n, _)] -> return n
+ _ -> fail $ "Failed to parse memstat " ++ s
+
+-- | For an instance, collect the resident set size, if available.
+collectInstanceRSS :: String -> IO (Result (String, J.JSValue))
+collectInstanceRSS inst = runResultT $ do
+ piddir <- liftIO kvmPidDir
+ let pidfile = piddir </> inst
+ pidstring <- liftIO $ readFile pidfile
+ pid <- parsePid pidstring
+ let procfspath = printf "/proc/%d/statm" pid
+ memstat <- liftIO $ readFile procfspath
+ rss <- parseRss memstat
+ return (inst, J.showJSON rss)
+
+-- | The data exported by the data collector.
+dcReport :: IO DCReport
+dcReport = do
+ node <- getHostName
+ instances <- liftM (genericResult (const []) (mapMaybe instName . fst))
+ . runResultT $ getInstances node Nothing Nothing
+ reports <- liftM justOk $ mapM collectInstanceRSS instances
+ buildReport dcName dcVersion dcFormatVersion dcCategory dcKind
+ . J.JSObject $ J.toJSObject reports
diff --git a/src/Ganeti/DataCollectors/Types.hs b/src/Ganeti/DataCollectors/Types.hs
index 8b60be1..20386ce 100644
--- a/src/Ganeti/DataCollectors/Types.hs
+++ b/src/Ganeti/DataCollectors/Types.hs
@@ -68,7 +68,7 @@
import Ganeti.Utils (getCurrentTimeUSec)
-- | The possible classes a data collector can belong to.
-data DCCategory = DCInstance | DCStorage | DCDaemon | DCHypervisor
+data DCCategory = DCInstance | DCStorage | DCDaemon | DCHypervisor | DCNode
deriving (Show, Eq, Read, Enum, Bounded)
-- | Get the category name and return it as a string.
@@ -145,7 +145,7 @@
-- | Type for the value field of the `CollectorMap` below.
data CollectorData =
- CPULoadData (Seq.Seq (ClockTime, [Int]))
+ CPULoadData (Seq.Seq (ClockTime, [Integer]))
| InstanceCpuLoad (Map.Map String (Seq.Seq (ClockTime, Double)))
instance NFData ClockTime where
diff --git a/src/Ganeti/DataCollectors/XenCpuLoad.hs b/src/Ganeti/DataCollectors/XenCpuLoad.hs
index 10c39cd..1526b57 100644
--- a/src/Ganeti/DataCollectors/XenCpuLoad.hs
+++ b/src/Ganeti/DataCollectors/XenCpuLoad.hs
@@ -42,7 +42,10 @@
, dcUpdate
) where
-import Control.Applicative ((<$>), liftA2)
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (liftA2)
import Control.Arrow ((***))
import Control.Monad (liftM, when)
import Control.Monad.IO.Class (liftIO)
@@ -143,7 +146,8 @@
combinedValues
withoutOld = Map.filter
(liftA2 (&&) (not . Seq.null)
- $ (>) (fromIntegral $ C.xentopAverageThreshold * 1000000)
+ $ (>) (fromIntegral
+ $ 3 * C.xentopAverageThreshold * 1000000)
. (clockTimeToUSec now -) . clockTimeToUSec
. fst . flip Seq.index 0)
withinRange
diff --git a/src/Ganeti/Errors.hs b/src/Ganeti/Errors.hs
index 5d64892..1dccb93 100644
--- a/src/Ganeti/Errors.hs
+++ b/src/Ganeti/Errors.hs
@@ -122,13 +122,13 @@
, ("FileStoragePathError", [excErrMsg])
])
-instance Error GanetiException where
- strMsg = GenericError
-
instance JSON GanetiException where
showJSON = saveGanetiException
readJSON = loadGanetiException
+instance FromString GanetiException where
+ mkFromString = GenericError
+
-- | Error monad using 'GanetiException' type alias.
type ErrorResult = GenericResult GanetiException
diff --git a/src/Ganeti/HTools/AlgorithmParams.hs b/src/Ganeti/HTools/AlgorithmParams.hs
index b93f437..8a53e69 100644
--- a/src/Ganeti/HTools/AlgorithmParams.hs
+++ b/src/Ganeti/HTools/AlgorithmParams.hs
@@ -41,11 +41,17 @@
, fromCLIOptions
) where
+import qualified Data.Set as Set
+
import qualified Ganeti.HTools.CLI as CLI
import qualified Ganeti.HTools.Types as T
data AlgorithmOptions = AlgorithmOptions
{ algDiskMoves :: Bool -- ^ Whether disk moves are allowed
+ , algDiskMovesFactor :: Double -- ^ Allow only disk moves that lead to a
+ -- gain in cluster score more than
+ -- algDiskMovesFactor times higher than
+ -- the gain of migration moves
, algInstanceMoves :: Bool -- ^ Whether instance moves are allowed
, algRestrictedMigration :: Bool -- ^ Whether migration is restricted
, algIgnoreSoftErrors :: Bool -- ^ Whether to always ignore soft errors
@@ -56,6 +62,8 @@
-- like global N+1 redundancy
, algCapacityIgnoreGroups :: [T.Gdx] -- ^ Groups to ignore in capacity checks
, algRestrictToNodes :: Maybe [String] -- ^ nodes to restrict allocation to
+ , algAllowedNodes :: Maybe (Set.Set Int) -- ^ if given, do not perform any
+ -- operations involving other nodes
, algAcceptExisting :: Bool -- ^ accept existing violations in capacity
-- checks
}
@@ -64,6 +72,7 @@
fromCLIOptions :: CLI.Options -> AlgorithmOptions
fromCLIOptions opts = AlgorithmOptions
{ algDiskMoves = CLI.optDiskMoves opts
+ , algDiskMovesFactor = CLI.optAvoidDiskMoves opts
, algInstanceMoves = CLI.optInstMoves opts
, algRestrictedMigration = CLI.optRestrictedMigrate opts
, algIgnoreSoftErrors = CLI.optIgnoreSoftErrors opts
@@ -73,6 +82,7 @@
, algCapacity = CLI.optCapacity opts
, algCapacityIgnoreGroups = []
, algRestrictToNodes = CLI.optRestrictToNodes opts
+ , algAllowedNodes = Nothing
, algAcceptExisting = CLI.optAcceptExisting opts
}
diff --git a/src/Ganeti/HTools/Backend/IAlloc.hs b/src/Ganeti/HTools/Backend/IAlloc.hs
index 3a67c2d..e40c3d7 100644
--- a/src/Ganeti/HTools/Backend/IAlloc.hs
+++ b/src/Ganeti/HTools/Backend/IAlloc.hs
@@ -65,7 +65,7 @@
import Ganeti.HTools.CLI
import Ganeti.HTools.Loader
import Ganeti.HTools.Types
-import Ganeti.JSON (maybeFromObj, JSRecord, tryFromObj, toArray, asObjectList, readEitherString, fromJResult, fromObj, fromObjWithDefault, asJSObject)
+import Ganeti.JSON (maybeFromObj, JSRecord, tryFromObj, toArray, asObjectList, readEitherString, fromJResult, fromObj, fromObjWithDefault, asJSObject, emptyContainer)
import Ganeti.Types ( EvacMode(ChangePrimary, ChangeSecondary)
, adminStateFromRaw, AdminState(..))
import Ganeti.Utils
@@ -157,6 +157,7 @@
offline <- extract "offline"
drained <- extract "drained"
guuid <- extract "group"
+ hvstate <- extractDef emptyContainer "hv_state"
vm_capable <- annotateResult desc $ maybeFromObj a "vm_capable"
let vm_capable' = fromMaybe True vm_capable
gidx <- lookupGroup ktg n guuid
@@ -178,8 +179,9 @@
dfree <- lvextract 0 "free_disk"
ctotal <- lvextract 0.0 "total_cpus"
cnos <- lvextract 0 "reserved_cpus"
- let node = flip Node.setNodeTags tags $
- Node.create n mtotal mnode mfree dtotal dfree ctotal cnos
+ let node_mem = obtainNodeMemory hvstate mnode
+ node = flip Node.setNodeTags tags $
+ Node.create n mtotal node_mem mfree dtotal dfree ctotal cnos
(not live || drained) sptotal spfree gidx excl_stor
return (n, node)
diff --git a/src/Ganeti/HTools/Backend/Luxi.hs b/src/Ganeti/HTools/Backend/Luxi.hs
index 53b0794..639d74d 100644
--- a/src/Ganeti/HTools/Backend/Luxi.hs
+++ b/src/Ganeti/HTools/Backend/Luxi.hs
@@ -51,47 +51,14 @@
import qualified Ganeti.HTools.Group as Group
import qualified Ganeti.HTools.Node as Node
import qualified Ganeti.HTools.Instance as Instance
-import Ganeti.JSON (fromObj, fromJVal, tryFromObj, arrayMaybeFromJVal)
+import Ganeti.JSON (fromJVal, tryFromObj, arrayMaybeFromJVal,
+ getKeysFromContainer, Container)
+import Ganeti.Objects (PartialNicParams)
{-# ANN module "HLint: ignore Eta reduce" #-}
-- * Utility functions
--- | Get values behind \"data\" part of the result.
-getData :: (Monad m) => JSValue -> m JSValue
-getData (JSObject o) = fromObj (fromJSObject o) "data"
-getData x = fail $ "Invalid input, expected dict entry but got " ++ show x
-
--- | Converts a (status, value) into m value, if possible.
-parseQueryField :: (Monad m) => JSValue -> m (JSValue, JSValue)
-parseQueryField (JSArray [status, result]) = return (status, result)
-parseQueryField o =
- fail $ "Invalid query field, expected (status, value) but got " ++ show o
-
--- | Parse a result row.
-parseQueryRow :: (Monad m) => JSValue -> m [(JSValue, JSValue)]
-parseQueryRow (JSArray arr) = mapM parseQueryField arr
-parseQueryRow o =
- fail $ "Invalid query row result, expected array but got " ++ show o
-
--- | Parse an overall query result and get the [(status, value)] list
--- for each element queried.
-parseQueryResult :: (Monad m) => JSValue -> m [[(JSValue, JSValue)]]
-parseQueryResult (JSArray arr) = mapM parseQueryRow arr
-parseQueryResult o =
- fail $ "Invalid query result, expected array but got " ++ show o
-
--- | Prepare resulting output as parsers expect it.
-extractArray :: (Monad m) => JSValue -> m [[(JSValue, JSValue)]]
-extractArray v =
- getData v >>= parseQueryResult
-
--- | Testing result status for more verbose error message.
-fromJValWithStatus :: (Text.JSON.JSON a, Monad m) => (JSValue, JSValue) -> m a
-fromJValWithStatus (st, v) = do
- st' <- fromJVal st
- Qlang.checkRS st' v >>= fromJVal
-
annotateConvert :: String -> String -> String -> Result a -> Result a
annotateConvert otype oname oattr =
annotateResult $ otype ++ " '" ++ oname ++
@@ -106,7 +73,7 @@
-> (JSValue, JSValue) -- ^ The value we're trying to convert
-> Result a -- ^ The annotated result
genericConvert otype oname oattr =
- annotateConvert otype oname oattr . fromJValWithStatus
+ annotateConvert otype oname oattr . L.fromJValWithStatus
convertArrayMaybe :: (Text.JSON.JSON a) =>
String -- ^ The object type
@@ -128,7 +95,8 @@
["name", "mtotal", "mnode", "mfree", "dtotal", "dfree",
"ctotal", "cnos", "offline", "drained", "vm_capable",
"ndp/spindle_count", "group.uuid", "tags",
- "ndp/exclusive_storage", "sptotal", "spfree", "ndp/cpu_speed"]
+ "ndp/exclusive_storage", "sptotal", "spfree", "ndp/cpu_speed",
+ "hv_state"]
Qlang.EmptyFilter
-- | The input data for instance query.
@@ -149,7 +117,7 @@
queryGroupsMsg :: L.LuxiOp
queryGroupsMsg =
L.Query (Qlang.ItemTypeOpCode Qlang.QRGroup)
- ["uuid", "name", "alloc_policy", "ipolicy", "tags"]
+ ["uuid", "name", "alloc_policy", "ipolicy", "tags", "networks"]
Qlang.EmptyFilter
-- | Wraper over 'callMethod' doing node query.
@@ -172,7 +140,7 @@
getInstances :: NameAssoc
-> JSValue
-> Result [(String, Instance.Instance)]
-getInstances ktn arr = extractArray arr >>= mapM (parseInstance ktn)
+getInstances ktn arr = L.extractArray arr >>= mapM (parseInstance ktn)
-- | Construct an instance from a JSON object.
parseInstance :: NameAssoc
@@ -182,7 +150,7 @@
, status, pnode, snodes, tags, oram
, auto_balance, disk_template, su
, dsizes, dspindles, forthcoming ] = do
- xname <- annotateResult "Parsing new instance" (fromJValWithStatus name)
+ xname <- annotateResult "Parsing new instance" (L.fromJValWithStatus name)
let convert a = genericConvert "Instance" xname a
xdisk <- convert "disk_usage" disk
xmem <- case oram of -- FIXME: remove the "guessing"
@@ -212,15 +180,16 @@
-- | Parse a node list in JSON format.
getNodes :: NameAssoc -> JSValue -> Result [(String, Node.Node)]
-getNodes ktg arr = extractArray arr >>= mapM (parseNode ktg)
+getNodes ktg arr = L.extractArray arr >>= mapM (parseNode ktg)
-- | Construct a node from a JSON object.
parseNode :: NameAssoc -> [(JSValue, JSValue)] -> Result (String, Node.Node)
parseNode ktg [ name, mtotal, mnode, mfree, dtotal, dfree
, ctotal, cnos, offline, drained, vm_capable, spindles, g_uuid
- , tags, excl_stor, sptotal, spfree, cpu_speed ]
+ , tags, excl_stor, sptotal, spfree, cpu_speed, hv_state ]
+
= do
- xname <- annotateResult "Parsing new node" (fromJValWithStatus name)
+ xname <- annotateResult "Parsing new node" (L.fromJValWithStatus name)
let convert a = genericConvert "Node" xname a
xoffline <- convert "offline" offline
xdrained <- convert "drained" drained
@@ -249,9 +218,11 @@
-- is the only supported disk template
xctotal <- lvconvert 0.0 "ctotal" ctotal
xcnos <- lvconvert 0 "cnos" cnos
- let node = flip Node.setCpuSpeed xcpu_speed .
+ xhv_state <- convert "hv_state" hv_state
+ let node_mem = obtainNodeMemory xhv_state xmnode
+ node = flip Node.setCpuSpeed xcpu_speed .
flip Node.setNodeTags xtags $
- Node.create xname xmtotal xmnode xmfree xdtotal xdfree
+ Node.create xname xmtotal node_mem xmfree xdtotal xdfree
xctotal xcnos (not live || xdrained) xsptotal xspfree
xgdx xexcl_stor
return (xname, node)
@@ -272,19 +243,20 @@
-- | Parses the cluster groups.
getGroups :: JSValue -> Result [(String, Group.Group)]
-getGroups jsv = extractArray jsv >>= mapM parseGroup
+getGroups jsv = L.extractArray jsv >>= mapM parseGroup
-- | Parses a given group information.
parseGroup :: [(JSValue, JSValue)] -> Result (String, Group.Group)
-parseGroup [uuid, name, apol, ipol, tags] = do
- xname <- annotateResult "Parsing new group" (fromJValWithStatus name)
+parseGroup [uuid, name, apol, ipol, tags, nets] = do
+ xname <- annotateResult "Parsing new group" (L.fromJValWithStatus name)
let convert a = genericConvert "Group" xname a
xuuid <- convert "uuid" uuid
xapol <- convert "alloc_policy" apol
xipol <- convert "ipolicy" ipol
xtags <- convert "tags" tags
- -- TODO: parse networks to which this group is connected
- return (xuuid, Group.create xname xuuid xapol [] xipol xtags)
+ xnets <- convert "networks" nets :: Result (Container PartialNicParams)
+ let xnetids = getKeysFromContainer xnets
+ return (xuuid, Group.create xname xuuid xapol xnetids xipol xtags)
parseGroup v = fail ("Invalid group query result: " ++ show v)
diff --git a/src/Ganeti/HTools/Backend/MonD.hs b/src/Ganeti/HTools/Backend/MonD.hs
index 9944bd6..be420a5 100644
--- a/src/Ganeti/HTools/Backend/MonD.hs
+++ b/src/Ganeti/HTools/Backend/MonD.hs
@@ -41,6 +41,16 @@
module Ganeti.HTools.Backend.MonD
( queryAllMonDDCs
, pMonDData
+ , Report(..)
+ , DataCollector
+ , dName
+ , fromCurl
+ , mkReport
+ , totalCPUCollector
+ , xenCPUCollector
+ , kvmRSSCollector
+ , scaleMemoryWeight
+ , useInstanceRSSData
) where
import Control.Monad
@@ -56,8 +66,9 @@
import Ganeti.BasicTypes
import qualified Ganeti.Constants as C
import Ganeti.Cpu.Types
-import qualified Ganeti.DataCollectors.XenCpuLoad as XenCpuLoad
import qualified Ganeti.DataCollectors.CPUload as CPUload
+import qualified Ganeti.DataCollectors.KvmRSS as KvmRSS
+import qualified Ganeti.DataCollectors.XenCpuLoad as XenCpuLoad
import Ganeti.DataCollectors.Types ( DCReport, DCCategory
, dcReportData, dcReportName
, getCategoryName )
@@ -76,6 +87,7 @@
-- | The actual data types for MonD's Data Collectors.
data Report = CPUavgloadReport CPUavgload
| InstanceCpuReport (Map.Map String Double)
+ | InstanceRSSReport (Map.Map String Double)
-- | Type describing a data collector basic information.
data DataCollector = DataCollector
@@ -188,14 +200,90 @@
, dUse = useInstanceCpuData
}
+-- * kvm instance RSS collector
+
+-- | Parse results of the kvm instance RSS data Collector
+mkKvmRSSReport :: DCReport -> Maybe Report
+mkKvmRSSReport =
+ liftM InstanceRSSReport . maybeParseMap . dcReportData
+
+-- | Conversion constant from htools' internal memory unit,
+-- which is MiB, to the RSS unit, which is reported in pages
+-- (of 4KiB each).
+pagesPerMiB :: Double
+pagesPerMiB = 256.0
+
+-- | Update cluster data based on per-instance RSS data.
+-- Also set the node's memoy util pool correctly. Our unit
+-- of memory usage is pages; there are 256 pages per MiB
+-- of node memory not used by the node itself.
+useInstanceRSSData :: [(Node.Node, Report)]
+ -> (Node.List, Instance.List)
+ -> Result (Node.List, Instance.List)
+useInstanceRSSData reports (nl, il) = do
+ let toMap (InstanceRSSReport m) = Just m
+ toMap _ = Nothing
+ let usage = Map.unions $ mapMaybe (toMap . snd) reports
+ missingData = (Set.fromList . map Instance.name $ IntMap.elems il)
+ Set.\\ Map.keysSet usage
+ unless (Set.null missingData)
+ . Bad . (++) "No RSS information available for "
+ . show $ Set.elems missingData
+ let updateInstance inst =
+ let mem = Map.lookup (Instance.name inst) usage
+ dynU = Instance.util inst
+ dynU' = maybe dynU (\m -> dynU { memWeight = m }) mem
+ in inst { Instance.util = dynU' }
+ let il' = IntMap.map updateInstance il
+ let updateNode node =
+ let mem = sum
+ . map (\ idx -> maybe 0 (memWeight . Instance.util)
+ $ IntMap.lookup idx il')
+ $ Node.pList node
+ dynU = Node.utilLoad node
+ dynU' = dynU { memWeight = mem }
+ pool = Node.utilPool node
+ nodePages = (Node.tMem node - fromIntegral (Node.nMem node))
+ * pagesPerMiB
+ pool' = pool { memWeight = nodePages }
+ in node { Node.utilLoad = dynU', Node.utilPool = pool' }
+ let nl' = IntMap.map updateNode nl
+ return (nl', il')
+
+-- | Update cluster data based on the per-instance CPU usage
+kvmRSSCollector :: DataCollector
+kvmRSSCollector = DataCollector { dName = KvmRSS.dcName
+ , dCategory = KvmRSS.dcCategory
+ , dMkReport = mkKvmRSSReport
+ , dUse = useInstanceRSSData
+ }
+
+-- | Scale the importance of the memory weight in dynamic utilisation,
+-- by multiplying the usage with the given factor. Note that the underlying
+-- model for dynamic utilisation is that they are reported in arbitrary units.
+scaleMemoryWeight :: Double
+ -> (Node.List, Instance.List)
+ -> (Node.List, Instance.List)
+scaleMemoryWeight f (nl, il) =
+ let updateInst inst =
+ let dynU = Instance.util inst
+ dynU' = dynU { memWeight = f * memWeight dynU}
+ in inst { Instance.util = dynU' }
+ updateNode node =
+ let dynU = Node.utilLoad node
+ dynU' = dynU { memWeight = f * memWeight dynU}
+ in node { Node.utilLoad = dynU' }
+ in (IntMap.map updateNode nl, IntMap.map updateInst il)
+
-- * Collector choice
-- | The list of Data Collectors used by hail and hbal.
collectors :: Options -> [DataCollector]
collectors opts
| optIgnoreDynu opts = []
- | optMonDXen opts = [ xenCPUCollector ]
- | otherwise = [ totalCPUCollector ]
+ | otherwise =
+ (if optMonDXen opts then [ xenCPUCollector ] else [ totalCPUCollector ] )
+ ++ [ kvmRSSCollector | optMonDKvmRSS opts ]
-- * Querying infrastructure
diff --git a/src/Ganeti/HTools/Backend/Rapi.hs b/src/Ganeti/HTools/Backend/Rapi.hs
index 7d76751..218411c 100644
--- a/src/Ganeti/HTools/Backend/Rapi.hs
+++ b/src/Ganeti/HTools/Backend/Rapi.hs
@@ -53,7 +53,7 @@
import Ganeti.BasicTypes
import Ganeti.HTools.Loader
import Ganeti.HTools.Types
-import Ganeti.JSON (loadJSArray, JSRecord, tryFromObj, fromJVal, maybeFromObj, fromJResult, tryArrayMaybeFromObj, readEitherString, fromObjWithDefault, asJSObject)
+import Ganeti.JSON (loadJSArray, JSRecord, tryFromObj, fromJVal, maybeFromObj, fromJResult, tryArrayMaybeFromObj, readEitherString, fromObjWithDefault, asJSObject, emptyContainer)
import qualified Ganeti.HTools.Group as Group
import qualified Ganeti.HTools.Node as Node
import qualified Ganeti.HTools.Instance as Instance
@@ -186,8 +186,10 @@
ctotal <- lvextract 0.0 "ctotal"
cnos <- lvextract 0 "cnos"
tags <- extract "tags"
- let node = flip Node.setNodeTags tags $
- Node.create name mtotal mnode mfree dtotal dfree ctotal cnos
+ hv_state <- extractDef emptyContainer "hv_state"
+ let node_mem = obtainNodeMemory hv_state mnode
+ node = flip Node.setNodeTags tags $
+ Node.create name mtotal node_mem mfree dtotal dfree ctotal cnos
(not live || drained) sptotal spfree guuid' excl_stor
return (name, node)
@@ -196,12 +198,13 @@
parseGroup a = do
name <- tryFromObj "Parsing new group" a "name"
let extract s = tryFromObj ("Group '" ++ name ++ "'") a s
+ let extractDef s d = fromObjWithDefault a s d
uuid <- extract "uuid"
apol <- extract "alloc_policy"
ipol <- extract "ipolicy"
tags <- extract "tags"
- -- TODO: parse networks to which this group is connected
- return (uuid, Group.create name uuid apol [] ipol tags)
+ nets <- extractDef "networks" []
+ return (uuid, Group.create name uuid apol nets ipol tags)
-- | Parse cluster data from the info resource.
parseCluster :: JSObject JSValue -> Result ([String], IPolicy, String)
diff --git a/src/Ganeti/HTools/Backend/Text.hs b/src/Ganeti/HTools/Backend/Text.hs
index 5aaa784..4929f74 100644
--- a/src/Ganeti/HTools/Backend/Text.hs
+++ b/src/Ganeti/HTools/Backend/Text.hs
@@ -168,13 +168,14 @@
-- | Generate policy data from a given policy object.
serializeIPolicy :: String -> IPolicy -> String
serializeIPolicy owner ipol =
- let IPolicy minmax stdspec dts vcpu_ratio spindle_ratio = ipol
+ let IPolicy minmax stdspec dts vcpu_ratio spindle_ratio memory_ratio = ipol
strings = [ owner
, serializeISpec stdspec
, serializeMultipleMinMaxISpecs minmax
, serializeDiskTemplates dts
, show vcpu_ratio
, show spindle_ratio
+ , show memory_ratio
]
in intercalate "|" strings
@@ -370,16 +371,21 @@
-- | Loads an ipolicy from a field list.
loadIPolicy :: [String] -> Result (String, IPolicy)
loadIPolicy (owner:stdspec:minmaxspecs:dtemplates:
- vcpu_ratio:spindle_ratio:_) = do
+ vcpu_ratio:spindle_ratio:memory_ratio:_) = do
xstdspec <- loadISpec (owner ++ "/stdspec") (commaSplit stdspec)
xminmaxspecs <- loadMultipleMinMaxISpecs owner $
sepSplit iSpecsSeparator minmaxspecs
xdts <- mapM diskTemplateFromRaw $ commaSplit dtemplates
xvcpu_ratio <- tryRead (owner ++ "/vcpu_ratio") vcpu_ratio
xspindle_ratio <- tryRead (owner ++ "/spindle_ratio") spindle_ratio
+ xmemory_ratio <- tryRead (owner ++ "/memory_ratio") memory_ratio
return (owner,
IPolicy xminmaxspecs xstdspec
- xdts xvcpu_ratio xspindle_ratio)
+ xdts xvcpu_ratio xspindle_ratio xmemory_ratio)
+loadIPolicy (owner:stdspec:minmaxspecs:dtemplates:
+ vcpu_ratio:spindle_ratio:_) =
+ loadIPolicy (owner:stdspec:minmaxspecs:dtemplates:
+ vcpu_ratio:spindle_ratio:["1.0"])
loadIPolicy s = fail $ "Invalid ipolicy data: '" ++ show s ++ "'"
loadOnePolicy :: (IPolicy, Group.List) -> String
diff --git a/src/Ganeti/HTools/CLI.hs b/src/Ganeti/HTools/CLI.hs
index 7ca25d9..110375e 100644
--- a/src/Ganeti/HTools/CLI.hs
+++ b/src/Ganeti/HTools/CLI.hs
@@ -8,7 +8,7 @@
{-
-Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.
+Copyright (C) 2009, 2010, 2011, 2012, 2013, 2015 Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -55,12 +55,15 @@
-- * The options
, oDataFile
, oDiskMoves
+ , oAvoidDiskMoves
, oDiskTemplate
, oDryRun
, oSpindleUse
, oDynuFile
+ , oMemWeight
, oMonD
, oMonDDataFile
+ , oMonDKvmRSS
, oMonDXen
, oEvacMode
, oMonDExitMissing
@@ -72,6 +75,7 @@
, oForce
, oFullEvacuation
, oGroup
+ , oIdleDefault
, oIAllocSrc
, oIgnoreDyn
, oIgnoreNonRedundant
@@ -143,11 +147,16 @@
data Options = Options
{ optDataFile :: Maybe FilePath -- ^ Path to the cluster data file
, optDiskMoves :: Bool -- ^ Allow disk moves
+ , optAvoidDiskMoves :: Double -- ^ Allow only disk moves that improve
+ -- the cluster score by more than a
+ -- factor of optAvoidDiskMoves
, optInstMoves :: Bool -- ^ Allow instance moves
, optDiskTemplate :: Maybe DiskTemplate -- ^ Override for the disk template
, optSpindleUse :: Maybe Int -- ^ Override for the spindle usage
, optDynuFile :: Maybe FilePath -- ^ Optional file with dynamic use data
, optIgnoreDynu :: Bool -- ^ Do not use dynamic use data
+ , optIdleDefault :: Bool -- ^ Assume idle load wherever no dynamic
+ -- utilisation data is provided
, optIgnoreSoftErrors :: Bool -- ^ Ignore soft errors in balancing moves
, optIndependentGroups :: Bool -- ^ consider groups independently
, optAcceptExisting :: Bool -- ^ accept existing N+1 violations
@@ -156,8 +165,12 @@
-- by MonDs
, optMonDXen :: Bool -- ^ Should Xen-specific collectors be
-- considered (only if MonD is queried)
+ , optMonDKvmRSS :: Bool -- ^ Should kvm RSS information be
+ -- considered (only if MonD is queried)
, optMonDExitMissing :: Bool -- ^ If the program should exit on missing
-- MonD data
+ , optMemWeight :: Double -- ^ Rescale the weight of memory
+ -- utilisation
, optEvacMode :: Bool -- ^ Enable evacuation mode
, optRestrictedMigrate :: Bool -- ^ Disallow replace-primary moves
, optExInst :: [String] -- ^ Instances to be excluded
@@ -218,18 +231,22 @@
defaultOptions = Options
{ optDataFile = Nothing
, optDiskMoves = True
+ , optAvoidDiskMoves = 1.0
, optInstMoves = True
, optIndependentGroups = False
, optAcceptExisting = False
, optDiskTemplate = Nothing
, optSpindleUse = Nothing
, optIgnoreDynu = False
+ , optIdleDefault = False
, optIgnoreSoftErrors = False
, optDynuFile = Nothing
, optMonD = False
, optMonDFile = Nothing
, optMonDXen = False
+ , optMonDKvmRSS = False
, optMonDExitMissing = False
+ , optMemWeight = 1.0
, optEvacMode = False
, optRestrictedMigrate = False
, optExInst = []
@@ -340,6 +357,16 @@
\ thus allowing only the 'cheap' failover/migrate operations",
OptComplNone)
+oAvoidDiskMoves :: OptType
+oAvoidDiskMoves =
+ (Option "" ["avoid-disk-moves"]
+ (reqWithConversion (tryRead "disk moves avoiding factor")
+ (\f opts -> Ok opts { optAvoidDiskMoves = f }) "FACTOR")
+ "gain in cluster metrics on each balancing step including disk moves\
+ \ should be FACTOR times higher than the gain after migrations in order to\
+ \ admit a disk move during the step",
+ OptComplFloat)
+
oMonD :: OptType
oMonD =
(Option "" ["mond"]
@@ -363,6 +390,21 @@
"also consider xen-specific collectors in MonD queries",
OptComplNone)
+oMonDKvmRSS :: OptType
+oMonDKvmRSS =
+ (Option "" ["mond-kvm-rss"]
+ (NoArg (\ opts -> Ok opts { optMonDKvmRSS = True }))
+ "also consider resident-set-size data for kvm instances via MonD",
+ OptComplNone)
+
+oMemWeight :: OptType
+oMemWeight =
+ (Option "" ["mem-weight"]
+ (reqWithConversion (tryRead "memory weight factor")
+ (\ f opts -> Ok opts { optMemWeight = f }) "FACTOR")
+ "Rescale the weight of the memory utilization by the given factor",
+ OptComplFloat)
+
oMonDExitMissing :: OptType
oMonDExitMissing =
(Option "" ["exit-on-missing-mond-data"]
@@ -420,6 +462,13 @@
"Ignore any dynamic utilisation information",
OptComplNone)
+oIdleDefault :: OptType
+oIdleDefault =
+ (Option "" ["idle-default"]
+ (NoArg (\ opts -> Ok opts {optIdleDefault = True}))
+ "Assume idleness for any non-available dynamic utilisation data",
+ OptComplNone)
+
oIgnoreSoftErrors :: OptType
oIgnoreSoftErrors =
(Option "" ["ignore-soft-errors"]
diff --git a/src/Ganeti/HTools/Cluster.hs b/src/Ganeti/HTools/Cluster.hs
index 8e4327c..65746fd 100644
--- a/src/Ganeti/HTools/Cluster.hs
+++ b/src/Ganeti/HTools/Cluster.hs
@@ -82,11 +82,22 @@
, findSplitInstances
) where
-import Control.Applicative ((<$>), liftA2)
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (liftA2)
import Control.Arrow ((&&&))
import Control.Monad (unless)
import qualified Data.IntSet as IntSet
-import Data.List
+import qualified Data.Set as Set
+import Data.List ( nub
+ , sortBy
+ , foldl'
+ , intersect
+ , partition
+ , (\\)
+ , sort
+ , intercalate)
import Data.Maybe (fromJust, fromMaybe, isJust, isNothing)
import Data.Ord (comparing)
import Text.Printf (printf)
@@ -334,61 +345,65 @@
upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
in compareTables cur_tbl upd_tbl
--- | Given the status of the current secondary as a valid new node and
--- the current candidate target node, generate the possible moves for
--- a instance.
-possibleMoves :: MirrorType -- ^ The mirroring type of the instance
- -> Bool -- ^ Whether the secondary node is a valid new node
- -> Bool -- ^ Whether we can change the primary node
- -> Bool -- ^ Whether we alowed to move disks
- -> (Bool, Bool) -- ^ Whether migration is restricted and whether
- -- the instance primary is offline
- -> Ndx -- ^ Target node candidate
- -> [IMove] -- ^ List of valid result moves
+-- | Generate all possible migration moves of an instance given some
+-- additional parameters
+migrationMoves :: MirrorType -- ^ The mirroring type of the instance
+ -> Bool -- ^ Whether the secondary node is active
+ -> [Ndx] -- ^ Target node candidate list
+ -> [IMove] -- ^ List of valid result moves
+migrationMoves MirrorNone _ _ = []
+migrationMoves MirrorInternal False _ = []
+migrationMoves MirrorInternal True _ = [Failover]
+migrationMoves MirrorExternal _ nodes_idx = map FailoverToAny nodes_idx
-possibleMoves MirrorNone _ _ _ _ _ = []
+-- | Generate all possible disk moves (complex instance moves consist of disk
+-- moves and maybe migrations) of an instance given some additional parameters
+diskMoves :: MirrorType -- ^ The mirroring type of the instance
+ -> Bool -- ^ Whether the secondary node is a valid new node
+ -> Bool -- ^ Whether we can change the primary node
+ -> (Bool, Bool) -- ^ Whether migration is restricted and whether
+ -- the instance primary is offline
+ -> [Ndx] -- ^ Target node candidates list
+ -> [IMove] -- ^ List of valid result moves
+diskMoves MirrorNone _ _ _ _ = []
+diskMoves MirrorExternal _ _ _ _ = []
+diskMoves MirrorInternal valid_sec inst_moves restr nodes_idx =
+ concatMap (intMirrSingleDiskMove valid_sec inst_moves restr) nodes_idx
+ where
+ intMirrSingleDiskMove _ False _ tdx =
+ [ReplaceSecondary tdx]
-possibleMoves MirrorExternal _ False _ _ _ = []
+ intMirrSingleDiskMove _ _ (True, False) tdx =
+ [ReplaceSecondary tdx]
-possibleMoves MirrorExternal _ True _ _ tdx =
- [ FailoverToAny tdx ]
+ intMirrSingleDiskMove True True (False, _) tdx =
+ [ ReplaceSecondary tdx
+ , ReplaceAndFailover tdx
+ , ReplacePrimary tdx
+ , FailoverAndReplace tdx
+ ]
-possibleMoves MirrorInternal _ _ False _ _ = []
+ intMirrSingleDiskMove True True (True, True) tdx =
+ [ ReplaceSecondary tdx
+ , ReplaceAndFailover tdx
+ , FailoverAndReplace tdx
+ ]
-possibleMoves MirrorInternal _ False True _ tdx =
- [ ReplaceSecondary tdx ]
+ intMirrSingleDiskMove False True _ tdx =
+ [ ReplaceSecondary tdx
+ , ReplaceAndFailover tdx
+ ]
-possibleMoves MirrorInternal _ _ True (True, False) tdx =
- [ ReplaceSecondary tdx
- ]
-
-possibleMoves MirrorInternal True True True (False, _) tdx =
- [ ReplaceSecondary tdx
- , ReplaceAndFailover tdx
- , ReplacePrimary tdx
- , FailoverAndReplace tdx
- ]
-
-possibleMoves MirrorInternal True True True (True, True) tdx =
- [ ReplaceSecondary tdx
- , ReplaceAndFailover tdx
- , FailoverAndReplace tdx
- ]
-
-possibleMoves MirrorInternal False True True _ tdx =
- [ ReplaceSecondary tdx
- , ReplaceAndFailover tdx
- ]
-- | Compute the best move for a given instance.
checkInstanceMove :: AlgorithmOptions -- ^ Algorithmic options for balancing
-> [Ndx] -- ^ Allowed target node indices
-> Table -- ^ Original table
-> Instance.Instance -- ^ Instance to move
- -> Table -- ^ Best new table for this instance
+ -> (Table, Table) -- ^ Pair of best new tables:
+ -- migrations only and with disk moves
checkInstanceMove opts nodes_idx ini_tbl@(Table nl _ _ _) target =
let force = algIgnoreSoftErrors opts
- disk_moves = algDiskMoves opts
inst_moves = algInstanceMoves opts
rest_mig = algRestrictedMigration opts
opdx = Instance.pNode target
@@ -397,19 +412,23 @@
nodes = filter (`notElem` bad_nodes) nodes_idx
mir_type = Instance.mirrorType target
use_secondary = elem osdx nodes_idx && inst_moves
- aft_failover = if mir_type == MirrorInternal && use_secondary
- -- if drbd and allowed to failover
- then checkSingleStep force ini_tbl target ini_tbl
- Failover
- else ini_tbl
primary_drained = Node.offline
. flip Container.find nl
$ Instance.pNode target
- all_moves = concatMap (possibleMoves mir_type use_secondary inst_moves
- disk_moves (rest_mig, primary_drained)) nodes
- in
- -- iterate over the possible nodes for this instance
- foldl' (checkSingleStep force ini_tbl target) aft_failover all_moves
+
+ migrations = migrationMoves mir_type use_secondary nodes
+ disk_moves = diskMoves mir_type use_secondary inst_moves
+ (rest_mig, primary_drained) nodes
+
+ -- iterate over the possible nodes and migrations for this instance
+ best_migr_tbl =
+ if inst_moves
+ then foldl' (checkSingleStep force ini_tbl target) ini_tbl migrations
+ else ini_tbl
+ -- iterate over the possible moves for this instance
+ best_tbl =
+ foldl' (checkSingleStep force ini_tbl target) best_migr_tbl disk_moves
+ in (best_migr_tbl, best_tbl)
-- | Compute the best next move.
checkMove :: AlgorithmOptions -- ^ Algorithmic options for balancing
@@ -417,27 +436,32 @@
-> Table -- ^ The current solution
-> [Instance.Instance] -- ^ List of instances still to move
-> Table -- ^ The new solution
-checkMove opts nodes_idx ini_tbl victims =
- let Table _ _ _ ini_plc = ini_tbl
+checkMove opts nodes_idx ini_tbl@(Table _ _ ini_cv _) victims =
+ let disk_moves = algDiskMoves opts
+ disk_moves_f = algDiskMovesFactor opts
-- we're using rwhnf from the Control.Parallel.Strategies
-- package; we don't need to use rnf as that would force too
-- much evaluation in single-threaded cases, and in
-- multi-threaded case the weak head normal form is enough to
-- spark the evaluation
- tables = parMap rwhnf (checkInstanceMove opts nodes_idx ini_tbl)
- victims
+ table_pairs = parMap rwhnf (checkInstanceMove opts nodes_idx ini_tbl)
+ victims
+
-- iterate over all instances, computing the best move
- best_tbl = foldl' compareTables ini_tbl tables
- Table _ _ _ best_plc = best_tbl
- in if length best_plc == length ini_plc
- then ini_tbl -- no advancement
- else best_tbl
+ best_migr_tbl@(Table _ _ best_migr_cv _) =
+ foldl' compareTables ini_tbl $ map fst table_pairs
+ best_tbl@(Table _ _ best_cv _) =
+ foldl' compareTables ini_tbl $ map snd table_pairs
+ in if not disk_moves
+ || ini_cv - best_cv <= (ini_cv - best_migr_cv) * disk_moves_f
+ then best_migr_tbl
+ else best_tbl -- best including disk moves
-- | Check if we are allowed to go deeper in the balancing.
doNextBalance :: Table -- ^ The starting table
-> Int -- ^ Remaining length
-> Score -- ^ Score at which to stop
- -> Bool -- ^ The resulting table and commands
+ -> Bool -- ^ True if we can continue
doNextBalance ini_tbl max_rounds min_score =
let Table _ _ ini_cv ini_plc = ini_tbl
ini_plc_len = length ini_plc
@@ -463,7 +487,13 @@
reloc_inst = filter (\i -> Instance.movable i &&
Instance.autoBalance i) all_inst'
node_idx = map Node.idx online_nodes
- fin_tbl = checkMove opts node_idx ini_tbl reloc_inst
+ allowed_node = maybe (const True) (flip Set.member)
+ $ algAllowedNodes opts
+ good_nidx = filter allowed_node node_idx
+ allowed_inst = liftA2 (&&) (allowed_node . Instance.pNode)
+ (liftA2 (||) allowed_node (< 0) . Instance.sNode)
+ good_reloc_inst = filter allowed_inst reloc_inst
+ fin_tbl = checkMove opts good_nidx ini_tbl good_reloc_inst
(Table _ _ fin_cv _) = fin_tbl
in
if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
@@ -841,6 +871,39 @@
of x:_ -> Just . snd $ x
_ -> Nothing
+-- | For a failure determine the underlying resource that most likely
+-- causes this kind of failure. In particular, N+1 violations are most
+-- likely caused by lack of memory.
+underlyingCause :: FailMode -> FailMode
+underlyingCause FailN1 = FailMem
+underlyingCause x = x
+
+-- | Shrink a resource of an instance until the failure statistics for
+-- this resource changes. Note that it might no be possible to allocate
+-- an instance at this size; nevertheless there might be a need to change
+-- the resource to shrink on, e.g., if the current instance is too big on
+-- two resources.
+doShrink :: (Instance.Instance -> AllocSolution) -> Instance.Instance
+ -> FailMode -> Maybe Instance.Instance
+doShrink allocFn inst fm =
+ let physRes = underlyingCause fm
+ getCount = runListHead 0 snd . filter ((==) physRes . fst)
+ . collapseFailures . map underlyingCause . asFailures
+ initialStat = getCount $ allocFn inst
+ hasChanged = ((/=) initialStat . getCount . fst)
+ -- as the list of possible shrinks can be quite long, and, moreover,
+ -- has some cost of computing it, our heuristic is to look into it
+ -- only for a limited range; only once the list is shorter, we do
+ -- binary search.
+ lookAhead = 50
+ heuristics xs = if null (drop lookAhead xs)
+ then length xs `div` 2
+ else lookAhead
+ in fmap snd
+ . monotoneFind heuristics hasChanged
+ . map (allocFn &&& id)
+ $ iterateOk (`Instance.shrinkByType` physRes) inst
+
-- | Tiered allocation method.
--
-- This places instances on the cluster, and decreases the spec until
@@ -857,21 +920,20 @@
Nothing -> (False, Nothing)
Just n -> (n <= ixes_cnt,
Just (n - ixes_cnt))
- sortedErrs = map fst $ sortBy (comparing snd) errs
- suffShrink = sufficesShrinking
- (fromMaybe emptyAllocSolution
- . flip (tryAlloc opts nl' il') allocnodes)
- newinst
- bigSteps = filter isJust . map suffShrink . reverse $ sortedErrs
+ sortedErrs = nub . map (underlyingCause . fst)
+ $ sortBy (flip $ comparing snd) errs
+ allocFn = fromMaybe emptyAllocSolution
+ . flip (tryAlloc opts nl' il') allocnodes
+ suffShrink = sufficesShrinking allocFn newinst
+ bigSteps = filter isJust . map suffShrink $ drop 1 sortedErrs
progress (Ok (_, _, _, newil', _)) (Ok (_, _, _, newil, _)) =
length newil' > length newil
progress _ _ = False
in if stop then newsol else
- let newsol' = case Instance.shrinkByType newinst . last
- $ sortedErrs of
- Bad _ -> newsol
- Ok newinst' -> tieredAlloc opts nl' il' newlimit
- newinst' allocnodes ixes' cstats'
+ let newsol' = case map (doShrink allocFn newinst) sortedErrs of
+ Just newinst' : _ -> tieredAlloc opts nl' il' newlimit
+ newinst' allocnodes ixes' cstats'
+ _ -> newsol
in if progress newsol' newsol then newsol' else
case bigSteps of
Just newinst':_ -> tieredAlloc opts nl' il' newlimit
diff --git a/src/Ganeti/HTools/Cluster/AllocatePrimitives.hs b/src/Ganeti/HTools/Cluster/AllocatePrimitives.hs
index 3e90e02..f8e9aa9 100644
--- a/src/Ganeti/HTools/Cluster/AllocatePrimitives.hs
+++ b/src/Ganeti/HTools/Cluster/AllocatePrimitives.hs
@@ -39,14 +39,14 @@
import Ganeti.HTools.AlgorithmParams (AlgorithmOptions(..))
import Ganeti.HTools.Cluster.AllocationSolution (AllocElement)
-import Ganeti.HTools.Cluster.Metrics ( compCV, compCVfromStats
+import Ganeti.HTools.Cluster.Metrics ( ClusterStatistics, compCV
+ , compCVfromStats
, updateClusterStatisticsTwice)
import Ganeti.HTools.Cluster.Moves (setInstanceLocationScore)
import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Instance as Instance
import qualified Ganeti.HTools.Node as Node
import Ganeti.HTools.Types
-import Ganeti.Utils.Statistics
-- | Tries to allocate an instance on one given node.
allocateOnSingle :: AlgorithmOptions
@@ -65,7 +65,7 @@
-- | Tries to allocate an instance on a given pair of nodes.
allocateOnPair :: AlgorithmOptions
- -> [Statistics]
+ -> ClusterStatistics
-> Node.List -> Instance.Instance -> Ndx -> Ndx
-> OpResult AllocElement
allocateOnPair opts stats nl inst new_pdx new_sdx =
diff --git a/src/Ganeti/HTools/Cluster/Metrics.hs b/src/Ganeti/HTools/Cluster/Metrics.hs
index a1681ee..ff1662a 100644
--- a/src/Ganeti/HTools/Cluster/Metrics.hs
+++ b/src/Ganeti/HTools/Cluster/Metrics.hs
@@ -1,3 +1,5 @@
+{-# LANGUAGE TemplateHaskell #-}
+
{-| Implementation of the cluster metric
-}
@@ -33,7 +35,8 @@
-}
module Ganeti.HTools.Cluster.Metrics
- ( compCV
+ ( ClusterStatistics
+ , compCV
, compCVfromStats
, compCVNodes
, compClusterStatistics
@@ -42,172 +45,24 @@
, printStats
) where
-import Control.Monad (guard)
-import Data.List (partition, transpose)
-import Data.Maybe (fromMaybe)
-import Text.Printf (printf)
-
import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Node as Node
-import qualified Ganeti.HTools.PeerMap as P
-import Ganeti.HTools.Types
-import Ganeti.Utils (printTable)
-import Ganeti.Utils.Statistics
+import qualified Ganeti.HTools.Cluster.MetricsComponents as M
+import Ganeti.HTools.Cluster.MetricsTH
--- | Coefficient for the total reserved memory in the cluster metric. We
--- use a (local) constant here, as it is also used in the computation of
--- the best possible cluster score.
-reservedMemRtotalCoeff :: Double
-reservedMemRtotalCoeff = 0.25
-
--- | The names and weights of the individual elements in the CV list, together
--- with their statistical accumulation function and a bit to decide whether it
--- is a statistics for online nodes.
-detailedCVInfoExt :: [((Double, String)
- , ([AggregateComponent] -> Statistics, Bool))]
-detailedCVInfoExt = [ ((0.5, "free_mem_cv"), (getStdDevStatistics, True))
- , ((0.5, "free_disk_cv"), (getStdDevStatistics, True))
- , ((1, "n1_cnt"), (getSumStatistics, True))
- , ((1, "reserved_mem_cv"), (getStdDevStatistics, True))
- , ((4, "offline_all_cnt"), (getSumStatistics, False))
- , ((16, "offline_pri_cnt"), (getSumStatistics, False))
- , ( (0.5, "vcpu_ratio_cv")
- , (getStdDevStatistics, True))
- , ((1, "cpu_load_cv"), (getStdDevStatistics, True))
- , ((1, "mem_load_cv"), (getStdDevStatistics, True))
- , ((1, "disk_load_cv"), (getStdDevStatistics, True))
- , ((1, "net_load_cv"), (getStdDevStatistics, True))
- , ((2, "pri_tags_score"), (getSumStatistics, True))
- , ((0.5, "spindles_cv"), (getStdDevStatistics, True))
- , ((0.5, "free_mem_cv_forth"), (getStdDevStatistics, True))
- , ( (0.5, "free_disk_cv_forth")
- , (getStdDevStatistics, True))
- , ( (0.5, "vcpu_ratio_cv_forth")
- , (getStdDevStatistics, True))
- , ((0.5, "spindles_cv_forth"), (getStdDevStatistics, True))
- , ((1, "location_score"), (getSumStatistics, True))
- , ( (1, "location_exclusion_score")
- , (getMapStatistics, True))
- , ( (reservedMemRtotalCoeff, "reserved_mem_rtotal")
- , (getSumStatistics, True))
- ]
-
--- | Compute the lower bound of the cluster score, i.e., the sum of the minimal
--- values for all cluster score values that are not 0 on a perfectly balanced
--- cluster.
-optimalCVScore :: Node.List -> Double
-optimalCVScore nodelist = fromMaybe 0 $ do
- let nodes = Container.elems nodelist
- guard $ length nodes > 1
- let nodeMems = map Node.tMem nodes
- totalMem = sum nodeMems
- totalMemOneLessNode = totalMem - maximum nodeMems
- guard $ totalMemOneLessNode > 0
- let totalDrbdMem = fromIntegral . sum $ map (P.sumElems . Node.peers) nodes
- optimalUsage = totalDrbdMem / totalMem
- optimalUsageOneLessNode = totalDrbdMem / totalMemOneLessNode
- relativeReserved = optimalUsageOneLessNode - optimalUsage
- return $ reservedMemRtotalCoeff * relativeReserved
-
--- | The names and weights of the individual elements in the CV list.
-detailedCVInfo :: [(Double, String)]
-detailedCVInfo = map fst detailedCVInfoExt
-
--- | Holds the weights used by 'compCVNodes' for each metric.
-detailedCVWeights :: [Double]
-detailedCVWeights = map fst detailedCVInfo
-
--- | The aggregation functions for the weights
-detailedCVAggregation :: [([AggregateComponent] -> Statistics, Bool)]
-detailedCVAggregation = map snd detailedCVInfoExt
-
--- | The bit vector describing which parts of the statistics are
--- for online nodes.
-detailedCVOnlineStatus :: [Bool]
-detailedCVOnlineStatus = map snd detailedCVAggregation
-
--- | Compute statistical measures of a single node.
-compDetailedCVNode :: Node.Node -> [AggregateComponent]
-compDetailedCVNode node =
- let mem = Node.pMem node
- memF = Node.pMemForth node
- dsk = Node.pDsk node
- dskF = Node.pDskForth node
- n1 = fromIntegral
- $ if Node.failN1 node
- then length (Node.sList node) + length (Node.pList node)
- else 0
- res = Node.pRem node
- ipri = fromIntegral . length $ Node.pList node
- isec = fromIntegral . length $ Node.sList node
- ioff = ipri + isec
- cpu = Node.pCpuEff node
- cpuF = Node.pCpuEffForth node
- DynUtil c1 m1 d1 nn1 = Node.utilLoad node
- DynUtil c2 m2 d2 nn2 = Node.utilPool node
- (c_load, m_load, d_load, n_load) = (c1/c2, m1/m2, d1/d2, nn1/nn2)
- pri_tags = fromIntegral $ Node.conflictingPrimaries node
- spindles = Node.instSpindles node / Node.hiSpindles node
- spindlesF = Node.instSpindlesForth node / Node.hiSpindles node
- location_score = fromIntegral $ Node.locationScore node
- location_exclusion_score = Node.instanceMap node
- in [ SimpleNumber mem, SimpleNumber dsk, SimpleNumber n1, SimpleNumber res
- , SimpleNumber ioff, SimpleNumber ipri, SimpleNumber cpu
- , SimpleNumber c_load, SimpleNumber m_load, SimpleNumber d_load
- , SimpleNumber n_load
- , SimpleNumber pri_tags, SimpleNumber spindles
- , SimpleNumber memF, SimpleNumber dskF, SimpleNumber cpuF
- , SimpleNumber spindlesF
- , SimpleNumber location_score
- , SpreadValues location_exclusion_score
- , SimpleNumber res
- ]
-
--- | Compute the statistics of a cluster.
-compClusterStatistics :: [Node.Node] -> [Statistics]
-compClusterStatistics all_nodes =
- let (offline, nodes) = partition Node.offline all_nodes
- offline_values = transpose (map compDetailedCVNode offline)
- ++ repeat []
- -- transpose of an empty list is empty and not k times the empty list, as
- -- would be the transpose of a 0 x k matrix
- online_values = transpose $ map compDetailedCVNode nodes
- aggregate (f, True) (onNodes, _) = f onNodes
- aggregate (f, False) (_, offNodes) = f offNodes
- in zipWith aggregate detailedCVAggregation
- $ zip online_values offline_values
-
--- | Update a cluster statistics by replacing the contribution of one
--- node by that of another.
-updateClusterStatistics :: [Statistics]
- -> (Node.Node, Node.Node) -> [Statistics]
-updateClusterStatistics stats (old, new) =
- let update = zip (compDetailedCVNode old) (compDetailedCVNode new)
- online = not $ Node.offline old
- updateStat forOnline stat upd = if forOnline == online
- then updateStatistics stat upd
- else stat
- in zipWith3 updateStat detailedCVOnlineStatus stats update
+$(declareStatistics M.metricComponents)
-- | Update a cluster statistics twice.
-updateClusterStatisticsTwice :: [Statistics]
+updateClusterStatisticsTwice :: ClusterStatistics
-> (Node.Node, Node.Node)
-> (Node.Node, Node.Node)
- -> [Statistics]
+ -> ClusterStatistics
updateClusterStatisticsTwice s a =
updateClusterStatistics (updateClusterStatistics s a)
--- | Compute cluster statistics
-compDetailedCV :: [Node.Node] -> [Double]
-compDetailedCV = map getStatisticValue . compClusterStatistics
-
--- | Compute the cluster score from its statistics
-compCVfromStats :: [Statistics] -> Double
-compCVfromStats = sum . zipWith (*) detailedCVWeights . map getStatisticValue
-
--- | Compute the /total/ variance.
+-- | Compute the total cluster score given the nodes.
compCVNodes :: [Node.Node] -> Double
-compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV
+compCVNodes = compCVfromStats . compClusterStatistics
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
compCV :: Node.List -> Double
@@ -215,14 +70,5 @@
-- | Shows statistics for a given node list.
printStats :: String -> Node.List -> String
-printStats lp nl =
- let dcvs = compDetailedCV $ Container.elems nl
- (weights, names) = unzip detailedCVInfo
- hd = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
- header = [ "Field", "Value", "Weight" ]
- formatted = map (\(w, h, val) ->
- [ h
- , printf "%.8f" val
- , printf "x%.2f" w
- ]) hd
- in printTable lp header formatted $ False:repeat True
+printStats lp =
+ showClusterStatistics lp . compClusterStatistics . Container.elems
diff --git a/src/Ganeti/HTools/Cluster/MetricsComponents.hs b/src/Ganeti/HTools/Cluster/MetricsComponents.hs
new file mode 100644
index 0000000..85f20ee
--- /dev/null
+++ b/src/Ganeti/HTools/Cluster/MetricsComponents.hs
@@ -0,0 +1,171 @@
+{-# LANGUAGE TemplateHaskell #-}
+
+{-| Module describing cluster metrics components.
+
+ Metrics components are used for generation of functions dealing with cluster
+ statistics.
+
+-}
+
+{-
+
+Copyright (C) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.HTools.Cluster.MetricsComponents
+ ( metricComponents
+ ) where
+
+
+import Control.Monad (guard)
+import Data.Maybe (fromMaybe)
+import Language.Haskell.TH
+
+import Ganeti.HTools.Cluster.MetricsTH (MetricComponent(..))
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Node as Node
+import qualified Ganeti.HTools.PeerMap as P
+import Ganeti.HTools.Types
+import Ganeti.Utils.Statistics
+
+-- | Type alias decreasing table size below
+type D = Double
+
+-- | List containing all currently enabled cluster metrics components
+metricComponents :: [MetricComponent]
+metricComponents =
+ [ stdDevComp "free_mem_cv" [| 0.5 :: D |] True [| Node.pMem |]
+ , stdDevComp "free_disk_cv" [| 0.5 :: D |] True [| Node.pDsk |]
+ , stdDevComp "vcpu_ratio_cv" [| 0.5 :: D |] True
+ [| Node.pCpuEff |]
+ , sumComp "spindles_cv" [| 0.5 :: D |] True
+ [| \n -> Node.instSpindles n / Node.hiSpindles n |]
+ , sumComp "fail_n1" [| 0.5 :: D |] True
+ [| \n -> if Node.failN1 n
+ then toDouble $ length (Node.sList n) + length (Node.pList n)
+ else 0 |]
+ , stdDevComp "reserved_mem_cv" [| 1 :: D |] True [| Node.pRem |]
+ , sumComp "offline_all_cnt" [| 4 :: D |] False
+ [| \n -> toDouble $ length (Node.pList n) + length (Node.sList n) |]
+ , sumComp "offline_pri_cnt" [| 16 :: D |] False
+ [| toDouble . length . Node.pList |]
+ , stdDevComp "cpu_load_cv" [| 1 :: D |] True
+ [| \n -> let DynUtil c1 _ _ _ = Node.utilLoad n
+ DynUtil c2 _ _ _ = Node.utilPool n
+ in c1/c2 |]
+ , stdDevComp "mem_load_cv" [| 1 :: D |] True
+ [| \n -> let DynUtil _ m1 _ _ = Node.utilLoad n
+ DynUtil _ m2 _ _ = Node.utilPool n
+ in m1/m2 |]
+ , stdDevComp "disk_load_cv" [| 1 :: D |] True
+ [| \n -> let DynUtil _ _ d1 _ = Node.utilLoad n
+ DynUtil _ _ d2 _ = Node.utilPool n
+ in d1/d2 |]
+ , stdDevComp "net_load_cv" [| 1 :: D |] True
+ [| \n -> let DynUtil _ _ _ n1 = Node.utilLoad n
+ DynUtil _ _ _ n2 = Node.utilPool n
+ in n1/n2 |]
+ , sumComp "pri_tags_score" [| 2 :: D |] True
+ [| toDouble . Node.conflictingPrimaries |]
+ , sumComp "location_score" [| 1 :: D |] True
+ [| toDouble . Node.locationScore |]
+ , mapComp "location_exclusion_score" [| 0.5 :: D |] True
+ [| MapData . Node.instanceMap |]
+ , stdDevComp "free_mem_cv_forth" [| 0.5 :: D |] True
+ [| Node.pMemForth |]
+ , stdDevComp "free_disk_cv_forth" [| 0.5 :: D |] True
+ [| Node.pDskForth |]
+ , stdDevComp "vcpu_ratio_cv_forth" [| 0.5 :: D |] True
+ [| Node.pCpuEffForth |]
+ , sumComp "spindles_cv_forth" [| 0.5 :: D |] True
+ [| \n -> Node.instSpindlesForth n / Node.hiSpindles n |]
+ , reservedMemRTotal
+ ]
+
+-- | Function to be used as a short MetricComponent constructor for SumStat.
+sumComp :: String -> ExpQ -> Bool -> ExpQ -> MetricComponent
+sumComp nm w on f = MetricComponent { name = nm
+ , weight = w
+ , fromNode = f
+ , fromNodeType = [t| Double |]
+ , statisticsType = [t| SumStat |]
+ , forOnlineNodes = on
+ , optimalValue = Nothing
+ }
+
+-- | Function to be used as a short MetricComponent constructor for StdDevStat.
+stdDevComp :: String -> ExpQ -> Bool -> ExpQ -> MetricComponent
+stdDevComp nm w on f = MetricComponent { name = nm
+ , weight = w
+ , fromNode = f
+ , fromNodeType = [t| Double |]
+ , statisticsType = [t| StdDevStat |]
+ , forOnlineNodes = on
+ , optimalValue = Nothing
+ }
+
+-- | Function to be used as a short MetricComponent constructor for MapStat.
+mapComp :: String -> ExpQ -> Bool -> ExpQ -> MetricComponent
+mapComp nm w on f = MetricComponent { name = nm
+ , weight = w
+ , fromNode = f
+ , fromNodeType = [t| MapData |]
+ , statisticsType = [t| MapStat |]
+ , forOnlineNodes = on
+ , optimalValue = Nothing
+ }
+
+-- | Weight of reservedMemRTotal component
+wReservedMemRTotal :: Double
+wReservedMemRTotal = 0.25
+
+reservedMemRTotal :: MetricComponent
+reservedMemRTotal = MetricComponent
+ { name = "reserved_mem_rtotal"
+ , weight = [| wReservedMemRTotal :: D |]
+ , fromNode = [| Node.pRem |]
+ , fromNodeType = [t| Double |]
+ , statisticsType = [t| SumStat |]
+ , forOnlineNodes = True
+ , optimalValue = Just [| reservedMemRTotalOptValue |]
+ }
+
+-- | Computes the theoretical optimal value for reservedMemRTotal component
+reservedMemRTotalOptValue :: Node.List -> Double
+reservedMemRTotalOptValue nodelist = fromMaybe 0 $ do
+ let nodes = Container.elems nodelist
+ guard $ length nodes > 1
+ let nodeMems = map Node.tMem nodes
+ totalMem = sum nodeMems
+ totalMemOneLessNode = totalMem - maximum nodeMems
+ guard $ totalMemOneLessNode > 0
+ let totalDrbdMem = fromIntegral . sum $ map (P.sumElems . Node.peers) nodes
+ optimalUsage = totalDrbdMem / totalMem
+ optimalUsageOneLessNode = totalDrbdMem / totalMemOneLessNode
+ relativeReserved = optimalUsageOneLessNode - optimalUsage
+ return $ wReservedMemRTotal * relativeReserved
diff --git a/src/Ganeti/HTools/Cluster/MetricsTH.hs b/src/Ganeti/HTools/Cluster/MetricsTH.hs
new file mode 100644
index 0000000..1e2265f
--- /dev/null
+++ b/src/Ganeti/HTools/Cluster/MetricsTH.hs
@@ -0,0 +1,263 @@
+{-# LANGUAGE TemplateHaskell #-}
+
+{-| Declaration of the datatypes and functions dealing with cluster metrics
+ generated by template haskell.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+
+module Ganeti.HTools.Cluster.MetricsTH
+ ( MetricComponent(..)
+ , declareStatistics
+ ) where
+
+import Data.List (partition)
+import Data.Maybe (mapMaybe)
+import Language.Haskell.TH
+import Text.Printf (printf)
+
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.Utils (printTable)
+import Ganeti.Utils.Statistics
+
+-- | Data type describing the metric component. The information provided by
+-- this data type is used to generate statistics data types and functions
+-- dealing with them
+data MetricComponent = MetricComponent
+ { name :: String -- ^ The component name
+ , weight :: Q Exp -- ^ The component weight in the statistics sum
+ , fromNode :: Q Exp -- ^ Quasi quoted function obtaining spread value
+ -- from a node given (Node.Node -> fromNodeType)
+ , fromNodeType :: Q Type -- ^ Quasi quoted spread value type
+ , statisticsType :: Q Type -- ^ Quasi quoted statistics data type. Stat
+ -- instance for fromNodeType and statisticsType
+ -- should be defined
+ , forOnlineNodes :: Bool -- ^ Whether this component should be calculated
+ -- for online or offline nodes
+ , optimalValue :: Maybe ExpQ -- ^ Maybe quasi quoted function obtaining
+ -- optimal value of such component
+ -- (Node.List -> Double)
+ }
+
+-- | Declares all functions and data types implemented in template haskell
+declareStatistics :: [MetricComponent] -> Q [Dec]
+declareStatistics components = do
+ nodeValues <- nodeValuesDecl components
+ getNodeValues <- getNodeValuesDecl components
+ clusterStatistics <- clusterStatisticsDecl components
+ compClusterStatistics <- compClusterStatisticsDecl components
+ updateClusterStatistics <- updateClusterStatisticsDecl components
+ compCVfromStats <- compCVfromStatsDecl components
+ showClusterStatistics <- showClusterStatisticsDecl components
+ optimalCVScore <- optimalCVScoreDecl components
+ return $ nodeValues ++ getNodeValues ++ clusterStatistics ++
+ compClusterStatistics ++ updateClusterStatistics ++
+ compCVfromStats ++ showClusterStatistics ++
+ optimalCVScore
+
+-- | Helper function constructing VarStringTypeQ
+getVarStrictTypeQ :: (String, Q Type) -> VarStrictTypeQ
+getVarStrictTypeQ (n, t) = do
+ t' <- t
+ return (mkName n, NotStrict, t')
+
+-- | Function constructs NodeValues data type for metric components given.
+-- The data type is used to store all spread values of one Node.
+nodeValuesDecl :: [MetricComponent] -> Q [Dec]
+nodeValuesDecl components = do
+ let names = map (("nv_" ++ ) . name ) components
+ types = map fromNodeType components
+ strict_types <- mapM getVarStrictTypeQ $ zip names types
+ return [DataD [] (mkName "NodeValues") []
+ [RecC (mkName "NodeValues") strict_types] []]
+
+-- | Function constructs ClusterStatistics data type for metric components
+-- given. The data type is used to store all Statistics constructed from the
+-- [NodeValues].
+clusterStatisticsDecl :: [MetricComponent] -> Q [Dec]
+clusterStatisticsDecl components = do
+ let names = map (("cs_" ++ ) . name ) components
+ types = map statisticsType components
+ strict_types <- mapM getVarStrictTypeQ $ zip names types
+ return [DataD [] (mkName "ClusterStatistics") []
+ [RecC (mkName "ClusterStatistics") strict_types] []]
+
+-- | Generates (getNodeValues :: Node.Node -> NodeValues) declaration for
+-- metric components given. The function constructs NodeValues by calling
+-- fromNode function for each metrics component.
+getNodeValuesDecl :: [MetricComponent] -> Q [Dec]
+getNodeValuesDecl components = do
+ extract_functions <- mapM fromNode components
+ x <- newName "node"
+ node_t <- [t| Node.Node |]
+ let names = map (mkName . ("nv_" ++) . name) components
+ values = map (\f -> AppE f (VarE x)) extract_functions
+ body_exp = RecConE (mkName "NodeValues") $ zip names values
+ fname = mkName "getNodeValues"
+ nv_t = ConT $ mkName "NodeValues"
+ sig_d = SigD fname (ArrowT `AppT` node_t `AppT` nv_t)
+ fun_d = FunD fname [Clause [VarP x] (NormalB body_exp) []]
+ return [sig_d, fun_d]
+
+-- | Helper function passing two arguments to a function
+appTwice :: Q Exp -> Q Exp -> Q Exp -> Q Exp
+appTwice fun arg1 = appE $ appE fun arg1
+
+-- | Helper function constructing Q (Name, Exp)
+getQNameExp :: String -> Q Exp -> Q (Name, Exp)
+getQNameExp n e = do
+ e' <- e
+ return (mkName n, e')
+
+-- | Generates (compClusterStatisticsHelper :: [Node.Node] ->
+-- ClusterStatistics) declaration for metric components given. The function
+-- constructs ClusterStatistics by calling calculate function for each spread
+-- values list. Spread values lists are obtained by getNodeValues call.
+compClusterStatisticsDecl :: [MetricComponent] -> Q [Dec]
+compClusterStatisticsDecl components = do
+ nl_i <- newName "nl"
+ let splitted = appTwice [| partition |] [| Node.offline |] (varE nl_i)
+ (nl_off, nl_on) = (appE [| fst |] splitted, appE [| snd |] splitted)
+ (online, offline) = partition forOnlineNodes components
+ nv_f nm = varE . mkName $ "nv_" ++ nm
+ nvl_f = appTwice [| map |] (varE (mkName "getNodeValues"))
+ nv_field nm = appTwice [| map |] $ nv_f nm
+ cs_field nm nvl = appE [| calculate |] $ nv_field nm nvl
+ (online_names, offline_names) = (map name online, map name offline)
+ offline_f = map (\nm -> getQNameExp ("cs_" ++ nm) .
+ cs_field nm $ nvl_f nl_off) offline_names
+ online_f = map (\nm -> getQNameExp ("cs_" ++ nm) .
+ cs_field nm $ nvl_f nl_on ) online_names
+ body = recConE (mkName "ClusterStatistics") $ offline_f ++ online_f
+ cls_stat_t = conT $ mkName "ClusterStatistics"
+ fname = mkName "compClusterStatistics"
+ sig_d <- sigD fname ((arrowT `appT` [t| [Node.Node] |]) `appT` cls_stat_t)
+ fun_d <- funD fname [clause [varP nl_i] (normalB body) []]
+ return [sig_d, fun_d]
+
+-- | Generates (updateClusterStatistics :: ClusterStatistics ->
+-- (Node.Node, Node.Node) -> ClusterStatistics) declaration for metric
+-- components given. The function calls update for each ClusterStatistics
+-- field if the node is online or preserves the old ClusterStatistics
+-- otherwise. This action replaces contribution of the first node by the
+-- contribution of the second node.
+updateClusterStatisticsDecl :: [MetricComponent] -> Q [Dec]
+updateClusterStatisticsDecl components = do
+ old_s <- newName "old_s"
+ n <- newName "n"
+ n' <- newName "n'"
+ let (online, offline) = partition forOnlineNodes components
+ pattern = [varP old_s, tupP [varP n, varP n']]
+ is_node_online = appE [| not . Node.offline |] $ varE n
+ get_nv nd = appE (varE $ mkName "getNodeValues") $ varE nd
+ nv_get_field nm nd = appE (varE . mkName $ "nv_" ++ nm) $ get_nv nd
+ cs_cur_field nm = appE (varE . mkName $ "cs_" ++ nm) $ varE old_s
+ update_field nm = appTwice (appE [| update |] $ cs_cur_field nm)
+ (nv_get_field nm n) (nv_get_field nm n')
+ (online_names, offline_names) = (map name online, map name offline)
+ offline_f = map (\nm -> getQNameExp ("cs_" ++ nm) $
+ cs_cur_field nm) offline_names
+ online_f = map (\nm -> getQNameExp ("cs_" ++ nm) $
+ update_field nm) online_names
+ body = condE is_node_online
+ (recConE (mkName "ClusterStatistics") $ offline_f ++ online_f)
+ (varE old_s)
+ fname = mkName "updateClusterStatistics"
+ cs_t = conT $ mkName "ClusterStatistics"
+ sig_d <- sigD fname ((arrowT `appT` cs_t) `appT`
+ ((arrowT `appT` [t| (Node.Node, Node.Node) |]) `appT`
+ cs_t))
+ fun_d <- funD fname [clause pattern (normalB body) []]
+ return [sig_d, fun_d]
+
+-- | Generates (compCVFromStats :: ClusterStatistics -> Double) declaration
+-- for metric components given. The function computes the cluster score from
+-- the ClusterStatistics.
+compCVfromStatsDecl :: [MetricComponent] -> Q [Dec]
+compCVfromStatsDecl components = do
+ cs <- newName "cs"
+ let get_comp c = appE (varE . mkName $ "cs_" ++ name c) $ varE cs
+ get_val c = appE [| getValue |] $ get_comp c
+ term c = appTwice [| (*) :: Double -> Double -> Double |]
+ (get_val c) (weight c)
+ stat = appE [| sum :: [Double] -> Double |] . listE $ map term components
+ fname = mkName "compCVfromStats"
+ cs_t = conT $ mkName "ClusterStatistics"
+ sig_d <- sigD fname ((arrowT `appT` cs_t) `appT` [t| Double |])
+ fun_d <- funD fname [clause [varP cs] (normalB stat) []]
+ return [sig_d, fun_d]
+
+-- | Generates (showClusterStatistics :: ClusterStatistics -> String)
+-- declaration for metric components given. The function converts
+-- ClusterStatistics to a string containing a table obtained by printTable.
+showClusterStatisticsDecl :: [MetricComponent] -> Q [Dec]
+showClusterStatisticsDecl components = do
+ lp <- newName "lp"
+ cs <- newName "cs"
+ let get_comp c = appE (varE . mkName $ "cs_" ++ name c) $ varE cs
+ get_val c = appE [| getValue |] $ get_comp c
+ format w h val = listE [ h
+ , appE [| printf "%.8f" |] val
+ , appE [| printf "x%.2f"|] w
+ ]
+ print_line c = format (weight c) (litE . StringL $ name c) (get_val c)
+ header = [| [ "Field", "Value", "Weight" ] |]
+ printed = listE $ map print_line components
+ result = appTwice (appTwice [| printTable |] (varE lp) header)
+ printed [| False:repeat True |]
+ fname = mkName "showClusterStatistics"
+ cs_t = conT $ mkName "ClusterStatistics"
+ sig_d <- sigD fname ((arrowT `appT` [t| String |]) `appT`
+ ((arrowT `appT` cs_t) `appT` [t| String |]))
+ fun_d <- funD fname [clause [varP lp, varP cs] (normalB result) []]
+ return [sig_d, fun_d]
+
+
+-- | Generates (optimalCVScore :: Node.List -> Double) declaration for metric
+-- components given. The function computes the lower bound of the cluster
+-- score, i.e., the sum of the minimal values for all cluster score values that
+-- are not 0 on a perfectly balanced cluster. Components whose optimal values
+-- are not 0 have Nothing as their optimalValue component
+optimalCVScoreDecl :: [MetricComponent] -> Q [Dec]
+optimalCVScoreDecl components = do
+ nl <- newName "nl"
+ let stat =
+ foldl (addVal nl) [| 0 :: Double |] $ mapMaybe optimalValue components
+ fname = mkName "optimalCVScore"
+ sig_d <- sigD fname ((arrowT `appT` [t| Node.List |]) `appT` [t| Double |])
+ fun_d <- funD fname [clause [varP nl] (normalB stat) []]
+ return [sig_d, fun_d]
+ where
+ addVal :: Name -> ExpQ -> ExpQ -> ExpQ
+ addVal nl cur f = appTwice [| (+) :: Double -> Double -> Double |]
+ cur . appE f $ varE nl
diff --git a/src/Ganeti/HTools/Dedicated.hs b/src/Ganeti/HTools/Dedicated.hs
index 206513a..00413a6 100644
--- a/src/Ganeti/HTools/Dedicated.hs
+++ b/src/Ganeti/HTools/Dedicated.hs
@@ -44,7 +44,10 @@
, runDedicatedAllocation
) where
-import Control.Applicative (liftA2, (<$>))
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (liftA2)
import Control.Arrow ((&&&))
import Control.Monad (unless, liftM, foldM, mplus)
import qualified Data.Foldable as F
diff --git a/src/Ganeti/HTools/ExtLoader.hs b/src/Ganeti/HTools/ExtLoader.hs
index 56e2e80..b322cb3 100644
--- a/src/Ganeti/HTools/ExtLoader.hs
+++ b/src/Ganeti/HTools/ExtLoader.hs
@@ -122,8 +122,9 @@
now <- getClockTime
let ignoreDynU = optIgnoreDynu opts
+ startIdle = ignoreDynU || optIdleDefault opts
eff_u = if ignoreDynU then [] else util_data
- ldresult = input_data >>= (if ignoreDynU then clearDynU else return)
+ ldresult = input_data >>= (if startIdle then clearDynU else return)
>>= mergeData eff_u exTags selInsts exInsts now
cdata <- exitIfBad "failed to load data, aborting" ldresult
(cdata', ok) <- runWriterT $ if optMonD opts
diff --git a/src/Ganeti/HTools/Instance.hs b/src/Ganeti/HTools/Instance.hs
index 63b3024..33e40be 100644
--- a/src/Ganeti/HTools/Instance.hs
+++ b/src/Ganeti/HTools/Instance.hs
@@ -338,7 +338,9 @@
-- | Checks if an instance is bigger than a given spec.
instAboveISpec :: Instance -> T.ISpec -> Bool -> T.OpResult ()
-instAboveISpec = instCompareISpec LT
+instAboveISpec inst spec exclstore =
+ genericResult (const $ Bad T.FailTooSmall) Ok
+ $ instCompareISpec LT inst spec exclstore
-- | Checks if an instance matches a min/max specs pair
instMatchesMinMaxSpecs :: Instance -> T.MinMaxISpecs -> Bool -> T.OpResult ()
diff --git a/src/Ganeti/HTools/Loader.hs b/src/Ganeti/HTools/Loader.hs
index 50ffbc1..2294468 100644
--- a/src/Ganeti/HTools/Loader.hs
+++ b/src/Ganeti/HTools/Loader.hs
@@ -53,6 +53,7 @@
, ClusterData(..)
, isAllocationRequest
, emptyCluster
+ , obtainNodeMemory
, extractDesiredLocations
, updateDesiredLocationTags
) where
@@ -76,8 +77,11 @@
import qualified Ganeti.HTools.Tags as Tags
import qualified Ganeti.HTools.Tags.Constants as TagsC
import Ganeti.HTools.Types
+import qualified Ganeti.Types as T
+import qualified Ganeti.Objects as O
import Ganeti.Utils
import Ganeti.Types (EvacMode)
+import Ganeti.JSON
-- * Types
@@ -417,3 +421,14 @@
eitherLive :: (Monad m) => Bool -> a -> m a -> m a
eitherLive True _ live_data = live_data
eitherLive False def_data _ = return def_data
+
+-- | Obtains the memory used by a node. It's memory_dom0 for Xen and memNode
+-- otherwise, because a live data collector exists only for Xen
+obtainNodeMemory :: O.FilledHvState -> Int -> Int
+obtainNodeMemory hv_state memory_dom0 =
+ let getNM ((_, hvs):_) 0 = O.hvstateMemNode hvs
+ getNM ((T.XenPvm, _):_) mem_dom0 = mem_dom0
+ getNM ((T.XenHvm, _):_) mem_dom0 = mem_dom0
+ getNM ((_, hvs):_) _ = O.hvstateMemNode hvs
+ getNM _ mem_dom0 = mem_dom0
+ in getNM (M.toList $ fromContainer hv_state) memory_dom0
diff --git a/src/Ganeti/HTools/Node.hs b/src/Ganeti/HTools/Node.hs
index 79993ad..6749568 100644
--- a/src/Ganeti/HTools/Node.hs
+++ b/src/Ganeti/HTools/Node.hs
@@ -43,6 +43,7 @@
, create
-- ** Finalization after data loading
, buildPeers
+ , computePmem
, setIdx
, setAlias
, setOffline
@@ -99,13 +100,15 @@
, haveExclStorage
) where
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Monad (liftM, liftM2)
-import Control.Applicative ((<$>), (<*>))
import qualified Data.Foldable as Foldable
import Data.Function (on)
import qualified Data.Graph as Graph
import qualified Data.IntMap as IntMap
-import Data.List hiding (group)
+import Data.List (intercalate, foldl', delete, union, sortBy, groupBy)
import qualified Data.Map as Map
import Data.Ord (comparing)
import qualified Data.Set as Set
@@ -302,6 +305,10 @@
haveExclStorage nl =
any exclStorage $ Container.elems nl
+-- | Conversion formula from fMem, tMem and nMem to pMem.
+computePmem :: Int -> Double -> Int -> Double
+computePmem fmem tmem nmem = fromIntegral fmem / (tmem - fromIntegral nmem)
+
-- * Initialization functions
-- | Create a new node.
@@ -342,8 +349,8 @@
, peers = P.empty
, rMem = 0
, rMemForth = 0
- , pMem = fromIntegral mem_f_init / mem_t_init
- , pMemForth = fromIntegral mem_f_init / mem_t_init
+ , pMem = computePmem mem_f_init mem_t_init mem_n_init
+ , pMemForth = computePmem mem_f_init mem_t_init mem_n_init
, pDsk = if excl_stor
then computePDsk spindles_f_init $ fromIntegral spindles_t_init
else computePDsk dsk_f_init dsk_t_init
@@ -450,12 +457,22 @@
, hiCpu = mCpuTohiCpu (T.iPolicyVcpuRatio pol) (tCpu node)
, hiSpindles = computeHiSpindles (T.iPolicySpindleRatio pol)
(tSpindles node)
+ , pMem = computePmem (fMem node) (tMem node) (nMem node)
+ , pMemForth = computePmem (fMemForth node) (tMem node) (nMem node)
}
-- | Computes the maximum reserved memory for peers from a peer map.
computeMaxRes :: P.PeerMap -> P.Elem
computeMaxRes = P.maxElem
+-- | Calculates the lower acceptable amount of free memory. It's a negative
+-- value, thanks to memory over-commitment
+fMemTreshold :: Node -> Int
+fMemTreshold t =
+ fMemTresholdHelper (T.iPolicyMemoryRatio $ iPolicy t) (tMem t) (nMem t)
+ where fMemTresholdHelper ratio tmem nmem =
+ truncate $ (1 - ratio) * (tmem - fromIntegral nmem)
+
-- | Builds the peer map for a given node.
buildPeers :: Node -> Instance.List -> Node
buildPeers t il =
@@ -472,7 +489,7 @@
(sList t)
pmap = P.accumArray (+) mdata
new_rmem = computeMaxRes pmap
- new_failN1 = fMem t < new_rmem
+ new_failN1 = fMem t - new_rmem <= fMemTreshold t
new_prem = fromIntegral new_rmem / tMem t
in t { peers = pmap
, failN1 = new_failN1
@@ -595,7 +612,7 @@
(fMemForth node)
(Instance.mem inst)
- new_pMemForth = fromIntegral new_fMemForth / tMem node
+ new_pMemForth = computePmem new_fMemForth (tMem node) (nMem node)
in node
{ pTags = addTags (pTags node) (Instance.exclTags inst)
@@ -728,7 +745,7 @@
new_dsk_forth = incIf uses_disk (fDskForth n) (Instance.dsk inst)
new_free_sp_forth = calcNewFreeSpindlesForth False n inst
new_inst_sp_forth = calcSpindleUseForth False n inst
- new_mp_forth = fromIntegral new_mem_forth / tMem n
+ new_mp_forth = computePmem new_mem_forth (tMem n) (nMem n)
new_dp_forth = computeNewPDsk n new_free_sp_forth new_dsk_forth
new_ucpu_forth = decIf i_online (uCpuForth n) (Instance.vcpus inst)
new_rcpu_forth = fromIntegral new_ucpu_forth / tCpu n
@@ -759,9 +776,9 @@
new_dsk = incIf uses_disk (fDsk t) (Instance.dsk inst)
new_free_sp = calcNewFreeSpindles False t inst
new_inst_sp = calcSpindleUse False t inst
- new_mp = fromIntegral new_mem / tMem t
+ new_mp = computePmem new_mem (tMem t) (nMem t)
new_dp = computeNewPDsk t new_free_sp new_dsk
- new_failn1 = new_mem <= rMem t
+ new_failn1 = new_mem - rMem t <= fMemTreshold t
new_ucpu = decIf i_online (uCpu t) (Instance.vcpus inst)
new_rcpu = fromIntegral new_ucpu / tCpu t
new_load = utilLoad t `T.subUtil` Instance.util inst
@@ -830,7 +847,7 @@
then old_rmem
else computeMaxRes new_peers
new_prem = fromIntegral new_rmem / tMem t
- new_failn1 = fMem t <= new_rmem
+ new_failn1 = fMem t - new_rmem <= fMemTreshold t
new_dp = computeNewPDsk t new_free_sp new_dsk
old_load = utilLoad t
new_load = old_load
@@ -870,7 +887,7 @@
inst_tags = Instance.exclTags inst
new_mem_forth = fMemForth t - Instance.mem inst
- new_mp_forth = fromIntegral new_mem_forth / tMem t
+ new_mp_forth = computePmem new_mem_forth (tMem t) (nMem t)
new_dsk_forth = decIf uses_disk (fDskForth t) (Instance.dsk inst)
new_free_sp_forth = calcNewFreeSpindlesForth True t inst
new_inst_sp_forth = calcSpindleUseForth True t inst
@@ -898,7 +915,7 @@
}
checkForthcomingViolation
- | new_mem_forth <= 0 = Bad T.FailMem
+ | new_mem_forth <= fMemTreshold t = Bad T.FailMem
| uses_disk && new_dsk_forth <= 0 = Bad T.FailDisk
| uses_disk && new_dsk_forth < loDsk t = Bad T.FailDisk
| uses_disk && exclStorage t
@@ -921,19 +938,19 @@
new_dsk = decIf uses_disk (fDsk t) (Instance.dsk inst)
new_free_sp = calcNewFreeSpindles True t inst
new_inst_sp = calcSpindleUse True t inst
- new_failn1 = new_mem <= rMem t
+ new_failn1 = new_mem - rMem t <= fMemTreshold t
new_ucpu = incIf i_online (uCpu t) (Instance.vcpus inst)
new_pcpu = fromIntegral new_ucpu / tCpu t
new_dp = computeNewPDsk t new_free_sp new_dsk
new_load = utilLoad t `T.addUtil` Instance.util inst
new_plist = iname:pList t
- new_mp = fromIntegral new_mem / tMem t
+ new_mp = computePmem new_mem (tMem t) (nMem t)
new_instance_map = addTags (instanceMap t)
$ getLocationExclusionPairs t inst
in case () of
- _ | new_mem <= 0 -> Bad T.FailMem
+ _ | new_mem <= fMemTreshold t -> Bad T.FailMem
| uses_disk && new_dsk <= 0 -> Bad T.FailDisk
| strict && uses_disk && new_dsk < loDsk t -> Bad T.FailDisk
| uses_disk && exclStorage t && new_free_sp < 0 -> Bad T.FailSpindles
@@ -1019,7 +1036,7 @@
| new_dsk_forth < loDsk t = Bad T.FailDisk
| exclStorage t && new_free_sp_forth < 0 = Bad T.FailSpindles
| new_inst_sp_forth > hiSpindles t = Bad T.FailDisk
- | secondary_needed_mem >= old_mem_forth = Bad T.FailMem
+ | old_mem_forth - secondary_needed_mem <= fMemTreshold t = Bad T.FailMem
-- TODO Check failN1 including forthcoming instances
| otherwise = Ok ()
@@ -1035,7 +1052,7 @@
new_inst_sp = calcSpindleUse True t inst
new_rmem = max (rMem t) new_peem
new_prem = fromIntegral new_rmem / tMem t
- new_failn1 = old_mem <= new_rmem
+ new_failn1 = old_mem - new_rmem <= fMemTreshold t
new_dp = computeNewPDsk t new_free_sp new_dsk
old_load = utilLoad t
new_load = old_load
@@ -1049,7 +1066,8 @@
| strict && new_dsk < loDsk t -> Bad T.FailDisk
| exclStorage t && new_free_sp < 0 -> Bad T.FailSpindles
| strict && new_inst_sp > hiSpindles t -> Bad T.FailDisk
- | strict && secondary_needed_mem >= old_mem -> Bad T.FailMem
+ | strict && old_mem - secondary_needed_mem <= fMemTreshold t
+ -> Bad T.FailMem
| strict && new_failn1 && not (failN1 t) -> Bad T.FailMem
-- When strict also check forthcoming limits, but after normal checks
@@ -1277,6 +1295,8 @@
, OpCodes.opSecondaryIp = Nothing
, OpCodes.opgenericNdParams = Nothing
, OpCodes.opPowered = Nothing
+ , OpCodes.opVerbose = False
+ , OpCodes.opDebug = False
}
-- | Generate OpCode for applying a OobCommand to the given nodes
diff --git a/src/Ganeti/HTools/Program/Harep.hs b/src/Ganeti/HTools/Program/Harep.hs
index 8ad7deb..87d9b53 100644
--- a/src/Ganeti/HTools/Program/Harep.hs
+++ b/src/Ganeti/HTools/Program/Harep.hs
@@ -42,10 +42,7 @@
import Control.Exception (bracket)
import Control.Lens (over)
import Control.Monad
-import Data.Function
-import Data.List
import Data.Maybe
-import Data.Ord
import System.Time
import qualified Data.Map as Map
import qualified Text.JSON as J
@@ -58,21 +55,18 @@
import Ganeti.Jobs
import Ganeti.OpCodes
import Ganeti.OpCodes.Lens (metaParamsL, opReasonL)
-import Ganeti.OpParams
import Ganeti.Types
import Ganeti.Utils
-import qualified Ganeti.Constants as C
import qualified Ganeti.Luxi as L
import qualified Ganeti.Path as Path
import Ganeti.HTools.CLI
+import qualified Ganeti.HTools.Container as Container
import Ganeti.HTools.Loader
import Ganeti.HTools.ExtLoader
-import qualified Ganeti.HTools.Tags.Constants as Tags
+import Ganeti.HTools.Repair
import Ganeti.HTools.Types
-import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Instance as Instance
-import qualified Ganeti.HTools.Node as Node
import Ganeti.Version (version)
@@ -101,135 +95,6 @@
. setOpComment ("automated repairs by harep " ++ version)
. wrapOpCode
-data InstanceData = InstanceData { arInstance :: Instance.Instance
- , arState :: AutoRepairStatus
- , tagsToRemove :: [String]
- }
- deriving (Eq, Show)
-
--- | Parse a tag into an 'AutoRepairData' record.
---
--- @Nothing@ is returned if the tag is not an auto-repair tag, or if it's
--- malformed.
-parseInitTag :: String -> Maybe AutoRepairData
-parseInitTag tag =
- let parsePending = do
- subtag <- chompPrefix Tags.autoRepairTagPending tag
- case sepSplit ':' subtag of
- [rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs
- _ -> fail ("Invalid tag: " ++ show tag)
-
- parseResult = do
- subtag <- chompPrefix Tags.autoRepairTagResult tag
- case sepSplit ':' subtag of
- [rtype, uuid, ts, result, jobs] -> do
- arData <- makeArData rtype uuid ts jobs
- result' <- autoRepairResultFromRaw result
- return arData { arResult = Just result' }
- _ -> fail ("Invalid tag: " ++ show tag)
-
- makeArData rtype uuid ts jobs = do
- rtype' <- autoRepairTypeFromRaw rtype
- ts' <- tryRead "auto-repair time" ts
- jobs' <- mapM makeJobIdS $ sepSplit '+' jobs
- return AutoRepairData { arType = rtype'
- , arUuid = uuid
- , arTime = TOD ts' 0
- , arJobs = jobs'
- , arResult = Nothing
- , arTag = tag
- }
- in
- parsePending `mplus` parseResult
-
--- | Return the 'AutoRepairData' element of an 'AutoRepairStatus' type.
-getArData :: AutoRepairStatus -> Maybe AutoRepairData
-getArData status =
- case status of
- ArHealthy (Just d) -> Just d
- ArFailedRepair d -> Just d
- ArPendingRepair d -> Just d
- ArNeedsRepair d -> Just d
- _ -> Nothing
-
--- | Return a short name for each auto-repair status.
---
--- This is a more concise representation of the status, because the default
--- "Show" formatting includes all the accompanying auto-repair data.
-arStateName :: AutoRepairStatus -> String
-arStateName status =
- case status of
- ArHealthy _ -> "Healthy"
- ArFailedRepair _ -> "Failure"
- ArPendingRepair _ -> "Pending repair"
- ArNeedsRepair _ -> "Needs repair"
-
--- | Return a new list of tags to remove that includes @arTag@ if present.
-delCurTag :: InstanceData -> [String]
-delCurTag instData =
- let arData = getArData $ arState instData
- rmTags = tagsToRemove instData
- in
- case arData of
- Just d -> arTag d : rmTags
- Nothing -> rmTags
-
--- | Set the initial auto-repair state of an instance from its auto-repair tags.
---
--- The rules when there are multiple tags is:
---
--- * the earliest failure result always wins
---
--- * two or more pending repairs results in a fatal error
---
--- * a pending result from id X and a success result from id Y result in error
--- if Y is newer than X
---
--- * if there are no pending repairs, the newest success result wins,
--- otherwise the pending result is used.
-setInitialState :: Instance.Instance -> Result InstanceData
-setInitialState inst =
- let arData = mapMaybe parseInitTag $ Instance.allTags inst
- -- Group all the AutoRepairData records by id (i.e. by repair task), and
- -- present them from oldest to newest.
- arData' = sortBy (comparing arUuid) arData
- arGroups = groupBy ((==) `on` arUuid) arData'
- arGroups' = sortBy (comparing $ minimum . map arTime) arGroups
- in
- foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups'
-
--- | Update the initial status of an instance with new repair task tags.
---
--- This function gets called once per repair group in an instance's tag, and it
--- determines whether to set the status of the instance according to this new
--- group, or to keep the existing state. See the documentation for
--- 'setInitialState' for the rules to be followed when determining this.
-arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData
-arStatusCmp instData arData =
- let curSt = arState instData
- arData' = sortBy (comparing keyfn) arData
- keyfn d = (arResult d, arTime d)
- newData = last arData'
- newSt = case arResult newData of
- Just ArSuccess -> ArHealthy $ Just newData
- Just ArEnoperm -> ArHealthy $ Just newData
- Just ArFailure -> ArFailedRepair newData
- Nothing -> ArPendingRepair newData
- in
- case curSt of
- ArFailedRepair _ -> Ok instData -- Always keep the earliest failure.
- ArHealthy _ -> Ok instData { arState = newSt
- , tagsToRemove = delCurTag instData
- }
- ArPendingRepair d -> Bad (
- "An unfinished repair was found in instance " ++
- Instance.name (arInstance instData) ++ ": found tag " ++
- show (arTag newData) ++ ", but older pending tag " ++
- show (arTag d) ++ "exists.")
-
- ArNeedsRepair _ -> Bad
- "programming error: ArNeedsRepair found as an initial state"
-
-- | Query jobs of a pending repair, returning the new instance data.
processPending :: Options -> L.Client -> InstanceData -> IO InstanceData
processPending opts client instData =
@@ -264,20 +129,6 @@
_ -> return instData
--- | Update the tag of an 'AutoRepairData' record to match all the other fields.
-updateTag :: AutoRepairData -> AutoRepairData
-updateTag arData =
- let ini = [autoRepairTypeToRaw $ arType arData,
- arUuid arData,
- clockTimeToString $ arTime arData]
- end = [intercalate "+" . map (show . fromJobId) $ arJobs arData]
- (pfx, middle) =
- case arResult arData of
- Nothing -> (Tags.autoRepairTagPending, [])
- Just rs -> (Tags.autoRepairTagResult, [autoRepairResultToRaw rs])
- in
- arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) }
-
-- | Apply and remove tags from an instance as indicated by 'InstanceData'.
--
-- If the /arState/ of the /InstanceData/ record has an associated
@@ -309,100 +160,6 @@
return instData { tagsToRemove = [] }
--- | Detect brokenness with an instance and suggest repair type and jobs to run.
-detectBroken :: Node.List -> Instance.Instance
- -> Maybe (AutoRepairType, [OpCode])
-detectBroken nl inst =
- let disk = Instance.diskTemplate inst
- iname = Instance.name inst
- offPri = Node.offline $ Container.find (Instance.pNode inst) nl
- offSec = Node.offline $ Container.find (Instance.sNode inst) nl
- in
- case disk of
- DTDrbd8
- | offPri && offSec ->
- Just (
- ArReinstall,
- [ OpInstanceRecreateDisks { opInstanceName = iname
- , opInstanceUuid = Nothing
- , opRecreateDisksInfo = RecreateDisksAll
- , opNodes = []
- -- FIXME: there should be a better way to
- -- specify opcode parameters than abusing
- -- mkNonEmpty in this way (using the fact
- -- that Maybe is used both for optional
- -- fields, and to express failure).
- , opNodeUuids = Nothing
- , opIallocator = mkNonEmpty "hail"
- }
- , OpInstanceReinstall { opInstanceName = iname
- , opInstanceUuid = Nothing
- , opOsType = Nothing
- , opTempOsParams = Nothing
- , opOsparamsPrivate = Nothing
- , opOsparamsSecret = Nothing
- , opForceVariant = False
- }
- ])
- | offPri ->
- Just (
- ArFailover,
- [ OpInstanceFailover { opInstanceName = iname
- , opInstanceUuid = Nothing
- -- FIXME: ditto, see above.
- , opShutdownTimeout = fromJust $ mkNonNegative
- C.defaultShutdownTimeout
- , opIgnoreConsistency = False
- , opTargetNode = Nothing
- , opTargetNodeUuid = Nothing
- , opIgnoreIpolicy = False
- , opIallocator = Nothing
- , opMigrationCleanup = False
- }
- ])
- | offSec ->
- Just (
- ArFixStorage,
- [ OpInstanceReplaceDisks { opInstanceName = iname
- , opInstanceUuid = Nothing
- , opReplaceDisksMode = ReplaceNewSecondary
- , opReplaceDisksList = []
- , opRemoteNode = Nothing
- -- FIXME: ditto, see above.
- , opRemoteNodeUuid = Nothing
- , opIallocator = mkNonEmpty "hail"
- , opEarlyRelease = False
- , opIgnoreIpolicy = False
- }
- ])
- | otherwise -> Nothing
-
- DTPlain
- | offPri ->
- Just (
- ArReinstall,
- [ OpInstanceRecreateDisks { opInstanceName = iname
- , opInstanceUuid = Nothing
- , opRecreateDisksInfo = RecreateDisksAll
- , opNodes = []
- -- FIXME: ditto, see above.
- , opNodeUuids = Nothing
- , opIallocator = mkNonEmpty "hail"
- }
- , OpInstanceReinstall { opInstanceName = iname
- , opInstanceUuid = Nothing
- , opOsType = Nothing
- , opTempOsParams = Nothing
- , opOsparamsPrivate = Nothing
- , opOsparamsSecret = Nothing
- , opForceVariant = False
- }
- ])
- | otherwise -> Nothing
-
- _ -> Nothing -- Other cases are unimplemented for now: DTDiskless,
- -- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt.
-
-- | Submit jobs, unless a dry-run is requested; in this case, just report
-- the job that would be submitted.
submitJobs' :: Options -> [[MetaOpCode]] -> L.Client -> IO (Result [JobId])
diff --git a/src/Ganeti/HTools/Program/Hbal.hs b/src/Ganeti/HTools/Program/Hbal.hs
index 084433a..68572dc 100644
--- a/src/Ganeti/HTools/Program/Hbal.hs
+++ b/src/Ganeti/HTools/Program/Hbal.hs
@@ -4,7 +4,7 @@
{-
-Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.
+Copyright (C) 2009, 2010, 2011, 2012, 2013, 2015 Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -50,6 +50,7 @@
import Text.Printf (printf)
import Ganeti.HTools.AlgorithmParams (AlgorithmOptions(..), fromCLIOptions)
+import Ganeti.HTools.Backend.MonD (scaleMemoryWeight)
import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Cluster as Cluster
import qualified Ganeti.HTools.Cluster.Metrics as Metrics
@@ -101,15 +102,19 @@
, oMinGain
, oMinGainLim
, oDiskMoves
+ , oAvoidDiskMoves
, oSelInst
, oInstMoves
, oIgnoreSoftErrors
, oDynuFile
+ , oIdleDefault
, oIgnoreDyn
, oMonD
, oMonDDataFile
, oMonDExitMissing
, oMonDXen
+ , oMonDKvmRSS
+ , oMemWeight
, oExTags
, oExInst
, oSaveCluster
@@ -312,13 +317,14 @@
showinsts = optShowInsts opts
force = optIgnoreSoftErrors opts
- ini_cdata@(ClusterData gl fixed_nl ilf ctags ipol) <- loadExternalData opts
+ ini_cdata@(ClusterData gl fixed_nl ilf' ctags ipol) <- loadExternalData opts
when (verbose > 1) $ do
putStrLn $ "Loaded cluster tags: " ++ intercalate "," ctags
putStrLn $ "Loaded cluster ipolicy: " ++ show ipol
- nlf <- setNodeStatus opts fixed_nl
+ nlf' <- setNodeStatus opts fixed_nl
+ let (nlf, ilf) = scaleMemoryWeight (optMemWeight opts) (nlf', ilf')
checkCluster verbose nlf ilf
maybeSaveData (optSaveCluster opts) "original" "before balancing" ini_cdata
diff --git a/src/Ganeti/HTools/Program/Hcheck.hs b/src/Ganeti/HTools/Program/Hcheck.hs
index a2251ff..1250ca2 100644
--- a/src/Ganeti/HTools/Program/Hcheck.hs
+++ b/src/Ganeti/HTools/Program/Hcheck.hs
@@ -53,8 +53,8 @@
import qualified Ganeti.HTools.Group as Group
import qualified Ganeti.HTools.Node as Node
import qualified Ganeti.HTools.Instance as Instance
-
import qualified Ganeti.HTools.Program.Hbal as Hbal
+import Ganeti.HTools.RedundancyLevel (redundancy)
import Ganeti.Common
import Ganeti.HTools.CLI
@@ -70,6 +70,7 @@
return
[ oDataFile
, oDiskMoves
+ , oAvoidDiskMoves
, oDynuFile
, oIgnoreDyn
, oEvacMode
@@ -111,7 +112,7 @@
type GroupInfo = (Gdx, (Node.List, Instance.List))
-- | A type alias for group stats.
-type GroupStats = ((Group.Group, Double), [Int])
+type GroupStats = ((Group.Group, Double, Int), [Int])
-- | Prefix for machine readable names.
htcPrefix :: String
@@ -130,10 +131,12 @@
-- | Data showed per group.
groupData :: Options -> [(String, String)]
groupData opts = commonData opts ++ [("SCORE", "Group score")]
+ ++ [("REDUNDANCY", "Group redundancy level")]
-- | Data showed per cluster.
clusterData :: Options -> [(String, String)]
clusterData opts = commonData opts ++
+ [ ("REDUNDANCY", "Cluster redundancy level") ] ++
[ ("NEED_REBALANCE", "Cluster is not healthy") ]
-- | Phase-specific prefix for machine readable version.
@@ -221,9 +224,9 @@
extractGroupData False grp = Group.name grp
-- | Prepare values for group.
-prepareGroupValues :: [Int] -> Double -> [String]
-prepareGroupValues stats score =
- map show stats ++ [printf "%.8f" score]
+prepareGroupValues :: [Int] -> Double -> Int -> [String]
+prepareGroupValues stats score redundancyLevel =
+ map show stats ++ [printf "%.8f" score] ++ [show redundancyLevel]
-- | Prepare values for cluster.
prepareClusterValues :: Bool -> [Int] -> [Bool] -> [String]
@@ -232,15 +235,16 @@
-- | Print all the statistics on a group level.
printGroupStats :: Options -> Bool -> Phase -> GroupStats -> IO ()
-printGroupStats opts machineread phase ((grp, score), stats) = do
- let values = prepareGroupValues stats score
+printGroupStats opts machineread phase
+ ((grp, score, redundancyLevel), stats) = do
+ let values = prepareGroupValues stats score redundancyLevel
extradata = extractGroupData machineread grp
printStats opts machineread (GroupLvl extradata) phase values
-- | Print all the statistics on a cluster (global) level.
-printClusterStats :: Options -> Bool -> Phase -> [Int] -> Bool -> IO ()
-printClusterStats opts machineread phase stats needhbal = do
- let values = prepareClusterValues machineread stats [needhbal]
+printClusterStats :: Options -> Bool -> Phase -> [Int] -> Bool -> Int -> IO ()
+printClusterStats opts machineread phase stats needhbal gRed = do
+ let values = prepareClusterValues machineread (stats ++ [gRed]) [needhbal]
printStats opts machineread ClusterLvl phase values
-- | Check if any of cluster metrics is non-zero.
@@ -263,13 +267,14 @@
offline_pri = sum . map length $ map Node.pList offnl
offline_sec = length $ map Node.sList offnl
score = Metrics.compCV nl
+ redundancyLvl = redundancy (fromCLIOptions opts) nl il
groupstats = [ n1violated
, conflicttags
, offline_pri
, offline_sec
]
++ [ gn1fail | optCapacity opts ]
- in ((grp, score), groupstats)
+ in ((grp, score, redundancyLvl), groupstats)
-- | Use Hbal's iterateDepth to simulate group rebalance.
executeSimulation :: Options -> Cluster.Table -> Double
@@ -327,6 +332,7 @@
let groupsstats = map (perGroupChecks opts gl) splitcluster
clusterstats = map sum . transpose . map snd $ groupsstats
+ globalRedundancy = minimum $ map (\((_, _, r), _) -> r) groupsstats
needrebalance = clusterNeedsRebalance clusterstats
unless (verbose < 1 || machineread) .
@@ -339,6 +345,7 @@
mapM_ (printGroupStats opts machineread Initial) groupsstats
printClusterStats opts machineread Initial clusterstats needrebalance
+ globalRedundancy
let exitOK = nosimulation || not needrebalance
simulate = not nosimulation && needrebalance
@@ -348,12 +355,14 @@
when (simulate || machineread) $ do
let newgroupstats = map (perGroupChecks opts gl) rebalancedcluster
newclusterstats = map sum . transpose . map snd $ newgroupstats
+ newGlobalRedundancy = minimum $ map (\((_, _, r), _) -> r)
+ newgroupstats
newneedrebalance = clusterNeedsRebalance clusterstats
mapM_ (printGroupStats opts machineread Rebalanced) newgroupstats
printClusterStats opts machineread Rebalanced newclusterstats
- newneedrebalance
+ newneedrebalance newGlobalRedundancy
printFinalHTC machineread
diff --git a/src/Ganeti/HTools/RedundancyLevel.hs b/src/Ganeti/HTools/RedundancyLevel.hs
new file mode 100644
index 0000000..ca77a10
--- /dev/null
+++ b/src/Ganeti/HTools/RedundancyLevel.hs
@@ -0,0 +1,76 @@
+{-| Implementation of the computation of the cluster redundancy level
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.HTools.RedundancyLevel
+ ( redundancy
+ ) where
+
+import Control.Applicative (liftA2)
+import Control.Arrow ((&&&))
+import Data.Function (on)
+import qualified Data.IntMap as IntMap
+import Data.List (sortBy)
+
+import Ganeti.BasicTypes (runListHead)
+import Ganeti.HTools.AlgorithmParams (AlgorithmOptions)
+import Ganeti.HTools.GlobalN1 (redundant)
+import qualified Ganeti.HTools.Cluster as Cluster
+import qualified Ganeti.HTools.Cluster.Metrics as Metrics
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Instance as Instance
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.Utils (iterateJust)
+
+-- | Estimate the level of redundancy of node group given
+-- by its nodes and instances.
+redundancy :: AlgorithmOptions -> Node.List -> Instance.List -> Int
+redundancy _ nl _ | any (liftA2 (&&) Node.offline $ not . null . Node.pList)
+ $ IntMap.elems nl = -1
+redundancy opts nl il | not $ redundant opts nl il = 0
+redundancy opts nl il =
+ let sortedNodes =
+ sortBy (compare `on` ((Node.tMem . snd) &&& fst))
+ . filter (not . Node.offline . snd)
+ $ IntMap.toAscList nl
+ in case sortedNodes of
+ [] -> 0
+ (indexBigNode, bigNode):_ ->
+ let bigNode' = bigNode { Node.offline = True }
+ nl' = Container.add indexBigNode bigNode' nl
+ initialMetrics = Metrics.compCV nl'
+ initialTable = Cluster.Table nl' il initialMetrics []
+ Cluster.Table nl'' il' _ _ =
+ runListHead initialTable id . reverse
+ $ iterateJust (Cluster.tryBalance opts) initialTable
+ in 1 + redundancy opts nl'' il'
diff --git a/src/Ganeti/HTools/Repair.hs b/src/Ganeti/HTools/Repair.hs
new file mode 100644
index 0000000..4220635
--- /dev/null
+++ b/src/Ganeti/HTools/Repair.hs
@@ -0,0 +1,305 @@
+{-| Implementation of the auto-repair logic for Ganeti.
+
+-}
+
+{-
+
+Copyright (C) 2013, 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.HTools.Repair
+ ( InstanceData(..)
+ , parseInitTag
+ , getArData
+ , arStateName
+ , delCurTag
+ , setInitialState
+ , arStatusCmp
+ , updateTag
+ , detectBroken
+ ) where
+
+import Control.Monad (mplus, foldM)
+import Data.Function (on)
+import Data.List (sortBy, groupBy, intercalate)
+import Data.Maybe (mapMaybe, fromJust)
+import Data.Ord (comparing)
+import System.Time (ClockTime(TOD))
+
+import Ganeti.BasicTypes (GenericResult(..), Result)
+import qualified Ganeti.Constants as C
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Instance as Instance
+import qualified Ganeti.HTools.Node as Node
+import qualified Ganeti.HTools.Tags.Constants as Tags
+import Ganeti.HTools.Types
+import Ganeti.OpCodes (OpCode(..))
+import Ganeti.OpParams ( RecreateDisksInfo(RecreateDisksAll)
+ , ReplaceDisksMode(ReplaceNewSecondary)
+ )
+import Ganeti.Types (makeJobIdS, fromJobId, mkNonEmpty, mkNonNegative)
+import Ganeti.Utils (chompPrefix, sepSplit, tryRead, clockTimeToString)
+
+-- | Description of an instance annotated with repair-related information.
+data InstanceData = InstanceData { arInstance :: Instance.Instance
+ , arState :: AutoRepairStatus
+ , tagsToRemove :: [String]
+ }
+ deriving (Eq, Show)
+
+
+-- | Parse a tag into an 'AutoRepairData' record.
+--
+-- @Nothing@ is returned if the tag is not an auto-repair tag, or if it's
+-- malformed.
+parseInitTag :: String -> Maybe AutoRepairData
+parseInitTag tag =
+ let parsePending = do
+ subtag <- chompPrefix Tags.autoRepairTagPending tag
+ case sepSplit ':' subtag of
+ [rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs
+ _ -> fail ("Invalid tag: " ++ show tag)
+
+ parseResult = do
+ subtag <- chompPrefix Tags.autoRepairTagResult tag
+ case sepSplit ':' subtag of
+ [rtype, uuid, ts, result, jobs] -> do
+ arData <- makeArData rtype uuid ts jobs
+ result' <- autoRepairResultFromRaw result
+ return arData { arResult = Just result' }
+ _ -> fail ("Invalid tag: " ++ show tag)
+
+ makeArData rtype uuid ts jobs = do
+ rtype' <- autoRepairTypeFromRaw rtype
+ ts' <- tryRead "auto-repair time" ts
+ jobs' <- mapM makeJobIdS $ sepSplit '+' jobs
+ return AutoRepairData { arType = rtype'
+ , arUuid = uuid
+ , arTime = TOD ts' 0
+ , arJobs = jobs'
+ , arResult = Nothing
+ , arTag = tag
+ }
+ in
+ parsePending `mplus` parseResult
+
+-- | Return the 'AutoRepairData' element of an 'AutoRepairStatus' type.
+getArData :: AutoRepairStatus -> Maybe AutoRepairData
+getArData status =
+ case status of
+ ArHealthy (Just d) -> Just d
+ ArFailedRepair d -> Just d
+ ArPendingRepair d -> Just d
+ ArNeedsRepair d -> Just d
+ _ -> Nothing
+
+-- | Return a short name for each auto-repair status.
+--
+-- This is a more concise representation of the status, because the default
+-- "Show" formatting includes all the accompanying auto-repair data.
+arStateName :: AutoRepairStatus -> String
+arStateName status =
+ case status of
+ ArHealthy _ -> "Healthy"
+ ArFailedRepair _ -> "Failure"
+ ArPendingRepair _ -> "Pending repair"
+ ArNeedsRepair _ -> "Needs repair"
+
+-- | Return a new list of tags to remove that includes @arTag@ if present.
+delCurTag :: InstanceData -> [String]
+delCurTag instData =
+ let arData = getArData $ arState instData
+ rmTags = tagsToRemove instData
+ in
+ case arData of
+ Just d -> arTag d : rmTags
+ Nothing -> rmTags
+
+-- | Set the initial auto-repair state of an instance from its auto-repair tags.
+--
+-- The rules when there are multiple tags is:
+--
+-- * the earliest failure result always wins
+--
+-- * two or more pending repairs results in a fatal error
+--
+-- * a pending result from id X and a success result from id Y result in error
+-- if Y is newer than X
+--
+-- * if there are no pending repairs, the newest success result wins,
+-- otherwise the pending result is used.
+setInitialState :: Instance.Instance -> Result InstanceData
+setInitialState inst =
+ let arData = mapMaybe parseInitTag $ Instance.allTags inst
+ -- Group all the AutoRepairData records by id (i.e. by repair task), and
+ -- present them from oldest to newest.
+ arData' = sortBy (comparing arUuid) arData
+ arGroups = groupBy ((==) `on` arUuid) arData'
+ arGroups' = sortBy (comparing $ minimum . map arTime) arGroups
+ in
+ foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups'
+
+-- | Update the initial status of an instance with new repair task tags.
+--
+-- This function gets called once per repair group in an instance's tag, and it
+-- determines whether to set the status of the instance according to this new
+-- group, or to keep the existing state. See the documentation for
+-- 'setInitialState' for the rules to be followed when determining this.
+arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData
+arStatusCmp instData arData =
+ let curSt = arState instData
+ arData' = sortBy (comparing keyfn) arData
+ keyfn d = (arResult d, arTime d)
+ newData = last arData'
+ newSt = case arResult newData of
+ Just ArSuccess -> ArHealthy $ Just newData
+ Just ArEnoperm -> ArHealthy $ Just newData
+ Just ArFailure -> ArFailedRepair newData
+ Nothing -> ArPendingRepair newData
+ in
+ case curSt of
+ ArFailedRepair _ -> Ok instData -- Always keep the earliest failure.
+ ArHealthy _ -> Ok instData { arState = newSt
+ , tagsToRemove = delCurTag instData
+ }
+ ArPendingRepair d -> Bad (
+ "An unfinished repair was found in instance " ++
+ Instance.name (arInstance instData) ++ ": found tag " ++
+ show (arTag newData) ++ ", but older pending tag " ++
+ show (arTag d) ++ "exists.")
+
+ ArNeedsRepair _ -> Bad
+ "programming error: ArNeedsRepair found as an initial state"
+
+-- | Update the tag of an 'AutoRepairData' record to match all the other fields.
+updateTag :: AutoRepairData -> AutoRepairData
+updateTag arData =
+ let ini = [autoRepairTypeToRaw $ arType arData,
+ arUuid arData,
+ clockTimeToString $ arTime arData]
+ end = [intercalate "+" . map (show . fromJobId) $ arJobs arData]
+ (pfx, middle) =
+ case arResult arData of
+ Nothing -> (Tags.autoRepairTagPending, [])
+ Just rs -> (Tags.autoRepairTagResult, [autoRepairResultToRaw rs])
+ in
+ arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) }
+
+-- | Detect brokenness with an instance and suggest repair type and jobs to run.
+detectBroken :: Node.List -> Instance.Instance
+ -> Maybe (AutoRepairType, [OpCode])
+detectBroken nl inst =
+ let disk = Instance.diskTemplate inst
+ iname = Instance.name inst
+ offPri = Node.offline $ Container.find (Instance.pNode inst) nl
+ offSec = Node.offline $ Container.find (Instance.sNode inst) nl
+ in
+ case disk of
+ DTDrbd8
+ | offPri && offSec ->
+ Just (
+ ArReinstall,
+ [ OpInstanceRecreateDisks { opInstanceName = iname
+ , opInstanceUuid = Nothing
+ , opRecreateDisksInfo = RecreateDisksAll
+ , opNodes = []
+ -- FIXME: there should be a better way to
+ -- specify opcode parameters than abusing
+ -- mkNonEmpty in this way (using the fact
+ -- that Maybe is used both for optional
+ -- fields, and to express failure).
+ , opNodeUuids = Nothing
+ , opIallocator = mkNonEmpty "hail"
+ }
+ , OpInstanceReinstall { opInstanceName = iname
+ , opInstanceUuid = Nothing
+ , opOsType = Nothing
+ , opTempOsParams = Nothing
+ , opOsparamsPrivate = Nothing
+ , opOsparamsSecret = Nothing
+ , opForceVariant = False
+ }
+ ])
+ | offPri ->
+ Just (
+ ArFailover,
+ [ OpInstanceFailover { opInstanceName = iname
+ , opInstanceUuid = Nothing
+ -- FIXME: ditto, see above.
+ , opShutdownTimeout = fromJust $ mkNonNegative
+ C.defaultShutdownTimeout
+ , opIgnoreConsistency = False
+ , opTargetNode = Nothing
+ , opTargetNodeUuid = Nothing
+ , opIgnoreIpolicy = False
+ , opIallocator = Nothing
+ , opMigrationCleanup = False
+ }
+ ])
+ | offSec ->
+ Just (
+ ArFixStorage,
+ [ OpInstanceReplaceDisks { opInstanceName = iname
+ , opInstanceUuid = Nothing
+ , opReplaceDisksMode = ReplaceNewSecondary
+ , opReplaceDisksList = []
+ , opRemoteNode = Nothing
+ -- FIXME: ditto, see above.
+ , opRemoteNodeUuid = Nothing
+ , opIallocator = mkNonEmpty "hail"
+ , opEarlyRelease = False
+ , opIgnoreIpolicy = False
+ }
+ ])
+ | otherwise -> Nothing
+
+ DTPlain
+ | offPri ->
+ Just (
+ ArReinstall,
+ [ OpInstanceRecreateDisks { opInstanceName = iname
+ , opInstanceUuid = Nothing
+ , opRecreateDisksInfo = RecreateDisksAll
+ , opNodes = []
+ -- FIXME: ditto, see above.
+ , opNodeUuids = Nothing
+ , opIallocator = mkNonEmpty "hail"
+ }
+ , OpInstanceReinstall { opInstanceName = iname
+ , opInstanceUuid = Nothing
+ , opOsType = Nothing
+ , opTempOsParams = Nothing
+ , opOsparamsPrivate = Nothing
+ , opOsparamsSecret = Nothing
+ , opForceVariant = False
+ }
+ ])
+ | otherwise -> Nothing
+
+ _ -> Nothing -- Other cases are unimplemented for now: DTDiskless,
+ -- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt.
diff --git a/src/Ganeti/HTools/Types.hs b/src/Ganeti/HTools/Types.hs
index a1fb765..d683e1b 100644
--- a/src/Ganeti/HTools/Types.hs
+++ b/src/Ganeti/HTools/Types.hs
@@ -254,6 +254,9 @@
THH.simpleField ConstantUtils.ipolicyVcpuRatio [t| Double |]
, THH.renameField "SpindleRatio" $
THH.simpleField ConstantUtils.ipolicySpindleRatio [t| Double |]
+ , THH.renameField "MemoryRatio" .
+ THH.defaultField [| ConstantUtils.ipolicyDefaultsMemoryRatio |] $
+ THH.simpleField ConstantUtils.ipolicyMemoryRatio [t| Double |]
])
-- | Converts an ISpec type to a RSpec one.
@@ -275,6 +278,7 @@
, iPolicyDiskTemplates = [minBound..maxBound]
, iPolicyVcpuRatio = ConstantUtils.ipolicyDefaultsVcpuRatio
, iPolicySpindleRatio = ConstantUtils.ipolicyDefaultsSpindleRatio
+ , iPolicyMemoryRatio = ConstantUtils.ipolicyDefaultsMemoryRatio
}
-- | The dynamic resource specs of a machine (i.e. load or load
@@ -358,6 +362,8 @@
| FailDisk -- ^ Failed due to not enough disk
| FailCPU -- ^ Failed due to not enough CPU capacity
| FailN1 -- ^ Failed due to not passing N1 checks
+ | FailTooSmall -- ^ Failed due to the instance being smaller
+ -- than allowed
| FailTags -- ^ Failed due to tag exclusion
| FailMig -- ^ Failed due to migration restrictions
| FailDiskCount -- ^ Failed due to wrong number of disks
@@ -377,10 +383,11 @@
-- will instead raise an exception.
type OpResult = GenericResult FailMode
--- | 'Error' instance for 'FailMode' designed to catch unintended
+-- | 'FromString' instance for 'FailMode' designed to catch unintended
-- use as a general monad.
-instance Error FailMode where
- strMsg v = error $ "Programming error: OpResult used as generic monad" ++ v
+instance FromString FailMode where
+ mkFromString v = error $ "Programming error: OpResult used as generic monad"
+ ++ v
-- | Conversion from 'OpResult' to 'Result'.
opToResult :: OpResult a -> Result a
diff --git a/src/Ganeti/Hs2Py/OpDoc.hs b/src/Ganeti/Hs2Py/OpDoc.hs
index aee68db..f5f832d 100644
--- a/src/Ganeti/Hs2Py/OpDoc.hs
+++ b/src/Ganeti/Hs2Py/OpDoc.hs
@@ -168,6 +168,10 @@
opRestrictedCommand =
"Runs a restricted command on node(s)."
+opRepairCommand :: String
+opRepairCommand =
+ "Runs a repair command on a given node."
+
opNodeRemove :: String
opNodeRemove =
"Remove a node.\n\
diff --git a/src/Ganeti/Hypervisor/Xen/XmParser.hs b/src/Ganeti/Hypervisor/Xen/XmParser.hs
index 00f1133..97a2edd 100644
--- a/src/Ganeti/Hypervisor/Xen/XmParser.hs
+++ b/src/Ganeti/Hypervisor/Xen/XmParser.hs
@@ -71,7 +71,7 @@
doubleP = LCDouble <$> A.rational <* A.skipSpace <* A.endOfInput
innerDoubleP = LCDouble <$> A.rational
stringP = LCString . unpack <$> A.takeWhile1 (not . (\c -> isSpace c
- || c `elem` "()"))
+ || c `elem` ("()" :: String)))
wspace = AC.many1 A.space
rparen = A.skipSpace *> A.char ')'
finalP = listConfigP <* rparen
@@ -163,5 +163,5 @@
uptimeLineParser = do
name <- A.takeTill isSpace <* A.skipSpace
idNum <- A.decimal <* A.skipSpace
- uptime <- A.takeTill (`elem` "\n\r") <* A.skipSpace
+ uptime <- A.takeTill (`elem` ("\n\r" :: String)) <* A.skipSpace
return . UptimeInfo (unpack name) idNum $ unpack uptime
diff --git a/src/Ganeti/JQScheduler.hs b/src/Ganeti/JQScheduler.hs
index df6fefc..4c594fa 100644
--- a/src/Ganeti/JQScheduler.hs
+++ b/src/Ganeti/JQScheduler.hs
@@ -48,16 +48,29 @@
, configChangeNeedsRescheduling
) where
-import Control.Applicative (liftA2, (<$>))
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (liftA2)
import Control.Arrow
import Control.Concurrent
import Control.Exception
-import Control.Monad
+import Control.Monad ( when
+ , mfilter
+ , liftM
+ , void
+ , unless
+ , forever
+ , forM_)
import Control.Monad.IO.Class
import Data.Function (on)
-import Data.Functor ((<$))
import Data.IORef (IORef, atomicModifyIORef, newIORef, readIORef)
-import Data.List
+import Data.List ( find
+ , deleteFirstsBy
+ , sortBy
+ , intercalate
+ , partition
+ , insertBy)
import Data.Maybe
import qualified Data.Map as Map
import Data.Ord (comparing)
@@ -132,10 +145,6 @@
unreadJob :: QueuedJob -> JobWithStat
unreadJob job = JobWithStat {jJob=job, jStat=nullFStat, jINotify=Nothing}
--- | Reload interval for polling the running jobs for updates in microseconds.
-watchInterval :: Int
-watchInterval = C.luxidJobqueuePollInterval * 1000000
-
-- | Read a cluster parameter from the configuration, using a default if the
-- configuration is not available.
getConfigValue :: (Cluster -> a) -> a -> JQStatus -> IO a
@@ -499,7 +508,7 @@
-- | Time-based watcher for updating the job queue.
onTimeWatcher :: JQStatus -> IO ()
onTimeWatcher qstate = forever $ do
- threadDelay watchInterval
+ threadDelaySeconds C.luxidJobqueuePollInterval
logDebug "Job queue watcher timer fired"
updateStatusAndScheduleSomeJobs qstate
logDebug "Job queue watcher cycle finished"
diff --git a/src/Ganeti/JQueue.hs b/src/Ganeti/JQueue.hs
index 5c3b8f5..736fce6 100644
--- a/src/Ganeti/JQueue.hs
+++ b/src/Ganeti/JQueue.hs
@@ -82,21 +82,30 @@
, QueuedJob(..)
) where
-import Control.Applicative (liftA2, (<|>), (<$>))
+import Prelude ()
+import Ganeti.Prelude hiding (id, log)
+
+import Control.Applicative (liftA2, (<|>))
import Control.Arrow (first, second)
import Control.Concurrent (forkIO, threadDelay)
import Control.Exception
import Control.Lens (over)
-import Control.Monad
+import Control.Monad ( filterM
+ , liftM
+ , foldM
+ , void
+ , mfilter
+ , when
+ , mzero
+ , unless
+ , msum)
import Control.Monad.IO.Class
import Control.Monad.Trans (lift)
import Control.Monad.Trans.Maybe
-import Data.Functor ((<$))
-import Data.List
+import Data.List (stripPrefix, sortBy, isPrefixOf)
import Data.Maybe
import Data.Ord (comparing)
-- workaround what seems to be a bug in ghc 7.4's TH shadowing code
-import Prelude hiding (id, log)
import System.Directory
import System.FilePath
import System.IO.Error (isDoesNotExistError)
@@ -483,7 +492,7 @@
mapM_ (replicateJob rootdir mastercandidates)
-- | Writes a job to a file and replicates it to master candidates.
-writeAndReplicateJob :: (Error e)
+writeAndReplicateJob :: (FromString e)
=> ConfigData -> FilePath -> QueuedJob
-> ResultT e IO [(Node, ERpcError ())]
writeAndReplicateJob cfg rootdir job = do
diff --git a/src/Ganeti/JSON.hs b/src/Ganeti/JSON.hs
index 770da55..86323ba 100644
--- a/src/Ganeti/JSON.hs
+++ b/src/Ganeti/JSON.hs
@@ -62,6 +62,7 @@
, lookupContainer
, alterContainerL
, readContainer
+ , getKeysFromContainer
, mkUsedKeys
, allUsedKeys
, DictObject(..)
@@ -85,7 +86,7 @@
import Control.Applicative
import Control.DeepSeq
-import Control.Monad.Error.Class
+import Control.Monad.Error.Class (MonadError(..))
import Control.Monad.Writer
import qualified Data.ByteString as BS
import qualified Data.ByteString.UTF8 as UTF8
@@ -148,8 +149,8 @@
fromJResult _ (J.Ok x) = return x
-- | Converts a JSON Result into a MonadError value.
-fromJResultE :: (Error e, MonadError e m) => String -> J.Result a -> m a
-fromJResultE s (J.Error x) = throwError . strMsg $ s ++ ": " ++ x
+fromJResultE :: (FromString e, MonadError e m) => String -> J.Result a -> m a
+fromJResultE s (J.Error x) = throwError . mkFromString $ s ++ ": " ++ x
fromJResultE _ (J.Ok x) = return x
-- | Tries to read a string from a JSON value.
@@ -247,10 +248,10 @@
J.Ok x -> return x
-- | Small wrapper over 'readJSON' for 'MonadError'.
-fromJValE :: (Error e, MonadError e m, J.JSON a) => J.JSValue -> m a
+fromJValE :: (FromString e, MonadError e m, J.JSON a) => J.JSValue -> m a
fromJValE v =
case J.readJSON v of
- J.Error s -> throwError . strMsg $
+ J.Error s -> throwError . mkFromString $
"Cannot convert value '" ++ show (pp_value v) ++
"', error: " ++ s
J.Ok x -> return x
@@ -338,6 +339,10 @@
-- | Type alias for string keys.
type Container = GenericContainer BS.ByteString
+-- | Returns all string keys from a container.
+getKeysFromContainer :: (Container a) -> [String]
+getKeysFromContainer = map UTF8.toString . Map.keys . fromContainer
+
instance HasStringRepr BS.ByteString where
fromStringRepr = return . UTF8.fromString
toStringRepr = UTF8.toString
diff --git a/src/Ganeti/Jobs.hs b/src/Ganeti/Jobs.hs
index 01c2ac8..e31d74e 100644
--- a/src/Ganeti/Jobs.hs
+++ b/src/Ganeti/Jobs.hs
@@ -38,19 +38,24 @@
, execWithCancel
, execJobsWait
, execJobsWaitOk
+ , execJobsWaitOkJid
, waitForJobs
+ , forceFailover
) where
-import Control.Concurrent (threadDelay)
import Control.Exception (bracket)
+import Control.Monad (void, forM)
+import Data.Functor.Identity (runIdentity)
import Data.List
import Data.Tuple
import Data.IORef
import System.Exit
import System.Posix.Process
import System.Posix.Signals
+import qualified Text.JSON as J
import Ganeti.BasicTypes
+import qualified Ganeti.Constants as C
import Ganeti.Errors
import qualified Ganeti.Luxi as L
import Ganeti.OpCodes
@@ -147,26 +152,36 @@
callback jids'
waitForJobs jids' client
+-- | Wait for one job until it is finished, using the WaitForJobChange
+-- luxi command. Return the JobId and the and the final job status.
+waitForJob :: L.Client -> L.JobId -> ResultT String IO (L.JobId, JobStatus)
+waitForJob c jid = waitForJob' J.JSNull 0 where
+ waitForJob' prevJob prevLog = do
+ rval <- mkResultT' $ L.callMethod (L.WaitForJobChange jid ["status"]
+ prevJob (J.showJSON prevLog)
+ C.luxiWfjcTimeout) c
+ let parsed = J.readJSON rval
+ :: (J.Result ( [JobStatus]
+ , [ (Int, J.JSValue, J.JSValue, J.JSValue)]))
+ (status, logs) <- case parsed of
+ J.Ok ([s], ls) -> return (s, ls)
+ J.Ok (s, _) -> fail $ "Expected precisely one job status, got " ++ show s
+ J.Error x -> fail $ show x
+ let pLog = maximum $ prevLog : map (\(cnt, _, _, _) -> cnt) logs
+ if status > JOB_STATUS_RUNNING
+ then return (jid, status)
+ else waitForJob' (J.showJSON [status]) pLog
+
+
-- | Polls a set of jobs at an increasing interval until all are finished one
-- way or another.
waitForJobs :: [L.JobId] -> L.Client -> IO (Result [(L.JobId, JobStatus)])
-waitForJobs jids client = waitForJobs' 500000 15000000
- where
- waitForJobs' delay maxdelay = do
- -- TODO: this should use WaitForJobChange once it's available in Haskell
- -- land, instead of a fixed schedule of sleeping intervals.
- threadDelay delay
- sts <- L.queryJobsStatus client jids
- case sts of
- Bad e -> return . Bad $ "Checking job status: " ++ formatError e
- Ok sts' -> if any (<= JOB_STATUS_RUNNING) sts' then
- waitForJobs' (min (delay * 2) maxdelay) maxdelay
- else
- return . Ok $ zip jids sts'
+waitForJobs jids = runResultT . forM jids . waitForJob
--- | Execute jobs and return @Ok@ only if all of them succeeded.
-execJobsWaitOk :: [[MetaOpCode]] -> L.Client -> IO (Result ())
-execJobsWaitOk opcodes client = do
+-- | Execute jobs and return @Ok@ only if all of them succeeded; in
+-- this case, also return the list of Job IDs.
+execJobsWaitOkJid :: [[MetaOpCode]] -> L.Client -> IO (Result [JobId])
+execJobsWaitOkJid opcodes client = do
let nullog = const (return () :: IO ())
failed = filter ((/=) JOB_STATUS_SUCCESS . snd)
fmtfail (i, s) = show (fromJobId i) ++ "=>" ++ jobStatusToRaw s
@@ -174,7 +189,28 @@
case sts of
Bad e -> return $ Bad e
Ok sts' -> return (if null $ failed sts' then
- Ok ()
+ Ok $ map fst sts'
else
Bad ("The following jobs failed: " ++
(intercalate ", " . map fmtfail $ failed sts')))
+
+-- | Execute jobs and return @Ok@ only if all of them succeeded.
+execJobsWaitOk :: [[MetaOpCode]] -> L.Client -> IO (Result ())
+execJobsWaitOk opcodes =
+ fmap void . execJobsWaitOkJid opcodes
+
+-- | Change Migrations to Failovers
+forceFailover :: OpCode -> OpCode
+forceFailover op@(OpInstanceMigrate {}) =
+ let timeout = runIdentity $ mkNonNegative C.defaultShutdownTimeout
+ in OpInstanceFailover { opInstanceName = opInstanceName op
+ , opInstanceUuid = opInstanceUuid op
+ , opShutdownTimeout = timeout
+ , opIgnoreConsistency = True
+ , opTargetNode = opTargetNode op
+ , opTargetNodeUuid = opTargetNodeUuid op
+ , opIgnoreIpolicy = opIgnoreIpolicy op
+ , opMigrationCleanup = opMigrationCleanup op
+ , opIallocator = opIallocator op
+ }
+forceFailover op = op
diff --git a/src/Ganeti/Kvmd.hs b/src/Ganeti/Kvmd.hs
index 4979396..597298b 100644
--- a/src/Ganeti/Kvmd.hs
+++ b/src/Ganeti/Kvmd.hs
@@ -59,13 +59,13 @@
module Ganeti.Kvmd where
-import Prelude hiding (rem)
+import Prelude ()
+import Ganeti.Prelude hiding (rem)
-import Control.Applicative ((<$>))
import Control.Exception (try)
import Control.Concurrent
import Control.Monad (unless, when)
-import Data.List
+import Data.List (isPrefixOf, isInfixOf)
import Data.Set (Set)
import qualified Data.Set as Set (delete, empty, insert, member)
import System.Directory
diff --git a/src/Ganeti/Lens.hs b/src/Ganeti/Lens.hs
index c7951e6..ca4719d 100644
--- a/src/Ganeti/Lens.hs
+++ b/src/Ganeti/Lens.hs
@@ -44,7 +44,10 @@
, atSet
) where
-import Control.Applicative ((<$>), WrappedMonad(..))
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (WrappedMonad(..))
import Control.Lens
import Control.Monad
import Data.Functor.Compose (Compose(..))
diff --git a/src/Ganeti/Locking/Allocation.hs b/src/Ganeti/Locking/Allocation.hs
index d1caa2a..4a681b4 100644
--- a/src/Ganeti/Locking/Allocation.hs
+++ b/src/Ganeti/Locking/Allocation.hs
@@ -50,9 +50,12 @@
, freeLocks
) where
-import Control.Applicative (liftA2, (<$>), (<*>), pure)
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (liftA2)
import Control.Arrow (second, (***))
-import Control.Monad
+import Control.Monad (unless, guard, foldM, when)
import Data.Foldable (for_, find)
import Data.List (foldl')
import qualified Data.Map as M
diff --git a/src/Ganeti/Locking/Locks.hs b/src/Ganeti/Locking/Locks.hs
index e5bf524..1401b4f 100644
--- a/src/Ganeti/Locking/Locks.hs
+++ b/src/Ganeti/Locking/Locks.hs
@@ -44,7 +44,9 @@
, lockLevel
) where
-import Control.Applicative ((<$>), (<*>), pure)
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Monad ((>=>), liftM)
import Data.List (stripPrefix)
import System.Posix.Types (ProcessID)
diff --git a/src/Ganeti/Logging.hs b/src/Ganeti/Logging.hs
index cf5a3fd..a1f42d6 100644
--- a/src/Ganeti/Logging.hs
+++ b/src/Ganeti/Logging.hs
@@ -60,15 +60,16 @@
, isDebugMode
) where
-import Control.Applicative ((<$>))
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Monad
-import Control.Monad.Error (Error(..), MonadError(..), catchError)
+import Control.Monad.Error.Class (MonadError(..))
import Control.Monad.Reader
import qualified Control.Monad.RWS.Strict as RWSS
import qualified Control.Monad.State.Strict as SS
import Control.Monad.Trans.Identity
import Control.Monad.Trans.Maybe
-import Data.Monoid
import System.Log.Logger
import System.Log.Handler.Simple
import System.Log.Handler.Syslog
@@ -76,7 +77,7 @@
import System.Log.Formatter
import System.IO
-import Ganeti.BasicTypes (ResultT(..))
+import Ganeti.BasicTypes (ResultT(..), FromString(..))
import Ganeti.THH
import qualified Ganeti.ConstantUtils as ConstantUtils
@@ -168,7 +169,7 @@
instance (MonadLog m, Monoid w) => MonadLog (RWSS.RWST r w s m) where
logAt p = lift . logAt p
-instance (MonadLog m, Error e) => MonadLog (ResultT e m) where
+instance (MonadLog m, FromString e) => MonadLog (ResultT e m) where
logAt p = lift . logAt p
-- | Log at debug level.
diff --git a/src/Ganeti/Luxi.hs b/src/Ganeti/Luxi.hs
index f763eee..831e859 100644
--- a/src/Ganeti/Luxi.hs
+++ b/src/Ganeti/Luxi.hs
@@ -60,6 +60,8 @@
, recvMsgExt
, sendMsg
, allLuxiCalls
+ , extractArray
+ , fromJValWithStatus
) where
import Control.Applicative (optional, liftA, (<|>))
@@ -71,7 +73,7 @@
import Ganeti.BasicTypes
import Ganeti.Constants
import Ganeti.Errors
-import Ganeti.JSON (fromJResult, fromJVal, Tuple5(..), MaybeForJSON(..), TimeAsDoubleJSON(..))
+import Ganeti.JSON (fromJResult, fromJVal, fromObj, Tuple5(..), MaybeForJSON(..), TimeAsDoubleJSON(..))
import Ganeti.UDSServer
import Ganeti.Objects
import Ganeti.OpParams (pTagsObject)
@@ -381,3 +383,41 @@
LuxiError "Missing job status field"
else Ok (map head vals)
J.Error x -> Bad $ LuxiError x
+
+-- * Utility functions
+
+-- | Get values behind \"data\" part of the result.
+getData :: (Monad m) => JSValue -> m JSValue
+getData (JSObject o) = fromObj (fromJSObject o) "data"
+getData x = fail $ "Invalid input, expected dict entry but got " ++ show x
+
+-- | Converts a (status, value) into m value, if possible.
+parseQueryField :: (Monad m) => JSValue -> m (JSValue, JSValue)
+parseQueryField (JSArray [status, result]) = return (status, result)
+parseQueryField o =
+ fail $ "Invalid query field, expected (status, value) but got " ++ show o
+
+-- | Parse a result row.
+parseQueryRow :: (Monad m) => JSValue -> m [(JSValue, JSValue)]
+parseQueryRow (JSArray arr) = mapM parseQueryField arr
+parseQueryRow o =
+ fail $ "Invalid query row result, expected array but got " ++ show o
+
+-- | Parse an overall query result and get the [(status, value)] list
+-- for each element queried.
+parseQueryResult :: (Monad m) => JSValue -> m [[(JSValue, JSValue)]]
+parseQueryResult (JSArray arr) = mapM parseQueryRow arr
+parseQueryResult o =
+ fail $ "Invalid query result, expected array but got " ++ show o
+
+-- | Prepare resulting output as parsers expect it.
+extractArray :: (Monad m) => JSValue -> m [[(JSValue, JSValue)]]
+extractArray v =
+ getData v >>= parseQueryResult
+
+-- | Testing result status for more verbose error message.
+fromJValWithStatus :: (J.JSON a, Monad m) => (JSValue, JSValue) -> m a
+fromJValWithStatus (st, v) = do
+ st' <- fromJVal st
+ Qlang.checkRS st' v >>= fromJVal
+
diff --git a/src/Ganeti/MaintD/Autorepairs.hs b/src/Ganeti/MaintD/Autorepairs.hs
new file mode 100644
index 0000000..ce86d06
--- /dev/null
+++ b/src/Ganeti/MaintD/Autorepairs.hs
@@ -0,0 +1,236 @@
+{-| Auto-repair task of the maintenance daemon.
+
+This module implements the non-pure parts of harep-style
+repairs carried out by the maintenance daemon.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.Autorepairs
+ ( harepTasks
+ ) where
+
+import Control.Arrow (second, (***))
+import Control.Monad (forM)
+import Control.Exception (bracket)
+import Data.Maybe (isJust, fromJust)
+import qualified Data.Set as Set
+import System.IO.Error (tryIOError)
+import System.Time (getClockTime)
+
+import Ganeti.BasicTypes
+import Ganeti.Errors (formatError)
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Instance as Instance
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.HTools.Repair
+import Ganeti.HTools.Types
+import Ganeti.JQueue (currentTimestamp)
+import Ganeti.Jobs (execJobsWaitOkJid, submitJobs)
+import Ganeti.Logging.Lifted
+import qualified Ganeti.Luxi as L
+import Ganeti.MaintD.Utils (annotateOpCode)
+import Ganeti.OpCodes (OpCode(..))
+import qualified Ganeti.Path as Path
+import Ganeti.Types (JobId, JobStatus(..), TagKind(..), mkNonNegative)
+import Ganeti.Utils (newUUID, logAndBad)
+
+-- | Apply and remove tags from an instance indicated by `InstanceData`.
+commitChange :: L.Client
+ -> InstanceData
+ -> ResultT String IO (InstanceData, [JobId])
+commitChange client instData = do
+ now <- liftIO currentTimestamp
+ let arData = getArData $ arState instData
+ iname = Instance.name $ arInstance instData
+ rmTags = tagsToRemove instData
+ addJobs <- if isJust arData
+ then do
+ let tag = arTag $ fromJust arData
+ logDebug $ "Adding tag " ++ tag ++ " to " ++ iname
+ mkResultT $ execJobsWaitOkJid
+ [[ annotateOpCode "harep state tagging" now
+ . OpTagsSet TagKindInstance [tag]
+ $ Just iname ]]
+ client
+ else return []
+ rmJobs <- if null rmTags
+ then return []
+ else do
+ logDebug $ "Removing tags " ++ show rmTags ++ " from " ++ iname
+ mkResultT $ execJobsWaitOkJid
+ [[ annotateOpCode "harep state tag removal" now
+ . OpTagsDel TagKindInstance rmTags
+ $ Just iname ]]
+ client
+ return (instData { tagsToRemove = [] }, addJobs ++ rmJobs)
+
+-- | Query jobs of a pending repair, returning the new instance data.
+processPending :: L.Client
+ -> InstanceData
+ -> IO (Result (InstanceData, [JobId]))
+processPending client instData = runResultT $ case arState instData of
+ (ArPendingRepair arData) -> do
+ sts <- liftIO . L.queryJobsStatus client $ arJobs arData
+ time <- liftIO getClockTime
+ case sts of
+ Bad e -> mkResultT . logAndBad
+ $ "Could not check job status: " ++ formatError e
+ Ok sts' ->
+ if any (<= JOB_STATUS_RUNNING) sts' then
+ return (instData, [])
+ else do
+ let iname = Instance.name $ arInstance instData
+ srcSt = arStateName $ arState instData
+ arState' =
+ if all (== JOB_STATUS_SUCCESS) sts' then
+ ArHealthy . Just
+ . updateTag $ arData { arResult = Just ArSuccess
+ , arTime = time }
+ else
+ ArFailedRepair . updateTag
+ $ arData { arResult = Just ArFailure, arTime = time }
+ destSt = arStateName arState'
+ instData' = instData { arState = arState'
+ , tagsToRemove = delCurTag instData
+ }
+ logInfo $ "Moving " ++ iname ++ " form " ++ show srcSt ++ " to "
+ ++ show destSt
+ commitChange client instData'
+ _ -> return (instData, [])
+
+-- | Perform the suggested repair on an instance if its policy allows it
+-- and return the list of submitted jobs.
+doRepair :: L.Client
+ -> InstanceData
+ -> (AutoRepairType, [OpCode])
+ -> IO (Result ([Idx], [JobId]))
+doRepair client instData (rtype, opcodes) = runResultT $ do
+ let inst = arInstance instData
+ ipol = Instance.arPolicy inst
+ iname = Instance.name inst
+ case ipol of
+ ArEnabled maxtype -> do
+ uuid <- liftIO newUUID
+ time <- liftIO getClockTime
+ if rtype > maxtype then do
+ let arState' = ArNeedsRepair (
+ updateTag $ AutoRepairData rtype uuid time [] (Just ArEnoperm) "")
+ instData' = instData { arState = arState'
+ , tagsToRemove = delCurTag instData
+ }
+ logInfo $ "Not performing repair of type " ++ show rtype ++ " on "
+ ++ iname ++ " because only repairs up to " ++ show maxtype
+ ++ " are allowed"
+ (_, jobs) <- commitChange client instData'
+ return ([], jobs)
+ else do
+ now <- liftIO currentTimestamp
+ logInfo $ "Executing " ++ show rtype ++ " repair on " ++ iname
+ -- As in harep, we delay the actual repair, to allow the tagging
+ -- to happen first; again this is only about speeding up the harep
+ -- round, not about correctness.
+ let opcodes' = OpTestDelay { opDelayDuration = 10
+ , opDelayOnMaster = True
+ , opDelayOnNodes = []
+ , opDelayOnNodeUuids = Nothing
+ , opDelayRepeat = fromJust $ mkNonNegative 0
+ , opDelayInterruptible = False
+ , opDelayNoLocks = False
+ } : opcodes
+ jids <- liftIO $ submitJobs
+ [ map (annotateOpCode "harep-style repair" now)
+ opcodes'] client
+ case jids of
+ Bad e -> mkResultT . logAndBad $ "Failure submitting repair jobs: "
+ ++ e
+ Ok jids' -> do
+ let arState' = ArPendingRepair (
+ updateTag $ AutoRepairData rtype uuid time jids' Nothing "")
+ instData' = instData { arState = arState'
+ , tagsToRemove = delCurTag instData
+ }
+ (_, tagjobs) <- commitChange client instData'
+ let nodes = filter (>= 0) [Instance.pNode inst, Instance.sNode inst]
+ return (nodes, jids' ++ tagjobs)
+ otherSt -> do
+ logDebug $ "Not repairing " ++ iname ++ " because it is in state "
+ ++ show otherSt
+ return ([], [])
+
+-- | Harep-like repair tasks.
+harepTasks :: (Node.List, Instance.List) -- ^ Current cluster configuration
+ -> Set.Set Int -- ^ Node indices on which actions may be taken
+ -> ResultT String IO (Set.Set Int, [JobId])
+ -- ^ untouched nodes and jobs submitted
+harepTasks (nl, il) nidxs = do
+ logDebug $ "harep tasks on nodes " ++ show (Set.toList nidxs)
+ iniData <- mkResultT . return . mapM setInitialState $ Container.elems il
+
+ -- First step: check all pending repairs, see if they are completed.
+ luxiSocket <- liftIO Path.defaultQuerySocket
+ either_iData <- liftIO . tryIOError
+ . bracket (L.getLuxiClient luxiSocket) L.closeClient
+ $ forM iniData . processPending
+ (iData', jobs) <- mkResultT $ case either_iData of
+ Left e -> logAndBad $ "Error while harep status update: "
+ ++ show e
+ Right r ->
+ if any isBad r
+ then logAndBad $ "Bad harep processing pending: "
+ ++ show (justBad r)
+ else return . Ok . second concat . unzip $ justOk r
+
+ -- Second step: detect any problems.
+ let repairs = map (detectBroken nl . arInstance) iData'
+
+ -- Third step: create repair jobs for broken instances that are in ArHealthy.
+ let repairIfHealthy c i = case arState i of
+ ArHealthy _ -> doRepair c i
+ _ -> const . return $ Ok ([], [])
+ maybeRepair c (i, r) = maybe (return $ Ok ([], []))
+ (repairIfHealthy c i) r
+ either_repairJobs <- liftIO . tryIOError
+ . bracket (L.getLuxiClient luxiSocket) L.closeClient
+ $ forM (zip iData' repairs) . maybeRepair
+
+ (ntouched, jobs') <- mkResultT $ case either_repairJobs of
+ Left e -> logAndBad $ "Error while attempting repair: "
+ ++ show e
+ Right r ->
+ if any isBad r
+ then logAndBad $ "Error submitting repair jobs: "
+ ++ show (justBad r)
+ else return . Ok . (concat *** concat) . unzip
+ $ justOk r
+
+ return (nidxs Set.\\ Set.fromList ntouched, jobs ++ jobs' )
diff --git a/src/Ganeti/MaintD/Balance.hs b/src/Ganeti/MaintD/Balance.hs
new file mode 100644
index 0000000..d48fb5d
--- /dev/null
+++ b/src/Ganeti/MaintD/Balance.hs
@@ -0,0 +1,347 @@
+{-| Balancing task of the maintenance daemon.
+
+This module carries out the automated balancing done by the
+maintenance daemon. The actual balancing algorithm is imported
+from htools.
+
+-}
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.Balance
+ ( balanceTask
+ ) where
+
+import Control.Arrow ((***), (&&&))
+import Control.Exception.Lifted (bracket)
+import Control.Monad (liftM, unless, when)
+import Control.Monad.IO.Class (liftIO)
+import Data.IORef (IORef)
+import qualified Data.IntMap as IntMap
+import qualified Data.Set as Set
+import qualified Data.Map as Map
+import Data.Maybe (mapMaybe, isJust)
+import qualified Data.Traversable as Traversable
+import System.IO.Error (tryIOError)
+import Text.Printf (printf)
+
+import Ganeti.BasicTypes ( ResultT, mkResultT, mkResultT'
+ , GenericResult(..), Result)
+import Ganeti.Cpu.Types (emptyCPUavgload, CPUavgload(..))
+import Ganeti.HTools.AlgorithmParams (AlgorithmOptions(..), defaultOptions)
+import qualified Ganeti.HTools.Backend.MonD as MonD
+import qualified Ganeti.HTools.Cluster as Cluster
+import qualified Ganeti.HTools.Cluster.Metrics as Metrics
+import qualified Ganeti.HTools.Cluster.Utils as ClusterUtils
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Instance as Instance
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.JQueue (currentTimestamp)
+import Ganeti.JQueue.Objects (Timestamp)
+import Ganeti.Jobs (submitJobs)
+import Ganeti.HTools.Types ( zeroUtil, DynUtil(cpuWeight), addUtil, subUtil
+ , MoveJob, iPolicyMemoryRatio)
+import Ganeti.Logging.Lifted (logDebug)
+import Ganeti.MaintD.MemoryState ( MemoryState, getEvacuated
+ , addEvacuated, rmEvacuated)
+import Ganeti.MaintD.Utils (annotateOpCode)
+import qualified Ganeti.Luxi as L
+import Ganeti.OpCodes (MetaOpCode)
+import qualified Ganeti.Path as Path
+import qualified Ganeti.Query.Language as Qlang
+import Ganeti.Types (JobId)
+import Ganeti.Utils (logAndBad)
+
+-- * Collection of dynamic load data
+
+data AllReports = AllReports { rTotal :: MonD.Report
+ , rIndividual :: MonD.Report
+ , rMem :: MonD.Report
+ }
+
+-- | Empty report. It describes an idle node and can be used as
+-- default value for nodes marked as offline.
+emptyReports :: AllReports
+emptyReports = AllReports (MonD.CPUavgloadReport emptyCPUavgload)
+ (MonD.InstanceCpuReport Map.empty)
+ (MonD.InstanceRSSReport Map.empty)
+
+-- | Query a node unless it is offline and return all
+-- CPU reports. For offline nodes return the empty report.
+queryNode :: Node.Node -> ResultT String IO AllReports
+queryNode node = do
+ let getReport dc = mkResultT
+ . liftM (maybe (Bad $ "Failed collecting "
+ ++ MonD.dName dc
+ ++ " from " ++ Node.name node) Ok
+ . MonD.mkReport dc)
+ $ MonD.fromCurl dc node
+ if Node.offline node
+ then return emptyReports
+ else do
+ total <- getReport MonD.totalCPUCollector
+ xeninstances <- getReport MonD.xenCPUCollector
+ rssinstances <- getReport MonD.kvmRSSCollector
+ return $ AllReports total xeninstances rssinstances
+
+-- | Get a map with the CPU live data for all nodes; for offline nodes
+-- the empty report is guessed.
+queryLoad :: Node.List -> ResultT String IO (Container.Container AllReports)
+queryLoad = Traversable.mapM queryNode
+
+-- | Ask luxid about the hypervisors used. As, at the moment, we only
+-- have specialised CPU collectors for xen, we're only interested in which
+-- instances run under the Xen hypervisor.
+getXenInstances :: ResultT String IO (Set.Set String)
+getXenInstances = do
+ let query = L.Query (Qlang.ItemTypeOpCode Qlang.QRInstance)
+ ["name", "hypervisor"] Qlang.EmptyFilter
+ luxiSocket <- liftIO Path.defaultQuerySocket
+ raw <- bracket (mkResultT . liftM (either (Bad . show) Ok)
+ . tryIOError $ L.getLuxiClient luxiSocket)
+ (liftIO . L.closeClient)
+ $ mkResultT' . L.callMethod query
+ answer <- L.extractArray raw >>= mapM (mapM L.fromJValWithStatus)
+ let getXen [name, hv] | hv `elem` ["xen-pvm", "xen-hvm"] = [name]
+ getXen _ = []
+ return $ Set.fromList (answer >>= getXen)
+
+-- | Look for an instance in a given report.
+findInstanceLoad :: String -> AllReports -> Maybe Double
+findInstanceLoad name r | MonD.InstanceCpuReport m <- rIndividual r =
+ Map.lookup name m
+findInstanceLoad _ _ = Nothing
+
+-- | Update the CPU load of one instance based on the reports.
+-- Fail if instance CPU load is not (yet) available. However, do
+-- accept missing load data for instances on offline nodes, as well
+-- as old load data for recently migrated instances.
+updateCPUInstance :: Node.List
+ -> Container.Container AllReports
+ -> Set.Set String
+ -> [String]
+ -> Instance.Instance
+ -> Result Instance.Instance
+updateCPUInstance nl reports xeninsts evacuated inst =
+ let name = Instance.name inst
+ nidx = Instance.pNode inst
+ in if name `Set.member` xeninsts
+ then let onNodeLoad = findInstanceLoad name (Container.find nidx reports)
+ allLoads = mapMaybe (findInstanceLoad name)
+ $ Container.elems reports
+ in case () of
+ _ | Just load <- onNodeLoad ->
+ return $ inst { Instance.util = zeroUtil { cpuWeight = load } }
+ _ | (load:_) <- allLoads ->
+ return $ inst { Instance.util = zeroUtil { cpuWeight = load } }
+ _ | Node.offline $ Container.find nidx nl ->
+ return $ inst { Instance.util = zeroUtil }
+ _ | Instance.name inst `elem` evacuated ->
+ return $ inst { Instance.util = zeroUtil }
+ _ -> fail $ "Xen CPU data unavailable for " ++ name
+ else let rep = rTotal $ Container.find nidx reports
+ in case rep of MonD.CPUavgloadReport (CPUavgload _ _ ndload) ->
+ let w = ndload * fromIntegral (Instance.vcpus inst)
+ / (fromIntegral . Node.uCpu
+ $ Container.find nidx nl)
+ in return $ inst { Instance.util =
+ zeroUtil { cpuWeight = w }}
+ _ -> fail $ "CPU data unavailable for node of " ++ name
+
+-- | Update CPU usage data based on the collected reports. That is, get the
+-- CPU usage of all instances from the reports and also update the nodes
+-- accordingly.
+updateCPULoad :: (Node.List, Instance.List)
+ -> Container.Container AllReports
+ -> Set.Set String
+ -> [ String ]
+ -> Result (Node.List, Instance.List)
+updateCPULoad (nl, il) reports xeninsts evacuated = do
+ il' <- Traversable.mapM (updateCPUInstance nl reports xeninsts evacuated) il
+ let addNodeUtil n delta = n { Node.utilLoad = addUtil (Node.utilLoad n) delta
+ , Node.utilLoadForth =
+ addUtil (Node.utilLoadForth n) delta
+ }
+ let updateNodeUtil nnl inst_old inst_new =
+ let delta = subUtil (Instance.util inst_new) $ Instance.util inst_old
+ nidx = Instance.pNode inst_old
+ n = Container.find nidx nnl
+ n' = addNodeUtil n delta
+ in Container.add nidx n' nnl
+ let nl' = foldl (\nnl i -> updateNodeUtil nnl (Container.find i il)
+ $ Container.find i il') nl $ Container.keys il
+ return (nl', il')
+
+-- | For an instance, given by name, verify if an individual load report is
+-- available again.
+cleanUpEvacuation :: IORef MemoryState
+ -> Instance.List
+ -> Container.Container AllReports
+ -> String
+ -> IO ()
+cleanUpEvacuation memstate il reports name = do
+ let insts = filter ((==) name . Instance.name) $ Container.elems il
+ case insts of
+ [] -> do
+ logDebug $ "Instnace " ++ name ++ "no longer on the cluster"
+ rmEvacuated memstate name
+ inst:_ -> do
+ let nidx = Instance.pNode inst
+ when (isJust . findInstanceLoad name
+ $ Container.find nidx reports) $ do
+ logDebug $ "Load data for " ++ name ++ " available again"
+ rmEvacuated memstate name
+
+-- * Balancing
+
+-- | Transform an instance move into a submittable job.
+moveToJob :: Timestamp -> (Node.List, Instance.List) -> MoveJob -> [MetaOpCode]
+moveToJob now (nl, il) (_, idx, move, _) =
+ let opCodes = Cluster.iMoveToJob nl il idx move
+ in map (annotateOpCode "auto-balancing the cluster" now) opCodes
+
+-- | Iteratively improve a cluster by iterating over tryBalance.
+iterateBalance :: AlgorithmOptions
+ -> Cluster.Table -- ^ the starting table
+ -> [MoveJob] -- ^ current command list
+ -> [MoveJob] -- ^ resulting commands
+iterateBalance opts ini_tbl cmds =
+ let Cluster.Table ini_nl ini_il _ _ = ini_tbl
+ m_next_tbl = Cluster.tryBalance opts ini_tbl
+ in case m_next_tbl of
+ Just next_tbl@(Cluster.Table _ _ _ plc@(curplc:_)) ->
+ let (idx, _, _, move, _) = curplc
+ plc_len = length plc
+ (_, cs) = Cluster.printSolutionLine ini_nl ini_il 1 1 curplc plc_len
+ afn = Cluster.involvedNodes ini_il curplc
+ cmds' = (afn, idx, move, cs):cmds
+ in iterateBalance opts next_tbl cmds'
+ _ -> cmds
+
+-- | List instances evacuated in a move job, if any.
+evacuatedInsts :: (Node.List, Instance.List)
+ -> MoveJob
+ -> [String]
+evacuatedInsts (nl, il) (_, idx, _, _) =
+ let inst = Container.find idx il
+ node = Container.find (Instance.pNode inst) nl
+ in [Instance.name inst | Node.offline node]
+
+-- | Balance a single group, restricted to the allowed nodes and
+-- minimal gain.
+balanceGroup :: IORef MemoryState
+ -> Set.Set String
+ -> L.Client
+ -> Set.Set Int
+ -> Double
+ -> (Int, (Node.List, Instance.List))
+ -> ResultT String IO [JobId]
+balanceGroup memstate xens client allowedNodes threshold (gidx, (nl, il)) = do
+ logDebug $ printf "Balancing group %d, %d nodes, %d instances." gidx
+ (Container.size nl) (Container.size il)
+ let ini_cv = Metrics.compCV nl
+ ini_tbl = Cluster.Table nl il ini_cv []
+ opts = defaultOptions { algAllowedNodes = Just allowedNodes
+ , algMinGain = threshold
+ , algMinGainLimit = 10 * threshold
+ }
+ cmds = iterateBalance opts ini_tbl []
+ tasks = take 1 $ Cluster.splitJobs cmds
+ logDebug $ "First task group: " ++ show tasks
+ now <- liftIO currentTimestamp
+ let jobs = tasks >>= map (moveToJob now (nl, il))
+ evacs = filter (`Set.member` xens)
+ (concat tasks >>= evacuatedInsts (nl, il))
+ if null jobs
+ then return []
+ else do
+ unless (null evacs) $ do
+ logDebug $ "Evacuation of instances " ++ show evacs
+ liftIO $ addEvacuated memstate evacs
+ jids <- liftIO $ submitJobs jobs client
+ case jids of
+ Bad e -> mkResultT . logAndBad
+ $ "Failure submitting balancing jobs: " ++ e
+ Ok jids' -> return jids'
+
+-- * Memory balancing
+
+-- | Decide the weight that dynamic memory utilization should have
+-- based on the memory-over-commitment ratio. This function is likely
+-- to change once more experience with memory over-committed clusters
+-- is gained.
+weightFromMemRatio :: Double -> Double
+weightFromMemRatio f = 0.0 `max` (f - 1) * 5.0
+
+-- | Apply the memory data to the cluster data.
+useMemData :: Double
+ -> Container.Container AllReports
+ -> (Node.List, Instance.List)
+ -> ResultT String IO (Node.List, Instance.List)
+useMemData ratio allreports (nl, il) = do
+ logDebug "Taking dynamic memory data into account"
+ let memoryReports =
+ map (flip Container.find nl *** rMem) $ IntMap.toList allreports
+ mkResultT . return . liftM (MonD.scaleMemoryWeight (weightFromMemRatio ratio))
+ $ MonD.useInstanceRSSData memoryReports (nl, il)
+
+-- * Interface function
+
+-- | Carry out all the needed balancing, based on live CPU data, only touching
+-- the available nodes. Only carry out balancing steps where the gain is above
+-- the threshold.
+balanceTask :: IORef MemoryState
+ -> (Node.List, Instance.List) -- ^ current cluster configuration
+ -> Set.Set Int -- ^ node indices on which actions may be taken
+ -> Double -- ^ threshold for improvement
+ -> ResultT String IO [JobId] -- ^ jobs submitted
+balanceTask memstate (nl, il) okNodes threshold = do
+ logDebug "Collecting dynamic load values"
+ evacuated <- getEvacuated memstate
+ logDebug $ "Not expecting load data from: " ++ show evacuated
+ reports <- queryLoad nl
+ xenInstances <- getXenInstances
+ (nl', il') <- mkResultT . return
+ $ updateCPULoad (nl, il) reports xenInstances evacuated
+ liftIO $ mapM_ (cleanUpEvacuation memstate il reports) evacuated
+ let memoryOvercommitment =
+ maximum . (0.0:) . map (iPolicyMemoryRatio .Node.iPolicy)
+ $ IntMap.elems nl
+ logDebug $ "Memory over-commitment ratio is " ++ show memoryOvercommitment
+ (nl'', il'') <- if memoryOvercommitment > 1.0
+ then useMemData memoryOvercommitment reports (nl', il')
+ else return (nl', il')
+ logDebug . (++) "Dynamic node load: " . show
+ . map (Node.name &&& Node.utilLoad) $ Container.elems nl''
+ let ngroups = ClusterUtils.splitCluster nl'' il''
+ luxiSocket <- liftIO Path.defaultQuerySocket
+ bracket (liftIO $ L.getLuxiClient luxiSocket) (liftIO . L.closeClient) $ \c ->
+ liftM concat $ mapM (balanceGroup memstate xenInstances c okNodes threshold)
+ ngroups
diff --git a/src/Ganeti/MaintD/CleanupIncidents.hs b/src/Ganeti/MaintD/CleanupIncidents.hs
new file mode 100644
index 0000000..f8aaf92
--- /dev/null
+++ b/src/Ganeti/MaintD/CleanupIncidents.hs
@@ -0,0 +1,87 @@
+{-| Incident clean up in the maintenance daemon.
+
+This module implements the clean up of events that are finished,
+and acknowledged as such by the user.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.CleanupIncidents
+ ( cleanupIncidents
+ ) where
+
+import Control.Arrow ((&&&))
+import Control.Monad (unless)
+import Control.Monad.IO.Class (liftIO)
+import qualified Data.ByteString.UTF8 as UTF8
+import Data.IORef (IORef)
+
+import Ganeti.BasicTypes (ResultT, mkResultT)
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.Logging.Lifted
+import Ganeti.MaintD.MemoryState (MemoryState, getIncidents, rmIncident)
+import Ganeti.Objects.Maintenance (Incident(..), RepairStatus(..))
+import Ganeti.Utils (logAndBad)
+
+-- | Remove a single incident, provided the corresponding tag
+-- is no longer present.
+cleanupIncident :: IORef MemoryState
+ -> Node.List
+ -> Incident
+ -> ResultT String IO ()
+cleanupIncident memstate nl incident = do
+ let location = incidentNode incident
+ uuid = incidentUuid incident
+ tag = incidentTag incident
+ nodes = filter ((==) location . Node.name) $ Container.elems nl
+ case nodes of
+ [] -> do
+ logInfo $ "No node any more with name " ++ location
+ ++ "; will forget event " ++ UTF8.toString uuid
+ liftIO . rmIncident memstate $ UTF8.toString uuid
+ [nd] -> unless (tag `elem` Node.nTags nd) $ do
+ logInfo $ "Tag " ++ tag ++ " removed on " ++ location
+ ++ "; will forget event " ++ UTF8.toString uuid
+ liftIO . rmIncident memstate $ UTF8.toString uuid
+ _ -> mkResultT . logAndBad
+ $ "Found More than one node with name " ++ location
+
+-- | Remove all incidents from the record that are in a final state
+-- and additionally the node tag for that incident has been removed.
+cleanupIncidents :: IORef MemoryState -> Node.List -> ResultT String IO ()
+cleanupIncidents memstate nl = do
+ incidents <- getIncidents memstate
+ let finalized = filter ((> RSPending) . incidentRepairStatus) incidents
+ logDebug . (++) "Finalized incidents " . show
+ $ map (incidentNode &&& incidentUuid) finalized
+ mapM_ (cleanupIncident memstate nl) finalized
diff --git a/src/Ganeti/MaintD/CollectIncidents.hs b/src/Ganeti/MaintD/CollectIncidents.hs
new file mode 100644
index 0000000..ba31569
--- /dev/null
+++ b/src/Ganeti/MaintD/CollectIncidents.hs
@@ -0,0 +1,130 @@
+{-| Discovery of incidents by the maintenance daemon.
+
+This module implements the querying of all monitoring
+daemons for the value of the node-status data collector.
+Any new incident gets registered.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.CollectIncidents
+ ( collectIncidents
+ ) where
+
+import Control.Applicative (liftA2)
+import Control.Monad (unless)
+import Control.Monad.IO.Class (liftIO)
+import qualified Data.ByteString.UTF8 as UTF8
+import Data.IORef (IORef)
+import Network.Curl
+import System.Time (getClockTime)
+import qualified Text.JSON as J
+
+import Ganeti.BasicTypes (ResultT)
+import qualified Ganeti.Constants as C
+import qualified Ganeti.DataCollectors.Diagnose as D
+import Ganeti.DataCollectors.Types (getCategoryName)
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.Logging.Lifted
+import Ganeti.MaintD.MemoryState (MemoryState, getIncidents, updateIncident)
+import Ganeti.Objects.Maintenance
+import Ganeti.Utils (newUUID)
+
+-- | Query a node, unless it is offline, and return
+-- the payload of the report, if available. For offline
+-- nodes return nothing.
+queryStatus :: Node.Node -> IO (Maybe J.JSValue)
+queryStatus node = do
+ let name = Node.name node
+ let url = name ++ ":" ++ show C.defaultMondPort
+ ++ "/1/report/" ++ maybe "default" getCategoryName D.dcCategory
+ ++ "/" ++ D.dcName
+ if Node.offline node
+ then do
+ logDebug $ "Not asking " ++ name ++ "; it is offline"
+ return Nothing
+ else do
+ (code, body) <- liftIO $ curlGetString url []
+ case code of
+ CurlOK ->
+ case J.decode body of
+ J.Ok r -> return $ Just r
+ _ -> return Nothing
+ _ -> do
+ logWarning $ "Failed to contact " ++ name
+ return Nothing
+
+-- | Update the status of one node.
+updateNode :: IORef MemoryState -> Node.Node -> ResultT String IO ()
+updateNode memstate node = do
+ let name = Node.name node
+ logDebug $ "Inspecting " ++ name
+ report <- liftIO $ queryStatus node
+ case report of
+ Just (J.JSObject obj)
+ | Just orig@(J.JSObject origobj) <- lookup "data" $ J.fromJSObject obj,
+ Just s <- lookup "status" $ J.fromJSObject origobj,
+ J.Ok state <- J.readJSON s,
+ state /= RANoop -> do
+ let origs = J.encode orig
+ logDebug $ "Relevant event on " ++ name ++ ": " ++ origs
+ incidents <- getIncidents memstate
+ unless (any (liftA2 (&&)
+ ((==) name . incidentNode)
+ ((==) orig . incidentOriginal)) incidents) $ do
+ logInfo $ "Registering new incident on " ++ name ++ ": " ++ origs
+ uuid <- liftIO newUUID
+ now <- liftIO getClockTime
+ let tag = C.maintdSuccessTagPrefix ++ uuid
+ incident = Incident { incidentOriginal = orig
+ , incidentAction = state
+ , incidentRepairStatus = RSNoted
+ , incidentJobs = []
+ , incidentNode = name
+ , incidentTag = tag
+ , incidentUuid = UTF8.fromString uuid
+ , incidentCtime = now
+ , incidentMtime = now
+ , incidentSerial = 1
+ }
+ liftIO $ updateIncident memstate incident
+ _ -> return ()
+
+
+-- | Query all MonDs for updates on the node-status.
+collectIncidents :: IORef MemoryState -> Node.List -> ResultT String IO ()
+collectIncidents memstate nl = do
+ _ <- getIncidents memstate -- always update the memory state,
+ -- even if we do not observe anything
+ logDebug "Querying all nodes for incidents"
+ mapM_ (updateNode memstate) $ Container.elems nl
diff --git a/src/Ganeti/MaintD/FailIncident.hs b/src/Ganeti/MaintD/FailIncident.hs
new file mode 100644
index 0000000..917cb78
--- /dev/null
+++ b/src/Ganeti/MaintD/FailIncident.hs
@@ -0,0 +1,93 @@
+{-| Incident failing in the maintenance daemon
+
+This module implements the treatment of an incident, once
+a job failed.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.FailIncident
+ ( failIncident
+ ) where
+
+import Control.Exception.Lifted (bracket)
+import Control.Lens.Setter (over)
+import Control.Monad (liftM, when)
+import Control.Monad.IO.Class (liftIO)
+import qualified Data.ByteString.UTF8 as UTF8
+import Data.IORef (IORef)
+import System.IO.Error (tryIOError)
+
+import Ganeti.BasicTypes (ResultT, mkResultT, GenericResult(..))
+import qualified Ganeti.Constants as C
+import Ganeti.JQueue (currentTimestamp)
+import Ganeti.Jobs (execJobsWaitOkJid)
+import Ganeti.Logging.Lifted
+import qualified Ganeti.Luxi as L
+import Ganeti.MaintD.MemoryState (MemoryState, getIncidents, updateIncident)
+import Ganeti.MaintD.Utils (annotateOpCode)
+import Ganeti.Objects.Lens (incidentJobsL)
+import Ganeti.Objects.Maintenance (Incident(..), RepairStatus(..))
+import Ganeti.OpCodes (OpCode(..))
+import qualified Ganeti.Path as Path
+import Ganeti.Types (JobId, fromJobId, TagKind(..))
+
+-- | Mark an incident as failed.
+markAsFailed :: IORef MemoryState -> Incident -> ResultT String IO ()
+markAsFailed memstate incident = do
+ let uuid = incidentUuid incident
+ newtag = C.maintdFailureTagPrefix ++ UTF8.toString uuid
+ logInfo $ "Marking incident " ++ UTF8.toString uuid ++ " as failed"
+ now <- liftIO currentTimestamp
+ luxiSocket <- liftIO Path.defaultQuerySocket
+ jids <- bracket (mkResultT . liftM (either (Bad . show) Ok)
+ . tryIOError $ L.getLuxiClient luxiSocket)
+ (liftIO . L.closeClient)
+ (mkResultT . execJobsWaitOkJid
+ [[ annotateOpCode "marking incident handling as failed" now
+ . OpTagsSet TagKindNode [ newtag ]
+ . Just $ incidentNode incident ]])
+ let incident' = over incidentJobsL (++ jids)
+ $ incident { incidentRepairStatus = RSFailed
+ , incidentTag = newtag
+ }
+ liftIO $ updateIncident memstate incident'
+
+-- | Mark the incident, if any, belonging to the given job as
+-- failed after having tagged it appropriately.
+failIncident :: IORef MemoryState -> JobId -> ResultT String IO ()
+failIncident memstate jid = do
+ incidents <- getIncidents memstate
+ let affected = filter (elem jid . incidentJobs) incidents
+ when (null affected) . logInfo
+ $ "Job " ++ show (fromJobId jid) ++ " does not belong to an incident"
+ mapM_ (markAsFailed memstate) affected
diff --git a/src/Ganeti/MaintD/HandleIncidents.hs b/src/Ganeti/MaintD/HandleIncidents.hs
new file mode 100644
index 0000000..90831d0
--- /dev/null
+++ b/src/Ganeti/MaintD/HandleIncidents.hs
@@ -0,0 +1,300 @@
+{-| Incident handling in the maintenance daemon.
+
+This module implements the submission of actions for ongoing
+repair events reported by the node-status data collector.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.HandleIncidents
+ ( handleIncidents
+ ) where
+
+import Control.Arrow ((&&&))
+import Control.Exception.Lifted (bracket)
+import Control.Lens.Setter (over)
+import Control.Monad (foldM)
+import Control.Monad.IO.Class (liftIO)
+import qualified Data.ByteString.UTF8 as UTF8
+import Data.Function (on)
+import Data.IORef (IORef)
+import qualified Data.Map as Map
+import qualified Data.Set as Set
+import qualified Text.JSON as J
+
+import Ganeti.BasicTypes ( GenericResult(..), ResultT, mkResultT, Down(..))
+import qualified Ganeti.Constants as C
+import Ganeti.HTools.AlgorithmParams (AlgorithmOptions(..), defaultOptions)
+import Ganeti.HTools.Cluster.Evacuate (tryNodeEvac, EvacSolution(..))
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Group as Group
+import qualified Ganeti.HTools.Instance as Instance
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.HTools.Types (Idx)
+import Ganeti.JQueue (currentTimestamp)
+import Ganeti.Jobs (execJobsWaitOkJid, submitJobs, forceFailover)
+import Ganeti.Logging.Lifted
+import qualified Ganeti.Luxi as L
+import Ganeti.MaintD.MemoryState ( MemoryState, getIncidents, rmIncident
+ , updateIncident, appendJobs)
+import Ganeti.MaintD.Utils (annotateOpCode, getRepairCommand)
+import Ganeti.Objects.Lens (incidentJobsL)
+import Ganeti.Objects.Maintenance ( RepairStatus(..), RepairAction(..)
+ , Incident(..))
+import Ganeti.OpCodes (OpCode(..), MetaOpCode)
+import qualified Ganeti.Path as Path
+import Ganeti.Types ( cTimeOf, uuidOf, mkNonEmpty, fromJobId
+ , EvacMode(..), TagKind(..))
+import Ganeti.Utils (maxBy, logAndBad)
+
+-- | Given two incidents, choose the more severe one; for equally severe
+-- ones the older (by creation timestamp).
+moreSevereIncident :: Incident -> Incident -> Incident
+moreSevereIncident = maxBy (compare `on` incidentAction &&& (Down . cTimeOf))
+
+-- | From a given list of incidents, return, for each node,
+-- the one with the most severe action.
+rankIncidents :: [Incident] -> Map.Map String Incident
+rankIncidents = foldl (\m i -> Map.insertWith moreSevereIncident
+ (incidentNode i) i m) Map.empty
+
+-- | Generate a job to drain a given node.
+drainJob :: String -> ResultT String IO [ MetaOpCode ]
+drainJob name = do
+ name' <- mkNonEmpty name
+ now <- liftIO currentTimestamp
+ return $ map (annotateOpCode ("Draining " ++ name) now)
+ [ OpNodeSetParams { opNodeName = name'
+ , opNodeUuid = Nothing
+ , opForce = True
+ , opHvState = Nothing
+ , opDiskState = Nothing
+ , opMasterCandidate = Nothing
+ , opOffline = Nothing
+ , opDrained = Just True
+ , opAutoPromote = False
+ , opMasterCapable = Nothing
+ , opVmCapable = Nothing
+ , opSecondaryIp = Nothing
+ , opgenericNdParams = Nothing
+ , opPowered = Nothing
+ , opVerbose = False
+ , opDebug = False
+ } ]
+
+-- | Submit and register the next job for a node evacuation.
+handleEvacuation :: L.Client -- ^ Luxi client to use
+ -> IORef MemoryState -- ^ memory state of the daemon
+ -> (Group.List, Node.List, Instance.List) -- ^ cluster
+ -> Idx -- ^ index of the node to evacuate
+ -> Bool -- ^ whether to try migrations
+ -> Set.Set Int -- ^ allowed nodes for evacuation
+ -> Incident -- ^ the incident
+ -> ResultT String IO (Set.Set Int) -- ^ nodes still available
+handleEvacuation client memst (gl, nl, il) ndx migrate freenodes incident = do
+ let node = Container.find ndx nl
+ name = Node.name node
+ fNdNames = map (Node.name . flip Container.find nl) $ Set.elems freenodes
+ evacOpts = defaultOptions { algEvacMode = True
+ , algIgnoreSoftErrors = True
+ , algRestrictToNodes = Just fNdNames
+ }
+ evacFun = tryNodeEvac evacOpts gl nl il
+ migrateFun = if migrate then id else forceFailover
+ annotateFun = annotateOpCode $ "Evacuating " ++ name
+ pendingIncident = incident { incidentRepairStatus = RSPending }
+ updateJobs jids_r = case jids_r of
+ Ok jids -> do
+ let incident' = over incidentJobsL (++ jids) pendingIncident
+ liftIO $ updateIncident memst incident'
+ liftIO $ appendJobs memst jids
+ logDebug $ "Jobs submitted: " ++ show (map fromJobId jids)
+ Bad e -> mkResultT . logAndBad
+ $ "Failure evacuating " ++ name ++ ": " ++ e
+ logInstName i = logInfo $ "Evacuating instance "
+ ++ Instance.name (Container.find i il)
+ ++ " from " ++ name
+ execSol sol = do
+ now <- liftIO currentTimestamp
+ let jobs = map (map (annotateFun now . migrateFun)) $ esOpCodes sol
+ jids <- liftIO $ submitJobs jobs client
+ updateJobs jids
+ let touched = esMoved sol >>= \(_, _, nidxs) -> nidxs
+ return $ freenodes Set.\\ Set.fromList touched
+ logDebug $ "Handling evacuation of " ++ name
+ case () of _ | not $ Node.offline node -> do
+ logDebug $ "Draining node " ++ name
+ job <- drainJob name
+ jids <- liftIO $ submitJobs [job] client
+ updateJobs jids
+ return freenodes
+ | i:_ <- Node.pList node -> do
+ logInstName i
+ (_, _, sol) <- mkResultT . return $ evacFun ChangePrimary [i]
+ execSol sol
+ | i:_ <- Node.sList node -> do
+ logInstName i
+ (_, _, sol) <- mkResultT . return
+ $ evacFun ChangeSecondary [i]
+ execSol sol
+ | otherwise -> do
+ logDebug $ "Finished evacuation of " ++ name
+ now <- liftIO currentTimestamp
+ jids <- mkResultT $ execJobsWaitOkJid
+ [[ annotateFun now
+ . OpTagsSet TagKindNode [ incidentTag incident ]
+ $ Just name]] client
+ let incident' = over incidentJobsL (++ jids)
+ $ incident { incidentRepairStatus =
+ RSCompleted }
+ liftIO $ updateIncident memst incident'
+ liftIO $ appendJobs memst jids
+ return freenodes
+
+-- | Submit the next action for a live-repair incident.
+handleLiveRepairs :: L.Client -- ^ Luxi client to use
+ -> IORef MemoryState -- ^ memory state of the daemon
+ -> Idx -- ^ the node to handle the event on
+ -> Set.Set Int -- ^ unaffected nodes
+ -> Incident -- ^ the incident
+ -> ResultT String IO (Set.Set Int) -- ^ nodes still available
+handleLiveRepairs client memst ndx freenodes incident = do
+ let maybeCmd = getRepairCommand incident
+ uuid = incidentUuid incident
+ name = incidentNode incident
+ now <- liftIO currentTimestamp
+ logDebug $ "Handling requested command " ++ show maybeCmd ++ " on " ++ name
+ case () of
+ _ | null $ incidentJobs incident,
+ Just cmd <- maybeCmd,
+ cmd /= "" -> do
+ logDebug "Submitting repair command job"
+ name' <- mkNonEmpty name
+ cmd' <- mkNonEmpty cmd
+ orig' <- mkNonEmpty . J.encode $ incidentOriginal incident
+ jids_r <- liftIO $ submitJobs
+ [[ annotateOpCode "repair command requested by node" now
+ OpRepairCommand { opNodeName = name'
+ , opRepairCommand = cmd'
+ , opInput = Just orig'
+ } ]] client
+ case jids_r of
+ Ok jids -> do
+ let incident' = over incidentJobsL (++ jids) incident
+ liftIO $ updateIncident memst incident'
+ liftIO $ appendJobs memst jids
+ logDebug $ "Jobs submitted: " ++ show (map fromJobId jids)
+ Bad e -> mkResultT . logAndBad
+ $ "Failure requesting command " ++ cmd ++ " on " ++ name
+ ++ ": " ++ e
+ | null $ incidentJobs incident -> do
+ logInfo $ "Marking incident " ++ UTF8.toString uuid ++ " as failed;"
+ ++ " command for live repair not specified"
+ let newtag = C.maintdFailureTagPrefix ++ UTF8.toString uuid
+ jids <- mkResultT $ execJobsWaitOkJid
+ [[ annotateOpCode "marking incident as ill specified" now
+ . OpTagsSet TagKindNode [ newtag ]
+ $ Just name ]] client
+ let incident' = over incidentJobsL (++ jids)
+ $ incident { incidentRepairStatus = RSFailed
+ , incidentTag = newtag
+ }
+ liftIO $ updateIncident memst incident'
+ liftIO $ appendJobs memst jids
+ | otherwise -> do
+ logDebug "Command execution has succeeded"
+ jids <- mkResultT $ execJobsWaitOkJid
+ [[ annotateOpCode "repair command requested by node" now
+ . OpTagsSet TagKindNode [ incidentTag incident ]
+ $ Just name ]] client
+ let incident' = over incidentJobsL (++ jids)
+ $ incident { incidentRepairStatus = RSCompleted }
+ liftIO $ updateIncident memst incident'
+ liftIO $ appendJobs memst jids
+ return $ Set.delete ndx freenodes
+
+
+-- | Submit the next actions for a single incident, given the unaffected nodes;
+-- register all submitted jobs and return the new set of unaffected nodes.
+handleIncident :: L.Client
+ -> IORef MemoryState
+ -> (Group.List, Node.List, Instance.List)
+ -> Set.Set Int
+ -> (String, Incident)
+ -> ResultT String IO (Set.Set Int)
+handleIncident client memstate (gl, nl, il) freeNodes (name, incident) = do
+ ndx <- case Container.keys $ Container.filter ((==) name . Node.name) nl of
+ [ndx] -> return ndx
+ [] -> do
+ logWarning $ "Node " ++ name ++ " no longer in the cluster;"
+ ++ " clearing incident " ++ show incident
+ liftIO . rmIncident memstate $ uuidOf incident
+ fail $ "node " ++ name ++ " left the cluster"
+ ndxs -> do
+                 logWarning $ "Ambiguous node name " ++ name
+ ++ "; could refer to indices " ++ show ndxs
+                 fail $ "ambiguous name " ++ name
+ case incidentAction incident of
+ RANoop -> do
+ logDebug $ "Nothing to do for " ++ show incident
+ liftIO . rmIncident memstate $ uuidOf incident
+ return freeNodes
+ RALiveRepair ->
+ handleLiveRepairs client memstate ndx freeNodes incident
+ RAEvacuate ->
+ handleEvacuation client memstate (gl, nl, il) ndx True freeNodes incident
+ RAEvacuateFailover ->
+ handleEvacuation client memstate (gl, nl, il) ndx False freeNodes incident
+
+-- | Submit the jobs necessary for the next maintenance step
+-- for each pending maintenance, i.e., the most radical maintenance
+-- for each node. Return the set of node indices unaffected by these
+-- operations. Also, for each job submitted, register it directly.
+handleIncidents :: IORef MemoryState
+ -> (Group.List, Node.List, Instance.List)
+ -> ResultT String IO (Set.Set Int)
+handleIncidents memstate (gl, nl, il) = do
+ incidents <- getIncidents memstate
+ let activeIncidents = filter ((<= RSPending) . incidentRepairStatus) incidents
+ incidentsToHandle = rankIncidents activeIncidents
+ incidentNodes = Set.fromList . Container.keys
+ $ Container.filter ((`Map.member` incidentsToHandle) . Node.name) nl
+ freeNodes = Set.fromList (Container.keys nl) Set.\\ incidentNodes
+ if null activeIncidents
+ then return freeNodes
+ else do
+ luxiSocket <- liftIO Path.defaultQuerySocket
+ bracket (liftIO $ L.getLuxiClient luxiSocket)
+ (liftIO . L.closeClient)
+ $ \ client ->
+ foldM (handleIncident client memstate (gl, nl, il)) freeNodes
+ $ Map.assocs incidentsToHandle
diff --git a/src/Ganeti/MaintD/MemoryState.hs b/src/Ganeti/MaintD/MemoryState.hs
new file mode 100644
index 0000000..ce0c94a
--- /dev/null
+++ b/src/Ganeti/MaintD/MemoryState.hs
@@ -0,0 +1,153 @@
+{-# LANGUAGE TemplateHaskell #-}
+
+{-| Memory copy of the state of the maintenance daemon.
+
+While the authoritative state of the maintenance daemon is
+stored in the configuration, the daemon keeps a copy of some
+values at run time, so that they can easily be exposed over
+HTTP.
+
+This module also provides functions for the mirrored information
+to update both the authoritative state and the in-memory copy.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.MemoryState
+ ( MemoryState(..)
+ , emptyMemoryState
+ , getJobs
+ , clearJobs
+ , appendJobs
+ , getEvacuated
+ , addEvacuated
+ , rmEvacuated
+ , getIncidents
+ , updateIncident
+ , rmIncident
+ ) where
+
+import Control.Monad.IO.Class (liftIO)
+import Data.IORef (IORef)
+
+import Ganeti.BasicTypes (ResultT, withErrorT)
+import Ganeti.Lens (makeCustomLenses)
+import Ganeti.Objects.Maintenance (Incident)
+import Ganeti.Types (JobId, uuidOf)
+import Ganeti.Utils (ordNub)
+import Ganeti.Utils.IORef (atomicModifyWithLens_)
+import Ganeti.WConfd.Client ( runNewWConfdClient, maintenanceJobs, runModifyRpc
+ , clearMaintdJobs, appendMaintdJobs
+ , maintenanceEvacuated, addMaintdEvacuated
+ , rmMaintdEvacuated
+ , maintenanceIncidents, updateMaintdIncident
+ , rmMaintdIncident )
+
+-- | In-memory copy of parts of the state of the maintenance
+-- daemon.
+data MemoryState = MemoryState
+ { msJobs :: [ JobId ]
+ , msEvacuated :: [ String ]
+ , msIncidents :: [ Incident ]
+ }
+
+$(makeCustomLenses ''MemoryState)
+
+-- | Initial state of the in-memory copy. All parts will be updated
+-- before use; after one round at the latest this copy is up to date.
+emptyMemoryState :: MemoryState
+emptyMemoryState = MemoryState { msJobs = []
+ , msEvacuated = []
+ , msIncidents = []
+ }
+
+-- | Get the list of jobs from the authoritative copy, and update the
+-- in-memory copy as well.
+getJobs :: IORef MemoryState -> ResultT String IO [JobId]
+getJobs memstate = do
+ jobs <- withErrorT show $ runNewWConfdClient maintenanceJobs
+ liftIO . atomicModifyWithLens_ memstate msJobsL $ const jobs
+ return jobs
+
+-- | Reset the list of active jobs.
+clearJobs :: IORef MemoryState -> IO ()
+clearJobs memstate = do
+ runModifyRpc clearMaintdJobs
+ atomicModifyWithLens_ memstate msJobsL $ const []
+
+-- | Append jobs to the list of active jobs, if not present already
+appendJobs :: IORef MemoryState -> [JobId] -> IO ()
+appendJobs memstate jobs = do
+ runModifyRpc $ appendMaintdJobs jobs
+ atomicModifyWithLens_ memstate msJobsL $ ordNub . (++ jobs)
+
+-- | Get the list of recently evacuated instances from the authoritative
+-- copy and update the in-memory state.
+getEvacuated :: IORef MemoryState -> ResultT String IO [String]
+getEvacuated memstate = do
+ evac <- withErrorT show $ runNewWConfdClient maintenanceEvacuated
+ liftIO . atomicModifyWithLens_ memstate msEvacuatedL $ const evac
+ return evac
+
+-- | Add names to the list of recently evacuated instances.
+addEvacuated :: IORef MemoryState -> [String] -> IO ()
+addEvacuated memstate names = do
+ runModifyRpc $ addMaintdEvacuated names
+ atomicModifyWithLens_ memstate msEvacuatedL $ ordNub . (++ names)
+
+-- | Remove a name from the list of recently evacuated instances.
+rmEvacuated :: IORef MemoryState -> String -> IO ()
+rmEvacuated memstate name = do
+ runModifyRpc $ rmMaintdEvacuated name
+ atomicModifyWithLens_ memstate msEvacuatedL $ filter (/= name)
+
+-- | Get the list of incidents from the authoritative copy and update the
+-- in-memory state.
+getIncidents :: IORef MemoryState -> ResultT String IO [Incident]
+getIncidents memstate = do
+ incidents <- withErrorT show $ runNewWConfdClient maintenanceIncidents
+ liftIO . atomicModifyWithLens_ memstate msIncidentsL $ const incidents
+ return incidents
+
+-- | Update an incident.
+updateIncident :: IORef MemoryState -> Incident -> IO ()
+updateIncident memstate incident = do
+ runModifyRpc $ updateMaintdIncident incident
+ atomicModifyWithLens_ memstate msIncidentsL
+ $ (incident :) . filter ((/= uuidOf incident) . uuidOf)
+
+-- | Remove an incident.
+rmIncident :: IORef MemoryState -> String -> IO ()
+rmIncident memstate uuid = do
+ runModifyRpc $ rmMaintdIncident uuid
+ atomicModifyWithLens_ memstate msIncidentsL
+ $ filter ((/= uuid) . uuidOf)
diff --git a/src/Ganeti/MaintD/Server.hs b/src/Ganeti/MaintD/Server.hs
new file mode 100644
index 0000000..b88b23e
--- /dev/null
+++ b/src/Ganeti/MaintD/Server.hs
@@ -0,0 +1,215 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+{-| Implementation of the Ganeti maintenance server.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.Server
+ ( options
+ , main
+ , checkMain
+ , prepMain
+ ) where
+
+import Control.Applicative ((<|>))
+import Control.Concurrent (forkIO)
+import Control.Exception.Lifted (bracket)
+import Control.Monad (forever, void, unless, when, liftM)
+import Control.Monad.IO.Class (liftIO)
+import Data.IORef (IORef, newIORef, readIORef)
+import qualified Data.Set as Set
+import Snap.Core (Snap, method, Method(GET), ifTop, dir, route)
+import Snap.Http.Server (httpServe)
+import Snap.Http.Server.Config (Config)
+import System.IO.Error (tryIOError)
+import System.Time (getClockTime)
+import qualified Text.JSON as J
+
+import Ganeti.BasicTypes ( GenericResult(..), ResultT, runResultT, mkResultT
+ , mkResultTEither, withErrorT, isBad, isOk)
+import qualified Ganeti.Constants as C
+import Ganeti.Daemon ( OptType, CheckFn, PrepFn, MainFn, oDebug
+ , oNoVoting, oYesDoIt, oPort, oBindAddress, oNoDaemonize)
+import Ganeti.Daemon.Utils (handleMasterVerificationOptions)
+import qualified Ganeti.HTools.Backend.Luxi as Luxi
+import Ganeti.HTools.Loader (ClusterData(..), mergeData, checkData)
+import Ganeti.Jobs (waitForJobs)
+import Ganeti.Logging.Lifted
+import qualified Ganeti.Luxi as L
+import Ganeti.MaintD.Autorepairs (harepTasks)
+import Ganeti.MaintD.Balance (balanceTask)
+import Ganeti.MaintD.CleanupIncidents (cleanupIncidents)
+import Ganeti.MaintD.CollectIncidents (collectIncidents)
+import Ganeti.MaintD.FailIncident (failIncident)
+import Ganeti.MaintD.HandleIncidents (handleIncidents)
+import Ganeti.MaintD.MemoryState
+import qualified Ganeti.Path as Path
+import Ganeti.Runtime (GanetiDaemon(GanetiMaintd))
+import Ganeti.Types (JobId(..), JobStatus(..))
+import Ganeti.Utils (threadDelaySeconds, partitionM)
+import Ganeti.Utils.Http (httpConfFromOpts, plainJSON, error404)
+import Ganeti.WConfd.Client ( runNewWConfdClient, maintenanceRoundDelay
+ , maintenanceBalancing)
+
+-- | Options list and functions.
+options :: [OptType]
+options =
+ [ oNoDaemonize
+ , oDebug
+ , oPort C.defaultMaintdPort
+ , oBindAddress
+ , oNoVoting
+ , oYesDoIt
+ ]
+
+-- | Type alias for checkMain results.
+type CheckResult = ()
+
+-- | Type alias for prepMain results
+type PrepResult = Config Snap ()
+
+-- | Load cluster data
+--
+-- At the moment, only the static data is fetched via luxi;
+-- once we support load-based balancing in maintd as well,
+-- we also need to query the MonDs for the load data.
+loadClusterData :: ResultT String IO ClusterData
+loadClusterData = do
+ now <- liftIO getClockTime
+ socket <- liftIO Path.defaultQuerySocket
+ either_inp <- liftIO . tryIOError $ Luxi.loadData socket
+ input_data <- mkResultT $ case either_inp of
+ Left e -> do
+ let msg = show e
+ logNotice $ "Couldn't read data from luxid: " ++ msg
+ return $ Bad msg
+ Right r -> return r
+ cdata <- mkResultT . return $ mergeData [] [] [] [] now input_data
+ let (msgs, nl) = checkData (cdNodes cdata) (cdInstances cdata)
+ unless (null msgs) . logDebug $ "Cluster data inconsistencies: " ++ show msgs
+ return $ cdata { cdNodes = nl }
+
+-- | Perform one round of maintenance
+maintenance :: IORef MemoryState -> ResultT String IO ()
+maintenance memstate = do
+ delay <- withErrorT show $ runNewWConfdClient maintenanceRoundDelay
+ liftIO $ threadDelaySeconds delay
+ oldjobs <- getJobs memstate
+ logDebug $ "Jobs submitted in the last round: "
+ ++ show (map fromJobId oldjobs)
+ luxiSocket <- liftIO Path.defaultQuerySocket
+
+ -- Filter out any jobs in the maintenance list which can't be parsed by luxi
+ -- anymore. This can happen if the job file is corrupted, missing or archived.
+ -- We have to query one job at a time, as luxi returns a single error if any
+ -- job in the query list can't be read/parsed.
+ (okjobs, badjobs) <- bracket
+ (mkResultTEither . tryIOError $ L.getLuxiClient luxiSocket)
+ (liftIO . L.closeClient)
+ $ mkResultT . liftM Ok
+ . (\c -> partitionM (\j -> liftM isOk $ L.queryJobsStatus c [j]) oldjobs)
+
+ unless (null badjobs) $ do
+ logInfo . (++) "Unparsable jobs (marking as failed): "
+ . show $ map fromJobId badjobs
+ mapM_ (failIncident memstate) badjobs
+
+ jobresults <- bracket
+ (mkResultTEither . tryIOError $ L.getLuxiClient luxiSocket)
+ (liftIO . L.closeClient)
+ $ mkResultT . (\c -> waitForJobs okjobs c)
+
+ let failedjobs = map fst $ filter ((/=) JOB_STATUS_SUCCESS . snd) jobresults
+ unless (null failedjobs) $ do
+ logInfo . (++) "Failed jobs: " . show $ map fromJobId failedjobs
+ mapM_ (failIncident memstate) failedjobs
+ unless (null oldjobs)
+ . liftIO $ clearJobs memstate
+ logDebug "New round of maintenance started"
+ cData <- loadClusterData
+ let il = cdInstances cData
+ nl = cdNodes cData
+ gl = cdGroups cData
+ cleanupIncidents memstate nl
+ collectIncidents memstate nl
+ nidxs <- handleIncidents memstate (gl, nl, il)
+ (nidxs', jobs) <- harepTasks (nl, il) nidxs
+ unless (null jobs)
+ . liftIO $ appendJobs memstate jobs
+ logDebug $ "Nodes unaffected by harep " ++ show (Set.toList nidxs')
+ ++ ", jobs submitted " ++ show (map fromJobId jobs)
+ (bal, thresh) <- withErrorT show $ runNewWConfdClient maintenanceBalancing
+ when (bal && not (Set.null nidxs')) $ do
+ logDebug $ "Will balance unaffected nodes, threshold " ++ show thresh
+ jobs' <- balanceTask memstate (nl, il) nidxs thresh
+ logDebug $ "Balancing jobs submitted: " ++ show (map fromJobId jobs')
+ unless (null jobs')
+ . liftIO $ appendJobs memstate jobs'
+
+-- | Expose a part of the memory state
+exposeState :: J.JSON a => (MemoryState -> a) -> IORef MemoryState -> Snap ()
+exposeState selector ref = do
+ state <- liftIO $ readIORef ref
+ plainJSON $ selector state
+
+-- | The information to serve via HTTP
+httpInterface :: IORef MemoryState -> Snap ()
+httpInterface memstate =
+ ifTop (method GET $ plainJSON [1 :: Int])
+ <|> dir "1" (ifTop (plainJSON J.JSNull)
+ <|> route [ ("jobs", exposeState msJobs memstate)
+ , ("evacuated", exposeState msEvacuated memstate)
+ , ("status", exposeState msIncidents memstate)
+ ])
+ <|> error404
+
+-- | Check function for luxid.
+checkMain :: CheckFn CheckResult
+checkMain = handleMasterVerificationOptions
+
+-- | Prepare function for luxid.
+prepMain :: PrepFn CheckResult PrepResult
+prepMain opts _ = httpConfFromOpts GanetiMaintd opts
+
+-- | Main function.
+main :: MainFn CheckResult PrepResult
+main _ _ httpConf = do
+ memstate <- newIORef emptyMemoryState
+ void . forkIO . forever $ do
+ res <- runResultT $ maintenance memstate
+ (if isBad res then logInfo else logDebug)
+ $ "Maintenance round result is " ++ show res
+ when (isBad res) $ do
+ logDebug "Backing off after a round with internal errors"
+ threadDelaySeconds C.maintdDefaultRoundDelay
+ httpServe httpConf $ httpInterface memstate
diff --git a/src/Ganeti/MaintD/Utils.hs b/src/Ganeti/MaintD/Utils.hs
new file mode 100644
index 0000000..b74d2de
--- /dev/null
+++ b/src/Ganeti/MaintD/Utils.hs
@@ -0,0 +1,64 @@
+{-| Utility functions for the maintenance daemon.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.Utils
+ ( annotateOpCode
+ , getRepairCommand
+ ) where
+
+import Control.Lens.Setter (over)
+import qualified Text.JSON as J
+
+import qualified Ganeti.Constants as C
+import Ganeti.JQueue (reasonTrailTimestamp)
+import Ganeti.JQueue.Objects (Timestamp)
+import Ganeti.Objects.Maintenance (Incident(..))
+import Ganeti.OpCodes (OpCode, MetaOpCode, wrapOpCode)
+import Ganeti.OpCodes.Lens (metaParamsL, opReasonL)
+
+-- | Wrap an `OpCode` into a `MetaOpCode`, adding an indication
+-- that the `OpCode` was submitted by the maintenance daemon.
+annotateOpCode :: String -> Timestamp -> OpCode -> MetaOpCode
+annotateOpCode reason ts =
+ over (metaParamsL . opReasonL)
+ (++ [(C.opcodeReasonSrcMaintd, reason, reasonTrailTimestamp ts)])
+ . wrapOpCode
+
+-- | Get the name of the repair command from a live-repair incident.
+getRepairCommand :: Incident -> Maybe String
+getRepairCommand incident
+ | J.JSObject obj <- incidentOriginal incident,
+ Just (J.JSString cmd) <- lookup "command" $ J.fromJSObject obj
+ = return $ J.fromJSString cmd
+getRepairCommand _ = Nothing
diff --git a/src/Ganeti/Metad/ConfigCore.hs b/src/Ganeti/Metad/ConfigCore.hs
index 7211c7e..5821baa 100644
--- a/src/Ganeti/Metad/ConfigCore.hs
+++ b/src/Ganeti/Metad/ConfigCore.hs
@@ -35,7 +35,9 @@
-}
module Ganeti.Metad.ConfigCore where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Concurrent.MVar.Lifted
import Control.Monad.Base
import Control.Monad.IO.Class
diff --git a/src/Ganeti/Monitoring/Server.hs b/src/Ganeti/Monitoring/Server.hs
index da78b00..668779b 100644
--- a/src/Ganeti/Monitoring/Server.hs
+++ b/src/Ganeti/Monitoring/Server.hs
@@ -41,19 +41,20 @@
, DataCollector(..)
) where
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Applicative
import Control.DeepSeq (force)
import Control.Exception.Base (evaluate)
-import Control.Monad
+import Control.Monad (void, forever, liftM, foldM, foldM_, mzero)
import Control.Monad.IO.Class
-import Data.ByteString.Char8 (pack, unpack)
+import Data.ByteString.Char8 (unpack)
import qualified Data.ByteString.UTF8 as UTF8
import Data.Maybe (fromMaybe)
import Data.List (find)
-import Data.Monoid (mempty)
import qualified Data.Map as Map
import qualified Data.PSQueue as Queue
-import Network.BSD (getServicePortNumber)
import Snap.Core
import Snap.Http.Server
import qualified Text.JSON as J
@@ -71,7 +72,8 @@
import qualified Ganeti.Constants as C
import qualified Ganeti.ConstantUtils as CU
import Ganeti.Runtime
-import Ganeti.Utils (getCurrentTimeUSec, withDefaultOnIOError)
+import Ganeti.Utils (getCurrentTimeUSec)
+import Ganeti.Utils.Http (httpConfFromOpts, error404, plainJSON)
-- * Types and constants definitions
@@ -87,17 +89,6 @@
latestAPIVersion :: Int
latestAPIVersion = C.mondLatestApiVersion
--- * Configuration handling
-
--- | The default configuration for the HTTP server.
-defaultHttpConf :: FilePath -> FilePath -> Config Snap ()
-defaultHttpConf accessLog errorLog =
- setAccessLog (ConfigFileLog accessLog) .
- setCompression False .
- setErrorLog (ConfigFileLog errorLog) $
- setVerbose False
- emptyConfig
-
-- * Helper functions
-- | Check function for the monitoring agent.
@@ -106,28 +97,18 @@
-- | Prepare function for monitoring agent.
prepMain :: PrepFn CheckResult PrepResult
-prepMain opts _ = do
- accessLog <- daemonsExtraLogFile GanetiMond AccessLog
- errorLog <- daemonsExtraLogFile GanetiMond ErrorLog
- defaultPort <- withDefaultOnIOError C.defaultMondPort
- . liftM fromIntegral
- $ getServicePortNumber C.mond
- return .
- setPort
- (maybe defaultPort fromIntegral (optPort opts)) .
- maybe id (setBind . pack) (optBindAddress opts)
- $ defaultHttpConf accessLog errorLog
+prepMain opts _ = httpConfFromOpts GanetiMond opts
-- * Query answers
-- | Reply to the supported API version numbers query.
versionQ :: Snap ()
-versionQ = writeBS . pack $ J.encode [latestAPIVersion]
+versionQ = plainJSON [latestAPIVersion]
-- | Version 1 of the monitoring HTTP API.
version1Api :: MVar CollectorMap -> MVar ConfigAccess -> Snap ()
version1Api mvar mvarConfig =
- let returnNull = writeBS . pack $ J.encode J.JSNull :: Snap ()
+ let returnNull = plainJSON J.JSNull
in ifTop returnNull <|>
route
[ ("list", listHandler mvarConfig)
@@ -171,7 +152,7 @@
listHandler :: MVar ConfigAccess -> Snap ()
listHandler mvarConfig = dir "collectors" $ do
collectors' <- liftIO $ activeCollectors mvarConfig
- writeBS . pack . J.encode $ map dcListItem collectors'
+ plainJSON $ map dcListItem collectors'
-- | Handler for returning data collector reports.
reportHandler :: MVar CollectorMap -> MVar ConfigAccess -> Snap ()
@@ -187,7 +168,7 @@
allReports mvar mvarConfig = do
collectors' <- liftIO $ activeCollectors mvarConfig
reports <- mapM (liftIO . getReport mvar) collectors'
- writeBS . pack . J.encode $ reports
+ plainJSON reports
-- | Takes the CollectorMap and a DataCollector and returns the report for this
-- collector.
@@ -213,6 +194,7 @@
catFromName "storage" = BT.Ok $ Just DCStorage
catFromName "daemon" = BT.Ok $ Just DCDaemon
catFromName "hypervisor" = BT.Ok $ Just DCHypervisor
+catFromName "node" = BT.Ok $ Just DCNode
catFromName "default" = BT.Ok Nothing
catFromName _ = BT.Bad "No such category"
@@ -221,11 +203,6 @@
modifyResponse $ setResponseStatus 404 "Not found"
writeBS "Unable to produce a report for the requested resource"
-error404 :: Snap ()
-error404 = do
- modifyResponse $ setResponseStatus 404 "Not found"
- writeBS "Resource not found"
-
-- | Return the report of one collector.
oneReport :: MVar CollectorMap -> MVar ConfigAccess -> Snap ()
oneReport mvar mvarConfig = do
@@ -243,7 +220,7 @@
Just col -> return col
Nothing -> fail "Unable to find the requested collector"
dcr <- liftIO $ getReport mvar collector
- writeBS . pack . J.encode $ dcr
+ plainJSON dcr
-- | The function implementing the HTTP API of the monitoring agent.
monitoringApi :: MVar CollectorMap -> MVar ConfigAccess -> Snap ()
diff --git a/src/Ganeti/Network.hs b/src/Ganeti/Network.hs
index 1cb6aa1..b557369 100644
--- a/src/Ganeti/Network.hs
+++ b/src/Ganeti/Network.hs
@@ -55,7 +55,7 @@
) where
import Control.Monad
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError)
import Control.Monad.State
import Data.Bits ((.&.))
import Data.Function (on)
@@ -98,7 +98,7 @@
netIpv4NumHosts = ipv4NumHosts . ip4netMask . networkNetwork
-- | Creates a new bit array pool of the appropriate size
-newPoolArray :: (MonadError e m, Error e) => Network -> m BA.BitArray
+newPoolArray :: (MonadError e m, FromString e) => Network -> m BA.BitArray
newPoolArray net = do
let numhosts = netIpv4NumHosts net
when (numhosts > ipv4NetworkMaxNumHosts) . failError $
@@ -112,15 +112,15 @@
return $ BA.zeroes (fromInteger numhosts)
-- | Creates a new bit array pool of the appropriate size
-newPool :: (MonadError e m, Error e) => Network -> m AddressPool
+newPool :: (MonadError e m, FromString e) => Network -> m AddressPool
newPool = liftM AddressPool . newPoolArray
-- | A helper function that creates a bit array pool, of it's missing.
-orNewPool :: (MonadError e m, Error e)
+orNewPool :: (MonadError e m, FromString e)
=> Network -> Maybe AddressPool -> m AddressPool
orNewPool net = maybe (newPool net) return
-withPool :: (MonadError e m, Error e)
+withPool :: (MonadError e m, FromString e)
=> PoolPart -> (Network -> BA.BitArray -> m (a, BA.BitArray))
-> StateT Network m a
withPool part f = StateT $ \n -> mapMOf2 (poolLens part) (f' n) n
@@ -129,7 +129,7 @@
. mapMOf2 addressPoolIso (f net)
<=< orNewPool net
-withPool_ :: (MonadError e m, Error e)
+withPool_ :: (MonadError e m, FromString e)
=> PoolPart -> (Network -> BA.BitArray -> m BA.BitArray)
-> Network -> m Network
withPool_ part f = execStateT $ withPool part ((liftM ((,) ()) .) . f)
@@ -137,12 +137,12 @@
readPool :: PoolPart -> Network -> Maybe BA.BitArray
readPool = view . poolArrayLens
-readPoolE :: (MonadError e m, Error e)
+readPoolE :: (MonadError e m, FromString e)
=> PoolPart -> Network -> m BA.BitArray
readPoolE part net =
liftM apReservations $ orNewPool net ((view . poolLens) part net)
-readAllE :: (MonadError e m, Error e)
+readAllE :: (MonadError e m, FromString e)
=> Network -> m BA.BitArray
readAllE net = do
let toRes = liftM apReservations . orNewPool net
@@ -180,7 +180,7 @@
-- | Returns an address index wrt a network.
-- Fails if the address isn't in the network range.
-addrIndex :: (MonadError e m, Error e) => Ip4Address -> Network -> m Int
+addrIndex :: (MonadError e m, FromString e) => Ip4Address -> Network -> m Int
addrIndex addr net = do
let n = networkNetwork net
i = on (-) ip4AddressToNumber addr (ip4BaseAddr n)
@@ -190,7 +190,7 @@
-- | Returns an address of a given index wrt a network.
-- Fails if the index isn't in the network range.
-addrAt :: (MonadError e m, Error e) => Int -> Network -> m Ip4Address
+addrAt :: (MonadError e m, FromString e) => Int -> Network -> m Ip4Address
addrAt i net | (i' < 0) || (i' >= ipv4NumHosts (ip4netMask n)) =
failError $ "Requested index " ++ show i
++ " outside the range of network '" ++ show net ++ "'"
@@ -202,13 +202,13 @@
-- | Checks if a given address is reserved.
-- Fails if the address isn't in the network range.
-isReserved :: (MonadError e m, Error e) =>
+isReserved :: (MonadError e m, FromString e) =>
PoolPart -> Ip4Address -> Network -> m Bool
isReserved part addr net =
(BA.!) `liftM` readPoolE part net `ap` addrIndex addr net
-- | Marks an address as used.
-reserve :: (MonadError e m, Error e) =>
+reserve :: (MonadError e m, FromString e) =>
PoolPart -> Ip4Address -> Network -> m Network
reserve part addr =
withPool_ part $ \net ba -> do
@@ -220,7 +220,7 @@
BA.setAt idx True ba
-- | Marks an address as unused.
-release :: (MonadError e m, Error e) =>
+release :: (MonadError e m, FromString e) =>
PoolPart -> Ip4Address -> Network -> m Network
release part addr =
withPool_ part $ \net ba -> do
@@ -233,7 +233,7 @@
-- | Get the first free address in the network
-- that satisfies a given predicate.
-findFree :: (MonadError e m, Error e)
+findFree :: (MonadError e m, FromString e)
=> (Ip4Address -> Bool) -> Network -> m (Maybe Ip4Address)
findFree p net = readAllE net >>= BA.foldr f (return Nothing)
where
diff --git a/src/Ganeti/Objects.hs b/src/Ganeti/Objects.hs
index 59abc5c..065aaa8 100644
--- a/src/Ganeti/Objects.hs
+++ b/src/Ganeti/Objects.hs
@@ -103,16 +103,22 @@
, module Ganeti.PartialParams
, module Ganeti.Objects.Disk
, module Ganeti.Objects.Instance
- ) where
+ , module Ganeti.Objects.Maintenance
+ , FilledHvStateParams(..)
+ , PartialHvStateParams(..)
+ , allHvStateParamFields
+ , FilledHvState
+ , PartialHvState ) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Arrow (first)
import Control.Monad.State
import qualified Data.ByteString.UTF8 as UTF8
import Data.List (foldl', intercalate)
import Data.Maybe
import qualified Data.Map as Map
-import Data.Monoid
import Data.Ord (comparing)
import Data.Ratio (numerator, denominator)
import Data.Tuple (swap)
@@ -127,8 +133,10 @@
import Ganeti.JSON (DictObject(..), Container, emptyContainer, GenericContainer)
import Ganeti.Objects.BitArray (BitArray)
import Ganeti.Objects.Disk
+import Ganeti.Objects.Maintenance
import Ganeti.Objects.Nic
import Ganeti.Objects.Instance
+import Ganeti.Objects.HvState
import Ganeti.Query.Language
import Ganeti.PartialParams
import Ganeti.Types
@@ -318,6 +326,8 @@
simpleField "std" [t| PartialISpecParams |]
, optionalField . renameField "SpindleRatioP" $
simpleField "spindle-ratio" [t| Double |]
+ , optionalField . renameField "MemoryRatioP" $
+ simpleField "memory-ratio" [t| Double |]
, optionalField . renameField "VcpuRatioP" $
simpleField "vcpu-ratio" [t| Double |]
, optionalField . renameField "DiskTemplatesP" $
@@ -331,6 +341,8 @@
simpleField ConstantUtils.ispecsMinmax [t| [MinMaxISpecs] |]
, renameField "StdSpec" $ simpleField "std" [t| FilledISpecParams |]
, simpleField "spindle-ratio" [t| Double |]
+ , defaultField [| ConstantUtils.ipolicyDefaultsMemoryRatio |] $
+ simpleField "memory-ratio" [t| Double |]
, simpleField "vcpu-ratio" [t| Double |]
, simpleField "disk-templates" [t| [DiskTemplate] |]
])
@@ -341,17 +353,20 @@
(FilledIPolicy { ipolicyMinMaxISpecs = fminmax
, ipolicyStdSpec = fstd
, ipolicySpindleRatio = fspindleRatio
+ , ipolicyMemoryRatio = fmemoryRatio
, ipolicyVcpuRatio = fvcpuRatio
, ipolicyDiskTemplates = fdiskTemplates})
(PartialIPolicy { ipolicyMinMaxISpecsP = pminmax
, ipolicyStdSpecP = pstd
, ipolicySpindleRatioP = pspindleRatio
+ , ipolicyMemoryRatioP = pmemoryRatio
, ipolicyVcpuRatioP = pvcpuRatio
, ipolicyDiskTemplatesP = pdiskTemplates}) =
FilledIPolicy
{ ipolicyMinMaxISpecs = fromMaybe fminmax pminmax
, ipolicyStdSpec = maybe fstd (fillParams fstd) pstd
, ipolicySpindleRatio = fromMaybe fspindleRatio pspindleRatio
+ , ipolicyMemoryRatio = fromMaybe fmemoryRatio pmemoryRatio
, ipolicyVcpuRatio = fromMaybe fvcpuRatio pvcpuRatio
, ipolicyDiskTemplates = fromMaybe fdiskTemplates
pdiskTemplates
@@ -359,22 +374,31 @@
toPartial (FilledIPolicy { ipolicyMinMaxISpecs = fminmax
, ipolicyStdSpec = fstd
, ipolicySpindleRatio = fspindleRatio
+ , ipolicyMemoryRatio = fmemoryRatio
, ipolicyVcpuRatio = fvcpuRatio
, ipolicyDiskTemplates = fdiskTemplates}) =
PartialIPolicy
{ ipolicyMinMaxISpecsP = Just fminmax
, ipolicyStdSpecP = Just $ toPartial fstd
, ipolicySpindleRatioP = Just fspindleRatio
+ , ipolicyMemoryRatioP = Just fmemoryRatio
, ipolicyVcpuRatioP = Just fvcpuRatio
, ipolicyDiskTemplatesP = Just fdiskTemplates
}
toFilled (PartialIPolicy { ipolicyMinMaxISpecsP = pminmax
, ipolicyStdSpecP = pstd
, ipolicySpindleRatioP = pspindleRatio
+ , ipolicyMemoryRatioP = pmemoryRatio
, ipolicyVcpuRatioP = pvcpuRatio
, ipolicyDiskTemplatesP = pdiskTemplates}) =
- FilledIPolicy <$> pminmax <*> (toFilled =<< pstd) <*> pspindleRatio
- <*> pvcpuRatio <*> pdiskTemplates
+ FilledIPolicy <$> pminmax <*> (toFilled =<< pstd) <*> pspindleRatio
+ <*> pmemoryRatio <*> pvcpuRatio <*> pdiskTemplates
+
+-- | Disk state parameters.
+--
+-- According to the documentation this option is unused by Ganeti,
+-- the content is just a 'JSValue'.
+type DiskState = Container JSValue
-- * Node definitions
@@ -389,32 +413,20 @@
, simpleField "cpu_speed" [t| Double |]
])
--- | Disk state parameters.
---
--- As according to the documentation this option is unused by Ganeti,
--- the content is just a 'JSValue'.
-type DiskState = Container JSValue
-
--- | Hypervisor state parameters.
---
--- As according to the documentation this option is unused by Ganeti,
--- the content is just a 'JSValue'.
-type HypervisorState = Container JSValue
-
$(buildObject "Node" "node" $
- [ simpleField "name" [t| String |]
- , simpleField "primary_ip" [t| String |]
- , simpleField "secondary_ip" [t| String |]
- , simpleField "master_candidate" [t| Bool |]
- , simpleField "offline" [t| Bool |]
- , simpleField "drained" [t| Bool |]
- , simpleField "group" [t| String |]
- , simpleField "master_capable" [t| Bool |]
- , simpleField "vm_capable" [t| Bool |]
- , simpleField "ndparams" [t| PartialNDParams |]
- , simpleField "powered" [t| Bool |]
+ [ simpleField "name" [t| String |]
+ , simpleField "primary_ip" [t| String |]
+ , simpleField "secondary_ip" [t| String |]
+ , simpleField "master_candidate" [t| Bool |]
+ , simpleField "offline" [t| Bool |]
+ , simpleField "drained" [t| Bool |]
+ , simpleField "group" [t| String |]
+ , simpleField "master_capable" [t| Bool |]
+ , simpleField "vm_capable" [t| Bool |]
+ , simpleField "ndparams" [t| PartialNDParams |]
+ , simpleField "powered" [t| Bool |]
, notSerializeDefaultField [| emptyContainer |] $
- simpleField "hv_state_static" [t| HypervisorState |]
+ simpleField "hv_state_static" [t| PartialHvState |]
, notSerializeDefaultField [| emptyContainer |] $
simpleField "disk_state_static" [t| DiskState |]
]
@@ -445,15 +457,15 @@
type Networks = Container PartialNicParams
$(buildObject "NodeGroup" "group" $
- [ simpleField "name" [t| String |]
+ [ simpleField "name" [t| String |]
, defaultField [| [] |] $ simpleField "members" [t| [String] |]
- , simpleField "ndparams" [t| PartialNDParams |]
- , simpleField "alloc_policy" [t| AllocPolicy |]
- , simpleField "ipolicy" [t| PartialIPolicy |]
- , simpleField "diskparams" [t| GroupDiskParams |]
- , simpleField "networks" [t| Networks |]
+ , simpleField "ndparams" [t| PartialNDParams |]
+ , simpleField "alloc_policy" [t| AllocPolicy |]
+ , simpleField "ipolicy" [t| PartialIPolicy |]
+ , simpleField "diskparams" [t| GroupDiskParams |]
+ , simpleField "networks" [t| Networks |]
, notSerializeDefaultField [| emptyContainer |] $
- simpleField "hv_state_static" [t| HypervisorState |]
+ simpleField "hv_state_static" [t| PartialHvState |]
, notSerializeDefaultField [| emptyContainer |] $
simpleField "disk_state_static" [t| DiskState |]
]
@@ -664,10 +676,10 @@
, simpleField "primary_ip_family" [t| IpFamily |]
, simpleField "prealloc_wipe_disks" [t| Bool |]
, simpleField "ipolicy" [t| FilledIPolicy |]
- , defaultField [| emptyContainer |] $
- simpleField "hv_state_static" [t| HypervisorState |]
- , defaultField [| emptyContainer |] $
- simpleField "disk_state_static" [t| DiskState |]
+ , notSerializeDefaultField [| emptyContainer |] $
+ simpleField "hv_state_static" [t| FilledHvState |]
+ , notSerializeDefaultField [| emptyContainer |] $
+ simpleField "disk_state_static" [t| DiskState |]
, simpleField "enabled_disk_templates" [t| [DiskTemplate] |]
, simpleField "candidate_certs" [t| CandidateCertificates |]
, simpleField "max_running_jobs" [t| Int |]
@@ -678,6 +690,8 @@
, simpleField "compression_tools" [t| [String] |]
, simpleField "enabled_user_shutdown" [t| Bool |]
, simpleField "data_collectors" [t| Container DataCollectorConfig |]
+ , defaultField [| [] |] $ simpleField
+ "diagnose_data_collector_filename" [t| String |]
, simpleField "ssh_key_type" [t| SshKeyType |]
, simpleField "ssh_key_bits" [t| Int |]
]
@@ -711,6 +725,7 @@
, simpleField "networks" [t| Container Network |]
, simpleField "disks" [t| Container Disk |]
, simpleField "filters" [t| Container FilterRule |]
+ , simpleField "maintenance" [t| MaintenanceData |]
]
++ timeStampFields
++ serialFields)
diff --git a/src/Ganeti/Objects/BitArray.hs b/src/Ganeti/Objects/BitArray.hs
index 7932fb2..62b45c4 100644
--- a/src/Ganeti/Objects/BitArray.hs
+++ b/src/Ganeti/Objects/BitArray.hs
@@ -58,7 +58,7 @@
import Prelude hiding (foldr)
import Control.Monad
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError)
import qualified Data.IntSet as IS
import qualified Text.JSON as J
@@ -116,7 +116,7 @@
-- | Sets or removes an element from a bit array.
-- | Sets a given bit in an array. Fails if the index is out of bounds.
-setAt :: (MonadError e m, Error e) => Int -> Bool -> BitArray -> m BitArray
+setAt :: (MonadError e m, FromString e) => Int -> Bool -> BitArray -> m BitArray
setAt i False (BitArray s bits) =
return $ BitArray s (IS.delete i bits)
setAt i True (BitArray s bits) | (i >= 0) && (i < s) =
diff --git a/src/Ganeti/Objects/Disk.hs b/src/Ganeti/Objects/Disk.hs
index a03ba23..f6b3cbb 100644
--- a/src/Ganeti/Objects/Disk.hs
+++ b/src/Ganeti/Objects/Disk.hs
@@ -36,7 +36,9 @@
module Ganeti.Objects.Disk where
-import Control.Applicative ((<*>), (<$>))
+import Prelude ()
+import Ganeti.Prelude
+
import qualified Data.ByteString.UTF8 as UTF8
import Data.Char (isAsciiLower, isAsciiUpper, isDigit)
import Data.List (isPrefixOf, isInfixOf)
diff --git a/src/Ganeti/Objects/HvState.hs b/src/Ganeti/Objects/HvState.hs
new file mode 100644
index 0000000..de2599f
--- /dev/null
+++ b/src/Ganeti/Objects/HvState.hs
@@ -0,0 +1,60 @@
+{-# LANGUAGE TemplateHaskell, FunctionalDependencies #-}
+
+{-| Implementation of the Ganeti HvState config object.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.Objects.HvState
+ ( FilledHvStateParams(..)
+ , PartialHvStateParams(..)
+ , allHvStateParamFields
+ , FilledHvState
+ , PartialHvState ) where
+
+import Ganeti.THH
+import Ganeti.JSON
+import Ganeti.Types
+
+$(buildParam "HvState" "hvstate"
+ [ simpleField "cpu_node" [t| Int |]
+ , simpleField "cpu_total" [t| Int |]
+ , simpleField "mem_hv" [t| Int |]
+ , simpleField "mem_node" [t| Int |]
+ , simpleField "mem_total" [t| Int |]
+ ])
+
+-- | Static filled hypervisor state (hvtype to hvstate mapping)
+type FilledHvState = GenericContainer Hypervisor FilledHvStateParams
+
+-- | Static partial hypervisor state (hvtype to hvstate mapping)
+type PartialHvState = GenericContainer Hypervisor PartialHvStateParams
diff --git a/src/Ganeti/Objects/Instance.hs b/src/Ganeti/Objects/Instance.hs
index fb35f65..a946b4e 100644
--- a/src/Ganeti/Objects/Instance.hs
+++ b/src/Ganeti/Objects/Instance.hs
@@ -40,7 +40,9 @@
module Ganeti.Objects.Instance where
import qualified Data.ByteString.UTF8 as UTF8
-import Data.Monoid
+
+import Prelude ()
+import Ganeti.Prelude
import Ganeti.JSON (emptyContainer)
import Ganeti.Objects.Nic
diff --git a/src/Ganeti/Objects/Lens.hs b/src/Ganeti/Objects/Lens.hs
index e838bfd..3f27981 100644
--- a/src/Ganeti/Objects/Lens.hs
+++ b/src/Ganeti/Objects/Lens.hs
@@ -157,6 +157,14 @@
instance TagsObjectL Cluster where
tagsL = clusterTagsL
+$(makeCustomLenses ''MaintenanceData)
+
+instance TimeStampObjectL MaintenanceData where
+ mTimeL = maintMtimeL
+
+instance SerialNoObjectL MaintenanceData where
+ serialL = maintSerialL
+
$(makeCustomLenses ''ConfigData)
instance SerialNoObjectL ConfigData where
@@ -164,3 +172,5 @@
instance TimeStampObjectL ConfigData where
mTimeL = configMtimeL
+
+$(makeCustomLenses ''Incident)
diff --git a/src/Ganeti/Objects/Maintenance.hs b/src/Ganeti/Objects/Maintenance.hs
new file mode 100644
index 0000000..ea6e709
--- /dev/null
+++ b/src/Ganeti/Objects/Maintenance.hs
@@ -0,0 +1,115 @@
+{-# LANGUAGE TemplateHaskell #-}
+
+{-| Implementation of the Ganeti configuration for the maintenance daemon.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.Objects.Maintenance
+ ( MaintenanceData(..)
+ , RepairAction(..)
+ , RepairStatus(..)
+ , Incident(..)
+ ) where
+
+import qualified Data.ByteString.UTF8 as UTF8
+import qualified Text.JSON as J
+
+import qualified Ganeti.Constants as C
+import Ganeti.THH
+import Ganeti.THH.Field
+import Ganeti.Types
+
+-- | Action to be taken for a certain repair event. Note
+-- that the order is important, as we rely on values higher
+-- in the derived order to be more intrusive actions.
+$(declareLADT ''String "RepairAction"
+ [ ("RANoop", "Ok")
+ , ("RALiveRepair", "live-repair")
+ , ("RAEvacuate", "evacuate")
+ , ("RAEvacuateFailover", "evacuate-failover")
+ ])
+$(makeJSONInstance ''RepairAction)
+
+-- | Progress made on the particular repair event. Again we rely
+-- on the order in that everything larger than `RSPending` is finalized
+-- in the sense that no further jobs will be submitted.
+$(declareLADT ''String "RepairStatus"
+ [ ("RSNoted", "noted")
+ , ("RSPending", "pending")
+ , ("RSCanceled", "canceled")
+ , ("RSFailed", "failed")
+ , ("RSCompleted", "completed")
+ ])
+$(makeJSONInstance ''RepairStatus)
+
+$(buildObject "Incident" "incident" $
+ [ simpleField "original" [t| J.JSValue |]
+ , simpleField "action" [t| RepairAction |]
+ , defaultField [| [] |] $ simpleField "jobs" [t| [ JobId ] |]
+ , simpleField "node" [t| String |]
+ , simpleField "repair-status" [t| RepairStatus |]
+ , simpleField "tag" [t| String |]
+ ]
+ ++ uuidFields
+ ++ timeStampFields
+ ++ serialFields)
+
+instance SerialNoObject Incident where
+ serialOf = incidentSerial
+
+instance TimeStampObject Incident where
+ cTimeOf = incidentCtime
+ mTimeOf = incidentMtime
+
+instance UuidObject Incident where
+ uuidOf = UTF8.toString . incidentUuid
+
+$(buildObject "MaintenanceData" "maint" $
+ [ defaultField [| C.maintdDefaultRoundDelay |]
+ $ simpleField "roundDelay" [t| Int |]
+ , defaultField [| [] |] $ simpleField "jobs" [t| [ JobId ] |]
+ , defaultField [| False |] $ simpleField "balance" [t| Bool |]
+ , defaultField [| 0.1 :: Double |]
+ $ simpleField "balanceThreshold" [t| Double |]
+ , defaultField [| [] |] $ simpleField "evacuated" [t| [ String ] |]
+ , defaultField [| [] |] $ simpleField "incidents" [t| [ Incident ] |]
+ ]
+ ++ timeStampFields
+ ++ serialFields)
+
+instance SerialNoObject MaintenanceData where
+ serialOf = maintSerial
+
+instance TimeStampObject MaintenanceData where
+ cTimeOf = maintCtime
+ mTimeOf = maintMtime
diff --git a/src/Ganeti/OpCodes.hs b/src/Ganeti/OpCodes.hs
index 3ff87b1..4a3660e 100644
--- a/src/Ganeti/OpCodes.hs
+++ b/src/Ganeti/OpCodes.hs
@@ -58,7 +58,9 @@
, setOpPriority
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Data.List (intercalate)
import Data.Map (Map)
import qualified Text.JSON
@@ -189,12 +191,14 @@
[t| JobIdListOnly |],
OpDoc.opClusterVerifyDisks,
[ pOptGroupName
+ , pIsStrict
],
[])
, ("OpGroupVerifyDisks",
[t| (Map String String, [String], Map String [[String]]) |],
OpDoc.opGroupVerifyDisks,
[ pGroupName
+ , pIsStrict
],
"group_name")
, ("OpClusterRepairDiskSizes",
@@ -262,6 +266,10 @@
, pEnabledUserShutdown
, pEnabledDataCollectors
, pDataCollectorInterval
+ , pDiagnoseDataCollectorFilename
+ , pMaintdRoundDelay
+ , pMaintdEnableBalancing
+ , pMaintdBalancingThreshold
],
[])
, ("OpClusterRedistConf",
@@ -330,11 +338,21 @@
, pRestrictedCommand
],
[])
+ , ("OpRepairCommand",
+ [t| String |],
+ OpDoc.opRepairCommand,
+ [ pNodeName
+ , pRepairCommand
+ , pInput
+ ],
+ [])
, ("OpNodeRemove",
[t| () |],
OpDoc.opNodeRemove,
[ pNodeName
, pNodeUuid
+ , pVerbose
+ , pDebug
],
"node_name")
, ("OpNodeAdd",
@@ -351,6 +369,8 @@
, pVmCapable
, pNdParams
, pNodeSetup
+ , pVerbose
+ , pDebug
],
"node_name")
, ("OpNodeQueryvols",
@@ -408,6 +428,8 @@
, pSecondaryIp
, pNdParams
, pPowered
+ , pVerbose
+ , pDebug
],
"node_name")
, ("OpNodePowercycle",
diff --git a/src/Ganeti/OpParams.hs b/src/Ganeti/OpParams.hs
index 3c8be20..e507a4a 100644
--- a/src/Ganeti/OpParams.hs
+++ b/src/Ganeti/OpParams.hs
@@ -244,6 +244,8 @@
, pZeroingTimeoutPerMiB
, pTagSearchPattern
, pRestrictedCommand
+ , pRepairCommand
+ , pInput
, pReplaceDisksMode
, pReplaceDisksList
, pAllowFailover
@@ -297,7 +299,11 @@
, pEnabledUserShutdown
, pAdminStateSource
, pEnabledDataCollectors
+ , pMaintdRoundDelay
+ , pMaintdEnableBalancing
+ , pMaintdBalancingThreshold
, pDataCollectorInterval
+ , pDiagnoseDataCollectorFilename
, pNodeSslCerts
, pSshKeyBits
, pSshKeyType
@@ -305,6 +311,7 @@
, pNodeSetup
, pVerifyClutter
, pLongSleep
+ , pIsStrict
) where
import Control.Monad (liftM, mplus)
@@ -905,12 +912,12 @@
pRequiredNodes :: Field
pRequiredNodes =
withDoc "Required list of node names" .
- renameField "ReqNodes " $ simpleField "nodes" [t| [NonEmptyString] |]
+ renameField "ReqNodes" $ simpleField "nodes" [t| [NonEmptyString] |]
pRequiredNodeUuids :: Field
pRequiredNodeUuids =
withDoc "Required list of node UUIDs" .
- renameField "ReqNodeUuids " . optionalField $
+ renameField "ReqNodeUuids" . optionalField $
simpleField "node_uuids" [t| [NonEmptyString] |]
pRestrictedCommand :: Field
@@ -919,6 +926,17 @@
renameField "RestrictedCommand" $
simpleField "command" [t| NonEmptyString |]
+pRepairCommand :: Field
+pRepairCommand =
+ withDoc "Repair command name" .
+ renameField "RepairCommand" $
+ simpleField "command" [t| NonEmptyString |]
+
+pInput :: Field
+pInput =
+ withDoc "Input to be redirected to stdin of repair script" .
+ optionalField $ simpleField "input" [t| NonEmptyString |]
+
pNodeName :: Field
pNodeName =
withDoc "A required node name (for single-node LUs)" $
@@ -1521,7 +1539,7 @@
pDiskIndex :: Field
pDiskIndex =
withDoc "Disk index for e.g. grow disk" .
- renameField "DiskIndex " $ simpleField "disk" [t| DiskIndex |]
+ renameField "DiskIndex" $ simpleField "disk" [t| DiskIndex |]
pDiskChgAmount :: Field
pDiskChgAmount =
@@ -1742,7 +1760,7 @@
pIAllocatorInstances :: Field
pIAllocatorInstances =
withDoc "IAllocator instances field" .
- renameField "IAllocatorInstances " .
+ renameField "IAllocatorInstances" .
optionalField $
simpleField "instances" [t| [NonEmptyString] |]
@@ -1891,6 +1909,29 @@
optionalField $
simpleField C.dataCollectorsIntervalName [t| GenericContainer String Int |]
+pDiagnoseDataCollectorFilename :: Field
+pDiagnoseDataCollectorFilename =
+ withDoc "Sets the filename of the script diagnose data collector should run" $
+ optionalStringField "diagnose_data_collector_filename"
+
+pMaintdRoundDelay :: Field
+pMaintdRoundDelay =
+ withDoc "Minimal delay between rounds of the maintenance daemon"
+ . optionalField
+ $ simpleField "maint_round_delay" [t| Int |]
+
+pMaintdEnableBalancing :: Field
+pMaintdEnableBalancing =
+ withDoc "Whether the maintenance daemon should also keep the cluster balanced"
+ . optionalField
+ $ simpleField "maint_balance" [t| Bool |]
+
+pMaintdBalancingThreshold :: Field
+pMaintdBalancingThreshold =
+ withDoc "Minimal gain per balancing step by the maintenance daemon"
+ . optionalField
+ $ simpleField "maint_balance_threshold" [t| Double |]
+
pNodeSslCerts :: Field
pNodeSslCerts =
withDoc "Whether to renew node SSL certificates" .
@@ -1930,3 +1971,9 @@
withDoc "Whether to allow long instance shutdowns during exports" .
defaultField [| False |] $
simpleField "long_sleep" [t| Bool |]
+
+pIsStrict :: Field
+pIsStrict =
+ withDoc "Whether the operation is in strict mode or not." .
+ defaultField [| True |] $
+ simpleField "is_strict" [t| Bool |]
diff --git a/src/Ganeti/Parsers.hs b/src/Ganeti/Parsers.hs
index 10b0e41..7cb037e 100644
--- a/src/Ganeti/Parsers.hs
+++ b/src/Ganeti/Parsers.hs
@@ -37,7 +37,9 @@
-}
module Ganeti.Parsers where
-import Control.Applicative ((*>))
+import Prelude ()
+import Ganeti.Prelude
+
import qualified Data.Attoparsec.Text as A
import Data.Attoparsec.Text (Parser)
import Data.Text (unpack)
@@ -54,6 +56,10 @@
numberP :: Parser Int
numberP = skipSpaces *> A.decimal
+-- | A parser recognizing a number preceded by spaces.
+integerP :: Parser Integer
+integerP = skipSpaces *> A.decimal
+
-- | A parser recognizing a word preceded by spaces, and closed by a space.
stringP :: Parser String
stringP = skipSpaces *> fmap unpack (A.takeWhile $ not . A.isHorizontalSpace)
diff --git a/src/Ganeti/Path.hs b/src/Ganeti/Path.hs
index 2b52d85..8c02dea 100644
--- a/src/Ganeti/Path.hs
+++ b/src/Ganeti/Path.hs
@@ -58,6 +58,7 @@
, instanceReasonDir
, getInstReasonFilename
, jqueueExecutorPy
+ , kvmPidDir
) where
import System.FilePath
@@ -190,3 +191,7 @@
jqueueExecutorPy :: IO FilePath
jqueueExecutorPy = return $ versionedsharedir
</> "ganeti" </> "jqueue" </> "exec.py"
+
+-- | The path to the directory where kvm stores the pid files.
+kvmPidDir :: IO FilePath
+kvmPidDir = runDir `pjoin` "kvm-hypervisor" `pjoin` "pid"
diff --git a/src/Ganeti/Prelude.hs b/src/Ganeti/Prelude.hs
new file mode 100644
index 0000000..8114b9f
--- /dev/null
+++ b/src/Ganeti/Prelude.hs
@@ -0,0 +1,194 @@
+{-# LANGUAGE NoImplicitPrelude, CPP #-}
+
+{-| Export Prelude as in base 4.8.0
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.Prelude (
+
+ -- * Standard types, classes and related functions
+
+ -- ** Basic data types
+ Bool(False, True),
+ (&&), (||), not, otherwise,
+
+ Maybe(Nothing, Just),
+ maybe,
+
+ Either(Left, Right),
+ either,
+
+ Ordering(LT, EQ, GT),
+ Char, String,
+
+ -- *** Tuples
+ fst, snd, curry, uncurry,
+
+ -- ** Basic type classes
+ Eq((==), (/=)),
+ Ord(compare, (<), (<=), (>=), (>), max, min),
+ Enum(succ, pred, toEnum, fromEnum, enumFrom, enumFromThen,
+ enumFromTo, enumFromThenTo),
+ Bounded(minBound, maxBound),
+
+ -- ** Numbers
+
+ -- *** Numeric types
+ Int, Integer, Float, Double,
+ Rational, Word,
+
+ -- *** Numeric type classes
+ Num((+), (-), (*), negate, abs, signum, fromInteger),
+ Real(toRational),
+ Integral(quot, rem, div, mod, quotRem, divMod, toInteger),
+ Fractional((/), recip, fromRational),
+ Floating(pi, exp, log, sqrt, (**), logBase, sin, cos, tan,
+ asin, acos, atan, sinh, cosh, tanh, asinh, acosh, atanh),
+ RealFrac(properFraction, truncate, round, ceiling, floor),
+ RealFloat(floatRadix, floatDigits, floatRange, decodeFloat,
+ encodeFloat, exponent, significand, scaleFloat, isNaN,
+ isInfinite, isDenormalized, isIEEE, isNegativeZero, atan2),
+
+ -- *** Numeric functions
+ subtract, even, odd, gcd, lcm, (^), (^^),
+ fromIntegral, realToFrac,
+
+ -- ** Monoids
+ Monoid(mempty, mappend, mconcat),
+
+ -- ** Monads and functors
+ Functor(fmap, (<$)), (<$>),
+ Applicative(pure, (<*>), (*>), (<*)),
+ Monad((>>=), (>>), return, fail),
+ mapM_, sequence_, (=<<),
+
+#if MIN_VERSION_base(4,8,0)
+ -- ** Folds and traversals
+ Foldable(elem, -- :: (Foldable t, Eq a) => a -> t a -> Bool
+ -- fold, -- :: Monoid m => t m -> m
+ foldMap, -- :: Monoid m => (a -> m) -> t a -> m
+ foldr, -- :: (a -> b -> b) -> b -> t a -> b
+ -- foldr', -- :: (a -> b -> b) -> b -> t a -> b
+ foldl, -- :: (b -> a -> b) -> b -> t a -> b
+ -- foldl', -- :: (b -> a -> b) -> b -> t a -> b
+ foldr1, -- :: (a -> a -> a) -> t a -> a
+ foldl1, -- :: (a -> a -> a) -> t a -> a
+ maximum, -- :: (Foldable t, Ord a) => t a -> a
+ minimum, -- :: (Foldable t, Ord a) => t a -> a
+ product, -- :: (Foldable t, Num a) => t a -> a
+ sum), -- :: Num a => t a -> a
+ -- toList) -- :: Foldable t => t a -> [a]
+#else
+ Foldable(foldMap,
+ foldr,
+ foldl,
+ foldr1,
+ foldl1),
+ elem,
+ maximum,
+ minimum,
+ product,
+ sum,
+#endif
+
+ Traversable(traverse, sequenceA, mapM, sequence),
+
+ -- ** Miscellaneous functions
+ id, const, (.), flip, ($), until,
+ asTypeOf, error, undefined,
+ seq, ($!),
+
+ -- * List operations
+ map, (++), filter,
+ head, last, tail, init, null, length, (!!),
+ reverse,
+ -- *** Special folds
+ and, or, any, all,
+ concat, concatMap,
+ -- ** Building lists
+ -- *** Scans
+ scanl, scanl1, scanr, scanr1,
+ -- *** Infinite lists
+ iterate, repeat, replicate, cycle,
+ -- ** Sublists
+ take, drop, splitAt, takeWhile, dropWhile, span, break,
+ -- ** Searching lists
+ notElem, lookup,
+ -- ** Zipping and unzipping lists
+ zip, zip3, zipWith, zipWith3, unzip, unzip3,
+ -- ** Functions on strings
+ lines, words, unlines, unwords,
+
+ -- * Converting to and from @String@
+ -- ** Converting to @String@
+ ShowS,
+ Show(showsPrec, showList, show),
+ shows,
+ showChar, showString, showParen,
+ -- ** Converting from @String@
+ ReadS,
+ Read(readsPrec, readList),
+ reads, readParen, read, lex,
+
+ -- * Basic Input and output
+ IO,
+ -- ** Simple I\/O operations
+ -- All I/O functions defined here are character oriented. The
+ -- treatment of the newline character will vary on different systems.
+ -- For example, two characters of input, return and linefeed, may
+ -- read as a single newline character. These functions cannot be
+ -- used portably for binary I/O.
+ -- *** Output functions
+ putChar,
+ putStr, putStrLn, print,
+ -- *** Input functions
+ getChar,
+ getLine, getContents, interact,
+ -- *** Files
+ FilePath,
+ readFile, writeFile, appendFile, readIO, readLn,
+ -- ** Exception handling in the I\/O monad
+ IOError, ioError, userError,
+
+ ) where
+
+#if MIN_VERSION_base(4,8,0)
+import Prelude
+#else
+import Prelude hiding ( elem, maximum, minimum, product, sum )
+import Data.Foldable ( Foldable(..), elem, maximum, minimum, product, sum )
+import Data.Traversable ( Traversable(..) )
+import Control.Applicative
+import Data.Monoid
+import Data.Word
+#endif
diff --git a/src/Ganeti/Query/Exec.hs b/src/Ganeti/Query/Exec.hs
index 124f7f3..79889ff 100644
--- a/src/Ganeti/Query/Exec.hs
+++ b/src/Ganeti/Query/Exec.hs
@@ -60,12 +60,14 @@
, forkJobProcess
) where
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Concurrent (rtsSupportsBoundThreads)
import Control.Concurrent.Lifted (threadDelay)
import Control.Exception (finally)
import Control.Monad
-import Control.Monad.Error
-import Data.Functor
+import Control.Monad.Error.Class (MonadError(..))
import qualified Data.Map as M
import Data.Maybe (listToMaybe, mapMaybe)
import System.Directory (getDirectoryContents)
@@ -103,7 +105,7 @@
}
-- Returns the list of all open file descriptors of the current process.
-listOpenFds :: (Error e) => ResultT e IO [Fd]
+listOpenFds :: (FromString e) => ResultT e IO [Fd]
listOpenFds = liftM filterReadable
$ liftIO (getDirectoryContents "/proc/self/fd") `orElse`
liftIO (getDirectoryContents "/dev/fd") `orElse`
@@ -224,7 +226,7 @@
-- | Forks the job process and starts processing of the given job.
-- Returns the livelock of the job and its process ID.
-forkJobProcess :: (Error e, Show e)
+forkJobProcess :: (FromString e, Show e)
=> QueuedJob -- ^ a job to process
-> FilePath -- ^ the daemons own livelock file
-> (FilePath -> ResultT e IO ())
diff --git a/src/Ganeti/Query/Filter.hs b/src/Ganeti/Query/Filter.hs
index 64eab37..0d36ff2 100644
--- a/src/Ganeti/Query/Filter.hs
+++ b/src/Ganeti/Query/Filter.hs
@@ -66,13 +66,14 @@
, FilterOp(..)
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Monad (liftM, mzero)
import Control.Monad.Trans.Maybe (MaybeT, runMaybeT)
import Control.Monad.Trans.Class (lift)
import qualified Data.Map as Map
import Data.Maybe
-import Data.Traversable (traverse)
import Text.JSON (JSValue(..), fromJSString)
import Text.JSON.Pretty (pp_value)
import qualified Text.Regex.PCRE as PCRE
@@ -136,7 +137,7 @@
-- | A type synonim for a rank-2 comparator function. This is used so
-- that we can pass the usual '<=', '>', '==' functions to 'binOpFilter'
-- and for them to be used in multiple contexts.
-type Comparator = (Eq a, Ord a) => a -> a -> Bool
+type Comparator = forall a . (Eq a, Ord a) => a -> a -> Bool
-- | Equality checker.
--
@@ -183,10 +184,10 @@
-- note: the next two implementations are the same, but we have to
-- repeat them due to the encapsulation done by FilterValue
containsFilter (QuotedString val) lst = do
- lst' <- fromJVal lst
+ lst' <- fromJVal lst :: ErrorResult [String]
return $! val `elem` lst'
containsFilter (NumericValue val) lst = do
- lst' <- fromJVal lst
+ lst' <- fromJVal lst :: ErrorResult [Integer]
return $! val `elem` lst'
diff --git a/src/Ganeti/Query/Group.hs b/src/Ganeti/Query/Group.hs
index 45bd81a..26fc881 100644
--- a/src/Ganeti/Query/Group.hs
+++ b/src/Ganeti/Query/Group.hs
@@ -55,6 +55,8 @@
, (FieldDefinition "custom_ipolicy" "CustomInstancePolicy" QFTOther
"Custom instance policy limitations",
FieldSimple (rsNormal . groupIpolicy), QffNormal)
+ , (FieldDefinition "networks" "Networks" QFTOther "Node group networks",
+ FieldSimple (rsNormal . groupNetworks), QffNormal)
, (FieldDefinition "custom_ndparams" "CustomNDParams" QFTOther
"Custom node parameters",
FieldSimple (rsNormal . groupNdparams), QffNormal)
@@ -83,6 +85,11 @@
"List of primary instances",
FieldConfig (\cfg -> rsNormal . niceSort . mapMaybe instName . fst .
getGroupInstances cfg . uuidOf), QffNormal)
+ , (FieldDefinition "hv_state" "HypervisorState" QFTOther
+ "Custom static hypervisor state",
+ FieldSimple (rsNormal . groupHvStateStatic), QffNormal)
+ , (FieldDefinition "disk_state" "DiskState" QFTOther "Disk state",
+ FieldSimple (rsNormal . groupDiskStateStatic), QffNormal)
] ++
map buildNdParamField allNDParamFields ++
timeStampFields ++
diff --git a/src/Ganeti/Query/Language.hs b/src/Ganeti/Query/Language.hs
index 882a9da..3c6919f 100644
--- a/src/Ganeti/Query/Language.hs
+++ b/src/Ganeti/Query/Language.hs
@@ -65,10 +65,11 @@
, checkRS
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.DeepSeq
import Data.Foldable
-import Data.Traversable (Traversable)
import Data.Ratio (numerator, denominator)
import Text.JSON.Pretty (pp_value)
import Text.JSON.Types
@@ -94,7 +95,8 @@
-- | No-op 'NFData' instance for 'ResultStatus', since it's a single
-- constructor data-type.
-instance NFData ResultStatus
+instance NFData ResultStatus where
+ rnf x = seq x ()
-- | Check that ResultStatus is success or fail with descriptive
-- message.
diff --git a/src/Ganeti/Query/Node.hs b/src/Ganeti/Query/Node.hs
index 9d36c74..f431ade 100644
--- a/src/Ganeti/Query/Node.hs
+++ b/src/Ganeti/Query/Node.hs
@@ -38,8 +38,10 @@
, collectLiveData
) where
-import Control.Applicative
-import Data.List
+import Prelude ()
+import Ganeti.Prelude
+
+import Data.List (intercalate)
import Data.Maybe
import qualified Text.JSON as J
@@ -243,13 +245,14 @@
, (FieldDefinition "powered" "Powered" QFTBool
"Whether node is thought to be powered on",
FieldConfig getNodePower, QffNormal)
- -- FIXME: the two fields below are incomplete in Python, part of the
- -- non-implemented node resource model; they are declared just for
- -- parity, but are not functional
- , (FieldDefinition "hv_state" "HypervisorState" QFTOther "Hypervisor state",
- FieldSimple (const rsUnavail), QffNormal)
+ , (FieldDefinition "hv_state" "HypervisorState" QFTOther
+ "Static hypervisor state for default hypervisor only",
+ FieldConfig $ (rsNormal .) . getFilledHvStateParams, QffNormal)
+ , (FieldDefinition "custom_hv_state" "CustomHypervisorState" QFTOther
+ "Custom static hypervisor state",
+ FieldSimple $ rsNormal . nodeHvStateStatic, QffNormal)
, (FieldDefinition "disk_state" "DiskState" QFTOther "Disk state",
- FieldSimple (const rsUnavail), QffNormal)
+ FieldSimple $ rsNormal . nodeDiskStateStatic, QffNormal)
] ++
map nodeLiveFieldBuilder nodeLiveFieldsDefs ++
map buildNdParamField allNDParamFields ++
diff --git a/src/Ganeti/Query/Server.hs b/src/Ganeti/Query/Server.hs
index c942803..aefe129 100644
--- a/src/Ganeti/Query/Server.hs
+++ b/src/Ganeti/Query/Server.hs
@@ -40,13 +40,15 @@
, prepMain
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Concurrent
import Control.Exception
import Control.Lens ((.~))
import Control.Monad (forever, when, mzero, guard, zipWithM, liftM, void)
import Control.Monad.Base (MonadBase, liftBase)
-import Control.Monad.Error (MonadError)
+import Control.Monad.Error.Class (MonadError)
import Control.Monad.IO.Class
import Control.Monad.Trans (lift)
import Control.Monad.Trans.Maybe
@@ -271,6 +273,18 @@
, ("data_collector_interval",
showJSON . fmap dataCollectorInterval
$ clusterDataCollectors cluster)
+ , ("diagnose_data_collector_filename",
+ showJSON $ clusterDiagnoseDataCollectorFilename cluster)
+ , ("maint_round_delay",
+ showJSON . maintRoundDelay $ configMaintenance cdata)
+ , ("maint_balance",
+ showJSON . maintBalance $ configMaintenance cdata)
+ , ("maint_balance_threshold",
+ showJSON . maintBalanceThreshold $ configMaintenance cdata)
+ , ("hv_state",
+ showJSON $ clusterHvStateStatic cluster)
+ , ("disk_state",
+ showJSON $ clusterDiskStateStatic cluster)
, ("modify_ssh_setup",
showJSON $ clusterModifySshSetup cluster)
, ("ssh_key_type", showJSON $ clusterSshKeyType cluster)
diff --git a/src/Ganeti/Runtime.hs b/src/Ganeti/Runtime.hs
index 01f3885..8cf497f 100644
--- a/src/Ganeti/Runtime.hs
+++ b/src/Ganeti/Runtime.hs
@@ -52,7 +52,6 @@
) where
import Control.Monad
-import Control.Monad.Error
import qualified Data.Map as M
import System.Exit
import System.FilePath
@@ -75,6 +74,7 @@
| GanetiWConfd
| GanetiKvmd
| GanetiLuxid
+ | GanetiMaintd
| GanetiMond
deriving (Show, Enum, Bounded, Eq, Ord)
@@ -103,6 +103,7 @@
daemonName GanetiWConfd = "ganeti-wconfd"
daemonName GanetiKvmd = "ganeti-kvmd"
daemonName GanetiLuxid = "ganeti-luxid"
+daemonName GanetiMaintd = "ganeti-maintd"
daemonName GanetiMond = "ganeti-mond"
-- | Returns whether the daemon only runs on the master node.
@@ -115,6 +116,7 @@
daemonOnlyOnMaster GanetiWConfd = True
daemonOnlyOnMaster GanetiKvmd = False
daemonOnlyOnMaster GanetiLuxid = True
+daemonOnlyOnMaster GanetiMaintd = True
daemonOnlyOnMaster GanetiMond = False
-- | Returns the log file base for a daemon.
@@ -127,6 +129,7 @@
daemonLogBase GanetiWConfd = "wconf-daemon"
daemonLogBase GanetiKvmd = "kvm-daemon"
daemonLogBase GanetiLuxid = "luxi-daemon"
+daemonLogBase GanetiMaintd = "maintenance-daemon"
daemonLogBase GanetiMond = "monitoring-daemon"
-- | Returns the configured user name for a daemon.
@@ -139,6 +142,7 @@
daemonUser GanetiWConfd = AutoConf.wconfdUser
daemonUser GanetiKvmd = AutoConf.kvmdUser
daemonUser GanetiLuxid = AutoConf.luxidUser
+daemonUser GanetiMaintd = AutoConf.mondUser
daemonUser GanetiMond = AutoConf.mondUser
-- | Returns the configured group for a daemon.
@@ -151,6 +155,7 @@
daemonGroup (DaemonGroup GanetiWConfd) = AutoConf.wconfdGroup
daemonGroup (DaemonGroup GanetiLuxid) = AutoConf.luxidGroup
daemonGroup (DaemonGroup GanetiKvmd) = AutoConf.kvmdGroup
+daemonGroup (DaemonGroup GanetiMaintd) = AutoConf.mondGroup
daemonGroup (DaemonGroup GanetiMond) = AutoConf.mondGroup
daemonGroup (ExtraGroup DaemonsGroup) = AutoConf.daemonsGroup
daemonGroup (ExtraGroup AdminGroup) = AutoConf.adminGroup
@@ -189,7 +194,7 @@
map ExtraGroup [minBound..maxBound]
-- | Computes the group/user maps.
-getEnts :: (Error e) => ResultT e IO RuntimeEnts
+getEnts :: (FromString e) => ResultT e IO RuntimeEnts
getEnts = do
let userOf = liftM userID . liftIO . getUserEntryForName . daemonUser
let groupOf = liftM groupID . liftIO . getGroupEntryForName . daemonGroup
diff --git a/src/Ganeti/Ssconf.hs b/src/Ganeti/Ssconf.hs
index 99ad3e5..e3fc864 100644
--- a/src/Ganeti/Ssconf.hs
+++ b/src/Ganeti/Ssconf.hs
@@ -54,8 +54,10 @@
, emptySSConf
) where
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Arrow ((&&&))
-import Control.Applicative ((<$>))
import Control.Exception
import Control.Monad (forM, liftM)
import qualified Data.Map as M
diff --git a/src/Ganeti/Storage/Diskstats/Parser.hs b/src/Ganeti/Storage/Diskstats/Parser.hs
index 64d3885..6f64b04 100644
--- a/src/Ganeti/Storage/Diskstats/Parser.hs
+++ b/src/Ganeti/Storage/Diskstats/Parser.hs
@@ -36,7 +36,9 @@
-}
module Ganeti.Storage.Diskstats.Parser (diskstatsParser) where
-import Control.Applicative ((<*>), (<*), (<$>))
+import Prelude ()
+import Ganeti.Prelude
+
import qualified Data.Attoparsec.Text as A
import qualified Data.Attoparsec.Combinator as AC
import Data.Attoparsec.Text (Parser)
diff --git a/src/Ganeti/Storage/Drbd/Parser.hs b/src/Ganeti/Storage/Drbd/Parser.hs
index c9c8dce..8dee72c 100644
--- a/src/Ganeti/Storage/Drbd/Parser.hs
+++ b/src/Ganeti/Storage/Drbd/Parser.hs
@@ -36,7 +36,10 @@
-}
module Ganeti.Storage.Drbd.Parser (drbdStatusParser, commaIntParser) where
-import Control.Applicative ((<*>), (*>), (<*), (<$>), (<|>), pure)
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative ((<|>))
import qualified Data.Attoparsec.Text as A
import qualified Data.Attoparsec.Combinator as AC
import Data.Attoparsec.Text (Parser)
diff --git a/src/Ganeti/Storage/Lvm/LVParser.hs b/src/Ganeti/Storage/Lvm/LVParser.hs
index 470c41a..cf31431 100644
--- a/src/Ganeti/Storage/Lvm/LVParser.hs
+++ b/src/Ganeti/Storage/Lvm/LVParser.hs
@@ -37,7 +37,9 @@
-}
module Ganeti.Storage.Lvm.LVParser (lvParser, lvCommand, lvParams) where
-import Control.Applicative ((<*>), (*>), (<*), (<$>))
+import Prelude ()
+import Ganeti.Prelude
+
import qualified Data.Attoparsec.Text as A
import qualified Data.Attoparsec.Combinator as AC
import Data.Attoparsec.Text (Parser)
diff --git a/src/Ganeti/THH.hs b/src/Ganeti/THH.hs
index 7ae4c9f..4bc7e88 100644
--- a/src/Ganeti/THH.hs
+++ b/src/Ganeti/THH.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE ParallelListComp, TemplateHaskell #-}
+{-# LANGUAGE ParallelListComp, TemplateHaskell, RankNTypes #-}
{-| TemplateHaskell helper for Ganeti Haskell code.
@@ -77,11 +77,14 @@
, ssconfConstructorName
) where
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Arrow ((&&&), second)
import Control.Applicative
import Control.Lens.Type (Lens, Lens')
import Control.Lens (lens, set, element)
-import Control.Monad
+import Control.Monad (liftM, replicateM, when, unless)
import Control.Monad.Base () -- Needed to prevent spurious GHC linking errors.
import Control.Monad.Writer (tell)
import qualified Control.Monad.Trans as MT
@@ -90,10 +93,9 @@
-- See issue #683 and https://ghc.haskell.org/trac/ghc/ticket/4899
import Data.Char
import Data.Function (on)
-import Data.List
+import Data.List (intercalate, groupBy, stripPrefix, sort, nub)
import Data.Maybe
import qualified Data.Map as M
-import Data.Monoid
import qualified Data.Set as S
import qualified Data.Text as T
import Language.Haskell.TH
@@ -486,7 +488,7 @@
genFromRaw :: Name -> Name -> Name -> [(String, Either String Name)] -> Q [Dec]
genFromRaw traw fname tname constructors = do
-- signature of form (Monad m) => String -> m $name
- sigt <- [t| (Monad m) => $(conT traw) -> m $(conT tname) |]
+ sigt <- [t| forall m. (Monad m) => $(conT traw) -> m $(conT tname) |]
-- clauses for a guarded pattern
let varp = mkName "s"
varpe = varE varp
@@ -1201,8 +1203,13 @@
-> Q [Dec]
genDictObject save_fn load_fn sname fields = do
let name = mkName sname
+ -- newName fails in ghc 7.10 when used on keywords
+ newName' "data" = newName "data_ghcBug10599"
+ newName' "instance" = newName "instance_ghcBug10599"
+ newName' "type" = newName "type_ghcBug10599"
+ newName' s = newName s
-- toDict
- fnames <- mapM (newName . fieldVariable) fields
+ fnames <- mapM (newName' . fieldVariable) fields
let pat = conP name (map varP fnames)
tdexp = [| concat $(listE $ zipWith save_fn fnames fields) |]
tdclause <- clause [pat] (normalB tdexp) []
diff --git a/src/Ganeti/THH/HsRPC.hs b/src/Ganeti/THH/HsRPC.hs
index 7822912..8bcdb4d 100644
--- a/src/Ganeti/THH/HsRPC.hs
+++ b/src/Ganeti/THH/HsRPC.hs
@@ -43,11 +43,13 @@
, mkRpcCalls
) where
-import Control.Applicative
-import Control.Monad
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Monad (liftM)
import Control.Monad.Base
-import Control.Monad.Error
-import Control.Monad.Reader
+import Control.Monad.Error.Class (MonadError)
+import Control.Monad.Reader (ReaderT, runReaderT, ask)
import Control.Monad.Trans.Control
import Language.Haskell.TH
import qualified Text.JSON as J
diff --git a/src/Ganeti/THH/PyRPC.hs b/src/Ganeti/THH/PyRPC.hs
index eee1554..81e9223 100644
--- a/src/Ganeti/THH/PyRPC.hs
+++ b/src/Ganeti/THH/PyRPC.hs
@@ -40,9 +40,11 @@
, genPyUDSRpcStubStr
) where
-import Control.Monad
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Monad (liftM, zipWithM)
import Data.Char (toLower, toUpper)
-import Data.Functor
import Data.Maybe (fromMaybe)
import Language.Haskell.TH
import Language.Haskell.TH.Syntax (liftString)
diff --git a/src/Ganeti/THH/PyType.hs b/src/Ganeti/THH/PyType.hs
index 5a3941c..efcbc32 100644
--- a/src/Ganeti/THH/PyType.hs
+++ b/src/Ganeti/THH/PyType.hs
@@ -39,8 +39,10 @@
, pyOptionalType
) where
-import Control.Applicative
-import Control.Monad
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Monad (ap, liftM)
import Data.List (intercalate)
import Language.Haskell.TH
import Language.Haskell.TH.Syntax (Lift(..))
diff --git a/src/Ganeti/THH/RPC.hs b/src/Ganeti/THH/RPC.hs
index fa4b84c..25388df 100644
--- a/src/Ganeti/THH/RPC.hs
+++ b/src/Ganeti/THH/RPC.hs
@@ -42,10 +42,12 @@
, mkRpcM
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Arrow ((&&&))
import Control.Monad
-import Control.Monad.Error.Class
+import Control.Monad.Error.Class (MonadError(..))
import Data.Map (Map)
import qualified Data.Map as Map
import Language.Haskell.TH
@@ -78,12 +80,12 @@
, US.hExec = liftToHandler . exec
}
where
- orError :: (MonadError e m, Error e) => Maybe a -> e -> m a
+ orError :: (MonadError e m, FromString e) => Maybe a -> e -> m a
orError m e = maybe (throwError e) return m
exec (Request m as) = do
(RpcFn f) <- orError (Map.lookup m fs)
- (strMsg $ "No such method: " ++ m)
+ (mkFromString $ "No such method: " ++ m)
i <- fromJResultE "RPC input" . J.readJSON $ as
o <- f i -- lift $ f i
return $ J.showJSON o
diff --git a/src/Ganeti/Types.hs b/src/Ganeti/Types.hs
index 318127e..8da06d4 100644
--- a/src/Ganeti/Types.hs
+++ b/src/Ganeti/Types.hs
@@ -190,7 +190,9 @@
, TagsObject(..)
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Monad (liftM)
import qualified Text.JSON as JSON
import Text.JSON (JSON, readJSON, showJSON)
diff --git a/src/Ganeti/UDSServer.hs b/src/Ganeti/UDSServer.hs
index c259475..7008d08 100644
--- a/src/Ganeti/UDSServer.hs
+++ b/src/Ganeti/UDSServer.hs
@@ -70,7 +70,9 @@
, listener
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Concurrent.Lifted (fork, yield)
import Control.Monad.Base
import Control.Monad.Trans.Control
@@ -79,7 +81,7 @@
import qualified Data.ByteString as B
import qualified Data.ByteString.UTF8 as UTF8
import Data.IORef
-import Data.List
+import Data.List (isInfixOf)
import Data.Word (Word8)
import qualified Network.Socket as S
import System.Directory (removeFile)
diff --git a/src/Ganeti/Utils.hs b/src/Ganeti/Utils.hs
index 47f65c1..0c2a0ac 100644
--- a/src/Ganeti/Utils.hs
+++ b/src/Ganeti/Utils.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE FlexibleContexts, ScopedTypeVariables #-}
+{-# LANGUAGE FlexibleContexts, ScopedTypeVariables, CPP #-}
{-| Utility functions. -}
@@ -58,6 +58,7 @@
, exitWhen
, exitUnless
, logWarningIfBad
+ , logAndBad
, rStripSpace
, newUUID
, isUUID
@@ -96,21 +97,39 @@
, ensurePermissions
, ordNub
, isSubsequenceOf
+ , maxBy
+ , threadDelaySeconds
+ , monotoneFind
+ , iterateJust
+ , partitionM
, frequency
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Concurrent
import Control.Exception (try, bracket)
import Control.Monad
-import Control.Monad.Error
import qualified Data.Attoparsec.ByteString as A
import qualified Data.ByteString.UTF8 as UTF8
import Data.Char (toUpper, isAlphaNum, isDigit, isSpace)
import qualified Data.Either as E
import Data.Function (on)
import Data.IORef
-import Data.List
+#if MIN_VERSION_base(4,8,0)
+import Data.List hiding (isSubsequenceOf)
+#else
+import Data.List ( intercalate
+ , find
+ , foldl'
+ , group
+ , transpose
+ , sort
+ , sortBy
+ , isPrefixOf
+ , maximumBy)
+#endif
import qualified Data.Map as M
import Data.Maybe (fromMaybe)
import qualified Data.Set as S
@@ -190,6 +209,10 @@
then '\'':v ++ "'"
else v
+-- | Delay a thread for the given number of seconds.
+threadDelaySeconds :: Int -> IO ()
+threadDelaySeconds = threadDelay . (*) 1000000
+
-- * Mathematical functions
-- Simple and slow statistical functions, please replace with better
@@ -351,6 +374,12 @@
return defVal
logWarningIfBad _ _ (Ok v) = return v
+-- | Log a message and return a Bad result.
+logAndBad :: String -> IO (Result a)
+logAndBad msg = do
+ logNotice msg
+ return $ Bad msg
+
-- | Try an IO interaction, log errors and unfold as a 'Result'.
tryAndLogIOError :: IO a -> String -> (a -> Result b) -> IO (Result b)
tryAndLogIOError io msg okfn =
@@ -820,6 +849,45 @@
isSubsequenceOf a@(x:a') (y:b) | x == y = isSubsequenceOf a' b
| otherwise = isSubsequenceOf a b
+-- | Compute the maximum of two elements by a given order.
+-- As opposed to using `maximumBy`, this function is guaranteed
+-- to be total, as the signature enforces a non-empty list of
+-- arguments.
+maxBy :: (a -> a -> Ordering) -> a -> a -> a
+maxBy ord a b = maximumBy ord [a, b]
+
+-- | Given a predicate that is monotone on a list, find the
+-- first list entry where it holds, if any. Use the monotonicity
+-- property to evaluate the property at as few places as possible,
+-- guided by the heuristics provided.
+monotoneFind :: ([a] -> Int) -> (a -> Bool) -> [a] -> Maybe a
+monotoneFind heuristics p xs =
+ let count = heuristics xs
+ in case () of
+ _ | x:xs' <- drop count xs
+ -> if p x
+ then (`mplus` Just x) . monotoneFind heuristics p
+ $ take count xs
+ else monotoneFind heuristics p xs'
+ _ | x:xs' <- xs
+ -> if p x
+ then Just x
+ else monotoneFind heuristics p xs'
+ _ -> Nothing
+
+-- | Iterate a function as long as it returns Just values, collecting
+-- all the Justs that were obtained.
+iterateJust :: (a -> Maybe a) -> a -> [a]
+iterateJust f a = a : maybe [] (iterateJust f) (f a)
+
+-- | A version of partition with a monadic predicate
+-- Implementation taken from David Fox's Extras package.
+partitionM :: (Monad m) => (a -> m Bool) -> [a] -> m ([a], [a])
+partitionM p xs = foldM f ([], []) xs
+ where f (a, b) x = do
+ pv <- p x
+ return $ if pv then (x : a, b) else (a, x : b)
+
-- | Returns a list of tuples of elements and the number of times they occur
-- in a list
frequency :: Ord t => [t] -> [(Int, t)]
diff --git a/src/Ganeti/Utils/Atomic.hs b/src/Ganeti/Utils/Atomic.hs
index 7f4d2df..ae7bf81 100644
--- a/src/Ganeti/Utils/Atomic.hs
+++ b/src/Ganeti/Utils/Atomic.hs
@@ -43,7 +43,7 @@
import qualified Control.Exception.Lifted as L
import Control.Monad
import Control.Monad.Base (MonadBase(..))
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError)
import Control.Monad.Trans.Control
import System.FilePath.Posix (takeDirectory, takeBaseName)
import System.IO
@@ -91,12 +91,12 @@
-- | Opens a file in a R/W mode, locks it (blocking if needed) and runs
-- a given action while the file is locked. Releases the lock and
-- closes the file afterwards.
-withLockedFile :: (MonadError e m, Error e, MonadBaseControl IO m)
+withLockedFile :: (MonadError e m, FromString e, MonadBaseControl IO m)
=> FilePath -> (Fd -> m a) -> m a
withLockedFile path =
L.bracket (openAndLock path) (liftBase . closeFd)
where
- openAndLock :: (MonadError e m, Error e, MonadBaseControl IO m)
+ openAndLock :: (MonadError e m, FromString e, MonadBaseControl IO m)
=> FilePath -> m Fd
openAndLock p = liftBase $ do
fd <- openFd p ReadWrite Nothing defaultFileFlags
diff --git a/src/Ganeti/Utils/Http.hs b/src/Ganeti/Utils/Http.hs
new file mode 100644
index 0000000..901d401
--- /dev/null
+++ b/src/Ganeti/Utils/Http.hs
@@ -0,0 +1,102 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+{-| Utils for HTTP servers
+
+-}
+
+{-
+
+Copyright (C) 2013 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.Utils.Http
+ ( httpConfFromOpts
+ , error404
+ , plainJSON
+ ) where
+
+import Control.Monad (liftM)
+import Data.ByteString.Char8 (pack)
+import Data.Map ((!))
+import Data.Maybe (fromMaybe)
+import Network.BSD (getServicePortNumber)
+import qualified Network.Socket as Socket
+import Snap.Core (Snap, writeBS, modifyResponse, setResponseStatus)
+import Snap.Http.Server.Config ( Config, ConfigLog(ConfigFileLog), emptyConfig
+ , setAccessLog, setErrorLog, setCompression
+ , setVerbose, setPort, setBind )
+import qualified Text.JSON as J
+
+import Ganeti.BasicTypes (GenericResult(..))
+import qualified Ganeti.Constants as C
+import Ganeti.Daemon (DaemonOptions(..))
+import Ganeti.Runtime ( GanetiDaemon, daemonName
+ , daemonsExtraLogFile, ExtraLogReason(..))
+import qualified Ganeti.Ssconf as Ssconf
+import Ganeti.Utils (withDefaultOnIOError)
+
+-- * Configuration handling
+
+-- | The default configuration for the HTTP server.
+defaultHttpConf :: FilePath -> FilePath -> Config Snap ()
+defaultHttpConf accessLog errorLog =
+ setAccessLog (ConfigFileLog accessLog) .
+ setCompression False .
+ setErrorLog (ConfigFileLog errorLog) $
+ setVerbose False
+ emptyConfig
+
+-- | Get the HTTP Configuration from the daemon options.
+httpConfFromOpts :: GanetiDaemon -> DaemonOptions -> IO (Config Snap ())
+httpConfFromOpts daemon opts = do
+ accessLog <- daemonsExtraLogFile daemon AccessLog
+ errorLog <- daemonsExtraLogFile daemon ErrorLog
+ let name = daemonName daemon
+ standardPort = snd $ C.daemonsPorts ! name
+ defaultPort <- withDefaultOnIOError standardPort
+ . liftM fromIntegral
+ $ getServicePortNumber name
+ defaultFamily <- Ssconf.getPrimaryIPFamily Nothing
+ let defaultBind = if defaultFamily == Ok Socket.AF_INET6 then "::" else "*"
+ return .
+ setPort (maybe defaultPort fromIntegral (optPort opts)) .
+ setBind (pack . fromMaybe defaultBind $ optBindAddress opts)
+ $ defaultHttpConf accessLog errorLog
+
+
+-- * Standard answers
+
+-- | Resource not found error
+error404 :: Snap ()
+error404 = do
+ modifyResponse $ setResponseStatus 404 "Not found"
+ writeBS "Resource not found"
+
+-- | Return the JSON encoding of an object
+plainJSON :: J.JSON a => a -> Snap ()
+plainJSON = writeBS . pack . J.encode
diff --git a/src/Ganeti/Utils/IORef.hs b/src/Ganeti/Utils/IORef.hs
index 488d2e8..a220e3e 100644
--- a/src/Ganeti/Utils/IORef.hs
+++ b/src/Ganeti/Utils/IORef.hs
@@ -34,6 +34,7 @@
module Ganeti.Utils.IORef
( atomicModifyWithLens
+ , atomicModifyWithLens_
, atomicModifyIORefErr
, atomicModifyIORefErrLog
) where
@@ -53,6 +54,11 @@
=> IORef a -> Lens a a b c -> (b -> (r, c)) -> m r
atomicModifyWithLens ref l f = atomicModifyIORef ref (swap . traverseOf l f)
+-- | Atomically modify an 'IORef', not reading any value.
+atomicModifyWithLens_ :: (MonadBase IO m)
+ => IORef a -> Lens a a b c -> (b -> c) -> m ()
+atomicModifyWithLens_ ref l f = atomicModifyWithLens ref l $ (,) () . f
+
-- | Atomically modifies an 'IORef' using a function that can possibly fail.
-- If it fails, the value of the 'IORef' is preserved.
atomicModifyIORefErr :: (MonadBase IO m)
diff --git a/src/Ganeti/Utils/Livelock.hs b/src/Ganeti/Utils/Livelock.hs
index 8bbb37f..905cd88 100644
--- a/src/Ganeti/Utils/Livelock.hs
+++ b/src/Ganeti/Utils/Livelock.hs
@@ -41,7 +41,7 @@
import qualified Control.Exception as E
import Control.Monad
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError)
import System.Directory (doesFileExist, getDirectoryContents)
import System.FilePath.Posix ((</>))
import System.IO
@@ -59,7 +59,7 @@
-- | Appends the current time to the given prefix, creates
-- the lockfile in the appropriate directory, and locks it.
-- Returns its full path and the file's file descriptor.
-mkLivelockFile :: (Error e, MonadError e m, MonadIO m)
+mkLivelockFile :: (FromString e, MonadError e m, MonadIO m)
=> FilePath -> m (Fd, Livelock)
mkLivelockFile prefix = do
(TOD secs _) <- liftIO getClockTime
diff --git a/src/Ganeti/Utils/Monad.hs b/src/Ganeti/Utils/Monad.hs
index cd09a0d..cecaaf4 100644
--- a/src/Ganeti/Utils/Monad.hs
+++ b/src/Ganeti/Utils/Monad.hs
@@ -44,7 +44,7 @@
) where
import Control.Monad
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError(..))
import Control.Monad.Trans.Maybe
-- | Retries the given action up to @n@ times.
diff --git a/src/Ganeti/Utils/MultiMap.hs b/src/Ganeti/Utils/MultiMap.hs
index 0f97e26..6f46e1d 100644
--- a/src/Ganeti/Utils/MultiMap.hs
+++ b/src/Ganeti/Utils/MultiMap.hs
@@ -54,13 +54,13 @@
, values
) where
-import Prelude hiding (lookup, null, elem)
+import Prelude ()
+import Ganeti.Prelude hiding (lookup, null, elem)
import Control.Monad
import qualified Data.Foldable as F
import qualified Data.Map as M
import Data.Maybe (fromMaybe, isJust)
-import Data.Monoid
import qualified Data.Set as S
import qualified Text.JSON as J
diff --git a/src/Ganeti/Utils/Random.hs b/src/Ganeti/Utils/Random.hs
index 500e00d..bdccd4e 100644
--- a/src/Ganeti/Utils/Random.hs
+++ b/src/Ganeti/Utils/Random.hs
@@ -38,7 +38,9 @@
, delayRandom
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Concurrent (threadDelay)
import Control.Monad
import Control.Monad.State
diff --git a/src/Ganeti/Utils/Statistics.hs b/src/Ganeti/Utils/Statistics.hs
index 7057973..ff91d93 100644
--- a/src/Ganeti/Utils/Statistics.hs
+++ b/src/Ganeti/Utils/Statistics.hs
@@ -1,5 +1,4 @@
-{-# LANGUAGE BangPatterns #-}
-
+{-# LANGUAGE BangPatterns, MultiParamTypeClasses, FunctionalDependencies#-}
{-| Utility functions for statistical accumulation. -}
@@ -34,100 +33,105 @@
-}
module Ganeti.Utils.Statistics
- ( Statistics
+ ( Stat
+ , SumStat(..)
+ , StdDevStat(..)
, TagTagMap
- , AggregateComponent(..)
- , getSumStatistics
- , getStdDevStatistics
- , getMapStatistics
- , getStatisticValue
- , updateStatistics
+ , MapData(..)
+ , MapStat(..)
+ , update
+ , calculate
+ , getValue
+ , toDouble
) where
import qualified Data.Foldable as Foldable
import Data.List (foldl')
import qualified Data.Map as Map
+-- | Typeclass describing necessary statistical accumulation functions. Types
+-- defining an instance of Stat behave as if the given statistics were computed
+-- on the list of values, but they allow a potentially more efficient update of
+-- a given value. c is the statistical accumulation data type itself while s is
+-- a type of spread values used to calculate a statistic. s is defined as a
+-- type depending on c in order to prevent ambiguity.
+class (Show c) => Stat s c | c -> s where
+ -- | Calculate a statistics from the spread values list.
+ calculate :: [s] -> c
+ -- | In a given statistic replace one value by another. This will only give
+ -- meaningful results, if the original value was actually part of
+ -- the statistics.
+ update :: c -> s -> s -> c
+ -- | Obtain the value of a statistic.
+ getValue :: c -> Double
+
+-- | Type of statistical accumulations representing simple sum of values
+data SumStat = SumStat Double deriving Show
+-- | Type of statistical accumulations representing values standard deviation
+data StdDevStat = StdDevStat Double Double Double deriving Show
+ -- count, sum, and not the sum of squares---instead the
+ -- computed variance for better precision.
+-- | Type of statistical accumulations representing the amount of instances per
+-- each tags pair. See Also TagTagMap documentation.
+data MapStat = MapStat TagTagMap deriving Show
+
+instance Stat Double SumStat where
+ calculate xs =
+ let addComponent s x =
+ let !s' = s + x
+ in s'
+ st = foldl' addComponent 0 xs
+ in SumStat st
+ update (SumStat s) x x' =
+ SumStat $ s + x' - x
+ getValue (SumStat s) = s
+
+instance Stat Double StdDevStat where
+ calculate xs =
+ let addComponent (n, s) x =
+ let !n' = n + 1
+ !s' = s + x
+ in (n', s')
+ (nt, st) = foldl' addComponent (0, 0) xs
+ mean = st / nt
+ center x = x - mean
+ nvar = foldl' (\v x -> let d = center x in v + d * d) 0 xs
+ in StdDevStat nt st (nvar / nt)
+ update (StdDevStat n s var) x x' =
+ let !ds = x' - x
+ !dss = x' * x' - x * x
+ !dnnvar = (n * dss - 2 * s * ds) - ds * ds
+ !s' = s + ds
+ !var' = max 0 $ var + dnnvar / (n * n)
+ in StdDevStat n s' var'
+ getValue (StdDevStat _ _ var) = sqrt var
+
-- | Type to store the number of instances for each exclusion and location
-- pair. This is necessary to calculate second component of location score.
type TagTagMap = Map.Map (String, String) Int
--- | Abstract type of statistical accumulations. They behave as if the given
--- statistics were computed on the list of values, but they allow a potentially
--- more efficient update of a given value.
-data Statistics = SumStatistics Double
- | StdDevStatistics Double Double Double
- -- count, sum, and not the sum of squares---instead the
- -- computed variance for better precission.
- | MapStatistics TagTagMap deriving Show
+-- | Data type used to store spread values of type TagTagMap. This data type
+-- is introduced only to define an instance of Stat for TagTagMap.
+data MapData = MapData TagTagMap
--- | Abstract type of per-node statistics measures. The SimpleNumber is used
--- to construct SumStatistics and StdDevStatistics while SpreadValues is used
--- to construct MapStatistics.
-data AggregateComponent = SimpleNumber Double
- | SpreadValues TagTagMap
--- Each function below depends on the contents of AggregateComponent but it's
--- necessary to define each function as a function processing both
--- SimpleNumber and SpreadValues instances (see Metrics.hs). That's why
--- pattern matches for invalid type defined as functions which change nothing.
+-- | Helper function unpacking [MapData] spread values list.
+mapTmpToMap :: [MapData] -> [TagTagMap]
+mapTmpToMap (MapData m : xs) = m : mapTmpToMap xs
+mapTmpToMap _ = []
--- | Get a statistics that sums up the values.
-getSumStatistics :: [AggregateComponent] -> Statistics
-getSumStatistics xs =
- let addComponent s (SimpleNumber x) =
- let !s' = s + x
- in s'
- addComponent s _ = s
- st = foldl' addComponent 0 xs
- in SumStatistics st
+instance Stat MapData MapStat where
+ calculate xs =
+ let addComponent m x =
+ let !m' = Map.unionWith (+) m x
+ in m'
+ mt = foldl' addComponent Map.empty (mapTmpToMap xs)
+ in MapStat mt
+ update (MapStat m) (MapData x) (MapData x') =
+ let nm = Map.unionWith (+) (Map.unionWith (-) m x) x'
+ in MapStat nm
+ getValue (MapStat m) = fromIntegral $ Foldable.sum m - Map.size m
--- | Get a statistics for the standard deviation.
-getStdDevStatistics :: [AggregateComponent] -> Statistics
-getStdDevStatistics xs =
- let addComponent (n, s) (SimpleNumber x) =
- let !n' = n + 1
- !s' = s + x
- in (n', s')
- addComponent (n, s) _ = (n, s)
- (nt, st) = foldl' addComponent (0, 0) xs
- mean = st / nt
- center (SimpleNumber x) = x - mean
- center _ = 0
- nvar = foldl' (\v x -> let d = center x in v + d * d) 0 xs
- in StdDevStatistics nt st (nvar / nt)
-
--- | Get a statistics for the standard deviation.
-getMapStatistics :: [AggregateComponent] -> Statistics
-getMapStatistics xs =
- let addComponent m (SpreadValues x) =
- let !m' = Map.unionWith (+) m x
- in m'
- addComponent m _ = m
- mt = foldl' addComponent Map.empty xs
- in MapStatistics mt
-
--- | Obtain the value of a statistics.
-getStatisticValue :: Statistics -> Double
-getStatisticValue (SumStatistics s) = s
-getStatisticValue (StdDevStatistics _ _ var) = sqrt var
-getStatisticValue (MapStatistics m) = fromIntegral $ Foldable.sum m - Map.size m
--- Function above calculates sum (N_i - 1) over each map entry.
-
--- | In a given statistics replace on value by another. This
--- will only give meaningful results, if the original value
--- was actually part of the statistics.
-updateStatistics :: Statistics -> (AggregateComponent, AggregateComponent) ->
- Statistics
-updateStatistics (SumStatistics s) (SimpleNumber x, SimpleNumber y) =
- SumStatistics $ s + (y - x)
-updateStatistics (StdDevStatistics n s var) (SimpleNumber x, SimpleNumber y) =
- let !ds = y - x
- !dss = y * y - x * x
- !dnnvar = (n * dss - 2 * s * ds) - ds * ds
- !s' = s + ds
- !var' = max 0 $ var + dnnvar / (n * n)
- in StdDevStatistics n s' var'
-updateStatistics (MapStatistics m) (SpreadValues x, SpreadValues y) =
- let nm = Map.unionWith (+) (Map.unionWith (-) m x) y
- in MapStatistics nm
-updateStatistics s _ = s
+-- | Converts Integral types to Double. It's useful when there's not enough type
+-- information in the expression to call fromIntegral directly.
+toDouble :: (Integral a) => a -> Double
+toDouble = fromIntegral
diff --git a/src/Ganeti/Utils/UniStd.hs b/src/Ganeti/Utils/UniStd.hs
index c3453d9..6f301f2 100644
--- a/src/Ganeti/Utils/UniStd.hs
+++ b/src/Ganeti/Utils/UniStd.hs
@@ -54,7 +54,7 @@
-- Because of a bug in GHC 7.6.3 (at least), calling 'hIsClosed' on a handle
-- to get the file descriptor leaks memory. Therefore we open a given file
-- just to sync it and close it again.
-fsyncFile :: (Error e) => FilePath -> ResultT e IO ()
+fsyncFile :: (FromString e) => FilePath -> ResultT e IO ()
fsyncFile path = liftIO
$ bracket (openFd path ReadOnly Nothing defaultFileFlags) closeFd callfsync
where
diff --git a/src/Ganeti/Utils/Validate.hs b/src/Ganeti/Utils/Validate.hs
index 421f0c1..cab6b90 100644
--- a/src/Ganeti/Utils/Validate.hs
+++ b/src/Ganeti/Utils/Validate.hs
@@ -51,16 +51,20 @@
, validate'
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Arrow
import Control.Monad
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError(..))
import Control.Monad.Writer
import qualified Data.Foldable as F
import Data.Functor.Identity
import Data.List (intercalate)
import Data.Sequence
+import Ganeti.BasicTypes (FromString(..))
+
-- | Monad for running validation checks.
newtype ValidationMonadT m a =
ValidationMonad { runValidationMonad :: WriterT (Seq String) m a }
@@ -100,19 +104,19 @@
-- | A helper function for throwing an exception if a list of errors
-- is non-empty.
-throwIfErrors :: (MonadError e m, Error e) => (a, [String]) -> m a
+throwIfErrors :: (MonadError e m, FromString e) => (a, [String]) -> m a
throwIfErrors (x, []) = return x
-throwIfErrors (_, es) = throwError (strMsg $ "Validation errors: "
- ++ intercalate "; " es)
+throwIfErrors (_, es) = throwError (mkFromString $ "Validation errors: "
+ ++ intercalate "; " es)
-- | Runs a validation action and if there are errors, combine them
-- into an exception.
-evalValidate :: (MonadError e m, Error e) => ValidationMonad a -> m a
+evalValidate :: (MonadError e m, FromString e) => ValidationMonad a -> m a
evalValidate = throwIfErrors . runValidate
-- | Runs a validation action and if there are errors, combine them
-- into an exception.
-evalValidateT :: (MonadError e m, Error e) => ValidationMonadT m a -> m a
+evalValidateT :: (MonadError e m, FromString e) => ValidationMonadT m a -> m a
evalValidateT k = runValidateT k >>= throwIfErrors
-- | A typeclass for objects that can be validated.
diff --git a/src/Ganeti/WConfd/Client.hs b/src/Ganeti/WConfd/Client.hs
index a477907..12bd69b 100644
--- a/src/Ganeti/WConfd/Client.hs
+++ b/src/Ganeti/WConfd/Client.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE TemplateHaskell #-}
+{-# LANGUAGE TemplateHaskell, FlexibleContexts #-}
{-| The Ganeti WConfd client functions.
@@ -38,14 +38,22 @@
module Ganeti.WConfd.Client where
+import Control.Concurrent (threadDelay)
import Control.Exception.Lifted (bracket)
+import Control.Monad (unless)
+import Control.Monad.Base
+import Control.Monad.Error.Class (MonadError)
+import Control.Monad.Trans.Control (MonadBaseControl)
-import Ganeti.THH.HsRPC
+import Ganeti.BasicTypes (runResultT, GenericResult(..))
import Ganeti.Constants
+import Ganeti.Errors (GanetiException)
import Ganeti.JSON (unMaybeForJSON)
import Ganeti.Locking.Locks (ClientId)
import Ganeti.Objects (ConfigData)
-import Ganeti.UDSServer (ConnectConfig(..), Client, connectClient)
+import qualified Ganeti.Path as Path
+import Ganeti.THH.HsRPC
+import Ganeti.UDSServer (ConnectConfig(..), Client, connectClient, closeClient)
import Ganeti.WConfd.Core (exportedFunctions)
-- * Generated client functions
@@ -65,6 +73,15 @@
getWConfdClient :: FilePath -> IO Client
getWConfdClient = connectClient wconfdConnectConfig wconfdDefCtmo
+-- | Run an Rpc with a fresh client.
+runNewWConfdClient :: ( MonadBase IO m, MonadBaseControl IO m
+ , MonadError GanetiException m )
+ => RpcClientMonad a -> m a
+runNewWConfdClient request =
+ bracket (liftBase (Path.defaultWConfdSocket >>= getWConfdClient))
+ (liftBase . closeClient)
+ $ runRpcClient request
+
-- * Helper functions for getting a remote lock
-- | Calls the `lockConfig` RPC until the lock is obtained.
@@ -86,3 +103,14 @@
withLockedConfig c shared =
-- Unlock config even if something throws.
bracket (waitLockConfig c shared) (const $ unlockConfig c)
+
+
+-- * Other functions
+
+-- | Try an RPC until no errors occur and the result is true.
+runModifyRpc :: RpcClientMonad Bool -> IO ()
+runModifyRpc action = do
+ res <- runResultT $ runNewWConfdClient action
+ unless (res == Ok True) $ do
+ threadDelay 100000 -- sleep 0.1 seconds
+ runModifyRpc action
diff --git a/src/Ganeti/WConfd/ConfigModifications.hs b/src/Ganeti/WConfd/ConfigModifications.hs
index 46686d4..2fdea73 100644
--- a/src/Ganeti/WConfd/ConfigModifications.hs
+++ b/src/Ganeti/WConfd/ConfigModifications.hs
@@ -1,4 +1,5 @@
-{-# LANGUAGE TemplateHaskell, NoMonomorphismRestriction, FlexibleContexts #-}
+{-# LANGUAGE TemplateHaskell, NoMonomorphismRestriction, FlexibleContexts,
+ RankNTypes #-}
{-| The WConfd functions for direct configuration manipulation
@@ -39,18 +40,21 @@
module Ganeti.WConfd.ConfigModifications where
-import Control.Applicative ((<$>))
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Lens (_2)
import Control.Lens.Getter ((^.))
-import Control.Lens.Setter ((.~), (%~))
-import qualified Data.ByteString.UTF8 as UTF8
+import Control.Lens.Setter (Setter, (.~), (%~), (+~), over)
import Control.Lens.Traversal (mapMOf)
-import Control.Monad (unless, when, forM_, foldM, liftM2)
-import Control.Monad.Error (throwError, MonadError)
+import Control.Lens.Type (Simple)
+import Control.Monad (unless, when, forM_, foldM, liftM, liftM2)
+import Control.Monad.Error.Class (throwError, MonadError)
import Control.Monad.IO.Class (liftIO)
import Control.Monad.Trans.State (StateT, get, put, modify,
runStateT, execStateT)
-import Data.Foldable (fold, foldMap)
+import qualified Data.ByteString.UTF8 as UTF8
+import Data.Foldable (fold)
import Data.List (elemIndex)
import Data.Maybe (isJust, maybeToList, fromMaybe, fromJust)
import Language.Haskell.TH (Name)
@@ -68,7 +72,8 @@
import Ganeti.Logging.Lifted (logDebug, logInfo)
import Ganeti.Objects
import Ganeti.Objects.Lens
-import Ganeti.Types (AdminState, AdminStateSource)
+import Ganeti.Types (AdminState, AdminStateSource, JobId)
+import Ganeti.Utils (ordNub)
import Ganeti.WConfd.ConfigState (ConfigState, csConfigData, csConfigDataL)
import Ganeti.WConfd.Monad (WConfdMonad, modifyConfigWithLock
, modifyConfigAndReturnWithLock)
@@ -117,7 +122,7 @@
instKeys = keysFromC . configInstances . csConfigData $ cs
nodeKeys = keysFromC . configNodes . csConfigData $ cs
-
+
instValues = map uuidOf . valuesFromC
. configInstances . csConfigData $ cs
nodeValues = map uuidOf . valuesFromC . configNodes . csConfigData $ cs
@@ -672,6 +677,74 @@
. T.releaseDRBDMinors . UTF8.fromString $ uuidOf disk
return . MaybeForJSON $ fmap (_2 %~ TimeAsDoubleJSON) r
+-- | Set a particular value and bump serial in the hosting
+-- structure. Arguments are a setter to focus on the part
+-- of the configuration that gets serial-bumped, and a modification
+-- of that part. The function will do the change and bump the serial
+-- in the WConfdMonad temporarily acquiring the configuration lock.
+-- Return True if that succeeded and False if the configuration lock
+-- was not available; no change is done in the latter case.
+changeAndBump :: (SerialNoObjectL a, TimeStampObjectL a)
+ => Simple Setter ConfigState a
+ -> (a -> a)
+ -> WConfdMonad Bool
+changeAndBump focus change = do
+ now <- liftIO getClockTime
+ let operation = over focus $ (serialL +~ 1) . (mTimeL .~ now) . change
+ liftM isJust $ modifyConfigWithLock
+ (\_ cs -> return . operation $ cs)
+ (return ())
+
+-- | Change and bump part of the maintenance part of the configuration.
+changeAndBumpMaint :: (MaintenanceData -> MaintenanceData) -> WConfdMonad Bool
+changeAndBumpMaint = changeAndBump $ csConfigDataL . configMaintenanceL
+
+-- | Set the maintenance interval.
+setMaintdRoundDelay :: Int -> WConfdMonad Bool
+setMaintdRoundDelay delay = changeAndBumpMaint $ maintRoundDelayL .~ delay
+
+-- | Clear the list of current maintenance jobs.
+clearMaintdJobs :: WConfdMonad Bool
+clearMaintdJobs = changeAndBumpMaint $ maintJobsL .~ []
+
+-- | Append new jobs to the list of current maintenance jobs, if
+-- not already present.
+appendMaintdJobs :: [JobId] -> WConfdMonad Bool
+appendMaintdJobs jobs = changeAndBumpMaint . over maintJobsL
+ $ ordNub . (++ jobs)
+
+-- | Set the autobalance flag.
+setMaintdBalance :: Bool -> WConfdMonad Bool
+setMaintdBalance value = changeAndBumpMaint $ maintBalanceL .~ value
+
+-- | Set the auto-balance threshold.
+setMaintdBalanceThreshold :: Double -> WConfdMonad Bool
+setMaintdBalanceThreshold value = changeAndBumpMaint
+ $ maintBalanceThresholdL .~ value
+
+-- | Add a name to the list of recently evacuated instances.
+addMaintdEvacuated :: [String] -> WConfdMonad Bool
+addMaintdEvacuated names = changeAndBumpMaint . over maintEvacuatedL
+ $ ordNub . (++ names)
+
+-- | Remove a name from the list of recently evacuated instances.
+rmMaintdEvacuated :: String -> WConfdMonad Bool
+rmMaintdEvacuated name = changeAndBumpMaint . over maintEvacuatedL
+ $ filter (/= name)
+
+-- | Update an incident to the list of known incidents; if the incident,
+-- as identified by the UUID, is not present, it is added.
+updateMaintdIncident :: Incident -> WConfdMonad Bool
+updateMaintdIncident incident =
+ changeAndBumpMaint . over maintIncidentsL
+ $ (incident :) . filter ((/= uuidOf incident) . uuidOf)
+
+-- | Remove an incident from the list of known incidents.
+rmMaintdIncident :: String -> WConfdMonad Bool
+rmMaintdIncident uuid =
+ changeAndBumpMaint . over maintIncidentsL
+ $ filter ((/= uuid) . uuidOf)
+
-- * The list of functions exported to RPC.
exportedFunctions :: [Name]
@@ -691,4 +764,13 @@
, 'updateNetwork
, 'updateNode
, 'updateNodeGroup
+ , 'setMaintdRoundDelay
+ , 'clearMaintdJobs
+ , 'appendMaintdJobs
+ , 'setMaintdBalance
+ , 'setMaintdBalanceThreshold
+ , 'addMaintdEvacuated
+ , 'rmMaintdEvacuated
+ , 'updateMaintdIncident
+ , 'rmMaintdIncident
]
diff --git a/src/Ganeti/WConfd/ConfigState.hs b/src/Ganeti/WConfd/ConfigState.hs
index fa6e754..b41fda1 100644
--- a/src/Ganeti/WConfd/ConfigState.hs
+++ b/src/Ganeti/WConfd/ConfigState.hs
@@ -43,7 +43,9 @@
, needsFullDist
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Data.Function (on)
import System.Time (ClockTime(..))
diff --git a/src/Ganeti/WConfd/ConfigVerify.hs b/src/Ganeti/WConfd/ConfigVerify.hs
index 246b627..118d775 100644
--- a/src/Ganeti/WConfd/ConfigVerify.hs
+++ b/src/Ganeti/WConfd/ConfigVerify.hs
@@ -39,7 +39,8 @@
, verifyConfigErr
) where
-import Control.Monad.Error
+import Control.Monad (forM_)
+import Control.Monad.Error.Class (MonadError(..))
import qualified Data.ByteString.UTF8 as UTF8
import qualified Data.Foldable as F
import qualified Data.Map as M
diff --git a/src/Ganeti/WConfd/ConfigWriter.hs b/src/Ganeti/WConfd/ConfigWriter.hs
index 8ffbc13..ccd562b 100644
--- a/src/Ganeti/WConfd/ConfigWriter.hs
+++ b/src/Ganeti/WConfd/ConfigWriter.hs
@@ -43,10 +43,14 @@
, distSSConfAsyncTask
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Monad ((>=>), liftM, unless)
import Control.Monad.Base
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError)
import qualified Control.Monad.State.Strict as S
+import Control.Monad.Trans.Class (lift)
import Control.Monad.Trans.Control
import Data.Monoid
import qualified Data.Set as Set
diff --git a/src/Ganeti/WConfd/Core.hs b/src/Ganeti/WConfd/Core.hs
index 73dba45..88ecafa 100644
--- a/src/Ganeti/WConfd/Core.hs
+++ b/src/Ganeti/WConfd/Core.hs
@@ -61,8 +61,13 @@
, lockLevel, LockLevel
, ClientType(ClientOther), ClientId(..) )
import qualified Ganeti.Locking.Waiting as LW
-import Ganeti.Objects (ConfigData, DRBDSecret, LogicalVolume, Ip4Address)
+import Ganeti.Objects ( ConfigData, DRBDSecret, LogicalVolume, Ip4Address
+ , configMaintenance, maintRoundDelay, maintJobs
+ , maintBalance, maintBalanceThreshold, maintEvacuated
+ , Incident, maintIncidents
+ )
import Ganeti.Objects.Lens (configClusterL, clusterMasterNodeL)
+import Ganeti.Types (JobId)
import Ganeti.WConfd.ConfigState (csConfigDataL)
import qualified Ganeti.WConfd.ConfigVerify as V
import Ganeti.WConfd.DeathDetection (cleanupLocks)
@@ -165,6 +170,30 @@
flushConfigGroup :: String -> WConfdMonad ()
flushConfigGroup = forceConfigStateDistribution . ToGroups . S.singleton
+-- *** Access to individual parts of the configuration
+
+-- | Get the configurable value of the maintenance interval
+maintenanceRoundDelay :: WConfdMonad Int
+maintenanceRoundDelay = liftM ( maintRoundDelay . configMaintenance )
+ CW.readConfig
+
+-- | Get the list of jobs in the state of the maintenance daemon.
+maintenanceJobs :: WConfdMonad [JobId]
+maintenanceJobs = liftM ( maintJobs . configMaintenance ) CW.readConfig
+
+-- | Get the information related to balancing for the maintenance daemon.
+maintenanceBalancing :: WConfdMonad (Bool, Double)
+maintenanceBalancing = liftM ((maintBalance &&& maintBalanceThreshold)
+ . configMaintenance) CW.readConfig
+
+-- | Get the list of recently evacuated instances.
+maintenanceEvacuated :: WConfdMonad [String]
+maintenanceEvacuated = liftM (maintEvacuated . configMaintenance) CW.readConfig
+
+-- | Get the list of current incidents.
+maintenanceIncidents :: WConfdMonad [Incident]
+maintenanceIncidents = liftM (maintIncidents . configMaintenance) CW.readConfig
+
-- ** Temporary reservations related functions
dropAllReservations :: ClientId -> WConfdMonad ()
@@ -396,6 +425,11 @@
, 'writeConfigAndUnlock
, 'flushConfig
, 'flushConfigGroup
+ , 'maintenanceRoundDelay
+ , 'maintenanceJobs
+ , 'maintenanceBalancing
+ , 'maintenanceEvacuated
+ , 'maintenanceIncidents
-- temporary reservations (common)
, 'dropAllReservations
-- DRBD
diff --git a/src/Ganeti/WConfd/Monad.hs b/src/Ganeti/WConfd/Monad.hs
index fe78e31..b37ab9e 100644
--- a/src/Ganeti/WConfd/Monad.hs
+++ b/src/Ganeti/WConfd/Monad.hs
@@ -69,19 +69,20 @@
, DistributionTarget(..)
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Arrow ((&&&), second)
import Control.Concurrent (forkIO, myThreadId)
import Control.Exception.Lifted (bracket)
import Control.Monad
import Control.Monad.Base
-import Control.Monad.Error
import Control.Monad.Reader
import Control.Monad.State
import Control.Monad.Trans.Control
import Data.Functor.Identity
import Data.IORef.Lifted
-import Data.Monoid (Any(..), Monoid(..))
+import Data.Monoid (Any(..))
import qualified Data.Set as S
import Data.Tuple (swap)
import System.Posix.Process (getProcessID)
diff --git a/src/Ganeti/WConfd/Persistent.hs b/src/Ganeti/WConfd/Persistent.hs
index 48b8330..dc0bc63 100644
--- a/src/Ganeti/WConfd/Persistent.hs
+++ b/src/Ganeti/WConfd/Persistent.hs
@@ -46,7 +46,7 @@
, persistentTempRes
) where
-import Control.Monad.Error
+import Control.Monad.Error.Class (catchError)
import System.Directory (doesFileExist)
import qualified Text.JSON as J
diff --git a/src/Ganeti/WConfd/Server.hs b/src/Ganeti/WConfd/Server.hs
index b226d09..1c2ef83 100644
--- a/src/Ganeti/WConfd/Server.hs
+++ b/src/Ganeti/WConfd/Server.hs
@@ -43,7 +43,6 @@
import Control.Concurrent (forkIO)
import Control.Exception
import Control.Monad
-import Control.Monad.Error
import Ganeti.BasicTypes
import qualified Ganeti.Constants as C
@@ -88,8 +87,8 @@
conf_file <- Path.clusterConfFile
dh <- toErrorBase
- . withErrorT (strMsg . ("Initialization of the daemon failed" ++)
- . formatError) $ do
+ . withErrorT (mkFromString . ("Initialization of the daemon failed" ++)
+ . formatError) $ do
ents <- getEnts
(cdata, cstat) <- loadConfigFromFile conf_file
verifyConfigErr cdata
diff --git a/src/Ganeti/WConfd/TempRes.hs b/src/Ganeti/WConfd/TempRes.hs
index 565fae2..9c0220d 100644
--- a/src/Ganeti/WConfd/TempRes.hs
+++ b/src/Ganeti/WConfd/TempRes.hs
@@ -73,9 +73,11 @@
, reserved
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Lens.At
-import Control.Monad.Error
+import Control.Monad.Error.Class (MonadError(..))
import Control.Monad.State
import Control.Monad.Trans.Maybe
import qualified Data.ByteString as BS
diff --git a/src/ganeti-maintd.hs b/src/ganeti-maintd.hs
new file mode 100644
index 0000000..caa76fc
--- /dev/null
+++ b/src/ganeti-maintd.hs
@@ -0,0 +1,47 @@
+{-| Ganeti maintenance agent daemon
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Main (main) where
+
+import Ganeti.Daemon
+import Ganeti.Runtime
+import qualified Ganeti.MaintD.Server as S
+
+-- | Main function.
+main :: IO ()
+main =
+ genericMain GanetiMaintd S.options
+ S.checkMain
+ S.prepMain
+ S.main
diff --git a/test/data/cluster_config_2.16.json b/test/data/cluster_config_2.16.json
new file mode 100644
index 0000000..76e9b4f
--- /dev/null
+++ b/test/data/cluster_config_2.16.json
@@ -0,0 +1,658 @@
+{
+ "cluster": {
+ "beparams": {
+ "default": {
+ "always_failover": false,
+ "auto_balance": true,
+ "maxmem": 128,
+ "minmem": 128,
+ "spindle_use": 1,
+ "vcpus": 1
+ }
+ },
+ "blacklisted_os": [],
+ "candidate_certs": {},
+ "candidate_pool_size": 10,
+ "cluster_name": "cluster.name.example.com",
+ "compression_tools": [
+ "gzip",
+ "gzip-fast",
+ "gzip-slow"
+ ],
+ "ctime": 1343869045.6048839,
+ "data_collectors": {
+ "cpu-avg-load": {
+ "active": true,
+ "interval": 5000000.0
+ },
+ "diskstats": {
+ "active": true,
+ "interval": 5000000.0
+ },
+ "drbd": {
+ "active": true,
+ "interval": 5000000.0
+ },
+ "inst-status-xen": {
+ "active": true,
+ "interval": 5000000.0
+ },
+ "lv": {
+ "active": true,
+ "interval": 5000000.0
+ },
+ "xen-cpu-avg-load": {
+ "active": true,
+ "interval": 5000000.0
+ }
+ },
+ "default_iallocator": "hail",
+ "default_iallocator_params": {},
+ "disk_state_static": {},
+ "diskparams": {
+ "blockdev": {},
+ "diskless": {},
+ "drbd": {
+ "c-delay-target": 1,
+ "c-fill-target": 200,
+ "c-max-rate": 2048,
+ "c-min-rate": 1024,
+ "c-plan-ahead": 1,
+ "data-stripes": 2,
+ "disk-barriers": "bf",
+ "disk-custom": "",
+ "dynamic-resync": false,
+ "meta-barriers": true,
+ "meta-stripes": 2,
+ "metavg": "xenvg",
+ "net-custom": "",
+ "protocol": "C",
+ "resync-rate": 1024
+ },
+ "ext": {
+ "access": "kernelspace"
+ },
+ "file": {},
+ "gluster": {
+ "access": "kernelspace",
+ "host": "127.0.0.1",
+ "port": 24007,
+ "volume": "gv0"
+ },
+ "plain": {
+ "stripes": 2
+ },
+ "rbd": {
+ "access": "kernelspace",
+ "pool": "rbd"
+ },
+ "sharedfile": {}
+ },
+ "drbd_usermode_helper": "/bin/true",
+ "enabled_disk_templates": [
+ "drbd",
+ "plain",
+ "file",
+ "sharedfile"
+ ],
+ "enabled_hypervisors": [
+ "xen-pvm"
+ ],
+ "enabled_user_shutdown": false,
+ "file_storage_dir": "",
+ "gluster_storage_dir": "",
+ "hidden_os": [],
+ "highest_used_port": 32105,
+ "hv_state_static": {
+ "xen-pvm": {
+ "cpu_node": 1,
+ "cpu_total": 1,
+ "mem_hv": 0,
+ "mem_node": 0,
+ "mem_total": 0
+ }
+ },
+ "hvparams": {
+ "chroot": {
+ "init_script": "/ganeti-chroot"
+ },
+ "fake": {
+ "migration_mode": "live"
+ },
+ "kvm": {
+ "acpi": true,
+ "boot_order": "disk",
+ "cdrom2_image_path": "",
+ "cdrom_disk_type": "",
+ "cdrom_image_path": "",
+ "cpu_cores": 0,
+ "cpu_mask": "all",
+ "cpu_sockets": 0,
+ "cpu_threads": 0,
+ "cpu_type": "",
+ "disk_aio": "threads",
+ "disk_cache": "default",
+ "disk_type": "paravirtual",
+ "floppy_image_path": "",
+ "initrd_path": "",
+ "kernel_args": "ro",
+ "kernel_path": "/boot/vmlinuz-kvmU",
+ "keymap": "",
+ "kvm_extra": "",
+ "kvm_flag": "",
+ "kvm_path": "/usr/bin/kvm",
+ "machine_version": "",
+ "mem_path": "",
+ "migration_bandwidth": 4,
+ "migration_caps": "",
+ "migration_downtime": 30,
+ "migration_mode": "live",
+ "migration_port": 4041,
+ "nic_type": "paravirtual",
+ "reboot_behavior": "reboot",
+ "root_path": "/dev/vda1",
+ "security_domain": "",
+ "security_model": "none",
+ "serial_console": true,
+ "serial_speed": 38400,
+ "soundhw": "",
+ "spice_bind": "",
+ "spice_image_compression": "",
+ "spice_ip_version": 0,
+ "spice_jpeg_wan_compression": "",
+ "spice_password_file": "",
+ "spice_playback_compression": true,
+ "spice_streaming_video": "",
+ "spice_tls_ciphers": "HIGH:-DES:-3DES:-EXPORT:-ADH",
+ "spice_use_tls": false,
+ "spice_use_vdagent": true,
+ "spice_zlib_glz_wan_compression": "",
+ "usb_devices": "",
+ "usb_mouse": "",
+ "use_chroot": false,
+ "use_localtime": false,
+ "user_shutdown": false,
+ "vga": "",
+ "vhost_net": false,
+ "virtio_net_queues": 1,
+ "vnc_bind_address": "",
+ "vnc_password_file": "",
+ "vnc_tls": false,
+ "vnc_x509_path": "",
+ "vnc_x509_verify": false,
+ "vnet_hdr": true
+ },
+ "lxc": {
+ "cpu_mask": "",
+ "devices": "c 1:3 rw,c 1:5 rw,c 1:7 rw,c 1:8 rw,c 1:9 rw,c 1:10 rw,c 5:0 rw,c 5:1 rw,c 5:2 rw,c 136:* rw",
+ "drop_capabilities": "mac_override,sys_boot,sys_module,sys_time,sys_admin",
+ "extra_cgroups": "",
+ "extra_config": "",
+ "lxc_cgroup_use": "",
+ "lxc_devices": "c 1:3 rw,c 1:5 rw,c 1:7 rw,c 1:8 rw,c 1:9 rw,c 1:10 rw,c 5:0 rw,c 5:1 rw,c 5:2 rw,c 136:* rw",
+ "lxc_drop_capabilities": "mac_override,sys_boot,sys_module,sys_time",
+ "lxc_extra_config": "",
+ "lxc_startup_wait": 30,
+ "lxc_tty": 6,
+ "num_ttys": 6,
+ "startup_timeout": 30
+ },
+ "xen-hvm": {
+ "acpi": true,
+ "blockdev_prefix": "hd",
+ "boot_order": "cd",
+ "cdrom_image_path": "",
+ "cpu_cap": 0,
+ "cpu_mask": "all",
+ "cpu_weight": 256,
+ "cpuid": "",
+ "device_model": "/usr/lib/xen/bin/qemu-dm",
+ "disk_type": "paravirtual",
+ "kernel_path": "/usr/lib/xen/boot/hvmloader",
+ "migration_mode": "non-live",
+ "migration_port": 8082,
+ "nic_type": "rtl8139",
+ "pae": true,
+ "pci_pass": "",
+ "reboot_behavior": "reboot",
+ "soundhw": "",
+ "use_localtime": false,
+ "vif_script": "",
+ "vif_type": "ioemu",
+ "viridian": false,
+ "vnc_bind_address": "0.0.0.0",
+ "vnc_password_file": "/your/vnc-cluster-password",
+ "xen_cmd": "xm"
+ },
+ "xen-pvm": {
+ "blockdev_prefix": "sd",
+ "bootloader_args": "",
+ "bootloader_path": "",
+ "cpu_cap": 0,
+ "cpu_mask": "all",
+ "cpu_weight": 256,
+ "cpuid": "",
+ "initrd_path": "",
+ "kernel_args": "ro",
+ "kernel_path": "/boot/vmlinuz-xenU",
+ "migration_mode": "live",
+ "migration_port": 8082,
+ "reboot_behavior": "reboot",
+ "root_path": "/dev/xvda1",
+ "soundhw": "",
+ "use_bootloader": false,
+ "vif_script": "",
+ "xen_cmd": "xm"
+ }
+ },
+ "install_image": "",
+ "instance_communication_network": "",
+ "ipolicy": {
+ "disk-templates": [
+ "drbd",
+ "plain",
+ "sharedfile",
+ "file"
+ ],
+ "minmax": [
+ {
+ "max": {
+ "cpu-count": 8,
+ "disk-count": 16,
+ "disk-size": 1048576,
+ "memory-size": 32768,
+ "nic-count": 8,
+ "spindle-use": 12
+ },
+ "min": {
+ "cpu-count": 1,
+ "disk-count": 1,
+ "disk-size": 1024,
+ "memory-size": 128,
+ "nic-count": 1,
+ "spindle-use": 1
+ }
+ }
+ ],
+ "spindle-ratio": 32.0,
+ "std": {
+ "cpu-count": 1,
+ "disk-count": 1,
+ "disk-size": 1024,
+ "memory-size": 128,
+ "nic-count": 1,
+ "spindle-use": 1
+ },
+ "vcpu-ratio": 1.0
+ },
+ "mac_prefix": "aa:bb:cc",
+ "maintain_node_health": false,
+ "master_ip": "192.0.2.87",
+ "master_netdev": "eth0",
+ "master_netmask": 32,
+ "master_node": "9a12d554-75c0-4cb1-8064-103365145db0",
+ "max_running_jobs": 20,
+ "max_tracked_jobs": 25,
+ "modify_etc_hosts": true,
+ "modify_ssh_setup": true,
+ "mtime": 1361964122.7947099,
+ "ndparams": {
+ "cpu_speed": 1.0,
+ "exclusive_storage": false,
+ "oob_program": "",
+ "ovs": false,
+ "ovs_link": "",
+ "ovs_name": "switch1",
+ "spindle_count": 1,
+ "ssh_port": 22
+ },
+ "nicparams": {
+ "default": {
+ "link": "br974",
+ "mode": "bridged",
+ "vlan": ""
+ }
+ },
+ "os_hvp": {
+ "TEMP-Ganeti-QA-OS": {
+ "xen-hvm": {
+ "acpi": false,
+ "pae": true
+ },
+ "xen-pvm": {
+ "root_path": "/dev/sda5"
+ }
+ }
+ },
+ "osparams": {},
+ "osparams_private_cluster": {},
+ "prealloc_wipe_disks": false,
+ "primary_ip_family": 2,
+ "reserved_lvs": [],
+ "rsahostkeypub": "YOURKEY",
+ "serial_no": 3189,
+ "shared_file_storage_dir": "/srv/ganeti/shared-file-storage",
+ "ssh_key_bits": 1024,
+ "ssh_key_type": "dsa",
+ "tags": [
+ "mytag"
+ ],
+ "tcpudp_port_pool": [
+ 32104,
+ 32105,
+ 32101,
+ 32102,
+ 32103
+ ],
+ "uid_pool": [],
+ "use_external_mip_script": false,
+ "uuid": "dddf8c12-f2d8-4718-a35b-7804daf12a3f",
+ "volume_group_name": "xenvg",
+ "zeroing_image": ""
+ },
+ "ctime": 1343869045.6055231,
+ "disks": {
+ "150bd154-8e23-44d1-b762-5065ae5a507b": {
+ "ctime": 1354038435.343601,
+ "dev_type": "plain",
+ "iv_name": "disk/0",
+ "logical_id": [
+ "xenvg",
+ "b27a576a-13f7-4f07-885c-63fcad4fdfcc.disk0"
+ ],
+ "mode": "rw",
+ "mtime": 1354038435.343601,
+ "nodes": [
+ "2ae3d962-2dad-44f2-bdb1-85f77107f907"
+ ],
+ "params": {},
+ "serial_no": 1,
+ "size": 1280,
+ "uuid": "150bd154-8e23-44d1-b762-5065ae5a507b"
+ },
+ "77ced3a5-6756-49ae-8d1f-274e27664c05": {
+ "children": [
+ {
+ "ctime": 1421677173.7280669,
+ "dev_type": "plain",
+ "logical_id": [
+ "xenvg",
+ "5c390722-6a7a-4bb4-9cef-98d896a8e6b1.disk0_data"
+ ],
+ "mtime": 1421677173.7280591,
+ "nodes": [
+ "9a12d554-75c0-4cb1-8064-103365145db0",
+ "41f9c238-173c-4120-9e41-04ad379b647a"
+ ],
+ "params": {},
+ "serial_no": 1,
+ "size": 1024
+ },
+ {
+ "ctime": 1421677173.728096,
+ "dev_type": "plain",
+ "logical_id": [
+ "xenvg",
+ "5c390722-6a7a-4bb4-9cef-98d896a8e6b1.disk0_meta"
+ ],
+ "mtime": 1421677173.7280879,
+ "nodes": [
+ "9a12d554-75c0-4cb1-8064-103365145db0",
+ "41f9c238-173c-4120-9e41-04ad379b647a"
+ ],
+ "params": {},
+ "serial_no": 1,
+ "size": 128
+ }
+ ],
+ "ctime": 1363620258.6089759,
+ "dev_type": "drbd",
+ "iv_name": "disk/0",
+ "logical_id": [
+ "9a12d554-75c0-4cb1-8064-103365145db0",
+ "41f9c238-173c-4120-9e41-04ad379b647a",
+ 32100,
+ 0,
+ 0,
+ "d3c3fd475fcbaf5fd177fb245ac43b71247ada38"
+ ],
+ "mode": "rw",
+ "mtime": 1363620258.6089759,
+ "nodes": [
+ "9a12d554-75c0-4cb1-8064-103365145db0",
+ "41f9c238-173c-4120-9e41-04ad379b647a"
+ ],
+ "params": {},
+ "serial_no": 1,
+ "size": 1024,
+ "uuid": "77ced3a5-6756-49ae-8d1f-274e27664c05"
+ },
+ "79acf611-be58-4334-9fe4-4f2b73ae8abb": {
+ "ctime": 1355186880.4511809,
+ "dev_type": "plain",
+ "iv_name": "disk/0",
+ "logical_id": [
+ "xenvg",
+ "3e559cd7-1024-4294-a923-a9fd13182b2f.disk0"
+ ],
+ "mode": "rw",
+ "mtime": 1355186880.4511809,
+ "nodes": [
+ "41f9c238-173c-4120-9e41-04ad379b647a"
+ ],
+ "params": {},
+ "serial_no": 1,
+ "size": 102400,
+ "uuid": "79acf611-be58-4334-9fe4-4f2b73ae8abb"
+ }
+ },
+ "filters": {},
+ "instances": {
+ "4e091bdc-e205-4ed7-8a47-0c9130a6619f": {
+ "admin_state": "up",
+ "admin_state_source": "admin",
+ "beparams": {},
+ "ctime": 1354038435.343601,
+ "disks": [
+ "150bd154-8e23-44d1-b762-5065ae5a507b"
+ ],
+ "disks_active": true,
+ "hvparams": {},
+ "hypervisor": "xen-pvm",
+ "mtime": 1354224585.700732,
+ "name": "instance3.example.com",
+ "nics": [
+ {
+ "mac": "aa:bb:cc:5e:5c:75",
+ "nicparams": {},
+ "uuid": "1ab090c1-e017-406c-afb4-fc285cb43e31"
+ }
+ ],
+ "os": "debian-image",
+ "osparams": {},
+ "osparams_private": {},
+ "primary_node": "2ae3d962-2dad-44f2-bdb1-85f77107f907",
+ "serial_no": 4,
+ "tags": [],
+ "uuid": "4e091bdc-e205-4ed7-8a47-0c9130a6619f"
+ },
+ "6c078d22-3eb6-4780-857d-81772e09eef1": {
+ "admin_state": "up",
+ "admin_state_source": "admin",
+ "beparams": {},
+ "ctime": 1363620258.6089759,
+ "disks": [
+ "77ced3a5-6756-49ae-8d1f-274e27664c05"
+ ],
+ "disks_active": true,
+ "hvparams": {},
+ "hypervisor": "xen-pvm",
+ "mtime": 1363620320.8749011,
+ "name": "instance1.example.com",
+ "nics": [
+ {
+ "mac": "aa:bb:cc:b2:6e:0b",
+ "nicparams": {},
+ "uuid": "2c953d72-fac4-4aa9-a225-4131bb271791"
+ }
+ ],
+ "os": "busybox",
+ "osparams": {},
+ "osparams_private": {},
+ "primary_node": "9a12d554-75c0-4cb1-8064-103365145db0",
+ "serial_no": 2,
+ "uuid": "6c078d22-3eb6-4780-857d-81772e09eef1"
+ },
+ "8fde9f6d-e1f1-4850-9e9c-154966f622f5": {
+ "admin_state": "up",
+ "admin_state_source": "admin",
+ "beparams": {},
+ "ctime": 1355186880.4511809,
+ "disks": [
+ "79acf611-be58-4334-9fe4-4f2b73ae8abb"
+ ],
+ "disks_active": true,
+ "hvparams": {},
+ "hypervisor": "xen-pvm",
+ "mtime": 1355186898.307642,
+ "name": "instance2.example.com",
+ "nics": [
+ {
+ "mac": "aa:bb:cc:56:83:fb",
+ "nicparams": {},
+ "uuid": "1cf95562-e676-4fd0-8214-e8b84a2f7bd1"
+ }
+ ],
+ "os": "debian-image",
+ "osparams": {},
+ "osparams_private": {},
+ "primary_node": "41f9c238-173c-4120-9e41-04ad379b647a",
+ "serial_no": 2,
+ "tags": [],
+ "uuid": "8fde9f6d-e1f1-4850-9e9c-154966f622f5"
+ }
+ },
+ "mtime": 1421677173.729104,
+ "networks": {
+ "99f0128a-1c84-44da-90b9-9581ea00c075": {
+ "ext_reservations": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+ "name": "a network",
+ "network": "203.0.113.0/24",
+ "reservations": "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
+ "serial_no": 1,
+ "uuid": "99f0128a-1c84-44da-90b9-9581ea00c075"
+ }
+ },
+ "nodegroups": {
+ "5244a46d-7506-4e14-922d-02b58153dde1": {
+ "alloc_policy": "preferred",
+ "diskparams": {},
+ "ipolicy": {},
+ "mtime": 1361963775.5750091,
+ "name": "default",
+ "ndparams": {},
+ "networks": {},
+ "serial_no": 125,
+ "tags": [],
+ "uuid": "5244a46d-7506-4e14-922d-02b58153dde1"
+ },
+ "6c0a8916-b719-45ad-95dd-82192b1e473f": {
+ "alloc_policy": "preferred",
+ "diskparams": {},
+ "ipolicy": {
+ "disk-templates": [
+ "plain"
+ ],
+ "minmax": [
+ {
+ "max": {
+ "cpu-count": 8,
+ "disk-count": 16,
+ "disk-size": 1048576,
+ "memory-size": 32768,
+ "nic-count": 18,
+ "spindle-use": 14
+ },
+ "min": {
+ "cpu-count": 2,
+ "disk-count": 2,
+ "disk-size": 1024,
+ "memory-size": 128,
+ "nic-count": 1,
+ "spindle-use": 1
+ }
+ }
+ ],
+ "spindle-ratio": 5.2000000000000002,
+ "vcpu-ratio": 3.1400000000000001
+ },
+ "mtime": 1361963775.5750091,
+ "name": "another",
+ "ndparams": {
+ "exclusive_storage": true
+ },
+ "networks": {},
+ "serial_no": 125,
+ "tags": [],
+ "uuid": "6c0a8916-b719-45ad-95dd-82192b1e473f"
+ }
+ },
+ "nodes": {
+ "2ae3d962-2dad-44f2-bdb1-85f77107f907": {
+ "ctime": 1343869045.6048839,
+ "drained": false,
+ "group": "5244a46d-7506-4e14-922d-02b58153dde1",
+ "master_candidate": true,
+ "master_capable": true,
+ "mtime": 1358348755.779906,
+ "name": "node2.example.com",
+ "ndparams": {},
+ "offline": false,
+ "powered": true,
+ "primary_ip": "192.0.2.83",
+ "secondary_ip": "198.51.100.83",
+ "serial_no": 6,
+ "tags": [],
+ "uuid": "2ae3d962-2dad-44f2-bdb1-85f77107f907",
+ "vm_capable": true
+ },
+ "41f9c238-173c-4120-9e41-04ad379b647a": {
+ "ctime": 1343869205.9348071,
+ "drained": false,
+ "group": "5244a46d-7506-4e14-922d-02b58153dde1",
+ "master_candidate": true,
+ "master_capable": true,
+ "mtime": 1353019704.8853681,
+ "name": "node3.example.com",
+ "ndparams": {},
+ "offline": false,
+ "powered": true,
+ "primary_ip": "192.0.2.84",
+ "secondary_ip": "198.51.100.84",
+ "serial_no": 2,
+ "tags": [],
+ "uuid": "41f9c238-173c-4120-9e41-04ad379b647a",
+ "vm_capable": true
+ },
+ "9a12d554-75c0-4cb1-8064-103365145db0": {
+ "ctime": 1349722460.022264,
+ "drained": false,
+ "group": "5244a46d-7506-4e14-922d-02b58153dde1",
+ "master_candidate": true,
+ "master_capable": true,
+ "mtime": 1359986533.3533289,
+ "name": "node1.example.com",
+ "ndparams": {},
+ "offline": false,
+ "powered": true,
+ "primary_ip": "192.0.2.82",
+ "secondary_ip": "198.51.100.82",
+ "serial_no": 197,
+ "tags": [],
+ "uuid": "9a12d554-75c0-4cb1-8064-103365145db0",
+ "vm_capable": true
+ }
+ },
+ "serial_no": 7627,
+ "version": 2160000
+}
diff --git a/test/data/cluster_config_2.17.json b/test/data/cluster_config_2.17.json
new file mode 100644
index 0000000..65204b9
--- /dev/null
+++ b/test/data/cluster_config_2.17.json
@@ -0,0 +1,669 @@
+{
+ "cluster": {
+ "beparams": {
+ "default": {
+ "always_failover": false,
+ "auto_balance": true,
+ "maxmem": 128,
+ "minmem": 128,
+ "spindle_use": 1,
+ "vcpus": 1
+ }
+ },
+ "blacklisted_os": [],
+ "candidate_certs": {},
+ "candidate_pool_size": 10,
+ "cluster_name": "cluster.name.example.com",
+ "compression_tools": [
+ "gzip",
+ "gzip-fast",
+ "gzip-slow"
+ ],
+ "ctime": 1343869045.6048839,
+ "data_collectors": {
+ "cpu-avg-load": {
+ "active": true,
+ "interval": 5000000.0
+ },
+ "diagnose": {
+ "active": true,
+ "intervall": 5000000.0
+ },
+ "diskstats": {
+ "active": true,
+ "interval": 5000000.0
+ },
+ "drbd": {
+ "active": true,
+ "interval": 5000000.0
+ },
+ "inst-status-xen": {
+ "active": true,
+ "interval": 5000000.0
+ },
+ "kvm-inst-rss": {
+ "active": true,
+ "interval": 5000000.0
+ },
+ "lv": {
+ "active": true,
+ "interval": 5000000.0
+ },
+ "xen-cpu-avg-load": {
+ "active": true,
+ "interval": 5000000.0
+ }
+ },
+ "default_iallocator": "hail",
+ "default_iallocator_params": {},
+ "diagnose_data_collector_filename": "",
+ "disk_state_static": {},
+ "diskparams": {
+ "blockdev": {},
+ "diskless": {},
+ "drbd": {
+ "c-delay-target": 1,
+ "c-fill-target": 200,
+ "c-max-rate": 2048,
+ "c-min-rate": 1024,
+ "c-plan-ahead": 1,
+ "data-stripes": 2,
+ "disk-barriers": "bf",
+ "disk-custom": "",
+ "dynamic-resync": false,
+ "meta-barriers": true,
+ "meta-stripes": 2,
+ "metavg": "xenvg",
+ "net-custom": "",
+ "protocol": "C",
+ "resync-rate": 1024
+ },
+ "ext": {
+ "access": "kernelspace"
+ },
+ "file": {},
+ "gluster": {
+ "access": "kernelspace",
+ "host": "127.0.0.1",
+ "port": 24007,
+ "volume": "gv0"
+ },
+ "plain": {
+ "stripes": 2
+ },
+ "rbd": {
+ "access": "kernelspace",
+ "pool": "rbd"
+ },
+ "sharedfile": {}
+ },
+ "drbd_usermode_helper": "/bin/true",
+ "enabled_disk_templates": [
+ "drbd",
+ "plain",
+ "file",
+ "sharedfile"
+ ],
+ "enabled_hypervisors": [
+ "xen-pvm"
+ ],
+ "enabled_user_shutdown": false,
+ "file_storage_dir": "",
+ "gluster_storage_dir": "",
+ "hidden_os": [],
+ "highest_used_port": 32105,
+ "hv_state_static": {
+ "xen-pvm": {
+ "cpu_node": 1,
+ "cpu_total": 1,
+ "mem_hv": 0,
+ "mem_node": 0,
+ "mem_total": 0
+ }
+ },
+ "hvparams": {
+ "chroot": {
+ "init_script": "/ganeti-chroot"
+ },
+ "fake": {
+ "migration_mode": "live"
+ },
+ "kvm": {
+ "acpi": true,
+ "boot_order": "disk",
+ "cdrom2_image_path": "",
+ "cdrom_disk_type": "",
+ "cdrom_image_path": "",
+ "cpu_cores": 0,
+ "cpu_mask": "all",
+ "cpu_sockets": 0,
+ "cpu_threads": 0,
+ "cpu_type": "",
+ "disk_aio": "threads",
+ "disk_cache": "default",
+ "disk_type": "paravirtual",
+ "floppy_image_path": "",
+ "initrd_path": "",
+ "kernel_args": "ro",
+ "kernel_path": "/boot/vmlinuz-kvmU",
+ "keymap": "",
+ "kvm_extra": "",
+ "kvm_flag": "",
+ "kvm_path": "/usr/bin/kvm",
+ "machine_version": "",
+ "mem_path": "",
+ "migration_bandwidth": 4,
+ "migration_caps": "",
+ "migration_downtime": 30,
+ "migration_mode": "live",
+ "migration_port": 4041,
+ "nic_type": "paravirtual",
+ "reboot_behavior": "reboot",
+ "root_path": "/dev/vda1",
+ "security_domain": "",
+ "security_model": "none",
+ "serial_console": true,
+ "serial_speed": 38400,
+ "soundhw": "",
+ "spice_bind": "",
+ "spice_image_compression": "",
+ "spice_ip_version": 0,
+ "spice_jpeg_wan_compression": "",
+ "spice_password_file": "",
+ "spice_playback_compression": true,
+ "spice_streaming_video": "",
+ "spice_tls_ciphers": "HIGH:-DES:-3DES:-EXPORT:-ADH",
+ "spice_use_tls": false,
+ "spice_use_vdagent": true,
+ "spice_zlib_glz_wan_compression": "",
+ "usb_devices": "",
+ "usb_mouse": "",
+ "use_chroot": false,
+ "use_localtime": false,
+ "user_shutdown": false,
+ "vga": "",
+ "vhost_net": false,
+ "virtio_net_queues": 1,
+ "vnc_bind_address": "",
+ "vnc_password_file": "",
+ "vnc_tls": false,
+ "vnc_x509_path": "",
+ "vnc_x509_verify": false,
+ "vnet_hdr": true
+ },
+ "lxc": {
+ "cpu_mask": "",
+ "devices": "c 1:3 rw,c 1:5 rw,c 1:7 rw,c 1:8 rw,c 1:9 rw,c 1:10 rw,c 5:0 rw,c 5:1 rw,c 5:2 rw,c 136:* rw",
+ "drop_capabilities": "mac_override,sys_boot,sys_module,sys_time,sys_admin",
+ "extra_cgroups": "",
+ "extra_config": "",
+ "lxc_cgroup_use": "",
+ "lxc_devices": "c 1:3 rw,c 1:5 rw,c 1:7 rw,c 1:8 rw,c 1:9 rw,c 1:10 rw,c 5:0 rw,c 5:1 rw,c 5:2 rw,c 136:* rw",
+ "lxc_drop_capabilities": "mac_override,sys_boot,sys_module,sys_time",
+ "lxc_extra_config": "",
+ "lxc_startup_wait": 30,
+ "lxc_tty": 6,
+ "num_ttys": 6,
+ "startup_timeout": 30
+ },
+ "xen-hvm": {
+ "acpi": true,
+ "blockdev_prefix": "hd",
+ "boot_order": "cd",
+ "cdrom_image_path": "",
+ "cpu_cap": 0,
+ "cpu_mask": "all",
+ "cpu_weight": 256,
+ "cpuid": "",
+ "device_model": "/usr/lib/xen/bin/qemu-dm",
+ "disk_type": "paravirtual",
+ "kernel_path": "/usr/lib/xen/boot/hvmloader",
+ "migration_mode": "non-live",
+ "migration_port": 8082,
+ "nic_type": "rtl8139",
+ "pae": true,
+ "pci_pass": "",
+ "reboot_behavior": "reboot",
+ "soundhw": "",
+ "use_localtime": false,
+ "vif_script": "",
+ "vif_type": "ioemu",
+ "viridian": false,
+ "vnc_bind_address": "0.0.0.0",
+ "vnc_password_file": "/your/vnc-cluster-password",
+ "xen_cmd": "xm"
+ },
+ "xen-pvm": {
+ "blockdev_prefix": "sd",
+ "bootloader_args": "",
+ "bootloader_path": "",
+ "cpu_cap": 0,
+ "cpu_mask": "all",
+ "cpu_weight": 256,
+ "cpuid": "",
+ "initrd_path": "",
+ "kernel_args": "ro",
+ "kernel_path": "/boot/vmlinuz-xenU",
+ "migration_mode": "live",
+ "migration_port": 8082,
+ "reboot_behavior": "reboot",
+ "root_path": "/dev/xvda1",
+ "soundhw": "",
+ "use_bootloader": false,
+ "vif_script": "",
+ "xen_cmd": "xm"
+ }
+ },
+ "install_image": "",
+ "instance_communication_network": "",
+ "ipolicy": {
+ "disk-templates": [
+ "drbd",
+ "plain",
+ "sharedfile",
+ "file"
+ ],
+ "minmax": [
+ {
+ "max": {
+ "cpu-count": 8,
+ "disk-count": 16,
+ "disk-size": 1048576,
+ "memory-size": 32768,
+ "nic-count": 8,
+ "spindle-use": 12
+ },
+ "min": {
+ "cpu-count": 1,
+ "disk-count": 1,
+ "disk-size": 1024,
+ "memory-size": 128,
+ "nic-count": 1,
+ "spindle-use": 1
+ }
+ }
+ ],
+ "spindle-ratio": 32.0,
+ "std": {
+ "cpu-count": 1,
+ "disk-count": 1,
+ "disk-size": 1024,
+ "memory-size": 128,
+ "nic-count": 1,
+ "spindle-use": 1
+ },
+ "vcpu-ratio": 1.0,
+ "memory-ratio": 1.7
+ },
+ "mac_prefix": "aa:bb:cc",
+ "maintain_node_health": false,
+ "master_ip": "192.0.2.87",
+ "master_netdev": "eth0",
+ "master_netmask": 32,
+ "master_node": "9a12d554-75c0-4cb1-8064-103365145db0",
+ "max_running_jobs": 20,
+ "max_tracked_jobs": 25,
+ "modify_etc_hosts": true,
+ "modify_ssh_setup": true,
+ "mtime": 1361964122.7947099,
+ "ndparams": {
+ "cpu_speed": 1.0,
+ "exclusive_storage": false,
+ "oob_program": "",
+ "ovs": false,
+ "ovs_link": "",
+ "ovs_name": "switch1",
+ "spindle_count": 1,
+ "ssh_port": 22
+ },
+ "nicparams": {
+ "default": {
+ "link": "br974",
+ "mode": "bridged",
+ "vlan": ""
+ }
+ },
+ "os_hvp": {
+ "TEMP-Ganeti-QA-OS": {
+ "xen-hvm": {
+ "acpi": false,
+ "pae": true
+ },
+ "xen-pvm": {
+ "root_path": "/dev/sda5"
+ }
+ }
+ },
+ "osparams": {},
+ "osparams_private_cluster": {},
+ "prealloc_wipe_disks": false,
+ "primary_ip_family": 2,
+ "reserved_lvs": [],
+ "rsahostkeypub": "YOURKEY",
+ "serial_no": 3189,
+ "shared_file_storage_dir": "/srv/ganeti/shared-file-storage",
+ "ssh_key_bits": 1024,
+ "ssh_key_type": "dsa",
+ "tags": [
+ "mytag"
+ ],
+ "tcpudp_port_pool": [
+ 32104,
+ 32105,
+ 32101,
+ 32102,
+ 32103
+ ],
+ "uid_pool": [],
+ "use_external_mip_script": false,
+ "uuid": "dddf8c12-f2d8-4718-a35b-7804daf12a3f",
+ "volume_group_name": "xenvg",
+ "zeroing_image": ""
+ },
+ "ctime": 1343869045.6055231,
+ "disks": {
+ "150bd154-8e23-44d1-b762-5065ae5a507b": {
+ "ctime": 1354038435.343601,
+ "dev_type": "plain",
+ "iv_name": "disk/0",
+ "logical_id": [
+ "xenvg",
+ "b27a576a-13f7-4f07-885c-63fcad4fdfcc.disk0"
+ ],
+ "mode": "rw",
+ "mtime": 1354038435.343601,
+ "nodes": [
+ "2ae3d962-2dad-44f2-bdb1-85f77107f907"
+ ],
+ "params": {},
+ "serial_no": 1,
+ "size": 1280,
+ "uuid": "150bd154-8e23-44d1-b762-5065ae5a507b"
+ },
+ "77ced3a5-6756-49ae-8d1f-274e27664c05": {
+ "children": [
+ {
+ "ctime": 1421677173.7280669,
+ "dev_type": "plain",
+ "logical_id": [
+ "xenvg",
+ "5c390722-6a7a-4bb4-9cef-98d896a8e6b1.disk0_data"
+ ],
+ "mtime": 1421677173.7280591,
+ "nodes": [
+ "9a12d554-75c0-4cb1-8064-103365145db0",
+ "41f9c238-173c-4120-9e41-04ad379b647a"
+ ],
+ "params": {},
+ "serial_no": 1,
+ "size": 1024
+ },
+ {
+ "ctime": 1421677173.728096,
+ "dev_type": "plain",
+ "logical_id": [
+ "xenvg",
+ "5c390722-6a7a-4bb4-9cef-98d896a8e6b1.disk0_meta"
+ ],
+ "mtime": 1421677173.7280879,
+ "nodes": [
+ "9a12d554-75c0-4cb1-8064-103365145db0",
+ "41f9c238-173c-4120-9e41-04ad379b647a"
+ ],
+ "params": {},
+ "serial_no": 1,
+ "size": 128
+ }
+ ],
+ "ctime": 1363620258.6089759,
+ "dev_type": "drbd",
+ "iv_name": "disk/0",
+ "logical_id": [
+ "9a12d554-75c0-4cb1-8064-103365145db0",
+ "41f9c238-173c-4120-9e41-04ad379b647a",
+ 32100,
+ 0,
+ 0,
+ "d3c3fd475fcbaf5fd177fb245ac43b71247ada38"
+ ],
+ "mode": "rw",
+ "mtime": 1363620258.6089759,
+ "nodes": [
+ "9a12d554-75c0-4cb1-8064-103365145db0",
+ "41f9c238-173c-4120-9e41-04ad379b647a"
+ ],
+ "params": {},
+ "serial_no": 1,
+ "size": 1024,
+ "uuid": "77ced3a5-6756-49ae-8d1f-274e27664c05"
+ },
+ "79acf611-be58-4334-9fe4-4f2b73ae8abb": {
+ "ctime": 1355186880.4511809,
+ "dev_type": "plain",
+ "iv_name": "disk/0",
+ "logical_id": [
+ "xenvg",
+ "3e559cd7-1024-4294-a923-a9fd13182b2f.disk0"
+ ],
+ "mode": "rw",
+ "mtime": 1355186880.4511809,
+ "nodes": [
+ "41f9c238-173c-4120-9e41-04ad379b647a"
+ ],
+ "params": {},
+ "serial_no": 1,
+ "size": 102400,
+ "uuid": "79acf611-be58-4334-9fe4-4f2b73ae8abb"
+ }
+ },
+ "filters": {},
+ "instances": {
+ "4e091bdc-e205-4ed7-8a47-0c9130a6619f": {
+ "admin_state": "up",
+ "admin_state_source": "admin",
+ "beparams": {},
+ "ctime": 1354038435.343601,
+ "disks": [
+ "150bd154-8e23-44d1-b762-5065ae5a507b"
+ ],
+ "disks_active": true,
+ "hvparams": {},
+ "hypervisor": "xen-pvm",
+ "mtime": 1354224585.700732,
+ "name": "instance3.example.com",
+ "nics": [
+ {
+ "mac": "aa:bb:cc:5e:5c:75",
+ "nicparams": {},
+ "uuid": "1ab090c1-e017-406c-afb4-fc285cb43e31"
+ }
+ ],
+ "os": "debian-image",
+ "osparams": {},
+ "osparams_private": {},
+ "primary_node": "2ae3d962-2dad-44f2-bdb1-85f77107f907",
+ "serial_no": 4,
+ "tags": [],
+ "uuid": "4e091bdc-e205-4ed7-8a47-0c9130a6619f"
+ },
+ "6c078d22-3eb6-4780-857d-81772e09eef1": {
+ "admin_state": "up",
+ "admin_state_source": "admin",
+ "beparams": {},
+ "ctime": 1363620258.6089759,
+ "disks": [
+ "77ced3a5-6756-49ae-8d1f-274e27664c05"
+ ],
+ "disks_active": true,
+ "hvparams": {},
+ "hypervisor": "xen-pvm",
+ "mtime": 1363620320.8749011,
+ "name": "instance1.example.com",
+ "nics": [
+ {
+ "mac": "aa:bb:cc:b2:6e:0b",
+ "nicparams": {},
+ "uuid": "2c953d72-fac4-4aa9-a225-4131bb271791"
+ }
+ ],
+ "os": "busybox",
+ "osparams": {},
+ "osparams_private": {},
+ "primary_node": "9a12d554-75c0-4cb1-8064-103365145db0",
+ "serial_no": 2,
+ "uuid": "6c078d22-3eb6-4780-857d-81772e09eef1"
+ },
+ "8fde9f6d-e1f1-4850-9e9c-154966f622f5": {
+ "admin_state": "up",
+ "admin_state_source": "admin",
+ "beparams": {},
+ "ctime": 1355186880.4511809,
+ "disks": [
+ "79acf611-be58-4334-9fe4-4f2b73ae8abb"
+ ],
+ "disks_active": true,
+ "hvparams": {},
+ "hypervisor": "xen-pvm",
+ "mtime": 1355186898.307642,
+ "name": "instance2.example.com",
+ "nics": [
+ {
+ "mac": "aa:bb:cc:56:83:fb",
+ "nicparams": {},
+ "uuid": "1cf95562-e676-4fd0-8214-e8b84a2f7bd1"
+ }
+ ],
+ "os": "debian-image",
+ "osparams": {},
+ "osparams_private": {},
+ "primary_node": "41f9c238-173c-4120-9e41-04ad379b647a",
+ "serial_no": 2,
+ "tags": [],
+ "uuid": "8fde9f6d-e1f1-4850-9e9c-154966f622f5"
+ }
+ },
+ "maintenance": {},
+ "mtime": 1421677173.729104,
+ "networks": {
+ "99f0128a-1c84-44da-90b9-9581ea00c075": {
+ "ext_reservations": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+ "name": "a network",
+ "network": "203.0.113.0/24",
+ "reservations": "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
+ "serial_no": 1,
+ "uuid": "99f0128a-1c84-44da-90b9-9581ea00c075"
+ }
+ },
+ "nodegroups": {
+ "5244a46d-7506-4e14-922d-02b58153dde1": {
+ "alloc_policy": "preferred",
+ "diskparams": {},
+ "ipolicy": {},
+ "mtime": 1361963775.5750091,
+ "name": "default",
+ "ndparams": {},
+ "networks": {},
+ "serial_no": 125,
+ "tags": [],
+ "uuid": "5244a46d-7506-4e14-922d-02b58153dde1"
+ },
+ "6c0a8916-b719-45ad-95dd-82192b1e473f": {
+ "alloc_policy": "preferred",
+ "diskparams": {},
+ "ipolicy": {
+ "disk-templates": [
+ "plain"
+ ],
+ "minmax": [
+ {
+ "max": {
+ "cpu-count": 8,
+ "disk-count": 16,
+ "disk-size": 1048576,
+ "memory-size": 32768,
+ "nic-count": 18,
+ "spindle-use": 14
+ },
+ "min": {
+ "cpu-count": 2,
+ "disk-count": 2,
+ "disk-size": 1024,
+ "memory-size": 128,
+ "nic-count": 1,
+ "spindle-use": 1
+ }
+ }
+ ],
+ "spindle-ratio": 5.2000000000000002,
+ "vcpu-ratio": 3.1400000000000001
+ },
+ "mtime": 1361963775.5750091,
+ "name": "another",
+ "ndparams": {
+ "exclusive_storage": true
+ },
+ "networks": {},
+ "serial_no": 125,
+ "tags": [],
+ "uuid": "6c0a8916-b719-45ad-95dd-82192b1e473f"
+ }
+ },
+ "nodes": {
+ "2ae3d962-2dad-44f2-bdb1-85f77107f907": {
+ "ctime": 1343869045.6048839,
+ "drained": false,
+ "group": "5244a46d-7506-4e14-922d-02b58153dde1",
+ "master_candidate": true,
+ "master_capable": true,
+ "mtime": 1358348755.779906,
+ "name": "node2.example.com",
+ "ndparams": {},
+ "offline": false,
+ "powered": true,
+ "primary_ip": "192.0.2.83",
+ "secondary_ip": "198.51.100.83",
+ "serial_no": 6,
+ "tags": [],
+ "uuid": "2ae3d962-2dad-44f2-bdb1-85f77107f907",
+ "vm_capable": true
+ },
+ "41f9c238-173c-4120-9e41-04ad379b647a": {
+ "ctime": 1343869205.9348071,
+ "drained": false,
+ "group": "5244a46d-7506-4e14-922d-02b58153dde1",
+ "master_candidate": true,
+ "master_capable": true,
+ "mtime": 1353019704.8853681,
+ "name": "node3.example.com",
+ "ndparams": {},
+ "offline": false,
+ "powered": true,
+ "primary_ip": "192.0.2.84",
+ "secondary_ip": "198.51.100.84",
+ "serial_no": 2,
+ "tags": [],
+ "uuid": "41f9c238-173c-4120-9e41-04ad379b647a",
+ "vm_capable": true
+ },
+ "9a12d554-75c0-4cb1-8064-103365145db0": {
+ "ctime": 1349722460.022264,
+ "drained": false,
+ "group": "5244a46d-7506-4e14-922d-02b58153dde1",
+ "master_candidate": true,
+ "master_capable": true,
+ "mtime": 1359986533.3533289,
+ "name": "node1.example.com",
+ "ndparams": {},
+ "offline": false,
+ "powered": true,
+ "primary_ip": "192.0.2.82",
+ "secondary_ip": "198.51.100.82",
+ "serial_no": 197,
+ "tags": [],
+ "uuid": "9a12d554-75c0-4cb1-8064-103365145db0",
+ "vm_capable": true
+ }
+ },
+ "serial_no": 7627,
+ "version": 2170000
+}
diff --git a/test/data/htools/dyn1.json b/test/data/htools/dyn1.json
new file mode 100644
index 0000000..ab0e89d
--- /dev/null
+++ b/test/data/htools/dyn1.json
@@ -0,0 +1,38 @@
+[
+ {
+ "node": "node-01-000",
+ "reports": [
+ {
+ "category": null,
+ "data": {
+ "inst-00": 0.1,
+ "inst-01": 0.1,
+ "inst-02": 0.1,
+ "inst-03": 0.1
+ },
+ "format_version": 1,
+ "kind": 0,
+ "name": "xen-cpu-avg-load",
+ "timestamp": 1444910125282702000,
+ "version": "B"
+ }
+ ]
+ },
+ {
+ "node": "node-01-001",
+ "reports": [
+ {
+ "category": null,
+ "data": {
+ "inst-10": 2.0,
+ "inst-11": 2.0
+ },
+ "format_version": 1,
+ "kind": 0,
+ "name": "xen-cpu-avg-load",
+ "timestamp": 1444910125282702000,
+ "version": "B"
+ }
+ ]
+ }
+]
diff --git a/test/data/htools/dyn2.json b/test/data/htools/dyn2.json
new file mode 100644
index 0000000..3a8e2b4
--- /dev/null
+++ b/test/data/htools/dyn2.json
@@ -0,0 +1,64 @@
+[
+ {
+ "node": "node-01-000",
+ "reports": [
+ {
+ "category": null,
+ "data": {
+ "cpu_number": 32,
+ "cpu_total": 0.1,
+ "cpus": []
+ },
+ "format_version": 1,
+ "kind": 0,
+ "name": "cpu-avg-load",
+ "timestamp": 1444910125282702000,
+ "version": "B"
+ },
+ {
+ "category": null,
+ "data": {
+ "inst-00": 256,
+ "inst-01": 256,
+ "inst-02": 256,
+ "inst-03": 256
+ },
+ "format_version": 1,
+ "kind": 0,
+ "name": "kvm-inst-rss",
+ "timestamp": 1444910125282702000,
+ "version": "B"
+ }
+ ]
+ },
+ {
+ "node": "node-01-001",
+ "reports": [
+ {
+ "category": null,
+ "data": {
+ "cpu_number": 32,
+ "cpu_total": 0.3,
+ "cpus": []
+ },
+ "format_version": 1,
+ "kind": 0,
+ "name": "cpu-avg-load",
+ "timestamp": 1444910125282702000,
+ "version": "B"
+ },
+ {
+ "category": null,
+ "data": {
+ "inst-10": 65536,
+ "inst-11": 65536
+ },
+ "format_version": 1,
+ "kind": 0,
+ "name": "kvm-inst-rss",
+ "timestamp": 1444910125282702000,
+ "version": "B"
+ }
+ ]
+ }
+]
diff --git a/test/data/htools/dyn3.json b/test/data/htools/dyn3.json
new file mode 100644
index 0000000..e40c276
--- /dev/null
+++ b/test/data/htools/dyn3.json
@@ -0,0 +1,64 @@
+[
+ {
+ "node": "node-01-000",
+ "reports": [
+ {
+ "category": null,
+ "data": {
+ "cpu_number": 32,
+ "cpu_total": 2.0,
+ "cpus": []
+ },
+ "format_version": 1,
+ "kind": 0,
+ "name": "cpu-avg-load",
+ "timestamp": 1444910125282702000,
+ "version": "B"
+ },
+ {
+ "category": null,
+ "data": {
+ "inst-00": 256,
+ "inst-01": 256,
+ "inst-02": 256,
+ "inst-03": 256
+ },
+ "format_version": 1,
+ "kind": 0,
+ "name": "kvm-inst-rss",
+ "timestamp": 1444910125282702000,
+ "version": "B"
+ }
+ ]
+ },
+ {
+ "node": "node-01-001",
+ "reports": [
+ {
+ "category": null,
+ "data": {
+ "cpu_number": 32,
+ "cpu_total": 0.1,
+ "cpus": []
+ },
+ "format_version": 1,
+ "kind": 0,
+ "name": "cpu-avg-load",
+ "timestamp": 1444910125282702000,
+ "version": "B"
+ },
+ {
+ "category": null,
+ "data": {
+ "inst-10": 65536,
+ "inst-11": 65536
+ },
+ "format_version": 1,
+ "kind": 0,
+ "name": "kvm-inst-rss",
+ "timestamp": 1444910125282702000,
+ "version": "B"
+ }
+ ]
+ }
+]
diff --git a/test/data/htools/hail-alloc-memory-over-commitment.json b/test/data/htools/hail-alloc-memory-over-commitment.json
new file mode 100644
index 0000000..58c3b5d
--- /dev/null
+++ b/test/data/htools/hail-alloc-memory-over-commitment.json
@@ -0,0 +1,204 @@
+{
+ "cluster_tags": [
+ "htools:desiredlocation:power",
+ "htools:nlocation:power"
+ ],
+ "nodegroups": {
+ "uuid-group-1": {
+ "ipolicy": {
+ "std": {
+ "nic-count": 1,
+ "disk-size": 1024,
+ "disk-count": 1,
+ "memory-size": 128,
+ "cpu-count": 1,
+ "spindle-use": 1
+ },
+ "minmax": [
+ {
+ "min": {
+ "nic-count": 1,
+ "disk-size": 128,
+ "disk-count": 1,
+ "memory-size": 128,
+ "cpu-count": 1,
+ "spindle-use": 1
+ },
+ "max": {
+ "nic-count": 8,
+ "disk-size": 1048576,
+ "disk-count": 16,
+ "memory-size": 32768,
+ "cpu-count": 8,
+ "spindle-use": 8
+ }
+ }
+ ],
+ "vcpu-ratio": 4.0,
+ "disk-templates": [
+ "sharedfile",
+ "diskless",
+ "plain",
+ "blockdev",
+ "drbd",
+ "file",
+ "rbd"
+ ],
+ "spindle-ratio": 32.0,
+ "memory-ratio": 2
+ },
+ "networks": [],
+ "alloc_policy": "preferred",
+ "tags": [],
+ "name": "default"
+ }
+ },
+ "cluster_name": "cluster",
+ "instances": {
+ "instance1": {
+ "disks": [
+ {
+ "spindles": 1,
+ "mode": "rw",
+ "size": 51200
+ }
+ ],
+ "disk_space_total": 51200,
+ "hypervisor": "xen-pvm",
+ "tags": [
+ "test:test"
+ ],
+ "nics": [
+ {
+ "ip": null,
+ "mac": "aa:00:00:10:d2:01",
+ "link": "xen-br0",
+ "mode": "bridged",
+ "bridge": "xen-br0"
+ }
+ ],
+ "vcpus": 1,
+ "spindle_use": 1,
+ "admin_state": "up",
+ "admin_state_source": "admin",
+ "disk_template": "drbd",
+ "memory": 1024,
+ "nodes": [
+ "node1",
+ "node2"
+ ],
+ "os": "instance-debootstrap"
+ },
+ "instance2": {
+ "disks": [
+ {
+ "spindles": 1,
+ "mode": "rw",
+ "size": 51200
+ }
+ ],
+ "disk_space_total": 51200,
+ "hypervisor": "xen-pvm",
+ "tags": [
+ "test:test"
+ ],
+ "nics": [
+ {
+ "ip": null,
+ "mac": "aa:00:00:10:d2:01",
+ "link": "xen-br0",
+ "mode": "bridged",
+ "bridge": "xen-br0"
+ }
+ ],
+ "vcpus": 1,
+ "spindle_use": 1,
+ "admin_state": "up",
+ "admin_state_source": "admin",
+ "disk_template": "drbd",
+ "memory": 1024,
+ "nodes": [
+ "node2",
+ "node1"
+ ],
+ "os": "instance-debootstrap"
+ }
+ },
+ "nodes": {
+ "node1": {
+ "total_disk": 307200,
+ "total_cpus": 4,
+ "group": "uuid-group-1",
+ "i_pri_up_memory": 0,
+ "tags": [
+ "power:a"
+ ],
+ "master_candidate": true,
+ "free_memory": 256,
+ "ndparams": {
+ "spindle_count": 1,
+ "oob_program": null,
+ "exclusive_storage": false
+ },
+ "reserved_cpus": 1,
+ "master_capable": true,
+ "free_disk": 307200,
+ "drained": false,
+ "total_memory": 1280,
+ "i_pri_memory": 0,
+ "reserved_memory": 0,
+ "free_spindles": 12,
+ "total_spindles": 12,
+ "vm_capable": true,
+ "offline": false
+ },
+ "node2": {
+ "total_disk": 307200,
+ "total_cpus": 4,
+ "group": "uuid-group-1",
+ "i_pri_up_memory": 0,
+ "tags": [
+ "power:b"
+ ],
+ "master_candidate": true,
+ "free_memory": 256,
+ "ndparams": {
+ "spindle_count": 1,
+ "oob_program": null,
+ "exclusive_storage": false
+ },
+ "reserved_cpus": 1,
+ "master_capable": true,
+ "free_disk": 307200,
+ "drained": false,
+ "total_memory": 1280,
+ "i_pri_memory": 0,
+ "reserved_memory": 0,
+ "free_spindles": 12,
+ "total_spindles": 12,
+ "vm_capable": true,
+ "offline": false
+ }
+ },
+ "request": {
+ "disk_space_total": 0,
+ "disk_template": "drbd",
+ "disks": [
+ {
+ "size": 1024
+ }
+ ],
+ "hypervisor": "xen-pvm",
+ "memory": 256,
+ "name": "instance-new",
+ "nics": [],
+ "os": "instance-debootstrap",
+ "required_nodes": 2,
+ "spindle_use": 1,
+ "tags": [
+ "power:a"
+ ],
+ "type": "allocate",
+ "vcpus": 1
+ }
+}
diff --git a/test/data/htools/hbal-avoid-disk-moves.data b/test/data/htools/hbal-avoid-disk-moves.data
new file mode 100644
index 0000000..41dac29
--- /dev/null
+++ b/test/data/htools/hbal-avoid-disk-moves.data
@@ -0,0 +1,12 @@
+group-01|fake-uuid-01|preferred||
+
+node-01|16384|0|14336|409600|306600|16|N|fake-uuid-01|1|power:a
+node-02|16384|0|16384|409600|357800|16|N|fake-uuid-01|1|power:b
+node-03|16384|0|16384|409600|357800|16|N|fake-uuid-01|1|power:a
+node-04|16384|0|16384|409600|409600|16|N|fake-uuid-01|1|power:b
+
+inst1|1024|51200|1|running|Y|node-01|node-02|drbd|power:a|1
+inst2|1024|51200|1|running|Y|node-01|node-03|drbd|power:a|1
+
+htools:nlocation:power
+htools:desiredlocation:power
diff --git a/test/data/htools/hbal-dyn2.data b/test/data/htools/hbal-dyn2.data
new file mode 100644
index 0000000..619a5d9
--- /dev/null
+++ b/test/data/htools/hbal-dyn2.data
@@ -0,0 +1,15 @@
+group-01|fake-uuid-01|preferred||
+
+node-01-000|552|0|40|3100|3100|32|M|fake-uuid-01|1
+node-01-001|552|0|40|3100|3100|32|N|fake-uuid-01|1
+
+inst-00|128|0|1|running|Y|node-01-000||ext||1
+inst-01|128|0|1|running|Y|node-01-000||ext||1
+inst-02|128|0|1|running|Y|node-01-000||ext||1
+inst-03|128|0|1|running|Y|node-01-000||ext||1
+inst-10|256|0|2|running|Y|node-01-001||ext||1
+inst-11|256|0|2|running|Y|node-01-001||ext||1
+
+
+|128,1,1024,1,1,1|128,1,1024,1,1,1;32768,8,1048576,16,8,12|diskless,file,sharedfile,plain,blockdev,drbd,rbd,ext|4.0|32.0|6.0
+group-01|128,1,1024,1,1,1|128,1,1024,1,1,1;32768,8,1048576,16,8,12|diskless,file,sharedfile,plain,blockdev,drbd,rbd,ext|4.0|32.0|6.0
diff --git a/test/data/htools/hbal-memory-over-commitment-2.data b/test/data/htools/hbal-memory-over-commitment-2.data
new file mode 100644
index 0000000..d9cd6ea
--- /dev/null
+++ b/test/data/htools/hbal-memory-over-commitment-2.data
@@ -0,0 +1,13 @@
+group-01|fake-uuid-01|preferred||
+
+node-01|1024|0|0|409600|256000|16|N|fake-uuid-01|1|power:a
+node-02|1280|0|128|409600|256000|16|N|fake-uuid-01|1|power:b
+
+inst1|1024|51200|1|running|Y|node-01|node-02|drbd|power:a|1
+inst2|1024|51200|1|running|Y|node-02|node-01|drbd|power:b|1
+inst3|128|51200|1|running|Y|node-02|node-01|drbd|power:a|1
+
+htools:desiredlocation:power
+htools:nlocation:power
+
+group-01|128,1,1024,1,1,1|128,1,1024,1,1,1;32768,8,1048576,16,8,12|diskless,file,sharedfile,plain,blockdev,drbd,rbd,ext|4.0|32.0|2.0
diff --git a/test/data/htools/hbal-memory-over-commitment.data b/test/data/htools/hbal-memory-over-commitment.data
new file mode 100644
index 0000000..c307d8f
--- /dev/null
+++ b/test/data/htools/hbal-memory-over-commitment.data
@@ -0,0 +1,13 @@
+group-01|fake-uuid-01|preferred||
+
+node-01|1024|0|0|409600|256000|16|N|fake-uuid-01|1|power:a
+node-02|1280|0|128|409600|256000|16|N|fake-uuid-01|1|power:b
+
+inst1|1024|51200|1|running|Y|node-01|node-02|drbd|power:a|1
+inst2|1024|51200|1|running|Y|node-02|node-01|drbd|power:b|1
+inst3|128|51200|1|running|Y|node-02|node-01|drbd|power:a|1
+
+htools:desiredlocation:power
+htools:nlocation:power
+
+group-01|128,1,1024,1,1,1|128,1,1024,1,1,1;32768,8,1048576,16,8,12|diskless,file,sharedfile,plain,blockdev,drbd,rbd,ext|4.0|32.0|1.0
diff --git a/test/hs/Test/Ganeti/BasicTypes.hs b/test/hs/Test/Ganeti/BasicTypes.hs
index f29d16f..e9ed399 100644
--- a/test/hs/Test/Ganeti/BasicTypes.hs
+++ b/test/hs/Test/Ganeti/BasicTypes.hs
@@ -37,10 +37,12 @@
module Test.Ganeti.BasicTypes (testBasicTypes) where
+import Prelude ()
+import Ganeti.Prelude
+
import Test.QuickCheck hiding (Result)
import Test.QuickCheck.Function
-import Control.Applicative
import Control.Monad
import Test.Ganeti.TestHelper
diff --git a/test/hs/Test/Ganeti/Confd/Types.hs b/test/hs/Test/Ganeti/Confd/Types.hs
index 3bc7167..6e7cb29 100644
--- a/test/hs/Test/Ganeti/Confd/Types.hs
+++ b/test/hs/Test/Ganeti/Confd/Types.hs
@@ -42,7 +42,9 @@
, ConfdReqQ(..)
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Test.QuickCheck
import Test.HUnit
import qualified Text.JSON as J
diff --git a/test/hs/Test/Ganeti/HTools/Instance.hs b/test/hs/Test/Ganeti/HTools/Instance.hs
index dcd4b79..84a5f5c 100644
--- a/test/hs/Test/Ganeti/HTools/Instance.hs
+++ b/test/hs/Test/Ganeti/HTools/Instance.hs
@@ -44,8 +44,10 @@
, Instance.Instance(..)
) where
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Arrow ((&&&))
-import Control.Applicative ((<$>))
import Control.Monad (liftM)
import Test.QuickCheck hiding (Result)
diff --git a/test/hs/Test/Ganeti/HTools/Node.hs b/test/hs/Test/Ganeti/HTools/Node.hs
index e7f46e2..24d942d 100644
--- a/test/hs/Test/Ganeti/HTools/Node.hs
+++ b/test/hs/Test/Ganeti/HTools/Node.hs
@@ -128,8 +128,11 @@
let node' = node { Node.offline = False
, Node.fMem = fmem
, Node.fMemForth = fmem
- , Node.pMem = fromIntegral fmem / Node.tMem node
- , Node.pMemForth = fromIntegral fmem / Node.tMem node
+ , Node.pMem = Node.computePmem fmem (Node.tMem node)
+ (Node.nMem node)
+ , Node.pMemForth = Node.computePmem fmem
+ (Node.tMem node)
+ (Node.nMem node)
, Node.rMem = 0
, Node.rMemForth = 0
, Node.pRem = 0
diff --git a/test/hs/Test/Ganeti/HTools/Types.hs b/test/hs/Test/Ganeti/HTools/Types.hs
index 7708b0a..4136308 100644
--- a/test/hs/Test/Ganeti/HTools/Types.hs
+++ b/test/hs/Test/Ganeti/HTools/Types.hs
@@ -45,10 +45,12 @@
, nullIPolicy
) where
+import Prelude ()
+import Ganeti.Prelude
+
import Test.QuickCheck hiding (Result)
import Test.HUnit
-import Control.Applicative
import Control.Monad (replicateM)
import Test.Ganeti.TestHelper
@@ -146,11 +148,13 @@
dts <- genUniquesList num_tmpl arbitrary
vcpu_ratio <- choose (1.0, maxVcpuRatio)
spindle_ratio <- choose (1.0, maxSpindleRatio)
+ memory_ratio <- choose (1.0, maxMemoryRatio)
return Types.IPolicy { Types.iPolicyMinMaxISpecs = iminmax
, Types.iPolicyStdSpec = istd
, Types.iPolicyDiskTemplates = dts
, Types.iPolicyVcpuRatio = vcpu_ratio
, Types.iPolicySpindleRatio = spindle_ratio
+ , Types.iPolicyMemoryRatio = memory_ratio
}
-- * Test cases
diff --git a/test/hs/Test/Ganeti/JQScheduler.hs b/test/hs/Test/Ganeti/JQScheduler.hs
index 77eb2ac..04a6287 100644
--- a/test/hs/Test/Ganeti/JQScheduler.hs
+++ b/test/hs/Test/Ganeti/JQScheduler.hs
@@ -37,7 +37,9 @@
module Test.Ganeti.JQScheduler (testJQScheduler) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Lens ((&), (.~), _2)
import qualified Data.ByteString.UTF8 as UTF8
import Data.List (inits)
@@ -45,7 +47,6 @@
import qualified Data.Map as Map
import Data.Set (Set, difference)
import qualified Data.Set as Set
-import Data.Traversable (traverse)
import Text.JSON (JSValue(..))
import Test.HUnit
import Test.QuickCheck
diff --git a/test/hs/Test/Ganeti/JQueue/Objects.hs b/test/hs/Test/Ganeti/JQueue/Objects.hs
index 13e0f0f..6d56a5d 100644
--- a/test/hs/Test/Ganeti/JQueue/Objects.hs
+++ b/test/hs/Test/Ganeti/JQueue/Objects.hs
@@ -39,7 +39,9 @@
, genJobId
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Test.QuickCheck as QuickCheck
import Text.JSON
diff --git a/test/hs/Test/Ganeti/Locking/Allocation.hs b/test/hs/Test/Ganeti/Locking/Allocation.hs
index a4ce21b..498d149 100644
--- a/test/hs/Test/Ganeti/Locking/Allocation.hs
+++ b/test/hs/Test/Ganeti/Locking/Allocation.hs
@@ -42,7 +42,9 @@
, requestSucceeded
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import qualified Data.Foldable as F
import qualified Data.Map as M
import Data.Maybe (fromMaybe)
diff --git a/test/hs/Test/Ganeti/Locking/Locks.hs b/test/hs/Test/Ganeti/Locking/Locks.hs
index 732779f..1c992ff 100644
--- a/test/hs/Test/Ganeti/Locking/Locks.hs
+++ b/test/hs/Test/Ganeti/Locking/Locks.hs
@@ -37,7 +37,10 @@
module Test.Ganeti.Locking.Locks (testLocking_Locks) where
-import Control.Applicative ((<$>), (<*>), liftA2)
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (liftA2)
import Control.Monad (liftM)
import System.Posix.Types (CPid)
diff --git a/test/hs/Test/Ganeti/Locking/Waiting.hs b/test/hs/Test/Ganeti/Locking/Waiting.hs
index ee1a6b0..1b06225 100644
--- a/test/hs/Test/Ganeti/Locking/Waiting.hs
+++ b/test/hs/Test/Ganeti/Locking/Waiting.hs
@@ -37,7 +37,10 @@
module Test.Ganeti.Locking.Waiting (testLocking_Waiting) where
-import Control.Applicative ((<$>), (<*>), liftA2)
+import Prelude ()
+import Ganeti.Prelude
+
+import Control.Applicative (liftA2)
import Control.Monad (liftM)
import qualified Data.Map as M
import qualified Data.Set as S
diff --git a/test/hs/Test/Ganeti/Luxi.hs b/test/hs/Test/Ganeti/Luxi.hs
index c269b8c..47736f4 100644
--- a/test/hs/Test/Ganeti/Luxi.hs
+++ b/test/hs/Test/Ganeti/Luxi.hs
@@ -37,12 +37,14 @@
module Test.Ganeti.Luxi (testLuxi) where
+import Prelude ()
+import Ganeti.Prelude
+
import Test.HUnit
import Test.QuickCheck
import Test.QuickCheck.Monadic (monadicIO, run, stop)
import Data.List
-import Control.Applicative
import Control.Concurrent (forkIO)
import Control.Exception (bracket)
import qualified Text.JSON as J
diff --git a/test/hs/Test/Ganeti/Objects.hs b/test/hs/Test/Ganeti/Objects.hs
index 76543da..ea17bc0 100644
--- a/test/hs/Test/Ganeti/Objects.hs
+++ b/test/hs/Test/Ganeti/Objects.hs
@@ -49,11 +49,13 @@
, genBitStringMaxLen
) where
+import Prelude ()
+import Ganeti.Prelude
+
import Test.QuickCheck
import qualified Test.HUnit as HUnit
-import Control.Applicative
-import Control.Monad
+import Control.Monad (liftM, when)
import qualified Data.ByteString as BS
import qualified Data.ByteString.UTF8 as UTF8
import Data.Char
@@ -91,6 +93,29 @@
return GenericContainer {
fromContainer = Map.fromList $ zip names configs }
+-- FYI: Currently only memory node value is used
+instance Arbitrary PartialHvStateParams where
+ arbitrary = PartialHvStateParams <$> pure Nothing <*> pure Nothing
+ <*> pure Nothing <*> genMaybe (fromPositive <$> arbitrary)
+ <*> pure Nothing
+
+instance Arbitrary PartialHvState where
+ arbitrary = do
+ hv_params <- arbitrary
+ return GenericContainer {
+ fromContainer = Map.fromList [ hv_params ] }
+
+-- FYI: Currently only memory node value is used
+instance Arbitrary FilledHvStateParams where
+ arbitrary = FilledHvStateParams <$> pure 0 <*> pure 0 <*> pure 0
+ <*> (fromPositive <$> arbitrary) <*> pure 0
+
+instance Arbitrary FilledHvState where
+ arbitrary = do
+ hv_params <- arbitrary
+ return GenericContainer {
+ fromContainer = Map.fromList [ hv_params ] }
+
instance Arbitrary BS.ByteString where
arbitrary = fmap UTF8.fromString arbitrary
@@ -389,6 +414,35 @@
, pure ECDSA
]
+instance Arbitrary RepairStatus where
+ arbitrary = elements [ RSNoted, RSPending, RSCanceled, RSFailed, RSCompleted ]
+
+instance Arbitrary RepairAction where
+ arbitrary = elements [ RANoop, RALiveRepair, RAEvacuate, RAEvacuateFailover ]
+
+instance Arbitrary Incident where
+ arbitrary = Incident <$> pure (J.JSObject $ J.toJSObject [])
+ <*> arbitrary
+ <*> arbitrary
+ <*> arbitrary
+ <*> arbitrary
+ <*> arbitrary
+ <*> arbitrary
+ <*> arbitrary
+ <*> arbitrary
+ <*> arbitrary
+
+instance Arbitrary MaintenanceData where
+ arbitrary = MaintenanceData <$> (fromPositive <$> arbitrary)
+ <*> arbitrary
+ <*> arbitrary
+ <*> arbitrary
+ <*> arbitrary
+ <*> arbitrary
+ <*> arbitrary
+ <*> arbitrary
+ <*> arbitrary
+
-- | Generates a network instance with minimum netmasks of /24. Generating
-- bigger networks slows down the tests, because long bit strings are generated
-- for the reservations.
@@ -445,6 +499,7 @@
networks = GenericContainer Map.empty
disks = GenericContainer Map.empty
filters = GenericContainer Map.empty
+ maintenance <- arbitrary
let contgroups = GenericContainer $ Map.singleton (UTF8.fromString guuid) grp
serial <- arbitrary
-- timestamp fields
@@ -452,7 +507,7 @@
mtime <- arbitrary
cluster <- resize 8 arbitrary
let c = ConfigData version cluster contnodes contgroups continsts networks
- disks filters ctime mtime serial
+ disks filters ctime maintenance mtime serial
return c
-- | FIXME: make an even simpler base version of creating a cluster.
diff --git a/test/hs/Test/Ganeti/OpCodes.hs b/test/hs/Test/Ganeti/OpCodes.hs
index 959d803..bc16c9e 100644
--- a/test/hs/Test/Ganeti/OpCodes.hs
+++ b/test/hs/Test/Ganeti/OpCodes.hs
@@ -40,11 +40,13 @@
, OpCodes.OpCode(..)
) where
+import Prelude ()
+import Ganeti.Prelude
+
import Test.HUnit as HUnit
import Test.QuickCheck as QuickCheck
-import Control.Applicative
-import Control.Monad
+import Control.Monad (when)
import Data.Char
import Data.List
import qualified Data.Map as Map
@@ -189,9 +191,9 @@
arbitrary <*> genListSet Nothing <*> genListSet Nothing <*>
arbitrary <*> arbitrary
"OP_CLUSTER_VERIFY_DISKS" ->
- OpCodes.OpClusterVerifyDisks <$> genMaybe genNameNE
+ OpCodes.OpClusterVerifyDisks <$> genMaybe genNameNE <*> arbitrary
"OP_GROUP_VERIFY_DISKS" ->
- OpCodes.OpGroupVerifyDisks <$> genNameNE
+ OpCodes.OpGroupVerifyDisks <$> genNameNE <*> arbitrary
"OP_CLUSTER_REPAIR_DISK_SIZES" ->
OpCodes.OpClusterRepairDiskSizes <$> genNodeNamesNE
"OP_CLUSTER_CONFIG_QUERY" ->
@@ -256,6 +258,10 @@
<*> arbitrary -- enabled_user_shutdown
<*> genMaybe arbitraryDataCollector -- enabled_data_collectors
<*> arbitraryDataCollectorInterval -- data_collector_interval
+ <*> genMaybe genName -- diagnose_data_collector_filename
+ <*> genMaybe (fromPositive <$> arbitrary) -- maintd round interval
+ <*> genMaybe arbitrary -- enable maintd balancing
+ <*> genMaybe arbitrary -- maintd balancing threshold
"OP_CLUSTER_REDIST_CONF" -> pure OpCodes.OpClusterRedistConf
"OP_CLUSTER_ACTIVATE_MASTER_IP" ->
pure OpCodes.OpClusterActivateMasterIp
@@ -271,12 +277,13 @@
arbitrary <*> arbitrary <*> arbitrary <*>
(arbitrary `suchThat` (>0))
"OP_NODE_REMOVE" ->
- OpCodes.OpNodeRemove <$> genNodeNameNE <*> return Nothing
+ OpCodes.OpNodeRemove <$> genNodeNameNE <*> return Nothing <*>
+ arbitrary <*> arbitrary
"OP_NODE_ADD" ->
OpCodes.OpNodeAdd <$> genNodeNameNE <*> emptyMUD <*> emptyMUD <*>
genMaybe genNameNE <*> genMaybe genNameNE <*> arbitrary <*>
genMaybe genNameNE <*> arbitrary <*> arbitrary <*> emptyMUD <*>
- arbitrary
+ arbitrary <*> arbitrary <*> arbitrary
"OP_NODE_QUERYVOLS" ->
OpCodes.OpNodeQueryvols <$> genNamesNE <*> genNodeNamesNE
"OP_NODE_QUERY_STORAGE" ->
@@ -292,7 +299,8 @@
OpCodes.OpNodeSetParams <$> genNodeNameNE <*> return Nothing <*>
arbitrary <*> emptyMUD <*> emptyMUD <*> arbitrary <*> arbitrary <*>
arbitrary <*> arbitrary <*> arbitrary <*> arbitrary <*>
- genMaybe genNameNE <*> emptyMUD <*> arbitrary
+ genMaybe genNameNE <*> emptyMUD <*> arbitrary <*> arbitrary <*>
+ arbitrary
"OP_NODE_POWERCYCLE" ->
OpCodes.OpNodePowercycle <$> genNodeNameNE <*> return Nothing <*>
arbitrary
@@ -517,6 +525,9 @@
"OP_RESTRICTED_COMMAND" ->
OpCodes.OpRestrictedCommand <$> arbitrary <*> genNodeNamesNE <*>
return Nothing <*> genNameNE
+ "OP_REPAIR_COMMAND" ->
+ OpCodes.OpRepairCommand <$> genNodeNameNE <*> genNameNE <*>
+ genMaybe genPrintableAsciiStringNE
_ -> fail $ "Undefined arbitrary for opcode " ++ op_id
instance Arbitrary OpCodes.CommonOpParams where
diff --git a/test/hs/Test/Ganeti/Query/Language.hs b/test/hs/Test/Ganeti/Query/Language.hs
index 9556bc3..677990a 100644
--- a/test/hs/Test/Ganeti/Query/Language.hs
+++ b/test/hs/Test/Ganeti/Query/Language.hs
@@ -41,10 +41,12 @@
, genJSValue
) where
+import Prelude ()
+import Ganeti.Prelude
+
import Test.HUnit (Assertion, assertEqual)
import Test.QuickCheck
-import Control.Applicative
import Control.Arrow (second)
import Text.JSON
diff --git a/test/hs/Test/Ganeti/Rpc.hs b/test/hs/Test/Ganeti/Rpc.hs
index 8205cc1..bdb83ac 100644
--- a/test/hs/Test/Ganeti/Rpc.hs
+++ b/test/hs/Test/Ganeti/Rpc.hs
@@ -37,10 +37,12 @@
module Test.Ganeti.Rpc (testRpc) where
+import Prelude ()
+import Ganeti.Prelude
+
import Test.QuickCheck
import Test.QuickCheck.Monadic (monadicIO, run, stop)
-import Control.Applicative
import qualified Data.Map as Map
import Test.Ganeti.TestHelper
diff --git a/test/hs/Test/Ganeti/Runtime.hs b/test/hs/Test/Ganeti/Runtime.hs
index ee48e0e..3e49dd6 100644
--- a/test/hs/Test/Ganeti/Runtime.hs
+++ b/test/hs/Test/Ganeti/Runtime.hs
@@ -97,6 +97,7 @@
\ constants.KVMD_USER,\n\
\ constants.LUXID_USER,\n\
\ constants.MOND_USER,\n\
+ \ constants.MOND_USER,\n\
\ ]\n\
\groups = [constants.MASTERD_GROUP,\n\
\ constants.METAD_GROUP,\n\
@@ -107,6 +108,7 @@
\ constants.KVMD_GROUP,\n\
\ constants.LUXID_GROUP,\n\
\ constants.MOND_GROUP,\n\
+ \ constants.MOND_GROUP,\n\
\ constants.DAEMONS_GROUP,\n\
\ constants.ADMIN_GROUP,\n\
\ ]\n\
diff --git a/test/hs/Test/Ganeti/SlotMap.hs b/test/hs/Test/Ganeti/SlotMap.hs
index 295240d..7897c72 100644
--- a/test/hs/Test/Ganeti/SlotMap.hs
+++ b/test/hs/Test/Ganeti/SlotMap.hs
@@ -42,16 +42,15 @@
, overfullKeys
) where
-import Prelude hiding (all)
+import Prelude ()
+import Ganeti.Prelude hiding (all)
-import Control.Applicative
import Control.Monad
import Data.Foldable (all)
import qualified Data.Map as Map
import Data.Map (Map, member, keys, keysSet)
import Data.Set (Set, size, union)
import qualified Data.Set as Set
-import Data.Traversable (traverse)
import Test.HUnit
import Test.QuickCheck
diff --git a/test/hs/Test/Ganeti/Storage/Diskstats/Parser.hs b/test/hs/Test/Ganeti/Storage/Diskstats/Parser.hs
index 8193ae9..4a63b02 100644
--- a/test/hs/Test/Ganeti/Storage/Diskstats/Parser.hs
+++ b/test/hs/Test/Ganeti/Storage/Diskstats/Parser.hs
@@ -35,13 +35,15 @@
module Test.Ganeti.Storage.Diskstats.Parser (testBlock_Diskstats_Parser) where
+import Prelude ()
+import Ganeti.Prelude
+
import Test.QuickCheck as QuickCheck hiding (Result)
import Test.HUnit
import Test.Ganeti.TestHelper
import Test.Ganeti.TestCommon
-import Control.Applicative ((<*>), (<$>))
import qualified Data.Attoparsec.Text as A
import Data.Text (pack)
import Text.Printf
diff --git a/test/hs/Test/Ganeti/Storage/Lvm/LVParser.hs b/test/hs/Test/Ganeti/Storage/Lvm/LVParser.hs
index 9a00799..bb1ec64 100644
--- a/test/hs/Test/Ganeti/Storage/Lvm/LVParser.hs
+++ b/test/hs/Test/Ganeti/Storage/Lvm/LVParser.hs
@@ -35,13 +35,15 @@
module Test.Ganeti.Storage.Lvm.LVParser (testStorage_Lvm_LVParser) where
+import Prelude ()
+import Ganeti.Prelude
+
import Test.QuickCheck as QuickCheck hiding (Result)
import Test.HUnit
import Test.Ganeti.TestHelper
import Test.Ganeti.TestCommon
-import Control.Applicative ((<$>), (<*>))
import Data.List (intercalate)
import Ganeti.Storage.Lvm.LVParser
diff --git a/test/hs/Test/Ganeti/TestCommon.hs b/test/hs/Test/Ganeti/TestCommon.hs
index bcd8421..43595df 100644
--- a/test/hs/Test/Ganeti/TestCommon.hs
+++ b/test/hs/Test/Ganeti/TestCommon.hs
@@ -41,6 +41,7 @@
, maxCpu
, maxSpindles
, maxVcpuRatio
+ , maxMemoryRatio
, maxSpindleRatio
, maxNodes
, maxOpCodes
@@ -92,9 +93,11 @@
, counterexample
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Control.Exception (catchJust)
-import Control.Monad
+import Control.Monad (guard, liftM, foldM)
import Data.Attoparsec.Text (Parser, parseOnly)
import Data.List
import qualified Data.Map as M
@@ -154,6 +157,10 @@
maxSpindleRatio :: Double
maxSpindleRatio = 1024.0
+-- | Max memory ratio (random value).
+maxMemoryRatio :: Double
+maxMemoryRatio = 1024.0
+
-- | Max nodes, used just to limit arbitrary instances for smaller
-- opcode definitions (e.g. list of nodes in OpTestDelay).
maxNodes :: Int
diff --git a/test/hs/Test/Ganeti/TestHTools.hs b/test/hs/Test/Ganeti/TestHTools.hs
index e2ec6a5..92fef8d 100644
--- a/test/hs/Test/Ganeti/TestHTools.hs
+++ b/test/hs/Test/Ganeti/TestHTools.hs
@@ -94,6 +94,9 @@
, Types.iPolicyVcpuRatio = maxVcpuRatio -- somewhat random value, high
-- enough to not impact us
, Types.iPolicySpindleRatio = maxSpindleRatio
+ , Types.iPolicyMemoryRatio = 1 -- because there are several test which
+ -- become senseless in case of memory
+ -- over-commitment
}
-- | Default group definition.
diff --git a/test/hs/Test/Ganeti/TestHelper.hs b/test/hs/Test/Ganeti/TestHelper.hs
index 399ad58..01be610 100644
--- a/test/hs/Test/Ganeti/TestHelper.hs
+++ b/test/hs/Test/Ganeti/TestHelper.hs
@@ -39,7 +39,9 @@
, genArbitrary
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import Data.List (stripPrefix, isPrefixOf)
import Data.Maybe (fromMaybe)
import Test.Framework
diff --git a/test/hs/Test/Ganeti/Types.hs b/test/hs/Test/Ganeti/Types.hs
index 12f957a..5ce6dae 100644
--- a/test/hs/Test/Ganeti/Types.hs
+++ b/test/hs/Test/Ganeti/Types.hs
@@ -47,7 +47,9 @@
, genReasonTrail
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import System.Time (ClockTime(..))
import Test.QuickCheck as QuickCheck hiding (Result)
diff --git a/test/hs/Test/Ganeti/Utils.hs b/test/hs/Test/Ganeti/Utils.hs
index af1c5b6..c65db11 100644
--- a/test/hs/Test/Ganeti/Utils.hs
+++ b/test/hs/Test/Ganeti/Utils.hs
@@ -37,13 +37,19 @@
module Test.Ganeti.Utils (testUtils) where
+import Prelude ()
+import Ganeti.Prelude
+
import Test.QuickCheck hiding (Result)
import Test.HUnit
-import Control.Applicative ((<$>), (<*>))
import Data.Char (isSpace)
import qualified Data.Either as Either
+#if MIN_VERSION_base(4,8,0)
+import Data.List hiding (isSubsequenceOf)
+#else
import Data.List
+#endif
import Data.Maybe (listToMaybe)
import qualified Data.Set as S
import System.Time
diff --git a/test/hs/Test/Ganeti/Utils/MultiMap.hs b/test/hs/Test/Ganeti/Utils/MultiMap.hs
index 3656841..02dfc46 100644
--- a/test/hs/Test/Ganeti/Utils/MultiMap.hs
+++ b/test/hs/Test/Ganeti/Utils/MultiMap.hs
@@ -39,7 +39,9 @@
( testUtils_MultiMap
) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
+
import qualified Data.Set as S
import qualified Data.Map as M
diff --git a/test/hs/Test/Ganeti/Utils/Statistics.hs b/test/hs/Test/Ganeti/Utils/Statistics.hs
index f39546b..573769c 100644
--- a/test/hs/Test/Ganeti/Utils/Statistics.hs
+++ b/test/hs/Test/Ganeti/Utils/Statistics.hs
@@ -55,9 +55,7 @@
let original = xs ++ [a] ++ ys
modified = xs ++ [b] ++ ys
with_update =
- getStatisticValue
- $ updateStatistics (getStdDevStatistics $ map SimpleNumber original)
- (SimpleNumber a, SimpleNumber b)
+ getValue $ update (calculate original :: StdDevStat) a b
direct = stdDev modified
in counterexample ("Value computed by update " ++ show with_update
++ " differs too much from correct value " ++ show direct)
diff --git a/test/hs/Test/Ganeti/WConfd/TempRes.hs b/test/hs/Test/Ganeti/WConfd/TempRes.hs
index 768804c..8b8745b 100644
--- a/test/hs/Test/Ganeti/WConfd/TempRes.hs
+++ b/test/hs/Test/Ganeti/WConfd/TempRes.hs
@@ -37,7 +37,8 @@
module Test.Ganeti.WConfd.TempRes (testWConfd_TempRes) where
-import Control.Applicative
+import Prelude ()
+import Ganeti.Prelude
import Test.QuickCheck
diff --git a/test/hs/htest.hs b/test/hs/htest.hs
index 86d193e..ca83366 100644
--- a/test/hs/htest.hs
+++ b/test/hs/htest.hs
@@ -34,7 +34,9 @@
module Main(main) where
-import Data.Monoid (mappend)
+import Prelude ()
+import Ganeti.Prelude
+
import Test.Framework
import System.Environment (getArgs)
import System.Log.Logger
diff --git a/test/hs/shelltests/htools-balancing.test b/test/hs/shelltests/htools-balancing.test
index 383cb8e..95f82c2 100644
--- a/test/hs/shelltests/htools-balancing.test
+++ b/test/hs/shelltests/htools-balancing.test
@@ -133,11 +133,60 @@
>>>/Solution length=1/
>>>=0
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn.data -C
+>>>/gnt-instance migrate -f -n node-01-001 inst-0./
+>>>=0
+
# ...but the --ignore-dynu option should be honored
./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn.data --ignore-dynu
>>>/Cluster is already well balanced/
>>>=0
+# Assuming idle default also gives 0 utilisation
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn.data --idle-default
+>>>/Cluster is already well balanced/
+>>>=0
+
+# Heavy CPU load can even push instances on the more
+# crowded node
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn.data --idle-default --mond --mond-xen --mond-data=$TESTDATA_DIR/dyn1.json -C
+>>>/gnt-instance migrate -f -n node-01-000 inst-1./
+>>>=0
+
+# ...but with default assumption about disk/net/mem fully used the move is in
+# the other direction.
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn.data --mond --mond-xen --mond-data=$TESTDATA_DIR/dyn1.json
+>>>/Solution length=1/
+>>>=0
+
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn.data --mond --mond-xen --mond-data=$TESTDATA_DIR/dyn1.json -C
+>>>/gnt-instance migrate -f -n node-01-001 inst-0./
+>>>=0
+
+# Still, --ignore-dynu overrides everything
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn.data --idle-default --mond --mond-xen --mond-data=$TESTDATA_DIR/dyn1.json --ignore-dynu
+>>>/Cluster is already well balanced/
+>>>=0
+
+# On an overcommitted cluster with small amount of memory, taking memory
+# usage into account can make a difference.
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn2.data --idle-default --mond --mond-data=$TESTDATA_DIR/dyn2.json
+>>>/Solution length=0/
+>>>=0
+
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn2.data --idle-default --mond --mond-data=$TESTDATA_DIR/dyn2.json --mond-kvm-rss -C
+>>>/gnt-instance migrate -f -n node-01-000 inst-1./
+>>>=0
+
+# Depending on weight, instances can move in either direction
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn2.data --idle-default --mond --mond-data=$TESTDATA_DIR/dyn3.json --mond-kvm-rss -C --mem-weight=0.5
+>>>/ 1\. inst-0./
+>>>=0
+
+./test/hs/hbal -t$TESTDATA_DIR/hbal-dyn2.data --idle-default --mond --mond-data=$TESTDATA_DIR/dyn3.json --mond-kvm-rss -C --mem-weight=3.0
+>>>/ 1\. inst-1./
+>>>=0
+
# Test CPU speed is taken into account
./test/hs/hbal -t$TESTDATA_DIR/hbal-cpu-speed.data --ignore-dynu
>>>/inst[12] node-slow:node-fast => node-fast:node-slow/
diff --git a/test/hs/shelltests/htools-hail.test b/test/hs/shelltests/htools-hail.test
index 4725f84..d40b67f 100644
--- a/test/hs/shelltests/htools-hail.test
+++ b/test/hs/shelltests/htools-hail.test
@@ -44,7 +44,7 @@
>>>= 0
./test/hs/hail $TESTDATA_DIR/hail-alloc-invalid-twodisks.json
->>> /"success":false,.*FailDisk: 1/
+>>> /"success":false,.*FailTooSmall: 1/
>>>= 0
# check that hail honors network requirements
@@ -84,12 +84,12 @@
>>>= 0
./test/hs/hail $T/hail-alloc-invalid-twodisks.json.excl-stor
->>> /"success":false,.*FailDisk: 1"/
+>>> /"success":false,.*FailTooSmall: 1"/
>>>= 0
# Same tests with exclusive storage enabled, but no spindles info in instances
./test/hs/hail $T/hail-alloc-drbd.json.fail-excl-stor
->>> /"success":false,.*FailSpindles: 12"/
+>>> /"success":false,.*FailTooSmall: 12"/
>>>= 0
./test/hs/hail $T/hail-reloc-drbd.json.fail-excl-stor
@@ -101,11 +101,11 @@
>>>= 0
./test/hs/hail $T/hail-change-group.json.fail-excl-stor
->>> /"success":true,"info":"Request successful: 1 instances failed to move and 0 were moved successfully",.*FailSpindles: 2"/
+>>> /"success":true,"info":"Request successful: 1 instances failed to move and 0 were moved successfully",.*FailTooSmall: 2"/
>>>= 0
./test/hs/hail $T/hail-alloc-twodisks.json.fail-excl-stor
->>> /"success":false,.*FailSpindles: 1"/
+>>> /"success":false,.*FailTooSmall: 1"/
>>>= 0
# check that hail correctly parses admin state
@@ -165,7 +165,7 @@
>>>= 0
./test/hs/hail $T/hail-alloc-spindles.json.excl-stor
->>> /"success":true,"info":"Request successful: Selected group: group1,.*FailSpindles: 2",.*"result":\["node1"\]/
+>>> /"success":true,"info":"Request successful: Selected group: group1,.*FailTooSmall: 2",.*"result":\["node1"\]/
>>>= 0
# Check that --ignore-soft-errors works and ignores tag errors
@@ -230,6 +230,11 @@
>>> /successes 2, failures 0.*"result":"node-2-2"/
>>>= 0
+# Memory over-commitment test
+./test/hs/hail $TESTDATA_DIR/hail-alloc-memory-over-commitment.json
+>>> /"success":true.*/
+>>>= 0
+
# Check that hail account location tags
./test/hs/hail $TESTDATA_DIR/hail-alloc-nlocation.json
>>> /"success":true,.*,"result":\["node3","node2"\]/
diff --git a/test/hs/shelltests/htools-hbal.test b/test/hs/shelltests/htools-hbal.test
index b7b29d8..f7ce274 100644
--- a/test/hs/shelltests/htools-hbal.test
+++ b/test/hs/shelltests/htools-hbal.test
@@ -92,3 +92,19 @@
node-02 0
node-03 1/
>>>= 0
+
+./test/hs/hbal -t $TESTDATA_DIR/hbal-avoid-disk-moves.data --avoid-disk-moves=1.2
+>>>/Solution length=1/
+>>>= 0
+
+./test/hs/hbal -t $TESTDATA_DIR/hbal-avoid-disk-moves.data --avoid-disk-moves=5
+>>>/Solution length=2/
+>>>= 0
+
+./test/hs/hbal -t $TESTDATA_DIR/hbal-memory-over-commitment.data
+>>>/No solution found/
+>>>= 0
+
+./test/hs/hbal -t $TESTDATA_DIR/hbal-memory-over-commitment-2.data
+>>>/Solution length=1/
+>>>= 0
diff --git a/test/hs/shelltests/htools-hcheck.test b/test/hs/shelltests/htools-hcheck.test
index d5bd0dd..854e045 100644
--- a/test/hs/shelltests/htools-hcheck.test
+++ b/test/hs/shelltests/htools-hcheck.test
@@ -22,3 +22,6 @@
>>>/Cluster is not healthy: False/
>>>= 0
+./test/hs/hcheck -t $TESTDATA_DIR/hsqueeze-underutilized.data --machine-readable
+>>>/HCHECK_INIT_GROUP_0_REDUNDANCY=4/
+>>>= 0
diff --git a/test/hs/shelltests/htools-hspace.test b/test/hs/shelltests/htools-hspace.test
index 80ad64f..0dba25a 100644
--- a/test/hs/shelltests/htools-hspace.test
+++ b/test/hs/shelltests/htools-hspace.test
@@ -26,7 +26,7 @@
>>>=0
# Mixed cluster, half with exclusive storage
-./test/hs/hspace --machine-readable -t $TESTDATA_DIR/hspace-tiered-mixed.data --no-capacity-checks > $T/capacity && sh -c ". $T/capacity && test \"\${HTS_TSPEC}\" = '131072,1048576,4,12=2 131072,1048576,4,10=2 129984,1048320,4,10=2' && test \"\${HTS_ALLOC_INSTANCES}\" = 6 && test \"\${HTS_TRL_SPN_FREE}\" = 0 && test \"\${HTS_FIN_SPN_FREE}\" = 18"
+./test/hs/hspace --machine-readable -t $TESTDATA_DIR/hspace-tiered-mixed.data --no-capacity-checks > $T/capacity && sh -c ". $T/capacity && echo \"\${HTS_TSPEC}\" | grep -q '131072,1048576,4,12=2 .*129984,1048320,4,10=2' && test \"\${HTS_ALLOC_INSTANCES}\" = 6 && test \"\${HTS_TRL_SPN_FREE}\" = 0 && test \"\${HTS_FIN_SPN_FREE}\" = 18"
>>>=0
# Verify that instance policy for disks is adhered to
diff --git a/test/py/cfgupgrade_unittest.py b/test/py/cfgupgrade_unittest.py
index a6dec64..132575a 100755
--- a/test/py/cfgupgrade_unittest.py
+++ b/test/py/cfgupgrade_unittest.py
@@ -56,7 +56,7 @@
"version": constants.CONFIG_VERSION,
"cluster": {
"master_node": "node1-uuid",
- "ipolicy": None,
+ "ipolicy": {},
"default_iallocator_params": {},
"diskparams": {},
"ndparams": {},
@@ -67,13 +67,16 @@
"compression_tools": constants.IEC_DEFAULT_TOOLS,
"enabled_user_shutdown": False,
"data_collectors": {
+ "diagnose": { "active": True, "interval": 5000000 },
"diskstats": { "active": True, "interval": 5000000 },
"drbd": { "active": True, "interval": 5000000 },
+ "kvm-inst-rss": { "active": True, "interval": 5000000 },
"lv": { "active": True, "interval": 5000000 },
"inst-status-xen": { "active": True, "interval": 5000000 },
"cpu-avg-load": { "active": True, "interval": 5000000 },
"xen-cpu-avg-load": { "active": True, "interval": 5000000 },
},
+ "diagnose_data_collector_filename": "",
"ssh_key_type": "dsa",
"ssh_key_bits": 1024,
},
@@ -81,6 +84,7 @@
"disks": {},
"networks": {},
"filters": {},
+ "maintenance": {},
"nodegroups": {},
"nodes": {
"node1-uuid": {
@@ -435,6 +439,19 @@
def testUpgradeFullConfigFrom_2_15(self):
self._TestUpgradeFromFile("cluster_config_2.15.json", False)
+ def testUpgradeFullConfigFrom_2_16(self):
+ self._TestUpgradeFromFile("cluster_config_2.16.json", False)
+
+ def testUpgradeFullConfigFrom_2_17(self):
+ self._TestUpgradeFromFile("cluster_config_2.17.json", False)
+
+ def test_2_17_to_2_16_downgrade(self):
+ self._TestUpgradeFromFile("cluster_config_2.17.json", False)
+ _RunUpgrade(self.tmpdir, False, True, downgrade=True)
+ oldconf = self._LoadConfig()
+ newconf = self._LoadTestDataConfig("cluster_config_2.16.json")
+ self.assertEqual(oldconf, newconf)
+
def testUpgradeCurrent(self):
self._TestSimpleUpgrade(constants.CONFIG_VERSION, False)
@@ -452,7 +469,7 @@
def testDowngradeFullConfig(self):
"""Test for upgrade + downgrade combination."""
# This test can work only with the previous version of a configuration!
- oldconfname = "cluster_config_2.15.json"
+ oldconfname = "cluster_config_2.16.json"
self._TestUpgradeFromFile(oldconfname, False)
_RunUpgrade(self.tmpdir, False, True, downgrade=True)
oldconf = self._LoadTestDataConfig(oldconfname)
diff --git a/test/py/cmdlib/cluster_unittest.py b/test/py/cmdlib/cluster_unittest.py
index 22701d9..1bdac3f 100644
--- a/test/py/cmdlib/cluster_unittest.py
+++ b/test/py/cmdlib/cluster_unittest.py
@@ -2159,7 +2159,9 @@
def setUp(self):
super(TestLUClusterVerifyGroupUpdateNodeInfo, self).setUp()
self.nimg = verify.LUClusterVerifyGroup.NodeImage(uuid=self.master_uuid)
- self.valid_hvresult = {constants.NV_HVINFO: {"memory_free": 1024}}
+ self.valid_hvresult = {constants.NV_HVINFO: {"memory_free": 1024,
+ "memory_total": 4096,
+ "memory_dom0": 3072}}
@withLockedLU
def testInvalidHvNodeResult(self, lu):
@@ -2171,7 +2173,9 @@
@withLockedLU
def testInvalidMemoryFreeHvNodeResult(self, lu):
lu._UpdateNodeInfo(self.master,
- {constants.NV_HVINFO: {"memory_free": "abc"}},
+ {constants.NV_HVINFO: {"memory_free": 'abc',
+ "memory_total": 1024,
+ "memory_dom0": 2048}},
self.nimg, None)
self.mcpu.assertLogContainsRegex(
"node returned invalid nodeinfo, check hypervisor")
diff --git a/test/py/daemon-util_unittest.bash b/test/py/daemon-util_unittest.bash
index 1437713..84fd6f3 100755
--- a/test/py/daemon-util_unittest.bash
+++ b/test/py/daemon-util_unittest.bash
@@ -45,8 +45,8 @@
STOPDAEMONS_LIST="kvmd luxid rapi wconfd confd noded"
if grep -q '^ENABLE_MOND = True' lib/_constants.py; then
- DAEMONS_LIST="$DAEMONS_LIST mond"
- STOPDAEMONS_LIST="mond $STOPDAEMONS_LIST"
+ DAEMONS_LIST="$DAEMONS_LIST mond maintd"
+ STOPDAEMONS_LIST="maintd mond $STOPDAEMONS_LIST"
fi
STOPDAEMONS_LIST="metad $STOPDAEMONS_LIST"
diff --git a/test/py/docs_unittest.py b/test/py/docs_unittest.py
index cf59866..aa8a971 100755
--- a/test/py/docs_unittest.py
+++ b/test/py/docs_unittest.py
@@ -76,6 +76,7 @@
# Very sensitive in nature
opcodes.OpRestrictedCommand,
+ opcodes.OpRepairCommand,
opcodes.OpClusterRenewCrypto,
# Helper opcodes (e.g. submitted by LUs)
diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ganeti.backend_unittest.py
index 897fcba..e737dad 100755
--- a/test/py/ganeti.backend_unittest.py
+++ b/test/py/ganeti.backend_unittest.py
@@ -32,6 +32,7 @@
import collections
import copy
+import time
import mock
import os
import shutil
@@ -424,7 +425,7 @@
return "Executing command '%s' failed" % cmd
-class TestRunRestrictedCmd(unittest.TestCase):
+class TestRunConstrainedCmd(unittest.TestCase):
def setUp(self):
self.tmpdir = tempfile.mkdtemp()
@@ -436,10 +437,10 @@
sleep_fn = testutils.CallCounter(_SleepForRestrictedCmd)
self.assertFalse(os.path.exists(lockfile))
self.assertRaises(backend.RPCFail,
- backend.RunRestrictedCmd, "test",
+ backend.RunConstrainedCmd, "test",
_lock_timeout=NotImplemented,
- _lock_file=lockfile,
- _path=NotImplemented,
+ lock_file=lockfile,
+ path=NotImplemented,
_sleep_fn=sleep_fn,
_prepare_fn=NotImplemented,
_runcmd_fn=NotImplemented,
@@ -452,14 +453,14 @@
result = False
try:
- backend.RunRestrictedCmd("test22717",
- _lock_timeout=0.1,
- _lock_file=lockfile,
- _path=NotImplemented,
- _sleep_fn=sleep_fn,
- _prepare_fn=NotImplemented,
- _runcmd_fn=NotImplemented,
- _enabled=True)
+ backend.RunConstrainedCmd("test22717",
+ _lock_timeout=0.1,
+ lock_file=lockfile,
+ path=NotImplemented,
+ _sleep_fn=sleep_fn,
+ _prepare_fn=NotImplemented,
+ _runcmd_fn=NotImplemented,
+ _enabled=True)
except backend.RPCFail, err:
assert str(err) == _GenericRestrictedCmdError("test22717"), \
"Did not fail with generic error message"
@@ -491,11 +492,11 @@
prepare_fn = testutils.CallCounter(self._PrepareRaisingException)
try:
- backend.RunRestrictedCmd("test23122",
- _lock_timeout=1.0, _lock_file=lockfile,
- _path=NotImplemented, _runcmd_fn=NotImplemented,
- _sleep_fn=sleep_fn, _prepare_fn=prepare_fn,
- _enabled=True)
+ backend.RunConstrainedCmd("test23122",
+ _lock_timeout=1.0, lock_file=lockfile,
+ path=NotImplemented, _runcmd_fn=NotImplemented,
+ _sleep_fn=sleep_fn, _prepare_fn=prepare_fn,
+ _enabled=True)
except backend.RPCFail, err:
self.assertEqual(str(err), _GenericRestrictedCmdError("test23122"))
else:
@@ -516,11 +517,11 @@
prepare_fn = testutils.CallCounter(self._PrepareFails)
try:
- backend.RunRestrictedCmd("test29327",
- _lock_timeout=1.0, _lock_file=lockfile,
- _path=NotImplemented, _runcmd_fn=NotImplemented,
- _sleep_fn=sleep_fn, _prepare_fn=prepare_fn,
- _enabled=True)
+ backend.RunConstrainedCmd("test29327",
+ _lock_timeout=1.0, lock_file=lockfile,
+ path=NotImplemented, _runcmd_fn=NotImplemented,
+ _sleep_fn=sleep_fn, _prepare_fn=prepare_fn,
+ _enabled=True)
except backend.RPCFail, err:
self.assertEqual(str(err), _GenericRestrictedCmdError("test29327"))
else:
@@ -533,11 +534,11 @@
def _SuccessfulPrepare(path, cmd):
return (True, utils.PathJoin(path, cmd))
- def testRunCmdFails(self):
+ def testRunConstrainedCmdFails(self):
lockfile = utils.PathJoin(self.tmpdir, "lock")
def fn(args, env=NotImplemented, reset_env=NotImplemented,
- postfork_fn=NotImplemented):
+ postfork_fn=NotImplemented, input_fd=NotImplemented):
self.assertEqual(args, [utils.PathJoin(self.tmpdir, "test3079")])
self.assertEqual(env, {})
self.assertTrue(reset_env)
@@ -567,11 +568,11 @@
runcmd_fn = testutils.CallCounter(fn)
try:
- backend.RunRestrictedCmd("test3079",
- _lock_timeout=1.0, _lock_file=lockfile,
- _path=self.tmpdir, _runcmd_fn=runcmd_fn,
- _sleep_fn=sleep_fn, _prepare_fn=prepare_fn,
- _enabled=True)
+ backend.RunConstrainedCmd("test3079",
+ _lock_timeout=1.0, lock_file=lockfile,
+ path=self.tmpdir, _runcmd_fn=runcmd_fn,
+ _sleep_fn=sleep_fn, _prepare_fn=prepare_fn,
+ _enabled=True)
except backend.RPCFail, err:
self.assertTrue(str(err).startswith("Restricted command 'test3079'"
" failed:"))
@@ -584,11 +585,11 @@
self.assertEqual(prepare_fn.Count(), 1)
self.assertEqual(runcmd_fn.Count(), 1)
- def testRunCmdSucceeds(self):
+ def testRunConstrainedCmdSucceeds(self):
lockfile = utils.PathJoin(self.tmpdir, "lock")
def fn(args, env=NotImplemented, reset_env=NotImplemented,
- postfork_fn=NotImplemented):
+ postfork_fn=NotImplemented, input_fd=NotImplemented):
self.assertEqual(args, [utils.PathJoin(self.tmpdir, "test5667")])
self.assertEqual(env, {})
self.assertTrue(reset_env)
@@ -605,12 +606,12 @@
prepare_fn = testutils.CallCounter(self._SuccessfulPrepare)
runcmd_fn = testutils.CallCounter(fn)
- result = backend.RunRestrictedCmd("test5667",
- _lock_timeout=1.0, _lock_file=lockfile,
- _path=self.tmpdir, _runcmd_fn=runcmd_fn,
- _sleep_fn=sleep_fn,
- _prepare_fn=prepare_fn,
- _enabled=True)
+ result = backend.RunConstrainedCmd("test5667",
+ _lock_timeout=1.0, lock_file=lockfile,
+ path=self.tmpdir, _runcmd_fn=runcmd_fn,
+ _sleep_fn=sleep_fn,
+ _prepare_fn=prepare_fn,
+ _enabled=True)
self.assertEqual(result, "stdout14463")
self.assertEqual(sleep_fn.Count(), 0)
@@ -619,14 +620,14 @@
def testCommandsDisabled(self):
try:
- backend.RunRestrictedCmd("test",
- _lock_timeout=NotImplemented,
- _lock_file=NotImplemented,
- _path=NotImplemented,
- _sleep_fn=NotImplemented,
- _prepare_fn=NotImplemented,
- _runcmd_fn=NotImplemented,
- _enabled=False)
+ backend.RunConstrainedCmd("test",
+ _lock_timeout=NotImplemented,
+ lock_file=NotImplemented,
+ path=NotImplemented,
+ _sleep_fn=NotImplemented,
+ _prepare_fn=NotImplemented,
+ _runcmd_fn=NotImplemented,
+ _enabled=False)
except backend.RPCFail, err:
self.assertEqual(str(err),
"Restricted commands disabled at configure time")
@@ -1033,6 +1034,11 @@
self._ssh_replace_name_by_uuid_mock.side_effect = \
self._ssh_file_manager.ReplaceNameByUuid
+ self._time_sleep_patcher = testutils \
+ .patch_object(time, "sleep")
+ self._time_sleep_mock = \
+ self._time_sleep_patcher.start()
+
self.noded_cert_file = testutils.TestDataFilename("cert1.pem")
self._SetupTestData()
@@ -1045,6 +1051,7 @@
self._ssh_remove_public_key_patcher.stop()
self._ssh_query_pub_key_file_patcher.stop()
self._ssh_replace_name_by_uuid_patcher.stop()
+ self._time_sleep_patcher.stop()
self._TearDownTestData()
def _SetupTestData(self, number_of_nodes=15, number_of_pot_mcs=5,
@@ -1102,10 +1109,9 @@
key_file=self._pub_key_file)
backend._GenerateNodeSshKey(
- test_node_uuid, test_node_name,
+ test_node_name,
self._ssh_file_manager.GetSshPortMap(self._SSH_PORT),
"rsa", 2048,
- pub_key_file=self._pub_key_file,
ssconf_store=self._ssconf_mock,
noded_cert_file=self.noded_cert_file,
run_cmd_fn=self._run_cmd_mock)
@@ -1959,6 +1965,127 @@
self.assertTrue([error_msg for (node, error_msg) in error_msgs
if node == node_name])
+ def _MockReadRemoteSshPubKey(self, pub_key_file, node, cluster_name, port,
+ ask_key, strict_host_check):
+ return self._ssh_file_manager.GetKeyOfNode(self._master_node)
+
+
+ def _MockReadLocalSshPubKeys(self, key_types, suffix=""):
+ return [self._ssh_file_manager.GetKeyOfNode(self._master_node)]
+
+ def _setUpRenewCrypto(self):
+ """Preparations only needed for the renew-crypto unittests."""
+ self.tmpdir = tempfile.mkdtemp()
+ self._dsa_keyfile = os.path.join(self.tmpdir, "id_dsa.pub")
+ self._rsa_keyfile = os.path.join(self.tmpdir, "id_rsa.pub")
+
+ self._ssh_get_all_user_files_patcher = testutils \
+ .patch_object(ssh, "GetAllUserFiles")
+ self._ssh_get_all_user_files_mock = \
+ self._ssh_get_all_user_files_patcher.start()
+ self._ssh_get_all_user_files_mock.return_value = (None,
+ {constants.SSHK_DSA: (None, self._dsa_keyfile),
+ constants.SSHK_RSA: (None, self._rsa_keyfile)})
+
+ self._ssh_read_remote_ssh_pub_key_patcher = testutils \
+ .patch_object(ssh, "ReadRemoteSshPubKey")
+ self._ssh_read_remote_ssh_pub_key_mock = \
+ self._ssh_read_remote_ssh_pub_key_patcher.start()
+ self._ssh_read_remote_ssh_pub_key_mock.side_effect = \
+ self._MockReadRemoteSshPubKey
+
+ self._ssh_read_local_ssh_pub_keys_patcher = testutils \
+ .patch_object(ssh, "ReadLocalSshPubKeys")
+ self._ssh_read_local_ssh_pub_keys_mock = \
+ self._ssh_read_local_ssh_pub_keys_patcher.start()
+ self._ssh_read_local_ssh_pub_keys_mock.side_effect = \
+ self._MockReadLocalSshPubKeys
+
+ self._ssh_replace_ssh_keys_patcher = testutils \
+ .patch_object(ssh, "ReplaceSshKeys")
+ self._ssh_replace_ssh_keys_mock = \
+ self._ssh_replace_ssh_keys_patcher.start()
+
+ def _tearDownRenewCrypto(self):
+ self._ssh_get_all_user_files_patcher.stop()
+ self._ssh_read_remote_ssh_pub_key_patcher.stop()
+ self._ssh_read_local_ssh_pub_keys_patcher.stop()
+ self._ssh_replace_ssh_keys_patcher.stop()
+
+ def testRenewCrypto(self):
+ self._setUpRenewCrypto()
+
+ node_uuids = self._ssh_file_manager.GetAllNodeUuids()
+ node_names = self._ssh_file_manager.GetAllNodeNames()
+
+ old_ssh_file_manager = copy.deepcopy(self._ssh_file_manager)
+
+ backend.RenewSshKeys(node_uuids, node_names,
+ self._master_candidate_uuids,
+ self._potential_master_candidates,
+ constants.SSHK_DSA, constants.SSHK_DSA,
+ constants.SSH_DEFAULT_KEY_BITS,
+ ganeti_pub_keys_file=self._pub_key_file,
+ ssconf_store=self._ssconf_mock,
+ noded_cert_file=self.noded_cert_file,
+ run_cmd_fn=self._run_cmd_mock)
+
+ self._tearDownRenewCrypto()
+
+ self.assertEqual(set(old_ssh_file_manager.GetAllNodeNames()),
+ set(self._ssh_file_manager.GetAllNodeNames()))
+
+ for node_name in self._ssh_file_manager.GetAllNodeNames():
+ self.assertNotEqual(self._ssh_file_manager.GetKeyOfNode(node_name),
+ old_ssh_file_manager.GetKeyOfNode(node_name))
+
+
+class TestRemoveSshKeyFromPublicKeyFile(testutils.GanetiTestCase):
+
+ def setUp(self):
+ testutils.GanetiTestCase.setUp(self)
+ self._ssconf_mock = mock.Mock()
+ self._ssconf_mock.GetNodeList = mock.Mock()
+ self._tmpdir = tempfile.mkdtemp()
+ self._pub_keys_file = os.path.join(self._tmpdir, "pub_keys_file")
+
+ def testValidRemoval(self):
+ key = "myKey"
+ name = "myName"
+ ssh.AddPublicKey(name, key, key_file=self._pub_keys_file)
+ self._ssconf_mock.GetNodeList.return_value = \
+ ["myOtherNode1", "myOtherNode2"]
+
+ backend.RemoveSshKeyFromPublicKeyFile(
+ name, pub_key_file=self._pub_keys_file,
+ ssconf_store=self._ssconf_mock)
+
+ result = ssh.QueryPubKeyFile([name], key_file=self._pub_keys_file)
+ self.assertEqual({}, result)
+
+ def testStillClusterNode(self):
+ """Tests the safety check to only remove keys of obsolete nodes."""
+ key = "myKey"
+ name = "myName"
+ ssh.AddPublicKey(name, key, key_file=self._pub_keys_file)
+ self._ssconf_mock.GetNodeList.return_value = ["myName", "myOtherNode"]
+
+ self.assertRaises(
+ errors.SshUpdateError,
+ backend.RemoveSshKeyFromPublicKeyFile,
+ name, pub_key_file=self._pub_keys_file,
+ ssconf_store=self._ssconf_mock)
+
+ def testNoKey(self):
+ name = "myName"
+ # 'clear' file to make sure it exists.
+ ssh.ClearPubKeyFile(key_file=self._pub_keys_file)
+ self._ssconf_mock.GetNodeList.return_value = ["myOtherNode"]
+
+ backend.RemoveSshKeyFromPublicKeyFile(
+ name, pub_key_file=self._pub_keys_file,
+ ssconf_store=self._ssconf_mock)
+
class TestVerifySshSetup(testutils.GanetiTestCase):
diff --git a/test/py/ganeti.client.gnt_cluster_unittest.py b/test/py/ganeti.client.gnt_cluster_unittest.py
index 38bda23..2c827a7 100755
--- a/test/py/ganeti.client.gnt_cluster_unittest.py
+++ b/test/py/ganeti.client.gnt_cluster_unittest.py
@@ -409,7 +409,7 @@
self._setUpFakeKeys()
self._ssh_read_remote_ssh_pub_keys_patcher = testutils \
- .patch_object(ssh, "ReadRemoteSshPubKeys")
+ .patch_object(ssh, "ReadRemoteSshPubKey")
self._ssh_read_remote_ssh_pub_keys_mock = \
self._ssh_read_remote_ssh_pub_keys_patcher.start()
self._ssh_read_remote_ssh_pub_keys_mock.return_value = self._SOME_KEY_DICT
diff --git a/test/py/ganeti.masterd.iallocator_unittest.py b/test/py/ganeti.masterd.iallocator_unittest.py
index d92e572..5fb18a8 100755
--- a/test/py/ganeti.masterd.iallocator_unittest.py
+++ b/test/py/ganeti.masterd.iallocator_unittest.py
@@ -101,6 +101,8 @@
class _FakeConfigWithNdParams:
def GetNdParams(self, _):
return None
+ def GetFilledHvStateParams(self, _):
+ return None
class TestComputeBasicNodeData(unittest.TestCase):
@@ -112,6 +114,7 @@
self.assertEqual(self.fn({}), {})
def testSimple(self):
+ self.maxDiff = None
node1 = objects.Node(name="node1",
primary_ip="192.0.2.1",
secondary_ip="192.0.2.2",
@@ -151,6 +154,7 @@
"master_capable": True,
"vm_capable": False,
"ndparams": None,
+ "hv_state": None,
},
"node2": {
"tags": [],
@@ -163,6 +167,7 @@
"master_capable": False,
"vm_capable": True,
"ndparams": None,
+ "hv_state": None,
},
})
diff --git a/test/py/ganeti.ssh_unittest.py b/test/py/ganeti.ssh_unittest.py
index 265adec..661245b 100755
--- a/test/py/ganeti.ssh_unittest.py
+++ b/test/py/ganeti.ssh_unittest.py
@@ -488,36 +488,203 @@
self.assertTrue(os.path.exists(self.priv_filename + suffix + ".pub"))
-class TestDetermineKeyBits():
+class TestDetermineKeyBits(testutils.GanetiTestCase):
def testCompleteness(self):
- self.assertEquals(constants.SSHK_ALL, ssh.SSH_KEY_VALID_BITS.keys())
+ self.assertEquals(constants.SSHK_ALL,
+ frozenset(ssh.SSH_KEY_VALID_BITS.keys()))
def testAdoptDefault(self):
- self.assertEquals(2048, DetermineKeyBits("rsa", None, None, None))
- self.assertEquals(1024, DetermineKeyBits("dsa", None, None, None))
+ self.assertEquals(2048, ssh.DetermineKeyBits("rsa", None, None, None))
+ self.assertEquals(1024, ssh.DetermineKeyBits("dsa", None, None, None))
def testAdoptOldKeySize(self):
- self.assertEquals(4098, DetermineKeyBits("rsa", None, "rsa", 4098))
- self.assertEquals(2048, DetermineKeyBits("rsa", None, "dsa", 1024))
+ self.assertEquals(4098, ssh.DetermineKeyBits("rsa", None, "rsa", 4098))
+ self.assertEquals(2048, ssh.DetermineKeyBits("rsa", None, "dsa", 1024))
def testDsaSpecificValues(self):
- self.assertRaises(errors.OpPrereqError, DetermineKeyBits, "dsa", 2048,
+ self.assertRaises(errors.OpPrereqError, ssh.DetermineKeyBits, "dsa", 2048,
None, None)
- self.assertRaises(errors.OpPrereqError, DetermineKeyBits, "dsa", 512,
+ self.assertRaises(errors.OpPrereqError, ssh.DetermineKeyBits, "dsa", 512,
None, None)
- self.assertEquals(1024, DetermineKeyBits("dsa", None, None, None))
+ self.assertEquals(1024, ssh.DetermineKeyBits("dsa", None, None, None))
def testEcdsaSpecificValues(self):
- self.assertRaises(errors.OpPrereqError, DetermineKeyBits, "ecdsa", 2048,
+ self.assertRaises(errors.OpPrereqError, ssh.DetermineKeyBits, "ecdsa", 2048,
None, None)
for b in [256, 384, 521]:
- self.assertEquals(b, DetermineKeyBits("ecdsa", b, None, None))
+ self.assertEquals(b, ssh.DetermineKeyBits("ecdsa", b, None, None))
def testRsaSpecificValues(self):
- self.assertRaises(errors.OpPrereqError, DetermineKeyBits, "dsa", 766,
+ self.assertRaises(errors.OpPrereqError, ssh.DetermineKeyBits, "dsa", 766,
None, None)
for b in [768, 769, 2048, 2049, 4096]:
- self.assertEquals(b, DetermineKeyBits("rsa", b, None, None))
+ self.assertEquals(b, ssh.DetermineKeyBits("rsa", b, None, None))
+
+
+class TestManageLocalSshPubKeys(testutils.GanetiTestCase):
+ """Test class for several methods handling local SSH keys.
+
+ Methods covered are:
+ - GetSshKeyFilenames
+ - GetSshPubKeyFilename
+ - ReplaceSshKeys
+ - ReadLocalSshPubKeys
+
+  These methods are covered in one test, because the preparation for
+  their tests is identical and thus can be reused.
+
+ """
+ VISIBILITY_PRIVATE = "private"
+ VISIBILITY_PUBLIC = "public"
+ VISIBILITIES = frozenset([VISIBILITY_PRIVATE, VISIBILITY_PUBLIC])
+
+ def _GenerateKey(self, key_id, visibility):
+ assert visibility in self.VISIBILITIES
+ return "I am the %s %s SSH key." % (visibility, key_id)
+
+ def _GetKeyPath(self, key_file_basename):
+ return os.path.join(self.tmpdir, key_file_basename)
+
+ def _SetUpKeys(self):
+ """Creates a fake SSH key for each type and with/without suffix."""
+ self._key_file_dict = {}
+ for key_type in constants.SSHK_ALL:
+ for suffix in ["", self._suffix]:
+ pub_key_filename = "id_%s%s.pub" % (key_type, suffix)
+ priv_key_filename = "id_%s%s" % (key_type, suffix)
+
+ pub_key_path = self._GetKeyPath(pub_key_filename)
+ priv_key_path = self._GetKeyPath(priv_key_filename)
+
+ utils.WriteFile(
+ priv_key_path,
+ data=self._GenerateKey(key_type + suffix, self.VISIBILITY_PRIVATE))
+
+ utils.WriteFile(
+ pub_key_path,
+ data=self._GenerateKey(key_type + suffix, self.VISIBILITY_PUBLIC))
+
+ # Fill key dict only for non-suffix keys
+ # (as this is how it will be in the code)
+ if not suffix:
+ self._key_file_dict[key_type] = \
+ (priv_key_path, pub_key_path)
+
+ def setUp(self):
+ testutils.GanetiTestCase.setUp(self)
+ self.tmpdir = tempfile.mkdtemp()
+ self._suffix = "_suffix"
+ self._SetUpKeys()
+
+ def tearDown(self):
+ shutil.rmtree(self.tmpdir)
+
+ @testutils.patch_object(ssh, "GetAllUserFiles")
+ def testReadAllPublicKeyFiles(self, mock_getalluserfiles):
+ mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+ keys = ssh.ReadLocalSshPubKeys([], suffix="")
+
+ self.assertEqual(len(constants.SSHK_ALL), len(keys))
+ for key_type in constants.SSHK_ALL:
+ self.assertTrue(
+ self._GenerateKey(key_type, self.VISIBILITY_PUBLIC) in keys)
+
+ @testutils.patch_object(ssh, "GetAllUserFiles")
+ def testReadOnePublicKeyFile(self, mock_getalluserfiles):
+ mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+ keys = ssh.ReadLocalSshPubKeys([constants.SSHK_DSA], suffix="")
+
+ self.assertEqual(1, len(keys))
+ self.assertEqual(
+ self._GenerateKey(constants.SSHK_DSA, self.VISIBILITY_PUBLIC),
+ keys[0])
+
+ @testutils.patch_object(ssh, "GetAllUserFiles")
+ def testReadPublicKeyFilesWithSuffix(self, mock_getalluserfiles):
+ key_types = [constants.SSHK_DSA, constants.SSHK_ECDSA]
+
+ mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+ keys = ssh.ReadLocalSshPubKeys(key_types, suffix=self._suffix)
+
+ self.assertEqual(2, len(keys))
+ for key_id in [key_type + self._suffix for key_type in key_types]:
+ self.assertTrue(
+ self._GenerateKey(key_id, self.VISIBILITY_PUBLIC) in keys)
+
+ @testutils.patch_object(ssh, "GetAllUserFiles")
+ def testGetSshKeyFilenames(self, mock_getalluserfiles):
+ mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+ priv, pub = ssh.GetSshKeyFilenames(constants.SSHK_DSA)
+
+ self.assertEqual("id_dsa", os.path.basename(priv))
+ self.assertNotEqual("id_dsa", priv)
+ self.assertEqual("id_dsa.pub", os.path.basename(pub))
+ self.assertNotEqual("id_dsa.pub", pub)
+
+ @testutils.patch_object(ssh, "GetAllUserFiles")
+ def testGetSshKeyFilenamesWithSuffix(self, mock_getalluserfiles):
+ mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+ priv, pub = ssh.GetSshKeyFilenames(constants.SSHK_RSA, suffix=self._suffix)
+
+ self.assertEqual("id_rsa_suffix", os.path.basename(priv))
+ self.assertNotEqual("id_rsa_suffix", priv)
+ self.assertEqual("id_rsa_suffix.pub", os.path.basename(pub))
+ self.assertNotEqual("id_rsa_suffix.pub", pub)
+
+ @testutils.patch_object(ssh, "GetAllUserFiles")
+ def testGetPubSshKeyFilename(self, mock_getalluserfiles):
+ mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+ pub = ssh.GetSshPubKeyFilename(constants.SSHK_DSA)
+ pub_suffix = ssh.GetSshPubKeyFilename(
+ constants.SSHK_DSA, suffix=self._suffix)
+
+ self.assertEqual("id_dsa.pub", os.path.basename(pub))
+ self.assertNotEqual("id_dsa.pub", pub)
+ self.assertEqual("id_dsa_suffix.pub", os.path.basename(pub_suffix))
+ self.assertNotEqual("id_dsa_suffix.pub", pub_suffix)
+
+ @testutils.patch_object(ssh, "GetAllUserFiles")
+ def testReplaceSshKeys(self, mock_getalluserfiles):
+ """Replace SSH keys without suffixes.
+
+ Note: usually it does not really make sense to replace the DSA key
+ by the RSA key. This is just to test the function without suffixes.
+
+ """
+ mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+ ssh.ReplaceSshKeys(constants.SSHK_RSA, constants.SSHK_DSA)
+
+ priv_key = utils.ReadFile(self._key_file_dict[constants.SSHK_DSA][0])
+ pub_key = utils.ReadFile(self._key_file_dict[constants.SSHK_DSA][1])
+
+ self.assertEqual("I am the private rsa SSH key.", priv_key)
+ self.assertEqual("I am the public rsa SSH key.", pub_key)
+
+ @testutils.patch_object(ssh, "GetAllUserFiles")
+ def testReplaceSshKeysBySuffixedKeys(self, mock_getalluserfiles):
+ """Replace SSH keys with keys from suffixed files.
+
+    Note: here the DSA key is replaced by its own suffixed variant. This
+    is just to test the function with a suffixed source key.
+
+ """
+ mock_getalluserfiles.return_value = (None, self._key_file_dict)
+
+ ssh.ReplaceSshKeys(constants.SSHK_DSA, constants.SSHK_DSA,
+ src_key_suffix=self._suffix)
+
+ priv_key = utils.ReadFile(self._key_file_dict[constants.SSHK_DSA][0])
+ pub_key = utils.ReadFile(self._key_file_dict[constants.SSHK_DSA][1])
+
+ self.assertEqual("I am the private dsa_suffix SSH key.", priv_key)
+ self.assertEqual("I am the public dsa_suffix SSH key.", pub_key)
if __name__ == "__main__":
diff --git a/test/py/ganeti.utils.log_unittest.py b/test/py/ganeti.utils.log_unittest.py
index a5d98e9..c568b96 100755
--- a/test/py/ganeti.utils.log_unittest.py
+++ b/test/py/ganeti.utils.log_unittest.py
@@ -204,70 +204,5 @@
self.assertTrue(utils.ReadFile(logfile2).endswith("This is a test\n"))
-class TestSetupToolLogging(unittest.TestCase):
- def test(self):
- error_name = logging.getLevelName(logging.ERROR)
- warn_name = logging.getLevelName(logging.WARNING)
- info_name = logging.getLevelName(logging.INFO)
- debug_name = logging.getLevelName(logging.DEBUG)
-
- for debug in [False, True]:
- for verbose in [False, True]:
- logger = logging.Logger("TestLogger")
- buf = StringIO()
-
- utils.SetupToolLogging(debug, verbose, _root_logger=logger, _stream=buf)
-
- logger.error("level=error")
- logger.warning("level=warning")
- logger.info("level=info")
- logger.debug("level=debug")
-
- lines = buf.getvalue().splitlines()
-
- self.assertTrue(compat.all(line.count(":") == 3 for line in lines))
-
- messages = [line.split(":", 3)[-1].strip() for line in lines]
-
- if debug:
- self.assertEqual(messages, [
- "%s level=error" % error_name,
- "%s level=warning" % warn_name,
- "%s level=info" % info_name,
- "%s level=debug" % debug_name,
- ])
- elif verbose:
- self.assertEqual(messages, [
- "%s level=error" % error_name,
- "%s level=warning" % warn_name,
- "%s level=info" % info_name,
- ])
- else:
- self.assertEqual(messages, [
- "level=error",
- "level=warning",
- ])
-
- def testThreadName(self):
- thread_name = threading.currentThread().getName()
-
- for enable_threadname in [False, True]:
- logger = logging.Logger("TestLogger")
- buf = StringIO()
-
- utils.SetupToolLogging(True, True, threadname=enable_threadname,
- _root_logger=logger, _stream=buf)
-
- logger.debug("test134042376")
-
- lines = buf.getvalue().splitlines()
- self.assertEqual(len(lines), 1)
-
- if enable_threadname:
- self.assertTrue((" %s " % thread_name) in lines[0])
- else:
- self.assertTrue(thread_name not in lines[0])
-
-
if __name__ == "__main__":
testutils.GanetiTestProgram()
diff --git a/test/py/ganeti.utils.retry_unittest.py b/test/py/ganeti.utils.retry_unittest.py
index f8c5daa..93638cd 100755
--- a/test/py/ganeti.utils.retry_unittest.py
+++ b/test/py/ganeti.utils.retry_unittest.py
@@ -30,6 +30,8 @@
"""Script for testing ganeti.utils.retry"""
+import mock
+import time
import unittest
from ganeti import constants
@@ -205,5 +207,74 @@
self.assertEqual(self.called, 3)
+class TestRetryByNumberOfTimes(testutils.GanetiTestCase):
+
+ def setUp(self):
+ testutils.GanetiTestCase.setUp(self)
+
+ def testSuccessOnFirst(self):
+ test_fn = mock.Mock()
+ utils.RetryByNumberOfTimes(5, 0, Exception, test_fn)
+ test_fn.assert_called_once()
+
+ def testSuccessOnFirstWithArgs(self):
+ test_fn = mock.Mock()
+ utils.RetryByNumberOfTimes(5, 0, Exception, test_fn,
+ "arg1", "arg2", kwarg1_key="kwarg1_value", kwarg2_key="kwarg2_value")
+ test_fn.assert_called_with(
+ "arg1", "arg2", kwarg1_key="kwarg1_value", kwarg2_key="kwarg2_value")
+
+ def testSuccessAtSomePoint(self):
+ self.succeed_after_try = 2
+ self.num_try = 0
+ self.max_tries = 5
+
+ def test_fn():
+ self.num_try +=1
+ if self.num_try <= self.succeed_after_try:
+ raise errors.OpExecError("I fail!")
+ else:
+ return "I succeed."
+
+ utils.RetryByNumberOfTimes(self.max_tries, 0, Exception, test_fn)
+
+ def testFailAllTries(self):
+ self.max_tries = 5
+
+ def test_fn():
+ raise errors.OpExecError("I fail!")
+
+ self.assertRaises(Exception, utils.RetryByNumberOfTimes, self.max_tries,
+ 0, Exception, test_fn)
+
+ @testutils.patch_object(time, "sleep")
+ def testBackoffZero(self, mock_sleep):
+ self.max_tries = 5
+
+ def test_fn():
+ raise errors.OpExecError("I fail!")
+
+ self.assertRaises(Exception, utils.RetryByNumberOfTimes, self.max_tries,
+ backoff=0, exception_class=Exception, fn=test_fn)
+ for call in mock_sleep.mock_calls:
+ self.assertEqual(mock.call(0), call)
+
+ @testutils.patch_object(time, "sleep")
+ def testBackoffPositive(self, mock_sleep):
+ self.max_tries = 5
+
+ def test_fn():
+ raise errors.OpExecError("I fail!")
+
+ backoff = 3
+ self.assertRaises(Exception, utils.RetryByNumberOfTimes, self.max_tries,
+ backoff=backoff, exception_class=Exception, fn=test_fn)
+
+ expected_calls = [3, 6, 12, 24, 48]
+ for call_idx in range(len(mock_sleep.mock_calls)):
+ self.assertEqual(mock.call(expected_calls[call_idx]),
+ mock_sleep.mock_calls[call_idx])
+
+
if __name__ == "__main__":
testutils.GanetiTestProgram()
diff --git a/test/py/testutils/config_mock.py b/test/py/testutils/config_mock.py
index 6dbdbda..473bede 100644
--- a/test/py/testutils/config_mock.py
+++ b/test/py/testutils/config_mock.py
@@ -64,6 +64,7 @@
# pylint: disable=R0904
+# pylint: disable=W0102
class ConfigMock(config.ConfigWriter):
"""A mocked cluster configuration with added methods for easy customization.
@@ -109,7 +110,7 @@
ndparams=None,
diskparams=None,
ipolicy=None,
- hv_state_static=None,
+ hv_state_static={},
disk_state_static=None,
alloc_policy=None,
networks=None):
@@ -160,7 +161,7 @@
ndparams=None,
powered=True,
hv_state=None,
- hv_state_static=None,
+ hv_state_static={},
disk_state=None,
disk_state_static=None):
"""Add a new L{objects.Node} to the cluster configuration
diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py
index a38304d..e700e47 100644
--- a/test/py/testutils_ssh.py
+++ b/test/py/testutils_ssh.py
@@ -183,6 +183,14 @@
"""
return self._all_node_data.keys()
+ def GetAllNodeUuids(self):
+ """Returns all node UUIDs of the cluster.
+
+ @rtype: list of str
+ @returns: list of all node UUIDs
+ """
+ return [node.uuid for node in self._all_node_data.values()]
+
def GetAllPotentialMasterCandidateNodeNames(self):
return [name for name, node_info
in self._all_node_data.items()
@@ -281,12 +289,25 @@
def GetAuthorizedKeysOfNode(self, node):
"""Returns the authorized keys of the given node.
+ @type node: string
+ @param node: name of the node
@rtype: list of str
@returns: a list of authorized keys that are stored on that node
"""
return self._authorized_keys[node]
+ def GetKeyOfNode(self, node):
+ """Returns the SSH key of the given node.
+
+ @type node: string
+ @param node: name of the node
+ @rtype: string
+ @returns: the SSH key of the node
+
+ """
+ return self._all_node_data[node].key
+
def SetOrAddNode(self, name, uuid, key, pot_mc, mc, master):
"""Adds a new node to the state of the file manager.
@@ -508,8 +529,47 @@
if constants.SSHS_SSH_PUBLIC_KEYS in data:
instructions_pub = data[constants.SSHS_SSH_PUBLIC_KEYS]
self._HandlePublicKeys(instructions_pub, node)
+ if constants.SSHS_GENERATE in data:
+ instructions_generate = data[constants.SSHS_GENERATE]
+ self._GenerateNewKey(instructions_generate, node)
# pylint: enable=W0613
+ def _GenerateNewKey(self, instructions_generate, node):
+ """Generates a new key for the given node.
+
+ Note that this is a very rudimentary generation of a new key. The key is
+ always generated with the same pattern, starting with 'new_key'. That
+ means if you run it twice, it will actually produce the same key. However,
+ for what we want to test, this is sufficient.
+ The 'suffix' instruction is also ignored and the key is directly overridden.
+ This works so far, but simplifies the tests a bit. It might be extended
+ in case it becomes necessary.
+
+ @type instructions_generate: tuple of (string, integer, string)
+ @param instructions_generate: an instructions tuple for generating a new
+ SSH key. This has to comply with the C{_DATA_CHECK} description in
+ C{ssh_update.py}.
+ @type node: string
+ @param node: name of node
+ """
+ (key_type, key_bits, suffix) = instructions_generate
+ assert key_type in constants.SSHK_ALL
+ assert key_bits > 0
+ assert isinstance(suffix, str)
+
+ new_key = "new_key_%s" % node
+ old_node_data = self._all_node_data[node]
+
+ new_node_data = self._NodeInfo(
+ uuid=old_node_data.uuid,
+ key=new_key,
+ is_potential_master_candidate=old_node_data
+ .is_potential_master_candidate,
+ is_master_candidate=old_node_data.is_master_candidate,
+ is_master=old_node_data.is_master)
+
+ self._all_node_data[node] = new_node_data
+
def _EnsureAuthKeyFile(self, file_node_name):
if file_node_name not in self._authorized_keys:
self._authorized_keys[file_node_name] = set()
diff --git a/tools/cluster-merge b/tools/cluster-merge
index 926b705..8af20df 100755
--- a/tools/cluster-merge
+++ b/tools/cluster-merge
@@ -807,7 +807,9 @@
(options, args) = parser.parse_args()
- utils.SetupToolLogging(options.debug, options.verbose)
+ utils.SetupToolLogging(
+ options.debug, options.verbose,
+ toolname=os.path.splitext(os.path.basename(__file__))[0])
if not args:
parser.error("No clusters specified")
diff --git a/tools/move-instance b/tools/move-instance
index 8913f62..32474d5 100755
--- a/tools/move-instance
+++ b/tools/move-instance
@@ -1033,7 +1033,10 @@
"""
(parser, options, args) = ParseOptions()
- utils.SetupToolLogging(options.debug, options.verbose, threadname=True)
+ utils.SetupToolLogging(
+ options.debug, options.verbose, threadname=True,
+ toolname=os.path.splitext(os.path.basename(__file__))[0],
+ logfile=None)
(src_cluster_name, dest_cluster_name, instance_names) = \
CheckOptions(parser, options, args)
diff --git a/tools/ovfconverter b/tools/ovfconverter
index ba437c7..f13a3a9 100755
--- a/tools/ovfconverter
+++ b/tools/ovfconverter
@@ -177,7 +177,9 @@
"""
(mode, input_path, options) = ParseOptions()
- utils.SetupToolLogging(options.debug, options.verbose)
+ utils.SetupToolLogging(
+ options.debug, options.verbose,
+ toolname=os.path.splitext(os.path.basename(__file__))[0])
logging.info("Chosen %s mode, reading the %s file", mode, input_path)
assert mode in (IMPORT_MODE, EXPORT_MODE)
diff --git a/tools/post-upgrade b/tools/post-upgrade
index 4d673e0..41ca528 100644
--- a/tools/post-upgrade
+++ b/tools/post-upgrade
@@ -63,7 +63,8 @@
if utils.version.IsBefore(version, 2, 13, 0):
result = utils.RunCmd(["gnt-cluster", "renew-crypto",
- "--new-ssh-keys", "--no-ssh-key-check", "-f", "-d"])
+ "--new-ssh-keys", "--no-ssh-key-check",
+ "--verbose", "-f", "-d"])
if result.failed:
cli.ToStderr("Failed to create SSH keys: %s; Output %s" %