blob: b8ff4eff7b05628477d8d17e7dcb428908cd8580 [file] [log] [blame]
#
#
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Module keeping state for Ganeti watcher.
"""
import os
import time
import logging
from ganeti import utils
from ganeti import serializer
from ganeti import errors
# Maximum age, in seconds, of a recorded attempt before the record is
# deleted. The value is based on the fact that the current retry counter
# is 5 and the watcher runs every 5 minutes, so it takes around half an
# hour to exceed the retry counter; 8 hours (16 * 1/2h) therefore seems
# like a reasonable reset time.
RETRY_EXPIRATION = 8 * 60 * 60

# Keys of the values stored for each instance: how many times an action was
# attempted and when it was last attempted.
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_CLEANUP_COUNT = "cleanup_count"
KEY_CLEANUP_WHEN = "cleanup_when"

# Key of the boot ID stored for each node.
KEY_BOOT_ID = "bootid"
def OpenStateFile(path):
  """Opens the state file and acquires a lock on it.

  @type path: string
  @param path: Path to state file
  @rtype: file or None
  @return: State file object opened for reading and writing, or None if
      the lock on the state file could not be acquired

  """
  # The two-step dance below is necessary to allow both opening existing
  # file read/write and creating if not existing. Vanilla open will truncate
  # an existing file -or- allow creating if not existing.
  statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)

  # Try to acquire lock on state file. If this fails, another watcher instance
  # might already be running or another program is temporarily blocking the
  # watcher from running.
  try:
    utils.LockFile(statefile_fd)
  except errors.LockError as err:
    logging.error("Can't acquire lock on state file %s: %s", path, err)
    # The descriptor is of no use without the lock; close it instead of
    # leaking it to the caller's process.
    os.close(statefile_fd)
    return None

  return os.fdopen(statefile_fd, "w+")
class WatcherState(object):
  """Interface to a state file recording restart attempts.

  The state is kept as a dictionary with two sections, C{"instance"} and
  C{"node"}, each mapping a name to a dictionary of recorded values.

  """
  def __init__(self, statefile):
    """Read and parse the state file.

    Content that cannot be read or parsed, or that does not have the
    expected dictionary structure, is logged and treated as empty.

    @type statefile: file
    @param statefile: State file object

    """
    self.statefile = statefile

    try:
      state_data = self.statefile.read()
      if not state_data:
        self._data = {}
      else:
        self._data = serializer.Load(state_data)

      # A syntactically valid file may still hold something other than a
      # dictionary; handle that like any other invalid state file.
      if not isinstance(self._data, dict):
        raise TypeError("Expected a dictionary, got %s" %
                        type(self._data).__name__)
    except Exception as msg: # pylint: disable=W0703
      # Ignore errors while loading the file and treat it as empty
      self._data = {}
      logging.warning(("Invalid state file. Using defaults."
                       " Error message: %s"), msg)

    # Make sure both sections exist and are dictionaries, as all accessors
    # below rely on that.
    for section in ("instance", "node"):
      if not isinstance(self._data.get(section), dict):
        self._data[section] = {}

    # Serialized snapshot used by Save() to detect modifications
    self._orig_data = serializer.Dump(self._data)

  def Save(self, filename):
    """Save state to file.

    If the state is unchanged since it was loaded, only the timestamps of
    the file are updated.

    @type filename: string
    @param filename: Path of the state file

    """
    assert self.statefile

    serialized_form = serializer.Dump(self._data)
    if self._orig_data == serialized_form:
      logging.debug("Data didn't change, just touching status file")
      os.utime(filename, None)
      return

    # We need to make sure the file is locked before renaming it, otherwise
    # starting ganeti-watcher again at the same time will create a conflict.
    fd = utils.WriteFile(filename,
                         data=serialized_form,
                         prewrite=utils.LockFile, close=False)
    self.statefile = os.fdopen(fd, "w+")

  def Close(self):
    """Unlock configuration file and close it.

    """
    assert self.statefile

    # Files are automatically unlocked when closing them
    self.statefile.close()
    self.statefile = None

  def GetNodeBootID(self, name):
    """Returns the last boot ID of a node or None.

    @type name: string
    @param name: the name of the node to look up

    """
    ndata = self._data["node"]

    if name in ndata and KEY_BOOT_ID in ndata[name]:
      return ndata[name][KEY_BOOT_ID]
    return None

  def SetNodeBootID(self, name, bootid):
    """Sets the boot ID of a node.

    @type name: string
    @param name: the name of the node
    @param bootid: the boot ID to record; must not be empty

    """
    assert bootid

    ndata = self._data["node"]

    ndata.setdefault(name, {})[KEY_BOOT_ID] = bootid

  def NumberOfRestartAttempts(self, instance_name):
    """Returns number of previous restart attempts.

    @type instance_name: string
    @param instance_name: the name of the instance to look up
    @return: the recorded count, or 0 if nothing is recorded

    """
    idata = self._data["instance"]

    return idata.get(instance_name, {}).get(KEY_RESTART_COUNT, 0)

  def NumberOfCleanupAttempts(self, instance_name):
    """Returns number of previous cleanup attempts.

    @type instance_name: string
    @param instance_name: the name of the instance to look up
    @return: the recorded count, or 0 if nothing is recorded

    """
    idata = self._data["instance"]

    return idata.get(instance_name, {}).get(KEY_CLEANUP_COUNT, 0)

  def MaintainInstanceList(self, instances):
    """Perform maintenance on the recorded instances.

    Records of instances which no longer exist are dropped, as are records
    whose most recent attempt is older than L{RETRY_EXPIRATION} seconds.

    @type instances: list of string
    @param instances: the list of currently existing instances

    """
    idict = self._data["instance"]

    # First, delete obsolete instances
    obsolete_instances = set(idict).difference(instances)
    for inst in obsolete_instances:
      logging.debug("Forgetting obsolete instance %s", inst)
      idict.pop(inst, None)

    # Second, delete expired records. The age of a record is determined by
    # its most recent attempt of either kind; looking at restart attempts
    # only would throw away records holding just cleanup attempts on every
    # run, so their counter could never accumulate.
    earliest = time.time() - RETRY_EXPIRATION
    expired_instances = [name for (name, record) in idict.items()
                         if max(record.get(KEY_RESTART_WHEN, 0),
                                record.get(KEY_CLEANUP_WHEN, 0)) < earliest]
    for inst in expired_instances:
      logging.debug("Expiring record for instance %s", inst)
      idict.pop(inst, None)

  @staticmethod
  def _RecordAttempt(instances, instance_name, key_when, key_count):
    """Record an event.

    @type instances: dict
    @param instances: contains instance data indexed by instance_name
    @type instance_name: string
    @param instance_name: name of the instance involved in the event
    @type key_when: string
    @param key_when: dict key for the information for when the event occurred
    @type key_count: string
    @param key_count: dict key for the information for how many times
                      the event occurred

    """
    instance = instances.setdefault(instance_name, {})
    instance[key_when] = time.time()
    instance[key_count] = instance.get(key_count, 0) + 1

  def RecordRestartAttempt(self, instance_name):
    """Record a restart attempt.

    @type instance_name: string
    @param instance_name: the name of the instance being restarted

    """
    self._RecordAttempt(self._data["instance"], instance_name,
                        KEY_RESTART_WHEN, KEY_RESTART_COUNT)

  def RecordCleanupAttempt(self, instance_name):
    """Record a cleanup attempt.

    @type instance_name: string
    @param instance_name: the name of the instance being cleaned up

    """
    self._RecordAttempt(self._data["instance"], instance_name,
                        KEY_CLEANUP_WHEN, KEY_CLEANUP_COUNT)

  def RemoveInstance(self, instance_name):
    """Update state to reflect that a machine is running.

    This method removes the record for a named instance (as we only
    track down instances). Removing an unknown instance is not an error.

    @type instance_name: string
    @param instance_name: the name of the instance to remove from books

    """
    idata = self._data["instance"]

    idata.pop(instance_name, None)