blob: 3e8c4635c661001249e239ecd9717bb0e7d4e1f1 [file] [log] [blame]
#
#
# Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 Google Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
"""Module keeping state for Ganeti watcher.
"""
import os
import time
import logging
from ganeti import utils
from ganeti import serializer
from ganeti import errors
# Restart records older than this many seconds are dropped. Rationale: the
# current retry counter is 5 and the watcher runs every 5 minutes, so the
# counter is exceeded after roughly half an hour; sixteen such periods
# (8 hours) seems like a reasonable time after which to start over.
RETRY_EXPIRATION = 8 * 60 * 60

# Keys used inside the per-instance and per-node records of the state data
KEY_RESTART_COUNT = "restart_count"
KEY_RESTART_WHEN = "restart_when"
KEY_BOOT_ID = "bootid"
def OpenStateFile(path):
    """Opens the state file and acquires a lock on it.

    The file is created if it does not exist yet; an existing file is
    opened for reading and writing.

    @type path: string
    @param path: Path to state file
    @rtype: file or None
    @return: File object wrapping the locked state file, or None if the
        lock could not be acquired

    """
    # The two-step dance below is necessary to allow both opening existing
    # file read/write and creating if not existing. Vanilla open will truncate
    # an existing file -or- allow creating if not existing.
    statefile_fd = os.open(path, os.O_RDWR | os.O_CREAT)

    # Try to acquire lock on state file. If this fails, another watcher
    # instance might already be running or another program is temporarily
    # blocking the watcher from running.
    try:
        utils.LockFile(statefile_fd)
    except errors.LockError as err:
        # Nobody else knows about the descriptor, so it has to be released
        # here; otherwise every failed attempt would leak one descriptor
        os.close(statefile_fd)
        logging.error("Can't acquire lock on state file %s: %s", path, err)
        return None
    except Exception:
        # Unexpected failure: still release the descriptor, then propagate
        os.close(statefile_fd)
        raise

    return os.fdopen(statefile_fd, "w+")
class WatcherState(object):
    """Interface to a state file recording restart attempts.

    The state is held in memory as a dictionary with two sections:
    C{"instance"}, mapping instance names to their restart bookkeeping, and
    C{"node"}, mapping node names to the last recorded boot ID.

    """
    def __init__(self, statefile):
        """Read and parse the contents of an already opened state file.

        An empty, unparsable or structurally invalid file is not fatal: a
        warning is logged and an empty state is used instead.

        @type statefile: file
        @param statefile: State file object

        """
        self.statefile = statefile

        try:
            state_data = self.statefile.read()
            if state_data:
                loaded = serializer.Load(state_data)
            else:
                loaded = {}
            # Anything but a dictionary at the top level would make the
            # section handling below fail, so treat it like a broken file
            if not isinstance(loaded, dict):
                raise ValueError("state data is not a dictionary")
            self._data = loaded
        except Exception as msg: # pylint: disable=W0703
            # Ignore errors while loading the file and treat it as empty
            self._data = {}
            logging.warning(("Invalid state file. Using defaults."
                             " Error message: %s"), msg)

        # Make sure both sections exist and are dictionaries, as all other
        # methods index into them without further checks
        for section in ("instance", "node"):
            if not isinstance(self._data.get(section), dict):
                if section in self._data:
                    logging.warning("Ignoring invalid '%s' section in state"
                                    " file", section)
                self._data[section] = {}

        # Serialized snapshot used by Save to detect unchanged data
        self._orig_data = serializer.Dump(self._data)

    def Save(self, filename):
        """Save state to file.

        If the data is identical to what was loaded, only the timestamps
        of C{filename} are updated. Otherwise the serialized data is
        written through C{utils.WriteFile} and C{self.statefile} is
        replaced by a file object for the descriptor returned by it. The
        file is not closed here; see L{Close}.

        @type filename: string
        @param filename: Path of the state file to write

        """
        assert self.statefile

        serialized_form = serializer.Dump(self._data)
        if self._orig_data == serialized_form:
            logging.debug("Data didn't change, just touching status file")
            os.utime(filename, None)
            return

        # We need to make sure the file is locked before renaming it,
        # otherwise starting ganeti-watcher again at the same time will
        # create a conflict.
        fd = utils.WriteFile(filename,
                             data=serialized_form,
                             prewrite=utils.LockFile, close=False)
        self.statefile = os.fdopen(fd, "w+")

    def Close(self):
        """Unlock configuration file and close it.

        """
        assert self.statefile

        # Files are automatically unlocked when closing them
        self.statefile.close()
        self.statefile = None

    def GetNodeBootID(self, name):
        """Returns the last boot ID of a node or None.

        @type name: string
        @param name: the name of the node to look up

        """
        return self._data["node"].get(name, {}).get(KEY_BOOT_ID)

    def SetNodeBootID(self, name, bootid):
        """Sets the boot ID of a node.

        @type name: string
        @param name: the name of the node
        @param bootid: the boot ID to record; must not be empty

        """
        assert bootid

        ndata = self._data["node"]
        ndata.setdefault(name, {})[KEY_BOOT_ID] = bootid

    def NumberOfRestartAttempts(self, instance_name):
        """Returns number of previous restart attempts.

        Instances without a record, or whose record carries no counter,
        are reported as having zero attempts.

        @type instance_name: string
        @param instance_name: the name of the instance to look up

        """
        record = self._data["instance"].get(instance_name)
        if not record:
            return 0

        return record.get(KEY_RESTART_COUNT, 0)

    def MaintainInstanceList(self, instances):
        """Perform maintenance on the recorded instances.

        Drops the records of instances that no longer exist and of
        instances whose last restart attempt is older than
        L{RETRY_EXPIRATION} seconds.

        @type instances: list of string
        @param instances: the list of currently existing instances

        """
        idict = self._data["instance"]

        # First, delete obsolete instances
        for inst in set(idict).difference(instances):
            logging.debug("Forgetting obsolete instance %s", inst)
            idict.pop(inst, None)

        # Second, delete expired records. A record without a timestamp
        # could never expire otherwise, so it is treated as expired.
        earliest = time.time() - RETRY_EXPIRATION
        expired_instances = [name for (name, record) in idict.items()
                             if record.get(KEY_RESTART_WHEN, 0) < earliest]
        for inst in expired_instances:
            logging.debug("Expiring record for instance %s", inst)
            idict.pop(inst, None)

    def RecordRestartAttempt(self, instance_name):
        """Record a restart attempt.

        Stores the current time as the time of the attempt and increments
        the instance's attempt counter, creating the record if needed.

        @type instance_name: string
        @param instance_name: the name of the instance being restarted

        """
        idata = self._data["instance"]

        inst = idata.setdefault(instance_name, {})
        inst[KEY_RESTART_WHEN] = time.time()
        inst[KEY_RESTART_COUNT] = inst.get(KEY_RESTART_COUNT, 0) + 1

    def RemoveInstance(self, instance_name):
        """Update state to reflect that a machine is running.

        This method removes the record for a named instance (as we only
        track down instances). Removing an unknown instance is a no-op.

        @type instance_name: string
        @param instance_name: the name of the instance to remove from books

        """
        idata = self._data["instance"]

        idata.pop(instance_name, None)