launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #15990
[Merge] lp:~wgrant/launchpad/bi-uB-no-bfjb into lp:launchpad
William Grant has proposed merging lp:~wgrant/launchpad/bi-uB-no-bfjb into lp:launchpad.
Commit message:
BuilderInteractor.updateBuild constructs a BuildFarmJobBehavior itself as necessary, while BuilderInteractor.rescueIfLost is given a cookie directly.
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
For more details, see:
https://code.launchpad.net/~wgrant/launchpad/bi-uB-no-bfjb/+merge/188766
More BuilderInteractor refactorment. In this episode, updateBuild retrieves the Builder and constructs a BuildFarmJobBehavior itself only when necessary (on build completion), allowing the general case to avoid hitting the DB except for the BuildQueue.logtail write. Meanwhile, rescueIfLost takes the expected cookie rather than calculating it itself, so SlaveScanner can eventually cache the cookie without hitting the DB each time.
--
https://code.launchpad.net/~wgrant/launchpad/bi-uB-no-bfjb/+merge/188766
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/bi-uB-no-bfjb into lp:launchpad.
=== modified file 'lib/lp/buildmaster/interactor.py'
--- lib/lp/buildmaster/interactor.py 2013-10-01 00:38:38 +0000
+++ lib/lp/buildmaster/interactor.py 2013-10-02 05:56:53 +0000
@@ -25,7 +25,6 @@
BuildDaemonError,
CannotFetchFile,
CannotResumeHost,
- CorruptBuildCookie,
)
from lp.buildmaster.interfaces.buildfarmjobbehavior import (
IBuildFarmJobBehavior,
@@ -267,29 +266,15 @@
status['logtail'] = status_sentence[2]
defer.returnValue((status_sentence, status))
- @staticmethod
- def verifySlaveBuildCookie(behavior, slave_cookie):
- """See `IBuildFarmJobBehavior`."""
- if behavior is None:
- if slave_cookie is not None:
- raise CorruptBuildCookie('Slave building when should be idle.')
- else:
- good_cookie = behavior.getBuildCookie()
- if slave_cookie != good_cookie:
- raise CorruptBuildCookie(
- "Invalid slave build cookie: got %r, expected %r."
- % (slave_cookie, good_cookie))
-
@classmethod
@defer.inlineCallbacks
- def rescueIfLost(cls, vitals, slave, behavior, logger=None):
+ def rescueIfLost(cls, vitals, slave, expected_cookie, logger=None):
"""Reset the slave if its job information doesn't match the DB.
- This checks the build ID reported in the slave status against the
- database. If it isn't building what we think it should be, the current
- build will be aborted and the slave cleaned in preparation for a new
- task. The decision about the slave's correctness is left up to
- `IBuildFarmJobBehavior.verifySlaveBuildCookie`.
+ This checks the build ID reported in the slave status against
+ the given cookie. If it isn't building what we think it should
+ be, the current build will be aborted and the slave cleaned in
+ preparation for a new task.
:return: A Deferred that fires when the dialog with the slave is
finished. Its return value is True if the slave is lost,
@@ -315,14 +300,12 @@
else:
slave_cookie = status_sentence[ident_position[status]]
- # verifySlaveBuildCookie will raise CorruptBuildCookie if the
- # slave cookie doesn't match the expected one, including
- # verifying that the slave cookie is None iff we expect the
- # slave to be idle.
- try:
- cls.verifySlaveBuildCookie(behavior, slave_cookie)
+ if slave_cookie == expected_cookie:
+ # The master and slave agree about the current job. Continue.
defer.returnValue(False)
- except CorruptBuildCookie as reason:
+ else:
+ # The master and slave disagree. The master is our master,
+ # so try to rescue the slave.
# An IDLE slave doesn't need rescuing (SlaveScanner.scan
# will rescue the DB side instead), and we just have to wait
# out an ABORTING one.
@@ -332,8 +315,8 @@
yield slave.abort()
if logger:
logger.info(
- "Builder '%s' rescued from '%s': '%s'" %
- (vitals.name, slave_cookie, reason))
+ "Builder slave '%s' rescued from %r (expected %r)." %
+ (vitals.name, slave_cookie, expected_cookie))
defer.returnValue(True)
@classmethod
@@ -474,9 +457,22 @@
candidate, vitals, builder, slave, new_behavior, logger)
defer.returnValue(candidate)
+ @staticmethod
+ def extractBuildStatus(status_dict):
+ """Read build status name.
+
+ :param status_dict: build status dict from slaveStatus.
+ :return: the unqualified status name, e.g. "OK".
+ """
+ status_string = status_dict['build_status']
+ lead_string = 'BuildStatus.'
+ assert status_string.startswith(lead_string), (
+ "Malformed status string: '%s'" % status_string)
+ return status_string[len(lead_string):]
+
@classmethod
@defer.inlineCallbacks
- def updateBuild(cls, queueItem, slave, behavior):
+ def updateBuild(cls, vitals, slave, builders_cache, behavior_factory):
"""Verify the current build job status.
Perform the required actions for each state.
@@ -487,76 +483,34 @@
# impossible to get past rescueIfLost unless the slave matches
# the DB, and this method isn't called unless the DB says
# there's a job.
- builder_status_handlers = {
- 'BuilderStatus.BUILDING': cls.updateBuild_BUILDING,
- 'BuilderStatus.ABORTING': cls.updateBuild_ABORTING,
- 'BuilderStatus.WAITING': cls.updateBuild_WAITING,
- }
statuses = yield cls.slaveStatus(slave)
- logger = logging.getLogger('slave-scanner')
status_sentence, status_dict = statuses
builder_status = status_dict['builder_status']
- if builder_status not in builder_status_handlers:
+ if builder_status == 'BuilderStatus.BUILDING':
+ # Build still building, collect the logtail.
+ if vitals.build_queue.job.status != JobStatus.RUNNING:
+ # XXX: This check should be removed once we confirm it's
+ # not regularly hit.
+ raise AssertionError(
+ "Job not running when assigned and slave building.")
+ vitals.build_queue.logtail = encoding.guess(
+ str(status_dict.get('logtail')))
+ transaction.commit()
+ elif builder_status == 'BuilderStatus.ABORTING':
+ # Build is being aborted.
+ vitals.build_queue.logtail = (
+ "Waiting for slave process to be terminated")
+ transaction.commit()
+ elif builder_status == 'BuilderStatus.WAITING':
+ # Build has finished. Delegate handling to the build itself.
+ builder = builders_cache[vitals.name]
+ behavior = behavior_factory(vitals.build_queue, builder, slave)
+ behavior.updateSlaveStatus(status_sentence, status_dict)
+ yield behavior.handleStatus(
+ vitals.build_queue, cls.extractBuildStatus(status_dict),
+ status_dict)
+ else:
raise AssertionError("Unknown status %s" % builder_status)
- method = builder_status_handlers[builder_status]
- yield method(
- queueItem, behavior, status_sentence, status_dict, logger)
-
- @staticmethod
- def updateBuild_BUILDING(queueItem, behavior, status_sentence, status_dict,
- logger):
- """Build still building, collect the logtail"""
- if queueItem.job.status != JobStatus.RUNNING:
- queueItem.job.start()
- queueItem.logtail = encoding.guess(str(status_dict.get('logtail')))
- transaction.commit()
-
- @staticmethod
- def updateBuild_ABORTING(queueItem, behavior, status_sentence, status_dict,
- logger):
- """Build was ABORTED.
-
- Master-side should wait until the slave finish the process correctly.
- """
- queueItem.logtail = "Waiting for slave process to be terminated"
- transaction.commit()
-
- @staticmethod
- def extractBuildStatus(status_dict):
- """Read build status name.
-
- :param status_dict: build status dict as passed to the
- updateBuild_* methods.
- :return: the unqualified status name, e.g. "OK".
- """
- status_string = status_dict['build_status']
- lead_string = 'BuildStatus.'
- assert status_string.startswith(lead_string), (
- "Malformed status string: '%s'" % status_string)
-
- return status_string[len(lead_string):]
-
- @classmethod
- def updateBuild_WAITING(cls, queueItem, behavior, status_sentence,
- status_dict, logger):
- """Perform the actions needed for a slave in a WAITING state
-
- Buildslave can be WAITING in five situations:
-
- * Build has failed, no filemap is received (PACKAGEFAIL, DEPFAIL,
- CHROOTFAIL, BUILDERFAIL,
- ABORTED)
-
- * Build has been built successfully (BuildStatus.OK), in this case
- we have a 'filemap', so we can retrieve those files and store in
- Librarian with getFileFromSlave() and then pass the binaries to
- the uploader for processing.
- """
- behavior.updateSlaveStatus(
- status_sentence, status_dict)
- d = behavior.handleStatus(
- queueItem, cls.extractBuildStatus(status_dict), status_dict)
- return d
@staticmethod
def _getSlaveScannerLogger():
=== modified file 'lib/lp/buildmaster/interfaces/builder.py'
--- lib/lp/buildmaster/interfaces/builder.py 2013-09-02 08:11:58 +0000
+++ lib/lp/buildmaster/interfaces/builder.py 2013-10-02 05:56:53 +0000
@@ -7,7 +7,6 @@
__all__ = [
'BuildDaemonError',
- 'CorruptBuildCookie',
'BuildSlaveFailure',
'CannotBuild',
'CannotFetchFile',
@@ -71,13 +70,6 @@
"""The build slave had a protocol version. This is a serious error."""
-class CorruptBuildCookie(BuildDaemonError):
- """The build slave is working with mismatched information.
-
- It needs to be rescued.
- """
-
-
class CannotResumeHost(BuildDaemonError):
"""The build slave virtual machine cannot be resumed."""
=== modified file 'lib/lp/buildmaster/manager.py'
--- lib/lp/buildmaster/manager.py 2013-09-30 08:12:17 +0000
+++ lib/lp/buildmaster/manager.py 2013-10-02 05:56:53 +0000
@@ -296,8 +296,9 @@
return
behavior = self.behavior_factory(
vitals.build_queue, builder, slave)
+ db_cookie = behavior.getBuildCookie() if behavior else None
lost = yield interactor.rescueIfLost(
- vitals, slave, behavior, self.logger)
+ vitals, slave, db_cookie, self.logger)
if lost:
lost_reason = '%s is lost' % vitals.name
@@ -320,9 +321,8 @@
if vitals.build_queue is not None:
# Scan the slave and get the logtail, or collect the build
# if it's ready. Yes, "updateBuild" is a bad name.
- behavior = behavior or self.behavior_factory(
- vitals.build_queue, builder, slave)
- yield interactor.updateBuild(vitals.build_queue, slave, behavior)
+ yield interactor.updateBuild(
+ vitals, slave, self.builders_cache, self.behavior_factory)
elif vitals.manual:
# If the builder is in manual mode, don't dispatch anything.
self.logger.debug(
=== modified file 'lib/lp/buildmaster/tests/test_interactor.py'
--- lib/lp/buildmaster/tests/test_interactor.py 2013-10-01 01:07:35 +0000
+++ lib/lp/buildmaster/tests/test_interactor.py 2013-10-02 05:56:53 +0000
@@ -30,7 +30,6 @@
from lp.buildmaster.interfaces.builder import (
CannotFetchFile,
CannotResumeHost,
- CorruptBuildCookie,
)
from lp.buildmaster.tests.mock_slaves import (
AbortingSlave,
@@ -40,7 +39,6 @@
MockBuilder,
OkSlave,
SlaveTestHelpers,
- TrivialBehavior,
WaitingSlave,
)
from lp.services.config import config
@@ -78,38 +76,6 @@
self.assertRaises(
AssertionError, BuilderInteractor.extractBuildStatus, slave_status)
- def test_verifySlaveBuildCookie_building_match(self):
- BuilderInteractor.verifySlaveBuildCookie(TrivialBehavior(), 'trivial')
-
- def test_verifySlaveBuildCookie_building_mismatch(self):
- self.assertRaises(
- CorruptBuildCookie,
- BuilderInteractor.verifySlaveBuildCookie,
- TrivialBehavior(), 'difficult')
-
- def test_verifySlaveBuildCookie_idle_match(self):
- BuilderInteractor.verifySlaveBuildCookie(None, None)
-
- def test_verifySlaveBuildCookie_idle_mismatch(self):
- self.assertRaises(
- CorruptBuildCookie,
- BuilderInteractor.verifySlaveBuildCookie, None, 'foo')
-
- def test_rescueIfLost_aborts_lost_and_broken_slave(self):
- # A slave that's 'lost' should be aborted; when the slave is
- # broken then abort() should also throw a fault.
- slave = LostBuildingBrokenSlave()
- d = BuilderInteractor.rescueIfLost(
- extract_vitals_from_db(MockBuilder()), slave, TrivialBehavior())
-
- def check_slave_status(failure):
- self.assertIn('abort', slave.call_log)
- # 'Fault' comes from the LostBuildingBrokenSlave, this is
- # just testing that the value is passed through.
- self.assertIsInstance(failure.value, xmlrpclib.Fault)
-
- return d.addBoth(check_slave_status)
-
def resumeSlaveHost(self, builder):
vitals = extract_vitals_from_db(builder)
return BuilderInteractor.resumeSlaveHost(
@@ -183,6 +149,21 @@
slave = BuilderInteractor.makeSlaveFromVitals(vitals)
self.assertEqual(5, slave.timeout)
+ def test_rescueIfLost_aborts_lost_and_broken_slave(self):
+ # A slave that's 'lost' should be aborted; when the slave is
+ # broken then abort() should also throw a fault.
+ slave = LostBuildingBrokenSlave()
+ d = BuilderInteractor.rescueIfLost(
+ extract_vitals_from_db(MockBuilder()), slave, 'trivial')
+
+ def check_slave_status(failure):
+ self.assertIn('abort', slave.call_log)
+ # 'Fault' comes from the LostBuildingBrokenSlave, this is
+ # just testing that the value is passed through.
+ self.assertIsInstance(failure.value, xmlrpclib.Fault)
+
+ return d.addBoth(check_slave_status)
+
@defer.inlineCallbacks
def test_recover_idle_slave(self):
# An idle slave is not rescued, even if it's not meant to be
@@ -190,7 +171,7 @@
# we still report that it's lost.
slave = OkSlave()
lost = yield BuilderInteractor.rescueIfLost(
- extract_vitals_from_db(MockBuilder()), slave, TrivialBehavior())
+ extract_vitals_from_db(MockBuilder()), slave, 'trivial')
self.assertTrue(lost)
self.assertEqual([], slave.call_log)
@@ -209,8 +190,7 @@
# WAITING.
waiting_slave = WaitingSlave(build_id='trivial')
lost = yield BuilderInteractor.rescueIfLost(
- extract_vitals_from_db(MockBuilder()), waiting_slave,
- TrivialBehavior())
+ extract_vitals_from_db(MockBuilder()), waiting_slave, 'trivial')
self.assertFalse(lost)
self.assertEqual(['status'], waiting_slave.call_log)
@@ -223,8 +203,7 @@
# discarded.
waiting_slave = WaitingSlave(build_id='non-trivial')
lost = yield BuilderInteractor.rescueIfLost(
- extract_vitals_from_db(MockBuilder()), waiting_slave,
- TrivialBehavior())
+ extract_vitals_from_db(MockBuilder()), waiting_slave, 'trivial')
self.assertTrue(lost)
self.assertEqual(['status', 'clean'], waiting_slave.call_log)
@@ -234,8 +213,7 @@
# BUILDING.
building_slave = BuildingSlave(build_id='trivial')
lost = yield BuilderInteractor.rescueIfLost(
- extract_vitals_from_db(MockBuilder()), building_slave,
- TrivialBehavior())
+ extract_vitals_from_db(MockBuilder()), building_slave, 'trivial')
self.assertFalse(lost)
self.assertEqual(['status'], building_slave.call_log)
@@ -245,8 +223,7 @@
# abort the build, thus stopping it in its tracks.
building_slave = BuildingSlave(build_id='non-trivial')
lost = yield BuilderInteractor.rescueIfLost(
- extract_vitals_from_db(MockBuilder()), building_slave,
- TrivialBehavior())
+ extract_vitals_from_db(MockBuilder()), building_slave, 'trivial')
self.assertTrue(lost)
self.assertEqual(['status', 'abort'], building_slave.call_log)
=== modified file 'lib/lp/soyuz/tests/test_binarypackagebuildbehavior.py'
--- lib/lp/soyuz/tests/test_binarypackagebuildbehavior.py 2013-09-30 08:12:17 +0000
+++ lib/lp/soyuz/tests/test_binarypackagebuildbehavior.py 2013-10-02 05:56:53 +0000
@@ -36,6 +36,7 @@
TestGetUploadMethodsMixin,
TestHandleStatusMixin,
)
+from lp.buildmaster.tests.test_manager import MockBuildersCache
from lp.registry.interfaces.pocket import (
PackagePublishingPocket,
pocketsuffix,
@@ -338,9 +339,9 @@
self.addCleanup(self._cleanup)
def updateBuild(self, candidate, slave):
+ bc = MockBuildersCache(self.builder, candidate)
return self.interactor.updateBuild(
- candidate, slave,
- self.interactor.getBuildBehavior(candidate, self.builder, slave))
+ bc.getVitals('foo'), slave, bc, self.interactor.getBuildBehavior)
def assertBuildProperties(self, build):
"""Check that a build happened by making sure some of its properties
Follow ups