launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #30655
[Merge] ~wgrant/launchpad:buildd-manager-failure-metrics into launchpad:master
William Grant has proposed merging ~wgrant/launchpad:buildd-manager-failure-metrics into launchpad:master with ~wgrant/launchpad:buildd-manager-failure-refactor as a prerequisite.
Commit message:
Tweak buildd-manager failure handling metrics
They all now start with "builders.failure", labels are more consistent,
and builder failure/reset are counted.
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
For more details, see:
https://code.launchpad.net/~wgrant/launchpad/+git/launchpad/+merge/454693
--
Your team Launchpad code reviewers is requested to review the proposed merge of ~wgrant/launchpad:buildd-manager-failure-metrics into launchpad:master.
diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
index 0c6d335..2bc0b81 100644
--- a/lib/lp/buildmaster/manager.py
+++ b/lib/lp/buildmaster/manager.py
@@ -312,6 +312,23 @@ class PrefetchedBuilderFactory(BaseBuilderFactory):
return self.candidates.pop(vitals)
+def get_statsd_labels(builder, build):
+ labels = {
+ "builder_name": builder.name,
+ "region": builder.region,
+ "virtualized": str(builder.virtualized),
+ }
+ if build is not None:
+ labels.update(
+ {
+ "build": True,
+ "arch": build.processor.name,
+ "job_type": build.job_type.name,
+ }
+ )
+ return labels
+
+
def judge_failure(builder_count, job_count, exc, retry=True):
"""Judge how to recover from a scan failure.
@@ -394,22 +411,15 @@ def recover_failure(logger, vitals, builder, retry, exception):
job_action,
)
- if job is not None and job_action is not None:
- statsd_client = getUtility(IStatsdClient)
- labels = {
- "build": True,
- "arch": job.specific_build.processor.name,
- "region": builder.region,
- "builder_name": builder.name,
- "virtualized": str(builder.virtualized),
- "job_type": job.specific_build.job_type.name,
- }
+ statsd_client = getUtility(IStatsdClient)
+ labels = get_statsd_labels(builder, job.specific_build if job else None)
+ if job is not None and job_action is not None:
if cancelling:
# We've previously been asked to cancel the job, so just set
# it to cancelled rather than retrying or failing.
logger.info("Cancelling job %s.", job.build_cookie)
- statsd_client.incr("builders.job_cancelled", labels=labels)
+ statsd_client.incr("builders.failure.job_cancelled", labels=labels)
job.markAsCancelled()
elif job_action == False:
# Fail and dequeue the job.
@@ -430,12 +440,12 @@ def recover_failure(logger, vitals, builder, retry, exception):
job.specific_build.updateStatus(
BuildStatus.FAILEDTOBUILD, force_invalid_transition=True
)
- statsd_client.incr("builders.job_failed", labels=labels)
+ statsd_client.incr("builders.failure.job_failed", labels=labels)
job.destroySelf()
elif job_action == True:
# Reset the job so it will be retried elsewhere.
logger.info("Requeueing job %s.", job.build_cookie)
- statsd_client.incr("builders.job_reset", labels=labels)
+ statsd_client.incr("builders.failure.job_reset", labels=labels)
job.reset()
if job_action == False:
@@ -447,12 +457,14 @@ def recover_failure(logger, vitals, builder, retry, exception):
# We've already tried resetting it enough times, so we have
# little choice but to give up.
logger.info("Failing builder %s.", builder.name)
+ statsd_client.incr("builders.failure.builder_failed", labels=labels)
builder.failBuilder(str(exception))
elif builder_action == True:
# Dirty the builder to attempt recovery. In the virtual case,
# the dirty idleness will cause a reset, giving us a good chance
# of recovery.
logger.info("Dirtying builder %s to attempt recovery.", builder.name)
+ statsd_client.incr("builders.failure.builder_reset", labels=labels)
builder.setCleanStatus(BuilderCleanStatus.DIRTY)
@@ -583,20 +595,13 @@ class WorkerScanner:
vitals = self.builder_factory.getVitals(self.builder_name)
builder = self.builder_factory[self.builder_name]
try:
+ labels = get_statsd_labels(builder, builder.current_build)
+ self.statsd_client.incr("builders.failure", labels=labels)
+
builder.gotFailure()
- labels = {}
if builder.current_build is not None:
builder.current_build.gotFailure()
- labels.update(
- {
- "build": True,
- "arch": builder.current_build.processor.name,
- "region": builder.region,
- }
- )
- else:
- labels["build"] = False
- self.statsd_client.incr("builders.judged_failed", labels=labels)
+
recover_failure(self.logger, vitals, builder, retry, exc)
transaction.commit()
except Exception:
diff --git a/lib/lp/buildmaster/tests/test_manager.py b/lib/lp/buildmaster/tests/test_manager.py
index b87e95c..01e6044 100644
--- a/lib/lp/buildmaster/tests/test_manager.py
+++ b/lib/lp/buildmaster/tests/test_manager.py
@@ -555,13 +555,20 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
transaction.commit()
yield scanner.singleCycle()
- self.assertEqual(2, self.stats_client.incr.call_count)
+ self.assertEqual(3, self.stats_client.incr.call_count)
self.stats_client.incr.assert_has_calls(
[
mock.call(
"build.reset,arch=386,env=test,job_type=PACKAGEBUILD"
),
- mock.call("builders.judged_failed,build=False,env=test"),
+ mock.call(
+ "builders.failure,builder_name=bob,env=test,"
+ "region=,virtualized=False"
+ ),
+ mock.call(
+ "builders.failure.builder_failed,builder_name=bob,"
+ "env=test,region=,virtualized=False"
+ ),
]
)
@@ -1411,9 +1418,9 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
self.stats_client.incr.assert_has_calls(
[
mock.call(
- "builders.job_reset,arch={},build=True,builder_name={},"
- "env=test,job_type=RECIPEBRANCHBUILD,region={},"
- "virtualized=True".format(
+ "builders.failure.job_reset,arch={},build=True,"
+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
+ "region={},virtualized=True".format(
build.processor.name,
self.builder.name,
self.builder.region,
@@ -1477,7 +1484,7 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
self.stats_client.incr.assert_has_calls(
[
mock.call(
- "builders.job_cancelled,arch={},build=True,"
+ "builders.failure.job_cancelled,arch={},build=True,"
"builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
"region=builder-name,virtualized=True".format(
naked_build.processor.name,
@@ -1519,9 +1526,9 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
)
),
mock.call(
- "builders.job_failed,arch={},build=True,builder_name={},"
- "env=test,job_type=RECIPEBRANCHBUILD,region=builder-name,"
- "virtualized=True".format(
+ "builders.failure.job_failed,arch={},build=True,"
+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
+ "region=builder-name,virtualized=True".format(
naked_build.processor.name,
self.builder.name,
)
@@ -1561,9 +1568,9 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
)
),
mock.call(
- "builders.job_failed,arch={},build=True,builder_name={},"
- "env=test,job_type=RECIPEBRANCHBUILD,region=builder-name,"
- "virtualized=True".format(
+ "builders.failure.job_failed,arch={},build=True,"
+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
+ "region=builder-name,virtualized=True".format(
naked_build.processor.name,
self.builder.name,
)
@@ -1588,7 +1595,7 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
self.stats_client.incr.assert_has_calls(
[
mock.call(
- "builders.job_cancelled,arch={},build=True,"
+ "builders.failure.job_cancelled,arch={},build=True,"
"builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
"region=builder-name,virtualized=True".format(
naked_build.processor.name,
@@ -1621,6 +1628,36 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
self.assertIs(None, self.build.builder)
self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
self.assertTrue(self.builder.builderok)
+ self.assertEqual(3, self.stats_client.incr.call_count)
+ self.stats_client.incr.assert_has_calls(
+ [
+ mock.call(
+ "builders.failure.job_reset,arch={},build=True,"
+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
+ "region=builder-name,virtualized=True".format(
+ self.build.processor.name,
+ self.builder.name,
+ )
+ ),
+ mock.call(
+ "build.reset,arch={},builder_name={},env=test,"
+ "job_type=RECIPEBRANCHBUILD,region=builder-name,"
+ "virtualized=True".format(
+ self.build.processor.name,
+ self.builder.name,
+ )
+ ),
+ mock.call(
+ "builders.failure.builder_reset,arch={},build=True,"
+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
+ "region=builder-name,virtualized=True".format(
+ self.build.processor.name,
+ self.builder.name,
+ )
+ ),
+ ]
+ )
+ self.stats_client.incr.reset_mock()
# But if the builder continues to cause trouble, it will be
# disabled.
@@ -1634,16 +1671,59 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
self.assertFalse(self.builder.builderok)
self.assertEqual("failnotes", self.builder.failnotes)
+ self.assertEqual(3, self.stats_client.incr.call_count)
+ self.stats_client.incr.assert_has_calls(
+ [
+ mock.call(
+ "builders.failure.job_reset,arch={},build=True,"
+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
+ "region=builder-name,virtualized=True".format(
+ self.build.processor.name,
+ self.builder.name,
+ )
+ ),
+ mock.call(
+ "build.reset,arch={},builder_name={},env=test,"
+ "job_type=RECIPEBRANCHBUILD,region=builder-name,"
+ "virtualized=True".format(
+ self.build.processor.name,
+ self.builder.name,
+ )
+ ),
+ mock.call(
+ "builders.failure.builder_failed,arch={},build=True,"
+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
+ "region=builder-name,virtualized=True".format(
+ self.build.processor.name,
+ self.builder.name,
+ )
+ ),
+ ]
+ )
+ self.stats_client.incr.reset_mock()
def test_builder_failing_with_no_attached_job(self):
self.buildqueue.reset()
self.builder.failure_count = BUILDER_FAILURE_THRESHOLD
+ self.stats_client.incr.reset_mock()
log = self._recover_failure("failnotes")
self.assertIn("with no job", log)
self.assertIn("Failing builder", log)
self.assertFalse(self.builder.builderok)
self.assertEqual("failnotes", self.builder.failnotes)
+ self.assertEqual(1, self.stats_client.incr.call_count)
+ self.stats_client.incr.assert_has_calls(
+ [
+ mock.call(
+ "builders.failure.builder_failed,builder_name={},"
+ "env=test,region=builder-name,virtualized=True".format(
+ self.builder.name,
+ )
+ ),
+ ]
+ )
+ self.stats_client.incr.reset_mock()
class TestNewBuilders(TestCase):
Follow ups