← Back to team overview

canonical-ubuntu-qa team mailing list archive

[Merge] ~hyask/autopkgtest-cloud:skia/stats_boot_attempts into autopkgtest-cloud:master

 

Skia has proposed merging ~hyask/autopkgtest-cloud:skia/stats_boot_attempts into autopkgtest-cloud:master.

Requested reviews:
  Canonical's Ubuntu QA (canonical-ubuntu-qa)

For more details, see:
https://code.launchpad.net/~hyask/autopkgtest-cloud/+git/autopkgtest-cloud/+merge/467811

New analysis of boot attempts from the log files.
-- 
Your team Canonical's Ubuntu QA is requested to review the proposed merge of ~hyask/autopkgtest-cloud:skia/stats_boot_attempts into autopkgtest-cloud:master.
diff --git a/charms/focal/autopkgtest-web/webcontrol/stats.ipynb b/dev-tools/stats.ipynb
similarity index 73%
rename from charms/focal/autopkgtest-web/webcontrol/stats.ipynb
rename to dev-tools/stats.ipynb
index 2dbaddc..471abcb 100644
--- a/charms/focal/autopkgtest-web/webcontrol/stats.ipynb
+++ b/dev-tools/stats.ipynb
@@ -22,7 +22,9 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "48a388c7",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "%run ./stats.py --collect-stats --download-db --since-days-ago 4 --until-days-ago 0"
@@ -32,7 +34,9 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "3d3540d9",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "import sqlite3\n",
@@ -40,7 +44,7 @@
     "import matplotlib.pyplot as plt\n",
     "\n",
     "# Update this path with the corresponding path to the database you want to analyze\n",
-    "db_path = \"./autopkgtest_2024-06-03 09:55:39.367132_with_stats.db\"\n",
+    "db_path = \"./autopkgtest_2024-06-18 15:20:42.817741_with_stats.db\"\n",
     "\n",
     "db = sqlite3.connect(f\"file:{db_path}?mode=ro\")\n",
     "sqlite3.paramstyle = \"named\""
@@ -73,7 +77,7 @@
    "execution_count": null,
    "id": "d16523b1",
    "metadata": {
-    "scrolled": true
+    "scrolled": false
    },
    "outputs": [],
    "source": [
@@ -99,7 +103,7 @@
     "    # Plot point for each dc-arch over time\n",
     "    for dc in df['datacenter'].sort_values().unique():\n",
     "        dc_data = df[df['datacenter'] == dc]\n",
-    "        plt.plot(dc_data['date'], dc_data['first_boot_time'], label=dc)\n",
+    "        plt.plot(dc_data['date'], dc_data['first_boot_time'], 'o-', label=dc)\n",
     "\n",
     "    # Add some title and labels\n",
     "    plt.title('Time to first boot for each datacenter-arch over time')\n",
@@ -119,10 +123,10 @@
    "id": "2ceba19c",
    "metadata": {},
    "source": [
-    "## Cumulated boot attempts\n",
+    "## Cumulated boot attempts for all datacenters\n",
     "\n",
     "The next cell show the cumulated boot attempts. Sometimes, the `nova` script is unable to reach the VM on first try, but will retry some amount of time, logging the failures. Depending on the particular issues, this can show some boot instabilities, network trouble, or anything.  \n",
-    "Since this graph shows cumulated boot attempts, it actually kinda counts the number of time a job had to retry to boot successfully, since the `boot_attempts` is almost always 1. This graph isn't scaled to the number of runned jobs to get a relative percentage, so this is heavily dependent on the number of actually run jobs. Still, this is somehow useful to get a hunch of instabilities in some particular datacenters.\n",
+    "This graph isn't scaled to the number of run jobs to get a relative percentage, so this is heavily dependent on the number of actually run jobs. Still, this is somewhat useful to get a hunch of instabilities in some particular datacenters.\n",
     "\n",
     "The same kind of tweaking to the query than the previous cell can be done here."
    ]
@@ -131,7 +135,9 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "e4906b31",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "with db as db_con:\n",
@@ -157,7 +163,7 @@
     "    # Plot point for each dc-arch over time\n",
     "    for dc in df['datacenter'].unique():\n",
     "        dc_data = df[df['datacenter'] == dc]\n",
-    "        plt.plot(dc_data['date'], dc_data['boot_attempts'], label=dc)\n",
+    "        plt.plot(dc_data['date'], dc_data['boot_attempts'], 'o-', label=dc)\n",
     "\n",
     "    # Add some title and labels\n",
     "    plt.title('Cumulated boot attempts for each datacenter-arch over time')\n",
@@ -169,6 +175,69 @@
     "    plt.show()\n",
     "    print(df)\n"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "adfb1df3",
+   "metadata": {},
+   "source": [
+    "## Boot attempts distribution for a single datacenter-arch\n",
+    "\n",
+    "When an issue arises on a particular datacenter-arch, this might be useful to get a glimpse at the overall behavior when spawning VMs, and answer questions like \"is it worth it to raise the number of retries to spawn a VM?\".\n",
+    "\n",
+    "Remember that this cell requires you to set the `datacenter` and `arch` at the beginning."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bd15df1d",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "with db as db_con:\n",
+    "    db_con.row_factory = sqlite3.Row\n",
+    "    \n",
+    "    datacenter = \"bos03\"\n",
+    "    arch = \"arm64\"\n",
+    "  \n",
+    "    query = f\"\"\"\n",
+    "    SELECT boot_attempts, substr(tests_stats.run_id, 1, 8) as date\n",
+    "    FROM tests_stats\n",
+    "    JOIN result ON result.run_id=tests_stats.run_id\n",
+    "    JOIN test ON test.id=result.test_id\n",
+    "    WHERE arch = '{arch}' AND datacenter = '{datacenter}'\n",
+    "    ORDER BY date\n",
+    "    \"\"\"\n",
+    "    df = pd.read_sql_query(query, db_con)\n",
+    "    # Get the date as datetime object\n",
+    "    df[\"date\"] = pd.to_datetime(df.date)\n",
+    "    # Get boot_attempts as integers\n",
+    "    df[\"boot_attempts\"] = pd.to_numeric(df.boot_attempts, downcast=\"integer\")\n",
+    "    # Handle NaN\n",
+    "    df = df.fillna(0)\n",
+    "    \n",
+    "    # Display data as a graph\n",
+    "    plt.figure(figsize=(14, 5))\n",
+    "\n",
+    "    # Plot point for each dc-arch over time\n",
+    "    for ba in sorted(df['boot_attempts'].unique()):\n",
+    "        ba_data = df[df['boot_attempts'] == ba]\n",
+    "        ba_data = ba_data.groupby(\"date\").count()\n",
+    "        plt.plot(ba_data, 'o-', drawstyle='steps-post', label=ba)\n",
+    "\n",
+    "    # Add some title and labels\n",
+    "    plt.title(f'Boot attempts counts for {datacenter}-{arch} over time')\n",
+    "    plt.xlabel('Date')\n",
+    "    plt.ylabel('Boot attempts counts')\n",
+    "    plt.legend()\n",
+    "\n",
+    "    # Plot the graph\n",
+    "    plt.show()\n",
+    "    print(df)\n"
+   ]
   }
  ],
  "metadata": {
diff --git a/charms/focal/autopkgtest-web/webcontrol/stats.py b/dev-tools/stats.py
similarity index 100%
rename from charms/focal/autopkgtest-web/webcontrol/stats.py
rename to dev-tools/stats.py