← Back to team overview

elementaryart team mailing list archive

[Merge] lp:~victored/cerbere/improvements into lp:cerbere

 

Victor Eduardo has proposed merging lp:~victored/cerbere/improvements into lp:cerbere.

Requested reviews:
  elementary Pantheon team (elementary-pantheon)
Related bugs:
  Bug #905434 in Cerbere: "Cerbere is unable to execute applications with arguments/options"
  https://bugs.launchpad.net/cerbere/+bug/905434

For more details, see:
https://code.launchpad.net/~victored/cerbere/improvements/+merge/109762

I've been testing the performance and reliability of the current Cerbere and I'm not satisfied with the results. Here's what I found:

1) Some processes are not respawned correctly. For instance, wingpanel is not even respawned most of the time!
2) Processes are spawned sequentially (this increases the startup time of the Pantheon desktop.)
3) The process' crash-count is increased even if the crashes didn't occur consecutively, resulting in that process not being monitored anymore. This is likely to happen after many days of activity.
4) No support for command-line arguments. This prevents us from adding stuff like 'slingshot --silent' to the process monitor.
5) Settings are only read during startup. If you change something, you need to restart cerbere (for most users this is a  "logout" -> "login" operation) in order to use the new settings. What should happen:
   - The keys "crash-time" and "crash-time-interval" should be always in sync with the app. If you change their values, cerbere should start using the new values right away.
   - If the values of "desktop-components" are changed, cerbere should apply immediately the new settings. This means:
     - If a program was removed from the list, stop monitoring it right away
     - If a new program was added to the list, launch it and start monitoring it immediately

While these issues don't seem to be important, they certainly are when you have your computer turned on for more than 10 days (based on the crash frequency of some components).

Other issues:
6) Settings are located under org.pantheon.cerbere.settings instead of just org.pantheon.cerbere
7) Cerbere is more like a multi-purpose app, since you can pretty much add anything to it. Therefore, the processes key should be renamed from "desktop-components" to something like "monitored-proceses"
8) Inconvenient variant types on keys "crash-time" and "max-crashes". (Should be uint instead of int, since setting negative values breaks cerbere)


This branch fixes all of the issues mentioned above.
-- 
https://code.launchpad.net/~victored/cerbere/improvements/+merge/109762
Your team elementaryart (old) is subscribed to branch lp:cerbere.
=== modified file 'INSTALL'
--- INSTALL	2012-04-29 01:11:54 +0000
+++ INSTALL	2012-06-12 01:47:19 +0000
@@ -1,5 +1,6 @@
+# cd to the cerbere folder
 mkdir build
 cd build
-cmake ..
+cmake .. -DCMAKE_INSTALL_PREFIX=/usr
 make
 sudo make install

=== modified file 'org.pantheon.cerbere.gschema.xml'
--- org.pantheon.cerbere.gschema.xml	2012-04-30 19:18:34 +0000
+++ org.pantheon.cerbere.gschema.xml	2012-06-12 01:47:19 +0000
@@ -1,22 +1,20 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <schemalist>
-    <schema path="/org/pantheon/cerbere/settings/" id="org.pantheon.cerbere.settings" gettext-domain="cerbere">
-        <key type="as" name="desktop-components">
+    <schema path="/org/pantheon/cerbere/" id="org.pantheon.cerbere" gettext-domain="cerbere">
+        <key type="as" name="monitored-processes">
             <default>['wingpanel','plank','/usr/lib/notify-osd/notify-osd']</default>
-            <summary>List of desktop components</summary>
-            <description>These applications will be launched and watched by cerbere and autorestarted automagically</description>
+            <summary>List of monitored components</summary>
+            <description>These applications will be launched and watched by Cerbere, and autorestarted automagically in case they exit</description>
         </key>
-
-        <key type="i" name="max-crashes">
+        <key type="u" name="max-crashes">
             <default>5</default>
-            <summary>Maximum permitted times to relaunch a process which crashes</summary>
-            <description>Maximum permitted times to relaunch a process which crashes. Cerbere will stop trying after this number is reached.</description>
-        </key>
-
-        <key type="d" name="crash-time">
-            <default>10.0</default>
-            <summary>Time in which a process is considered crashed</summary>
-            <description>(in seconds)</description>
-        </key>        
+            <summary>Maximum consecutive crashes permitted</summary>
+            <description>Maximum consecutive crashes permitted. Cerbere will stop trying after this value is reached.</description>
+        </key>
+        <key type="u" name="crash-time-interval">
+            <default>10000</default>
+            <summary>Time in which a process is considered crashed (in milliseconds, 1/1000ths of a second)</summary>
+            <description>If a process is started, and then it exits in less time than the value of this key, it is considered crashed.</description>
+        </key>
 	</schema>
 </schemalist>

=== modified file 'src/CMakeLists.txt'
--- src/CMakeLists.txt	2012-02-11 21:34:28 +0000
+++ src/CMakeLists.txt	2012-06-12 01:47:19 +0000
@@ -2,10 +2,9 @@
 
 find_package(Vala REQUIRED)
 include(ValaVersion)
-ensure_vala_version("0.10" MINIMUM)
+ensure_vala_version("0.14" MINIMUM)
 include(ValaPrecompile)
 
-
 # pkgconfig, real C code
 find_package(PkgConfig)
 pkg_check_modules(DEPS REQUIRED glib-2.0 gio-2.0 gee-1.0)
@@ -19,13 +18,19 @@
 
 
 add_definitions(${CFLAGS})
-vala_precompile(VALA_C Cerbere.vala Watchdog.vala ProcessTimer.vala
+vala_precompile (
+VALA_C
+    Cerbere.vala
+    SettingsManager.vala
+    Watchdog.vala
+    ProcessInfo.vala
 PACKAGES
     glib-2.0
     gio-2.0
     gee-1.0
 OPTIONS
-    --thread)
+    --target-glib=2.32
+    ${ADD_OPTIONS})
 
 add_executable(cerbere
     ${VALA_C} )

=== modified file 'src/Cerbere.vala'
--- src/Cerbere.vala	2012-04-30 19:18:34 +0000
+++ src/Cerbere.vala	2012-06-12 01:47:19 +0000
@@ -1,3 +1,4 @@
+/* -*- Mode: vala; indent-tabs-mode: nil; tab-width: 4 -*- */
 /*
  * Cerbere.vala
  * This file is part of cerbere, a watchdog for the Pantheon Desktop
@@ -21,59 +22,41 @@
  *
  * Authors: Allen Lowe <allen@xxxxxxxxxxxxxxxx>
  *          ammonkey <am.monkeyd@xxxxxxxxx>
+ *          Victor Eduardo <victoreduardm@xxxxxxxxx>
  */
 
-using GLib;
-
-namespace Cerbere {
-
-    public class Cerbere : Application {
-
+public class Cerbere : GLib.Application {
+
+    public static SettingsManager settings { get; private set; default = null; }
     private Watchdog watchdog;
 
-        private double crash_time;
-        private int max_crashes;
-
-        private string[] desktop_bins;
-
-        construct {
-            application_id = "org.pantheon.cerbere";
-            flags = GLib.ApplicationFlags.IS_SERVICE;
-        }
-
-        protected override void startup () {
-            load_config ();
-
-            // Start watchdog
-            watchdog = new Watchdog (crash_time, max_crashes);
-
-            start_desktop ();
-
-            var main_loop = new MainLoop ();
-            main_loop.run ();
-        }
-
-        void load_config () {
-            var settings = new Settings ("org.pantheon.cerbere.settings");
-            desktop_bins = settings.get_strv ("desktop-components");
-            max_crashes = settings.get_int ("max-crashes");
-            crash_time = settings.get_double ("crash-time");
-        }
-
-        void start_desktop () {
-            foreach (string bin in desktop_bins) {
-                if (bin != null) {
-                    watchdog.watch_process (bin);
-                }
-            }
-        }
-
-        public static int main (string[] args) {
-            var app = new Cerbere ();
-            app.run (args);
-
-            return 0;
-        }
+    construct {
+        application_id = "org.pantheon.cerbere";
+        flags = GLib.ApplicationFlags.IS_SERVICE;
+    }
+
+    protected override void startup () {
+        this.settings = new SettingsManager ();
+
+        // Start watchdog
+        this.watchdog = new Watchdog ();
+        this.start_processes (this.settings.process_list);
+
+        // Monitor changes
+        this.settings.process_list_changed.connect (this.start_processes);
+
+        var main_loop = new MainLoop ();
+        main_loop.run ();
+    }
+
+    private void start_processes (string[] process_list) {
+        foreach (string cmd in process_list) {
+            this.watchdog.add_process_async (cmd);
+        }
+    }
+
+    public static int main (string[] args) {
+        var app = new Cerbere ();
+        return app.run (args);
     }
 }
-

=== added file 'src/ProcessInfo.vala'
--- src/ProcessInfo.vala	1970-01-01 00:00:00 +0000
+++ src/ProcessInfo.vala	2012-06-12 01:47:19 +0000
@@ -0,0 +1,152 @@
+/* -*- Mode: vala; indent-tabs-mode: nil; tab-width: 4 -*- */
+/*
+ * Copyright (C) 2012 Victor Eduardo <victoreduardm@xxxxxxxxx>
+ *
+ * Cerbere is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Cerbere is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Cerbere; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA  02110-1301  USA
+ *
+ * Authors: Victor Eduardo <victoreduardm@xxxxxxxxx>
+ */
+
+public class ProcessInfo : GLib.Object {
+
+    public signal void exited (bool normal_exit);
+    //public signal void started ();
+
+    public enum Status {
+        INACTIVE,  // not yet spawned
+        RUNNING,   // active (already spawned)
+        TERMINATED // killed/exited
+    }
+
+    private string command = "";
+    private GLib.Pid pid = -1;
+
+    public Status status = Status.INACTIVE;
+    public int exit_count { get; private set; default = 0; }
+    public int crash_count { get; private set; default = 0; }
+
+    private GLib.Timer? timer = null;
+
+    public ProcessInfo (string command) {
+        this.command = command;
+    }
+
+    public void reset_crash_count () {
+        debug ("RESETTING crash count of '%s' to 0 (normal exit)", this.command);
+        this.crash_count = 0;
+    }
+
+    public async void run_async () {
+        this.run ();
+    }
+
+    public void run () {
+        message ("STARTING process: %s", command);
+
+        if (this.status == Status.RUNNING) {
+            message ("PROCESS %s is already running", command);
+            return;
+        }
+
+        GLib.Pid process_id;
+
+        var flags = GLib.SpawnFlags.SEARCH_PATH |
+                     GLib.SpawnFlags.DO_NOT_REAP_CHILD |
+                     GLib.SpawnFlags.STDOUT_TO_DEV_NULL; // discard process output
+
+        // parse args
+        string[] argvp = null;
+        try {
+            GLib.Shell.parse_argv (this.command, out argvp);
+        }
+        catch (GLib.ShellError error) {
+            warning ("Not passing any args to %s : %s", this.command, error.message);
+            argvp = {this.command, null}; // fix value in case it's corrupted
+        }
+
+        if (argvp == null)
+            return;
+
+        // Spawn process asynchronously
+        try {
+            GLib.Process.spawn_async (null, argvp, null, flags, null, out process_id);
+        }
+        catch (GLib.Error err) {
+            warning (err.message);
+            return;
+        }
+
+        // time starts counting here
+        this.timer = new GLib.Timer ();
+
+        this.pid = process_id;
+        this.status = Status.RUNNING;
+
+        // Emit signal
+        //this.started ();
+
+        // Add watch
+        GLib.ChildWatch.add (this.pid, (pid, status) => {
+            if (pid != this.pid)
+                return;
+
+            message ("Process '%s' has been closed (ChildWatch exit)", command);
+            // Check exit status
+            if (GLib.Process.if_exited (status) || GLib.Process.if_signaled (status) ||
+                GLib.Process.core_dump (status))
+            {
+                this.terminate ();
+            }
+        });
+    }
+
+    public void terminate () {
+        if (this.status != Status.RUNNING)
+            return;
+
+        message ("Process %s is being terminated", command);
+
+        GLib.Process.close_pid (this.pid);
+
+        bool is_crash = false;
+
+        if (this.timer != null) {
+            this.timer.stop ();
+
+            double ellapsed_secs = this.timer.elapsed ();
+            double crash_time_interval_secs = (double)Cerbere.settings.crash_time_interval / 1000.0;
+
+            debug ("Elapsed time = %f secs", ellapsed_secs);
+            debug ("Min allowed time = %f secs", crash_time_interval_secs);
+
+            if (ellapsed_secs <= crash_time_interval_secs) // process crashed
+                is_crash = true;
+        }
+
+        if (is_crash) {
+            this.crash_count ++;
+            message ("Process '%s' crashed", this.command);
+        }
+
+        this.exit_count ++;
+        this.status = Status.TERMINATED;
+
+        this.timer = null;
+
+        // Emit signal
+        this.exited (!is_crash);
+    }   
+}

=== removed file 'src/ProcessTimer.vala'
--- src/ProcessTimer.vala	2012-02-11 21:34:28 +0000
+++ src/ProcessTimer.vala	1970-01-01 00:00:00 +0000
@@ -1,48 +0,0 @@
-/*
- * ProcessTimer.vala
- * This file is part of cerbere, a watchdog for the Pantheon Desktop
- *
- * Copyright (C) 2011-2012 - Allen Lowe
- *
- * Cerbere is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * Cerbere is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Cerbere; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor,
- * Boston, MA  02110-1301  USA
- *
- * Authors: Victor Eduardo <victoreduardm@xxxxxxxxx>
- */
-
-public class ProcessTimer {
-
-    private GLib.Timer timer;
-
-    public double elapsed {
-        get {
-            return timer.elapsed();
-        }
-    }
-
-    public ProcessTimer () {
-        // time starts counting here
-        timer = new GLib.Timer ();
-    }
-
-    public void reset () {
-        timer.start ();
-    }
-
-    public void stop () {
-        timer.stop ();
-    }
-}
-

=== added file 'src/SettingsManager.vala'
--- src/SettingsManager.vala	1970-01-01 00:00:00 +0000
+++ src/SettingsManager.vala	2012-06-12 01:47:19 +0000
@@ -0,0 +1,52 @@
+/* -*- Mode: vala; indent-tabs-mode: nil; tab-width: 4 -*- */
+/*
+ * Copyright (C) 2012 Victor Eduardo <victoreduardm@xxxxxxxxx>
+ *
+ * Cerbere is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Cerbere is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Cerbere; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA  02110-1301  USA
+ *
+ * Authors: Victor Eduardo <victoreduardm@xxxxxxxxx>
+ */
+
+public class SettingsManager : GLib.Object {
+
+    public signal void process_list_changed (string[] new_values);
+
+    static const string SETTINGS_PATH = "org.pantheon.cerbere";
+
+    static const string MAX_CRASHES_KEY = "max-crashes";
+    static const string CRASH_TIME_INTERVAL_KEY = "crash-time-interval";
+    static const string MONITORED_PROCESSES_KEY = "monitored-processes";
+
+    public string[] process_list   { get; set; }
+    public uint max_crashes         { get; set; default = 0; }
+    public uint crash_time_interval { get; set; default = 0; }
+
+    private GLib.Settings? settings = null;
+
+    public SettingsManager () {
+        this.settings = new GLib.Settings (SETTINGS_PATH);
+
+        this.settings.bind (MAX_CRASHES_KEY, this, "max-crashes", SettingsBindFlags.DEFAULT);
+        this.settings.bind (CRASH_TIME_INTERVAL_KEY, this, "crash-time-interval", SettingsBindFlags.DEFAULT);
+        this.settings.bind (MONITORED_PROCESSES_KEY, this, "process-list", SettingsBindFlags.DEFAULT);
+
+        this.settings.changed[MONITORED_PROCESSES_KEY].connect (this.on_process_list_changed);
+    }
+
+    private void on_process_list_changed () {
+        this.process_list_changed (this.settings.get_strv (MONITORED_PROCESSES_KEY));
+    }
+}

=== modified file 'src/Watchdog.vala'
--- src/Watchdog.vala	2012-03-09 05:15:32 +0000
+++ src/Watchdog.vala	2012-06-12 01:47:19 +0000
@@ -1,3 +1,4 @@
+/* -*- Mode: vala; indent-tabs-mode: nil; tab-width: 4 -*- */
 /*
  * Watchdog.vala
  * This file is part of cerbere, a watchdog for the Pantheon Desktop
@@ -24,84 +25,74 @@
  *          Victor Eduardo <victoreduardm@xxxxxxxxx>
  */
 
-using GLib.Process;
-
 public class Watchdog {
-
-    private double CRASH_TIME;
-    private int MAX_CRASHES;
-
-    private Gee.HashMap<string, int> pids;
-    private Gee.HashMap<string, ProcessTimer> run_time;
-    private Gee.HashMap<string, int> exit_count;
-    private Gee.HashMap<string, int> crash_count;
-
-    public Watchdog (double crash_time, int max_crashes) {
-        CRASH_TIME = crash_time;
-        MAX_CRASHES = max_crashes;
-        
-        pids = new Gee.HashMap<string, int> ();
-        run_time = new Gee.HashMap<string, ProcessTimer> ();
-        exit_count = new Gee.HashMap<string, int> ();
-        crash_count = new Gee.HashMap<string, int> ();
-    }
-
-    public void watch_process (string bin_name) {
-        Pid pid;
-
-        try {
-            GLib.Process.spawn_async (null, {bin_name, null}, null, SpawnFlags.SEARCH_PATH | SpawnFlags.DO_NOT_REAP_CHILD | SpawnFlags.STDOUT_TO_DEV_NULL | SpawnFlags.STDERR_TO_DEV_NULL, null, out pid);
-        }
-        catch (GLib.Error err) {
-            warning (err.message);
-        }
-
-        stdout.printf("\nPROCESS STARTED:     [ID = %i]  %s", (int)pid, bin_name);
-
-        pids.set (bin_name, (int)pid);
-
-        // Check if we have not created the counters before
-
-        if (!exit_count.has_key(bin_name))
-            exit_count.set (bin_name, 0);
-
-        if (!crash_count.has_key(bin_name))
-            crash_count.set (bin_name, 0);
-
-        // Add and run timer
-        run_time.set(bin_name, new ProcessTimer());
-
-        ChildWatch.add (pid, (a,b) => {
-            on_process_exit (a, b, bin_name);
+    // Contains ALL the processes that are being monitored
+    private Gee.HashMap<string, ProcessInfo> processes;
+    private GLib.Mutex data_lock;
+
+    public Watchdog () {
+        this.processes = new Gee.HashMap<string, ProcessInfo> ();
+    }
+
+    public async void add_process_async (string command) {
+        this.add_process (command);
+    }
+
+    public void add_process (string command) {
+        if (command.strip () == "") // whitespace check
+            return;
+
+        // Check if a process for this command has already been created
+        if (this.processes.has_key (command))
+            return;
+
+        // Create new process
+        var process = new ProcessInfo (command);
+
+        // Add it to the table
+        this.data_lock.lock ();
+        this.processes[command] = process;
+        this.data_lock.unlock ();
+
+        // Exit handler. Respawning occurs here
+        process.exited.connect ( (normal_exit) => {
+            if (normal_exit) {
+                // Reset crash count. We only want to count consecutive crashes, so if a normal exit
+                // is detected, we should reset the counter.
+                process.reset_crash_count ();
+            }
+
+            // if still in the list, relaunch if possible
+            if (command in Cerbere.settings.process_list) {
+                // Check if the process is still present in the table since it could have been removed.
+                if (processes.has_key (command)) {
+                    // Check if the process already exceeded the maximum number of allowed crashes.
+                    uint max_crashes = Cerbere.settings.max_crashes;
+                    if (process.crash_count <= max_crashes) {
+                        message ("RELAUNCHING: %s", command);
+                        process.run (); // Reload right away
+                    }
+                    else {
+                        message ("'%s' exceeded the maximum number of crashes allowed (%s). It won't be launched again", command, max_crashes.to_string ());
+                    }
+                }
+                else {
+                    // If a process is not in the table, it means it wasn't re-launched after it exited, so
+                    // in theory this code is never reached.
+                    message ("You should NEVER get this message. If you're getting it, contact the developers.");
+                }
+            }
+            else {
+                // Remove from the list. At this point the reference count should
+                // drop to 0 and free the process.
+                message ("'%s' is no longer on settings. It will not be monitored anymore", command);
+                this.data_lock.lock ();
+                this.processes.unset (command);
+                this.data_lock.unlock ();
+            }
         });
-    }
-
-
-    private void on_process_exit (Pid pid, int status, string name) {
-        double elapsed_time = run_time.get(name).elapsed;
-        int exit_times = exit_count.get(name), crashes = crash_count.get(name);
-
-        stdout.printf("\nPROCESS TERMINATED:  [ID = %i]  %s [Status = %i]", (int)pid, name, status);
-        stdout.printf(" [Elapsed time = %.2fs]", elapsed_time);
-
-        exit_count.set(name, ++exit_times);
-        Process.close_pid (pid);
-
-        stdout.printf(" [Exit times = %i]", exit_times);
-
-        run_time.unset(name, null);
-        pids.unset (name, null);
-
-        if (elapsed_time <= CRASH_TIME) {
-            crash_count.set(name, ++crashes);
-            stdout.printf(" [CRASH #%i]", crashes);
-        }
-
-        if (crashes <= MAX_CRASHES && (if_exited (status) || if_signaled (status) || core_dump (status)))
-            watch_process (name);
-
-        if (crashes == MAX_CRASHES)
-            stdout.printf("\n\n-- Process '%s' crashed too many times. It won't be launched again.\n\n", name);
+
+        // Run
+        process.run_async ();
     }
 }
-


Follow ups