summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jon duSaint, 2023-07-04 17:43:31 -0700
committer: Jon duSaint, 2023-07-04 17:43:31 -0700
commit: d3b3b102c298a4ffece2983e3c12a0806b3733a3 (patch)
tree: 860a8eebef7b76824eb962bfba20ce498608af02
parent: 8cceff0179daffd6a5a7502b00d1176fb17a59e2 (diff)

jobmon: A simple job monitor

The usual rc scripts don’t seem to restart failed jobs (there might be a way, but I haven’t found it). So, write a small job monitor that restarts jobs of interest.

-rw-r--r-- jobmon/Makefile 12
-rw-r--r-- jobmon/jobmon.c 253
-rw-r--r-- jobmon/jobmon.rc 9
3 files changed, 271 insertions, 3 deletions
diff --git a/jobmon/Makefile b/jobmon/Makefile
index 9fa9050..39756a3 100644
--- a/jobmon/Makefile
+++ b/jobmon/Makefile
@@ -1,11 +1,17 @@
# Pass EXTRA_FLAGS on the make command line to add compiler options (-g, -static, etc.); both build rules append it after CFLAGS.
-all: netmon
+all: jobmon netmon
+jobmon: jobmon.c
+ $(CC) -Wall $(CFLAGS) $(EXTRA_FLAGS) -o $@ $<
netmon: netmon.c
$(CC) -Wall $(CFLAGS) $(EXTRA_FLAGS) -o $@ $<
+install-jobmon: jobmon
+ install -m 0555 jobmon /usr/sbin/jobmon
+ install -m 0555 jobmon.rc /etc/rc.d/jobmon
+ @echo "-> Enable jobmon service manually with rcctl(8)"
install-netmon: netmon
install -m 0555 netmon /usr/sbin/netmon
install -m 0555 netmon.rc /etc/rc.d/netmon
@echo "-> Enable netmon service manually with rcctl(8)"
-install: install-netmon
+install: install-jobmon install-netmon
clean:
- rm -f netmon
+ rm -f jobmon netmon
diff --git a/jobmon/jobmon.c b/jobmon/jobmon.c
new file mode 100644
index 0000000..baed742
--- /dev/null
+++ b/jobmon/jobmon.c
@@ -0,0 +1,253 @@
+/**
+ * Simple job monitor
+ *
+ * Usage:
+ * jobmon [-f] [-v] [jobs]
+ *
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+
+/* Stay attached to the terminal instead of daemonizing (-f) */
+bool foreground = false;
+/* Send log output to syslog rather than stdout/stderr */
+bool use_syslog = false;
+/* Let debug-level messages through (-v) */
+bool verbose = false;
+/*
+ * Main-loop run flag, cleared from the signal handler. sig_atomic_t is
+ * the object type C guarantees can be written safely from a handler.
+ */
+volatile sig_atomic_t keep_going = true;
+
+/* Values correspond to `rcctl check` exit codes */
+enum job_status {
+    job_ok=0, job_stopped=1, job_error=2
+};
+
+
+/**
+ * Emit one log message, either through syslog or to stdout/stderr.
+ *
+ * Debug messages are dropped unless `verbose` is set. With `use_syslog`
+ * set the message is handed to vsyslog(3). Otherwise notice, info and
+ * debug messages go to stdout and everything else to stderr; when
+ * `foreground` is set each line is prefixed with a local timestamp and
+ * a severity label. A newline is appended to every stream message.
+ *
+ * @param[in] priority syslog priority (LOG_DEBUG, LOG_INFO, ...)
+ * @param[in] msg printf-style format string
+ * @param[in] ... arguments consumed by `msg`
+ */
+void
+do_log (int priority, const char *msg, ...) {
+    if (priority == LOG_DEBUG && ! verbose) return;
+
+    va_list ap;
+    va_start (ap, msg);
+
+    if (use_syslog) {
+        vsyslog (priority, msg, ap);
+    } else {
+        FILE *out = (priority == LOG_NOTICE ||
+                     priority == LOG_INFO ||
+                     priority == LOG_DEBUG) ? stdout : stderr;
+        if (foreground) {
+            char buf[4+1+2+1+2 + 1 + 2+1+2+1+2 + 1] = {0}; /* YYYY-MM-DD HH:MM:SS */
+            time_t now = time (NULL);
+            struct tm *tm = localtime (&now);
+            const char *label;
+
+            switch (priority) {
+            case LOG_DEBUG:   label = "debug";   break;
+            case LOG_INFO:    label = "info";    break;
+            case LOG_NOTICE:  label = "notice";  break; /* goes to stdout, so not an error */
+            case LOG_WARNING: label = "warning"; break;
+            default:          label = "error";   break;
+            }
+
+            /* localtime() may fail; leave the timestamp empty rather than pass NULL on */
+            if (tm != NULL) {
+                strftime (buf, sizeof (buf), "%Y-%m-%d %H:%M:%S", tm);
+            }
+            fprintf (out, "%s %s", buf, label);
+        }
+        vfprintf (out, msg, ap);
+        fputc ('\n', out);
+    }
+
+    va_end (ap);
+}
+
+#define debug(m, ...) do_log (LOG_DEBUG, "[%u]: " m, __LINE__, ##__VA_ARGS__)
+#define info(m, ...) do_log (LOG_INFO, "[%u]: " m, __LINE__, ##__VA_ARGS__)
+#define warning(m, ...) do_log (LOG_WARNING, "[%u]: " m, __LINE__, ##__VA_ARGS__)
+#define error(m, ...) do_log (LOG_ERR, "[%u]: " m, __LINE__, ##__VA_ARGS__)
+
+/**
+ * Signal handler installed for SIGINT and SIGTERM: clears the run flag
+ * so the main loop stops.
+ *
+ * @param[in] sig signal number (not used)
+ */
+void
+term (int sig) {
+    (void) sig; /* every signal routed here gets the same treatment */
+    keep_going = false;
+}
+
+/**
+ * Launch an `rcctl` process against a job and wait for it to finish.
+ *
+ * The child runs `/usr/sbin/rcctl <arg> <job>` with stdin, stdout and
+ * stderr redirected to /dev/null.
+ *
+ * @param[in] arg either `"check"` or `"start"`
+ * @param[in] job name of job
+ * @return the exit code of `rcctl` when it is a valid job_status;
+ *         job_error when fork/waitpid fail or the exit code is out of
+ *         range; job_stopped when the child did not exit normally;
+ *         job_ok when the wait was interrupted by a signal
+ */
+enum job_status
+run_rcctl (char *const arg, char *const job) {
+    pid_t pid = fork ();
+
+    if (pid < 0) {
+        error ("fork: %s", strerror (errno));
+        return job_error;
+    }
+
+    if (pid == 0) {
+        char *const args[] = { "/usr/sbin/rcctl", arg, job, NULL };
+
+        int fd = open ("/dev/null", O_RDWR);
+        if (fd >= 0) {
+            dup2 (fd, 0);
+            dup2 (fd, 1);
+            dup2 (fd, 2);
+            if (fd > 2) {
+                close (fd); /* don't leak the spare descriptor into rcctl */
+            }
+        }
+
+        execv (args[0], args);
+        /* Only reached when execv fails */
+        error ("exec(rcctl) failed: %s", strerror (errno));
+        /* _exit: the inherited stdio buffers belong to the parent, don't flush them twice */
+        _exit (job_error);
+    }
+
+    int status;
+    pid_t w = waitpid (pid, &status, 0);
+    if (w < 0 && errno == EINTR) {
+        return job_ok; /* most likely have received signal to exit, so do less work */
+    }
+    if (w <= 0) {
+        error ("waitpid(%d): %s", (int) pid, strerror (errno));
+        return job_error;
+    }
+    if (! WIFEXITED (status)) {
+        return job_stopped;
+    }
+
+    /* Anything outside the known rcctl exit codes is treated as an error */
+    int code = WEXITSTATUS (status);
+    if (code >= job_ok && code <= job_error) {
+        return (enum job_status) code;
+    }
+    return job_error;
+}
+
+#define check_job(job) run_rcctl ("check", (job)) /* runs `rcctl check <job>` and yields its job_status */
+#define start_job(job) run_rcctl ("start", (job)) /* runs `rcctl start <job>` and yields its job_status */
+
+/**
+ * Write the usage line to stdout, then terminate the process.
+ *
+ * @param[in] ec exit code handed to exit(3)
+ */
+void
+help (int ec) {
+    static const char usage[] =
+        "usage: jobmon [-f|--foreground] [-v|--verbose] [job ...]\n";
+
+    fputs (usage, stdout);
+    exit (ec);
+}
+
+/**
+ * Log the names of the jobs that checked out fine as one
+ * comma-separated info line.
+ *
+ * @param[in] ok_jobs names of the healthy jobs
+ * @param[in] ok number of entries in `ok_jobs` (must be > 0)
+ * @param[in] bytes buffer size needed: sum of strlen(name) + 1
+ */
+static void
+report_ok_jobs (char **ok_jobs, int ok, size_t bytes) {
+    char *buf = calloc (bytes, sizeof (char));
+    if (buf == NULL) {
+        warning ("unable to allocate memory: %s", strerror (errno));
+        return;
+    }
+
+    /* Names are appended from the last entry to the first */
+    while (--ok >= 0) {
+        strlcat (buf, ok_jobs[ok], bytes);
+        if (ok) strlcat (buf, ",", bytes);
+    }
+    info ("%s ok", buf);
+    free (buf);
+}
+
+/**
+ * Entry point: parse options, optionally daemonize, restrict the
+ * process with unveil/pledge, then poll every job with `rcctl check`
+ * until SIGINT/SIGTERM arrives or no valid job is left. Stopped jobs
+ * are restarted; jobs reported as errors are dropped from the list.
+ *
+ * @return 0 on a normal shutdown; exits with EXIT_FAILURE on setup errors
+ */
+int
+main (int argc, char *argv[]) {
+    struct option options[] = {
+        { "verbose", no_argument, NULL, 'v' },
+        { "foreground", no_argument, NULL, 'f' },
+        { "help", no_argument, NULL, 'h' },
+        { 0, 0, 0, 0 }
+    };
+    unsigned int sleep_interval = 10;
+    unsigned int logged_every = 0;
+    unsigned int log_every = 60;
+    int opt;
+
+    while ((opt = getopt_long (argc, argv, "vfh", options, NULL)) != -1) {
+        switch (opt) {
+        case 'v':
+            verbose = true;
+            sleep_interval = 1;
+            log_every = 4;
+            break;
+        case 'f':
+            foreground = true;
+            break;
+        case 'h':
+            help (EXIT_SUCCESS);
+            break;
+        default:
+            help (EXIT_FAILURE);
+        }
+    }
+
+    /*
+     * Count first, allocate second: calloc(0, ...) may legitimately
+     * return NULL, which must not be reported as an allocation failure.
+     */
+    int job_count = argc - optind;
+    if (job_count <= 0) {
+        info ("no jobs");
+        exit (0);
+    }
+
+    char **jobs = calloc ((size_t) job_count, sizeof (char *));
+    char **ok_jobs = calloc ((size_t) job_count, sizeof (char *));
+    if (jobs == NULL || ok_jobs == NULL) {
+        error ("unable to allocate memory: %s", strerror (errno));
+        exit (EXIT_FAILURE);
+    }
+
+    for (int i = 0; i < job_count; i++) {
+        jobs[i] = argv[optind + i];
+    }
+
+    if (! foreground) {
+        if (daemon (0, 0) < 0) {
+            error ("failed to daemonize: %s", strerror (errno));
+            exit (EXIT_FAILURE);
+        }
+        openlog ("jobmon", 0, LOG_DAEMON);
+        use_syslog = true;
+    }
+
+    struct sigaction sa = { 0 };
+    sa.sa_handler = term;
+    if (sigaction (SIGINT, &sa, NULL) < 0 ||
+        sigaction (SIGTERM, &sa, NULL) < 0) {
+        warning ("failed to install signal handler: %s", strerror (errno));
+    }
+
+    if (unveil ("/usr/sbin/rcctl", "x") < 0 ||
+        unveil ("/dev/null", "rw") < 0 ||
+        unveil (NULL, NULL) < 0) {
+        error ("unveil failed: %s", strerror (errno));
+        exit (EXIT_FAILURE);
+    }
+    if (pledge ("proc exec stdio", NULL) < 0) {
+        error ("pledge failed: %s", strerror (errno));
+        exit (EXIT_FAILURE);
+    }
+
+    while (keep_going) {
+        int ok = 0;
+        size_t bytes = 0;
+
+        for (int j = 0; j < job_count && keep_going; j++) {
+            enum job_status job_status = check_job (jobs[j]);
+
+            if (job_status == job_ok) {
+                ok_jobs[ok++] = jobs[j];
+                bytes += strlen (jobs[j]) + 1;
+            } else if (job_status == job_stopped) {
+                info ("restarting %s", jobs[j]);
+                if (start_job (jobs[j]) != job_ok) {
+                    warning ("failed to restart %s", jobs[j]);
+                }
+            } else {
+                info ("invalid job, removing from list %s", jobs[j]);
+                if (j < job_count - 1) {
+                    memmove (&jobs[j], &jobs[j + 1], sizeof (char *) * (job_count - j - 1));
+                }
+                job_count--;
+                j--; /* revisit this slot, it now holds the next job */
+            }
+        }
+
+        /* Nothing left to watch, or asked to stop: leave without sleeping */
+        if (job_count == 0 || ! keep_going) {
+            break;
+        }
+
+        if ((logged_every++ % log_every) == 0 && ok) {
+            /* Reduce logging when all is well */
+            report_ok_jobs (ok_jobs, ok, bytes);
+        }
+
+        sleep (sleep_interval);
+    }
+
+    free (ok_jobs);
+    free (jobs);
+    info ("exiting");
+
+    if (use_syslog) {
+        closelog ();
+    }
+
+    return 0;
+}
diff --git a/jobmon/jobmon.rc b/jobmon/jobmon.rc
new file mode 100644
index 0000000..ea1c291
--- /dev/null
+++ b/jobmon/jobmon.rc
@@ -0,0 +1,9 @@
+#!/bin/ksh
+
+daemon="/usr/sbin/jobmon" # same path the Makefile's install-jobmon target installs the binary to
+
+rc_reload=NO # NOTE(review): presumably turns off the rc.subr "reload" action - confirm against rc.subr(8)
+
+. /etc/rc.d/rc.subr # source the shared rc.d helpers (presumably where rc_cmd comes from - confirm)
+
+rc_cmd $1 # hand the script's first argument to rc_cmd