diff options
| author | Jon duSaint | 2023-07-04 17:43:31 -0700 |
|---|---|---|
| committer | Jon duSaint | 2023-07-04 17:43:31 -0700 |
| commit | d3b3b102c298a4ffece2983e3c12a0806b3733a3 (patch) | |
| tree | 860a8eebef7b76824eb962bfba20ce498608af02 | |
| parent | 8cceff0179daffd6a5a7502b00d1176fb17a59e2 (diff) | |
jobmon: A simple job monitor
The usual rc scripts don’t seem to restart failed jobs (there might be a way, but I haven’t found it). So, write a small job monitor that restarts jobs of interest.
| -rw-r--r-- | jobmon/Makefile | 12 | ||||
| -rw-r--r-- | jobmon/jobmon.c | 253 | ||||
| -rw-r--r-- | jobmon/jobmon.rc | 9 |
3 files changed, 271 insertions, 3 deletions
diff --git a/jobmon/Makefile b/jobmon/Makefile
index 9fa9050..39756a3 100644
--- a/jobmon/Makefile
+++ b/jobmon/Makefile
@@ -1,11 +1,17 @@
 # Use EXTRA_FLAGS on the command line for additional options (-g, -static, etc.)
-all: netmon
+all: jobmon netmon
+jobmon: jobmon.c
+	$(CC) -Wall $(CFLAGS) $(EXTRA_FLAGS) -o $@ $<
 netmon: netmon.c
 	$(CC) -Wall $(CFLAGS) $(EXTRA_FLAGS) -o $@ $<
+install-jobmon: jobmon
+	install -m 0555 jobmon /usr/sbin/jobmon
+	install -m 0555 jobmon.rc /etc/rc.d/jobmon
+	@echo "-> Enable jobmon service manually with rcctl(8)"
 install-netmon: netmon
 	install -m 0555 netmon /usr/sbin/netmon
 	install -m 0555 netmon.rc /etc/rc.d/netmon
 	@echo "-> Enable netmon service manually with rcctl(8)"
-install: install-netmon
+install: install-jobmon install-netmon
 clean:
-	rm -f netmon
+	rm -f jobmon netmon
diff --git a/jobmon/jobmon.c b/jobmon/jobmon.c
new file mode 100644
index 0000000..baed742
--- /dev/null
+++ b/jobmon/jobmon.c
@@ -0,0 +1,308 @@
+/**
+ * Simple job monitor
+ *
+ * Periodically runs `rcctl check` against each named job and runs
+ * `rcctl start` for any job reported as stopped.
+ *
+ * Usage:
+ *   jobmon [-f] [-v] [job ...]
+ *
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+
+static bool foreground = false; /* -f: do not daemonize */
+static bool use_syslog = false; /* set after daemonizing */
+static bool verbose = false;    /* -v: emit LOG_DEBUG messages */
+
+/* Cleared by the signal handler (and when no jobs remain) to stop the
+   main loop.  volatile sig_atomic_t is the type ISO C guarantees a
+   signal handler may safely assign to. */
+static volatile sig_atomic_t keep_going = 1;
+
+/* Values correspond to `rcctl check` exit codes */
+enum job_status {
+    job_ok=0, job_stopped=1, job_error=2
+};
+
+/**
+ * Map a syslog priority to the label printed in foreground mode.
+ */
+static const char *
+level_name (int priority) {
+    switch (priority) {
+    case LOG_DEBUG:   return "debug";
+    case LOG_INFO:    return "info";
+    case LOG_NOTICE:  return "notice";
+    case LOG_WARNING: return "warning";
+    default:          return "error";
+    }
+}
+
+/**
+ * Log a printf-style message.
+ *
+ * With use_syslog set the message goes to syslog.  Otherwise
+ * LOG_NOTICE, LOG_INFO and LOG_DEBUG go to stdout and everything else
+ * to stderr, prefixed with a timestamp and level when foreground is set.
+ * LOG_DEBUG messages are dropped unless verbose is set.
+ *
+ * @param[in] priority syslog priority
+ * @param[in] msg printf-style format string, followed by its arguments
+ */
+static void
+do_log (int priority, const char *msg, ...) {
+    if (priority == LOG_DEBUG && ! verbose) return;
+
+    va_list ap;
+    va_start (ap, msg);
+
+    if (use_syslog) {
+        vsyslog (priority, msg, ap);
+    } else {
+        FILE *out = (priority == LOG_NOTICE ||
+                     priority == LOG_INFO ||
+                     priority == LOG_DEBUG) ? stdout : stderr;
+        if (foreground) {
+            char buf[4+1+2+1+2 + 1 + 2+1+2+1+2 + 1] = {0}; /* YYYY-MM-DD HH:MM:SS */
+            time_t now = time (NULL);
+            struct tm *tm = localtime (&now);
+            if (tm != NULL) { /* on failure print an empty timestamp instead of crashing */
+                strftime (buf, sizeof (buf), "%Y-%m-%d %H:%M:%S", tm);
+            }
+            fprintf (out, "%s %s", buf, level_name (priority));
+        }
+        vfprintf (out, msg, ap);
+        fputc ('\n', out);
+    }
+
+    va_end (ap);
+}
+
+#define debug(m, ...) do_log (LOG_DEBUG, "[%u]: " m, __LINE__, ##__VA_ARGS__)
+#define info(m, ...) do_log (LOG_INFO, "[%u]: " m, __LINE__, ##__VA_ARGS__)
+#define warning(m, ...) do_log (LOG_WARNING, "[%u]: " m, __LINE__, ##__VA_ARGS__)
+#define error(m, ...) do_log (LOG_ERR, "[%u]: " m, __LINE__, ##__VA_ARGS__)
+
+/**
+ * Handler for SIGINT and SIGTERM: ask the main loop to stop.
+ */
+static void
+term (int sig) {
+    (void) sig;
+    keep_going = 0;
+}
+
+/**
+ * Launch an `rcctl` process against a job and wait for it to finish.
+ *
+ * @param[in] arg either `"check"` or `"start"`
+ * @param[in] job name of job
+ * @return the rcctl exit code when it is a valid job_status; job_error
+ *         for any other exit code or if fork/waitpid fails; job_stopped
+ *         if rcctl did not exit normally; job_ok if the wait was
+ *         interrupted by a signal
+ */
+static enum job_status
+run_rcctl (char *const arg, char *const job) {
+    pid_t pid = fork ();
+
+    if (pid < 0) {
+        error ("fork: %s", strerror (errno));
+        return job_error;
+    }
+
+    if (pid == 0) {
+        char *const args[] = { "/usr/sbin/rcctl", arg, job, NULL };
+
+        /* Point the child's stdin, stdout and stderr at /dev/null */
+        int fd = open ("/dev/null", O_RDWR);
+        if (fd >= 0) {
+            dup2 (fd, 0);
+            dup2 (fd, 1);
+            dup2 (fd, 2);
+            if (fd > 2) close (fd); /* don't leak the extra descriptor into rcctl */
+        }
+
+        execv (args[0], args);
+        error ("exec(rcctl) failed: %s", strerror (errno));
+        _exit (job_error); /* _exit: skip atexit handlers and stdio state copied from the parent */
+    }
+
+    int status;
+    pid_t w = waitpid (pid, &status, 0);
+    if (w < 0 && errno == EINTR) {
+        return job_ok; /* most likely have received signal to exit, so do less work */
+    } else if (w <= 0) {
+        error ("waitpid(%d): %s", pid, strerror (errno));
+        return job_error;
+    } else if (WIFEXITED (status)) {
+        if (WEXITSTATUS (status) >= job_ok && WEXITSTATUS (status) <= job_error) {
+            return WEXITSTATUS (status);
+        }
+        return job_error;
+    }
+
+    return job_stopped;
+}
+
+#define check_job(job) run_rcctl ("check", (job))
+#define start_job(job) run_rcctl ("start", (job))
+
+/**
+ * Print a help message and exit.
+ *
+ * @param[in] ec exit code for the process
+ */
+static void
+help (int ec) {
+    printf ("usage: jobmon [-f|--foreground] [-v|--verbose] [job ...]\n");
+    exit (ec);
+}
+
+/**
+ * Parse options, daemonize unless -f is given, restrict the process
+ * with unveil(2) and pledge(2), then poll the listed jobs until SIGINT
+ * or SIGTERM arrives or no jobs remain.
+ */
+int
+main (int argc, char *argv[]) {
+    struct option options[] = {
+        { "verbose", no_argument, NULL, 'v' },
+        { "foreground", no_argument, NULL, 'f' },
+        { "help", no_argument, NULL, 'h' },
+        { 0, 0, 0, 0 }
+    };
+    char **jobs;
+    int job_count, i, opt;
+    unsigned int sleep_interval = 10;
+    unsigned int logged_every = 0;
+    unsigned int log_every = 60;
+
+    while ((opt = getopt_long (argc, argv, "vfh", options, NULL)) != -1) {
+        switch (opt) {
+        case 'v':
+            verbose = true;
+            sleep_interval = 1;
+            log_every = 4;
+            break;
+        case 'f':
+            foreground = true;
+            break;
+        case 'h':
+            help (EXIT_SUCCESS);
+            break;
+        default:
+            help (EXIT_FAILURE);
+        }
+    }
+
+    /* Bail out before allocating: calloc (0, ...) is allowed to return NULL */
+    job_count = argc - optind;
+    if (job_count <= 0) {
+        info ("no jobs");
+        exit (0);
+    }
+
+    jobs = calloc (job_count, sizeof (char *));
+    if (jobs == NULL) {
+        error ("unable to allocate memory: %s", strerror (errno));
+        exit (EXIT_FAILURE);
+    }
+
+    for (i = 0; i < job_count; i++) {
+        jobs[i] = argv[optind + i];
+    }
+
+    if (! foreground) {
+        if (daemon (0, 0) < 0) {
+            error ("failed to daemonize: %s", strerror (errno));
+            exit (EXIT_FAILURE);
+        }
+        openlog ("jobmon", 0, LOG_DAEMON);
+        use_syslog = true;
+    }
+
+    struct sigaction sa = { 0 };
+    sa.sa_handler = term;
+    if (sigaction (SIGINT, &sa, NULL) < 0 ||
+        sigaction (SIGTERM, &sa, NULL) < 0) {
+        warning ("failed to install signal handler: %s", strerror (errno));
+    }
+
+    /* Expose only rcctl (execute) and /dev/null (read/write), then lock unveil */
+    if (unveil ("/usr/sbin/rcctl", "x") < 0 ||
+        unveil ("/dev/null", "rw") < 0 ||
+        unveil (NULL, NULL) < 0) {
+        error ("unveil failed: %s", strerror (errno));
+        exit (EXIT_FAILURE);
+    }
+    if (pledge ("proc exec stdio", NULL) < 0) {
+        error ("pledge failed: %s", strerror (errno));
+        exit (EXIT_FAILURE);
+    }
+
+    while (keep_going) {
+        char *ok_jobs[job_count];
+        int ok = 0;
+        size_t bytes = 0; /* room to join the ok names: each name plus one ',' or the final NUL */
+
+        for (int j = 0; j < job_count && keep_going; j++) {
+            enum job_status job_status = check_job (jobs[j]);
+
+            if (job_status == job_ok) {
+                ok_jobs[ok++] = jobs[j];
+                bytes += strlen (jobs[j]) + 1;
+            } else if (job_status == job_stopped) {
+                info ("restarting %s", jobs[j]);
+                start_job (jobs[j]);
+            } else {
+                info ("invalid job, removing from list %s", jobs[j]);
+                if (j < job_count - 1) {
+                    memmove (&jobs[j], &jobs[j + 1], sizeof (char *) * (job_count - j - 1));
+                }
+                j--; /* stay on this index: it now holds the next job */
+                if (--job_count == 0) keep_going = 0; /* nothing left to monitor */
+            }
+        }
+
+        if ((logged_every++ % log_every) == 0 && ok) {
+            /* Reduce logging when all is well */
+            char *buf = calloc (bytes, sizeof (char));
+            if (buf == NULL) {
+                warning ("unable to allocate memory: %s", strerror (errno));
+            } else {
+                while (--ok >= 0) {
+                    strlcat (buf, ok_jobs[ok], bytes);
+                    if (ok) strlcat (buf, ",", bytes);
+                }
+                info ("%s ok", buf);
+                free (buf);
+            }
+        }
+
+        /* Skip the sleep once told to stop, so shutdown is not delayed by a full interval */
+        if (keep_going) sleep (sleep_interval);
+    }
+
+    free (jobs);
+    info ("exiting");
+
+    if (use_syslog) {
+        closelog ();
+    }
+
+    return 0;
+}
diff --git a/jobmon/jobmon.rc b/jobmon/jobmon.rc
new file mode 100644
index 0000000..ea1c291
--- /dev/null
+++ b/jobmon/jobmon.rc
@@ -0,0 +1,9 @@
+#!/bin/ksh
+
+daemon="/usr/sbin/jobmon"
+
+rc_reload=NO
+
+. /etc/rc.d/rc.subr
+
+rc_cmd $1
