// SPDX-License-Identifier: ISC
// SPDX-FileCopyrightText: 2025 Demi Marie Obenour
// check_posix and check_posix_bool are based on code with following license:
//
// Copyright 2014 Daniel Micay
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#define _GNU_SOURCE 1

#include <assert.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <limits.h>
#include <poll.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/un.h>
#include <sys/wait.h>
#include <sysexits.h>
#include <unistd.h>

/* Number of elements in a true array (not a pointer). */
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* TODO: does this need to have credit given to Daniel Micay? */
/*
 * Returns arg unchanged when it is non-negative.  Otherwise arg must be -1
 * (asserted) and the process exits with EX_OSERR after printing the
 * printf-style message together with the errno description (via verr()).
 */
__attribute__((format(printf, 2, 3)))
intmax_t check_posix(intmax_t arg, const char *fmt, ...)
{
    if (arg >= 0) {
        return arg;
    }
    assert(arg == -1);
    va_list a;
    va_start(a, fmt);
    verr(EX_OSERR, fmt, a);
    __builtin_unreachable();
}

/* Wrapper that casts the result back to the type of the checked expression. */
#define check_posix(arg, message, ...) \
    ((__typeof__(arg))check_posix(arg, message, ## __VA_ARGS__))

/* And same here */
/*
 * For calls that return exactly 0 on success and -1 on failure.  Any other
 * value trips an assertion; -1 exits with EX_OSERR like check_posix().
 */
__attribute__((format(printf, 2, 3)))
void check_posix_bool(intmax_t arg, const char *fmt, ...)
{
    if (arg != -1) {
        assert(arg == 0);
        return;
    }
    va_list a;
    va_start(a, fmt);
    verr(EX_OSERR, fmt, a);
    va_end(a); /* not reached */
}

/* And same here */
/*
 * Like check_posix_bool(), but reports with perror() and leaves through
 * _exit(), so functions registered with atexit() do not run.  Used in the
 * forked child.
 */
void check_posix_bool_no_atexit(intmax_t arg, const char *msg)
{
    if (arg != -1) {
        assert(arg == 0);
        return;
    }
    perror(msg);
    _exit(EX_OSERR);
}

/* Filled in by handler() when the tracked child terminates. */
static volatile siginfo_t sig_info;
/* PID returned by fork(); target for forwarded signals. */
static pid_t child_pid;

/* Interrupts a call to ppoll(), which is AS-safe */
/*
 * SIGCHLD: records the siginfo when child_pid exited, was killed or dumped
 * core.  The other handled signals are forwarded to child_pid.  Any signal
 * this handler was not written for aborts the process.
 */
static void handler(int signum, siginfo_t *info, void *data)
{
    const int saved_errno = errno; /* kill() may clobber errno */

    (void)data;
    switch (signum) {
    case SIGCHLD:
        switch (info->si_code) {
        case CLD_EXITED:
        case CLD_KILLED:
        case CLD_DUMPED:
            if (info->si_pid == child_pid) {
                sig_info = *info;
            }
            break;
        }
        break;
    case SIGTERM:
    case SIGINT:
    case SIGWINCH:
    case SIGHUP:
    case SIGUSR1:
    case SIGUSR2:
    case SIGQUIT:
        kill(child_pid, signum);
        break;
    default:
        abort();
    }
    errno = saved_errno;
}

/* too_low is 1 below the lower bound; this prevents the final negation
 * from overflowing if too_low and the result are INT_MIN */
/*
 * Parses a strictly formatted decimal integer: an optional leading '-',
 * no leading zeros (a lone "0" is accepted only when too_low is negative),
 * no surrounding whitespace and no trailing characters.  The bound check
 * against too_low is applied to the digits following the optional '-'.
 * Returns INT_MIN on any error.
 */
static int parse_int(const char *arg, int too_low)
{
    char *end = (char *)arg;

    if (*arg == '0') {
        if (arg[1] == '\0' && too_low < 0) {
            return 0;
        }
        return INT_MIN;
    }
    int negative = *arg == '-';
    if (negative) {
        arg++;
    }
    /* Reject anything strtol() would tolerate but we do not: whitespace,
     * '+', a second sign, or leading zeros. */
    if (*arg < '1' || *arg > '9') {
        return INT_MIN;
    }
    errno = 0;
    long v = strtol(arg, &end, 10);
    if (v <= too_low || v > INT_MAX || errno || *end != '\0') {
        return INT_MIN;
    }
    return negative ? -(int)v : (int)v;
}

/* Long options understood by main(); must match the "+o:h" short string. */
const struct option longopts[] = {
    { "oom-score-adj", required_argument, NULL, 'o' },
    { "help", no_argument, NULL, 'h' },
    { NULL, 0, NULL, 0 },
};

/*
 * Prints the usage line and exits with status arg.
 * NOTE(review): main() reads four positional arguments after "--"
 * (notification fd, socket path, program, argv0); confirm that the usage
 * text below describes all of them.
 */
static _Noreturn void usage(int arg)
{
    errx(arg, "Usage: notification-fd [[-o|--oom-score-adj] adjustment] -- notify-socket program argv0 args...\n");
}

#if O_RDONLY != 0 || O_WRONLY != 1 || O_RDWR != 2
# error unsupported O_* constants
#endif

/*
 * Exits unless fd is open and its access mode allows the requested
 * direction (O_RDWR always qualifies).  A closed descriptor is reported as
 * a usage error, any other fcntl() failure as an OS error.
 */
static void check_fd_usable(int fd, bool writable)
{
    int raw_flags = fcntl(fd, F_GETFL);
    if (raw_flags == -1) {
        err(errno == EBADF ? EX_USAGE : EX_OSERR, "fcntl(%d, F_GETFL)", fd);
    }
    /* The #if above guarantees the access mode lives in the low two bits. */
    int flags = raw_flags & 3;
    if (flags != O_RDWR && flags != (writable ? O_WRONLY : O_RDONLY)) {
        errx(EX_USAGE, "File descriptor %d is not %s (flags 0x%x)",
             fd, writable ? "writable" : "readable", raw_flags);
    }
}

/*
 * Exits unless fd refers to a FIFO/pipe whose mode grants no read or write
 * permission to group or other.
 */
static void check_pipe(int fd)
{
    struct stat buf;

    check_posix_bool(fstat(fd, &buf), "fstat");
    if (!S_ISFIFO(buf.st_mode)) {
        errx(EX_USAGE, "notification fd %d is not a pipe", fd);
    }
    if (buf.st_mode & 066) {
        errx(EX_USAGE, "notification fd %d is accessible by group or other", fd);
    }
}

/*
 * Walks the SOL_SOCKET control messages of msg: every descriptor received
 * through SCM_RIGHTS is closed, and the PID carried by SCM_CREDENTIALS is
 * returned.  Returns -1 when no credentials were attached.
 */
static pid_t process_cmsg(struct msghdr *const msg)
{
    pid_t sender_pid = -1;

    for (struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
        /* CMSG_LEN(0) is the offset of CMSG_DATA() within the message. */
        size_t data_len = cmsg->cmsg_len - CMSG_LEN(0);

        if (cmsg->cmsg_level != SOL_SOCKET) {
            continue;
        }
        if (cmsg->cmsg_type == SCM_RIGHTS) {
            int received_fd;
            for (size_t i = 0; data_len - i >= sizeof(received_fd); i += sizeof(received_fd)) {
                /* memcpy: the payload is not guaranteed to be int-aligned */
                memcpy(&received_fd, CMSG_DATA(cmsg) + i, sizeof(received_fd));
                (void)close(received_fd);
            }
        }
        if (cmsg->cmsg_type == SCM_CREDENTIALS) {
            struct ucred creds;
            assert(data_len >= sizeof(creds));
            memcpy(&creds, CMSG_DATA(cmsg), sizeof(creds));
            sender_pid = creds.pid;
        }
    }
    return sender_pid;
}

/* PID of this process; set to -1 to disarm kill_process_group(). */
static pid_t own_pid;

/* atexit() hook: SIGKILLs the whole process group unless disarmed. */
void kill_process_group(void)
{
    if (own_pid != -1) {
        kill(0, SIGKILL);
    }
}

/*
 * Interprets one notification datagram as newline-separated assignments.
 * The first "READY=1" writes "Ready\n" to notification_fd; later ones only
 * log.  "RELOADING=1" sets *reloading, which the next "READY=1" clears.
 * All other lines are ignored.
 */
static void handle_notification(const char *cursor, size_t size, int notification_fd,
                                bool *ready, bool *reloading)
{
    static const char ready_line[] = "Ready\n";
    static const char ready_key[] = "READY=1";
    static const char reloading_key[] = "RELOADING=1";
    const char *const end = cursor + size;

    for (;;) {
        const char *next = memchr(cursor, '\n', (size_t)(end - cursor));
        size_t message_size = (size_t)((next == NULL ? end : next) - cursor);

        if (message_size == sizeof(ready_key) - 1 &&
            memcmp(cursor, ready_key, sizeof(ready_key) - 1) == 0) {
            if (!*ready) {
                ssize_t written = check_posix(write(notification_fd, ready_line,
                                                    sizeof(ready_line) - 1), "write");
                if (written != (ssize_t)(sizeof(ready_line) - 1)) {
                    errx(EX_OSERR, "cannot notify parent of readiness");
                }
            }
            *ready = true;
            if (*reloading) {
                fprintf(stderr, "Configuration reload complete\n");
            } else {
                fprintf(stderr, "Program ready\n");
            }
            *reloading = false;
        } else if (message_size == sizeof(reloading_key) - 1 &&
                   memcmp(cursor, reloading_key, sizeof(reloading_key) - 1) == 0) {
            *reloading = true;
        }
        if (next == NULL) {
            break;
        }
        cursor = next + 1;
    }
}

/*
 * Positional arguments after "--": notification fd (>= 3, a private pipe),
 * absolute socket path, program to execute, then the program's argv.
 *
 * Binds a datagram socket at the given path, starts the program with
 * NOTIFY_SOCKET pointing at it, forwards signals to the program, and writes
 * "Ready\n" to the notification fd on the first READY=1 sent by the child.
 * Exits with the child's exit status, or dies from the same signal that
 * killed the child.
 */
int main(int argc, char **argv)
{
    own_pid = getpid();
    int oom_score_adj = INT_MIN;

    if (argc < 1) {
        errx(EX_USAGE, "argv[0] is NULL");
    }
    if (own_pid < 2) {
        errx(EX_USAGE, "cannot run as PID 1");
    }
    /* stdin must be readable; stdout and stderr must be writable */
    for (int i = 0; i < 3; ++i) {
        check_fd_usable(i, i != 0);
    }

    for (;;) {
        int longindex = -1;
        const char *lastopt = argv[optind];
        int res = getopt_long(argc, argv, "+o:h", longopts, &longindex);
        if (res == -1) {
            if (argc - optind < 4) {
                usage(EX_USAGE);
            }
            if (strcmp(lastopt, "--") != 0) {
                errx(EX_USAGE, "no -- before non-option arguments");
            }
            break;
        }
        if (res == '?') {
            usage(EX_USAGE);
        }
        /* getopt_long accepts abbreviated options.  Disable this misfeature. */
        if (lastopt[0] == '-' && lastopt[1] == '-') {
            const char *optname = lastopt + 2;
            assert(longindex >= 0 && longindex < (int)ARRAY_SIZE(longopts));
            const char *expected = longopts[longindex].name;
            if (strncmp(expected, optname, strlen(expected)) != 0) {
                char *equal = strchr(optname, '=');
                errx(EX_USAGE, "Option --%.*s must be written as --%s",
                     equal ? (int)(equal - optname) : INT_MAX,
                     optname, expected);
            }
        }
        switch (res) {
        case 'o':
            oom_score_adj = parse_int(optarg, INT_MIN);
            if (oom_score_adj < -1000 || oom_score_adj > 1000) {
                errx(EX_USAGE, "Invalid OOM score adjustment %s", optarg);
            }
            break;
        case 'h':
            usage(0);
        default:
            assert(0); /* not reached */
        }
    }

    union {
        struct sockaddr_un un;
        struct sockaddr addr;
    } a = {};
    int notification_fd = parse_int(argv[optind], 2);
    const char *const socket_path = argv[optind + 1];
    const char *const progname = argv[optind + 2];
    char **const args_to_exec = argv + optind + 3;

    if (notification_fd < 3) {
        errx(EX_USAGE, "Invalid notification descriptor %s\n", argv[optind]);
    }
    check_fd_usable(notification_fd, true);
    check_pipe(notification_fd);
    check_posix_bool(chdir("/"), "chdir(/)");
    mode_t old_mask = umask(0077);

    size_t len = strlen(socket_path);
    if (len >= sizeof(a.un.sun_path)) {
        /* TODO: use stravis() */
        errx(EX_USAGE, "Path %s is too long", socket_path);
    }
    if (socket_path[0] != '/') {
        /* TODO: use stravis() */
        errx(EX_USAGE, "Path %s is not absolute", socket_path);
    }
    if (check_posix(getpgrp(), "getpgrp()") != own_pid) {
        /* setsid() returns the new session ID on success, not 0, so it
         * must not go through check_posix_bool(). */
        (void)check_posix(setsid(), "setsid");
    }
    memcpy(a.un.sun_path, socket_path, len + 1);
    a.un.sun_family = AF_UNIX;

    int fd = check_posix(socket(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0), "socket");
    {
        int flag = 1;
        check_posix_bool(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &flag,
                                    (socklen_t)sizeof(flag)),
                         "setsockopt(SO_REUSEADDR)");
        check_posix_bool(setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &flag,
                                    (socklen_t)sizeof(flag)),
                         "setsockopt(SO_PASSCRED)");
    }
    /* Bind, removing a stale socket file and retrying on EADDRINUSE. */
    for (;;) {
        int status;
        do {
            status = bind(fd, &a.addr,
                          (socklen_t)(len + 1 + offsetof(struct sockaddr_un, sun_path)));
        } while (status == -1 && errno == EINTR);
        if (!(status == -1 && errno == EADDRINUSE)) {
            check_posix_bool(status, "bind(%s)", socket_path);
            break;
        }
        check_posix_bool(unlink(socket_path), "unlink(%s)", socket_path);
    }
    umask(old_mask);

    /* TODO: support commands */
    sigset_t sigset;
    sigset_t orig_mask; /* restored in the child before execve() */
    sigemptyset(&sigset);
    const int sigs[] = {
        SIGCHLD, SIGQUIT, SIGTERM, SIGINT, SIGWINCH, SIGHUP, SIGUSR1, SIGUSR2
    };
    for (size_t i = 0; i < ARRAY_SIZE(sigs); ++i) {
        check_posix_bool(sigaddset(&sigset, sigs[i]), "sigaddset(%d)", sigs[i]);
    }
    /* Keep the signals blocked except while waiting in ppoll(). */
    check_posix_bool(sigprocmask(SIG_BLOCK, &sigset, &orig_mask), "sigprocmask");

    struct sigaction act = { };
    /* systemd ignores SIGPIPE, so emulate this */
    act.sa_handler = SIG_IGN;
    check_posix_bool(sigaction(SIGPIPE, &act, NULL), "sigaction(SIGPIPE)");
    /* add handlers */
    act.sa_sigaction = handler;
    act.sa_mask = sigset; /* these are already blocked but that is harmless */
    act.sa_flags = SA_SIGINFO;
    for (size_t i = 0; i < ARRAY_SIZE(sigs); ++i) {
        check_posix_bool(sigaction(sigs[i], &act, NULL), "sigaction(%d)", sigs[i]);
    }

    static_assert(EOF == -1, "bad EOF definition");
    /* Flush stdio so buffered output is not duplicated by fork(). */
    check_posix_bool(fflush(NULL), "fflush");

    if (oom_score_adj != INT_MIN) {
        char *text;
        int oom_fd = check_posix(open("/proc/self/oom_score_adj",
                                      O_WRONLY | O_CLOEXEC | O_NOCTTY | O_NOFOLLOW),
                                 "open(\"/proc/self/oom_score_adj\")");
        int to_write = check_posix(asprintf(&text, "%d\n", oom_score_adj), "asprintf");
        ssize_t written = check_posix(write(oom_fd, text, (size_t)to_write),
                                      "write(\"/proc/self/oom_score_adj\")");
        assert(written == to_write);
        (void)written; /* only inspected by the assertion */
        free(text);
        (void)close(oom_fd);
    }

    pid_t pid = fork();
    if (pid < 0) {
        assert(pid == -1);
        err(EX_OSERR, "fork()");
    }
    child_pid = pid;
    if (atexit(kill_process_group)) {
        errx(EX_OSERR, "atexit()");
    }
    if (pid == 0) {
        /* The signal mask survives execve().  Put the default dispositions
         * back first so handler() cannot run in the child, then restore the
         * mask we started with; otherwise the program would never receive
         * the signals forwarded to it. */
        struct sigaction dfl = { };
        dfl.sa_handler = SIG_DFL;
        sigemptyset(&dfl.sa_mask);
        for (size_t i = 0; i < ARRAY_SIZE(sigs); ++i) {
            check_posix_bool_no_atexit(sigaction(sigs[i], &dfl, NULL), "sigaction");
        }
        check_posix_bool_no_atexit(sigprocmask(SIG_SETMASK, &orig_mask, NULL),
                                   "sigprocmask");
        /* Drop every descriptor above stderr, including the socket and the
         * notification pipe. */
        check_posix_bool_no_atexit(syscall(SYS_close_range, 3L, ~0UL, 0L),
                                   "close_range()");
        check_posix_bool_no_atexit(setenv("NOTIFY_SOCKET", a.un.sun_path, 1), "setenv");
        check_posix_bool_no_atexit(execve(progname, args_to_exec, environ), "execve");
        abort(); /* not reached */
    }

    char buf[sizeof("RELOADING=1\n") - 1];
    struct iovec v[1] = {
        { .iov_base = buf, .iov_len = sizeof(buf), },
    };
    union {
        struct cmsghdr hdr;
        char buf[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int) * 253)];
    } cmsg_buffer;
    struct msghdr msg = {
        .msg_name = NULL,
        .msg_namelen = 0,
        .msg_iov = v,
        .msg_iovlen = ARRAY_SIZE(v),
        .msg_control = cmsg_buffer.buf,
        .msg_controllen = sizeof(cmsg_buffer.buf),
        .msg_flags = 0,
    };
    /* From here on sigset is the (empty) mask installed while in ppoll(). */
    sigemptyset(&sigset);
    struct pollfd p[] = {
        { .fd = fd, .events = POLLIN | POLLPRI | POLLRDHUP, .revents = 0, },
    };
    bool ready = false;
    bool reloading = false;

    for (;;) {
        /* Main event loop */
        if (sig_info.si_pid) {
            int status;
            int r = waitpid(sig_info.si_pid, &status, 0);
            if (r == -1) {
                err(EX_OSERR, "waitpid(%jd)", (intmax_t)sig_info.si_pid);
            }
            if (WIFSIGNALED(status)) {
                /* Die from the same signal.  It may be one of the signals
                 * blocked above, in which case it would merely stay pending
                 * and the loop below would spin forever, so unblock it. */
                const int sig = WTERMSIG(status);
                sigset_t unblock;
                sigemptyset(&unblock);
                (void)sigaddset(&unblock, sig);
                (void)signal(sig, SIG_DFL);
                (void)sigprocmask(SIG_UNBLOCK, &unblock, NULL);
                for (;;) {
                    (void)signal(sig, SIG_DFL);
                    (void)kill(getpid(), sig);
                }
            } else if (WIFEXITED(status)) {
                own_pid = -1; /* disarm kill_process_group() */
                return WEXITSTATUS(status);
            } else {
                abort(); /* cannot happen */
            }
        }

        int r = ppoll(p, ARRAY_SIZE(p), NULL, &sigset);
        if (r < -1 || r > (int)ARRAY_SIZE(p)) {
            abort();
        }
        if (r == -1) {
            if (errno == ENOMEM) {
                fprintf(stderr, "Kernel out of memory in ppoll()\n");
                continue; /* todo: use epoll(7) instead */
            }
            if (errno == EINTR) {
                fprintf(stderr, "ppoll() interrupted by signal\n");
                continue;
            }
            err(EX_OSERR, "poll");
        }
        fprintf(stderr, "Returned from poll()\n");
        if (!p[0].revents) {
            continue;
        }

        /* msg_controllen is value-result: restore the full capacity before
         * every recvmsg() or the usable control buffer keeps shrinking. */
        msg.msg_controllen = sizeof(cmsg_buffer.buf);
        ssize_t data = recvmsg(fd, &msg,
                               MSG_CMSG_CLOEXEC | MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK);
        if (data == -1) {
            if (errno == EINTR) {
                fprintf(stderr, "recvmsg() interrupted by signal\n");
                continue; /* signal caught */
            }
            if (errno == EAGAIN || errno == EWOULDBLOCK) {
                fprintf(stderr, "ppoll() spurious wakeup\n");
                continue; /* spurious wakeup */
            }
        }
        size_t size = (size_t)check_posix(data, "recvmsg");
        /* Close any descriptors the peek attached. */
        (void)process_cmsg(&msg);
        if (msg.msg_flags & MSG_TRUNC) {
            /* Datagram is larger than the buffer: grow it before the real
             * read.  On allocation failure the old buffer is kept. */
            char *b = (v[0].iov_base == buf) ? malloc(size)
                                             : realloc(v[0].iov_base, size);
            if (b != NULL) {
                v[0].iov_base = b;
                v[0].iov_len = size;
            }
        }

        msg.msg_controllen = sizeof(cmsg_buffer.buf);
        size = (size_t)check_posix(recvmsg(fd, &msg,
                                           MSG_CMSG_CLOEXEC | MSG_DONTWAIT | MSG_TRUNC),
                                   "recvmsg");
        /* With MSG_TRUNC the return value is the real datagram length,
         * which can exceed what was actually stored in the buffer. */
        if (size > v[0].iov_len) {
            size = v[0].iov_len;
        }
        pid_t sender_pid = process_cmsg(&msg);
        if (sender_pid != child_pid) {
            fprintf(stderr, "%jd cannot notify\n", (intmax_t)sender_pid);
            continue; /* cannot notify */
        }
        handle_notification(v[0].iov_base, size, notification_fd, &ready, &reloading);
    }
}