//
// Syd: rock-solid application kernel
// src/workers/emu.rs: `syd_emu' emulator threads
//
// Copyright (c) 2024, 2025 Ali Polatel <alip@chesswob.org>
// Based in part upon rusty_pool which is:
//     Copyright (c) Robin Friedli <robinfriedli@icloud.com>
//     SPDX-License-Identifier: Apache-2.0
//
// SPDX-License-Identifier: GPL-3.0

use std::{
    env,
    mem::MaybeUninit,
    option::Option,
    os::fd::{AsRawFd, RawFd},
    sync::{
        atomic::{AtomicBool, Ordering},
        Arc, RwLock,
    },
    thread,
    thread::JoinHandle,
};

use libseccomp::{scmp_cmp, RawSyscall, ScmpAction, ScmpFilterContext, ScmpSyscall};
use libseccomp_sys::seccomp_notify_receive;
use nix::{
    errno::Errno,
    poll::PollTimeout,
    sched::{sched_setaffinity, unshare, CloneFlags, CpuSet},
    sys::epoll::{Epoll, EpollEvent, EpollFlags},
    unistd::{close, Pid},
};

#[cfg(target_arch = "x86")]
use crate::cookie::FTRUNCATE64_COOKIE_ARG3;
#[cfg(target_arch = "x86")]
use crate::cookie::TRUNCATE64_COOKIE_ARG3;
use crate::{
    compat::epoll_ctl_safe,
    config::*,
    cookie::{
        FTRUNCATE64_COOKIE_ARG4, FTRUNCATE64_COOKIE_ARG5, FTRUNCATE_COOKIE_ARG2,
        FTRUNCATE_COOKIE_ARG3, FTRUNCATE_COOKIE_ARG4, FTRUNCATE_COOKIE_ARG5,
        MEMFD_CREATE_COOKIE_ARG2, MEMFD_CREATE_COOKIE_ARG3, MEMFD_CREATE_COOKIE_ARG4,
        MEMFD_CREATE_COOKIE_ARG5, OPENAT2_COOKIE_ARG4, OPENAT2_COOKIE_ARG5, RENAMEAT2_COOKIE_ARG5,
        SECCOMP_IOCTL_NOTIF_ADDFD_COOKIE_ARG3, SECCOMP_IOCTL_NOTIF_ADDFD_COOKIE_ARG4,
        SECCOMP_IOCTL_NOTIF_ADDFD_COOKIE_ARG5, SECCOMP_IOCTL_NOTIF_SEND_COOKIE_ARG3,
        SECCOMP_IOCTL_NOTIF_SEND_COOKIE_ARG4, SECCOMP_IOCTL_NOTIF_SEND_COOKIE_ARG5,
        SOCKET_COOKIE_ARG3, SOCKET_COOKIE_ARG4, SOCKET_COOKIE_ARG5, SYS_SOCKET,
        TRUNCATE64_COOKIE_ARG4, TRUNCATE64_COOKIE_ARG5, TRUNCATE_COOKIE_ARG2, TRUNCATE_COOKIE_ARG3,
        TRUNCATE_COOKIE_ARG4, TRUNCATE_COOKIE_ARG5, UNLINKAT_COOKIE_ARG3, UNLINKAT_COOKIE_ARG4,
        UNLINKAT_COOKIE_ARG5,
    },
    err::{err2no, SydResult},
    error, extend_ioctl,
    fs::{
        seccomp_notify_respond, AT_EXECVE_CHECK, SECCOMP_IOCTL_NOTIF_ADDFD,
        SECCOMP_IOCTL_NOTIF_LIST, SECCOMP_IOCTL_NOTIF_SEND,
    },
    hook::{HandlerMap, UNotifyEventRequest},
    info,
    path::dotdot_with_nul,
    proc::proc_mmap,
    sandbox::{Flags, Sandbox, SandboxGuard},
    scmp_arch_raw,
    workers::{aes::AesMap, BindMap, WorkerCache, WorkerData},
    ExportMode, ScmpNotifReq, SydArch, Sydcall,
};

// Negated `libc::EOWNERDEAD`, used as a pseudo errno in handler responses.
// When a response's `error` field equals this value, `Worker::handle` logs
// the switch to ghost mode, sends the response, then closes the seccomp
// notify fd and tells the monitor thread to exit.
const EOWNERDEAD: i32 = -libc::EOWNERDEAD;

/// State carried by a single `syd_emu` emulator thread.
///
/// The struct is `Clone`; the `Arc` fields are handles to state that is
/// shared with whoever constructed the worker.
#[derive(Clone)]
pub(crate) struct Worker {
    /// Seccomp notify fd: requests are received from it and responses are
    /// sent to it.
    fd: RawFd,
    /// Pid returned by `get_child_pid()` on the sandbox at construction.
    /// `poll` compares retired pids against it to decide when to exit.
    child: Pid,
    /// Copy of the sandbox flags taken at construction.
    flags: Flags,
    /// Epoll instance the worker waits on in `poll`.
    epoll: Arc<Epoll>,
    /// Cache handed to every `UNotifyEventRequest` built in `handle`.
    cache: Arc<WorkerCache<'static>>,
    /// Sandbox handed to every `UNotifyEventRequest` built in `handle`.
    sandbox: Arc<RwLock<Sandbox>>,
    /// Lookup table from `Sydcall` to the handler invoked for that call.
    handlers: Arc<HandlerMap>,
    /// Optional epoll wait timeout, converted with `PollTimeout::from`
    /// (presumably milliseconds -- confirm against nix `PollTimeout`).
    /// With `None` the wait has no timeout; with `Some`, a wait that
    /// returns no events makes `poll` fail with `ETIMEDOUT`.
    keep_alive: Option<u16>,
    /// Set to `true` to inform the monitor thread to exit; also checked by
    /// `poll` before every wait.
    should_exit: Arc<AtomicBool>,
    /// Worker counters (total and busy) updated by the thread loop.
    worker_data: Arc<WorkerData>,
    /// Handed to every `UNotifyEventRequest` built in `handle`.
    bind_map: BindMap,
    /// Handed to every `UNotifyEventRequest` built in `handle`, when present.
    crypt_map: Option<AesMap>,
}

impl Worker {
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        fd: RawFd,
        epoll: Arc<Epoll>,
        cache: Arc<WorkerCache<'static>>,
        sandbox: Arc<RwLock<Sandbox>>,
        handlers: Arc<HandlerMap>,
        keep_alive: Option<u16>,
        should_exit: Arc<AtomicBool>,
        worker_data: Arc<WorkerData>,
        bind_map: BindMap,
        crypt_map: Option<AesMap>,
    ) -> Self {
        let my_sandbox = SandboxGuard::Read(sandbox.read().unwrap_or_else(|err| err.into_inner()));
        let child = my_sandbox.get_child_pid();
        let flags = *my_sandbox.flags;
        drop(my_sandbox); // release the read lock.

        Worker {
            fd,
            child,
            flags,
            epoll,
            cache,
            sandbox,
            handlers,
            keep_alive,
            should_exit,
            worker_data,
            bind_map,
            crypt_map,
        }
    }

    /// Spawn the emulator thread for this worker and return its join handle.
    ///
    /// The thread is named `syd_emu` and uses a stack of `EMU_STACK_SIZE`.
    /// After start-up it loops: wait for readiness with `poll`, fetch the
    /// seccomp notification with `receive`, and dispatch it with `handle`.
    /// The loop ends, after decrementing the worker total, as soon as `poll`
    /// or `receive` returns an error.
    ///
    /// `confined` tells whether the thread should be treated as already
    /// confined. When it is `false`, the thread builds its seccomp filter
    /// with `prepare_confine` at the top of the first loop iteration in
    /// which `Sandbox::locked_once()` is true, and loads it unless running
    /// in dry-run mode (`ENV_SKIP_SCMP` set, or an export mode present in
    /// the environment).
    ///
    /// # Errors
    ///
    /// Returns the thread-spawn error converted with `err2no`.
    ///
    /// # Panics
    ///
    /// The spawned thread panics if `unshare(CLONE_FS)` fails, if the
    /// seccomp filter cannot be prepared, or if it cannot be loaded.
    #[allow(clippy::cognitive_complexity)]
    pub(crate) fn try_spawn(self, mut confined: bool) -> Result<JoinHandle<()>, Errno> {
        thread::Builder::new()
            .name("syd_emu".to_string())
            .stack_size(EMU_STACK_SIZE)
            .spawn(move || {
                // Unshare CLONE_FS so cwd and umask are per-thread.
                //
                // SAFETY: We unwrap here and crash the whole process,
                // if this fails as this unsharing is a hard dependency.
                #[allow(clippy::disallowed_methods)]
                unshare(CloneFlags::CLONE_FS).expect("unshare(CLONE_FS)");

                // Create sentinel, that will handle graceful teardown.
                let mut sentinel = Sentinel::new(&self);

                // Dry-run: the filter is still prepared below, but never loaded,
                // and the default panic hook is left in place.
                let dry_run = env::var_os(ENV_SKIP_SCMP).is_some() || ExportMode::from_env().is_some();
                if !dry_run {
                    // Set a logging panic hook. The default panic
                    // hook calls system calls not permitted by emulators
                    // such as getcwd(2), stat(2) etc.
                    std::panic::set_hook(Box::new(|info| {
                        // Panic payloads are either `&'static str` or `String`;
                        // anything else is logged as "?".
                        let err = match info.payload().downcast_ref::<&'static str>() {
                            Some(s) => *s,
                            None => match info.payload().downcast_ref::<String>() {
                                Some(s) => &**s,
                                None => "?",
                            },
                        };
                        let file = info.location().map(|l| l.file());
                        let line = info.location().map(|l| l.line());
                        error!("ctx": "panic", "msg": err, "file": file, "line": line);
                    }));
                }

                // Thread successfully started, increment total worker count.
                let worker_count = self.worker_data.increment_worker_total();

                // Attempt to set thread's CPU affinity mask.
                // We pin the main, init and monitor threads to CPU:0.
                // Emulator threads are pinned based on num-cpus.
                #[allow(clippy::arithmetic_side_effects)]
                let cpu_id = worker_count % *NPROC;
                let mut cpu_set = CpuSet::new();
                // Pinning is best-effort: failures are logged and otherwise ignored.
                if cpu_set.set(cpu_id).is_ok() {
                    match sched_setaffinity(Pid::from_raw(0), &cpu_set) {
                        Ok(_) => {
                            info!("ctx": "boot", "op": "pin_emulator_thread",
                                "msg": format!("pinned emulator thread to CPU:{cpu_id}"),
                                "cpu": cpu_id);
                        }
                        Err(errno) => {
                            error!("ctx": "boot", "op": "pin_emulator_thread",
                                "msg": format!("failed to pin emulator thread to CPU:{cpu_id}: {errno}"),
                                "err": errno as i32,
                                "cpu": cpu_id);
                        }
                    }
                }

                loop {
                    // SAFETY: Confine if/once locked.
                    if !confined && Sandbox::locked_once() {
                        // SAFETY: Panic if we cannot prepare the seccomp filter.
                        #[allow(clippy::disallowed_methods)]
                        let ctx = Self::prepare_confine(
                            self.fd,
                            self.epoll.0.as_raw_fd(),
                            self.flags,
                        ).expect("prepare emulator thread confinement");

                        // Honour dry-run when exporting.
                        if !dry_run {
                            let safe_setid = self.flags.intersects(Flags::FL_ALLOW_SAFE_SETUID | Flags::FL_ALLOW_SAFE_SETGID);
                            info!("ctx": "confine", "op": "confine_emulator_thread",
                                "msg": format!("emulator thread confined with{} SROP mitigation",
                                    if safe_setid { "out" } else { "" }));

                            // SAFETY: Panic if we cannot confine the thread.
                            #[allow(clippy::disallowed_methods)]
                            ctx.load().expect("confine emulator thread");
                        } else {
                            error!("ctx": "confine", "op": "confine_emulator_thread",
                                "msg": "emulator thread is running unconfined in debug mode");
                        }

                        // Set in both branches so confinement is attempted only once.
                        confined = true;
                    }

                    // Wait for the request to become ready as necessary.
                    // epoll_wait(2) will timeout and exit for non-core threads.
                    if self.poll().is_err() {
                        // Timeout or critical error.
                        // Decrement worker total and exit.
                        self.worker_data.decrement_worker_total();
                        break;
                    }

                    // Receive seccomp notification.
                    let request = if let Ok(request) = self.receive() {
                        request
                    } else {
                        // Critical error, decrement worker total and exit.
                        self.worker_data.decrement_worker_total();
                        break;
                    };

                    if let Some(request) = request {
                        // Mark thread busy.
                        sentinel.seccomp_id = Some(request.id);
                        self.worker_data.increment_worker_busy();

                        // Handle request.
                        self.handle(request);

                        // Mark thread idle again.
                        sentinel.seccomp_id = None;
                        self.worker_data.decrement_worker_busy();
                    } // else the process died mid-way, continue.
                }
            }).map_err(|err| err2no(&err))
    }

    /// Fetch the next seccomp notification, classifying read failures.
    ///
    /// Returns `Ok(Some(request))` on success and `Ok(None)` for failures
    /// that cannot be acted upon (e.g. EINTR|ENOENT: the task was killed
    /// mid-way). Only `EBADF`, meaning the file descriptor was closed, is
    /// propagated as an error.
    fn receive(&self) -> Result<Option<ScmpNotifReq>, Errno> {
        self.read().map(Some).or_else(|errno| match errno {
            // The seccomp fd is gone: the caller must stop.
            Errno::EBADF => Err(errno),
            // Everything else is ignored.
            _ => Ok(None),
        })
    }

    /// Read one seccomp notification from the notify fd.
    ///
    /// Fails with the errno reported by `seccomp_notify_receive`, with the
    /// error returned by `ScmpNotifReq::from_sys`, or with `EINTR` when the
    /// notification carries a zero id or a zero pid.
    fn read(&self) -> Result<ScmpNotifReq, Errno> {
        // A stack-allocated libc::seccomp_notif is used instead of
        // libseccomp_sys's opaque type, which would require a heap
        // allocation that we don't always want.
        let mut raw = MaybeUninit::<libc::seccomp_notif>::zeroed();

        // SAFETY: `raw` is a live, zero-initialised buffer owned by this
        // frame; its pointer is cast to the type the binding expects.
        // Note: EINTR means child killed by signal!
        let ret = unsafe { seccomp_notify_receive(self.fd, raw.as_mut_ptr().cast()) };
        Errno::result(ret)?;

        // SAFETY: seccomp_notify_receive returned success, so the
        // notification has been populated and may be read.
        let raw = unsafe { raw.assume_init() };
        let notif = ScmpNotifReq::from_sys(raw)?;

        // A zero id or pid means we were interrupted, or the task was
        // killed mid-way.
        if notif.id == 0 || notif.pid == 0 {
            return Err(Errno::EINTR);
        }

        Ok(notif)
    }

    #[allow(clippy::cognitive_complexity)]
    fn handle(&self, req: ScmpNotifReq) {
        // Lookup the system call handler, panic if not found.
        let syscall = Sydcall(req.data.syscall, scmp_arch_raw(req.data.arch));
        let handler = if let Some(handler) = self.handlers.get(&syscall) {
            handler
        } else {
            unreachable!("BUG: Missing hook for request {req:?}!");
        };

        let request = UNotifyEventRequest::new(
            req,
            syscall,
            self.fd,
            Arc::clone(&self.cache),
            Arc::clone(&self.sandbox),
            Arc::clone(&self.bind_map),
            self.crypt_map.as_ref().map(Arc::clone),
        );
        let response = handler(request);

        if response.id == 0 && response.val == 0 && response.error == 0 && response.flags == 0 {
            // All-zero dummy empty seccomp response.
            // A previous ADDFD request has already replied to the request.
            // Nothing left to do here.
            return;
        } else if response.error == EOWNERDEAD {
            // EOWNERDEAD is a pseudo errno used by
            // the stat handler thread to close the
            // seccomp notify fd upon receiving the
            // "ghost" command.
            crate::warn!("ctx": "confine", "op": "enter_ghost_mode", "pid": req.pid,
                "sys": syscall, "arch": SydArch(req.data.arch), "args": req.data.args,
                "src": proc_mmap(req.pid(), req.data.instr_pointer).ok());
        }

        let response = libc::seccomp_notif_resp {
            id: response.id,
            val: response.val,
            error: response.error,
            flags: response.flags,
        };

        // EAGAIN|EINTR is retried.
        // ENOENT means child died mid-way.
        // Nothing else we can do on errors here.
        let _ = seccomp_notify_respond(self.fd, std::ptr::addr_of!(response));

        // See above.
        if response.error == EOWNERDEAD {
            // Note, threads blocked on epoll_wait will not
            // wake up even if we close the epoll fd or
            // delete the seccomp fd from epoll wait-list here.
            // That said, they'll never ever wake up again,
            // and therefore will not consume system resources.
            let _ = epoll_ctl_safe(&self.epoll.0, self.fd, None);
            let _ = close(self.fd);

            // Inform the monitor thread to exit.
            self.should_exit.store(true, Ordering::Relaxed);
        }
    }

    /// Block until a seccomp request is ready to be received.
    ///
    /// Waits on the worker's epoll instance for one event at a time. An
    /// event whose data is `0` belongs to the seccomp fd; any other event
    /// carries the pid of a retired process in its data, whose PidFd is
    /// dropped from `PIDFD_MAP` before waiting again.
    ///
    /// Returns `Ok(())` once the seccomp fd reports `EPOLLIN`.
    ///
    /// # Errors
    ///
    /// * `ESRCH` if the exit flag is already set, or if the retired pid is
    ///   the tracked child and `FL_EXIT_WAIT_ALL` is not set. In the latter
    ///   case the seccomp fd is also removed from epoll and closed, and the
    ///   exit flag is set.
    /// * `ETIMEDOUT` if `keep_alive` is set and the wait returned no events.
    /// * `EPIPE` if the seccomp fd reports an error or hang-up; the exit
    ///   flag is set as well.
    /// * Any other errno returned by the epoll wait, except `EINTR`, which
    ///   is retried.
    ///
    /// # Panics
    ///
    /// Panics if the seccomp fd event carries none of the expected flags,
    /// or if the wait reports more events than the buffer can hold.
    fn poll(&self) -> Result<(), Errno> {
        let timeout = if let Some(keep_alive) = self.keep_alive {
            PollTimeout::from(keep_alive)
        } else {
            PollTimeout::NONE
        };

        // Wait for an event and handle EINTR.
        // Retire threads which have exited along the way.
        let mut events = [EpollEvent::empty(); 1];
        loop {
            if self.should_exit.load(Ordering::Relaxed) {
                // Exit notified, do not try to wait on epoll again.
                return Err(Errno::ESRCH);
            }

            match self.epoll.wait(&mut events, timeout) {
                Ok(0) if self.keep_alive.is_some() => return Err(Errno::ETIMEDOUT),
                Ok(0) | Err(Errno::EINTR) => {} // try again.
                Ok(1) if events[0].data() == 0 => {
                    // Seccomp request.
                    let flags = events[0].events();

                    return if flags.contains(EpollFlags::EPOLLIN) {
                        // Received scmp request.
                        Ok(())
                    } else if flags.intersects(
                        EpollFlags::EPOLLERR | EpollFlags::EPOLLHUP | EpollFlags::EPOLLRDHUP,
                    ) {
                        // Exit on any hang-up or error.
                        //
                        // Inform the monitor thread to exit.
                        self.should_exit.store(true, Ordering::Relaxed);

                        Err(Errno::EPIPE)
                    } else {
                        unreachable!(
                            "BUG: seccomp request event with invalid flags {:#x}!",
                            flags.bits()
                        );
                    };
                }
                Ok(1) => {
                    // Record retired PID and try again.
                    // These fds are added with EPOLLONESHOT.
                    #[allow(clippy::cast_possible_truncation)]
                    let pid = Pid::from_raw(events[0].data() as libc::pid_t);

                    // Retire the PidFd.
                    if let Some(map) = PIDFD_MAP.get() {
                        map.del_pidfd(pid);
                    }

                    if !self.flags.contains(Flags::FL_EXIT_WAIT_ALL) && pid == self.child {
                        // Note, threads blocked on epoll_wait will not
                        // wake up even if we close the epoll fd or
                        // delete the seccomp fd from epoll wait-list here.
                        // That said, they'll never ever wake up again,
                        // and therefore will not consume system resources.
                        let _ = epoll_ctl_safe(&self.epoll.0, self.fd, None);
                        let _ = close(self.fd);

                        // Inform the monitor thread to exit.
                        self.should_exit.store(true, Ordering::Relaxed);

                        // Exiting with the eldest process.
                        return Err(Errno::ESRCH);
                    }

                    // If we're waiting for all processes, let's just try again.
                }
                // The event buffer holds a single entry, so any other count is a bug.
                // The placeholder is `{n}`: a `$` in front of it is printed literally.
                Ok(n) => unreachable!("BUG: epoll_wait returned {n} unexpectedly!"),
                Err(errno) => return Err(errno),
            };
        }
    }

    /// Confine Worker thread.
    #[allow(clippy::cognitive_complexity)]
    pub(crate) fn prepare_confine(
        seccomp_fd: RawFd,
        epoll_fd: RawFd,
        flags: Flags,
    ) -> SydResult<ScmpFilterContext> {
        // Create seccomp filter with default action.
        let mut ctx = ScmpFilterContext::new(ScmpAction::KillProcess)?;

        // Enforce the NO_NEW_PRIVS functionality before
        // loading the seccomp filter into the kernel.
        ctx.set_ctl_nnp(true)?;

        // Disable Speculative Store Bypass mitigations
        // with trace/allow_unsafe_spec_exec:1
        ctx.set_ctl_ssb(flags.contains(Flags::FL_ALLOW_UNSAFE_SPEC_EXEC))?;

        // DO NOT synchronize filter to all threads.
        // Main thread will confine itself.
        ctx.set_ctl_tsync(false)?;

        // We kill for bad system call and bad arch.
        ctx.set_act_badarch(ScmpAction::KillProcess)?;

        // Use a binary tree sorted by syscall number if possible.
        let _ = ctx.set_ctl_optimize(2);

        // SAFETY: Do NOT add supported architectures to the filter.
        // This ensures Syd can never run a non-native system call,
        // which we do not need at all.
        // seccomp_add_architectures(&mut ctx)?;

        // Deny open and {l,}stat with ENOSYS rather than KillProcess.
        // We need this because std::thread::spawn has unwanted
        // side-effects such as opening /sys/devices/system/cpu/online
        // on some architectures.
        for sysname in ["open", "stat", "lstat"] {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Errno(Errno::ENOSYS as i32), syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow a restricted set of ioctl(2) operations to our seccomp fd only.
        //
        // Syscall argument cookies for SECCOMP_IOCTL_NOTIF_SEND may be disabled
        // at startup with trace/allow_unsafe_nocookie:1.
        let sysname = "ioctl";
        let restrict_cookie = !flags.contains(Flags::FL_ALLOW_UNSAFE_NOCOOKIE);
        #[allow(clippy::cast_sign_loss)]
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                for ioctl_request in SECCOMP_IOCTL_NOTIF_LIST {
                    let mut rules = vec![scmp_cmp!($arg0 == seccomp_fd as u64)];

                    // SAFETY: We protect SECCOMP_IOCTL_NOTIF_ADDFD with system call argument
                    // cookies, to raise the bar against an attacker who has compromised Syd
                    // and aims to steal file descriptors.
                    //
                    // Randomizing the seccomp-fd at startup is another mitigation against this.
                    #[allow(clippy::useless_conversion)]
                    if restrict_cookie && *ioctl_request == SECCOMP_IOCTL_NOTIF_ADDFD {
                        rules.extend(&[
                            scmp_cmp!($arg3 == (*SECCOMP_IOCTL_NOTIF_ADDFD_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*SECCOMP_IOCTL_NOTIF_ADDFD_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*SECCOMP_IOCTL_NOTIF_ADDFD_COOKIE_ARG5).into()),
                        ]);
                    }

                    // SAFETY: We protect SECCOMP_IOCTL_NOTIF_SEND with system call argument
                    // cookies, to raise the bar against an attacker who has compromised Syd
                    // and aims to inject the flag SECCOMP_USER_NOTIF_FLAG_CONTINUE to this
                    // response in order to pass-through a system call to the host Linux kernel.
                    //
                    // Randomizing the seccomp-fd at startup is another mitigation against this.
                    #[allow(clippy::useless_conversion)]
                    if restrict_cookie && *ioctl_request == SECCOMP_IOCTL_NOTIF_SEND {
                        rules.extend(&[
                            scmp_cmp!($arg3 == (*SECCOMP_IOCTL_NOTIF_SEND_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*SECCOMP_IOCTL_NOTIF_SEND_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*SECCOMP_IOCTL_NOTIF_SEND_COOKIE_ARG5).into()),
                        ]);
                    }

                    rules.push(scmp_cmp!($arg1 == *ioctl_request));
                    ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;

                    if let Some(ioctl_request) = extend_ioctl(*ioctl_request) {
                        rules.pop();
                        rules.push(scmp_cmp!($arg1 == ioctl_request));
                        ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
                    }
                }
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_emu_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // Allow epoll(7) API to our single epoll fd only.
        #[allow(clippy::cast_sign_loss)]
        for sysname in EPOLL_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[scmp_cmp!($arg0 == epoll_fd as u64)],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow unshare(2) with CLONE_FS only.
        let sysname = "unshare";
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                ctx.add_rule_conditional(
                    ScmpAction::Allow,
                    syscall,
                    &[scmp_cmp!($arg0 == libc::CLONE_FS as u64)],
                )?;
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_emu_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // SAFETY: Allow kcmp(2) with KCMP_VM or KCMP_FILE only,
        // see users of `syd::fs::is_same_vm` and `syd::proc::proc_rand_fd`.
        let sysname = "kcmp";
        const KCMP_FILE: u64 = 0;
        const KCMP_VM: u64 = 1;
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                ctx.add_rule_conditional(
                    ScmpAction::Allow,
                    syscall,
                    &[scmp_cmp!($arg2 == KCMP_FILE)],
                )?;
                ctx.add_rule_conditional(
                    ScmpAction::Allow,
                    syscall,
                    &[scmp_cmp!($arg2 == KCMP_VM)],
                )?;
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_emu_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // SAFETY: Allow write(2) _only_ if trace/memory_access allows
        // proc_pid_mem(5) access in which case we cannot avoid it.
        // The default is to use cross memory attach with
        // process_vm_{read,write}v(2) unconditionally since
        // version 3.32.6 unless relaxed.
        let sysname = "write";
        if Sandbox::memory_access() == 2 {
            // Allow write(2) access to syd::log::LOG_FD only.
            // If logging is disabled, deny write(2) completely.
            // Note, we do allow the use-case where LOG_FD is
            // intentionally set to a negative value to disable
            // logging, hence we cast directly to an u64 and not
            // use u64::try_from!
            let log_fd = crate::log::LOG_FD.load(std::sync::atomic::Ordering::Relaxed);
            #[allow(clippy::cast_sign_loss)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[scmp_cmp!($arg0 == log_fd as u64)],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        } else {
            // Allow write(2) globally for proc_pid_mem(5) access.
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow safe fcntl(2) utility calls.
        for sysname in ["fcntl", "fcntl64"] {
            let syscall = match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => syscall,
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                    continue;
                }
            };

            for op in EMU_FCNTL_OPS {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &[scmp_cmp!($arg1 == *op)])?;
            }
        }

        // Allow safe system calls.
        for sysname in EMU_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Syscall argument cookies may be disabled
        // at startup with trace/allow_unsafe_nocookie:1.
        if restrict_cookie {
            // memfd_create(2) may be used only with syscall argument cookies.
            let sysname = "memfd_create";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            scmp_cmp!($arg2 == (*MEMFD_CREATE_COOKIE_ARG2).into()),
                            scmp_cmp!($arg3 == (*MEMFD_CREATE_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*MEMFD_CREATE_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*MEMFD_CREATE_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            // renameat2(2) may be used only with syscall argument cookies.
            // We also prevent AT_FDCWD usage as fd argument.
            let sysname = "renameat2";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            scmp_cmp!($arg0 <= RawFd::MAX as u64),
                            scmp_cmp!($arg5 == (*RENAMEAT2_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            // {,f}truncate{,64}(2) may be used only with syscall argument cookies.
            let sysname = "truncate";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            scmp_cmp!($arg2 == (*TRUNCATE_COOKIE_ARG2).into()),
                            scmp_cmp!($arg3 == (*TRUNCATE_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*TRUNCATE_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*TRUNCATE_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            let sysname = "truncate64";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            #[cfg(target_arch = "x86")]
                            scmp_cmp!($arg3 == (*TRUNCATE64_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*TRUNCATE64_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*TRUNCATE64_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            let sysname = "ftruncate";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            scmp_cmp!($arg2 == (*FTRUNCATE_COOKIE_ARG2).into()),
                            scmp_cmp!($arg3 == (*FTRUNCATE_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*FTRUNCATE_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*FTRUNCATE_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            let sysname = "ftruncate64";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            #[cfg(target_arch = "x86")]
                            scmp_cmp!($arg3 == (*FTRUNCATE64_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*FTRUNCATE64_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*FTRUNCATE64_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            // unlinkat(2) may be used only with syscall argument cookies.
            // We also prevent AT_FDCWD usage as fd argument.
            let sysname = "unlinkat";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            scmp_cmp!($arg0 <= RawFd::MAX as u64),
                            scmp_cmp!($arg3 == (*UNLINKAT_COOKIE_ARG3).into()),
                            scmp_cmp!($arg4 == (*UNLINKAT_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*UNLINKAT_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            // openat2(2) may be used only with syscall argument cookies.
            // We also prevent AT_FDCWD usage as fd argument.
            let sysname = "openat2";
            #[allow(clippy::useless_conversion)]
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[
                            scmp_cmp!($arg0 <= RawFd::MAX as u64),
                            scmp_cmp!($arg4 == (*OPENAT2_COOKIE_ARG4).into()),
                            scmp_cmp!($arg5 == (*OPENAT2_COOKIE_ARG5).into()),
                        ],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }

            // socket(2) may be used only with syscall argument cookies.
            // Note: We only enforce this on architectures where the
            // system call is direct and there's no socketcall(2)
            // multiplexer indirection.
            #[allow(clippy::cast_possible_truncation)]
            #[allow(clippy::useless_conversion)]
            if let Some(syscall) =
                SYS_SOCKET.map(|n| ScmpSyscall::from_raw_syscall(n as RawSyscall))
            {
                ctx.add_rule_conditional(
                    ScmpAction::Allow,
                    syscall,
                    &[
                        scmp_cmp!($arg3 == (*SOCKET_COOKIE_ARG3).into()),
                        scmp_cmp!($arg4 == (*SOCKET_COOKIE_ARG4).into()),
                        scmp_cmp!($arg5 == (*SOCKET_COOKIE_ARG5).into()),
                    ],
                )?;
            } else {
                match ScmpSyscall::from_name("socket") {
                    Ok(syscall) => {
                        // Allow socketcall(2).
                        ctx.add_rule(ScmpAction::Allow, syscall)?;
                    }
                    Err(_) => {
                        info!("ctx": "confine", "op": "allow_emu_syscall",
                            "msg": "invalid or unsupported syscall socket");
                    }
                }
            }
        } else {
            // trace/allow_unsafe_nocookie: Allow access without cookies.
            for sysname in [
                "memfd_create",
                "renameat2",
                "truncate",
                "truncate64",
                "ftruncate",
                "ftruncate64",
                "unlinkat",
                "openat2",
                "socket",
            ] {
                match ScmpSyscall::from_name(sysname) {
                    Ok(syscall) => {
                        ctx.add_rule(ScmpAction::Allow, syscall)?;
                    }
                    Err(_) => {
                        info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                    }
                }
            }
        }

        // openat(2) may be used to open the parent directory only by getdir_long().
        // The rest of the attempts are denied with ENOSYS for compat.
        let sysname = "openat";
        #[allow(clippy::cast_sign_loss)]
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                let dotdot = dotdot_with_nul();
                let oflags = (libc::O_RDONLY
                    | libc::O_CLOEXEC
                    | libc::O_DIRECTORY
                    | libc::O_LARGEFILE
                    | libc::O_NOCTTY
                    | libc::O_NOFOLLOW) as u64;
                ctx.add_rule_conditional(
                    ScmpAction::Allow,
                    syscall,
                    &[
                        scmp_cmp!($arg0 <= RawFd::MAX as u64),
                        scmp_cmp!($arg1 == dotdot),
                        scmp_cmp!($arg2 & oflags == oflags),
                    ],
                )?;
                ctx.add_rule_conditional(
                    ScmpAction::Errno(Errno::ENOSYS as i32),
                    syscall,
                    &[scmp_cmp!($arg0 > RawFd::MAX as u64)],
                )?;
                ctx.add_rule_conditional(
                    ScmpAction::Errno(Errno::ENOSYS as i32),
                    syscall,
                    &[scmp_cmp!($arg1 != dotdot)],
                )?;
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_emu_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // Allow futex system calls.
        for sysname in FUTEX_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow getid system calls.
        for sysname in GET_ID_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow execveat(2) with AT_EXECVE_CHECK for Linux>=6.14.
        if *HAVE_AT_EXECVE_CHECK {
            let sysname = "execveat";
            #[allow(clippy::cast_sign_loss)]
            let atcheck = AT_EXECVE_CHECK.bits() as u64;
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[scmp_cmp!($arg4 & atcheck == atcheck)],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        if flags.intersects(Flags::FL_ALLOW_SAFE_SETUID | Flags::FL_ALLOW_SAFE_SETGID) {
            // SAFETY: Main thread confines these further.
            // As these system calls are per-process,
            // the main thread's seccomp rules will apply
            // to us even without TSYNC.
            for sysname in SET_ID_SYSCALLS {
                match ScmpSyscall::from_name(sysname) {
                    Ok(syscall) => {
                        ctx.add_rule(ScmpAction::Allow, syscall)?;
                    }
                    Err(_) => {
                        info!("ctx": "confine", "op": "allow_emu_syscall",
                            "msg": format!("invalid or unsupported syscall {sysname}"));
                    }
                }
            }

            // SAFETY:
            // 1. cap{g,s}et is necessary to drop CAP_SET{U,G}ID after changing {U,G}ID.
            // 2. Signal system calls are necessary to handle reserved signals.
            // Note, {rt_,}sigreturn is already allowed for emulators to handle SIGALRM.
            for sysname in ["capget", "capset", "sigaction", "rt_sigaction"] {
                match ScmpSyscall::from_name(sysname) {
                    Ok(syscall) => {
                        ctx.add_rule(ScmpAction::Allow, syscall)?;
                    }
                    Err(_) => {
                        info!("ctx": "confine", "op": "allow_emu_syscall",
                            "msg": format!("invalid or unsupported syscall {sysname}"));
                    }
                }
            }
        }

        Ok(ctx)
    }
}

/// Type that exists to manage worker exit on panic.
///
/// A `Sentinel` borrows the `Worker` it guards and optionally records
/// the ID of the seccomp notification that is being handled
/// (`seccomp_id`). Its `Drop` implementation does nothing unless the
/// current thread is panicking. If it is panicking:
///
/// - with `seccomp_id` set (busy worker), the system call in progress
///   is denied with `EACCES` and `decrement_both()` is called on the
///   worker data;
/// - with `seccomp_id` unset (idle worker), only
///   `decrement_worker_total()` is called on the worker data.
struct Sentinel<'a> {
    // ID of the seccomp notification in progress; `None` is treated as
    // "idle" by `Drop`. Presumably updated by the worker loop, which is
    // outside this block -- verify against the caller.
    seccomp_id: Option<u64>,
    // Worker guarded by this sentinel; supplies the fd used to respond
    // to seccomp notifications and the shared worker data.
    worker_ref: &'a Worker,
}

impl<'a> Sentinel<'a> {
    fn new(worker_ref: &'a Worker) -> Sentinel<'a> {
        Self {
            seccomp_id: None,
            worker_ref,
        }
    }

    #[allow(clippy::arithmetic_side_effects)]
    fn deny_syscall(&self, seccomp_id: u64, errno: Errno) {
        let response = libc::seccomp_notif_resp {
            id: seccomp_id,
            val: 0,
            error: -(errno as i32),
            flags: 0,
        };

        // EAGAIN|EINTR is retried.
        // ENOENT means child died mid-way.
        // Nothing else we can do on errors here.
        let _ = seccomp_notify_respond(self.worker_ref.fd, std::ptr::addr_of!(response));
    }
}

impl Drop for Sentinel<'_> {
    /// Performs panic cleanup for the guarded worker.
    ///
    /// Nothing happens on a normal (non-panicking) drop.
    fn drop(&mut self) {
        if !thread::panicking() {
            return;
        }

        match self.seccomp_id {
            Some(pending_id) => {
                // Busy thread panicked.
                // SAFETY: Deny syscall in progress!
                self.deny_syscall(pending_id, Errno::EACCES);
                self.worker_ref.worker_data.decrement_both();
            }
            None => {
                // Idle thread panicked.
                self.worker_ref.worker_data.decrement_worker_total();
            }
        }
    }
}
