//
// Syd: rock-solid application kernel
// src/workers/int.rs: `syd_int' interrupter thread
//
// Copyright (c) 2024, 2025 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

use std::{
    env,
    os::fd::RawFd,
    sync::{
        atomic::{AtomicBool, Ordering},
        Arc,
    },
    thread,
    thread::JoinHandle,
};

use libseccomp::{scmp_cmp, ScmpAction, ScmpFilterContext, ScmpSyscall};
use nix::{
    errno::Errno,
    unistd::{getpid, lseek64, Pid, Whence},
};

use crate::{
    cache::SysInterrupt,
    config::*,
    err::{err2no, SydResult},
    error,
    fs::seccomp_notify_id_valid,
    info,
    landlock_policy::LandlockPolicy,
    proc::proc_status_read,
    sandbox::Flags,
    workers::WorkerCache,
    ExportMode,
};

/// State owned by the `syd_int` interrupter thread.
///
/// The thread periodically walks the map of blocked system calls kept in
/// the worker cache and signals the handling `syd_emu` thread when the
/// request backing an entry is no longer valid or an unblocked,
/// non-restarting signal is pending for it.
#[derive(Clone)]
pub(crate) struct Interrupter {
    // File descriptor handed to `seccomp_notify_id_valid` to validate
    // request IDs; it is also the only fd the thread's seccomp filter
    // allows the ID-validation ioctl(2) on.
    scmp: RawFd,
    // Sandbox flags consulted when building the thread's seccomp filter.
    flags: Flags,
    // Checked once per loop iteration; when set the thread leaves its loop.
    should_exit: Arc<AtomicBool>,
    // Shared worker cache; `sysint_map` holds the blocked system call map
    // (`sys_block`) and the restarting signal sets (`sig_restart`).
    cache: Arc<WorkerCache<'static>>,
}

impl Interrupter {
    pub(crate) fn new(
        scmp: RawFd,
        flags: Flags,
        should_exit: Arc<AtomicBool>,
        cache: Arc<WorkerCache<'static>>,
    ) -> Self {
        Self {
            scmp,
            flags,
            should_exit,
            cache,
        }
    }

    /// Start the `syd_int` thread and hand back its join handle.
    ///
    /// Inside the new thread the following happens, in this order:
    /// 1. the thread group id is captured for later `tgkill` calls,
    /// 2. dry-run mode is detected from the environment,
    /// 3. the confinement is prepared and, unless in dry-run, loaded,
    /// 4. the interrupt loop is entered.
    ///
    /// # Errors
    ///
    /// Returns the errno derived from the I/O error if the thread cannot
    /// be spawned.
    ///
    /// # Panics
    ///
    /// The spawned thread panics if the seccomp filter cannot be prepared
    /// or loaded.
    #[allow(clippy::cognitive_complexity)]
    pub(crate) fn try_spawn(self) -> Result<JoinHandle<()>, Errno> {
        let builder = thread::Builder::new()
            .name("syd_int".to_string())
            .stack_size(INT_STACK_SIZE);

        let spawned = builder.spawn(move || {
            // Thread group id, needed by tgkill when signaling threads.
            let tgid = getpid();

            // Exporting, or an explicit request to skip seccomp, both
            // mean we must not actually confine ourselves.
            let dry_run =
                env::var_os(ENV_SKIP_SCMP).is_some() || ExportMode::from_env().is_some();
            if !dry_run {
                // SAFETY: Default panic hook wont play well with seccomp,
                // replace it with a no-op.
                std::panic::set_hook(Box::new(|_| {}));
            }

            // SAFETY: Log now, logging will kill us after seccomp.
            let safe_setid = self
                .flags
                .intersects(Flags::FL_ALLOW_SAFE_SETUID | Flags::FL_ALLOW_SAFE_SETGID);
            let srop = if safe_setid { "out" } else { "" };
            info!("ctx": "confine", "op": "confine_interrupt_thread",
                "msg": format!("interrupt thread confined with{} SROP mitigation", srop));

            // SAFETY: Failing to prepare the filter is fatal, panic.
            // prepare_confine applies Landlock too unless dry_run is set.
            #[allow(clippy::disallowed_methods)]
            let ctx = Self::prepare_confine(self.scmp, tgid, self.flags, dry_run)
                .expect("prepare interrupt thread confinement");
            #[allow(clippy::disallowed_methods)]
            if dry_run {
                error!("ctx": "confine", "op": "confine_interrupt_thread",
                    "msg": "interrupter thread is running unconfined in debug mode");
            } else {
                // SAFETY: Failing to confine the thread is fatal, panic.
                ctx.load().expect("confine interrupt thread");
            }

            // Confinement is in place, run the interrupt loop.
            self.main(tgid)
        });

        spawned.map_err(|err| err2no(&err))
    }

    /// Interrupt loop of the `syd_int` thread.
    ///
    /// Each cycle sleeps for `INT_CYCLE_TIME`, then waits until the map of
    /// blocked system calls is non-empty *or* exit has been requested,
    /// runs [`Self::handle_interrupt`] on every entry (dropping the ones
    /// it returns `false` for), and finally leaves the loop if
    /// `should_exit` is set.
    ///
    /// `tgid` is the thread group id passed on to `tgkill`.
    fn main(self, tgid: Pid) {
        loop {
            // Wait for one cycle.
            std::thread::sleep(INT_CYCLE_TIME);

            // Unblock invalidated blocking system calls.
            {
                let (ref lock, ref cvar) = *self.cache.sysint_map.sys_block;
                let mut map = lock.lock().unwrap_or_else(|err| err.into_inner());

                // As long as the map is empty, we wait for an insert
                // notification. The exit flag is part of the predicate so
                // that a pending shutdown request is honoured instead of
                // parking on the condition variable until some unrelated
                // insert happens. The predicate is evaluated before the
                // first wait and after every wake-up.
                // NOTE(review): for a prompt exit the code that sets
                // `should_exit` should also notify this condvar -- confirm.
                map = cvar
                    .wait_while(map, |map| {
                        map.is_empty() && !self.should_exit.load(Ordering::Relaxed)
                    })
                    .unwrap_or_else(|err| err.into_inner());

                // Handle interrupts as necessary.
                // This is a no-op when we were woken up only to exit.
                map.retain(|_, interrupt| self.handle_interrupt(tgid, interrupt));
            }

            // Check if it's the time to exit.
            if self.should_exit.load(Ordering::Relaxed) {
                break;
            }
        }
    }

    /// Decide the fate of one blocked system call entry.
    ///
    /// Returns `true` when the entry must stay in the map (nothing to do
    /// yet) and `false` when the handling `syd_emu` thread has been
    /// signaled and the entry should be removed.
    ///
    /// The handler thread is signaled when:
    /// - the seccomp request ID is no longer valid,
    /// - the preopened status fd can no longer be rewound or read,
    /// - an unblocked signal is pending that is not filtered out as a
    ///   restarting signal.
    fn handle_interrupt(&self, tgid: Pid, interrupt: &SysInterrupt) -> bool {
        // Stale request: wake the syd_emu thread and drop the entry.
        if !self.is_valid(interrupt.request.id) {
            Self::interrupt(tgid, interrupt.handler);
            return false;
        }

        // Read the pending signals of the thread.
        //
        // SAFETY:
        // 1. We want to wake the respective syd_emu thread in case the
        //    process is no longer valid otherwise we may end up with a
        //    deadlock: See miniupnpc tests, thx kepstin!
        // 2. Because we preopen the status-fd and seek here, there's
        //    no concern for PID-reuse, therefore we don't do a post
        //    seccomp-id validation.
        let status = match lseek64(&interrupt.status_fd, 0, Whence::SeekSet)
            .ok()
            .and_then(|_| proc_status_read(&interrupt.status_fd).ok())
        {
            // SAFETY: seccomp-id validated, proc status is valid.
            Some(status) => status,
            None => {
                // Process is gone: wake the syd_emu thread and drop the entry.
                Self::interrupt(tgid, interrupt.handler);
                return false;
            }
        };

        // Union of the per-thread and the process-wide pending sets:
        // signals aimed at the thread (tgkill/pthread_kill) and signals
        // aimed at the process (kill) can both interrupt the call.
        let mut pending = status.sig_pending_thread | status.sig_pending_process;

        // Signals blocked by the thread's signal mask stay pending but are
        // not delivered, so they cannot interrupt anything right now.
        pending.del_set(status.sig_blocked);

        if pending.is_empty() {
            // Nothing deliverable is pending, keep the entry.
            return true;
        }

        // Drop the restarting signals recorded for this process, unless
        // the entry asks to ignore restart semantics. This may be the
        // case e.g. when the socket has a timeout for accept and connect.
        if !interrupt.ignore_restart {
            let restart_map = self
                .cache
                .sysint_map
                .sig_restart
                .lock()
                .unwrap_or_else(|err| err.into_inner());

            if let Some(sigset_restart) = restart_map.get(&interrupt.tgid) {
                pending.del_set(*sigset_restart);

                if pending.is_empty() {
                    // Only restarting signals are pending, keep the entry.
                    return true;
                }
            }
        }

        // A signal that interrupts the call is pending:
        // wake the syd_emu thread and drop the entry.
        Self::interrupt(tgid, interrupt.handler);
        false
    }

    // Send SIGALRM to thread `syd_emu` of thread group `syd`.
    //
    // A missing target (ESRCH) is tolerated; any other failure
    // terminates the whole process with the errno as exit code.
    #[inline]
    fn interrupt(syd: Pid, syd_emu: Pid) {
        let tgid = syd.as_raw();
        let tid = syd_emu.as_raw();

        // SAFETY: There's no libc wrapper for tgkill.
        let ret = unsafe { libc::syscall(libc::SYS_tgkill, tgid, tid, libc::SIGALRM) };

        if let Err(errno) = Errno::result(ret) {
            if errno != Errno::ESRCH {
                // SAFETY: Inter-thread signaling does not work.
                // This is seriously wrong, exit ASAP.
                unsafe { libc::_exit(errno as i32) };
            }
        }
    }

    /// Report whether the seccomp request `id` is still valid according
    /// to `seccomp_notify_id_valid` on our seccomp fd.
    #[inline(always)]
    fn is_valid(&self, id: u64) -> bool {
        // EAGAIN|EINTR is handled.
        // ENOENT means child died mid-way.
        let outcome = seccomp_notify_id_valid(self.scmp, id);
        outcome.is_ok()
    }

    /// Confine Interrupter thread.
    #[allow(clippy::cognitive_complexity)]
    pub(crate) fn prepare_confine(
        scmp: RawFd,
        tgid: Pid,
        flags: Flags,
        dry_run: bool,
    ) -> SydResult<ScmpFilterContext> {
        if !dry_run {
            // SAFETY: Set up a Landlock sandbox to disallow:
            // 1. All read, write, exec, network access.
            // 2. Scoped UNIX sockets.
            // We cannot enable scoped signals because we
            // want to signal Syd syscall handler threads
            // that are going to be outside this Landlock
            // sandbox.
            let abi = crate::landlock::ABI::new_current();
            let policy = LandlockPolicy {
                scoped_abs: true,
                ..Default::default()
            };
            let _ = policy.restrict_self(abi);
        }

        // Create seccomp filter with default action.
        let mut ctx = ScmpFilterContext::new(ScmpAction::KillProcess)?;

        // Enforce the NO_NEW_PRIVS functionality before
        // loading the seccomp filter into the kernel.
        ctx.set_ctl_nnp(true)?;

        // Disable Speculative Store Bypass mitigations
        // with trace/allow_unsafe_spec_exec:1
        ctx.set_ctl_ssb(flags.contains(Flags::FL_ALLOW_UNSAFE_SPEC_EXEC))?;

        // DO NOT synchronize filter to all threads.
        // Other threads will self-confine.
        ctx.set_ctl_tsync(false)?;

        // We kill for bad system call and bad arch.
        ctx.set_act_badarch(ScmpAction::KillProcess)?;

        // Use a binary tree sorted by syscall number if possible.
        let _ = ctx.set_ctl_optimize(2);

        // SAFETY: Do NOT add supported architectures to the filter.
        // This ensures Syd can never run a non-native system call,
        // which we do not need at all.
        // seccomp_add_architectures(&mut ctx)?;

        // Allow interrupt handler thread to send the
        // SIGALRM signal to threads in Syd's thread group.
        let sysname = "tgkill";
        #[allow(clippy::cast_sign_loss)]
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                ctx.add_rule_conditional(
                    ScmpAction::Allow,
                    syscall,
                    &[
                        scmp_cmp!($arg0 == tgid.as_raw() as u64),
                        scmp_cmp!($arg2 == libc::SIGALRM as u64),
                    ],
                )?;
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // Allow interrupt handler thread to
        // validate seccomp request IDs using ioctl(2).
        let sysname = "ioctl";
        #[allow(clippy::cast_sign_loss)]
        #[allow(clippy::unnecessary_cast)]
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                ctx.add_rule_conditional(
                    ScmpAction::Allow,
                    syscall,
                    &[
                        scmp_cmp!($arg0 == scmp as u64),
                        scmp_cmp!($arg1 == crate::fs::SECCOMP_IOCTL_NOTIF_ID_VALID as u64),
                    ],
                )?;
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // Deny open and stat family with ENOSYS rather than KillProcess.
        // We need this because std::thread::spawn has unwanted
        // side-effects such as opening /sys/devices/system/cpu/online
        // on some architectures.
        for sysname in [
            "open",
            "openat",
            "openat2",
            "stat",
            "lstat",
            "statx",
            "newfstatat",
        ] {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Errno(Errno::ENOSYS as i32), syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow safe fcntl(2) utility calls.
        for sysname in ["fcntl", "fcntl64"] {
            let syscall = match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => syscall,
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                    continue;
                }
            };

            for op in INT_FCNTL_OPS {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &[scmp_cmp!($arg1 == *op)])?;
            }
        }

        // Allow safe system calls.
        for sysname in INT_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow futex system calls.
        for sysname in FUTEX_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow getid system calls.
        for sysname in GET_ID_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        if flags.intersects(Flags::FL_ALLOW_SAFE_SETUID | Flags::FL_ALLOW_SAFE_SETGID) {
            // SAFETY: Main thread confines these further.
            // As these system calls as per-process,
            // the main thread's seccomp rules will apply
            // to us even without TSYNC.
            for sysname in SET_ID_SYSCALLS {
                match ScmpSyscall::from_name(sysname) {
                    Ok(syscall) => {
                        ctx.add_rule(ScmpAction::Allow, syscall)?;
                    }
                    Err(_) => {
                        info!("ctx": "confine", "op": "allow_syscall",
                            "msg": format!("invalid or unsupported syscall {sysname}"));
                    }
                }
            }

            // SAFETY:
            // Signal system calls are necessary to handle reserved signals.
            for sysname in ["sigreturn", "rt_sigreturn"] {
                match ScmpSyscall::from_name(sysname) {
                    Ok(syscall) => {
                        ctx.add_rule(ScmpAction::Allow, syscall)?;
                    }
                    Err(_) => {
                        info!("ctx": "confine", "op": "allow_syscall",
                            "msg": format!("invalid or unsupported syscall {sysname}"));
                    }
                }
            }
        }

        Ok(ctx)
    }
}
