//
// Syd: rock-solid application kernel
// src/workers/int.rs: `syd_int' interrupter thread
//
// Copyright (c) 2024, 2025 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

use std::{
    collections::HashSet,
    env,
    os::fd::RawFd,
    sync::{
        atomic::{AtomicBool, Ordering},
        Arc,
    },
    thread,
    thread::JoinHandle,
};

use libseccomp::{scmp_cmp, ScmpAction, ScmpFilterContext, ScmpSyscall};
use nix::{
    errno::Errno,
    unistd::{getpid, Pid},
};

use crate::{
    cache::SysInterrupt,
    config::*,
    err::{err2no, SydResult},
    error, info,
    landlock_policy::LandlockPolicy,
    path::XPathBuf,
    proc::proc_status,
    workers::WorkerCache,
    ExportMode,
};

/// State carried by the `syd_int` interrupter thread.
///
/// The thread periodically walks the map of blocked system calls and
/// signals the handler thread of every entry whose process has an
/// interrupting signal pending.
#[derive(Clone)]
pub(crate) struct Interrupter {
    /// File descriptor handed to `seccomp_notify_id_valid` to validate
    /// request IDs; it is also the only descriptor the thread's seccomp
    /// filter lets `ioctl` target.
    scmp: RawFd,
    /// When true, the thread's seccomp filter additionally allows
    /// `SET_ID_SYSCALLS` and `sigreturn`/`rt_sigreturn`.
    safe_setid: bool,
    /// Forwarded to `ScmpFilterContext::set_ctl_ssb` when the filter is built.
    scmp_ssb: bool,
    /// Polled once per cycle by the main loop; the thread stops once it is set.
    should_exit: Arc<AtomicBool>,
    /// Shared worker cache; the thread reads its `sysint_map`
    /// (`sys_block` and `sig_restart`).
    cache: Arc<WorkerCache<'static>>,
}

impl Interrupter {
    /// Bundle the given handles and flags into an `Interrupter`.
    ///
    /// Nothing is spawned or confined here; the values are only stored
    /// for later use by `try_spawn`.
    pub(crate) fn new(
        scmp: RawFd,
        safe_setid: bool,
        scmp_ssb: bool,
        should_exit: Arc<AtomicBool>,
        cache: Arc<WorkerCache<'static>>,
    ) -> Self {
        // Shorthand initialisation; field order is irrelevant to the result.
        Self {
            cache,
            should_exit,
            scmp_ssb,
            safe_setid,
            scmp,
        }
    }

    /// Spawn the `syd_int` thread, confine it and run the interrupt loop.
    ///
    /// The thread is named `syd_int` and uses a stack of `INT_STACK_SIZE`.
    /// Dry-run mode is active when `ENV_SKIP_SCMP` is present in the
    /// environment or `ExportMode::from_env()` yields a mode; in that
    /// case the seccomp filter is prepared but never loaded.
    ///
    /// # Errors
    ///
    /// Returns the `Errno` mapped from the I/O error by `err2no` when
    /// the thread cannot be spawned.
    ///
    /// # Panics
    ///
    /// The spawned thread panics if the seccomp filter cannot be
    /// prepared, or cannot be loaded outside of dry-run mode.
    #[allow(clippy::cognitive_complexity)]
    pub(crate) fn try_spawn(self) -> Result<JoinHandle<()>, Errno> {
        let builder = thread::Builder::new()
            .name("syd_int".to_string())
            .stack_size(INT_STACK_SIZE);

        let spawned = builder.spawn(move || {
            // Thread group ID, passed to tgkill(2) when signaling threads.
            let tgid = getpid();

            // Skipping seccomp or exporting the filter both mean dry-run.
            let dry_run =
                env::var_os(ENV_SKIP_SCMP).is_some() || ExportMode::from_env().is_some();
            if !dry_run {
                // The default panic hook won't play well with seccomp,
                // replace it with a no-op.
                std::panic::set_hook(Box::new(|_| {}));
            }

            // Log now: logging after the filter is loaded would kill us.
            info!("ctx": "confine", "op": "confine_interrupt_thread",
                "msg": format!("interrupt thread confined with{} SROP mitigation",
                    if self.safe_setid { "out" } else { "" }));

            // Build the filter; failure here is fatal by design.
            // Note, prepare_confine also applies Landlock for !dry_run.
            #[allow(clippy::disallowed_methods)]
            let ctx =
                Self::prepare_confine(self.scmp, tgid, self.safe_setid, self.scmp_ssb, dry_run)
                    .expect("prepare interrupt thread confinement");

            #[allow(clippy::disallowed_methods)]
            if dry_run {
                error!("ctx": "confine", "op": "confine_interrupt_thread",
                    "msg": "interrupter thread is running unconfined in debug mode");
            } else {
                // Failure to confine the thread is fatal by design.
                ctx.load().expect("confine interrupt thread");
            }

            // Hand over to the interrupt loop.
            self.main(tgid)
        });

        spawned.map_err(|err| err2no(&err))
    }

    fn main(self, tgid: Pid) {
        loop {
            // Wait for one cycle.
            std::thread::sleep(INT_CYCLE_TIME);

            // Unblock invalidated blocking system calls.
            {
                let (ref lock, ref cvar) = *self.cache.sysint_map.sys_block;
                let mut map = lock.lock().unwrap_or_else(|err| err.into_inner());

                while map.is_empty() {
                    map = cvar.wait(map).unwrap_or_else(|err| err.into_inner());
                }
                map.retain(|_, interrupt| self.handle_interrupt(tgid, *interrupt));
            }

            // Check if it's the time to exit.
            if self.should_exit.load(Ordering::Relaxed) {
                break;
            }
        }
    }

    fn handle_interrupt(&self, tgid: Pid, interrupt: SysInterrupt) -> bool {
        // Check pending signals for the thread.
        #[allow(clippy::cast_possible_wrap)]
        let request_pid = Pid::from_raw(interrupt.request.pid as libc::pid_t);
        let status = if let Ok(status) = proc_status(request_pid) {
            status
        } else {
            // Proces no longer valid, remove.
            return false;
        };

        // SAFETY: Validate request ID to ensure `/proc` read was valid.
        // Note, this function is a hot path where we don't want to run
        // notify_supported() on each call.
        // libseccomp::notify_id_valid(self.scmp, interrupt.request.id).is_err().
        if unsafe { libseccomp_sys::seccomp_notify_id_valid(self.scmp, interrupt.request.id) } != 0
        {
            // Request no longer valid, remove.
            return false;
        }

        // Check for per-{thread,process} pending signals.
        let mut sigset = status.sig_pending_thread | status.sig_pending_process;

        // Filter out restarting signals per-process,
        // unless ignore_restart is set. This may be the
        // case e.g. when the socket has a timeout for
        // accept and connect.
        // Note, `interrupt.ignore_restart` check
        // was done before calling this function and
        // sigset_restart is only Some if it is false.
        if !interrupt.ignore_restart {
            if let Some(sigset_restart) = self
                .cache
                .sysint_map
                .sig_restart
                .lock()
                .unwrap_or_else(|err| err.into_inner())
                .get(&interrupt.tgid)
            {
                sigset.del_set(*sigset_restart);
            }
        }

        if sigset.is_empty() {
            // No interrupt signals received, keep the entry.
            return true;
        }

        // Interrupt the syscall handler thread.
        // SAFETY: There's no libc wrapper for tgkill.
        match Errno::result(unsafe {
            libc::syscall(
                libc::SYS_tgkill,
                tgid.as_raw(),
                interrupt.handler.as_raw(),
                libc::SIGALRM,
            )
        }) {
            Ok(_) | Err(Errno::ESRCH) => false,
            Err(errno) => {
                // SAFETY: Inter-thread signaling does not work.
                // This is seriously wrong, exit ASAP.
                unsafe { libc::_exit(errno as i32) };
            }
        }
    }

    /// Confine Interrupter thread.
    #[allow(clippy::cognitive_complexity)]
    pub(crate) fn prepare_confine(
        scmp: RawFd,
        tgid: Pid,
        safe_setid: bool,
        ssb: bool,
        dry_run: bool,
    ) -> SydResult<ScmpFilterContext> {
        if !dry_run {
            // SAFETY: Set up a Landlock sandbox to disallow:
            // 1. All read access except `/proc` filesystem.
            // 2. All write, network access.
            // 3. Scoped UNIX sockets.
            // We cannot enable scoped signals because we
            // want to signal Syd syscall handler threads
            // that are going to be outside this Landlock
            // sandbox.
            let abi = crate::landlock::ABI::new_current();
            let mut set = HashSet::default();
            set.insert(XPathBuf::from("/proc"));
            let policy = LandlockPolicy {
                scoped_abs: true,
                read_pathset: Some(set),
                ..Default::default()
            };
            let _ = policy.restrict_self(abi);
        }

        // Create seccomp filter with default action.
        let mut ctx = ScmpFilterContext::new(ScmpAction::KillProcess)?;

        // Enforce the NO_NEW_PRIVS functionality before
        // loading the seccomp filter into the kernel.
        ctx.set_ctl_nnp(true)?;

        // Disable Speculative Store Bypass mitigations
        // with trace/allow_unsafe_spec_exec:1
        ctx.set_ctl_ssb(ssb)?;

        // DO NOT synchronize filter to all threads.
        // Other threads will self-confine.
        ctx.set_ctl_tsync(false)?;

        // We kill for bad system call and bad arch.
        ctx.set_act_badarch(ScmpAction::KillProcess)?;

        // Use a binary tree sorted by syscall number if possible.
        let _ = ctx.set_ctl_optimize(2);

        // SAFETY: Do NOT add supported architectures to the filter.
        // This ensures Syd can never run a non-native system call,
        // which we do not need at all.
        // seccomp_add_architectures(&mut ctx)?;

        // Allow interrupt handler thread to send the
        // SIGALRM signal to threads in Syd's thread group.
        let sysname = "tgkill";
        #[allow(clippy::cast_sign_loss)]
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                ctx.add_rule_conditional(
                    ScmpAction::Allow,
                    syscall,
                    &[
                        scmp_cmp!($arg0 == tgid.as_raw() as u64),
                        scmp_cmp!($arg2 == libc::SIGALRM as u64),
                    ],
                )?;
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // Allow interrupt handler thread to
        // validate seccomp request IDs using ioctl(2).
        let sysname = "ioctl";
        #[allow(clippy::cast_sign_loss)]
        #[allow(clippy::unnecessary_cast)]
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                ctx.add_rule_conditional(
                    ScmpAction::Allow,
                    syscall,
                    &[
                        scmp_cmp!($arg0 == scmp as u64),
                        scmp_cmp!($arg1 == crate::hook::SECCOMP_IOCTL_NOTIF_ID_VALID as u64),
                    ],
                )?;
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // Allow interrupt handler thread to access
        // `/proc` file system to read information
        // on pending signals.
        // TODO: Restrict this further.
        let sysname = "openat2";
        #[allow(clippy::cast_sign_loss)]
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                ctx.add_rule_conditional(
                    ScmpAction::Allow,
                    syscall,
                    &[scmp_cmp!($arg0 == PROC_FD() as u64)],
                )?;
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // Deny open and stat family with ENOSYS rather than KillProcess.
        // We need this because std::thread::spawn has unwanted
        // side-effects such as opening /sys/devices/system/cpu/online
        // on some architectures.
        for sysname in ["open", "openat", "stat", "lstat", "statx", "newfstatat"] {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Errno(Errno::ENOSYS as i32), syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow safe fcntl(2) utility calls.
        for sysname in ["fcntl", "fcntl64"] {
            let syscall = match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => syscall,
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                    continue;
                }
            };

            for op in INT_FCNTL_OPS {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &[scmp_cmp!($arg1 == *op)])?;
            }
        }

        // Allow safe system calls.
        for sysname in INT_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow futex system calls.
        for sysname in FUTEX_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow getid system calls.
        for sysname in GET_ID_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        if safe_setid {
            // SAFETY: Main thread confines these further.
            // As these system calls as per-process,
            // the main thread's seccomp rules will apply
            // to us even without TSYNC.
            for sysname in SET_ID_SYSCALLS {
                match ScmpSyscall::from_name(sysname) {
                    Ok(syscall) => {
                        ctx.add_rule(ScmpAction::Allow, syscall)?;
                    }
                    Err(_) => {
                        info!("ctx": "confine", "op": "allow_syscall",
                            "msg": format!("invalid or unsupported syscall {sysname}"));
                    }
                }
            }

            // SAFETY:
            // Signal system calls are necessary to handle reserved signals.
            for sysname in ["sigreturn", "rt_sigreturn"] {
                match ScmpSyscall::from_name(sysname) {
                    Ok(syscall) => {
                        ctx.add_rule(ScmpAction::Allow, syscall)?;
                    }
                    Err(_) => {
                        info!("ctx": "confine", "op": "allow_syscall",
                            "msg": format!("invalid or unsupported syscall {sysname}"));
                    }
                }
            }
        }

        Ok(ctx)
    }
}
