//
// Syd: rock-solid application kernel
// src/pool.rs: Self growing / shrinking `ThreadPool` implementation
//
// Copyright (c) 2024, 2025 Ali Polatel <alip@chesswob.org>
// Based in part upon rusty_pool which is:
//     Copyright (c) Robin Friedli <robinfriedli@icloud.com>
//     SPDX-License-Identifier: Apache-2.0
//
// SPDX-License-Identifier: GPL-3.0

// Last sync with rusty_pool:
// Version 0.7.0
// Commit:d56805869ba3cbe47021d5660bbaf19ac5ec4bfb

use std::{
    collections::HashMap,
    env,
    fs::OpenOptions,
    option::Option,
    os::{
        fd::{AsRawFd, BorrowedFd, RawFd},
        unix::fs::OpenOptionsExt,
    },
    sync::{
        atomic::{AtomicBool, Ordering},
        Arc, RwLock,
    },
    thread,
    thread::JoinHandle,
};

use nix::{
    errno::Errno,
    sched::{sched_setaffinity, CpuSet},
    sys::{
        epoll::Epoll,
        signal::{sigaction, SaFlags, SigAction, SigHandler, SigSet, Signal},
    },
    unistd::{getpid, Pid},
};

use crate::{
    compat::epoll_ctl_safe,
    config::*,
    err::{err2no, SydResult},
    error,
    hook::HandlerMap,
    info,
    sandbox::{Capability, Sandbox},
    workers::{
        aes::{AesMap, AesWorker},
        emu::Worker,
        int::Interrupter,
        ipc::IpcWorker,
        BindMap, PidFdMap, WorkerCache, WorkerData,
    },
    ExportMode,
};

// Epoll event to add seccomp fd to epoll (becomes readable when system
// call is interrupted). We specifically zero out the data field to
// distinguish from PidFds.
//
// Quoting: https://idea.popcount.org/2017-02-20-epoll-is-fundamentally-broken-12/
// """
// The best and the only scalable approach is to use recent
// Kernel 4.5+ and use level-triggered events with
// EPOLLEXCLUSIVE flag. This will ensure only one thread is
// woken for an event, avoid "thundering herd" issue and scale
// properly across multiple CPU's.
// """
//
// The flag constants are signed in `libc` whereas the `events` field
// is a `u32`, hence the cast and the lint exception below.
#[allow(clippy::cast_sign_loss)]
const SCMP_EPOLL_EVENT: libc::epoll_event = libc::epoll_event {
    events: (libc::EPOLLIN | libc::EPOLLEXCLUSIVE) as u32,
    u64: 0, // zeroed out to distinguish from pid-fds.
};

// Signal handler function for SIGALRM.
//
// The body is intentionally empty. `ThreadPool::try_spawn_interrupt`
// installs this handler with empty `SaFlags` (i.e. without
// `SA_RESTART`).
// NOTE(review): presumably the only purpose is for the signal delivery
// itself to interrupt a blocking system call in the receiving thread;
// confirm against the interrupter implementation.
extern "C" fn handle_sigalrm(_: libc::c_int) {}

/// Self growing / shrinking `ThreadPool` implementation.
///
/// The pool is cheap to clone: all shared state lives behind `Arc`s,
/// so a clone refers to the same epoll instance, cache, sandbox and
/// worker counters as the original.
#[derive(Clone)]
pub(crate) struct ThreadPool {
    /// Number of core worker threads; `try_spawn` creates workers
    /// without a keep-alive timeout while the pool is below this size.
    core_size: usize,
    /// Keep-alive time handed to non-core (helper) workers. Documented
    /// on `new` as milliseconds.
    keep_alive: u16,
    /// Forwarded unchanged to every `prepare_confine` call and worker
    /// constructor. NOTE(review): exact semantics are defined outside
    /// this file; the monitor log ties it to SROP mitigation.
    safe_setid: bool,
    /// Forwarded unchanged to every `prepare_confine` call and worker
    /// constructor. NOTE(review): semantics are defined outside this file.
    scmp_ssb: bool,
    /// Seccomp fd which `new` registers with `epoll`.
    fd: RawFd,
    /// Epoll instance the seccomp fd is registered with.
    pub(crate) epoll: Arc<Epoll>,
    /// Worker cache built from `epoll` and `fd`; shared with the
    /// interrupt thread and every emulator worker.
    pub(crate) cache: Arc<WorkerCache<'static>>,
    /// Shared sandbox state; read in `boot` to decide whether the AES
    /// thread is needed and handed to every emulator worker.
    sandbox: Arc<RwLock<Sandbox>>,
    /// Handler map handed to every emulator worker.
    handlers: Arc<HandlerMap>,
    /// Map handed to every emulator worker; starts out empty.
    bind_map: BindMap,
    /// Map shared with the AES thread. `boot` requires it to be `Some`
    /// when `Capability::CAP_CRYPT` is enabled.
    crypt_map: Option<AesMap>,
    /// Exit flag polled by the monitor loop and shared with the
    /// interrupt thread and the emulator workers.
    should_exit: Arc<AtomicBool>,
    /// Packed worker counters; `WorkerData::split` yields the current
    /// and the busy worker count.
    worker_data: Arc<WorkerData>,
}

impl ThreadPool {
    /// Build a `ThreadPool` around an existing epoll instance and
    /// seccomp fd.
    ///
    /// The seccomp fd is registered with `epoll` using
    /// `SCMP_EPOLL_EVENT`, the shared `WorkerCache` is created and the
    /// global `PIDFD_MAP` is initialised. No thread is spawned here.
    ///
    /// `core_size` is the amount of threads to keep alive for as long
    /// as the `ThreadPool` exists and the seccomp fd remains open.
    ///
    /// `keep_alive` is the duration in milliseconds for which idle
    /// non-core worker threads are kept alive.
    ///
    /// # Errors
    ///
    /// Returns the `Errno` of the failed epoll registration, or
    /// `Errno::EAGAIN` if `PIDFD_MAP` could not be set.
    #[allow(clippy::cognitive_complexity)]
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        epoll: Epoll,
        fd: RawFd,
        safe_setid: bool,
        scmp_ssb: bool,
        core_size: usize,
        keep_alive: u16,
        sandbox: Arc<RwLock<Sandbox>>,
        handlers: Arc<HandlerMap>,
        crypt_map: Option<AesMap>,
    ) -> Result<Self, Errno> {
        // SAFETY: This relies on the caller passing an open fd; the
        // borrow itself does not outlive this function.
        let scmp_fd = unsafe { BorrowedFd::borrow_raw(fd) };
        epoll_ctl_safe(&epoll.0, scmp_fd.as_raw_fd(), Some(SCMP_EPOLL_EVENT))?;

        // From here on the epoll instance is shared.
        let epoll = Arc::new(epoll);
        let cache = Arc::new(WorkerCache::new(Arc::clone(&epoll), fd));

        // Initialise the global pidfd map; failing to set it is
        // reported as EAGAIN.
        if PIDFD_MAP.set(PidFdMap::new(Arc::clone(&cache))).is_err() {
            return Err(Errno::EAGAIN);
        }

        let should_exit = Arc::new(AtomicBool::new(false));
        let worker_data = Arc::new(WorkerData::default());
        let bind_map = Arc::new(RwLock::new(HashMap::default()));

        Ok(Self {
            core_size,
            keep_alive,
            safe_setid,
            scmp_ssb,
            fd,
            epoll,
            cache,
            sandbox,
            handlers,
            bind_map,
            crypt_map,
            should_exit,
            worker_data,
        })
    }

    /// Boot the thread pool. This is the main entry point.
    ///
    /// Steps, in order:
    ///
    /// 1. If `ExportMode::from_env()` requests it, export the seccomp
    ///    rules of the emulator, interrupter, IPC and AES threads,
    ///    either as `syd_*.bpf` files (paths relative to the current
    ///    directory) or as pseudo filter code on standard output.
    ///    Booting continues after the export.
    /// 2. Spawn the AES thread if `Capability::CAP_CRYPT` is enabled.
    /// 3. Pin the calling thread to CPU 0; a failure is only logged.
    /// 4. Spawn the interrupt thread and the monitor thread.
    ///
    /// Returns the join handle of the AES thread if one was spawned,
    /// `None` otherwise.
    ///
    /// # Errors
    ///
    /// Propagates failures of filter preparation, of creating or
    /// writing an export, and of spawning the interrupt or monitor
    /// thread.
    ///
    /// # Panics
    ///
    /// Panics if encryption is enabled and `crypt_setup` fails, if
    /// `crypt_map` is `None`, or if the AES thread cannot be spawned.
    #[allow(clippy::cognitive_complexity)]
    pub(crate) fn boot(self) -> SydResult<Option<JoinHandle<()>>> {
        // Export seccomp rules if requested.
        // We have to prepare the filter twice if exporting,
        // as we cannot move it safely between threads...
        #[allow(clippy::disallowed_methods)]
        match ExportMode::from_env() {
            Some(ExportMode::BerkeleyPacketFilter) => {
                // Every filter goes to a freshly created file with
                // mode 0400; `create_new` makes the export fail
                // rather than overwrite an existing file.
                let create_export_file = |name: &str| {
                    OpenOptions::new()
                        .write(true)
                        .create_new(true)
                        .mode(0o400)
                        .open(name)
                };

                // Worker rules
                let ctx = Worker::prepare_confine(
                    self.fd,
                    self.epoll.0.as_raw_fd(),
                    self.safe_setid,
                    self.scmp_ssb,
                )?;
                let file = create_export_file("syd_emu.bpf")?;
                ctx.export_bpf(file)?;

                // Interrupter rules
                // We pass dry_run=true to avoid Landlock confinement.
                let ctx = Interrupter::prepare_confine(
                    self.fd,
                    getpid(),
                    self.safe_setid,
                    self.scmp_ssb,
                    true,
                )?;
                let file = create_export_file("syd_int.bpf")?;
                ctx.export_bpf(file)?;

                // IPC thread rules
                // We pass dummy RawFd=0 for epoll FD.
                // We pass dry_run=true to avoid Landlock confinement.
                let ctx = IpcWorker::prepare_confine(0, self.safe_setid, self.scmp_ssb, true)?;
                let file = create_export_file("syd_ipc.bpf")?;
                ctx.export_bpf(file)?;

                // Aes worker rules
                let ctx = AesWorker::prepare_confine(self.safe_setid, self.scmp_ssb)?;
                let file = create_export_file("syd_aes.bpf")?;
                ctx.export_bpf(file)?;
            }
            Some(ExportMode::PseudoFiltercode) => {
                println!("# Syd monitor rules");
                let ctx = Worker::prepare_confine(
                    self.fd,
                    self.epoll.0.as_raw_fd(),
                    self.safe_setid,
                    self.scmp_ssb,
                )?;
                ctx.export_pfc(std::io::stdout())?;

                println!("# Syd interrupter rules");
                // We pass dry_run=true to avoid Landlock confinement,
                // exactly like the BPF export above: we only want to
                // print the rules here, not confine the calling thread.
                let ctx = Interrupter::prepare_confine(
                    self.fd,
                    getpid(),
                    self.safe_setid,
                    self.scmp_ssb,
                    true,
                )?;
                ctx.export_pfc(std::io::stdout())?;

                println!("# Syd ipc rules");
                // We pass dummy RawFd=0 for epoll FD.
                // We pass dry_run=true to avoid Landlock confinement.
                let ctx = IpcWorker::prepare_confine(0, self.safe_setid, self.scmp_ssb, true)?;
                ctx.export_pfc(std::io::stdout())?;

                println!("# Syd encryptor rules");
                let ctx = AesWorker::prepare_confine(self.safe_setid, self.scmp_ssb)?;
                ctx.export_pfc(std::io::stdout())?;
            }
            _ => {}
        }

        // Set up encryption if it is on. The sandbox read lock is
        // confined to this block so it is released before any thread
        // is spawned.
        let crypt = {
            let sandbox = self.sandbox.read().unwrap_or_else(|err| err.into_inner());
            #[allow(clippy::disallowed_methods)]
            if sandbox.enabled(Capability::CAP_CRYPT) {
                let crypt_fds = sandbox.crypt_setup().unwrap();
                let is_mem_fd = sandbox.crypt_tmp.is_none();
                Some((crypt_fds, is_mem_fd))
            } else {
                None
            }
        };

        // Note, we spawn the AES thread before CPU pinning intentionally,
        // so it gets to run on whichever CPU.
        #[allow(clippy::disallowed_methods)]
        let crypt_handle = if let Some((crypt_fds, is_mem_fd)) = crypt {
            let crypt_map = self.crypt_map.as_ref().map(Arc::clone).unwrap();
            Some(
                self.try_spawn_aes(crypt_fds, crypt_map, is_mem_fd)
                    .expect("spawn AES encryption thread"),
            )
        } else {
            None
        };

        // Ensure the lazy num_cpus::get is called before the CPU
        // pinning below, as subsequent invocations are going to
        // return 1.
        let nproc = *NPROC;
        info!("ctx": "boot", "op": "pin_main_thread",
            "msg": format!("detected {nproc} CPUs on the system"),
            "num_cpus": nproc);

        // Attempt to set thread's CPU affinity mask to 0.
        // We pin the main, init and monitor threads to CPU:0.
        // Emulator threads are pinned according to num-cpus.
        let cpu_id = 0;
        let mut cpu_set = CpuSet::new();
        if cpu_set.set(cpu_id).is_ok() {
            match sched_setaffinity(Pid::from_raw(0), &cpu_set) {
                Ok(_) => {
                    info!("ctx": "boot", "op": "pin_main_thread",
                        "msg": format!("pinned main thread to CPU:{cpu_id}"),
                        "cpu": cpu_id);
                }
                Err(errno) => {
                    error!("ctx": "boot", "op": "pin_main_thread",
                        "msg": format!("failed to pin main thread to CPU:{cpu_id}: {errno}"),
                        "err": errno as i32,
                        "cpu": cpu_id);
                }
            }
        }

        // Spawn the interrupt thread which will confine itself.
        self.try_spawn_interrupt()?;

        // Spawn the monitor thread which may confine itself, and spawn
        // emulator threads. Note, this will panic if it cannot spawn
        // the initial emulator thread which is going to tear everything
        // down.
        self.monitor()?;

        // Return join handle of the encryption thread,
        // so we can wait for ongoing encryption processes
        // before exiting the sandbox.
        Ok(crypt_handle)
    }

    /// Start the `syd_mon` monitor thread and detach it.
    ///
    /// The monitor spawns the first emulator thread and then keeps
    /// polling `try_spawn`, so that additional helper threads are
    /// created whenever all workers are busy. This is done to ensure a
    /// sandbox process cannot DOS Syd by merely exhausting workers by
    /// e.g. opening the read end of a FIFO over and over again.
    ///
    /// The monitor confines itself as soon as `Sandbox::locked_once()`
    /// reports true, and leaves its loop once `should_exit` is set.
    ///
    /// # Errors
    ///
    /// Returns the `Errno` derived from the I/O error if the monitor
    /// thread itself cannot be created.
    ///
    /// # Panics
    ///
    /// The monitor thread (not the caller) panics if it cannot prepare
    /// or load its confinement, or if the initial emulator thread
    /// cannot be spawned.
    #[allow(clippy::cognitive_complexity)]
    pub(crate) fn monitor(self) -> Result<(), Errno> {
        let builder = thread::Builder::new()
            .name("syd_mon".to_string())
            .stack_size(MON_STACK_SIZE);

        let spawned = builder.spawn(move || {
            info!("ctx": "boot", "op": "start_monitor_thread",
                "msg": format!("started monitor thread with pool size set to {} threads and keep alive set to {} seconds",
                    self.core_size,
                    self.keep_alive.saturating_div(1000)),
                "core_size": self.core_size,
                "keep_alive": self.keep_alive);

            // In dry-run mode the filter is prepared but never loaded.
            let dry_run = env::var_os(ENV_SKIP_SCMP).is_some() || ExportMode::from_env().is_some();

            // Prepare the monitor's seccomp filter and, unless this is
            // a dry run, load it. Used both before and inside the loop.
            #[allow(clippy::disallowed_methods)]
            let confine_monitor = || {
                let ctx = Worker::prepare_confine(
                    self.fd,
                    self.epoll.0.as_raw_fd(),
                    self.safe_setid,
                    self.scmp_ssb,
                )
                .expect("prepare monitor thread confinement");
                if dry_run {
                    error!("ctx": "confine", "op": "confine_monitor_thread",
                        "msg": "monitor thread is running unconfined in debug mode");
                } else {
                    info!("ctx": "confine", "op": "confine_monitor_thread",
                        "msg": format!("monitor thread confined with{} SROP mitigation",
                            if self.safe_setid { "out" } else { "" }));
                    ctx.load().expect("confine monitor thread");
                }
            };

            // SAFETY: If sandbox is locked, confine right away.
            // The resulting flag is passed on to try_spawn so that
            // threads spawned afterwards don't need to reapply the
            // same filter, as it is inherited.
            let mut confined = if Sandbox::locked_once() {
                confine_monitor();
                true
            } else {
                info!("ctx": "confine", "op": "confine_monitor_thread",
                    "msg": "monitor thread is running unconfined because sandbox isn't locked yet");
                false
            };

            info!("ctx": "boot", "op": "start_core_emulator_threads",
                "msg": format!("starting {} core emulator thread{}, sandboxing started!",
                    self.core_size,
                    if self.core_size > 1 { "s" } else { "" }),
                "core_size": self.core_size,
                "keep_alive": self.keep_alive);

            // SAFETY: Panic if we cannot spawn the initial core thread.
            // There's little sense in continuing in this case.
            #[allow(clippy::disallowed_methods)]
            let (first_worker, _is_core) = self
                .try_spawn(confined)
                .expect("spawn core emulator thread")
                .unwrap();
            // The worker runs detached; its handle is not needed.
            drop(first_worker);

            // SAFETY: Wait for grace period to give the initial core
            // emulator thread a chance to spawn itself.
            std::thread::sleep(MON_GRACE_TIME);

            loop {
                // Late confinement: the sandbox got locked after boot.
                if !confined && Sandbox::locked_once() {
                    confine_monitor();
                    confined = true;
                }

                // Time to exit?
                if self.should_exit.load(Ordering::Relaxed) {
                    break;
                }

                // Spawn a new thread if all others are busy, then pick
                // how long to wait before the next attempt.
                let pause = match self.try_spawn(confined) {
                    // A new emulator thread was spawned: wait for one
                    // cycle only.
                    Ok(Some(_)) => MON_CYCLE_TIME,
                    // Nothing was spawned, or spawning failed: wait for
                    // the grace period. Failures are not logged here;
                    // presumably the spawn path reports them itself.
                    Ok(None) | Err(_) => MON_GRACE_TIME,
                };
                std::thread::sleep(pause);
            }
        });

        match spawned {
            Ok(handle) => {
                // Detach: nobody joins the monitor thread.
                drop(handle);
                Ok(())
            }
            Err(err) => Err(err2no(&err)),
        }
    }

    /// Install the `SIGALRM` handler and start the interrupt thread.
    ///
    /// The interrupt thread exists to unblock Syd syscall handler
    /// threads when the respective sandbox process receives a
    /// non-restarting signal.
    ///
    /// # Errors
    ///
    /// Returns the `Errno` of `sigaction` if the handler cannot be
    /// registered, or the error reported by `Interrupter::try_spawn`.
    pub(crate) fn try_spawn_interrupt(&self) -> Result<JoinHandle<()>, Errno> {
        // No flags (hence no SA_RESTART) and an empty signal mask.
        let handler = SigHandler::Handler(handle_sigalrm);
        let action = SigAction::new(handler, SaFlags::empty(), SigSet::empty());

        // SAFETY: `handle_sigalrm` has an empty body, so the handler
        // being registered does no work in signal context.
        unsafe { sigaction(Signal::SIGALRM, &action) }?;

        let interrupter = Interrupter::new(
            self.fd,
            self.safe_setid,
            self.scmp_ssb,
            Arc::clone(&self.should_exit),
            Arc::clone(&self.cache),
        );
        interrupter.try_spawn()
    }

    /// Start the encryption thread.
    ///
    /// `fdalg`, `files` and `memfd` are handed to `AesWorker::new`
    /// as-is, together with the pool's `safe_setid` and `scmp_ssb`
    /// settings.
    ///
    /// # Errors
    ///
    /// Returns whatever `AesWorker::try_spawn` reports.
    pub(crate) fn try_spawn_aes(
        &self,
        fdalg: (RawFd, RawFd),
        files: AesMap,
        memfd: bool,
    ) -> Result<JoinHandle<()>, Errno> {
        let worker = AesWorker::new(fdalg, files, memfd, self.safe_setid, self.scmp_ssb);
        worker.try_spawn()
    }

    /// Spawn one more emulator thread if the pool needs it.
    ///
    /// The decision is taken from a single snapshot of the worker
    /// counters:
    ///
    /// - below `core_size`: spawn a core worker (no keep-alive);
    /// - otherwise, if an idle worker exists: spawn nothing;
    /// - otherwise, if below `EMU_MAX_SIZE`: spawn a helper worker
    ///   with the pool's `keep_alive`;
    /// - otherwise: log an alert and spawn nothing.
    ///
    /// Returns `Ok(Some((handle, is_core)))` if a thread was spawned,
    /// where `is_core` is true for a core worker, and `Ok(None)` if
    /// nothing was spawned.
    ///
    /// NOTE(review): the capacity-exceeded case is reported as
    /// `Ok(None)`, which callers cannot tell apart from "an idle
    /// worker exists" -- confirm that this is intended.
    ///
    /// # Errors
    ///
    /// Returns the error of `Worker::try_spawn` if the thread cannot
    /// be created.
    #[allow(clippy::cognitive_complexity)]
    #[allow(clippy::type_complexity)]
    pub(crate) fn try_spawn(
        &self,
        confined: bool,
    ) -> Result<Option<(JoinHandle<()>, bool)>, Errno> {
        // Take one snapshot of the packed counters and unpack it.
        let snapshot = self.worker_data.0.load(Ordering::Relaxed);
        let (total, busy) = WorkerData::split(snapshot);

        // Core workers are created regardless of how busy the pool is.
        let need_core = total < self.core_size;

        if !need_core && busy < total {
            // We have idle threads, no need to spawn a new worker.
            crate::debug!("ctx": "spawn", "dec": "idle_emulator_exists",
                "busy_worker_count": busy,
                "curr_worker_count": total,
                "core_size": self.core_size);
            return Ok(None);
        }

        if !need_core && total >= *EMU_MAX_SIZE {
            // We cannot spawn anymore workers!
            // Ideally, this should never happen.
            crate::alert!("ctx": "spawn", "dec": "emulator_capacity_exceeded",
                "busy_worker_count": busy,
                "curr_worker_count": total,
                "core_size": self.core_size,
                "keep_alive": self.keep_alive);
            return Ok(None);
        }

        let keep_alive = if need_core {
            // The pool was below core size when the snapshot was
            // taken: create a new core worker.
            crate::debug!("ctx": "spawn", "dec": "create_new_core_emulator",
                "busy_worker_count": busy,
                "curr_worker_count": total,
                "core_size": self.core_size);
            None
        } else {
            // The pool was observed to be busy (no idle workers) and
            // is still below EMU_MAX_SIZE: create a new helper worker.
            crate::debug!("ctx": "spawn", "dec": "create_new_idle_emulator",
                "busy_worker_count": busy,
                "curr_worker_count": total,
                "core_size": self.core_size,
                "keep_alive": self.keep_alive);
            Some(self.keep_alive)
        };

        // A worker without keep-alive is a core worker.
        let is_core = keep_alive.is_none();

        // Try to spawn a new worker.
        let worker = Worker::new(
            self.fd,
            Arc::clone(&self.epoll),
            Arc::clone(&self.cache),
            Arc::clone(&self.sandbox),
            Arc::clone(&self.handlers),
            keep_alive,
            Arc::clone(&self.should_exit),
            Arc::clone(&self.worker_data),
            Arc::clone(&self.bind_map),
            self.crypt_map.as_ref().map(Arc::clone),
        );
        let handle = worker.try_spawn(confined)?;

        Ok(Some((handle, is_core)))
    }
}
