//
// Syd: rock-solid application kernel
// src/kernel/memfd.rs: memfd_create(2) handler
//
// Copyright (c) 2023, 2024, 2025 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

use std::ffi::CStr;

use libseccomp::ScmpNotifResp;
use memchr::arch::all::is_prefix;
use nix::{
    errno::Errno,
    sys::stat::{fchmod, Mode},
};

use crate::{
    config::{HAVE_MFD_NOEXEC_SEAL, MFD_NAME_PREFIX, MFD_NAME_PREFIX_HUGETLB},
    cookie::safe_memfd_create,
    fs::MfdFlags,
    hook::{RemoteProcess, UNotifyEventRequest},
    kernel::sandbox_path,
    path::XPathBuf,
    sandbox::Capability,
};

/// Handler for the memfd_create(2) system call.
///
/// The handler performs the following steps in order:
///
/// 1. Validate the flags argument (see `to_mfdflags`).
/// 2. Reject a NULL name pointer with `EFAULT`.
/// 3. Unless unsafe memfds are allowed and if the kernel supports
///    `MFD_NOEXEC_SEAL`, drop `MFD_EXEC` and seal the memfd as
///    non-executable.
/// 4. Read the name from the memory of the sandbox process; names
///    without a NUL byte within the kernel limit yield `EINVAL`,
///    and so do names starting with `syd` which are reserved.
/// 5. If sandboxing for the create capability is enabled, check
///    access for the prefixed name; denial yields `EACCES`.
/// 6. Create the memfd with close-on-exec set on our side, strip
///    the executable bits on kernels without `MFD_NOEXEC_SEAL` when
///    memfds are restricted, and send the fd to the sandbox process.
///
/// Errors returned from the closure are handed to `syscall_handler!`,
/// which is responsible for building the seccomp notify response.
pub(crate) fn sys_memfd_create(request: UNotifyEventRequest) -> ScmpNotifResp {
    syscall_handler!(request, |request: UNotifyEventRequest| {
        const NAME_MAX: usize = 255;
        // The slash is not included in the limit.
        const MFD_NAME_PREFIX_LEN: usize = MFD_NAME_PREFIX.len() - 1;
        // Maximum length of the name, excluding the terminating NUL byte.
        const MFD_NAME_MAX_LEN: usize = NAME_MAX - MFD_NAME_PREFIX_LEN;
        // Size of the read buffer: the kernel accepts names of up to
        // MFD_NAME_MAX_LEN bytes followed by a NUL byte, so we need
        // one extra byte to be able to see the terminator of the
        // longest valid name.
        const MFD_NAME_BUF_LEN: usize = MFD_NAME_MAX_LEN + 1;

        let req = request.scmpreq;
        let addr = req.data.args[0];
        let flags = req.data.args[1];

        // Validate flags argument first.
        let mut flags = to_mfdflags(flags)?;

        // Validate name argument next.
        if addr == 0 {
            // Return EFAULT for NULL name.
            return Err(Errno::EFAULT);
        }

        // Query the sandbox options we need while holding the lock.
        // `check` tells whether sandboxing for create capability is on.
        let sandbox = request.get_sandbox();
        let check = sandbox.enabled(Capability::CAP_CREATE);
        let force_cloexec = sandbox.force_cloexec();
        let force_rand_fd = sandbox.force_rand_fd();
        let restrict_memfd = !sandbox.allow_unsafe_memfd();

        if restrict_memfd && *HAVE_MFD_NOEXEC_SEAL {
            // SAFETY: Drop the executable flag and seal as nonexecutable.
            flags.remove(MfdFlags::MFD_EXEC);
            flags.insert(MfdFlags::MFD_NOEXEC_SEAL);
        }

        let mut buf = [0u8; MFD_NAME_BUF_LEN];
        request.read_mem(&mut buf, addr)?;

        // If buffer has no null byte, return EINVAL as the name
        // is longer than MFD_NAME_MAX_LEN bytes.
        let name = CStr::from_bytes_until_nul(&buf).or(Err(Errno::EINVAL))?;

        // SAFETY: If name starts with `syd', return EINVAL as these
        // memory file descriptors are for Syd's internal use.
        if is_prefix(name.to_bytes(), b"syd") {
            return Err(Errno::EINVAL);
        }

        if check {
            // Sandboxing for create capability is on:
            // Check for access by prepending the memfd prefix to the name.
            let mut path = XPathBuf::from(if flags.contains(MfdFlags::MFD_HUGETLB) {
                // /memfd-hugetlb:
                MFD_NAME_PREFIX_HUGETLB
            } else {
                // /memfd:
                MFD_NAME_PREFIX
            });
            path.append_bytes(name.to_bytes());

            // Unused when request.is_some()
            let process = RemoteProcess::new(request.scmpreq.pid());

            // Any failure of the access check is reported as EACCES.
            sandbox_path(
                Some(&request),
                &sandbox,
                &process,
                &path,
                Capability::CAP_CREATE,
                false,
                "memfd_create",
            )
            .or(Err(Errno::EACCES))?;
        }
        drop(sandbox); // release the read-lock.

        // Set CLOEXEC for our fd always, and
        // Set CLOEXEC for remote fd as necessary.
        // Note, `cloexec` must be computed before the flag is forced on.
        let cloexec = force_cloexec || flags.contains(MfdFlags::MFD_CLOEXEC);
        flags.insert(MfdFlags::MFD_CLOEXEC);

        // Access granted, emulate call.
        let fd = safe_memfd_create(name.to_bytes_with_nul(), flags.bits())?;

        if restrict_memfd && !*HAVE_MFD_NOEXEC_SEAL {
            // SAFETY: If the kernel does not have support for
            // MFD_NOEXEC_SEAL which is new in Linux>=6.3, explicitly
            // remove the executable bits here. If an attacker wants to
            // use executable memfds on an older kernel, they're gonna
            // have to run fchmod(2) first which syd will intervene
            // and check for access.
            fchmod(&fd, Mode::from_bits_truncate(0o600))?;
        }

        // Return the fd to the sandbox process.
        request.send_fd(fd, cloexec, force_rand_fd)
    })
}

// Turn the raw memfd_create(2) flags argument into MfdFlags.
//
// Returns EINVAL when the argument does not fit into the flags
// type, when it carries bits MfdFlags does not define, or when the
// MFD_EXEC / MFD_NOEXEC_SEAL combination is not acceptable for the
// running kernel.
#[inline]
fn to_mfdflags(arg: u64) -> Result<MfdFlags, Errno> {
    // SAFETY: Reject values which do not fit into the flags type.
    let bits = arg.try_into().or(Err(Errno::EINVAL))?;

    // SAFETY: Reject undefined flags for future safety!
    let Some(flags) = MfdFlags::from_bits(bits) else {
        return Err(Errno::EINVAL);
    };

    // SAFETY:
    // 1. Linux>=6.3: Only the two flags specified together are invalid.
    // 2. Linux<6.3: Either of the two flags is invalid.
    let exec_mask = MfdFlags::MFD_EXEC | MfdFlags::MFD_NOEXEC_SEAL;
    let invalid = if *HAVE_MFD_NOEXEC_SEAL {
        flags.contains(exec_mask)
    } else {
        flags.intersects(exec_mask)
    };

    if invalid {
        Err(Errno::EINVAL)
    } else {
        Ok(flags)
    }
}
