//
// Syd: rock-solid application kernel
// src/utils/syd-tor.rs: Syd's SOCKS Proxy Forwarder
//
// Copyright (c) 2024, 2025 Ali Polatel <alip@chesswob.org>
// Based in part upon socksns crate which is:
//     Copyright (c) 2020 Steven Engler
//     SPDX-License-Identifier: MIT
//
// SPDX-License-Identifier: GPL-3.0

use std::{
    env,
    io::stderr,
    net::{IpAddr, Ipv4Addr, SocketAddr},
    num::NonZeroUsize,
    os::{
        fd::{AsFd, AsRawFd, BorrowedFd, FromRawFd, OwnedFd, RawFd},
        unix::net::UnixStream,
    },
    process::{exit, ExitCode},
    ptr::NonNull,
};

use libseccomp::{scmp_cmp, ScmpAction, ScmpFilterContext, ScmpSyscall};
use nix::{
    errno::Errno,
    fcntl::{fcntl, open, splice, FcntlArg, OFlag, SpliceFFlags},
    poll::PollTimeout,
    sched::{unshare, CloneFlags},
    sys::{
        epoll::{Epoll, EpollCreateFlags, EpollEvent, EpollFlags},
        mman::{mmap_anonymous, MapFlags, ProtFlags},
        prctl::set_dumpable,
        resource::{getrlimit, setrlimit, Resource},
        socket::{
            accept4, listen, shutdown, AddressFamily, Backlog, Shutdown, SockFlag, SockType,
            SockaddrLike, SockaddrStorage, UnixAddr,
        },
        stat::Mode,
    },
    unistd::{chdir, chroot, pipe2, sysconf, write, SysconfVar},
};
use sendfd::RecvWithFd;
use syd::{
    compat::epoll_ctl_safe,
    config::PIPE_BUF,
    confine::{confine_mdwe, seccomp_native_has_socketcall},
    cookie::{
        safe_connect, safe_socket, CONNECT_COOKIE_ARG3, CONNECT_COOKIE_ARG4, CONNECT_COOKIE_ARG5,
        SOCKET_COOKIE_ARG3, SOCKET_COOKIE_ARG4, SOCKET_COOKIE_ARG5,
    },
    err::{SydError, SydResult},
    fs::{closeexcept, retry_on_eintr},
    hash::SydHashMap,
    ignore_signals,
    landlock_policy::LandlockPolicy,
    parsers::sandbox::PortSet,
    path::XPathBuf,
    rng::{duprand, fillrandom, randint},
    sealbox::{
        check_madvise_guard_support, madvise_guard_install, mprotect_none, mprotect_readonly, mseal,
    },
    IgnoreSignalOpts,
};

// Pong reply to Syd for debugging.
//
// A single JSON line, terminated by a newline and a NUL byte, that
// `Proxy::run` writes to standard error in debug mode right before
// it enters the event loop.
const PONG: &[u8] =
    b"{\"ctx\":\"recv_proxy_fd\",\"tor\":\"pong\",\"msg\":\"Action brings good fortune.\"}\n\0";

// Program entry point.
//
// Steps, in order:
// 1. Parse options and ignore signals.
// 2. Receive the listening socket over the UNIX socket and start
//    listening on it.
// 3. Create the epoll instance.
// 4. Copy the external address into a guarded, read-only mapping.
// 5. Confine the process and run the forwarding loop.
//
// The exit code is 0 on success, or the errno of the failure
// (ENOSYS if the error carries no errno).
syd::main! {
    // Parse options.
    let mut opts = parse_options()?;

    // Ignore all signals except SIG{CHLD,KILL,STOP}.
    // This is used to ensure we can deny {rt_,}sigreturn(2) to mitigate SROP.
    ignore_signals(IgnoreSignalOpts::empty())?;

    // SAFETY: We leak the UNIX socket fd on purpose,
    // and leave it open for process lifetime.
    // seccomp(2) is going to ensure access is confined.
    std::mem::forget(opts.unix.take());

    // Attempt to set file-max to hard limit overriding the soft limit.
    // Failure is ignored: we simply keep running with the soft limit.
    if let Ok((soft_limit, hard_limit)) = getrlimit(Resource::RLIMIT_NOFILE) {
        if soft_limit < hard_limit {
            let _ = setrlimit(Resource::RLIMIT_NOFILE, hard_limit, hard_limit);
        }
    }

    // Destructure the options so that `sock` can be dropped
    // right after the listening socket has been received.
    let TorOpts {
        addr,
        fpid,
        sock,
        is_debug: debug,
        ..
    } = opts;

    // Receive the socket bound inside the namespace.
    // Exactly one valid fd is expected, anything else is EBADF.
    let mut buf = [0u8; 1];
    let mut lfd = [-1; 1];
    let proxy = match sock.recv_with_fd(&mut buf, &mut lfd) {
        Ok((_, 1)) if lfd[0] >= 0 => {
            // Socket no longer necessary.
            drop(sock);
            // SAFETY: recvmsg received a valid FD.
            unsafe { OwnedFd::from_raw_fd(lfd[0]) }
        }
        Ok(_) => return Err(Errno::EBADF.into()),
        Err(error) => return Err(error.into()),
    };

    // SAFETY: Randomize received socket fd for hardening.
    // The original fd is closed once the duplicate exists.
    let proxy_rand = duprand(proxy.as_raw_fd(), OFlag::O_CLOEXEC)?;
    drop(proxy);
    // SAFETY: duprand returns a valid FD on success.
    let proxy = unsafe { OwnedFd::from_raw_fd(proxy_rand) };

    // Start listening on the bound fd.
    // Note, Syd already set this fd non-blocking.
    listen(&proxy, Backlog::MAXCONN)?;

    // Set socket options (tcp fastopen et al.)
    set_socket_options(&proxy, Some(libc::SOMAXCONN));

    // Create epoll instance.
    let epoll = Epoll::new(EpollCreateFlags::EPOLL_CLOEXEC)?;

    // SAFETY: Randomize the epoll fd for hardening.
    let epoll_fd = duprand(epoll.0.as_raw_fd(), OFlag::O_CLOEXEC).map(|fd| {
        // SAFETY: duprand returns a valid FD on success.
        unsafe { OwnedFd::from_raw_fd(fd) }
    })?;
    // Close the original epoll fd, only the randomized duplicate is used.
    drop(epoll);

    // Allocate a private memory area for the address pointer.
    //
    // We allocate three pages and install guard-pages before
    // and after the page where we store the external network
    // address.
    #[allow(clippy::cast_possible_truncation)]
    #[allow(clippy::cast_sign_loss)]
    let page_siz =
        NonZeroUsize::new(sysconf(SysconfVar::PAGE_SIZE)?.ok_or(Errno::EINVAL)? as usize)
            .ok_or(Errno::EINVAL)?;
    let base_siz = NonZeroUsize::new(page_siz.get() * 3).ok_or(Errno::EINVAL)?;

    // SAFETY: In libc we trust.
    let base_ptr = unsafe {
        mmap_anonymous(
            None,
            base_siz,
            ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
            MapFlags::MAP_PRIVATE,
        )
    }?;

    // Layout of the mapping: [guard page 1][address page][guard page 2].
    let guard_ptr_1 = base_ptr;
    let addr_ptr = NonNull::new(((guard_ptr_1.as_ptr() as usize) + page_siz.get()) as *mut _)
        .ok_or(Errno::EINVAL)?;
    let guard_ptr_2 = NonNull::new(((addr_ptr.as_ptr() as usize) + page_siz.get()) as *mut _)
        .ok_or(Errno::EINVAL)?;

    // Copy existing SockaddrStorage into the new mapping,
    // and shadow the old `addr` binding with a reference
    // to the new mapping.
    //
    // SAFETY:
    // - `addr_ptr` is non-NULL and points to a writable memory region of one page.
    // - The entire page is first filled with random data using fillrandom().
    // - A random prefix offset (aligned properly for SockaddrStorage) is chosen
    //   to obscure the exact address location, so that the structure is not placed
    //   at the beginning of the page, preventing trivial address prediction.
    // - The offset is limited to avoid exceeding the page boundary and ensures proper alignment.
    let addr: &SockaddrStorage = unsafe {
        let addr_ptr = addr_ptr.as_ptr();

        // Prefill with random data.
        let addr_vec = std::slice::from_raw_parts_mut(addr_ptr as *mut u8, page_siz.get());
        fillrandom(addr_vec)?;

        // Find a random aligned offset within the usable range.
        // Rounding the random offset down to the alignment keeps it
        // within `0..=max_offset`.
        let addr_size = std::mem::size_of::<SockaddrStorage>();
        let addr_align = std::mem::align_of::<SockaddrStorage>();
        let max_offset = page_siz.get().checked_sub(addr_size).ok_or(Errno::EINVAL)?;
        let rand_off = randint(0..=max_offset as u64)? as usize;
        let aligned_off = rand_off & !(addr_align - 1);

        let addr_ptr = (addr_ptr as usize)
            .checked_add(aligned_off)
            .ok_or(Errno::EINVAL)? as *mut SockaddrStorage;
        addr_ptr.write(addr);

        &*addr_ptr
    };

    // Set mapping back to read-only.
    mprotect_readonly(addr_ptr, page_siz)?;

    // Set guard-pages to PROT_NONE.
    mprotect_none(guard_ptr_1, page_siz)?;
    mprotect_none(guard_ptr_2, page_siz)?;

    // Install lightweight guard-pages if supported.
    if check_madvise_guard_support() {
        madvise_guard_install(guard_ptr_1, page_siz)?;
        madvise_guard_install(guard_ptr_2, page_siz)?;
    }

    // Protect the memory area of address pointer,
    // from unwanted modifications.
    //
    // The following errors are tolerated:
    // ENOSYS: mseal(2) is not implemented (requires Linux>=6.10)
    // EPERM: Sealing is supported only on 64-bit CPUs, 32-bit is not supported.
    match mseal(base_ptr, base_siz) {
        Ok(_) | Err(Errno::ENOSYS | Errno::EPERM) => {}
        Err(errno) => return Err(errno.into()),
    }

    // Print rules if SYD_TOR_RULES is set in the environment.
    let print = env::var_os("SYD_TOR_RULES").is_some();

    // Initialize the proxy server.
    let proxy = Proxy::new(Epoll(epoll_fd), fpid, proxy, addr, debug, print);

    // Confine the proxy server.
    proxy.confine()?;

    // Run the proxy server.
    let result = proxy.run();

    // Exit with 0 or error number.
    Ok(match result {
        Ok(_) => ExitCode::SUCCESS,
        Err(err) => ExitCode::from(err.errno().unwrap_or(Errno::ENOSYS) as i32 as u8),
    })
}

// Index from any registered socket fd (client side or external side)
// to the client-side fd which keys the owning entry in `CliMap`.
type FdMap = SydHashMap<RawFd, RawFd>;
// Active connections, keyed by the raw fd of the client-side socket.
type CliMap = SydHashMap<RawFd, Client>;

// State of the proxy server.
struct Proxy<'a> {
    // Debug mode: confinement is skipped and PONG is written to
    // standard error before the event loop starts.
    debug: bool,
    // Dump the seccomp rules to standard error during confinement.
    print: bool,
    // Epoll instance which drives the event loop.
    epoll: Epoll,
    // PIDFd registered with epoll, it becomes readable when the
    // process terminates and makes the event loop exit.
    pidfd: OwnedFd,
    // Listening socket on which client connections are accepted.
    proxy: OwnedFd,
    // External address each accepted connection is forwarded to.
    extaddr: &'a SockaddrStorage,
    // Active connections keyed by client-side fd.
    clients: CliMap,
    // Maps both fds of every connection to its client-side fd.
    fdindex: FdMap,
}

impl<'a> Proxy<'a> {
    fn run(mut self) -> SydResult<()> {
        // 1. Add PIDFd to epoll (becomes readable when process terminates).
        let event = libc::epoll_event {
            events: (EpollFlags::EPOLLET
                | EpollFlags::EPOLLIN
                | EpollFlags::EPOLLRDHUP
                | EpollFlags::EPOLLONESHOT)
                .bits() as u32,
            u64: self.pidfd.as_fd().as_raw_fd() as u64,
        };
        epoll_ctl_safe(&self.epoll.0, self.pidfd.as_fd().as_raw_fd(), Some(event))?;

        // 2. Add listening socket to epoll (not necessary to set EPOLL{ERR,HUP}).
        let event = libc::epoll_event {
            events: (EpollFlags::EPOLLET | EpollFlags::EPOLLIN | EpollFlags::EPOLLRDHUP).bits()
                as u32,
            u64: self.proxy.as_fd().as_raw_fd() as u64,
        };
        epoll_ctl_safe(&self.epoll.0, self.proxy.as_fd().as_raw_fd(), Some(event))?;

        if self.debug {
            // Reply to Syd to indicate start of traffic forwarding.
            let _ = write(stderr(), PONG);
        }

        // TODO: TOR_EPOLL_MAX_EVENTS=1024 move to config.rs
        let mut events = vec![EpollEvent::empty(); 1024];
        loop {
            // Wait for events and handle EINTR.
            let n = match self.epoll.wait(&mut events, PollTimeout::NONE) {
                Ok(n) => n,
                Err(Errno::EINTR) => continue, // Retry if interrupted by a signal.
                Err(errno) => return Err(errno.into()),
            };

            let mut is_syd = false; // Has Syd exited?
            for event in events.iter().take(n) {
                let fd = event.data() as RawFd;
                let event_flags = event.events();

                if fd == self.pidfd.as_fd().as_raw_fd() {
                    // Syd exited, drain connections.
                    is_syd = true;
                    continue;
                }

                if fd == self.proxy.as_fd().as_raw_fd() {
                    // Accept new connection.
                    if event_flags.contains(EpollFlags::EPOLLIN) {
                        self.add_connection()?;
                    }

                    continue;
                }

                // Handle events for existing connections.
                self.handle_connection(fd, event_flags)?;
            }

            if is_syd {
                // Syd exited, exit gracefully.
                break Ok(());
            }
        }
    }

    // Confine the proxy process before it starts forwarding traffic.
    //
    // Builds a seccomp allowlist filter whose default action kills the
    // process, and changes into a safe working directory. Unless debug
    // mode is on, it also sets up namespaces, chroot, Landlock, MDWE
    // and the dumpable attribute, then loads the filter.
    //
    // In debug mode only the working directory change takes effect.
    // The filter is still built and printed on request, but is never
    // loaded.
    //
    // Fails with EAFNOSUPPORT if the external address is not an IPv4,
    // IPv6 or UNIX address, and with EINVAL if its port is zero.
    fn confine(&self) -> SydResult<()> {
        let family = self
            .extaddr
            .family()
            .ok_or::<SydError>(Errno::EAFNOSUPPORT.into())?;
        let domain = match family {
            AddressFamily::Inet => libc::AF_INET,
            AddressFamily::Inet6 => libc::AF_INET6,
            AddressFamily::Unix => libc::AF_UNIX,
            _ => return Err(Errno::EAFNOSUPPORT.into()),
        };
        // Port of the external address, `None` for UNIX domain sockets.
        // It is used for the Landlock connect port rule below.
        let port: Option<u16> = if let Some(sa_in) = self.extaddr.as_sockaddr_in() {
            let port = sa_in.port();
            if port == 0 {
                return Err(Errno::EINVAL.into());
            }
            Some(port)
        } else if let Some(sa_in6) = self.extaddr.as_sockaddr_in6() {
            let port = sa_in6.port();
            if port == 0 {
                return Err(Errno::EINVAL.into());
            }
            Some(port)
        } else if self.extaddr.as_unix_addr().is_some() {
            None
        } else {
            return Err(Errno::EAFNOSUPPORT.into());
        };
        // Pointer and length of the external address, these are the
        // only values the connect(2) rule below accepts.
        let addr_ptr = self.extaddr.as_ptr() as u64;
        let addr_len = self.extaddr.len() as u64;

        // We add a seccomp filter that allows required syscalls.
        let mut filter = new_filter(ScmpAction::KillProcess)?;

        let allow_call = [
            // can exit.
            "exit",
            "exit_group",
            // can {{dr}e,}allocate memory.
            // mmap{,2} and mprotect are further confined to disable PROT_EXEC.
            "brk",
            "madvise",
            "mremap",
            "munmap",
            // can read random bytes (soon to be in VDSO).
            "getrandom",
            // can handle signals.
            // can not return from signal handlers (mitigate SROP).
            "sigaction",
            "sigaltstack",
            "sigpending",
            "sigprocmask",
            "sigsuspend",
            //"sigreturn",
            "rt_sigaction",
            "rt_sigpending",
            "rt_sigprocmask",
            "rt_sigqueueinfo",
            //"rt_sigreturn",
            "rt_sigtimedwait",
            "rt_sigtimedwait_time64",
            // can set file flags.
            "fcntl",
            "fcntl64",
            // can close files.
            "close",
            // can do I/O with pipes.
            "pipe2",
            "splice",
            // can forward network.
            // socket and connect are further confined as necessary.
            "accept4",
            "setsockopt",
            "shutdown",
            // can use EPoll API,
            // can not create new EPoll FDs.
            "epoll_ctl",
            "epoll_wait",
            "epoll_pwait",
            "epoll_pwait2",
        ];

        // Default allowlist.
        // Names that cannot be resolved to a system call are skipped.
        for name in allow_call {
            if let Ok(syscall) = ScmpSyscall::from_name(name) {
                filter.add_rule(ScmpAction::Allow, syscall)?;
            }
        }

        // Socket filtering only works if there's no multiplexing socketcall.
        // With socketcall, socket(2) and connect(2) are allowed unconditionally.
        #[allow(clippy::useless_conversion)]
        if seccomp_native_has_socketcall() {
            for sysname in ["socket", "connect"] {
                #[allow(clippy::disallowed_methods)]
                filter.add_rule(ScmpAction::Allow, ScmpSyscall::from_name(sysname).unwrap())?;
            }
        } else {
            // Restrict socket(2) to the given domain, type and protocol.
            // Restrict socket(2) using syscall argument cookies.
            let sock_domain = domain as u64;
            let sock_type = (libc::SOCK_STREAM | libc::SOCK_NONBLOCK | libc::SOCK_CLOEXEC) as u64;
            let sock_protocol = 0u64;
            #[allow(clippy::disallowed_methods)]
            filter.add_rule_conditional(
                ScmpAction::Allow,
                ScmpSyscall::from_name("socket").unwrap(),
                &[
                    scmp_cmp!($arg0 == sock_domain),
                    scmp_cmp!($arg1 == sock_type),
                    scmp_cmp!($arg2 == sock_protocol),
                    scmp_cmp!($arg3 == (*SOCKET_COOKIE_ARG3).into()),
                    scmp_cmp!($arg4 == (*SOCKET_COOKIE_ARG4).into()),
                    scmp_cmp!($arg5 == (*SOCKET_COOKIE_ARG5).into()),
                ],
            )?;

            // Restrict connect(2) to a single safe pointer.
            // Restrict connect(2) using syscall argument cookies.
            #[allow(clippy::disallowed_methods)]
            filter.add_rule_conditional(
                ScmpAction::Allow,
                ScmpSyscall::from_name("connect").unwrap(),
                &[
                    scmp_cmp!($arg1 == addr_ptr),
                    scmp_cmp!($arg2 == addr_len),
                    scmp_cmp!($arg3 == (*CONNECT_COOKIE_ARG3).into()),
                    scmp_cmp!($arg4 == (*CONNECT_COOKIE_ARG4).into()),
                    scmp_cmp!($arg5 == (*CONNECT_COOKIE_ARG5).into()),
                ],
            )?;
        }

        // Prevent executable memory.
        // The calls are allowed only when PROT_EXEC is unset in the third argument.
        const PROT_EXEC: u64 = libc::PROT_EXEC as u64;
        for name in ["mmap", "mmap2", "mprotect"] {
            #[allow(clippy::disallowed_methods)]
            let syscall = ScmpSyscall::from_name(name).unwrap();
            filter.add_rule_conditional(
                ScmpAction::Allow,
                syscall,
                &[scmp_cmp!($arg2 & PROT_EXEC == 0)],
            )?;
        }

        // Change into the safe directory first. Errors of unshare(2) are
        // ignored in the next step, so the chroot may not happen. The
        // process must already be inside the safe directory by then.
        let safe_cwd = if family == AddressFamily::Unix {
            // UNIX domain socket, allow `/proc/self/fd` indirection.
            "/proc/self/fd"
        } else {
            // Ipv4/Ipv6 address, cut filesystem access completely.
            // Note, /var/empty does not exist on Ubuntu...
            "/proc/self/fdinfo"
        };
        chdir(safe_cwd)?;

        if !self.debug {
            // SAFETY: Default panic hook won't play well with seccomp.
            std::panic::set_hook(Box::new(|_| {}));

            // Set up namespace isolation.
            // The namespaces to create depends on the type of the external address.
            // For IPv4 and IPv6 addresses, we set up namespace isolation for all except NET.
            // For UNIX domain sockets, we set up namespace isolation for all.
            // In addition we chroot into `/proc/self/fdinfo` for IPv4 and IPv6 addresses.
            // In addition we chroot into `/proc/self/fd` for UNIX domain sockets.
            // Ignore errors as unprivileged userns may not be supported.
            let mut namespaces = CloneFlags::CLONE_NEWUSER
                | CloneFlags::CLONE_NEWCGROUP
                | CloneFlags::CLONE_NEWIPC
                | CloneFlags::CLONE_NEWNS
                | CloneFlags::CLONE_NEWPID
                | CloneFlags::CLONE_NEWUTS
                | CloneFlags::from_bits_retain(libc::CLONE_NEWTIME);
            if family == AddressFamily::Unix {
                // UNIX domain socket, isolate NET.
                namespaces |= CloneFlags::CLONE_NEWNET;
            }
            // The chroot is attempted only if unshare(2) succeeded,
            // and a failure at that point is an error.
            if unshare(namespaces).is_ok() {
                chroot(".")?; // proc/self/fd{,info}
                chdir("/")?; // prevent cwd leaking.
            }

            // Set up a Landlock sandbox:
            // 1. Disallow all filesystem access.
            // 2. Allow only connect to TOR port.
            // Ignore errors as Landlock may not be supported.
            let abi = syd::landlock::ABI::new_current();
            let conn_portset = port.map(|port| {
                let mut set = PortSet::with_capacity(0x10000);
                set.insert(port.into());
                set
            });
            let policy = LandlockPolicy {
                conn_portset,

                scoped_abs: true,
                scoped_sig: true,

                ..Default::default()
            };
            let _ = policy.restrict_self(abi);

            // Set up Memory-Deny-Write-Execute protections.
            // Ignore errors as PR_SET_MDWE may not be supported.
            let _ = confine_mdwe(false);

            // Set the process dumpable attribute to not-dumpable.
            let _ = set_dumpable(false);

            // Deny reading the timestamp counter (x86 only).
            // SAFETY: Our nix version does not have a wrapper for SET_TSC yet.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            let _ = unsafe { libc::prctl(libc::PR_SET_TSC, libc::PR_TSC_SIGSEGV) };
        }

        if self.print {
            // Dump filter to standard error.
            eprintln!("# syd-tor rules");
            let _ = filter.export_pfc(std::io::stderr());
        }

        if !self.debug {
            // All done, load seccomp filter and begin confinement.
            filter.load()?;
        }

        Ok(())
    }

    // Handle the epoll events reported for one socket of a connection.
    //
    // `fd` is the client-side or the external-side socket of a
    // registered connection, `event_flags` are the events epoll
    // reported for it. I/O errors of the individual transfers are
    // ignored on purpose, the connection is closed on HUP and ERR.
    //
    // Always returns `Ok(())`. Panics if epoll reports an event flag
    // which is not handled below.
    fn handle_connection(&mut self, fd: RawFd, mut event_flags: EpollFlags) -> SydResult<()> {
        // One epoll_wait(2) batch may carry events for both sockets of
        // the same connection. If an earlier event of the batch closed
        // the connection, `fd` is no longer registered: skip the stale
        // event rather than panicking and taking the whole proxy down.
        //
        // NOTE(review): a stale event may still match a new connection
        // which reused the fd number within the same batch -- confirm
        // whether run() should defer closes to the end of a batch.
        let client = match self.get_client_mut(fd) {
            Some(client) => client,
            None => return Ok(()),
        };

        let fd_cli = client.fd_cli.as_raw_fd();
        let fd_ext = client.fd_ext.as_raw_fd();
        let is_ext = fd == fd_ext;

        // Decode the event flags.
        let is_inp = event_flags.intersects(EpollFlags::EPOLLIN | EpollFlags::EPOLLPRI);
        let is_out = event_flags.contains(EpollFlags::EPOLLOUT);
        let is_hup = event_flags.contains(EpollFlags::EPOLLHUP);
        let is_rd_hup = event_flags.contains(EpollFlags::EPOLLRDHUP);
        let is_err = event_flags.contains(EpollFlags::EPOLLERR);

        // Anything left after removing the known flags is a bug.
        event_flags.remove(
            EpollFlags::EPOLLIN
                | EpollFlags::EPOLLPRI
                | EpollFlags::EPOLLOUT
                | EpollFlags::EPOLLHUP
                | EpollFlags::EPOLLRDHUP
                | EpollFlags::EPOLLERR,
        );
        assert!(event_flags.is_empty(), "BUG: {event_flags:?}");

        if is_inp || is_rd_hup || is_hup || is_err {
            // Handle readable events.
            let _ = client.handle_input(is_ext);
        }

        if is_out || is_hup || is_err {
            // Handle writable events.
            let _ = client.handle_output(is_ext);
        }

        if is_rd_hup {
            // Half-close the write side of the peer socket on RDHUP.
            let _ = shutdown(if is_ext { fd_cli } else { fd_ext }, Shutdown::Write);
        }

        if is_hup || is_err {
            // Try to transfer any remaining data.
            let _ = client.handle_input(!is_ext);

            // Close on errors and HUP.
            self.close_connection(fd);
        }

        Ok(())
    }

    // Accept all pending connections on the listening socket.
    //
    // The listening socket is registered edge-triggered, so a single
    // event may stand for several queued connections. The backlog is
    // therefore drained until accept4(2) reports EAGAIN, otherwise
    // queued connections would stall until the next one arrives.
    //
    // Returns `Ok(())` once the backlog is empty. Returns an error if
    // accept4(2) fails with an error that is not tied to a single
    // connection, or if a client cannot be set up and registered.
    fn add_connection(&mut self) -> SydResult<()> {
        loop {
            // Quoting accept(2):
            // Linux accept() (and accept4()) passes already-pending network
            // errors on the new socket as an error code from accept(). This
            // behavior differs from other BSD socket implementations. For
            // reliable operation the application should detect the network
            // errors defined for the protocol after accept() and treat them
            // like EAGAIN by retrying. In the case of TCP/IP, these are
            // ENETDOWN, EPROTO, ENOPROTOOPT, EHOSTDOWN, ENONET, EHOSTUNREACH,
            // EOPNOTSUPP, and ENETUNREACH.
            #[allow(unreachable_patterns)]
            let fd_cli = match accept4(
                self.proxy.as_fd().as_raw_fd(),
                SockFlag::SOCK_NONBLOCK | SockFlag::SOCK_CLOEXEC,
            ) {
                // SAFETY: accept4 returns a valid FD on success.
                Ok(fd) => unsafe { OwnedFd::from_raw_fd(fd) },
                // No more connections to accept.
                Err(Errno::EAGAIN | Errno::EWOULDBLOCK) => return Ok(()),
                // These errors concern a single connection or an
                // interrupted call: move on to the next connection.
                Err(
                    Errno::ECONNABORTED
                    | Errno::EHOSTDOWN
                    | Errno::EHOSTUNREACH
                    | Errno::EINPROGRESS
                    | Errno::EINTR
                    | Errno::ENETDOWN
                    | Errno::ENETUNREACH
                    | Errno::ENONET
                    | Errno::ENOPROTOOPT
                    | Errno::EOPNOTSUPP
                    | Errno::EPROTO,
                ) => continue,
                Err(errno) => return Err(errno.into()),
            };

            let client = Client::new(fd_cli, self.extaddr)?;
            self.add_client(client)?;
        }
    }

    // Tear down the connection which owns `fd`, if there is one.
    //
    // Both sockets are removed from epoll, errors are ignored.
    fn close_connection(&mut self, fd: RawFd) {
        let client = match self.remove_client(fd) {
            Some(client) => client,
            None => return,
        };

        for sock in [&client.fd_cli, &client.fd_ext] {
            let _ = epoll_ctl_safe(&self.epoll.0, sock.as_raw_fd(), None);
        }

        // `client` goes out of scope here, which closes all its OwnedFds.
    }

    fn add_client(&mut self, client: Client) -> SydResult<()> {
        let fd_cli = client.fd_cli.as_fd();
        let fd_ext = client.fd_ext.as_fd();

        let event = libc::epoll_event {
            events: (EpollFlags::EPOLLET
                | EpollFlags::EPOLLIN
                | EpollFlags::EPOLLOUT
                | EpollFlags::EPOLLHUP
                | EpollFlags::EPOLLRDHUP)
                .bits() as u32,
            u64: fd_cli.as_raw_fd() as u64,
        };
        epoll_ctl_safe(&self.epoll.0, fd_cli.as_raw_fd(), Some(event))?;

        let event = libc::epoll_event {
            events: (EpollFlags::EPOLLET
                | EpollFlags::EPOLLIN
                | EpollFlags::EPOLLOUT
                | EpollFlags::EPOLLHUP
                | EpollFlags::EPOLLRDHUP)
                .bits() as u32,
            u64: fd_ext.as_raw_fd() as u64,
        };
        epoll_ctl_safe(&self.epoll.0, fd_ext.as_raw_fd(), Some(event))?;

        let fd_cli = fd_cli.as_raw_fd();
        let fd_ext = fd_ext.as_raw_fd();
        self.fdindex.insert(fd_cli, fd_cli);
        self.fdindex.insert(fd_ext, fd_cli);
        self.clients.insert(fd_cli, client);

        Ok(())
    }

    // Remove the connection which owns `fd` from the lookup tables.
    //
    // `fd` may be either socket of the connection. Returns the client
    // so that the caller decides when its fds are closed, or `None`
    // if `fd` does not belong to a registered connection.
    fn remove_client(&mut self, fd: RawFd) -> Option<Client> {
        let fd_cli = self.fdindex.remove(&fd)?;
        let client = self.clients.remove(&fd_cli)?;

        // Drop the index entries of BOTH sockets. Removing only the
        // external one would leave a stale `fd_cli -> fd_cli` entry
        // behind whenever this is called with the external fd.
        self.fdindex.remove(&client.fd_cli.as_raw_fd());
        self.fdindex.remove(&client.fd_ext.as_raw_fd());

        Some(client)
    }

    // Look up the client which owns `fd`.
    //
    // `fd` is first resolved to the client-side fd through the fd
    // index. Returns `None` if either lookup misses.
    fn get_client_mut(&mut self, fd: RawFd) -> Option<&mut Client> {
        let fd_cli = *self.fdindex.get(&fd)?;
        self.clients.get_mut(&fd_cli)
    }

    // Create a proxy with no connections yet.
    //
    // Takes ownership of the epoll instance, the PIDFd and the
    // listening socket, and borrows the external address.
    fn new(
        epoll: Epoll,
        pidfd: OwnedFd,
        proxy: OwnedFd,
        extaddr: &'a SockaddrStorage,
        debug: bool,
        print: bool,
    ) -> Self {
        // Both lookup tables start out empty.
        let clients = CliMap::default();
        let fdindex = FdMap::default();

        Self {
            epoll,
            pidfd,
            proxy,
            extaddr,
            clients,
            fdindex,
            debug,
            print,
        }
    }
}

// State of a single forwarded connection.
struct Client {
    // Socket accepted from the listening socket.
    fd_cli: OwnedFd,
    // Socket connected to the external address.
    fd_ext: OwnedFd,

    // Set once a splice from `fd_cli` returned zero bytes.
    eof_cli: bool,
    // Set once a splice from `fd_ext` returned zero bytes.
    eof_ext: bool,

    // Pipe (read end, write end) carrying data from `fd_cli` to `fd_ext`.
    pipe_cli: (OwnedFd, OwnedFd),
    // Pipe (read end, write end) carrying data from `fd_ext` to `fd_cli`.
    pipe_ext: (OwnedFd, OwnedFd),
}

impl Client {
    // Set up the forwarding state for a freshly accepted client socket.
    //
    // Creates the two non-blocking pipes, opens a non-blocking socket
    // of the family of `extaddr` and starts connecting it. A connect
    // which is still in progress (EAGAIN, EINPROGRESS) counts as
    // success. Any other failure is returned as an error.
    fn new(fd_cli: OwnedFd, extaddr: &SockaddrStorage) -> SydResult<Self> {
        let pipe_flags = OFlag::O_NONBLOCK | OFlag::O_CLOEXEC;
        let pipe_cli = pipe2(pipe_flags)?;
        let pipe_ext = pipe2(pipe_flags)?;

        // Set socket options on the client fd.
        set_socket_options(&fd_cli, Some(0));

        let family = match extaddr.family() {
            Some(family) => family,
            None => return Err(Errno::EAFNOSUPPORT.into()),
        };
        let sock_type =
            (SockType::Stream as i32) | (SockFlag::SOCK_NONBLOCK | SockFlag::SOCK_CLOEXEC).bits();
        let fd_ext = safe_socket(family as i32, sock_type, 0)?;

        // Set socket options on the external fd.
        set_socket_options(&fd_ext, None);

        // Start connecting, the socket is non-blocking.
        match retry_on_eintr(|| safe_connect(&fd_ext, extaddr)) {
            Ok(()) | Err(Errno::EAGAIN | Errno::EINPROGRESS) => {}
            Err(errno) => return Err(errno.into()),
        }

        Ok(Self {
            fd_cli,
            fd_ext,

            eof_cli: false,
            eof_ext: false,

            pipe_cli,
            pipe_ext,
        })
    }

    // Move readable data from one socket towards its peer.
    //
    // With `ext` set, data flows from the external socket to the client
    // socket through `pipe_ext`; otherwise from the client socket to
    // the external socket through `pipe_cli`. Reading stops at EAGAIN
    // or at end of file, the latter is recorded in the matching `eof_*`
    // flag. Any other splice error is returned.
    fn handle_input(&mut self, ext: bool) -> Result<(), Errno> {
        let (source, sink, pipe_rd, pipe_wr, eof) = match ext {
            true => (
                self.fd_ext.as_fd(),
                self.fd_cli.as_fd(),
                self.pipe_ext.0.as_fd(),
                self.pipe_ext.1.as_fd(),
                &mut self.eof_ext,
            ),
            false => (
                self.fd_cli.as_fd(),
                self.fd_ext.as_fd(),
                self.pipe_cli.0.as_fd(),
                self.pipe_cli.1.as_fd(),
                &mut self.eof_cli,
            ),
        };

        // Edge-triggered epoll: keep reading until EAGAIN or EOF.
        loop {
            if *eof {
                return Ok(());
            }

            match Self::splice(&source, &pipe_wr) {
                Ok(0) => *eof = true,
                Ok(_) => {
                    // Push the data through the pipe right away so
                    // that the pipe buffer does not build up.
                    loop {
                        match Self::splice(&pipe_rd, &sink) {
                            Ok(n) if n > 0 => continue,
                            Ok(_) | Err(Errno::EAGAIN) => break,
                            Err(errno) => return Err(errno),
                        }
                    }
                }
                Err(Errno::EAGAIN) => return Ok(()),
                Err(errno) => return Err(errno),
            }
        }
    }

    // Flush buffered pipe data into a socket.
    //
    // With `ext` set, `pipe_ext` is drained into the client socket;
    // otherwise `pipe_cli` is drained into the external socket.
    // Writing stops at EAGAIN or when a splice moves zero bytes.
    // Any other splice error is returned.
    fn handle_output(&mut self, ext: bool) -> Result<(), Errno> {
        let (sink, pipe_rd) = match ext {
            true => (self.fd_cli.as_fd(), self.pipe_ext.0.as_fd()),
            false => (self.fd_ext.as_fd(), self.pipe_cli.0.as_fd()),
        };

        // Edge-triggered epoll: keep writing until EAGAIN.
        loop {
            match Self::splice(&pipe_rd, &sink) {
                Ok(n) if n > 0 => continue,
                Ok(_) | Err(Errno::EAGAIN) => return Ok(()),
                Err(errno) => return Err(errno),
            }
        }
    }

    // Move up to PIPE_BUF bytes from `src_fd` to `dst_fd` with splice(2).
    //
    // No offsets are passed. The call is non-blocking and hints that
    // more data will follow. Returns the number of bytes moved.
    fn splice<F: AsFd>(src_fd: &F, dst_fd: &F) -> Result<usize, Errno> {
        let flags = SpliceFFlags::SPLICE_F_NONBLOCK | SpliceFFlags::SPLICE_F_MORE;
        splice(src_fd, None, dst_fd, None, PIPE_BUF, flags)
    }
}

// Create a seccomp filter context with the given default action.
//
// The context enforces NO_NEW_PRIVS and kills the process on a bad
// architecture. Failing to enable the optimization is not an error.
fn new_filter(action: ScmpAction) -> SydResult<ScmpFilterContext> {
    let mut ctx = ScmpFilterContext::new(action)?;

    // NO_NEW_PRIVS must be in effect before the
    // filter is loaded into the kernel.
    ctx.set_ctl_nnp(true)?;

    // A system call of an unexpected architecture kills the process.
    ctx.set_act_badarch(ScmpAction::KillProcess)?;

    // Best effort: use a binary tree sorted by syscall number.
    let _ = ctx.set_ctl_optimize(2);

    Ok(ctx)
}

// Parsed command line options.
struct TorOpts {
    // -i socket-fd
    // UNIX socket over which the listening socket fd is received.
    sock: UnixStream,

    // -p pid-fd
    // PIDFd which is registered with epoll to detect process exit.
    fpid: OwnedFd,

    // -o ext-addr
    // External address connections are forwarded to.
    addr: SockaddrStorage,

    // -u ext-sock for UNIX domain sockets.
    // Used for `/proc/self/fd` indirection.
    // The fd is leaked on purpose at startup and stays open.
    unix: Option<OwnedFd>,

    // -d
    // UNSAFE! Run in debug mode without confinement.
    is_debug: bool,
}

// Parse the command line and the SYD_TOR_DEBUG environment variable
// into a `TorOpts`.
//
// Beyond parsing, this function has process-wide side effects:
// 1. `-h` prints the help text and exits with status 0.
// 2. A missing `-p` or `-i` prints an error plus the help text and
//    exits with status 1.
// 3. All file descriptors other than 0, 1, 2, the PID fd and the
//    socket fd are closed.
// 4. The PID fd, the socket fd and, with `-u`, the UNIX socket path fd
//    are duplicated to random descriptor numbers.
//
// Errors: returns an error for malformed option values, negative or
// invalid file descriptors (EBADF), an address without a `:` separator
// (EAFNOSUPPORT), port 0 (EINVAL) and unexpected arguments.
fn parse_options() -> SydResult<TorOpts> {
    use lexopt::prelude::*;

    // Parse CLI options.
    // The external address defaults to 127.0.0.1:9050.
    let mut opt_addr = Some((IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 9050));
    let mut opt_unix = None;
    let mut opt_sock = None;
    let mut opt_fpid = None;

    // Skip confinement if SYD_TOR_DEBUG environment variable is set.
    // Another way to achieve the same is the `-d` CLI option.
    let mut opt_debug = env::var_os("SYD_TOR_DEBUG").is_some();

    let mut parser = lexopt::Parser::from_env();
    while let Some(arg) = parser.next()? {
        match arg {
            Short('h') => {
                help();
                exit(0);
            }
            Short('d') => opt_debug = true,
            Short('p') => {
                // PID fd: must parse as a non-negative integer.
                // Whether it is actually open is checked later.
                let fpid = parser.value()?.parse::<String>()?.parse::<RawFd>()?;
                if fpid < 0 {
                    return Err(Errno::EBADF.into());
                }
                opt_fpid = Some(fpid);
            }
            Short('i') => {
                // Socket fd: must parse as a non-negative integer.
                // Whether it is actually open is checked later.
                let sock = parser.value()?.parse::<String>()?.parse::<RawFd>()?;
                if sock < 0 {
                    return Err(Errno::EBADF.into());
                }
                opt_sock = Some(sock);
            }
            Short('o') => {
                let value = parser.value()?.parse::<String>()?;

                // Split IP address and port.
                // Splitting at the last ':' keeps the colons of an
                // IPv6 address inside the address part.
                // NOTE(review): a bracketed literal such as `[::1]:9050`
                // is passed to the IpAddr parser with its brackets and
                // is therefore rejected -- confirm this is intended.
                let (addr, port) = value
                    .rsplit_once(':')
                    .ok_or::<SydError>(Errno::EAFNOSUPPORT.into())?;

                // Parse the address and the external port.
                // The value is validated even if -u was already given.
                let addr = addr.parse::<IpAddr>()?;
                let port = port.parse::<u16>()?;
                if port == 0 {
                    return Err(Errno::EINVAL.into());
                }

                // -u overrides -o.
                if opt_unix.is_none() {
                    opt_addr = Some((addr, port));
                }
            }
            Short('u') => {
                opt_unix = Some(parser.value().map(XPathBuf::from)?);

                // -u overrides -o.
                // Clearing the address here keeps exactly one of
                // opt_addr and opt_unix set at any time.
                opt_addr = None;
            }
            _ => return Err(arg.unexpected().into()),
        }
    }

    // -p is mandatory.
    let fpid = if let Some(fpid) = opt_fpid {
        fpid
    } else {
        eprintln!("syd-tor: Error: -p is required.");
        help();
        exit(1);
    };

    // -i is mandatory.
    let sock = if let Some(sock) = opt_sock {
        sock
    } else {
        eprintln!("syd-tor: Error: -i is required.");
        help();
        exit(1);
    };

    // Close all file descriptors, except:
    // 1. Standard input, output, and error which are set to /dev/null by Syd.
    // 2. The PID fd and the socket fd passed by the Syd process.
    // Nothing can be done on closeexcept errors.
    // We do it early here so FD randomization doesn't affect performance.
    // The cast cannot lose the sign: both descriptors were checked
    // to be non-negative during option parsing.
    #[allow(clippy::cast_sign_loss)]
    {
        let fd1 = fpid as libc::c_uint;
        let fd2 = sock as libc::c_uint;
        // The two descriptors are passed in ascending order.
        let _ = closeexcept(&if fd1 < fd2 {
            [0, 1, 2, fd1, fd2]
        } else {
            [0, 1, 2, fd2, fd1]
        });
    }

    // Validate socket address and convert.
    let mut unix_fd = None;
    let addr: SockaddrStorage = match (opt_addr, opt_unix) {
        (Some((addr, port)), None) => {
            // The port is either the default or was checked to be non-zero.
            assert_ne!(port, 0);
            let addr = SocketAddr::new(addr, port);
            match addr {
                SocketAddr::V4(addr) => SockaddrStorage::from(addr),
                SocketAddr::V6(addr) => SockaddrStorage::from(addr),
            }
        }
        (None, Some(unix)) => {
            // SAFETY: We are going to access this file descriptor using
            // `/proc/self/fd` indirection after isolating the mount-ns.
            // Note, confine chroots and/or chdirs to /proc/self/fd.
            #[allow(clippy::disallowed_methods)]
            let fd = open(&unix, OFlag::O_PATH | OFlag::O_CLOEXEC, Mode::empty())?;

            // SAFETY: Randomize fd to make it harder to locate for an attacker.
            let fd = duprand(fd.as_raw_fd(), OFlag::O_CLOEXEC)?;

            // The socket address is the bare descriptor number,
            // i.e. a path relative to `/proc/self/fd`.
            let unix: XPathBuf = fd.to_string().into();
            let unix = UnixAddr::new(&unix)?;

            // SAFETY: duprand returns valid FD on success.
            unix_fd = Some(unsafe { OwnedFd::from_raw_fd(fd) });

            // SAFETY: We validated address is valid above,
            // so here we pack into a SockaddrStorage safely.
            unsafe { SockaddrStorage::from_raw(unix.as_ptr().cast(), Some(unix.len())) }
                .ok_or(Errno::EINVAL)?
        }
        // Option parsing keeps exactly one of the two set.
        _ => unreachable!(),
    };

    // SAFETY: We will validate the FD below.
    let fpid = unsafe { BorrowedFd::borrow_raw(fpid) };

    // Validate file descriptor.
    // F_GETFD returns EBADF for bad-fd.
    fcntl(fpid, FcntlArg::F_GETFD)?;

    // SAFETY: Randomize fd to make it harder to locate for an attacker.
    let fpid = duprand(fpid.as_raw_fd(), OFlag::O_CLOEXEC | OFlag::O_EXCL)?;

    // SAFETY: Syd passes a valid PID FD to syd-tor.
    // Any other usecase is unsupported.
    let fpid = unsafe { OwnedFd::from_raw_fd(fpid) };

    // SAFETY: We will validate the FD below.
    let sock = unsafe { BorrowedFd::borrow_raw(sock) };

    // Validate file descriptor.
    // F_GETFD returns EBADF for bad-fd.
    fcntl(sock, FcntlArg::F_GETFD)?;

    // SAFETY: Randomize fd to make it harder to locate for an attacker.
    let sock = duprand(sock.as_raw_fd(), OFlag::O_CLOEXEC | OFlag::O_EXCL)?;

    // SAFETY: Syd passes a valid socket FD to syd-tor.
    // Any other usecase is unsupported.
    let sock = unsafe { UnixStream::from_raw_fd(sock) };

    Ok(TorOpts {
        fpid,
        sock,
        addr,
        unix: unix_fd,
        is_debug: opt_debug,
    })
}

// Set common socket options for Proxy sandboxing.
//
// `backlog` selects the TCP Fast Open flavour: a non-zero value enables
// listener-side TCP_FASTOPEN with that queue length, whereas `None` or
// zero enables client-side TCP_FASTOPEN_CONNECT.
// Every option is best effort: setsockopt(2) failures are ignored.
fn set_socket_options<F: AsFd>(fd: &F, backlog: Option<libc::c_int>) {
    // Thin setsockopt(2) wrapper; nix lacks some of the options we want.
    fn set_socket_option<T>(
        fd: RawFd,
        level: libc::c_int,
        optname: libc::c_int,
        optval: T,
    ) -> Result<(), Errno> {
        let optptr = &optval as *const T as *const libc::c_void;
        let optlen = std::mem::size_of::<T>() as libc::socklen_t;

        // SAFETY: `optptr` points at `optval`, which outlives the call,
        // and `optlen` is exactly the size of `optval`.
        match unsafe { libc::setsockopt(fd, level, optname, optptr, optlen) } {
            -1 => Err(Errno::last()),
            _ => Ok(()),
        }
    }

    // setsockopt(2) needs the raw descriptor.
    let raw = fd.as_fd().as_raw_fd();

    // Boolean options, each switched on with the value 1:
    //
    // TCP_NODELAY disables Nagle's algorithm, so small packets are sent
    // immediately rather than being held back to be combined with others.
    //
    // SO_KEEPALIVE probes connections periodically, so that broken
    // long-lived connections are detected and closed.
    //
    // TCP_QUICKACK sends ACKs immediately, lowering latency for peers
    // that rely on timely acknowledgment of received packets.
    let enabled = [
        (libc::IPPROTO_TCP, libc::TCP_NODELAY),
        (libc::SOL_SOCKET, libc::SO_KEEPALIVE),
        (libc::IPPROTO_TCP, libc::TCP_QUICKACK),
    ];
    for (level, optname) in enabled {
        let _ = set_socket_option(raw, level, optname, 1);
    }

    match backlog {
        Some(queue_len) if queue_len != 0 => {
            // TCP_FASTOPEN enables Fast Open (RFC 7413) on the listener
            // socket. Its value is the maximum length of pending SYNs
            // (similar to the backlog argument in listen(2)). Once
            // enabled, the listener grants the TCP Fast Open cookie on
            // incoming SYN with TCP Fast Open option.
            let _ = set_socket_option(raw, libc::IPPROTO_TCP, libc::TCP_FASTOPEN, queue_len);
        }
        _ => {
            // TCP_FASTOPEN_CONNECT is the alternative way to perform
            // Fast Open on the active side (client).
            let _ = set_socket_option(raw, libc::IPPROTO_TCP, libc::TCP_FASTOPEN_CONNECT, 1);
        }
    }
}

// Print the usage text to standard output, one line at a time.
fn help() {
    const USAGE: &[&str] = &[
        "Usage: syd-tor [-dh] -p <pid-fd> -i <socket-fd> [-o addr:port] [-u unix-sock]",
        "Syd's SOCKS Proxy Forwarder",
        "Receives listening socket from fd and forwards traffic to addr:port or UNIX socket.",
        "External address must be an IPv4 or IPv6 address.",
        "PID file descriptor is used to track the exit of Syd process.",
        "  -h             Print this help message and exit.",
        "  -d             Run in debug mode without confinement.",
        "  -p <pid-fd>    PID file descriptor of Syd process.",
        "  -i <socket-fd> Socket file descriptor to receive the listening socket from.",
        "  -o <addr:port> Specify external address to forward traffic to.",
        "                 Defaults to 127.0.0.1:9050.",
        "  -u <unix-sock> Specify UNIX domain socket to forward traffic to.",
        "                 This option has precedence over -o.",
    ];

    for line in USAGE {
        println!("{line}");
    }
}
