Learn Zig Series (#74) - ptrace: Process Tracing

Learn Zig Series (#74) - ptrace: Process Tracing

zig.png

What will I learn

  • What ptrace is and how to attach to and control other processes from Zig;
  • How to read and write a tracee's memory and CPU registers using PTRACE_PEEKDATA and PTRACE_GETREGS;
  • How to single-step through a process one instruction at a time with PTRACE_SINGLESTEP;
  • How to set software breakpoints using the INT3 (0xCC) instruction replacement trick;
  • How to intercept system calls before and after execution with PTRACE_SYSCALL;
  • How to build a simple strace-like syscall tracer from scratch;
  • How anti-debugging techniques work and how tracers can detect them;
  • How to build a practical file-operation logger that traces all open/read/write/close syscalls.

Requirements

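  • A working modern computer running Linux on x86-64 (Ubuntu is fine) -- ptrace and the register layout used in this episode are Linux/x86-64 specific, so macOS and Windows users will need a Linux VM or WSL2;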
  • An installed Zig 0.14+ distribution (download from ziglang.org);
  • The ambition to learn Zig programming.

Difficulty

  • Intermediate

Curriculum (of the Learn Zig Series):

Learn Zig Series (#74) - ptrace: Process Tracing

Solutions to Episode 73 Exercises

Exercise 1: Configurable sandbox launcher reading a JSON profile

const std = @import("std");
const linux = std.os.linux;
const posix = std.posix;

/// One classic-BPF instruction: opcode, true/false jump offsets and an immediate.
const BpfInsn = extern struct {
    code: u16,
    jt: u8,
    jf: u8,
    k: u32,
};

/// Filter program descriptor handed to the seccomp syscall: instruction count plus pointer.
const SockFprog = extern struct {
    len: u16,
    filter: [*]const BpfInsn,
};

/// Soft/hard limit pair whose address is passed to setrlimit.
const Rlimit = extern struct {
    cur: u64,
    max: u64,
};

// Seccomp filter return actions, listed from most to least restrictive.
const SECCOMP_RET_KILL: u32 = 0x00000000;
const SECCOMP_RET_ERRNO: u32 = 0x00050000;
const SECCOMP_RET_ALLOW: u32 = 0x7fff0000;

/// Architecture tag the filter compares against before looking at syscall numbers.
const AUDIT_ARCH_X86_64: u32 = 0xc000003e;

/// Build a non-branching BPF instruction (both jump offsets zero) with opcode `code` and immediate `k`.
fn bpfS(code: u16, k: u32) BpfInsn {
    return BpfInsn{
        .code = code,
        .jt = 0,
        .jf = 0,
        .k = k,
    };
}
/// Build a conditional BPF jump: compare against `k`, skip `jt` instructions on a match and `jf` otherwise.
fn bpfJ(code: u16, k: u32, jt: u8, jf: u8) BpfInsn {
    return BpfInsn{
        .code = code,
        .jt = jt,
        .jf = jf,
        .k = k,
    };
}

// Syscall-number groups; each one is switched on by the Profile field of the same category.

/// Numbers allowed when `Profile.stdio` is set.
const stdio_sc = [_]u32{
    0,  1,   3,   9,   10,  11,  12,
    13, 14,  15,  16,  35,  39,  60,
    96, 158, 228, 231, 302, 318, 334,
};

/// Numbers allowed when `Profile.file_read` is set.
const fread_sc = [_]u32{
    4, 5, 6, 21, 79, 257, 262,
};

/// Numbers allowed when `Profile.file_write` is set.
const fwrite_sc = [_]u32{
    76, 82, 87, 90, 83, 84, 85, 86, 88,
};

/// Numbers allowed when `Profile.network` is set.
const net_sc = [_]u32{
    41, 42, 43, 44, 45, 46,
    47, 48, 49, 50, 51,
};

/// Numbers allowed when `Profile.process` is set.
const proc_sc = [_]u32{
    56, 57, 58, 59, 61, 62,
};

/// Sandbox configuration loaded from a JSON profile.
/// Every field has a default, so a profile file only needs to list what it changes.
const Profile = struct {
    // Syscall categories to allow (all off by default).
    stdio: bool = false,
    file_read: bool = false,
    file_write: bool = false,
    network: bool = false,
    process: bool = false,
    // Resource limits applied before the seccomp filter is installed.
    max_memory_mb: u64 = 256,
    max_files: u64 = 32,
    max_cpu_seconds: u64 = 30,
};

/// Read the JSON file at `path` (at most 8192 bytes) and decode it into a `Profile`.
///
/// Keys that are absent keep their defaults and unknown keys are ignored.
/// A value of the wrong JSON type, a negative limit, or a non-object document
/// is reported as an error instead of tripping a safety check at runtime.
/// `alloc` is only used for temporary buffers; nothing allocated here outlives the call.
fn parseProfile(alloc: std.mem.Allocator, path: []const u8) !Profile {
    const file = try std.fs.cwd().openFile(path, .{});
    defer file.close();
    const data = try file.readToEndAlloc(alloc, 8192);
    defer alloc.free(data);

    // Typed parsing validates every field against Profile's declared types.
    const parsed = try std.json.parseFromSlice(Profile, alloc, data, .{
        .ignore_unknown_fields = true,
    });
    defer parsed.deinit();

    // Profile holds only scalars, so copying it out before deinit is safe.
    return parsed.value;
}

/// Apply the profile to the calling process: set no_new_privs, install the three
/// resource limits, then load a seccomp filter that allows only the syscalls of
/// the enabled categories (everything else fails with errno 1).
///
/// Errors: `EmptyProfile` when no category is enabled, `NoNewPrivsFailed`,
/// `RlimitFailed` or `SeccompFailed` when the corresponding syscall reports an error.
/// The function fails closed: a limit that cannot be applied aborts the sandbox setup.
fn applySandbox(p: Profile) !void {
    const PR_SET_NO_NEW_PRIVS = 38;
    const RLIMIT_CPU = 0;
    const RLIMIT_NOFILE = 7;
    const RLIMIT_AS = 9;
    const SECCOMP_SET_MODE_FILTER = 1;

    const nnp: isize = @bitCast(linux.syscall5(.prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
    if (nnp < 0) return error.NoNewPrivsFailed;

    // Saturating multiply: an absurdly large max_memory_mb becomes "unlimited"
    // instead of overflowing.
    const limits = [_]struct { resource: usize, value: u64 }{
        .{ .resource = RLIMIT_AS, .value = p.max_memory_mb *| (1024 * 1024) },
        .{ .resource = RLIMIT_NOFILE, .value = p.max_files },
        .{ .resource = RLIMIT_CPU, .value = p.max_cpu_seconds },
    };
    for (limits) |lim| {
        const rl = Rlimit{ .cur = lim.value, .max = lim.value };
        const rc: isize = @bitCast(linux.syscall2(.setrlimit, lim.resource, @intFromPtr(&rl)));
        if (rc < 0) return error.RlimitFailed;
    }

    // Collect the allowed syscall numbers of every enabled category.
    var all: [128]u32 = undefined;
    var n: usize = 0;
    const groups = [_]struct { enabled: bool, calls: []const u32 }{
        .{ .enabled = p.stdio, .calls = &stdio_sc },
        .{ .enabled = p.file_read, .calls = &fread_sc },
        .{ .enabled = p.file_write, .calls = &fwrite_sc },
        .{ .enabled = p.network, .calls = &net_sc },
        .{ .enabled = p.process, .calls = &proc_sc },
    };
    for (groups) |g| {
        if (!g.enabled) continue;
        @memcpy(all[n..][0..g.calls.len], g.calls);
        n += g.calls.len;
    }
    if (n == 0) return error.EmptyProfile;

    // Filter layout:
    //   [0]        load arch
    //   [1]        arch == x86-64 ? skip the kill : fall through
    //   [2]        return KILL
    //   [3]        load syscall number
    //   [4..4+n)   one equality test per allowed syscall
    //   [4+n]      return ERRNO|1   (no test matched)
    //   [4+n+1]    return ALLOW
    var filter: [192]BpfInsn = undefined;
    var fi: u16 = 0;
    filter[fi] = bpfS(0x20, 4);
    fi += 1;
    filter[fi] = bpfJ(0x15, AUDIT_ARCH_X86_64, 1, 0);
    fi += 1;
    filter[fi] = bpfS(0x06, SECCOMP_RET_KILL);
    fi += 1;
    filter[fi] = bpfS(0x20, 0);
    fi += 1;
    for (all[0..n], 0..) |sc, idx| {
        // A match must skip the remaining (n - idx - 1) tests AND the ERRNO
        // return to reach ALLOW. Skipping one less would land on ERRNO and
        // deny every whitelisted syscall.
        const to_allow: u8 = @intCast(n - idx);
        filter[fi] = bpfJ(0x15, sc, to_allow, 0);
        fi += 1;
    }
    filter[fi] = bpfS(0x06, SECCOMP_RET_ERRNO | 1);
    fi += 1;
    filter[fi] = bpfS(0x06, SECCOMP_RET_ALLOW);
    fi += 1;

    const prog = SockFprog{ .len = fi, .filter = &filter };
    const r: isize = @bitCast(linux.syscall3(.seccomp, SECCOMP_SET_MODE_FILTER, 0, @intFromPtr(&prog)));
    if (r < 0) return error.SeccompFailed;
}

/// Entry point: `sandbox <profile.json> <prog>`.
/// Loads the profile, forks, applies the sandbox in the child and execs the
/// program there, then reports how the child ended.
pub fn main() !void {
    const stdout = std.io.getStdOut().writer();
    var args = std.process.args();
    _ = args.next(); // skip our own program name
    const prof_path = args.next() orelse {
        try stdout.print("Usage: sandbox <profile.json> <prog>\n", .{});
        return;
    };
    const program = args.next() orelse {
        try stdout.print("Missing program\n", .{});
        return;
    };
    const profile = parseProfile(std.heap.page_allocator, prof_path) catch |e| {
        try stdout.print("Profile error: {s}\n", .{@errorName(e)});
        return;
    };

    const pid = try posix.fork();
    if (pid == 0) {
        applySandbox(profile) catch posix.exit(1);
        // Arguments from the iterator are already NUL-terminated, so no cast is needed.
        const argv = [_]?[*:0]const u8{ program.ptr, null };
        const envp = [_]?[*:0]const u8{null};
        _ = linux.syscall3(.execve, @intFromPtr(program.ptr), @intFromPtr(&argv), @intFromPtr(&envp));
        posix.exit(127); // only reached when execve failed
    }

    // waitpid returns the raw wait status word; decode it with the W helpers.
    const result = posix.waitpid(pid, 0);
    if (posix.W.IFSIGNALED(result.status)) {
        try stdout.print("Killed: signal {d}\n", .{posix.W.TERMSIG(result.status)});
    } else {
        try stdout.print("Exited: {d}\n", .{posix.W.EXITSTATUS(result.status)});
    }
}

The JSON profile maps human-readable categories to syscall number arrays. The launcher applies rlimits first (they work without seccomp), then installs the BPF filter. If process is false but the target needs execve, it gets EPERM immediately.

Exercise 2: Seccomp audit logger using SECCOMP_RET_LOG

const std = @import("std");
const linux = std.os.linux;
const posix = std.posix;

/// Register block that PTRACE_GETREGS writes into.
/// `extern` keeps the declared field order, which must not be rearranged.
const UserRegs = extern struct {
    r15: u64,
    r14: u64,
    r13: u64,
    r12: u64,
    rbp: u64,
    rbx: u64,
    r11: u64,
    r10: u64,
    r9: u64,
    r8: u64,
    rax: u64,
    rcx: u64,
    rdx: u64,
    rsi: u64,
    rdi: u64,
    /// Syscall number; this is the field the profiler keys its counts on.
    orig_rax: u64,
    rip: u64,
    cs: u64,
    eflags: u64,
    rsp: u64,
    ss: u64,
    fs_base: u64,
    gs_base: u64,
    ds: u64,
    es: u64,
    fs: u64,
    gs: u64,
};

/// Issue a raw ptrace syscall and hand back the kernel's result as a signed
/// value, so failures show up as negative numbers.
fn pt(req: u32, pid: i32, addr: usize, data: usize) isize {
    // Sign-extend the pid before reinterpreting it as a register-sized argument.
    const wide_pid: isize = pid;
    const pid_arg: usize = @bitCast(wide_pid);
    const raw = linux.syscall4(.ptrace, req, pid_arg, addr, data);
    return @bitCast(raw);
}

/// Entry point: run the target (default `/bin/ls`) under ptrace, count every
/// syscall it enters, and print the distinct syscall numbers as a Zig array literal.
pub fn main() !void {
    const SIGTRAP: u32 = 5;
    // With PTRACE_O_TRACESYSGOOD the kernel reports syscall stops as SIGTRAP|0x80,
    // which makes them distinguishable from ordinary signal stops.
    const SYSCALL_STOP: u32 = SIGTRAP | 0x80;
    const PTRACE_SETOPTIONS: u32 = 0x4200;
    const PTRACE_O_TRACESYSGOOD: usize = 1;

    const stdout = std.io.getStdOut().writer();
    var args = std.process.args();
    _ = args.next();
    const target = args.next() orelse "/bin/ls";
    try stdout.print("Profiling: {s}\n\n", .{target});

    const pid = try posix.fork();
    if (pid == 0) {
        _ = pt(0, 0, 0, 0); // TRACEME
        _ = linux.syscall2(.kill, linux.syscall0(.getpid), 19);
        const argv = [_]?[*:0]const u8{ target.ptr, null };
        const envp = [_]?[*:0]const u8{null};
        _ = linux.syscall3(.execve, @intFromPtr(target.ptr), @intFromPtr(&argv), @intFromPtr(&envp));
        posix.exit(1);
    }

    _ = posix.waitpid(pid, 0); // initial SIGSTOP
    _ = pt(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_TRACESYSGOOD);

    var seen = std.AutoHashMap(u64, u32).init(std.heap.page_allocator);
    defer seen.deinit();
    var entering = true;
    var inject: usize = 0; // signal to deliver on the next resume, 0 = none

    while (true) {
        _ = pt(24, pid, 0, inject); // SYSCALL
        inject = 0;
        const wr = posix.waitpid(pid, 0);
        if (posix.W.IFEXITED(wr.status) or posix.W.IFSIGNALED(wr.status)) break;
        if (!posix.W.IFSTOPPED(wr.status)) continue;

        const sig = posix.W.STOPSIG(wr.status);
        if (sig != SYSCALL_STOP) {
            // Not a syscall boundary (e.g. the SIGTRAP reported after execve):
            // leave the entry/exit toggle alone. Real signals are passed on to
            // the tracee; the exec SIGTRAP is swallowed.
            if (sig != SIGTRAP) inject = sig;
            continue;
        }

        if (entering) {
            var regs: UserRegs = undefined;
            if (pt(12, pid, 0, @intFromPtr(&regs)) >= 0) { // GETREGS
                const entry = try seen.getOrPut(regs.orig_rax);
                if (!entry.found_existing) entry.value_ptr.* = 0;
                entry.value_ptr.* += 1;
            }
        }
        entering = !entering;
    }

    try stdout.print("const allowed = [_]u32{{ ", .{});
    var it = seen.iterator();
    var first = true;
    while (it.next()) |entry| {
        if (!first) try stdout.print(", ", .{});
        try stdout.print("{d}", .{entry.key_ptr.*});
        first = false;
    }
    try stdout.print(" }};\n", .{});
}

The "learn then enforce" pattern -- profile a known-good execution, then generate the minimal whitelist. The ptrace approach gives you structured data immediately, and the output is valid Zig you can paste directly into a seccomp filter.

Exercise 3: Pledge-like API with narrowing via stacked filters

const std = @import("std");
const linux = std.os.linux;
const posix = std.posix;

/// A single classic-BPF instruction.
const BpfInsn = extern struct {
    /// Opcode.
    code: u16,
    /// Instructions to skip when a comparison succeeds.
    jt: u8,
    /// Instructions to skip when a comparison fails.
    jf: u8,
    /// Immediate operand.
    k: u32,
};

/// Instruction count and pointer pair passed to the seccomp syscall.
const SockFprog = extern struct {
    len: u16,
    filter: [*]const BpfInsn,
};

const AUDIT_ARCH_X86_64: u32 = 0xc000003e;

// Filter verdicts.
const SECCOMP_RET_ALLOW: u32 = 0x7fff0000;
const SECCOMP_RET_ERRNO: u32 = 0x00050000;
const SECCOMP_RET_KILL: u32 = 0x00000000;

/// Straight-line BPF instruction: opcode plus immediate, no jump offsets.
fn bpfS(code: u16, k: u32) BpfInsn {
    const no_jump: u8 = 0;
    return .{ .code = code, .jt = no_jump, .jf = no_jump, .k = k };
}
/// Branching BPF instruction: `jt` is the skip count on a match with `k`, `jf` the skip count otherwise.
fn bpfJ(code: u16, k: u32, jt: u8, jf: u8) BpfInsn {
    const insn: BpfInsn = .{
        .code = code,
        .jt = jt,
        .jf = jf,
        .k = k,
    };
    return insn;
}

/// Permission set accepted by `pledgeNarrow`; one bit per syscall category.
const Flags = packed struct {
    stdio: bool = false,
    rpath: bool = false,
    inet: bool = false,
    _pad: u13 = 0,
};

/// Syscall numbers granted by `Flags.stdio`.
const stdio_sc = [_]u32{
    0,  1,  3,  9,   10,  11,  12,  13,  14,  15,
    16, 35, 39, 60,  158, 228, 231, 302, 318, 334,
};

/// Syscall numbers granted by `Flags.rpath`.
const rpath_sc = [_]u32{
    4, 5, 6, 21, 79, 257, 262,
};

/// Syscall numbers granted by `Flags.inet`.
const inet_sc = [_]u32{
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
};

/// Install one more seccomp filter that allows only the syscalls of the
/// categories set in `flags`; any other syscall fails with errno 1.
///
/// Errors: `EmptyPledge` when no category is set, `SeccompFailed` when the
/// kernel rejects the filter.
fn pledgeNarrow(flags: Flags) !void {
    // Gather the allowed syscall numbers.
    var calls: [128]u32 = undefined;
    var n: usize = 0;
    const groups = [_]struct { enabled: bool, list: []const u32 }{
        .{ .enabled = flags.stdio, .list = &stdio_sc },
        .{ .enabled = flags.rpath, .list = &rpath_sc },
        .{ .enabled = flags.inet, .list = &inet_sc },
    };
    for (groups) |g| {
        if (!g.enabled) continue;
        @memcpy(calls[n..][0..g.list.len], g.list);
        n += g.list.len;
    }
    if (n == 0) return error.EmptyPledge;

    // Layout: arch check (3 insns), load syscall nr, n equality tests,
    // then "return ERRNO|1" followed by "return ALLOW".
    var filter: [192]BpfInsn = undefined;
    var fi: u16 = 0;
    filter[fi] = bpfS(0x20, 4);
    fi += 1;
    filter[fi] = bpfJ(0x15, AUDIT_ARCH_X86_64, 1, 0);
    fi += 1;
    filter[fi] = bpfS(0x06, SECCOMP_RET_KILL);
    fi += 1;
    filter[fi] = bpfS(0x20, 0);
    fi += 1;
    for (calls[0..n], 0..) |sc, idx| {
        // On a match, jump over the remaining (n - idx - 1) tests plus the
        // ERRNO return so execution lands on ALLOW. An offset one smaller
        // would end on ERRNO and block every pledged syscall.
        const to_allow: u8 = @intCast(n - idx);
        filter[fi] = bpfJ(0x15, sc, to_allow, 0);
        fi += 1;
    }
    filter[fi] = bpfS(0x06, SECCOMP_RET_ERRNO | 1);
    fi += 1;
    filter[fi] = bpfS(0x06, SECCOMP_RET_ALLOW);
    fi += 1;

    // prctl result is deliberately ignored: if no_new_privs could not be set,
    // the seccomp call below fails and is reported.
    _ = linux.syscall5(.prctl, 38, 1, 0, 0, 0);
    const prog = SockFprog{ .len = fi, .filter = &filter };
    const r: isize = @bitCast(linux.syscall3(.seccomp, 1, 0, @intFromPtr(&prog)));
    if (r < 0) return error.SeccompFailed;
}

/// Demonstrates narrowing: pledge three categories, open a socket, drop `inet`
/// with a second pledge and try to open a socket again.
pub fn main() !void {
    const out = std.io.getStdOut().writer();

    // Phase 1: full pledge, socket creation is attempted and the fd closed again.
    try pledgeNarrow(.{ .stdio = true, .rpath = true, .inet = true });
    try out.print("Phase 1: stdio+rpath+inet\n", .{});
    const first_raw = linux.syscall3(.socket, 2, 1, 0);
    const first_fd: isize = @bitCast(first_raw);
    try out.print("  socket() = {d} (works)\n", .{first_fd});
    if (first_fd >= 0) {
        posix.close(@intCast(first_fd));
    }

    // Phase 2: a second, narrower filter without the inet category.
    try pledgeNarrow(.{ .stdio = true, .rpath = true });
    try out.print("Phase 2: stdio+rpath (inet dropped)\n", .{});
    const second_raw = linux.syscall3(.socket, 2, 1, 0);
    const second_fd: isize = @bitCast(second_raw);
    try out.print("  socket() = {d} (blocked)\n", .{second_fd});

    try out.print("\nFilters stack: kernel takes the most restrictive result.\n", .{});
}

Seccomp filters stack -- each new filter is added to a chain and the kernel runs ALL of them, taking the most restrictive result. You can never widen permissions, only narrow. Each pledgeNarrow call can only remove permissions, never grant new ones.


Last episode we built seccomp sandboxes that restrict WHICH syscalls a process can make. But seccomp operates from the perspective of the process restricting itself -- the process installs its own filter. What if you want to observe and control ANOTHER process from the outside? What if you want to watch every syscall it makes, inspect its memory, or pause it mid-execution to examine its state?

That's what ptrace does, and it's the foundation of every debugger you've ever used. GDB, strace, ltrace, LLDB -- they all rely on ptrace under the hood. And honestly, once you understand how ptrace works, debuggers stop feeling like magic and start feeling like (somewhat clunky) API clients ;-)

The ptrace syscall is Linux-specific (BSDs have their own variants) and it gives one process -- the tracer -- almost complete control over another process -- the tracee. The tracer can read and write the tracee's memory, inspect and modify its CPU registers, single-step through instructions, intercept syscalls, and inject signals. It's the most powerful debugging primitive the kernel exposes.

Ptrace overview: attaching to and controlling processes

The basic ptrace workflow is: fork a child, have the child call PTRACE_TRACEME before exec, then the parent uses various ptrace requests to control execution. Alternatively, you can attach to an already-running process with PTRACE_ATTACH (requires the same UID or CAP_SYS_PTRACE).

Here's the minimal attach-and-trace pattern:

const std = @import("std");
const linux = std.os.linux;
const posix = std.posix;

// ptrace request numbers, in ascending order.
const PTRACE_TRACEME: u32 = 0;
const PTRACE_PEEKTEXT: u32 = 1;
const PTRACE_PEEKDATA: u32 = 2;
const PTRACE_POKEDATA: u32 = 5;
const PTRACE_CONT: u32 = 7;
const PTRACE_SINGLESTEP: u32 = 9;
const PTRACE_GETREGS: u32 = 12;
const PTRACE_SETREGS: u32 = 13;
const PTRACE_ATTACH: u32 = 16;
const PTRACE_DETACH: u32 = 17;
const PTRACE_SYSCALL: u32 = 24;

/// Register block exchanged with PTRACE_GETREGS / PTRACE_SETREGS.
/// Declared `extern` so the fields stay in exactly this order.
const UserRegs = extern struct {
    r15: u64,
    r14: u64,
    r13: u64,
    r12: u64,
    rbp: u64,
    rbx: u64,
    r11: u64,
    r10: u64,
    r9: u64,
    r8: u64,
    rax: u64,
    rcx: u64,
    rdx: u64,
    rsi: u64,
    rdi: u64,
    orig_rax: u64,
    rip: u64,
    cs: u64,
    eflags: u64,
    rsp: u64,
    ss: u64,
    fs_base: u64,
    gs_base: u64,
    ds: u64,
    es: u64,
    fs: u64,
    gs: u64,
};

/// Raw ptrace syscall wrapper. Returns the kernel result as a signed value;
/// failures are negative errno values.
///
/// PEEK requests (1 = PEEKTEXT, 2 = PEEKDATA, 3 = PEEKUSER) are special at
/// the syscall level: the kernel stores the word at the address passed in
/// `data` and returns only a status. To keep the "return value is the word"
/// convention, this wrapper supplies its own buffer for those requests
/// (`data` is ignored) and returns the word. A failed PEEK still returns the
/// negative errno, which cannot be told apart from a word with the same bits.
fn ptrace(request: u32, pid: i32, addr: usize, data: usize) isize {
    const pid_arg: usize = @bitCast(@as(isize, pid));

    const is_peek = request >= 1 and request <= 3;
    if (!is_peek) {
        return @bitCast(linux.syscall4(.ptrace, request, pid_arg, addr, data));
    }

    var word: usize = 0;
    const rc: isize = @bitCast(linux.syscall4(.ptrace, request, pid_arg, addr, @intFromPtr(&word)));
    if (rc < 0) return rc;
    return @bitCast(word);
}

/// Minimal tracer demo: fork a child that asks to be traced and stops itself,
/// read its registers from the parent, continue it, and report its exit code.
pub fn main() !void {
    const stdout = std.io.getStdOut().writer();

    const pid = try posix.fork();
    if (pid == 0) {
        // child: request to be traced
        _ = ptrace(PTRACE_TRACEME, 0, 0, 0);
        // stop ourselves so the parent gets a notification
        _ = linux.syscall2(.kill, linux.syscall0(.getpid), 19); // SIGSTOP

        // after the parent continues us, do some work
        const w = std.io.getStdOut().writer();
        w.print("Child: running after parent continued us\n", .{}) catch {};
        w.print("Child: PID = {d}\n", .{linux.syscall0(.getpid)}) catch {};
        posix.exit(42);
    }

    // parent: wait for child to stop
    try stdout.print("Tracer: waiting for tracee (pid {d}) to stop...\n", .{pid});
    _ = posix.waitpid(pid, 0);

    // read the child's registers
    var regs: UserRegs = undefined;
    _ = ptrace(PTRACE_GETREGS, pid, 0, @intFromPtr(&regs));
    try stdout.print("Tracer: tracee stopped at RIP=0x{x}\n", .{regs.rip});
    try stdout.print("Tracer: RSP=0x{x}, RBP=0x{x}\n", .{ regs.rsp, regs.rbp });

    // continue the child
    try stdout.print("Tracer: continuing tracee...\n", .{});
    _ = ptrace(PTRACE_CONT, pid, 0, 0);

    // wait for exit; the status is a raw wait word decoded with the W helpers
    const result = posix.waitpid(pid, 0);
    if (posix.W.IFEXITED(result.status)) {
        const code = posix.W.EXITSTATUS(result.status);
        try stdout.print("Tracer: tracee exited with code {d}\n", .{code});
    }
}

The PTRACE_TRACEME call in the child tells the kernel "my parent is my tracer." After that, any signal delivered to the child (including the SIGSTOP we send ourselves) causes the child to stop and the parent's waitpid to return. The parent can then inspect registers, memory, and decide whether to continue, single-step, or do something else entirely.

Notice we use a raw kill syscall instead of posix.raise() or similar -- when you're doing ptrace work, you want to be very explicit about which syscalls you're invoking. The less magic between you and the kernel, the better.

Reading and writing process memory and registers

Once you have a stopped tracee, you can read its memory word-by-word with PTRACE_PEEKDATA and write it with PTRACE_POKEDATA. These operate on aligned usize values (8 bytes on x86-64). For registers, PTRACE_GETREGS fills a UserRegs struct and PTRACE_SETREGS applies changes:

const std = @import("std");
const linux = std.os.linux;
const posix = std.posix;

// ptrace requests used by the memory read/write demo.
const PTRACE_TRACEME: u32 = 0; // child: let the parent trace me
const PTRACE_PEEKDATA: u32 = 2; // read one word of tracee memory
const PTRACE_POKEDATA: u32 = 5; // write one word of tracee memory
const PTRACE_CONT: u32 = 7; // resume the tracee
const PTRACE_GETREGS: u32 = 12; // copy tracee registers into a UserRegs
const PTRACE_SETREGS: u32 = 13; // load tracee registers from a UserRegs

/// Tracee register snapshot; `extern` so the field order is exactly as written.
const UserRegs = extern struct {
    r15: u64, r14: u64, r13: u64,
    r12: u64, rbp: u64, rbx: u64,
    r11: u64, r10: u64, r9: u64,
    r8: u64, rax: u64, rcx: u64,
    rdx: u64, rsi: u64, rdi: u64,
    orig_rax: u64, rip: u64, cs: u64,
    eflags: u64, rsp: u64, ss: u64,
    fs_base: u64, gs_base: u64, ds: u64,
    es: u64, fs: u64, gs: u64,
};

/// Raw ptrace syscall wrapper; negative return values are errno codes.
///
/// The kernel treats PEEK requests (1, 2, 3) differently from everything
/// else: it writes the fetched word to the address given in `data` and
/// returns only success or failure. Passing `data = 0` therefore fails with
/// a fault instead of returning memory contents. For those requests the
/// wrapper substitutes a local buffer (ignoring `data`) and returns the word,
/// so callers can keep treating the return value as the data. A failed PEEK
/// still returns the negative errno.
fn ptrace(request: u32, pid: i32, addr: usize, data: usize) isize {
    const pid_arg: usize = @bitCast(@as(isize, pid));

    if (request < 1 or request > 3) {
        return @bitCast(linux.syscall4(.ptrace, request, pid_arg, addr, data));
    }

    var word: usize = 0;
    const status: isize = @bitCast(linux.syscall4(
        .ptrace, request,
        pid_arg, addr, @intFromPtr(&word),
    ));
    return if (status < 0) status else @bitCast(word);
}

/// Copy `buf.len` bytes starting at `addr` in the tracee into `buf`.
/// Memory is fetched one 8-byte word per PTRACE_PEEKDATA request; only the
/// needed prefix of the final word is copied. Request failures are not reported.
fn readTraceeMemory(pid: i32, addr: usize, buf: []u8) void {
    var done: usize = 0;
    while (done < buf.len) : (done += 8) {
        const fetched = ptrace(PTRACE_PEEKDATA, pid, addr + done, 0);
        const chunk: [8]u8 = @bitCast(@as(usize, @bitCast(fetched)));

        const dest = buf[done..];
        const take = @min(dest.len, 8);
        @memcpy(dest[0..take], chunk[0..take]);
    }
}

/// Copy `data` into the tracee's memory starting at `addr`, one 8-byte word
/// per PTRACE_POKEDATA request. A trailing partial word is merged with the
/// bytes already in the tracee so memory past the end of `data` keeps its
/// contents. Request failures are not reported.
fn writeTraceeMemory(pid: i32, addr: usize, data: []const u8) void {
    var done: usize = 0;
    while (done < data.len) : (done += 8) {
        const target = addr + done;
        const pending = data[done..];

        var staged: [8]u8 = undefined;
        if (pending.len < 8) {
            // Short tail: start from what the tracee currently holds.
            const current: usize = @bitCast(ptrace(PTRACE_PEEKDATA, pid, target, 0));
            staged = @bitCast(current);
            @memcpy(staged[0..pending.len], pending);
        } else {
            staged = pending[0..8].*;
        }

        _ = ptrace(PTRACE_POKEDATA, pid, target, @as(usize, @bitCast(staged)));
    }
}

/// Memory demo: the parent reads the stopped child's stack and a known
/// buffer, overwrites that buffer through ptrace, and lets the child print
/// what it now sees.
pub fn main() !void {
    const stdout = std.io.getStdOut().writer();

    // we'll put a message on the stack, fork, and read it from the parent.
    // The buffer is zero-filled so that whatever gets written into it later
    // (including the longer replacement below) is always NUL-terminated;
    // with an uninitialised tail the child would print garbage after the
    // 19-byte replacement.
    var message = [_]u8{0} ** 32;
    @memcpy(message[0..13], "Hello ptrace!");

    const pid = try posix.fork();
    if (pid == 0) {
        _ = ptrace(PTRACE_TRACEME, 0, 0, 0);
        _ = linux.syscall2(.kill, linux.syscall0(.getpid), 19);

        // after parent modifies our memory, print the message
        const w = std.io.getStdOut().writer();
        const len = std.mem.indexOfScalar(u8, &message, 0) orelse message.len;
        w.print("Child sees: \"{s}\"\n", .{message[0..len]}) catch {};
        posix.exit(0);
    }

    _ = posix.waitpid(pid, 0);

    // read the child's registers to find the stack
    var regs: UserRegs = undefined;
    _ = ptrace(PTRACE_GETREGS, pid, 0, @intFromPtr(&regs));
    try stdout.print("Child RSP: 0x{x}\n", .{regs.rsp});

    // read some bytes from the child's stack
    var stack_peek: [64]u8 = undefined;
    readTraceeMemory(pid, regs.rsp, &stack_peek);
    try stdout.print("Stack bytes: ", .{});
    for (stack_peek[0..16]) |b| try stdout.print("{x:0>2} ", .{b});
    try stdout.print("\n", .{});

    // find and modify the message in the child's address space
    // in a real debugger you'd search or use symbols, but we know
    // the child inherited our memory layout (copy-on-write)
    const msg_addr = @intFromPtr(&message);
    var remote_msg: [32]u8 = undefined;
    readTraceeMemory(pid, msg_addr, &remote_msg);
    const len = std.mem.indexOfScalar(u8, &remote_msg, 0) orelse remote_msg.len;
    try stdout.print("Original message: \"{s}\"\n", .{remote_msg[0..len]});

    // overwrite it (shorter than the buffer, so the zero tail terminates it)
    const new_msg = "MODIFIED by tracer!";
    writeTraceeMemory(pid, msg_addr, new_msg);

    try stdout.print("Wrote new message, continuing child...\n", .{});
    _ = ptrace(PTRACE_CONT, pid, 0, 0);
    _ = posix.waitpid(pid, 0);
}

The readTraceeMemory and writeTraceeMemory helpers handle the fact that PTRACE_PEEKDATA only reads one machine word at a time. For partial words at the end of a buffer, we do a read-modify-write to avoid corrupting adjacent memory. This is a pain, but that's ptrace for you -- it was designed in the 1970s and the API hasn't exactly been modernized since.

Having said that, on modern Linux there's also process_vm_readv / process_vm_writev which can read/write arbitrary-length buffers in one syscall without the word-at-a-time limitation. But those require specific permissions and don't work in all contexts where ptrace does.

Single-stepping: one instruction at a time

PTRACE_SINGLESTEP is the simplest (and slowest) way to trace execution. It executes exactly one machine instruction, then stops the tracee. The tracer gets a SIGTRAP notification and can inspect registers to see what happened:

const std = @import("std");
const linux = std.os.linux;
const posix = std.posix;

// ptrace requests used by the single-step demo, in ascending order.
const PTRACE_TRACEME: u32 = 0;
const PTRACE_PEEKDATA: u32 = 2;
const PTRACE_SINGLESTEP: u32 = 9;
const PTRACE_GETREGS: u32 = 12;

/// Register block filled by PTRACE_GETREGS. The stepping loop reads `rip`
/// from it after every instruction. Field order is fixed by `extern`.
const UserRegs = extern struct {
    r15: u64,
    r14: u64,
    r13: u64,
    r12: u64,
    rbp: u64,
    rbx: u64,
    r11: u64,
    r10: u64,
    r9: u64,
    r8: u64,
    rax: u64,
    rcx: u64,
    rdx: u64,
    rsi: u64,
    rdi: u64,
    orig_rax: u64,
    rip: u64,
    cs: u64,
    eflags: u64,
    rsp: u64,
    ss: u64,
    fs_base: u64,
    gs_base: u64,
    ds: u64,
    es: u64,
    fs: u64,
    gs: u64,
};

/// Raw ptrace syscall wrapper; a negative result is an errno code.
///
/// For PEEK requests (1..3) the kernel does not return the fetched word: it
/// stores it at the address passed as `data` and returns a status. This
/// wrapper hides that by pointing the kernel at a local word and returning
/// its value, so `ptrace(PTRACE_PEEKDATA, pid, addr, 0)` yields the memory
/// contents. `data` is ignored for those requests; a failed PEEK returns the
/// negative errno.
fn ptrace(request: u32, pid: i32, addr: usize, data: usize) isize {
    const pid_arg: usize = @bitCast(@as(isize, pid));

    switch (request) {
        1, 2, 3 => {
            var word: usize = 0;
            const rc: isize = @bitCast(linux.syscall4(.ptrace, request, pid_arg, addr, @intFromPtr(&word)));
            if (rc < 0) return rc;
            return @bitCast(word);
        },
        else => return @bitCast(linux.syscall4(.ptrace, request, pid_arg, addr, data)),
    }
}

/// Single-step demo: run the child one instruction at a time for at most 40
/// steps, printing RIP and the first bytes found there after every step.
pub fn main() !void {
    const stdout = std.io.getStdOut().writer();

    const pid = try posix.fork();
    if (pid == 0) {
        _ = ptrace(PTRACE_TRACEME, 0, 0, 0);
        _ = linux.syscall2(.kill, linux.syscall0(.getpid), 19);

        // do a few simple things so we can watch the instructions
        var x: u64 = 0;
        x += 10;
        x *= 3;
        // Keep the arithmetic alive in optimised builds. A plain `_ = x;`
        // on a local that is also used elsewhere is rejected by the compiler.
        std.mem.doNotOptimizeAway(x);
        posix.exit(0);
    }

    _ = posix.waitpid(pid, 0);

    try stdout.print("Single-stepping tracee (pid {d})...\n\n", .{pid});
    try stdout.print("Step |     RIP          | Instruction bytes\n", .{});
    try stdout.print("-----|------------------|------------------\n", .{});

    var step_count: u32 = 0;
    const max_steps: u32 = 40; // don't go forever

    while (step_count < max_steps) {
        _ = ptrace(PTRACE_SINGLESTEP, pid, 0, 0);
        const wr = posix.waitpid(pid, 0);

        // The wait status is a raw word; decode it with the W helpers.
        if (posix.W.IFEXITED(wr.status)) {
            try stdout.print("\nTracee exited after {d} steps\n", .{step_count});
            break;
        }
        if (posix.W.IFSIGNALED(wr.status)) {
            // The tracee is gone; asking for its registers would only fail.
            try stdout.print("\nTracee killed by signal {d} after {d} steps\n", .{ posix.W.TERMSIG(wr.status), step_count });
            break;
        }

        var regs: UserRegs = undefined;
        _ = ptrace(PTRACE_GETREGS, pid, 0, @intFromPtr(&regs));

        // peek at the instruction bytes at RIP
        const word: usize = @bitCast(ptrace(PTRACE_PEEKDATA, pid, regs.rip, 0));
        const bytes: [8]u8 = @bitCast(word);

        try stdout.print(" {d:>3} | 0x{x:0>12} | ", .{ step_count, regs.rip });
        for (bytes[0..6]) |b| try stdout.print("{x:0>2} ", .{b});
        try stdout.print("\n", .{});

        step_count += 1;
    }

    if (step_count >= max_steps) {
        try stdout.print("\n(stopped after {d} steps -- tracee still running)\n", .{max_steps});
        // clean up: continue and let it exit
        const PTRACE_CONT: u32 = 7;
        _ = ptrace(PTRACE_CONT, pid, 0, 0);
        _ = posix.waitpid(pid, 0);
    }
}

Single-stepping is incredibly slow -- every instruction requires a context switch to the tracer and back. For code coverage you'd use breakpoints at strategic locations instead. But for understanding what a program is doing at the instruction level, single-step gives you the finest granularity possible.

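The instruction bytes at RIP show raw machine code. You could disassemble them (like we did in episodes 59-61) to get human-readable instructions. Record-replay debuggers such as rr (originally developed at Mozilla) deliberately avoid single-stepping while recording, for exactly the performance reason described above: they rely on ptrace syscall and signal stops combined with hardware performance counters instead.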

Setting breakpoints: the INT3 instruction trick

Software breakpoints are elegantly brutal. You save the original byte at the target address, replace it with 0xCC (the x86 INT3 instruction, which is only 1 byte), and when the CPU hits it, the kernel sends SIGTRAP to the process. The tracer catches the trap, restores the original byte, rewinds RIP by one, and optionally continues execution:

const std = @import("std");
const linux = std.os.linux;
const posix = std.posix;

// ptrace requests used by the breakpoint demo, in ascending order.
const PTRACE_TRACEME: u32 = 0;
const PTRACE_PEEKDATA: u32 = 2;
const PTRACE_POKEDATA: u32 = 5;
const PTRACE_CONT: u32 = 7;
const PTRACE_SINGLESTEP: u32 = 9;
const PTRACE_GETREGS: u32 = 12;
const PTRACE_SETREGS: u32 = 13;

/// Register block read with PTRACE_GETREGS and written back with
/// PTRACE_SETREGS when RIP is rewound. `extern` pins the field order.
const UserRegs = extern struct {
    r15: u64, r14: u64, r13: u64,
    r12: u64, rbp: u64, rbx: u64,
    r11: u64, r10: u64, r9: u64,
    r8: u64, rax: u64, rcx: u64,
    rdx: u64, rsi: u64, rdi: u64,
    orig_rax: u64, rip: u64, cs: u64,
    eflags: u64, rsp: u64, ss: u64,
    fs_base: u64, gs_base: u64, ds: u64,
    es: u64, fs: u64, gs: u64,
};

/// Raw ptrace syscall wrapper; negative results are errno codes.
///
/// PEEK requests (1, 2, 3) need special care: at the syscall level the
/// fetched word is written to the address given in `data`, not returned.
/// The breakpoint helpers call `ptrace(PTRACE_PEEKDATA, pid, addr, 0)` and
/// use the return value as the word, so for PEEK requests this wrapper
/// passes its own buffer to the kernel (ignoring `data`) and returns the
/// fetched word. A failed PEEK returns the negative errno.
fn ptrace(request: u32, pid: i32, addr: usize, data: usize) isize {
    const pid_arg: usize = @bitCast(@as(isize, pid));

    const peek = switch (request) {
        1, 2, 3 => true,
        else => false,
    };
    if (peek) {
        var word: usize = 0;
        const rc: isize = @bitCast(linux.syscall4(.ptrace, request, pid_arg, addr, @intFromPtr(&word)));
        if (rc < 0) return rc;
        return @bitCast(word);
    }
    return @bitCast(linux.syscall4(.ptrace, request, pid_arg, addr, data));
}

/// Bookkeeping for one software breakpoint placed in the tracee.
const Breakpoint = struct {
    /// Tracee address whose first byte is replaced by 0xCC.
    addr: usize,
    /// Byte that was at `addr` before the 0xCC was written.
    original_byte: u8,
    /// Set to true by setBreakpoint; the other helpers shown here do not update it.
    enabled: bool,
};

/// Plant a breakpoint at `addr` in the tracee: remember the byte currently
/// there, overwrite it with INT3 (0xCC) and return the bookkeeping record.
/// The surrounding seven bytes of the word are written back unchanged.
fn setBreakpoint(pid: i32, addr: usize) Breakpoint {
    const int3: u8 = 0xCC;

    // Fetch the word that contains the target byte.
    const fetched = ptrace(PTRACE_PEEKDATA, pid, addr, 0);
    var patch: [8]u8 = @bitCast(@as(usize, @bitCast(fetched)));

    const saved = patch[0];
    patch[0] = int3;
    _ = ptrace(PTRACE_POKEDATA, pid, addr, @as(usize, @bitCast(patch)));

    return Breakpoint{
        .addr = addr,
        .original_byte = saved,
        .enabled = true,
    };
}

/// Undo a breakpoint that has just been hit: put the saved byte back at
/// `bp.addr` and move RIP back by one so the original instruction runs next.
fn handleBreakpoint(pid: i32, bp: *Breakpoint) void {
    const site = bp.addr;

    // Swap the INT3 for the byte that was there originally.
    const current: usize = @bitCast(ptrace(PTRACE_PEEKDATA, pid, site, 0));
    var text: [8]u8 = @bitCast(current);
    text[0] = bp.original_byte;
    _ = ptrace(PTRACE_POKEDATA, pid, site, @as(usize, @bitCast(text)));

    // RIP already points one byte past the INT3; step it back.
    var cpu: UserRegs = undefined;
    _ = ptrace(PTRACE_GETREGS, pid, 0, @intFromPtr(&cpu));
    cpu.rip -= 1;
    _ = ptrace(PTRACE_SETREGS, pid, 0, @intFromPtr(&cpu));
}

/// Re-insert a breakpoint after it was handled: execute exactly one
/// instruction (the restored original), then write INT3 back at `bp.addr`.
/// The saved byte is refreshed from memory first, in case the code changed
/// itself during the step.
fn rearmBreakpoint(pid: i32, bp: *Breakpoint) void {
    // Let the original instruction run and wait for the step to finish.
    _ = ptrace(PTRACE_SINGLESTEP, pid, 0, 0);
    _ = posix.waitpid(pid, 0);

    const fetched = ptrace(PTRACE_PEEKDATA, pid, bp.addr, 0);
    var text: [8]u8 = @bitCast(@as(usize, @bitCast(fetched)));
    bp.original_byte = text[0];
    text[0] = 0xCC;

    const armed: usize = @bitCast(text);
    _ = ptrace(PTRACE_POKEDATA, pid, bp.addr, armed);
}

/// Breakpoint demo: stop the child, plant an INT3 eight bytes past its
/// current RIP, run until the trap, inspect registers, re-arm and continue.
pub fn main() !void {
    const SIGTRAP: u32 = 5;
    const stdout = std.io.getStdOut().writer();

    const pid = try posix.fork();
    if (pid == 0) {
        _ = ptrace(PTRACE_TRACEME, 0, 0, 0);
        _ = linux.syscall2(.kill, linux.syscall0(.getpid), 19);

        // target function: a simple loop
        var sum: u64 = 0;
        for (0..5) |i| {
            sum += i * 3;
        }

        const w = std.io.getStdOut().writer();
        w.print("Child result: {d}\n", .{sum}) catch {};
        posix.exit(0);
    }

    _ = posix.waitpid(pid, 0);

    // get the initial RIP (where the child stopped)
    var regs: UserRegs = undefined;
    _ = ptrace(PTRACE_GETREGS, pid, 0, @intFromPtr(&regs));
    try stdout.print("Child stopped at RIP=0x{x}\n", .{regs.rip});

    // set a breakpoint a few instructions ahead
    const bp_addr = regs.rip + 8; // offset into the child's code
    var bp = setBreakpoint(pid, bp_addr);
    try stdout.print("Breakpoint set at 0x{x} (original byte: 0x{x})\n\n", .{ bp.addr, bp.original_byte });

    // run until breakpoint
    _ = ptrace(PTRACE_CONT, pid, 0, 0);
    const wr = posix.waitpid(pid, 0);

    // The wait status is a raw word; decode it with the W helpers.
    if (posix.W.IFEXITED(wr.status)) {
        // The child finished without reaching the breakpoint. It has already
        // been reaped, so waiting a second time would be an error.
        try stdout.print("Child exited with code {d}\n", .{posix.W.EXITSTATUS(wr.status)});
        return;
    }
    if (posix.W.IFSIGNALED(wr.status)) return;

    if (posix.W.IFSTOPPED(wr.status)) {
        const sig = posix.W.STOPSIG(wr.status);
        if (sig == SIGTRAP) {
            _ = ptrace(PTRACE_GETREGS, pid, 0, @intFromPtr(&regs));
            try stdout.print("Hit breakpoint at 0x{x}!\n", .{regs.rip - 1});
            try stdout.print("  RAX=0x{x} RBX=0x{x}\n", .{ regs.rax, regs.rbx });

            // handle and continue
            handleBreakpoint(pid, &bp);
            rearmBreakpoint(pid, &bp);

            try stdout.print("Breakpoint restored, continuing...\n", .{});
            _ = ptrace(PTRACE_CONT, pid, 0, 0);
        } else {
            // Stopped by some other signal: hand it to the child and let it
            // run, otherwise the wait below would block forever.
            _ = ptrace(PTRACE_CONT, pid, 0, sig);
        }
    }

    // wait for exit
    const final = posix.waitpid(pid, 0);
    if (posix.W.IFEXITED(final.status)) {
        try stdout.print("Child exited with code {d}\n", .{posix.W.EXITSTATUS(final.status)});
    }
}

The INT3 replacement is brilliant in its simplicity -- it's ONE byte, so it fits into any instruction without overlapping into the next one. The x86 instruction set is variable-length (1 to 15 bytes), so a multi-byte breakpoint instruction would risk corrupting the following instruction's encoding. 0xCC solves this because it's the smallest possible instruction.

The rearm step is worth understanding: after handling the breakpoint, you need to execute the original instruction that was replaced. So you restore the byte, single-step once (executes that one instruction), then put the INT3 back. This lets you hit the same breakpoint repeatedly -- essential for loops.

Tracing system calls with PTRACE_SYSCALL

PTRACE_SYSCALL is arguably the most useful ptrace request. It continues the tracee but stops it at the next syscall boundary -- once on entry (before the syscall executes) and once on exit (after it returns). This is how strace works:

const std = @import("std");
const linux = std.os.linux;
const posix = std.posix;

// ptrace requests used by the syscall tracer, in ascending order.
const PTRACE_TRACEME: u32 = 0;
const PTRACE_GETREGS: u32 = 12;
const PTRACE_SYSCALL: u32 = 24;

/// Register block filled by PTRACE_GETREGS at every syscall stop.
/// The tracer reads `orig_rax`, `rdi`, `rsi`, `rdx` and `rax` from it.
/// `extern` keeps the fields in the declared order.
const UserRegs = extern struct {
    r15: u64,
    r14: u64,
    r13: u64,
    r12: u64,
    rbp: u64,
    rbx: u64,
    r11: u64,
    r10: u64,
    r9: u64,
    r8: u64,
    rax: u64,
    rcx: u64,
    rdx: u64,
    rsi: u64,
    rdi: u64,
    orig_rax: u64,
    rip: u64,
    cs: u64,
    eflags: u64,
    rsp: u64,
    ss: u64,
    fs_base: u64,
    gs_base: u64,
    ds: u64,
    es: u64,
    fs: u64,
    gs: u64,
};

/// Forward one request to the raw ptrace syscall and return the kernel's
/// answer as a signed number (negative on failure).
fn ptrace(request: u32, pid: i32, addr: usize, data: usize) isize {
    // The pid travels as a register-sized value; sign-extend it first.
    const pid_signed: isize = pid;
    const pid_arg: usize = @bitCast(pid_signed);

    const result = linux.syscall4(.ptrace, request, pid_arg, addr, data);
    return @bitCast(result);
}

/// Names for syscall numbers 0 through 63, indexed by syscall number.
const syscall_names = [_][]const u8{
    // 0 - 7
    "read",        "write",   "open",     "close",       "stat",        "fstat",      "lstat",          "poll",
    // 8 - 15
    "lseek",       "mmap",    "mprotect", "munmap",      "brk",         "rt_sigaction", "rt_sigprocmask", "rt_sigreturn",
    // 16 - 23
    "ioctl",       "pread64", "pwrite64", "readv",       "writev",      "access",     "pipe",           "select",
    // 24 - 31
    "sched_yield", "mremap",  "msync",    "mincore",     "madvise",     "shmget",     "shmat",          "shmctl",
    // 32 - 39
    "dup",         "dup2",    "pause",    "nanosleep",   "getitimer",   "alarm",      "setitimer",      "getpid",
    // 40 - 47
    "sendfile",    "socket",  "connect",  "accept",      "sendto",      "recvfrom",   "sendmsg",        "recvmsg",
    // 48 - 55
    "shutdown",    "bind",    "listen",   "getsockname", "getpeername", "socketpair", "setsockopt",     "getsockopt",
    // 56 - 63
    "clone",       "fork",    "vfork",    "execve",      "exit",        "wait4",      "kill",           "uname",
};

/// Look up the name for syscall number `nr`; numbers beyond the table yield "???".
fn syscallName(nr: u64) []const u8 {
    return if (nr >= syscall_names.len) "???" else syscall_names[nr];
}

/// Demo tracer: forks a child that opens, reads and closes /etc/hostname,
/// then prints every syscall the child makes together with its return value.
pub fn main() !void {
    const stdout = std.io.getStdOut().writer();

    const pid = try posix.fork();
    if (pid == 0) {
        // Child: ask to be traced, then send ourselves signal 19 (SIGSTOP on
        // x86-64) so the parent has a stop to synchronise on.
        _ = ptrace(PTRACE_TRACEME, 0, 0, 0);
        _ = linux.syscall2(.kill, linux.syscall0(.getpid), 19);

        // do things that generate syscalls
        const fd = posix.open("/etc/hostname", .{ .ACCMODE = .RDONLY }, 0) catch posix.exit(1);
        var buf: [256]u8 = undefined;
        // Only the syscall itself matters, not the data. `catch 0` keeps both
        // sides of the catch a usize (a bare `catch {}` does not type-check).
        _ = posix.read(fd, &buf) catch 0;
        posix.close(fd);
        posix.exit(0);
    }

    // Consume the child's initial self-inflicted stop.
    _ = posix.waitpid(pid, 0);
    try stdout.print("Tracing syscalls for pid {d}\n\n", .{pid});

    var entering = true;
    var call_count: u32 = 0;

    while (true) {
        // Resume until the next syscall boundary; give up if the tracee is gone.
        if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) break;
        const wr = posix.waitpid(pid, 0);

        // `wr.status` is the raw wait status word; decode it with posix.W.
        if (posix.W.IFEXITED(wr.status) or posix.W.IFSIGNALED(wr.status)) break;

        var regs: UserRegs = undefined;
        // Never interpret `regs` if the kernel did not fill it in.
        if (ptrace(PTRACE_GETREGS, pid, 0, @intFromPtr(&regs)) < 0) break;

        if (entering) {
            const nr = regs.orig_rax;
            try stdout.print("[{d:>3}] {s}(", .{ call_count, syscallName(nr) });

            // print first 3 args
            try stdout.print("0x{x}, 0x{x}, 0x{x}", .{ regs.rdi, regs.rsi, regs.rdx });
            try stdout.print(")", .{});
            call_count += 1;
        } else {
            // exit: rax has the return value (negative means -errno)
            const ret: isize = @bitCast(regs.rax);
            if (ret < 0) {
                try stdout.print(" = -1 (errno {d})\n", .{@as(u32, @intCast(-ret))});
            } else {
                try stdout.print(" = {d}\n", .{ret});
            }
        }
        entering = !entering;
    }

    // A syscall that never returns (the final exit) leaves its line open;
    // close it the way strace does.
    if (!entering) try stdout.print(" = ?\n", .{});

    try stdout.print("\nTotal: {d} syscalls traced\n", .{call_count});
}

The orig_rax field is important here. On x86-64 the kernel saves the syscall number in orig_rax as soon as the syscall is entered and reuses rax for the result: at the entry stop rax already holds the placeholder -ENOSYS (-38), and at the exit stop it holds the real return value. Reading rax therefore never gives you the syscall number at either stop, while orig_rax is valid at both. Always use orig_rax to identify the call and rax (on exit) for the return value.

Building a simple strace-like tool

Now let's combine everything into a more complete syscall tracer that can run any command. We'll support decoding common arguments (file paths, flags) and formatting the output like strace does:

const std = @import("std");
const linux = std.os.linux;
const posix = std.posix;

const PTRACE_TRACEME: u32 = 0;
const PTRACE_SYSCALL: u32 = 24;
const PTRACE_GETREGS: u32 = 12;
const PTRACE_PEEKDATA: u32 = 2;

/// x86-64 general-purpose register block as written by PTRACE_GETREGS.
/// Declared `extern` so the in-memory layout follows the declaration order.
const UserRegs = extern struct {
    r15: u64,
    r14: u64,
    r13: u64,
    r12: u64,
    rbp: u64,
    rbx: u64,
    r11: u64,
    r10: u64,
    r9: u64,
    r8: u64,
    /// Return value when inspected at a syscall-exit stop.
    rax: u64,
    rcx: u64,
    /// Syscall argument 3.
    rdx: u64,
    /// Syscall argument 2.
    rsi: u64,
    /// Syscall argument 1.
    rdi: u64,
    /// Number of the syscall being traced.
    orig_rax: u64,
    rip: u64,
    cs: u64,
    eflags: u64,
    rsp: u64,
    ss: u64,
    fs_base: u64,
    gs_base: u64,
    ds: u64,
    es: u64,
    fs: u64,
    gs: u64,
};

/// Issues the raw ptrace syscall and returns its result as a signed value
/// (negative results are negated errno codes).
fn ptrace(request: u32, pid: i32, addr: usize, data: usize) isize {
    const wide_pid: isize = pid; // sign-extend before reinterpreting as usize
    const result: usize = linux.syscall4(.ptrace, request, @bitCast(wide_pid), addr, data);
    return @bitCast(result);
}

/// Reads a NUL-terminated C string from the tracee's address space.
///
/// Copies at most `buf.len - 1` bytes into `buf` and returns the filled
/// prefix (without the terminator). Returns an empty slice when `addr` is
/// null, when `buf` is empty, or when the very first word cannot be read.
/// If a later word is unreadable, the bytes copied so far are returned.
fn readTraceeString(pid: i32, addr: usize, buf: []u8) []u8 {
    if (buf.len == 0) return buf; // nothing to fill; also avoids `len - 1` underflow
    if (addr == 0) {
        buf[0] = 0;
        return buf[0..0];
    }
    const limit = buf.len - 1;
    var offset: usize = 0;
    while (offset < limit) {
        // The raw PTRACE_PEEKDATA syscall does NOT return the word: it stores
        // it at the address passed in `data` and returns only 0 / -errno.
        // (Returning the word directly is a libc wrapper convenience.)
        var word: usize = 0;
        const rc = ptrace(PTRACE_PEEKDATA, pid, addr + offset, @intFromPtr(&word));
        if (rc < 0) break; // unreadable tracee memory: keep what we have
        const bytes: [@sizeOf(usize)]u8 = @bitCast(word);
        for (bytes) |b| {
            if (b == 0 or offset >= limit) return buf[0..offset];
            buf[offset] = b;
            offset += 1;
        }
    }
    return buf[0..offset];
}

/// Names of the contiguous x86-64 syscalls 0..63, indexed by number.
/// Higher, sparse numbers are handled individually in `scName`.
const sc_names = [_][]const u8{
    // 0-7
    "read",      "write",       "open",        "close",       "stat",        "fstat",        "lstat",          "poll",
    // 8-15
    "lseek",     "mmap",        "mprotect",    "munmap",      "brk",         "rt_sigaction", "rt_sigprocmask", "rt_sigreturn",
    // 16-23
    "ioctl",     "pread64",     "pwrite64",    "readv",       "writev",      "access",       "pipe",           "select",
    // 24-31
    "sched_yield", "mremap",    "msync",       "mincore",     "madvise",     "shmget",       "shmat",          "shmctl",
    // 32-39
    "dup",       "dup2",        "pause",       "nanosleep",   "getitimer",   "alarm",        "setitimer",      "getpid",
    // 40-47
    "sendfile",  "socket",      "connect",     "accept",      "sendto",      "recvfrom",     "sendmsg",        "recvmsg",
    // 48-55
    "shutdown",  "bind",        "listen",      "getsockname", "getpeername", "socketpair",   "setsockopt",     "getsockopt",
    // 56-63
    "clone",     "fork",        "vfork",       "execve",      "exit",        "wait4",        "kill",           "uname",
};

/// Returns the name of syscall `nr`, or "???" when it is not in the table
/// or in the short list of extra numbers below.
fn scName(nr: u64) []const u8 {
    if (nr < sc_names.len) return sc_names[nr];
    return switch (nr) {
        79 => "getcwd",
        102 => "getuid",
        104 => "getgid",
        110 => "getppid",
        158 => "arch_prctl",
        228 => "clock_gettime",
        231 => "exit_group",
        257 => "openat",
        262 => "newfstatat",
        302 => "prlimit64",
        318 => "getrandom",
        334 => "rseq",
        else => "???",
    };
}

/// mini-strace: runs `<command> [args...]` under ptrace and prints one line
/// per syscall (name, decoded arguments, result) to stderr.
pub fn main() !void {
    const stderr = std.io.getStdErr().writer();

    // ptrace request/option and signal numbers used only in this function
    // (Linux x86-64 values).
    const setoptions_request: u32 = 0x4200; // PTRACE_SETOPTIONS
    const opt_tracesysgood: usize = 0x1; // PTRACE_O_TRACESYSGOOD
    const sigtrap: u32 = 5;
    const at_fdcwd: i32 = -100;

    var args = std.process.args();
    _ = args.next(); // skip our name

    const target = args.next() orelse {
        try stderr.print("Usage: mini-strace <command>\n", .{});
        return;
    };

    // Build a NULL-terminated argv so the traced command also receives any
    // remaining command-line arguments (the last slot stays null).
    var child_argv = [_]?[*:0]const u8{null} ** 64;
    child_argv[0] = target.ptr;
    var argc: usize = 1;
    while (argc < child_argv.len - 1) : (argc += 1) {
        const arg = args.next() orelse break;
        child_argv[argc] = arg.ptr;
    }

    const pid = try posix.fork();
    if (pid == 0) {
        // Child: request tracing, stop ourselves (signal 19 = SIGSTOP) so the
        // parent can set options, then exec the target with an empty environment.
        _ = ptrace(PTRACE_TRACEME, 0, 0, 0);
        _ = linux.syscall2(.kill, linux.syscall0(.getpid), 19);
        const envp = [_]?[*:0]const u8{null};
        _ = linux.syscall3(.execve, @intFromPtr(target.ptr), @intFromPtr(&child_argv), @intFromPtr(&envp));
        posix.exit(127); // only reached when execve failed
    }

    // Consume the child's initial stop.
    _ = posix.waitpid(pid, 0);

    // With TRACESYSGOOD, syscall stops report SIGTRAP|0x80, which lets us tell
    // them apart from ordinary signal stops. Without it, the plain SIGTRAP the
    // kernel sends after a successful execve would be counted as a syscall
    // stop and flip the entry/exit bookkeeping for the rest of the run.
    if (ptrace(setoptions_request, pid, 0, opt_tracesysgood) < 0) {
        try stderr.print("mini-strace: PTRACE_SETOPTIONS failed\n", .{});
        _ = linux.syscall2(.kill, @as(usize, @bitCast(@as(isize, pid))), 9);
        _ = posix.waitpid(pid, 0);
        return;
    }

    var entering = true;
    var pending_signal: usize = 0;
    var path_buf: [256]u8 = undefined;

    while (true) {
        if (ptrace(PTRACE_SYSCALL, pid, 0, pending_signal) < 0) break;
        pending_signal = 0;

        const wr = posix.waitpid(pid, 0);
        // Anything other than a stop means the tracee exited or was killed.
        if (!posix.W.IFSTOPPED(wr.status)) break;

        const stop_signal = posix.W.STOPSIG(wr.status);
        if (stop_signal != (sigtrap | 0x80)) {
            // Signal stop, not a syscall stop: pass the signal on at the next
            // resume. The post-exec SIGTRAP is only a notification and is dropped.
            if (stop_signal != sigtrap) pending_signal = stop_signal;
            continue;
        }

        var regs: UserRegs = undefined;
        if (ptrace(PTRACE_GETREGS, pid, 0, @intFromPtr(&regs)) < 0) break;

        if (entering) {
            const nr = regs.orig_rax;
            const name = scName(nr);
            // File descriptors are C ints: only the low 32 bits are meaningful.
            const fd_arg: i32 = @bitCast(@as(u32, @truncate(regs.rdi)));

            // format based on syscall type
            switch (nr) {
                257 => { // openat(dirfd, path, flags, ...)
                    const path = readTraceeString(pid, regs.rsi, &path_buf);
                    if (fd_arg == at_fdcwd) {
                        try stderr.print("{s}(AT_FDCWD, \"{s}\", 0x{x})", .{ name, path, regs.rdx });
                    } else {
                        try stderr.print("{s}({d}, \"{s}\", 0x{x})", .{ name, fd_arg, path, regs.rdx });
                    }
                },
                0, 1 => { // read / write (fd, buf, count)
                    try stderr.print("{s}({d}, buf, {d})", .{ name, fd_arg, regs.rdx });
                },
                3 => { // close(fd)
                    try stderr.print("{s}({d})", .{ name, fd_arg });
                },
                59 => { // execve(path, argv, envp)
                    const path = readTraceeString(pid, regs.rdi, &path_buf);
                    try stderr.print("{s}(\"{s}\", ...)", .{ name, path });
                },
                else => {
                    try stderr.print("{s}(0x{x}, 0x{x}, 0x{x})", .{ name, regs.rdi, regs.rsi, regs.rdx });
                },
            }
        } else {
            const ret: isize = @bitCast(regs.rax);
            // Kernel errors are returned as -errno in the range -4095..-1.
            if (ret < 0 and ret >= -4095) {
                const errno: u32 = @intCast(-ret);
                const ename: []const u8 = switch (errno) {
                    1 => "EPERM",
                    2 => "ENOENT",
                    9 => "EBADF",
                    13 => "EACCES",
                    17 => "EEXIST",
                    22 => "EINVAL",
                    else => "?",
                };
                try stderr.print(" = -1 {s} (errno {d})\n", .{ ename, errno });
            } else {
                try stderr.print(" = {d}\n", .{ret});
            }
        }
        entering = !entering;
    }

    // exit/exit_group never reach an exit stop; finish their line like strace.
    if (!entering) try stderr.print(" = ?\n", .{});
}

This is a stripped-down strace. The real strace is ~30,000 lines of C because it decodes all ~450 syscalls with full struct formatting, but the core mechanism is identical. Notice we write trace output to stderr, not stdout -- the traced program's stdout must pass through unmodified, same convention strace uses.

Anti-debugging techniques and detection

Programs can detect they're being traced and refuse to run. This is common in malware, DRM, and anti-cheat systems. There are three main techniques:

const std = @import("std");
const linux = std.os.linux;
const posix = std.posix;

/// Raw ptrace syscall; the result is returned signed so that failures show
/// up as negative (negated errno) values.
fn ptrace(request: u32, pid: i32, addr: usize, data: usize) isize {
    const pid_bits: usize = @bitCast(@as(isize, pid));
    const rc = linux.syscall4(.ptrace, request, pid_bits, addr, data);
    return @bitCast(rc);
}

/// Runs three "am I being traced?" checks and prints each verdict to stdout.
/// Output is best-effort: write errors are deliberately ignored because this
/// runs in a short-lived forked child that has nowhere to report them.
fn demonstrateAntiDebug() void {
    const w = std.io.getStdOut().writer();

    // technique 1: PTRACE_TRACEME self-check (request 0). The call fails if
    // this process already has a tracer.
    const result = ptrace(0, 0, 0, 0);
    if (result < 0) {
        w.print("[anti-debug] PTRACE_TRACEME failed -- debugger detected!\n", .{}) catch {};
    } else {
        w.print("[anti-debug] PTRACE_TRACEME succeeded -- not being debugged\n", .{}) catch {};
    }

    // technique 2: check /proc/self/status for TracerPid.
    // Scoped in a labeled block so a failed open skips only this check
    // instead of returning before technique 3 runs.
    proc_check: {
        const fd = posix.open("/proc/self/status", .{ .ACCMODE = .RDONLY }, 0) catch break :proc_check;
        defer posix.close(fd);
        var buf: [4096]u8 = undefined;
        const n = posix.read(fd, &buf) catch 0;
        const label = "TracerPid:\t";
        const pos = std.mem.indexOf(u8, buf[0..n], label) orelse break :proc_check;

        // The value runs from the end of the label to the end of the line.
        const start = pos + label.len;
        var end = start;
        while (end < n and buf[end] != '\n') end += 1;
        const tpid = buf[start..end];

        w.print("[anti-debug] TracerPid: {s}", .{tpid}) catch {};
        const untraced = std.mem.eql(u8, std.mem.trim(u8, tpid, " \t\n"), "0");
        const verdict: []const u8 = if (untraced) " (not traced)\n" else " (TRACER DETECTED!)\n";
        w.writeAll(verdict) catch {};
    }

    // technique 3: timing -- traced programs run much slower
    const t0 = std.time.nanoTimestamp();
    var dummy: u64 = 0;
    for (0..100000) |i| dummy +%= i;
    // Keep the optimiser from deleting the loop. This also replaces the
    // `_ = dummy;` discard, which Zig rejects for a variable that is used.
    std.mem.doNotOptimizeAway(dummy);
    const elapsed = std.time.nanoTimestamp() - t0;

    w.print("[anti-debug] Loop: {d}ns", .{elapsed}) catch {};
    const timing_verdict: []const u8 = if (elapsed > 50_000_000) " -- SUSPICIOUS\n" else " -- normal\n";
    w.writeAll(timing_verdict) catch {};
}

/// Runs the anti-debugging checks twice in forked children -- once untraced,
/// once with this process as tracer -- and then prints the countermeasures.
pub fn main() !void {
    const out = std.io.getStdOut().writer();
    try out.print("=== Anti-debugging demos ===\n\nWithout debugger:\n", .{});

    // Run 1: plain child, nobody tracing it.
    const plain_child = try posix.fork();
    if (plain_child == 0) {
        demonstrateAntiDebug();
        posix.exit(0);
    }
    _ = posix.waitpid(plain_child, 0);

    try out.print("\nWith debugger attached:\n", .{});

    // Run 2: the child makes us its tracer, then stops itself (signal 19)
    // until we resume it.
    const traced_child = try posix.fork();
    if (traced_child == 0) {
        _ = ptrace(0, 0, 0, 0); // PTRACE_TRACEME
        const self_pid = linux.syscall0(.getpid);
        _ = linux.syscall2(.kill, self_pid, 19);
        demonstrateAntiDebug();
        posix.exit(0);
    }
    _ = posix.waitpid(traced_child, 0); // initial stop
    _ = ptrace(7, traced_child, 0, 0); // CONT
    _ = posix.waitpid(traced_child, 0); // wait for it to finish

    const footer = [_][]const u8{
        "\nCountermeasures:\n",
        "  TRACEME check: tracer intercepts the syscall, fakes return 0\n",
        "  /proc check: tracer intercepts read, patches TracerPid to 0\n",
        "  Timing: hardest to defeat -- the overhead is real\n",
    };
    for (footer) |line| try out.writeAll(line);
}

The PTRACE_TRACEME self-check is trivial to bypass -- the tracer intercepts the ptrace syscall itself and modifies rax to 0. The /proc/self/status check is also defeatable by intercepting read and patching the buffer. Timing checks are harder because the tracing overhead is genuine. In the security research world this is an arms race with no definitive winner.

Practical example: file operation tracer

Here's the capstone for this episode -- a focused tracer that only logs file-related system calls (open/openat, read, write, close) and produces a clean summary. This is genuinely useful for understanding what files a program touches:

const std = @import("std");
const linux = std.os.linux;
const posix = std.posix;

const PTRACE_TRACEME: u32 = 0;
const PTRACE_SYSCALL: u32 = 24;
const PTRACE_GETREGS: u32 = 12;
const PTRACE_PEEKDATA: u32 = 2;

/// Buffer layout for PTRACE_GETREGS on x86-64. The struct is `extern`, so
/// fields are laid out exactly in the order they are declared here.
const UserRegs = extern struct {
    r15: u64,
    r14: u64,
    r13: u64,
    r12: u64,
    rbp: u64,
    rbx: u64,
    r11: u64,
    r10: u64,
    r9: u64,
    r8: u64,
    /// Syscall result at the exit stop.
    rax: u64,
    rcx: u64,
    rdx: u64,
    /// Second syscall argument (the path pointer for openat).
    rsi: u64,
    /// First syscall argument (the fd for read/write/close, the path for open).
    rdi: u64,
    /// Syscall number recorded at the entry stop.
    orig_rax: u64,
    rip: u64,
    cs: u64,
    eflags: u64,
    rsp: u64,
    ss: u64,
    fs_base: u64,
    gs_base: u64,
    ds: u64,
    es: u64,
    fs: u64,
    gs: u64,
};

/// Calls ptrace through the raw syscall interface. The kernel's result is
/// handed back as a signed integer; negative means -errno.
fn ptrace(request: u32, pid: i32, addr: usize, data: usize) isize {
    const signed_pid: isize = pid;
    const pid_word: usize = @bitCast(signed_pid);
    const ret: isize = @bitCast(linux.syscall4(.ptrace, request, pid_word, addr, data));
    return ret;
}

/// Copies a NUL-terminated string out of the tracee's memory into `buf`.
///
/// At most `buf.len - 1` bytes are copied; the returned slice is the filled
/// prefix of `buf` without the terminator. A null `addr`, an empty `buf`, or
/// an unreadable first word gives an empty slice; a read failure further
/// along returns the bytes gathered so far.
fn readString(pid: i32, addr: usize, buf: []u8) []u8 {
    if (addr == 0 or buf.len == 0) return buf[0..0];
    const cap = buf.len - 1;
    var off: usize = 0;
    outer: while (off < cap) {
        // At the raw syscall level PTRACE_PEEKDATA writes the fetched word to
        // the address given in `data` and returns only 0 / -errno, so it needs
        // a real destination and the return value must be checked.
        var word: usize = 0;
        if (ptrace(PTRACE_PEEKDATA, pid, addr + off, @intFromPtr(&word)) < 0) break;
        for (std.mem.asBytes(&word)) |b| {
            if (b == 0 or off >= cap) break :outer;
            buf[off] = b;
            off += 1;
        }
    }
    return buf[0..off];
}

/// Per-file bookkeeping record: a fixed-size path buffer, the number of
/// valid bytes in it, and counters/flags for the operations seen.
const FileOp = struct {
    /// Operation state for one file; every field starts out zero/false.
    const Ops = struct {
        opened: bool = false,
        read_bytes: u64 = 0,
        write_bytes: u64 = 0,
        closed: bool = false,
    };

    /// Path storage; only the first `path_len` bytes are meaningful.
    path: [256]u8,
    path_len: usize,
    ops: Ops = .{},
};

/// file-tracer: runs `<command> [args...]` under ptrace, logs open/openat,
/// read, write and close on the files it opens, and prints an I/O summary.
/// Only descriptors below 256 that were opened while tracing are tracked, so
/// inherited descriptors such as stdin/stdout are not counted.
pub fn main() !void {
    const stderr = std.io.getStdErr().writer();

    // ptrace request/option and signal numbers used only in this function
    // (Linux x86-64 values).
    const setoptions_request: u32 = 0x4200; // PTRACE_SETOPTIONS
    const opt_tracesysgood: usize = 0x1; // PTRACE_O_TRACESYSGOOD
    const sigtrap: u32 = 5;
    const max_fds = 256;

    var args = std.process.args();
    _ = args.next();
    const target = args.next() orelse {
        try stderr.print("Usage: file-tracer <command>\n", .{});
        return;
    };

    // Build a NULL-terminated argv so the traced command also receives any
    // remaining command-line arguments (the last slot stays null).
    var child_argv = [_]?[*:0]const u8{null} ** 64;
    child_argv[0] = target.ptr;
    var argc: usize = 1;
    while (argc < child_argv.len - 1) : (argc += 1) {
        const arg = args.next() orelse break;
        child_argv[argc] = arg.ptr;
    }

    const pid = try posix.fork();
    if (pid == 0) {
        // Child: request tracing, stop ourselves (signal 19 = SIGSTOP) so the
        // parent can set options, then exec the target with an empty environment.
        _ = ptrace(PTRACE_TRACEME, 0, 0, 0);
        _ = linux.syscall2(.kill, linux.syscall0(.getpid), 19);
        const envp = [_]?[*:0]const u8{null};
        _ = linux.syscall3(.execve, @intFromPtr(target.ptr), @intFromPtr(&child_argv), @intFromPtr(&envp));
        posix.exit(127); // only reached when execve failed
    }

    // Consume the child's initial stop.
    _ = posix.waitpid(pid, 0);

    // With TRACESYSGOOD, syscall stops report SIGTRAP|0x80. Without it, the
    // plain SIGTRAP sent after a successful execve is mistaken for a syscall
    // stop, entry and exit swap roles, and nothing is logged afterwards.
    if (ptrace(setoptions_request, pid, 0, opt_tracesysgood) < 0) {
        try stderr.print("file-tracer: PTRACE_SETOPTIONS failed\n", .{});
        _ = linux.syscall2(.kill, @as(usize, @bitCast(@as(isize, pid))), 9);
        _ = posix.waitpid(pid, 0);
        return;
    }

    // track fd -> path mapping; a length of 0 means "fd not tracked"
    var fd_paths: [max_fds][256]u8 = undefined;
    var fd_path_lens = [_]usize{0} ** max_fds;
    var total_reads: u64 = 0;
    var total_writes: u64 = 0;
    var total_opens: u32 = 0;
    var path_buf: [256]u8 = undefined;

    var entering = true;
    var pending_signal: usize = 0;
    var entry_nr: u64 = 0;
    var entry_regs: UserRegs = undefined;

    while (true) {
        if (ptrace(PTRACE_SYSCALL, pid, 0, pending_signal) < 0) break;
        pending_signal = 0;

        const wr = posix.waitpid(pid, 0);
        // Anything other than a stop means the tracee exited or was killed.
        if (!posix.W.IFSTOPPED(wr.status)) break;

        const stop_signal = posix.W.STOPSIG(wr.status);
        if (stop_signal != (sigtrap | 0x80)) {
            // Signal stop, not a syscall stop: pass the signal on at the next
            // resume. The post-exec SIGTRAP is only a notification and is dropped.
            if (stop_signal != sigtrap) pending_signal = stop_signal;
            continue;
        }

        var regs: UserRegs = undefined;
        if (ptrace(PTRACE_GETREGS, pid, 0, @intFromPtr(&regs)) < 0) break;

        if (entering) {
            // Remember the number and arguments; they are acted on at the
            // exit stop, once the result is known.
            entry_nr = regs.orig_rax;
            entry_regs = regs;
        } else {
            const ret: isize = @bitCast(regs.rax);
            // File descriptors are C ints: only the low 32 bits are meaningful.
            const fd_arg: i32 = @bitCast(@as(u32, @truncate(entry_regs.rdi)));

            switch (entry_nr) {
                2, 257 => { // open (legacy) / openat
                    if (ret >= 0 and ret < max_fds) {
                        const fd_idx: usize = @intCast(ret);
                        // open(path, ...) passes the path in rdi;
                        // openat(dirfd, path, ...) passes it in rsi.
                        const path_addr = if (entry_nr == 257) entry_regs.rsi else entry_regs.rdi;
                        const path = readString(pid, path_addr, &path_buf);
                        @memcpy(fd_paths[fd_idx][0..path.len], path);
                        fd_path_lens[fd_idx] = path.len;
                        total_opens += 1;
                        try stderr.print("open(\"{s}\") = {d}\n", .{ path, ret });
                    }
                },
                0, 1 => { // read / write
                    if (ret > 0 and fd_arg >= 0 and fd_arg < max_fds) {
                        const fd_idx: usize = @intCast(fd_arg);
                        if (fd_path_lens[fd_idx] > 0) {
                            const bytes: u64 = @intCast(ret);
                            const path = fd_paths[fd_idx][0..fd_path_lens[fd_idx]];
                            if (entry_nr == 0) {
                                total_reads += bytes;
                                try stderr.print("read({d} \"{s}\", {d} bytes)\n", .{ fd_idx, path, bytes });
                            } else {
                                total_writes += bytes;
                                try stderr.print("write({d} \"{s}\", {d} bytes)\n", .{ fd_idx, path, bytes });
                            }
                        }
                    }
                },
                3 => { // close
                    if (fd_arg >= 0 and fd_arg < max_fds) {
                        const fd_idx: usize = @intCast(fd_arg);
                        if (fd_path_lens[fd_idx] > 0) {
                            try stderr.print("close({d} \"{s}\")\n", .{
                                fd_idx, fd_paths[fd_idx][0..fd_path_lens[fd_idx]],
                            });
                            // Forget the mapping so a reused fd number is not
                            // attributed to the old path.
                            fd_path_lens[fd_idx] = 0;
                        }
                    }
                },
                else => {},
            }
        }
        entering = !entering;
    }

    try stderr.print("\n=== File I/O Summary ===\n", .{});
    try stderr.print("Files opened: {d}\n", .{total_opens});
    try stderr.print("Total bytes read: {d}\n", .{total_reads});
    try stderr.print("Total bytes written: {d}\n", .{total_writes});
}

The fd-to-path mapping is the key technique. When openat returns a file descriptor, we record which path was opened. When read/write happens on that fd, we report the filename alongside the byte count. This gives you a file-centric view of I/O that's incredibly useful for debugging "why is my program slow" -- often it's reading files you didn't expect (config files, locale data, shared libraries).

Exercises

  1. Build a memory watchpoint tool using ptrace. The tool accepts a PID and a memory address, and monitors that address for changes. Use PTRACE_ATTACH to attach to a running process, periodically read the target address with PTRACE_PEEKDATA, and print a message whenever the value changes. Include the old value, new value, and current RIP. Stop after detecting 5 changes or when the target process exits.

  2. Extend the file operation tracer to also decode the flags argument to openat -- print whether the file was opened read-only, write-only, read-write, with O_CREAT, O_TRUNC, O_APPEND, etc. Also track the total number of read and write syscalls per fd (not just bytes) and print a per-file summary at the end showing path, open flags, read count, write count, total bytes in each direction.

  3. Build a syscall argument modifier that uses ptrace to intercept specific syscalls and change their arguments before the kernel processes them. For example, intercept openat calls for a specific filename and redirect them to a different file by modifying the path pointer in the tracee's memory (write a new path string into the tracee's stack, then point RSI to it). Demonstrate by tracing /bin/cat /etc/hostname and redirecting it to read /etc/os-release instead.

Bedankt en tot de volgende keer!

  • ptrace is the foundational Linux mechanism for debugging and process inspection -- every debugger (GDB, strace, LLDB) uses it under the hood
  • The tracer has almost total control over the tracee: reading/writing memory, inspecting/modifying registers, intercepting signals and syscalls
  • PTRACE_PEEKDATA / PTRACE_POKEDATA read and write one machine word at a time -- for bulk access, wrap them in helper functions that handle partial words
  • Software breakpoints work by replacing one byte with INT3 (0xCC), catching SIGTRAP, restoring the original byte, rewinding RIP, and optionally rearming
  • PTRACE_SYSCALL stops the tracee twice per syscall (entry and exit) -- read orig_rax for the syscall number and rax for the return value
  • Anti-debugging techniques (self-TRACEME, /proc/self/status, timing checks) exist but each one has known countermeasures -- it's an arms race with no definitive winner
  • The fd-to-path mapping technique in file tracers gives you a high-level view of a program's file I/O that raw syscall output can't match
  • Syscall tracing connects directly to what we built with seccomp in episode 73 -- seccomp blocks syscalls from inside, ptrace observes them from outside

@scipio



0
0
0.000
0 comments