Learn Zig Series (#76) - Mini Project: Process Monitor

Learn Zig Series (#76) - Mini Project: Process Monitor

zig.png

What will I learn

  • How to enumerate all running processes by scanning /proc for numeric directories;
  • How to read and parse /proc/[pid]/stat for CPU time and process state;
  • How to compute per-process CPU usage percentages using delta measurements between snapshots;
  • How to sort a process list by different columns (CPU, memory, PID, name) using Zig's sort with custom comparators;
  • How to build a refreshing terminal UI with ANSI escape codes that clears and redraws each frame;
  • How to filter processes by name substring or UID;
  • How to send signals to processes from your own program using the kill syscall;
  • How to tie together /proc parsing, terminal control, and user input into a working top-like tool.

Requirements

  • A working modern computer running macOS, Windows or Ubuntu (note: the code in this episode reads /proc, so it only runs on Linux);
  • An installed Zig 0.14+ distribution (download from ziglang.org);
  • The ambition to learn Zig programming.

Difficulty

  • Intermediate

Curriculum (of the Learn Zig Series):

Learn Zig Series (#76) - Mini Project: Process Monitor

Solutions to Episode 75 Exercises

Exercise 1: Top-like process lister sorted by RSS memory

const std = @import("std");

/// One row of the process listing, populated by `readEntry`.
const ProcEntry = struct {
    /// Process ID, taken from the numeric directory name under /proc.
    pid: u32,
    /// Fixed storage for the process name; only the first `name_len` bytes are valid.
    name: [64]u8 = undefined,
    name_len: usize = 0,
    /// First character of the `State:` value from the status file ('?' until set).
    state: u8 = '?',
    /// Numeric value of the `VmRSS:` line of the status file.
    rss_kb: u64 = 0,
    /// utime + stime from the stat file, divided by 100 to get seconds.
    cpu_secs: u64 = 0,
    /// Fixed storage for the command line with NUL bytes replaced by spaces;
    /// only the first `cmdline_len` bytes are valid.
    cmdline: [128]u8 = undefined,
    cmdline_len: usize = 0,
};

/// Reads `/proc/<pid>/<filename>` into `buf` and returns the filled prefix of `buf`.
///
/// Errors: `error.PathTooLong` when the formatted path does not fit the 64-byte
/// path buffer, `error.NotFound` for any failure to open the file, plus whatever
/// `readAll` reports. Content longer than `buf` is truncated to `buf.len`.
fn readProcFile(pid: u32, filename: []const u8, buf: []u8) ![]u8 {
    var path_storage: [64]u8 = undefined;
    const proc_path = std.fmt.bufPrint(&path_storage, "/proc/{d}/{s}", .{ pid, filename }) catch {
        return error.PathTooLong;
    };

    const handle = std.fs.openFileAbsolute(proc_path, .{}) catch {
        return error.NotFound;
    };
    defer handle.close();

    const bytes_read = try handle.readAll(buf);
    return buf[0..bytes_read];
}

/// Finds the first line of `content` that starts with `key` and contains a ':'.
/// Returns the text after that colon with leading spaces/tabs removed, or null
/// when no such line exists. The returned slice points into `content`.
fn findField(content: []const u8, key: []const u8) ?[]const u8 {
    var lines = std.mem.splitScalar(u8, content, '\n');
    while (lines.next()) |line| {
        if (!std.mem.startsWith(u8, line, key)) continue;
        // A matching line without a colon is skipped, not treated as a hit.
        if (std.mem.indexOfScalar(u8, line, ':')) |colon_pos| {
            return std.mem.trimLeft(u8, line[colon_pos + 1 ..], " \t");
        }
    }
    return null;
}

/// Parses the leading decimal number of a value such as "1234 kB".
/// Returns 0 for null input or when the text before the first space is not a
/// valid unsigned integer.
fn parseKbValue(s: ?[]const u8) u64 {
    const text = s orelse return 0;
    // Only the part before the first space is the number; the rest is the unit.
    const digits_end = std.mem.indexOfScalar(u8, text, ' ') orelse text.len;
    return std.fmt.parseInt(u64, text[0..digits_end], 10) catch 0;
}

/// Builds a `ProcEntry` for `pid` from /proc/<pid>/{status,stat,cmdline}.
///
/// The status file is mandatory (its read error is returned); stat and cmdline
/// are best-effort and leave their fields at the defaults when unreadable.
fn readEntry(pid: u32) !ProcEntry {
    var result = ProcEntry{ .pid = pid };

    var status_storage: [4096]u8 = undefined;
    const status_text = try readProcFile(pid, "status", &status_storage);

    if (findField(status_text, "Name:")) |proc_name| {
        // Truncate to the 64-byte name buffer.
        const copy_len = @min(proc_name.len, 64);
        @memcpy(result.name[0..copy_len], proc_name[0..copy_len]);
        result.name_len = copy_len;
    }
    if (findField(status_text, "State:")) |state_text| {
        if (state_text.len > 0) result.state = state_text[0];
    }
    result.rss_kb = parseKbValue(findField(status_text, "VmRSS:"));

    var stat_storage: [512]u8 = undefined;
    const stat_text = readProcFile(pid, "stat", &stat_storage) catch "";
    // Fields are split after the LAST ')' so a process name containing spaces
    // or parentheses cannot shift the token positions.
    const close_paren = std.mem.lastIndexOfScalar(u8, stat_text, ')') orelse 0;
    if (close_paren + 2 < stat_text.len) {
        var fields = std.mem.tokenizeAny(u8, stat_text[close_paren + 2 ..], " ");
        var total_ticks: u64 = 0;
        var field_idx: u32 = 0;
        // Token 0 is the state; tokens 11 and 12 are utime and stime.
        while (field_idx <= 12) : (field_idx += 1) {
            const tok = fields.next() orelse break;
            if (field_idx >= 11) {
                total_ticks += std.fmt.parseInt(u64, tok, 10) catch 0;
            }
        }
        result.cpu_secs = total_ticks / 100; // clock ticks to seconds (assuming HZ=100)
    }

    var cmd_storage: [128]u8 = undefined;
    const raw_cmd = readProcFile(pid, "cmdline", &cmd_storage) catch "";
    // Arguments are NUL-separated; turn the separators into spaces for display.
    for (raw_cmd, result.cmdline[0..raw_cmd.len]) |src, *dst| {
        dst.* = if (src == 0) ' ' else src;
    }
    result.cmdline_len = raw_cmd.len;

    return result;
}

/// Lists the 15 processes with the largest resident set size.
///
/// Scans /proc for numeric directory names, reads up to `entries.len` processes
/// (unreadable or vanished ones are skipped), sorts by RSS descending and prints
/// a fixed-width table to stdout.
pub fn main() !void {
    const stdout = std.io.getStdOut().writer();
    var entries: [1024]ProcEntry = undefined;
    var count: usize = 0;

    // `close` needs a mutable Dir; release the handle when we are done scanning.
    var proc_dir = try std.fs.openDirAbsolute("/proc", .{ .iterate = true });
    defer proc_dir.close();

    var it = proc_dir.iterate();
    while (try it.next()) |de| {
        if (count >= entries.len) break;
        // PIDs never start with '0'; this also rejects non-numeric names cheaply.
        if (de.name.len == 0 or de.name[0] < '1' or de.name[0] > '9') continue;
        const pid = std.fmt.parseInt(u32, de.name, 10) catch continue;
        // A process may exit between the listing and the read; just skip it.
        entries[count] = readEntry(pid) catch continue;
        count += 1;
    }

    std.mem.sortUnstable(ProcEntry, entries[0..count], {}, struct {
        pub fn lessThan(_: void, a: ProcEntry, b: ProcEntry) bool {
            return a.rss_kb > b.rss_kb; // descending
        }
    }.lessThan);

    const top = @min(count, 15);
    try stdout.print("{s:<7} {s:<16} {s:<6} {s:>10} {s:>8}  {s}\n", .{
        "PID", "NAME", "STATE", "RSS(MB)", "CPU(s)", "CMDLINE",
    });
    try stdout.print("{s}\n", .{"-" ** 78});
    for (entries[0..top]) |e| {
        // Widths mirror the header: the state char plus five pad spaces fills
        // the 6-wide STATE column, then the numeric columns are 10 and 8 wide.
        try stdout.print("{d:<7} {s:<16} {c}      {d:>10} {d:>8}  {s}\n", .{
            e.pid, e.name[0..e.name_len], e.state,
            e.rss_kb / 1024, e.cpu_secs, e.cmdline[0..@min(e.cmdline_len, 40)],
        });
    }
}

Scanning /proc for numeric directory entries is the standard way every Linux top-like tool discovers processes. Sorting by RSS descending immediately shows you the memory hogs.

Exercise 2: Network connection monitor showing new and closed connections

const std = @import("std");

/// One TCP socket as parsed from a data row of /proc/net/tcp by `parseConns`.
const Conn = struct {
    /// Local address, the hex value before ':' in the local column.
    local_ip: u32,
    /// Local port, the hex value after ':' in the local column.
    local_port: u16,
    /// Remote address, the hex value before ':' in the remote column.
    remote_ip: u32,
    /// Remote port, the hex value after ':' in the remote column.
    remote_port: u16,
    /// Socket state code, parsed from the hex state column.
    state: u8,
};

/// Parses the text of /proc/net/tcp into `out` and returns how many entries
/// were written.
///
/// The first line (column header) is skipped. Parsing stops at the first line
/// shorter than 10 bytes or when `out` is full; rows with missing or malformed
/// fields are skipped.
fn parseConns(content: []const u8, out: []Conn) usize {
    var rows = std.mem.splitScalar(u8, content, '\n');
    _ = rows.next(); // skip header
    var filled: usize = 0;

    while (rows.next()) |row| {
        if (row.len < 10) break;
        if (filled >= out.len) break;

        var cols = std.mem.tokenizeAny(u8, row, " ");
        _ = cols.next(); // sl
        const local_field = cols.next() orelse continue;
        const remote_field = cols.next() orelse continue;
        const state_field = cols.next() orelse continue;

        // Address columns look like "<hex ip>:<hex port>".
        var local_parts = std.mem.splitScalar(u8, local_field, ':');
        var remote_parts = std.mem.splitScalar(u8, remote_field, ':');

        const local_ip = std.fmt.parseInt(u32, local_parts.next() orelse continue, 16) catch continue;
        const local_port = std.fmt.parseInt(u16, local_parts.next() orelse continue, 16) catch continue;
        const remote_ip = std.fmt.parseInt(u32, remote_parts.next() orelse continue, 16) catch continue;
        const remote_port = std.fmt.parseInt(u16, remote_parts.next() orelse continue, 16) catch continue;
        const state = std.fmt.parseInt(u8, state_field, 16) catch continue;

        out[filled] = .{
            .local_ip = local_ip,
            .local_port = local_port,
            .remote_ip = remote_ip,
            .remote_port = remote_port,
            .state = state,
        };
        filled += 1;
    }
    return filled;
}

/// Formats `addr` as a dotted quad into `buf`, emitting the least-significant
/// byte first, and returns the written prefix of `buf`.
///
/// If `buf` is too small for the text, the result is "?" written into `buf`
/// (or an empty slice when `buf` has no room at all) instead of a stale or
/// out-of-bounds slice.
fn fmtIp(buf: []u8, addr: u32) []u8 {
    return std.fmt.bufPrint(buf, "{d}.{d}.{d}.{d}", .{
        addr & 0xFF, (addr >> 8) & 0xFF, (addr >> 16) & 0xFF, (addr >> 24) & 0xFF,
    }) catch {
        // The placeholder must live in `buf` because the return type is mutable.
        if (buf.len == 0) return buf;
        buf[0] = '?';
        return buf[0..1];
    };
}

/// Returns true when both connections have the same local and remote
/// address/port pairs. The `state` field is deliberately not compared.
fn connEq(a: Conn, b: Conn) bool {
    if (a.local_ip != b.local_ip) return false;
    if (a.local_port != b.local_port) return false;
    if (a.remote_ip != b.remote_ip) return false;
    return a.remote_port == b.remote_port;
}

/// Polls /proc/net/tcp for up to 30 rounds, two seconds apart, and prints the
/// connections that appeared ([NEW]) or disappeared ([CLOSED]) since the last
/// successfully read snapshot.
pub fn main() !void {
    const stdout = std.io.getStdOut().writer();
    var prev: [512]Conn = undefined;
    var prev_n: usize = 0;
    // Tracks whether `prev` holds a real snapshot; a failed first read must not
    // make every existing connection look new on the following round.
    var have_prev = false;
    var buf: [32768]u8 = undefined;
    var round: u32 = 0;

    while (round < 30) : (round += 1) {
        // Sleeping at the top of the round means a failed read (`continue`)
        // still waits before retrying instead of spinning.
        if (round > 0) std.time.sleep(2_000_000_000);

        const n = blk: {
            const file = std.fs.openFileAbsolute("/proc/net/tcp", .{}) catch continue;
            defer file.close();
            break :blk file.readAll(&buf) catch continue;
        };
        var curr: [512]Conn = undefined;
        const curr_n = parseConns(buf[0..n], &curr);

        if (have_prev) {
            var ip_buf: [16]u8 = undefined;
            var rip_buf: [16]u8 = undefined;
            // Present now but not before: new connection.
            for (curr[0..curr_n]) |c| {
                var found = false;
                for (prev[0..prev_n]) |p| {
                    if (connEq(c, p)) {
                        found = true;
                        break;
                    }
                }
                if (!found) try stdout.print("[NEW]    {s}:{d} -> {s}:{d}\n", .{
                    fmtIp(&ip_buf, c.local_ip), c.local_port,
                    fmtIp(&rip_buf, c.remote_ip), c.remote_port,
                });
            }
            // Present before but not now: closed connection.
            for (prev[0..prev_n]) |p| {
                var found = false;
                for (curr[0..curr_n]) |c| {
                    if (connEq(p, c)) {
                        found = true;
                        break;
                    }
                }
                if (!found) try stdout.print("[CLOSED] {s}:{d} -> {s}:{d}\n", .{
                    fmtIp(&ip_buf, p.local_ip), p.local_port,
                    fmtIp(&rip_buf, p.remote_ip), p.remote_port,
                });
            }
        }

        @memcpy(prev[0..curr_n], curr[0..curr_n]);
        prev_n = curr_n;
        have_prev = true;
    }
}

The snapshot-diff approach is the simplest way to detect connection changes -- store the previous state, compare with current, report differences. A production tool would use netlink sockets for event-driven notifications instead of polling, but for monitoring purposes this works.

Exercise 3: /proc filesystem explorer for a given PID

const std = @import("std");

/// Reads the file at absolute `path` into `buf` and returns the filled prefix.
/// Any failure to open the file is reported as `error.NotFound`; read errors
/// are passed through. Content beyond `buf.len` is not read.
fn readFile(path: []const u8, buf: []u8) ![]u8 {
    const handle = std.fs.openFileAbsolute(path, .{}) catch {
        return error.NotFound;
    };
    defer handle.close();

    const bytes_read = try handle.readAll(buf);
    return buf[0..bytes_read];
}

/// Writes a section banner of the form "\n=== <title> ===\n" to `w`.
fn printSection(w: anytype, title: []const u8) !void {
    try w.writeAll("\n=== ");
    try w.writeAll(title);
    try w.writeAll(" ===\n");
}

/// Prints a report for the PID given as the first command-line argument:
/// memory map, open file descriptors, the first 10 environment variables,
/// resource limits and cgroup membership, all read from /proc/<pid>/.
///
/// Files that cannot be read are reported as "unavailable"; an unreadable fd
/// directory prints a notice and the remaining sections are still shown.
pub fn main() !void {
    const stdout = std.io.getStdOut().writer();
    var args = std.process.args();
    _ = args.next(); // program name
    const pid_str = args.next() orelse {
        try stdout.print("Usage: procexplore <pid>\n", .{});
        return;
    };
    var path_buf: [128]u8 = undefined;
    var buf: [16384]u8 = undefined;

    // memory map
    try printSection(stdout, "Memory Map");
    const maps_path = try std.fmt.bufPrint(&path_buf, "/proc/{s}/maps", .{pid_str});
    const maps = readFile(maps_path, &buf) catch "unavailable";
    var line_it = std.mem.splitScalar(u8, maps, '\n');
    while (line_it.next()) |line| {
        if (line.len == 0) continue;
        // Classify the mapping by markers found in its path column.
        const label: []const u8 = if (std.mem.indexOf(u8, line, "[heap]") != null) "HEAP"
            else if (std.mem.indexOf(u8, line, "[stack]") != null) "STACK"
            else if (std.mem.indexOf(u8, line, ".so") != null) "SHLIB"
            else if (std.mem.indexOf(u8, line, "[vdso]") != null) "VDSO"
            else "ANON/FILE";
        try stdout.print("  [{s:<9}] {s}\n", .{ label, line[0..@min(line.len, 90)] });
    }

    // open file descriptors
    try printSection(stdout, "Open File Descriptors");
    var fd_path_buf: [128]u8 = undefined;
    const fd_dir_path = try std.fmt.bufPrint(&fd_path_buf, "/proc/{s}/fd", .{pid_str});
    if (std.fs.openDirAbsolute(fd_dir_path, .{ .iterate = true })) |opened| {
        // `close` needs a mutable Dir; release the handle after listing.
        var fd_dir = opened;
        defer fd_dir.close();
        var fd_it = fd_dir.iterate();
        while (try fd_it.next()) |entry| {
            var link_buf: [256]u8 = undefined;
            var link_path_buf: [128]u8 = undefined;
            const lpath = try std.fmt.bufPrint(&link_path_buf, "/proc/{s}/fd/{s}", .{ pid_str, entry.name });
            const target = std.fs.readLinkAbsolute(lpath, &link_buf) catch "(unreadable)";
            try stdout.print("  fd {s:<4} -> {s}\n", .{ entry.name, target });
        }
    } else |_| {
        // Do not return here: the sections below are still worth printing.
        try stdout.print("  (permission denied)\n", .{});
    }

    // environment
    try printSection(stdout, "Environment (first 10)");
    const env_path = try std.fmt.bufPrint(&path_buf, "/proc/{s}/environ", .{pid_str});
    const env = readFile(env_path, &buf) catch "unavailable";
    var env_it = std.mem.splitScalar(u8, env, 0); // entries are NUL-separated
    var env_count: u32 = 0;
    while (env_it.next()) |v| {
        if (v.len == 0) continue;
        try stdout.print("  {s}\n", .{v[0..@min(v.len, 120)]});
        env_count += 1;
        if (env_count >= 10) {
            try stdout.print("  ... (truncated)\n", .{});
            break;
        }
    }

    // limits
    try printSection(stdout, "Resource Limits");
    const lim_path = try std.fmt.bufPrint(&path_buf, "/proc/{s}/limits", .{pid_str});
    const lims = readFile(lim_path, &buf) catch "unavailable";
    try stdout.print("{s}\n", .{lims});

    // cgroups
    try printSection(stdout, "Cgroup Hierarchy");
    const cg_path = try std.fmt.bufPrint(&path_buf, "/proc/{s}/cgroup", .{pid_str});
    const cgroups = readFile(cg_path, &buf) catch "unavailable";
    try stdout.print("{s}\n", .{cgroups});
}

The fd directory is the one that's most likely to require root (or at least same-user) permissions. Notice we handle that gracefully -- print a message and move on rather than crashing.

Last episode we explored the /proc and /sys pseudo-filesystems and learned how to read process information, memory maps, network connections, CPU stats, and hardware details -- all through plain file I/O. Today we're going to take everything from that episode (and quite some things from the episodes before it) and build something actually useful: a process monitor. Think of it as a simplified top or htop written entirely in Zig, reading directly from /proc.

This is a single-episode mini project, similar to what we did with the HTTP status checker in episode 25 and the CLI task runner in episode 36. The goal isn't to replicate every feature of htop -- it's to tie together /proc parsing, ANSI terminal control, sorting, filtering, and signal delivery into one cohesive program. By the end you'll have a working tool you can actually run on your system and get useful output from ;-)

The data layer: reading process info from /proc

The foundation of any process monitor is the ability to enumerate processes and extract their stats. We covered the individual pieces in episode 75 -- now we need to turn that into a reusable data layer. The key insight is that we need TWO snapshots of CPU time to calculate percentages: CPU usage is always "how much CPU time was consumed between measurement A and measurement B."

const std = @import("std");
const linux = std.os.linux;

/// Snapshot of one process, populated by `readProcessInfo`; `cpu_pct` is filled
/// in afterwards by `computeCpuPercentages`.
pub const ProcessInfo = struct {
    /// Process ID, taken from the numeric directory name under /proc.
    pid: u32,
    /// Fixed storage for the process name; only the first `name_len` bytes are valid.
    name: [64]u8 = undefined,
    name_len: usize = 0,
    /// First character of the `State:` value from the status file ('?' until set).
    state: u8 = '?',
    /// Value of the `PPid:` line of the status file.
    ppid: u32 = 0,
    /// First number on the `Uid:` line of the status file.
    uid: u32 = 0,
    /// Numeric value of the `VmRSS:` line of the status file.
    rss_kb: u64 = 0,
    /// Numeric value of the `VmSize:` line of the status file.
    vsize_kb: u64 = 0,
    utime: u64 = 0, // user time in clock ticks
    stime: u64 = 0, // system time in clock ticks
    /// Value of the `Threads:` line of the status file.
    threads: u32 = 0,
    cpu_pct: f64 = 0.0, // calculated between snapshots
    /// Fixed storage for the command line with NUL bytes replaced by spaces;
    /// only the first `cmdline_len` bytes are valid.
    cmdline: [256]u8 = undefined,
    cmdline_len: usize = 0,
};

/// Reads `/proc/<pid>/<filename>` into `buf` and returns the filled prefix of `buf`.
///
/// Errors: `error.PathTooLong` when the formatted path does not fit the 80-byte
/// path buffer, `error.NotFound` for any failure to open the file, plus whatever
/// `readAll` reports. Content longer than `buf` is truncated to `buf.len`.
fn readProcFile(pid: u32, filename: []const u8, buf: []u8) ![]u8 {
    var path_storage: [80]u8 = undefined;
    const proc_path = std.fmt.bufPrint(&path_storage, "/proc/{d}/{s}", .{ pid, filename }) catch {
        return error.PathTooLong;
    };

    const handle = std.fs.openFileAbsolute(proc_path, .{}) catch {
        return error.NotFound;
    };
    defer handle.close();

    const bytes_read = try handle.readAll(buf);
    return buf[0..bytes_read];
}

/// Finds the first line of `content` that starts with `key` and contains a ':'.
/// Returns the text after that colon with leading spaces/tabs removed, or null
/// when no such line exists. The returned slice points into `content`.
fn findKeyValue(content: []const u8, key: []const u8) ?[]const u8 {
    var lines = std.mem.splitScalar(u8, content, '\n');
    while (lines.next()) |line| {
        if (!std.mem.startsWith(u8, line, key)) continue;
        // A matching line without a colon is skipped, not treated as a hit.
        if (std.mem.indexOfScalar(u8, line, ':')) |colon_pos| {
            return std.mem.trimLeft(u8, line[colon_pos + 1 ..], " \t");
        }
    }
    return null;
}

/// Parses the leading decimal number of a value such as "1234 kB".
/// Returns 0 for null input or when the text before the first space is not a
/// valid unsigned integer.
fn parseKb(s: ?[]const u8) u64 {
    const text = s orelse return 0;
    // Only the part before the first space is the number; the rest is the unit.
    const digits_end = std.mem.indexOfScalar(u8, text, ' ') orelse text.len;
    return std.fmt.parseInt(u64, text[0..digits_end], 10) catch 0;
}

/// Builds a `ProcessInfo` for `pid` from /proc/<pid>/{status,stat,cmdline}.
///
/// The status file is mandatory (its read error is returned). The stat and
/// cmdline files are best-effort: when unreadable, the CPU tick counters and
/// command line keep their defaults. `cpu_pct` is not computed here.
pub fn readProcessInfo(pid: u32) !ProcessInfo {
    var result = ProcessInfo{ .pid = pid };

    // /proc/[pid]/status for human-readable fields
    var status_storage: [4096]u8 = undefined;
    const status_text = try readProcFile(pid, "status", &status_storage);

    if (findKeyValue(status_text, "Name:")) |proc_name| {
        // Truncate to the 64-byte name buffer.
        const copy_len = @min(proc_name.len, 64);
        @memcpy(result.name[0..copy_len], proc_name[0..copy_len]);
        result.name_len = copy_len;
    }
    if (findKeyValue(status_text, "State:")) |state_text| {
        if (state_text.len > 0) result.state = state_text[0];
    }
    if (findKeyValue(status_text, "PPid:")) |ppid_text| {
        const trimmed = std.mem.trim(u8, ppid_text, " \t");
        result.ppid = std.fmt.parseInt(u32, trimmed, 10) catch 0;
    }
    if (findKeyValue(status_text, "Uid:")) |uid_text| {
        // The line carries several ids; only the first one is kept.
        var ids = std.mem.tokenizeAny(u8, uid_text, " \t");
        if (ids.next()) |first_id| {
            result.uid = std.fmt.parseInt(u32, first_id, 10) catch 0;
        }
    }
    result.rss_kb = parseKb(findKeyValue(status_text, "VmRSS:"));
    result.vsize_kb = parseKb(findKeyValue(status_text, "VmSize:"));
    if (findKeyValue(status_text, "Threads:")) |threads_text| {
        const trimmed = std.mem.trim(u8, threads_text, " \t");
        result.threads = std.fmt.parseInt(u32, trimmed, 10) catch 0;
    }

    // /proc/[pid]/stat for CPU times. Fields are split after the LAST ')' so a
    // process name containing spaces or parentheses cannot shift the positions.
    var stat_storage: [512]u8 = undefined;
    const stat_text = readProcFile(pid, "stat", &stat_storage) catch "";
    const close_paren = std.mem.lastIndexOfScalar(u8, stat_text, ')') orelse 0;
    if (close_paren + 2 < stat_text.len) {
        var fields = std.mem.tokenizeAny(u8, stat_text[close_paren + 2 ..], " ");
        var field_idx: u32 = 0;
        // Token 0 is the state, so stat field N lives at token index N - 3.
        while (field_idx <= 12) : (field_idx += 1) {
            const tok = fields.next() orelse break;
            if (field_idx == 11) {
                result.utime = std.fmt.parseInt(u64, tok, 10) catch 0; // field 14
            } else if (field_idx == 12) {
                result.stime = std.fmt.parseInt(u64, tok, 10) catch 0; // field 15
            }
        }
    }

    // /proc/[pid]/cmdline: arguments are NUL-separated; show them space-separated.
    var cmd_storage: [256]u8 = undefined;
    const raw_cmd = readProcFile(pid, "cmdline", &cmd_storage) catch "";
    for (raw_cmd, result.cmdline[0..raw_cmd.len]) |src, *dst| {
        dst.* = if (src == 0) ' ' else src;
    }
    result.cmdline_len = raw_cmd.len;

    return result;
}

We parse both /proc/[pid]/status (for the human-readable fields like Name, VmRSS, Threads) and /proc/[pid]/stat (for the CPU tick counters utime and stime). As we discussed last episode, the stat file has that nasty parenthesised-name gotcha where you need to find the LAST ) before splitting fields. The utime and stime values are in clock ticks -- on virtually every Linux system that's 100 ticks per second (the kernel constant USER_HZ).

Scanning all processes

To get a list of every running process, we iterate /proc/ and pick up every directory entry that's a valid number. Processes can vanish between the directory listing and reading their files, so every read has to handle errors gracefully:

/// Upper bound on the number of processes handled at once.
/// NOTE(review): presumably the capacity of the snapshot buffers passed to
/// `scanAllProcesses` — confirm at the use site.
pub const MAX_PROCS = 2048;

/// Fills `out` with one `ProcessInfo` per numeric entry of /proc and returns
/// the number of entries written (at most `out.len`).
///
/// Processes that cannot be read (already exited, permission denied) are
/// skipped. Errors from opening or iterating /proc itself are returned.
pub fn scanAllProcesses(out: []ProcessInfo) !usize {
    // This runs once per refresh, so the directory handle must be released
    // every time; otherwise each scan leaks a file descriptor.
    var proc_dir = try std.fs.openDirAbsolute("/proc", .{ .iterate = true });
    defer proc_dir.close();

    var it = proc_dir.iterate();
    var count: usize = 0;

    while (try it.next()) |entry| {
        if (count >= out.len) break;
        // only numeric directory names are PIDs
        if (entry.name.len == 0) continue;
        if (entry.name[0] < '1' or entry.name[0] > '9') continue;
        const pid = std.fmt.parseInt(u32, entry.name, 10) catch continue;

        // The process may vanish between the listing and the read; skip it.
        out[count] = readProcessInfo(pid) catch continue;
        count += 1;
    }

    return count;
}

That catch continue is doing a lot of heavy lifting here. A process might exit right after we see its directory entry but before we read its stat file. Or we might not have permissions to read a root-owned process. Either way, we skip it and move on. This is exactly the pattern that top, ps, and htop all use -- you can't avoid the race condition, you just handle it.

Computing CPU percentages

Here we go -- the part that confuses everyone the first time they build a process monitor. You CANNOT compute CPU usage from a single snapshot. The utime and stime values in /proc/[pid]/stat are cumulative counters -- they tell you how many clock ticks this process has consumed SINCE IT STARTED. To get a percentage, you need the delta between two measurements, divided by the elapsed wall clock time:

/// Sets `cpu_pct` on the first `current_count` entries of `current` from the
/// growth of utime + stime since the matching PID in `previous`.
///
/// `elapsed_ns` is the time between the two snapshots. Nothing is modified
/// when it is under 1 ms. PIDs absent from `previous` get 0.0.
pub fn computeCpuPercentages(
    current: []ProcessInfo,
    current_count: usize,
    previous: []const ProcessInfo,
    previous_count: usize,
    elapsed_ns: u64,
) void {
    const elapsed_secs = @as(f64, @floatFromInt(elapsed_ns)) / 1_000_000_000.0;
    if (elapsed_secs < 0.001) return; // avoid division by zero

    const hz: f64 = 100.0; // USER_HZ, almost always 100 on Linux
    const older = previous[0..previous_count];

    for (current[0..current_count]) |*proc| {
        // Linear search for the same PID in the earlier snapshot.
        const baseline: ?*const ProcessInfo = for (older) |*candidate| {
            if (candidate.pid == proc.pid) break candidate;
        } else null;

        const prev = baseline orelse {
            proc.cpu_pct = 0.0;
            continue;
        };

        // A counter that appears to go backwards is treated as no progress.
        const user_delta = if (proc.utime >= prev.utime) proc.utime - prev.utime else 0;
        const sys_delta = if (proc.stime >= prev.stime) proc.stime - prev.stime else 0;
        const tick_delta = @as(f64, @floatFromInt(user_delta + sys_delta));

        // delta ticks / (elapsed seconds * ticks_per_second) * 100
        proc.cpu_pct = (tick_delta / (elapsed_secs * hz)) * 100.0;
    }
}

The linear search through the previous array to find matching PIDs is O(n*m) which sounds terrible, but for typical systems with a few hundred processes it's negligible compared to the I/O cost of reading /proc files. If you were monitoring a system with 50,000 containers though, you'd want a hash map lookup instead. Having said that, for our purposes this is perfectly fine.

Sorting by columns

Now for the interactive part. A real process monitor lets you sort by different columns -- CPU usage, memory, PID, name. Zig's std.mem.sortUnstable accepts a comparison function, so we can swap comparators based on user input:

/// Column that `sortProcesses` orders the process list by.
pub const SortColumn = enum {
    /// By `cpu_pct`, highest first.
    cpu,
    /// By `rss_kb`, highest first.
    memory,
    /// By `pid`, lowest first.
    pid,
    /// By name bytes, ascending.
    name,
};

/// Sorts the first `count` entries of `procs` in place by `col`.
///
/// CPU and memory sort descending, PID ascending, and name in ascending byte
/// order. The sort is unstable, so equal elements may change relative order.
pub fn sortProcesses(procs: []ProcessInfo, count: usize, col: SortColumn) void {
    // Named comparators; none needs context, hence the `void` first parameter.
    const Order = struct {
        fn cpuDescending(_: void, lhs: ProcessInfo, rhs: ProcessInfo) bool {
            return lhs.cpu_pct > rhs.cpu_pct;
        }
        fn memoryDescending(_: void, lhs: ProcessInfo, rhs: ProcessInfo) bool {
            return lhs.rss_kb > rhs.rss_kb;
        }
        fn pidAscending(_: void, lhs: ProcessInfo, rhs: ProcessInfo) bool {
            return lhs.pid < rhs.pid;
        }
        fn nameAscending(_: void, lhs: ProcessInfo, rhs: ProcessInfo) bool {
            const lhs_name = lhs.name[0..lhs.name_len];
            const rhs_name = rhs.name[0..rhs.name_len];
            return std.mem.order(u8, lhs_name, rhs_name) == .lt;
        }
    };

    const active = procs[0..count];
    switch (col) {
        .cpu => std.mem.sortUnstable(ProcessInfo, active, {}, Order.cpuDescending),
        .memory => std.mem.sortUnstable(ProcessInfo, active, {}, Order.memoryDescending),
        .pid => std.mem.sortUnstable(ProcessInfo, active, {}, Order.pidAscending),
        .name => std.mem.sortUnstable(ProcessInfo, active, {}, Order.nameAscending),
    }
}

The anonymous struct pattern for inline comparison functions is idiomatic Zig. Each comparator captures no state (the _: void context parameter) and uses sortUnstable rather than sortStable because we don't care about preserving the relative order of equal elements -- a process with 0% CPU is the same as another process with 0% CPU for our purposes.

CPU and memory sort descending (highest first, which is what you almost always want when monitoring) while PID sorts ascending (numerical order) and name sorts alphabetically. You could argue name should be case-insensitive but honestly, process names on Linux are almost universally lowercase anyway.

The terminal UI: ANSI escape codes

For the display we use ANSI escape sequences to clear the screen, position the cursor, and add a bit of color. We used ANSI codes back in the ECS terminal rendering episode (ep58), so this should look familiar:

// ANSI escape sequences (each starts with ESC, 0x1b) used when drawing the UI.
const ANSI_CLEAR = "\x1b[2J\x1b[H"; // clear screen + cursor to top-left
const ANSI_BOLD = "\x1b[1m"; // bold text
const ANSI_DIM = "\x1b[2m"; // dim text
const ANSI_GREEN = "\x1b[32m"; // green foreground
const ANSI_YELLOW = "\x1b[33m"; // yellow foreground
const ANSI_RED = "\x1b[31m"; // red foreground
const ANSI_CYAN = "\x1b[36m"; // cyan foreground
const ANSI_RESET = "\x1b[0m"; // reset all attributes
const ANSI_REVERSE = "\x1b[7m"; // reverse video

/// Picks the row color for a CPU percentage: red above 50, yellow above 10,
/// green otherwise.
fn cpuColor(pct: f64) []const u8 {
    return if (pct > 50.0)
        ANSI_RED
    else if (pct > 10.0)
        ANSI_YELLOW
    else
        ANSI_GREEN;
}

/// Maps a /proc state character to a short label for the STATE column.
/// Unknown characters yield "?". Every label fits the 8-character column.
fn stateName(state: u8) []const u8 {
    return switch (state) {
        'R' => "running",
        'S' => "sleeping",
        'D' => "disk",
        'Z' => "zombie",
        'T' => "stopped",
        't' => "traced",
        'X' => "dead",
        // Idle kernel threads report 'I' (Linux 4.14 onward); without this
        // case they would all show up as "?".
        'I' => "idle",
        else => "?",
    };
}

/// Clears the screen and draws one full frame to `writer`: a header bar, a
/// memory summary, the column headers, up to 30 process rows and a key legend.
///
/// - `procs` / `count`: the (already sorted) process list and its length.
/// - `sort_col`: shown in the header bar.
/// - `filter`: when non-null, only processes whose name contains it are drawn,
///   and the filter text is shown next to the memory summary.
/// - `total_mem_kb`: total memory in kB; available memory is read here via
///   `getAvailableMemKb`.
/// - `uptime_secs`: shown as days/hours/minutes in the header bar.
///
/// Returns any error reported by `writer`.
pub fn renderDisplay(
    writer: anytype,
    procs: []const ProcessInfo,
    count: usize,
    sort_col: SortColumn,
    filter: ?[]const u8,
    total_mem_kb: u64,
    uptime_secs: u64,
) !void {
    try writer.print("{s}", .{ANSI_CLEAR});

    // header bar
    const days = uptime_secs / 86400;
    const hours = (uptime_secs % 86400) / 3600;
    const mins = (uptime_secs % 3600) / 60;
    try writer.print("{s}{s} zigmon - {d} processes | up {d}d {d}h {d}m | sort: {s}{s}\n", .{
        ANSI_BOLD, ANSI_REVERSE,
        count, days, hours, mins, @tagName(sort_col),
        ANSI_RESET,
    });

    // memory summary
    // Saturating subtraction: if the total is unknown (0) or smaller than the
    // available figure, show 0 used instead of trapping on unsigned underflow.
    const used_kb = total_mem_kb -| getAvailableMemKb();
    const pct = if (total_mem_kb > 0) (used_kb * 100) / total_mem_kb else 0;
    // '%' needs no escaping in Zig format strings; "%%" would print two signs.
    try writer.print("{s}Mem: {d}MB / {d}MB ({d}%){s}", .{
        ANSI_DIM, used_kb / 1024, total_mem_kb / 1024, pct, ANSI_RESET,
    });
    if (filter) |f| {
        try writer.print("  filter: \"{s}\"", .{f});
    }
    try writer.print("\n\n", .{});

    // column headers
    try writer.print("{s}{s:<7} {s:<16} {s:<8} {s:>7} {s:>9} {s:>5} {s}{s}\n", .{
        ANSI_BOLD,
        "PID", "NAME", "STATE", "CPU%", "MEM(MB)", "THR", "CMDLINE",
        ANSI_RESET,
    });

    // process rows
    var rows: usize = 0;
    const max_rows: usize = 30;
    for (procs[0..count]) |p| {
        if (rows >= max_rows) break;

        // apply filter
        if (filter) |f| {
            const name_slice = p.name[0..p.name_len];
            if (std.mem.indexOf(u8, name_slice, f) == null) continue;
        }

        const color = cpuColor(p.cpu_pct);
        // Field widths match the header row above so the columns line up.
        try writer.print("{s}{d:<7} {s:<16} {s:<8} {d:>7.1} {d:>9} {d:>5} {s}{s}\n", .{
            color,
            p.pid,
            p.name[0..p.name_len],
            stateName(p.state),
            p.cpu_pct,
            p.rss_kb / 1024,
            p.threads,
            p.cmdline[0..@min(p.cmdline_len, 50)],
            ANSI_RESET,
        });
        rows += 1;
    }

    try writer.print("\n{s}Keys: c=CPU m=Mem p=PID n=Name f=Filter k=Kill q=Quit{s}\n", .{
        ANSI_DIM, ANSI_RESET,
    });
}

/// Returns the number on the `MemAvailable:` line of /proc/meminfo.
/// Returns 0 when the file cannot be opened or read, the line is missing, or
/// its value does not parse. Only the first 4096 bytes of the file are examined.
fn getAvailableMemKb() u64 {
    const meminfo = std.fs.openFileAbsolute("/proc/meminfo", .{}) catch return 0;
    defer meminfo.close();

    var storage: [4096]u8 = undefined;
    const len = meminfo.readAll(&storage) catch return 0;

    var lines = std.mem.splitScalar(u8, storage[0..len], '\n');
    while (lines.next()) |line| {
        if (!std.mem.startsWith(u8, line, "MemAvailable:")) continue;

        const colon_pos = std.mem.indexOfScalar(u8, line, ':') orelse return 0;
        const value = std.mem.trimLeft(u8, line[colon_pos + 1 ..], " \t");
        // The number ends at the first space; the unit follows it.
        const digits_end = std.mem.indexOfScalar(u8, value, ' ') orelse value.len;
        return std.fmt.parseInt(u64, value[0..digits_end], 10) catch 0;
    }
    return 0;
}

The color coding is simple but effective: green for low CPU, yellow for moderate, red for processes eating more than 50% CPU. The status bar at the top shows the sort column and any active filter, so you always know what you're looking at.

Filtering processes

The filter is a straightforward substring match on the process name. When the user presses 'f', we read a filter string and only display processes whose name contains that substring:

/// Copies into `out` every one of the first `count` entries of `procs` whose
/// name contains `filter`, preserving order, and returns how many were copied.
/// An empty `filter` matches everything. Copying stops once `out` is full.
pub fn applyFilter(
    procs: []const ProcessInfo,
    count: usize,
    filter: []const u8,
    out: []ProcessInfo,
) usize {
    var kept: usize = 0;
    for (procs[0..count]) |candidate| {
        if (kept >= out.len) break;

        const matches = filter.len == 0 or
            std.mem.indexOf(u8, candidate.name[0..candidate.name_len], filter) != null;
        if (!matches) continue;

        out[kept] = candidate;
        kept += 1;
    }
    return kept;
}

We do the filtering in the render function directly (shown above), but having it as a separate function is useful if you want to count filtered results or do further processing. A nice extension would be to also match against the cmdline string, not just the process name -- many processes run as python3 or node where the actual script name is only visible in cmdline.

Sending signals: the kill functionality

The whole point of a process monitor (as opposed to just ps) is that you can ACT on what you see. When you identify a runaway process, you want to kill it right there. In Zig we can send signals via the kill syscall:

/// Sends signal number `sig` to the single process `pid` via the kill syscall.
///
/// Errors: `error.PermissionDenied` (EPERM), `error.ProcessNotFound` (ESRCH,
/// or a `pid` that does not name one process), `error.SignalFailed` for any
/// other errno.
pub fn sendSignal(pid: u32, sig: u32) !void {
    // kill(2) gives non-positive pids group/broadcast meaning: 0 targets the
    // caller's own process group, and values above maxInt(pid_t) wrap to
    // negative pid_t (0xFFFF_FFFF becomes -1, i.e. every process we may signal).
    // Neither can be the PID of a single process, so reject them up front.
    if (pid == 0 or pid > std.math.maxInt(linux.pid_t)) return error.ProcessNotFound;

    const result = linux.syscall2(
        .kill,
        @as(usize, pid),
        @as(usize, sig),
    );
    // The raw syscall reports failure as a negated errno in the return value.
    const signed: isize = @bitCast(result);
    if (signed < 0) {
        const errno: u32 = @intCast(-signed);
        return switch (errno) {
            1 => error.PermissionDenied, // EPERM
            3 => error.ProcessNotFound, // ESRCH
            else => error.SignalFailed,
        };
    }
}

/// Asks `pid` to terminate by sending SIGTERM (15), which the target process
/// can catch and handle. Returns whatever error `sendSignal` reports.
pub fn killProcess(pid: u32) !void {
    const sigterm: u32 = 15;
    return sendSignal(pid, sigterm);
}

/// Terminates `pid` by sending SIGKILL (9), which cannot be caught or ignored.
/// Returns whatever error `sendSignal` reports.
pub fn forceKillProcess(pid: u32) !void {
    const sigkill: u32 = 9;
    return sendSignal(pid, sigkill);
}

We covered signal handling extensively in episode 67, so the numbers should look familiar. SIGTERM (15) is the polite "please shut down" signal that processes can catch and handle gracefully. SIGKILL (9) is the nuclear option -- the kernel terminates the process immediately, no cleanup, no signal handler, no mercy. A well-designed monitor tries SIGTERM first and only escalates to SIGKILL if the process doesn't die.

The permission check matters here: you can only kill processes owned by the same user (unless you're root). Trying to kill PID 1 (systemd/init) as a regular user will get you an EPERM error, which is exactly what should happen.

Terminal raw mode for key input

To read individual keypresses without waiting for Enter, we need to put the terminal into raw mode. This means disabling canonical mode (line buffering) and echo:

const posix = std.posix;

/// Terminal state captured by `enableRawMode` so that `disableRawMode` can
/// put the terminal back exactly as it was found.
const OrigTermios = struct {
    /// File descriptor the settings were read from (and are restored to).
    fd: posix.fd_t,
    /// The termios settings as they were before raw mode was applied.
    termios: posix.termios,
};

/// Switches stdin into a "raw-ish" mode suitable for single-key input and
/// returns the previous settings so they can be restored later with
/// `disableRawMode`.
///
/// Changes applied to a copy of the current settings:
///   - ICANON off: input is delivered without waiting for Enter
///   - ECHO off:   typed characters are not printed back
///   - VMIN = 0, VTIME = 1: read() returns as soon as a byte is available,
///     or with 0 bytes after one tenth of a second
///
/// Fails if the current settings cannot be read or the new ones cannot be
/// applied (for example when stdin is not a terminal).
pub fn enableRawMode() !OrigTermios {
    const stdin_fd = std.io.getStdIn().handle;
    const saved = try posix.tcgetattr(stdin_fd);

    var settings = saved;

    // Clear just the two local-mode flags we care about; everything else
    // stays as the terminal had it.
    settings.lflag.ICANON = false;
    settings.lflag.ECHO = false;

    // VTIME is expressed in tenths of a second, so 1 == 100ms.
    settings.cc[@intFromEnum(linux.V.MIN)] = 0;
    settings.cc[@intFromEnum(linux.V.TIME)] = 1;

    try posix.tcsetattr(stdin_fd, .FLUSH, settings);

    return OrigTermios{ .fd = stdin_fd, .termios = saved };
}

/// Restores the terminal settings captured in `saved`. This is best-effort:
/// it returns nothing and deliberately ignores a failure to restore, so it is
/// safe to call from a `defer`.
pub fn disableRawMode(saved: OrigTermios) void {
    const fd = saved.fd;
    const original = saved.termios;
    posix.tcsetattr(fd, .FLUSH, original) catch return;
}

Setting VMIN=0 and VTIME=1 means read() will return immediately if there's a character waiting, or after 100ms if there's nothing. This gives us non-blocking-ish key input without using threads or epoll. The 100ms is short enough that the UI feels responsive, and long enough that we're not busy-spinning.

Putting it all together: the main loop

Here's where everything comes together. The main loop takes snapshots, computes CPU percentages, sorts, renders, and handles keypress input:

/// Entry point of the monitor.
///
/// Reads the total memory once from /proc/meminfo, switches the terminal into
/// raw mode, and then loops: take a process snapshot, compute CPU percentages
/// against the previous snapshot, sort, render, and poll the keyboard for
/// roughly two seconds before the next refresh.
///
/// Keys: c/m/p/n pick the sort column, f toggles a name filter, k prompts for
/// a PID and sends it SIGTERM through `killProcess`, q clears the screen and
/// exits. The original terminal settings are restored on every exit path.
pub fn main() !void {
    const stdout = std.io.getStdOut().writer();
    const stdin = std.io.getStdIn();

    // Screen row on which the "Filter:" / "Kill PID:" prompts are drawn.
    const prompt_row: u32 = 36;

    // Total memory for the header display. Scoped in a block so the
    // /proc/meminfo handle is closed as soon as the value has been parsed
    // instead of staying open for the lifetime of the program.
    const total_mem_kb: u64 = blk: {
        var membuf: [4096]u8 = undefined;
        const memfile = try std.fs.openFileAbsolute("/proc/meminfo", .{});
        defer memfile.close();
        const memn = try memfile.readAll(&membuf);
        var mem_it = std.mem.splitScalar(u8, membuf[0..memn], '\n');
        while (mem_it.next()) |line| {
            if (!std.mem.startsWith(u8, line, "MemTotal:")) continue;
            const colon = std.mem.indexOfScalar(u8, line, ':') orelse break;
            const val = std.mem.trimLeft(u8, line[colon + 1 ..], " \t");
            var vit = std.mem.splitScalar(u8, val, ' ');
            break :blk std.fmt.parseInt(u64, vit.next() orelse "0", 10) catch 0;
        }
        break :blk 0;
    };

    // Enable raw mode for keypress reading; restored by the defer below.
    const saved_term = enableRawMode() catch {
        try stdout.print("Failed to set raw mode\n", .{});
        return;
    };
    defer disableRawMode(saved_term);

    var current: [MAX_PROCS]ProcessInfo = undefined;
    var previous: [MAX_PROCS]ProcessInfo = undefined;
    var current_count: usize = 0;
    var previous_count: usize = 0;
    var sort_col: SortColumn = .cpu;
    var filter_buf: [32]u8 = undefined;
    var filter_len: usize = 0;
    // Timer.start() can fail (no monotonic clock); propagate instead of
    // asserting it away. The defer above still restores the terminal.
    var timer = try std.time.Timer.start();

    // First snapshot (no CPU% yet).
    current_count = scanAllProcesses(&current) catch 0;

    while (true) {
        // Capture elapsed time since the last snapshot.
        const elapsed_ns = timer.read();
        timer.reset();

        // Keep the old snapshot around, then take a new one.
        @memcpy(previous[0..current_count], current[0..current_count]);
        previous_count = current_count;
        current_count = scanAllProcesses(&current) catch 0;

        // Compute CPU percentages from the deltas between the two snapshots.
        computeCpuPercentages(&current, current_count, &previous, previous_count, elapsed_ns);

        sortProcesses(&current, current_count, sort_col);

        // Uptime: the integer part of the first number in /proc/uptime.
        // Best-effort -- any failure leaves it at 0.
        var upbuf: [64]u8 = undefined;
        var uptime_secs: u64 = 0;
        if (std.fs.openFileAbsolute("/proc/uptime", .{})) |uf| {
            defer uf.close();
            const un = uf.readAll(&upbuf) catch 0;
            var uit = std.mem.splitScalar(u8, upbuf[0..un], '.');
            if (uit.next()) |s| uptime_secs = std.fmt.parseInt(u64, s, 10) catch 0;
        } else |_| {}

        // Render.
        const filter: ?[]const u8 = if (filter_len > 0) filter_buf[0..filter_len] else null;
        try renderDisplay(stdout, &current, current_count, sort_col, filter, total_mem_kb, uptime_secs);

        // Wait and check for input: up to 20 reads, each of which returns
        // after at most 100ms thanks to the VTIME setting in raw mode.
        var wait: u32 = 0;
        while (wait < 20) : (wait += 1) {
            var key_buf: [1]u8 = undefined;
            const nread = stdin.read(&key_buf) catch 0;
            if (nread == 0) continue;

            switch (key_buf[0]) {
                'q' => {
                    try stdout.print("{s}", .{ANSI_CLEAR});
                    return;
                },
                'c' => sort_col = .cpu,
                'm' => sort_col = .memory,
                'p' => sort_col = .pid,
                'n' => sort_col = .name,
                'f' => {
                    if (filter_len > 0) {
                        // Filter already active: 'f' toggles it off.
                        filter_len = 0;
                    } else {
                        // Read the filter string in cooked mode (crude --
                        // waits for Enter).
                        disableRawMode(saved_term);
                        try stdout.print("\x1b[{d};1HFilter: ", .{prompt_row});
                        var fbuf: [32]u8 = undefined;
                        const fr = std.io.getStdIn().reader();
                        const fline = fr.readUntilDelimiter(&fbuf, '\n') catch "";
                        if (fline.len > 0) {
                            @memcpy(filter_buf[0..fline.len], fline);
                            filter_len = fline.len;
                        }
                        // The state returned here is discarded on purpose:
                        // saved_term already holds the settings to restore.
                        _ = enableRawMode() catch {};
                    }
                    break; // refresh immediately
                },
                'k' => {
                    // Kill mode: read a PID in cooked mode.
                    disableRawMode(saved_term);
                    try stdout.print("\x1b[{d};1HKill PID: ", .{prompt_row});
                    var kbuf: [16]u8 = undefined;
                    const kr = std.io.getStdIn().reader();
                    const kline = kr.readUntilDelimiter(&kbuf, '\n') catch "";
                    if (std.fmt.parseInt(u32, std.mem.trim(u8, kline, " \n"), 10)) |target_pid| {
                        // kill(2) treats pid 0 as "my whole process group" and
                        // a value that wraps to -1 as "every process I may
                        // signal". Never forward those from a typed prompt.
                        if (target_pid == 0 or target_pid > std.math.maxInt(std.posix.pid_t)) {
                            try stdout.print("Kill failed: invalid PID\n", .{});
                            std.time.sleep(1_000_000_000);
                        } else {
                            killProcess(target_pid) catch |err| {
                                try stdout.print("Kill failed: {s}\n", .{@errorName(err)});
                                // Leave the message on screen for a second
                                // before the next redraw clears it.
                                std.time.sleep(1_000_000_000);
                            };
                        }
                    } else |_| {}
                    // Discarded on purpose; see the filter branch.
                    _ = enableRawMode() catch {};
                    break; // refresh immediately
                },
                else => {},
            }
        }
    }
}

The 2-second refresh interval (20 iterations x 100ms VTIME timeout) is the same default as top. Every cycle we take a new snapshot of all processes, compute the CPU deltas against the previous snapshot, sort, render, and poll for key input. The computeCpuPercentages function is what turns raw tick counters into meaningful percentages.

The filter and kill modes temporarily drop out of raw mode so we can read a full line of input (the PID or filter string). It's a bit crude -- a fancier implementation would handle character-by-character input with backspace support -- but it works.

Running it

Compile and run:

$ zig build-exe zigmon.zig
$ ./zigmon

You'll see something like:

 zigmon - 247 processes | up 12d 4h 33m | sort: cpu
Mem: 6847MB / 15887MB (43%)

PID     NAME             STATE      CPU%   MEM(MB)   THR CMDLINE
1423    firefox          running    12.3       892     4  /usr/lib/firefox/firefox
2891    node             sleeping    4.1       345     2  node /home/user/project/server.js
1       systemd          sleeping    0.0        12     1  /sbin/init
...

Keys: c=CPU m=Mem p=PID n=Name f=Filter k=Kill q=Quit

Press 'm' to sort by memory, 'c' for CPU, 'f' to filter by process name, 'k' to kill a process by PID, 'q' to quit.

Design decisions and tradeoffs

A few things I want to call out about the design choices we made here:

Polling vs event-driven: We poll /proc every 2 seconds. An alternative would be to use netlink process events (PROC_EVENT_FORK, PROC_EVENT_EXEC, PROC_EVENT_EXIT) to get notified when processes start and stop. That would be more efficient but adds significant complexity (netlink socket setup, parsing multipart messages, handling batched events). For a monitoring tool where 2-second granularity is fine, polling is the pragmatic choice.

Fixed-size arrays vs dynamic allocation: We use [MAX_PROCS]ProcessInfo on the stack. Each ProcessInfo is roughly 430 bytes, so 2048 of them is about 860KB per array. Note that main keeps two such arrays (current and previous), so the total is roughly 1.7MB -- still well within the usual 8MB default stack limit on Linux. This avoids any heap allocation in the hot path. On a system with more than 2048 processes you'd want to switch to an ArrayList, but that's genuinely rare outside of container orchestration hosts.

The O(n*m) CPU delta lookup: Finding the matching PID in the previous snapshot is a linear scan. For 500 processes that's 250,000 comparisons worst case. On modern hardware that's well under a millisecond. The /proc reads themselves take milliseconds, so optimizing the lookup would be pointless -- profile before you optimize ;-)

String comparisons for filtering: We do a std.mem.indexOf substring search. This is case-sensitive, which matches how process names actually work on Linux (everything is case-sensitive). If you wanted case-insensitive matching you'd need to lowercase both strings first. But then you'd also want to ask whether that's actually useful -- I can't think of a Linux process name where case sensitivity matters for filtering.

The techniques in this episode brought together file I/O from episode 10, terminal control from the ECS engine in episode 58, process management from episode 64, signal handling from episode 67, resource limits from episode 71, and /proc parsing from episode 75. That's quite some ground we've covered in this systems programming arc. Coming up next, we'll shift gears and start building something that works across machines -- file synchronization over the network.

Bedankt en tot de volgende keer!

@scipio



0
0
0.000
0 comments