Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Using ptrace to track all execve() calls across children

I am trying to write a tool on Linux CentOS to track all spawned processes and what is run. In essence, I'm interested in walking all fork/clones and emitting all the command-lines from execve(). Strace already does (some of) this, but it also truncates the calls and the arguments. I also wanted to better understand how ptrace() works.

So, the first roadblock was figuring out how to use ptrace() to follow a fork/clone without requiring the tracing program to fork a copy of itself. I dug in and found out how strace does this. Since fork is implemented with clone on Linux, I noticed that strace pounds some bits into the clone syscall to enable child tracing without any extra headache.

So, in essence the code is just a big:

while (1) {
    int pid = wait3(-1,...);

    /* process what happened */

    ptrace(PTRACE_SYSCALL, pid,...);
}

This works fine for relatively simple processes like /bin/sh; however, some processes cause the wait() to hang indefinitely. The only thing I've been able to determine is that the process I'm tracing is performing a sys_rt_sigsuspend() while waiting on its child (so, the tracer's grandchild) and then things wedge.

I was curious whether there's a sane way I can debug what might be happening. Something is clearly preventing the process tree from making forward progress.

Here's the source code of the program in question:

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <string.h>
#include <stdlib.h>

/* For the clone flags
 */
#include <sched.h>

#include <errno.h>

#include <sys/ptrace.h>
#include <sys/user.h>

/* Defines our syscalls like 
 */
#include <sys/syscall.h>

#include <sys/reg.h>
#include <stdio.h>

#include <signal.h>

#include <ctype.h>

#include <map>

using namespace std;

/* Scratch buffer shared by the tracer: main() passes it to get_string() to
 * hold each string fetched from a tracee before printing it.
 */
char bufstr[4096];

/* Register slot names for the accumulator (syscall return value) and the
 * first two syscall arguments.  main() multiplies them by
 * sizeof(unsigned long) to form PTRACE_PEEKUSER / PTRACE_POKEUSER offsets.
 * The RAX/RDI/... constants are presumably the indices from <sys/reg.h>.
 */
#ifdef __x86_64__
#define REG_ACC  RAX
#define REG_ARG1 RDI
#define REG_ARG2 RSI
#else
#define REG_ACC  EAX
#define REG_ARG1 EBX
#define REG_ARG2 ECX
#endif

/* Trace control structure per PID that we're tracking
 */
/* Per-tracee bookkeeping record, stored in a map keyed by pid.
 *
 *   pid_      - process id this record describes (-1 when unset).
 *   entering_ - non-zero when the next syscall stop for this pid is expected
 *               to be a syscall entry, zero when it is expected to be the
 *               matching exit.
 */
class tcb {
    int pid_;
    int entering_;

    public:

    /* Record for a known pid; a fresh tracee is about to enter a syscall
     * unless the caller says otherwise.
     */
    tcb(int pid, int entering = 1) : pid_(pid), entering_(entering) {}

    /* Default record (what map::operator[] creates).  Both members are
     * initialised so that reading entering() on a freshly inserted entry is
     * well defined.
     */
    tcb() : pid_(-1), entering_(1) {}

    /* Mutable accessors: callers read and assign through the reference. */
    int&       pid() { return      pid_; }
    int&  entering() { return entering_; }
};

/* Fetch a string from process (pid) at location (ptr).  Buf is the place
 * to store the data with size limit (size).  Return the number of bytes
 * copied.
 */
/* Copy a NUL-terminated string out of a traced process.
 *
 *   pid  - tracee to read from (must be in a ptrace stop).
 *   ptr  - address of the string in the tracee's address space.
 *   buf  - destination buffer in the tracer.
 *   size - total capacity of buf in bytes, including room for the NUL.
 *
 * At most size - 1 characters are copied and buf is always NUL-terminated
 * within its size bytes.  Copying stops at the first NUL byte, when the
 * buffer is full, or when the tracee's memory cannot be read.
 *
 * Returns the number of characters stored, not counting the terminating
 * NUL.  Returns 0 without touching buf when buf is NULL or size <= 0.
 */
int get_string(int pid, long ptr, char *buf, int size)
{
    if (buf == NULL || size <= 0)
        return 0;

    /* Reserve the last byte for the terminator so we never write buf[size]. */
    const int limit = size - 1;
    int copied = 0;

    while (copied < limit) {
        /* PEEKTEXT returns the word itself, so -1 is a legal value; the only
         * way to detect failure is to clear errno first and check it after.
         */
        errno = 0;
        const long word = ptrace(PTRACE_PEEKTEXT, pid, (void *) ptr, 0);
        if (word == -1 && errno != 0)
            break;

        char bytes[sizeof(word)];
        memcpy(bytes, &word, sizeof(word));

        for (size_t i = 0; i < sizeof(bytes) && copied < limit; i++) {
            if (bytes[i] == '\0') {
                buf[copied] = '\0';
                return copied;
            }
            buf[copied++] = bytes[i];
        }
        ptr += sizeof(word);
    }

    buf[copied] = '\0';

    return copied;
}

int main(int argc, char *argv[])
{
    int status = 0;
    long scno = 0;
    // int entering = 1;
    struct user_regs_struct regs;
    map<int, tcb> pidTable;
    struct sigaction sa;


    /* Setup 
     */


    int pid = fork();



    if (!pid && argc) {
        if (ptrace(PTRACE_TRACEME, 0, 0, 0) < 0) {
            perror("ptrace(PTRACE_ME,... ");

            exit(1);
        }
        execvp(argv[1], &argv[1]);
    } else {
        sa.sa_flags = 0;
        sa.sa_handler = SIG_DFL;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGCHLD, &sa, NULL);

        waitpid(pid, &status, 0);

        pidTable[pid] = tcb(pid);

        fprintf(stderr, "pid is %d\n", pidTable[pid].pid());

        while (!pidTable.empty()) {
            if (pid > 0) {
                //fprintf(stderr, "%d: Restarting %d\n", getpid(), pid);
                if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) {
                    perror("ptrace(PTRACE_SYSCALL,...");
                    exit(1);
                }
            }

            // waitpid(pid, &status, 0);
            // pid = waitpid(-1, &status, 0);
            pid = wait3(&status, __WALL, 0);

            // fprintf(stderr, "Pid from wait is %d\n", pid);

            if (pid < 0) {
                perror("waitpid");
                break;
            } else {

                /* fprintf(stderr, "%d: Status is: ", pid); */

                /*
                if (WIFEXITED(status)) {
                    fprintf(stderr, "exited");
                } else if (WIFSIGNALED(status)) {
                    fprintf(stderr, "exited");
                } else if (WIFSTOPPED(status), "stopped") {
                    fprintf(stderr, "stopped");
                } else if (WIFCONTINUED(status)) {
                    fprintf(stderr, "continued");
                }
                fprintf(stderr, "\n");
                */

                if (WIFEXITED(status) || WIFSIGNALED(status)) {
                    /* Probably empty the table here */
                    pidTable.erase(pid);

                    fprintf(stderr, "Detect process term/kill %d\n", pid);

                    /* if (ptrace(PTRACE_DETACH, pid, 0, 0) < 0) {
                        perror("ptrace");
                    } */

                    pid = -1;

                    continue;
                }
            }

            ptrace(PTRACE_GETREGS, pid, 0, &regs);

#ifdef __x86_64__
            scno = regs.orig_rax;
#else
            scno = regs.orig_eax;
#endif /* __x86_64__ */

            if (scno == SYS_execve) {
                fprintf(stderr, "%d: Exec branch\n", pid);
                if (pidTable[pid].entering()) {
                    long ldata, ptr, ptr1;

                    ptrace(PTRACE_GETREGS, pid, 0, &regs);

                    #ifdef __x86_64__
                    ptr = regs.rdi;
                    #else
                    ptr = regs.ebx;
                    #endif /* __x86_64__ */

                    fprintf(stderr, "%d: exec(", pid);

                    if (ptr) {
                        get_string(pid, ptr, bufstr, sizeof(bufstr));

                        fprintf(stderr, "%s", bufstr);

                    } 

                    #ifdef __x86_64__
                    ptr1 = regs.rsi;
                    #else
                    ptr1 = regs.ecx;
                    #endif /* __x86_64__ */


                    for (; ptr1; ptr1 += sizeof(unsigned long)) {
                        ptr = ptr1;
                        /* Indirect through ptr since we have char *argv[] */
                        ptr = ptrace(PTRACE_PEEKTEXT, pid, (void *) ptr, 0);

                        if (!ptr)
                            break;

                        get_string(pid, ptr, bufstr, sizeof(bufstr));
                        fprintf(stderr, ", %s", bufstr);
                    }
                    fprintf(stderr, ")\n");

                    pidTable[pid].entering() = 0;
                }
                else {
                    long acc = ptrace(PTRACE_PEEKUSER, pid, sizeof(unsigned long) * REG_ACC, 0);
                    pidTable[pid].entering() = 1;
                    fprintf(stderr, "%d: Leaving exec: eax is %ld\n", pid, acc);
                }
            } else if (scno == SYS_fork || scno == SYS_clone) {
                fprintf(stderr, "%d: fork/clone branch\n", pid);
                if (pidTable[pid].entering()) {
                    long flags = ptrace(PTRACE_PEEKUSER, pid, (sizeof(unsigned long) * REG_ARG1), 0);

                    fprintf(stderr, "%d: Entering fork/clone\n", pid);
                    pidTable[pid].entering() = 0;

                    if (ptrace(PTRACE_POKEUSER, pid, (sizeof(unsigned long) * REG_ARG1), flags | CLONE_PTRACE & 
                                                                                         ~(flags & CLONE_VFORK ? 
                                                                                         CLONE_VFORK | CLONE_VM : 0)) < 0) {
                        perror("ptrace");
                    }

                    if (ptrace(PTRACE_POKEUSER, pid, (sizeof(unsigned long) * REG_ARG2), 0) < 0) {
                        perror("ptrace");
                    }

                } else {
                    // int child;

                    ptrace(PTRACE_GETREGS, pid, 0, &regs);

                    #ifdef __x86_64__
                    fprintf(stderr, "%d: Leaving fork/clone: rax = %ld\n", pid, regs.rax);
                    #else
                    fprintf(stderr, "%d: Leaving fork/clone: eax = %ld\n", pid, regs.eax);
                    #endif

                    pidTable[pid].entering() = 1;

                    #ifdef __x86_64__
                    if (regs.rax <= 0) {
                    #else
                    if (regs.eax <= 0) {
                    #endif
                        continue;
                    }

                    #ifdef __x86_64__
                    int newpid = regs.rax;
                    #else
                    int newpid = regs.eax;
                    #endif
                    pidTable[newpid] = tcb(newpid, 0);
                    //pidTable[newpid] = tcb(newpid, 1);
                    //pidTable[newpid] = pidTable[pid];
                    fprintf(stderr, "%d: forked child is %d\n", pid, newpid);
                }
            } else if (scno == SYS_exit) {
                fprintf(stderr, "%d: exit syscall detected\n", pid);
            } else if (scno < 0) {
                fprintf(stderr, "Negative syscall number for %d\n", pid);
                exit(1);
            } else {
                fprintf(stderr, "%d: Scno is %ld\n", pid, scno);
            }
        }
    }
    return 0;
}
like image 886
Clint O Avatar asked Mar 15 '11 19:03

Clint O


People also ask

What is the ptrace system call?

The ptrace() system call provides a means by which one process (the "tracer") may observe and control the execution of another process (the "tracee"), and examine and change the tracee's memory and registers. It is primarily used to implement breakpoint debugging and system call tracing.

How does ptrace work?

The ptrace system call allows the parent process to inspect the attached child. For example, in Linux, strace (which is implemented with the ptrace system call) can inspect the system calls invoked by the child process. When the attached child process invokes a system call, the ptracing parent process can be notified.

Why ptrace is used?

The ptrace() system call provides a means by which a parent process may observe and control the execution of another process, and examine and change its core image and registers. It is primarily used to implement breakpoint debugging and system call tracing.

How is strace implemented?

strace is implemented primarily by relying on ptrace . ptrace internals are a bit tricky, as execution is transferred between a set of files, but the implementation itself is relatively straight forward.


2 Answers

By the way. strace -f -s99999 -e trace=clone,execve appears to give good-quality results. To see a trace of strace's own actions, you might try systemtap, ie.

# stap -e 'probe syscall.ptrace {if (execname()=="strace") log(argstr)}' -c 'strace COMMAND'

(Current systemtap doesn't pretty-print the ptrace arguments quite right.)

Or you can strace strace:

strace -e trace=ptrace strace -f -s99999 -e trace=clone,execve COMMAND

like image 145
fche Avatar answered Oct 14 '22 00:10

fche


The ptrace PTRACE_SETOPTIONS request accepts the flags PTRACE_O_TRACEFORK, PTRACE_O_TRACEEXEC, and PTRACE_O_TRACEEXIT. See the ptrace man page for more.

like image 3
osgx Avatar answered Oct 13 '22 23:10

osgx