13

在处理这个问题时,我想到了一个可能的方案——使用 ptrace,但我无法正确理解 ptrace 是如何与线程交互的。

假设我有一个给定的多线程主进程,并且我想附加到其中的一个特定线程(可能来自一个分叉的子进程)。

  1. 我可以附加到特定线程吗?(手册在这个问题上存在分歧。)

  2. 如果是这样,这是否意味着单步执行只执行该线程的指令?它会停止所有进程的线程吗?

  3. 如果是这样,在我调用 PTRACE_SYSCALL 或 PTRACE_SINGLESTEP 时,所有其他线程是否保持停止,还是所有线程都会继续?有没有办法只让一个线程前进,同时保证其他线程保持停止?

基本上,我想通过强制所有线程停止来同步原始程序,然后通过单步执行一个跟踪的线程只执行一小组单线程指令。

到目前为止,我的个人尝试看起来有点像这样:

/* The thread that wants to run important_instruction() "alone". */
pid_t target = syscall(SYS_gettid);   // get the calling thread's ID
pid_t pid = fork();

if (pid > 0)
{
    waitpid(pid, NULL, 0);            // synchronise main process

    important_instruction();
}
else if (pid == 0)
{
    /* ptrace() takes the request first and the thread ID second. */
    ptrace(PTRACE_ATTACH, target, NULL, NULL);    // does this work?

    // cancel parent's "waitpid" call, e.g. with a signal

    // single-step to execute "important_instruction()" above

    ptrace(PTRACE_DETACH, target, NULL, NULL);    // parent's threads resume?

    _Exit(0);
}

但是,我不确定(也找不到合适的参考资料)这在并发上是否正确,以及 important_instruction() 是否保证只在所有其他线程都停止时才会执行。我也明白,当父进程收到来自其他地方的信号时,可能会出现竞争条件;我听说应该改用 PTRACE_SEIZE,但似乎在任何地方都找不到它。

任何澄清或参考将不胜感激!

4

4 回答 4

27

我写了第二个测试用例。我不得不添加一个单独的答案,因为包含示例输出的第一个答案太长了。

首先,这里是tracer.c

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <sys/user.h>
#include <dirent.h>
#include <string.h>
#include <signal.h>
#include <errno.h>
#include <stdio.h>
#ifndef   SINGLESTEPS
#define   SINGLESTEPS 10
#endif

/* Collects the task (thread) IDs of process pid from /proc/<pid>/task/.
 *
 * Works getline()-style: *listptr and *sizeptr describe a caller-owned
 * buffer that is reused and grown with realloc() as needed. A *sizeptr
 * of zero means "no buffer yet" and resets *listptr to NULL.
 *
 * On success the list is terminated with (pid_t)0, errno is cleared and
 * the (positive) number of TIDs is returned. On failure 0 is returned
 * with errno set. */
size_t get_tids(pid_t **const listptr, size_t *const sizeptr, const pid_t pid)
{
    char     path[64];
    DIR     *taskdir;
    pid_t   *tids;
    size_t   capacity;
    size_t   count = 0;
    int      len;

    if (listptr == NULL || sizeptr == NULL || pid < (pid_t)1) {
        errno = EINVAL;
        return 0;
    }

    /* Without a recorded size there is no buffer to reuse. */
    if (*sizeptr == 0)
        *listptr = NULL;
    tids = *listptr;
    capacity = *sizeptr;

    len = snprintf(path, sizeof path, "/proc/%d/task/", (int)pid);
    if (len >= (int)sizeof path) {
        errno = ENOTSUP;
        return 0;
    }

    taskdir = opendir(path);
    if (taskdir == NULL) {
        errno = ESRCH;
        return 0;
    }

    for (;;) {
        struct dirent *entry;
        int            number;
        char           trailing;

        /* readdir() returns NULL both at the end and on error;
         * only errno tells the two apart. */
        errno = 0;
        entry = readdir(taskdir);
        if (entry == NULL)
            break;

        /* Keep only names that are entirely one positive decimal number. */
        if (sscanf(entry->d_name, "%d%c", &number, &trailing) != 1 || number < 1)
            continue;

        /* Grow the buffer before storing, publishing it to the caller. */
        if (count >= capacity) {
            capacity = (count | 127) + 128;
            tids = realloc(tids, capacity * sizeof tids[0]);
            if (tids == NULL) {
                closedir(taskdir);
                errno = ENOMEM;
                return 0;
            }
            *listptr = tids;
            *sizeptr = capacity;
        }

        tids[count] = (pid_t)number;
        count++;
    }

    if (errno != 0) {
        /* closedir() may clobber the readdir() error; preserve it. */
        const int readdir_errno = errno;
        closedir(taskdir);
        errno = readdir_errno;
        return 0;
    }

    if (closedir(taskdir) != 0) {
        errno = EIO;
        return 0;
    }

    if (count < 1) {
        errno = ESRCH;
        return 0;
    }

    /* One more slot is needed for the terminating (pid_t)0. */
    if (count >= capacity) {
        capacity = count + 1;
        tids = realloc(tids, capacity * sizeof tids[0]);
        if (tids == NULL) {
            errno = ENOMEM;
            return 0;
        }
        *listptr = tids;
        *sizeptr = capacity;
    }

    tids[count] = (pid_t)0;
    errno = 0;
    return count;
}


/* Waits for the next stop, continue or termination event of child pid,
 * restarting the wait whenever it is interrupted by a signal.
 *
 * On success stores the wait status in *statusptr (if non-NULL), clears
 * errno and returns 0. If waitpid() does not report pid, sets errno to
 * ESRCH and returns ESRCH. */
static int wait_process(const pid_t pid, int *const statusptr)
{
    pid_t reaped;
    int   wstatus;

    for (;;) {
        wstatus = 0;
        reaped = waitpid(pid, &wstatus, WUNTRACED | WCONTINUED);
        if (reaped != (pid_t)-1 || errno != EINTR)
            break;
    }

    if (reaped != pid) {
        errno = ESRCH;
        return ESRCH;
    }

    if (statusptr != NULL)
        *statusptr = wstatus;

    errno = 0;
    return 0;
}

static int continue_process(const pid_t pid, int *const statusptr)
{
    int   status;
    pid_t p;

    do {

        if (kill(pid, SIGCONT) == -1)
            return errno = ESRCH;

        do {
            status = 0;
            p = waitpid(pid, &status, WUNTRACED | WCONTINUED);
        } while (p == (pid_t)-1 && errno == EINTR);

        if (p != pid)
            return errno = ESRCH;

    } while (WIFSTOPPED(status));

    if (statusptr)
        *statusptr = status;

    return errno = 0;
}

/* Prints the instruction pointer and stack pointer of task tid to out,
 * followed by note when note is a non-empty string.
 *
 * PTRACE_GETREGS is retried for as long as it fails with ESRCH, which is
 * how this program waits for a tracee that has not stopped yet. Note that
 * a task that no longer exists also reports ESRCH, in which case the loop
 * never ends. Any other failure makes the function return silently.
 *
 * Only x86 targets are supported; elsewhere nothing is printed. */
void show_registers(FILE *const out, pid_t tid, const char *const note)
{
    struct user_regs_struct regs;
    long                    r;

    do {
        r = ptrace(PTRACE_GETREGS, tid, &regs, &regs);
    } while (r == -1L && errno == ESRCH);
    if (r == -1L)
        return;

    /* The register fields are not 'unsigned long' on every glibc version
     * (unsigned long long on x86-64, long on i386), so cast them to the
     * exact type that %lx expects. */
#if (defined(__x86_64__) || defined(__i386__)) && __WORDSIZE == 64
    if (note && *note)
        fprintf(out, "Task %d: RIP=0x%016lx, RSP=0x%016lx. %s\n", (int)tid,
                (unsigned long)regs.rip, (unsigned long)regs.rsp, note);
    else
        fprintf(out, "Task %d: RIP=0x%016lx, RSP=0x%016lx.\n", (int)tid,
                (unsigned long)regs.rip, (unsigned long)regs.rsp);
#elif (defined(__x86_64__) || defined(__i386__)) && __WORDSIZE == 32
    if (note && *note)
        fprintf(out, "Task %d: EIP=0x%08lx, ESP=0x%08lx. %s\n", (int)tid,
                (unsigned long)regs.eip, (unsigned long)regs.esp, note);
    else
        fprintf(out, "Task %d: EIP=0x%08lx, ESP=0x%08lx.\n", (int)tid,
                (unsigned long)regs.eip, (unsigned long)regs.esp);
#else
    /* Unsupported architecture: nothing to print. */
    (void)out;
    (void)note;
#endif
}


/* Tracer entry point.
 *
 * Runs COMMAND (argv[1..]) in a child process and waits for events on it.
 * Each time the child is reported as stopped, every task of the child is
 * ptrace-attached, the registers of all tasks are printed, the last task
 * in the list is single-stepped SINGLESTEPS times (printing all registers
 * after each step), and finally all tasks are detached and the child is
 * sent SIGCONT.
 *
 * Returns 1 for usage errors, a failed fork() or a lost child; otherwise
 * the child's own exit status, or 128 + signal number if it was killed
 * by a signal. */
int main(int argc, char *argv[])
{
    pid_t *tid = 0;
    size_t tids = 0;
    size_t tids_max = 0;
    size_t t, s;
    long   r;

    pid_t child;
    int   status = 0;   /* stays 0 if the very first wait already fails */
    int   exitcode;

    if (argc < 2 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
        fprintf(stderr, "       %s COMMAND [ ARGS ... ]\n", argv[0]);
        fprintf(stderr, "\n");
        fprintf(stderr, "This program executes COMMAND in a child process,\n");
        fprintf(stderr, "and waits for it to stop (via a SIGSTOP signal).\n");
        fprintf(stderr, "When that occurs, the register state of each thread\n");
        fprintf(stderr, "is dumped to standard output, then the child process\n");
        fprintf(stderr, "is sent a SIGCONT signal.\n");
        fprintf(stderr, "\n");
        return 1;
    }

    child = fork();
    if (child == (pid_t)-1) {
        fprintf(stderr, "fork() failed: %s.\n", strerror(errno));
        return 1;
    }

    if (!child) {
        /* Child: allow the parent to ptrace us, then become COMMAND. */
        prctl(PR_SET_DUMPABLE, (long)1);
        prctl(PR_SET_PTRACER, (long)getppid());
        fflush(stdout);
        fflush(stderr);
        execvp(argv[1], argv + 1);
        fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
        return 127;
    }

    fprintf(stderr, "Tracer: Waiting for child (pid %d) events.\n\n", (int)child);
    fflush(stderr);

    while (1) {

        /* Wait for a child event. */
        if (wait_process(child, &status))
            break;

        /* Exited? */
        if (WIFEXITED(status) || WIFSIGNALED(status)) {
            errno = 0;
            break;
        }

        /* At this point, only stopped events are interesting. */
        if (!WIFSTOPPED(status))
            continue;

        /* Obtain task IDs. */
        tids = get_tids(&tid, &tids_max, child);
        if (!tids)
            break;

        printf("Process %d has %d tasks,", (int)child, (int)tids);
        fflush(stdout);

        /* Attach to all tasks. If any attach fails for a reason that is
         * not retried, undo the attaches made so far and remember why. */
        for (t = 0; t < tids; t++) {
            do {
                r = ptrace(PTRACE_ATTACH, tid[t], (void *)0, (void *)0);
            } while (r == -1L && (errno == EBUSY || errno == EFAULT || errno == ESRCH));
            if (r == -1L) {
                const int saved_errno = errno;
                while (t-->0)
                    do {
                        r = ptrace(PTRACE_DETACH, tid[t], (void *)0, (void *)0);
                    } while (r == -1L && (errno == EBUSY || errno == EFAULT || errno == ESRCH));
                tids = 0;
                errno = saved_errno;
                break;
            }
        }
        if (!tids) {
            /* Attaching failed: let the child run again and keep waiting. */
            const int saved_errno = errno;
            if (continue_process(child, &status))
                break;
            printf(" failed to attach (%s).\n", strerror(saved_errno));
            fflush(stdout);
            if (WIFCONTINUED(status))
                continue;
            errno = 0;
            break;
        }

        printf(" attached to all.\n\n");
        fflush(stdout);

        /* Dump the registers of each task. */
        for (t = 0; t < tids; t++)
            show_registers(stdout, tid[t], "");
        printf("\n");
        fflush(stdout);

        /* Single-step only the last task; the others stay attached and
         * are merely re-displayed after every step. */
        for (s = 0; s < SINGLESTEPS; s++) {
            do {
                r = ptrace(PTRACE_SINGLESTEP, tid[tids-1], (void *)0, (void *)0);
            } while (r == -1L && errno == ESRCH);
            if (!r) {
                for (t = 0; t < tids - 1; t++)
                    show_registers(stdout, tid[t], "");
                show_registers(stdout, tid[tids-1], "Advanced by one step.");
                printf("\n");
                fflush(stdout);
            } else {
                fprintf(stderr, "Single-step failed: %s.\n", strerror(errno));
                fflush(stderr);
            }
        }

        /* Detach from all tasks. */
        for (t = 0; t < tids; t++)
            do {
                r = ptrace(PTRACE_DETACH, tid[t], (void *)0, (void *)0);
            } while (r == -1 && (errno == EBUSY || errno == EFAULT || errno == ESRCH));
        tids = 0;
        if (continue_process(child, &status))
            break;
        if (WIFCONTINUED(status)) {
            printf("Detached. Waiting for new stop events.\n\n");
            fflush(stdout);
            continue;
        }
        errno = 0;
        break;
    }

    /* Report how the loop ended and turn it into a proper exit code;
     * a raw wait status is not one (an exit status of 1 is encoded as
     * 256, which the shell would see as 0). */
    if (errno) {
        fprintf(stderr, "Tracer: Child lost (%s)\n", strerror(errno));
        exitcode = 1;
    } else if (WIFEXITED(status)) {
        fprintf(stderr, "Tracer: Child exited (%d)\n", WEXITSTATUS(status));
        exitcode = WEXITSTATUS(status);
    } else if (WIFSIGNALED(status)) {
        fprintf(stderr, "Tracer: Child died from signal %d\n", WTERMSIG(status));
        exitcode = 128 + WTERMSIG(status);
    } else {
        fprintf(stderr, "Tracer: Child vanished\n");
        exitcode = 1;
    }
    fflush(stderr);

    free(tid);
    return exitcode;
}

tracer.c执行指定的命令,等待命令接收到SIGSTOP信号。(tracer.c不自己发送;您可以让被跟踪者自行停止,或从外部发送信号。)

当命令停止时,tracer.c将 ptrace 附加到每个线程,并将其中一个线程单步执行固定数量的步骤(SINGLESTEPS编译时常量),显示每个线程的相关寄存器状态。

之后,它从命令中分离出来,并向它发送一个SIGCONT信号,让它继续正常运行。

这是一个简单的测试程序worker.c,我用于测试:

#include <pthread.h>
#include <signal.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

#ifndef   THREADS
#define   THREADS  2
#endif

/* Zero while the workers should keep running; set by catch_done() to
 * the number of the signal that requested the shutdown. */
volatile sig_atomic_t   done = 0;

/* Signal handler: records which signal arrived so that the worker loops
 * notice it and finish. */
void catch_done(int signum)
{
    done = (sig_atomic_t)signum;
}

/* Installs catch_done() as the handler for signum, with an empty signal
 * mask and no flags. Returns 0 on success, or the errno value left by
 * sigaction() on failure. */
int install_done(const int signum)
{
    struct sigaction action;

    action.sa_handler = catch_done;
    action.sa_flags = 0;
    sigemptyset(&action.sa_mask);

    return (sigaction(signum, &action, NULL) != 0) ? errno : 0;
}

/* Thread body: atomically increments the unsigned long that data points
 * to until 'done' becomes non-zero, then returns the counter's current
 * value (read atomically) converted to a pointer. */
void *worker(void *data)
{
    volatile unsigned long *const shared = data;
    unsigned long                 snapshot;

    for (;;) {
        if (done)
            break;
        __sync_add_and_fetch(shared, 1UL);
    }

    /* OR-ing with zero leaves the value unchanged but reads it atomically. */
    snapshot = __sync_or_and_fetch(shared, 0UL);
    return (void *)snapshot;
}

/* Worker entry point: installs the shutdown handlers for SIGHUP, SIGTERM
 * and SIGUSR1, starts THREADS threads that all increment one shared
 * counter, joins in from the original thread, and waits for the threads
 * once a handled signal has stopped the loops.
 *
 * Returns 0 on a clean shutdown, 1 if the handlers or a thread could not
 * be set up. */
int main(void)
{
    unsigned long   counter = 0UL;
    pthread_t       thread[THREADS];
    pthread_attr_t  attrs;
    size_t          i;
    int             err;

    if (install_done(SIGHUP) ||
        install_done(SIGTERM) ||
        install_done(SIGUSR1)) {
        fprintf(stderr, "Worker: Cannot install signal handlers: %s.\n", strerror(errno));
        return 1;
    }

    pthread_attr_init(&attrs);
    pthread_attr_setstacksize(&attrs, 65536);
    for (i = 0; i < THREADS; i++) {
        /* pthread_create() returns the error number itself and does not
         * set errno, so report the returned value. */
        err = pthread_create(&thread[i], &attrs, worker, &counter);
        if (err) {
            done = 1;
            fprintf(stderr, "Worker: Cannot create thread: %s.\n", strerror(err));
            return 1;
        }
    }
    pthread_attr_destroy(&attrs);

    /* Let the original thread also do the worker dance. */
    worker(&counter);

    for (i = 0; i < THREADS; i++)
        pthread_join(thread[i], NULL);

    return 0;
}

使用例如编译两者

gcc -W -Wall -O3 -fomit-frame-pointer worker.c -pthread -o worker
gcc -W -Wall -O3 -fomit-frame-pointer tracer.c -o tracer

并在单独的终端或后台运行,使用例如

./tracer ./worker &

跟踪器显示工作人员的 PID:

Tracer: Waiting for child (pid 24275) events.

此时,子进程正在正常运行。当您向子进程发送 SIGSTOP 时,操作才开始。跟踪器检测到它,进行所需的跟踪,然后分离并让子进程正常继续:

kill -STOP 24275

Process 24275 has 3 tasks, attached to all.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a5d, RSP=0x00007f399cfa6ee8.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a5d, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a63, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a65, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a58, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a5d, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a63, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a65, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a58, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a5d, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a63, RSP=0x00007f399cfa6ee8. Advanced by one step.

Detached. Waiting for new stop events.

您可以根据需要多次重复上述操作。请注意,我选择了 SIGSTOP 信号作为触发器,因为这样 tracer.c 也可以用作按需生成复杂的多线程核心转储的基础(因为多线程进程只需向自身发送 SIGSTOP 即可触发它)。

上面例子中各线程都在其中自旋的 worker() 函数的反汇编:

0x400a50: eb 0b                 jmp          0x400a5d
0x400a52: 66 0f 1f 44 00 00     nopw         0x0(%rax,%rax,1)
0x400a58: f0 48 83 07 01        lock addq    $0x1,(%rdi)          = fourth step
0x400a5d: 8b 05 00 00 00 00     mov          0x0(%rip),%eax       = first step
0x400a63: 85 c0                 test         %eax,%eax            = second step
0x400a65: 74 f1                 je           0x400a58             = third step
0x400a67: 48 8b 07              mov          (%rdi),%rax
0x400a6a: 48 89 c2              mov          %rax,%rdx
0x400a6d: f0 48 0f b1 07        lock cmpxchg %rax,(%rdi)
0x400a72: 75 f6                 jne          0x400a6a
0x400a74: 48 89 d0              mov          %rdx,%rax
0x400a77: c3                    retq

现在,这个测试程序只展示了如何停止一个进程,附加到它的所有线程,单步执行一个线程所需数量的指令,然后让所有线程正常继续;它还没有证明这同样适用于让特定线程正常继续(通过PTRACE_CONT)。但是,我在下面描述的细节表明,对我来说,同样的方法应该适用于PTRACE_CONT.

我在编写上述测试程序时遇到的主要问题或惊喜是

long r;

/* Keep reissuing the same request while it fails with one of the errno
 * values below. PTRACE_cmd and "..." are placeholders for the actual
 * request and its remaining arguments. */
do {
    r = ptrace(PTRACE_cmd, tid, ...);
} while (r == -1L && (errno == EBUSY || errno == EFAULT || errno == ESRCH));

循环,尤其是其中的 ESRCH 情况(其他几种情况只是我根据 ptrace 手册页的描述补充的)。

您会看到,大多数 ptrace 命令仅在任务停止时才允许使用。然而,当任务仍在完成例如单步命令时,任务不会停止。因此,使用上述循环——可能添加毫秒 nanosleep 或类似的以避免浪费 CPU——确保在我们尝试提供新命令之前,前一个 ptrace 命令已经完成(因此任务停止)。

Kerrek SB,我相信您在测试程序中遇到的至少一些问题是由于这个问题造成的?对我个人而言,这是一种D'oh!时刻意识到这当然是必要的,因为 ptracing 本质上是异步的,而不是同步的。

(这种异步性也是我上面提到的 SIGCONT 与 PTRACE_CONT 相互干扰的原因。我相信只要使用上面显示的循环进行正确处理,这种交互就不再是问题——实际上也变得可以理解了。)


添加对此答案的评论:

Linux 内核使用 task_struct 结构中的一组任务状态标志(定义请参阅 include/linux/sched.h)来跟踪每个任务的状态。ptrace() 面向用户空间的一面定义在 kernel/ptrace.c 中。

当 PTRACE_SINGLESTEP 或 PTRACE_CONT 被调用时,kernel/ptrace.c:ptrace_continue() 处理大部分细节。它最后会调用 wake_up_state(child, __TASK_TRACED)(即 kernel/sched/core.c::try_to_wake_up(child, __TASK_TRACED, 0))。

当一个进程通过SIGSTOP信号停止时,所有的任务都将停止,并最终处于“停止,未跟踪”状态。

附加到每个任务(通过 PTRACE_ATTACH 或 PTRACE_SEIZE,请参阅 kernel/ptrace.c:ptrace_attach())会修改任务状态。但是,ptrace 状态位(请参阅 include/linux/ptrace.h 中的 PT_ 常量)与任务可运行状态位(请参阅 include/linux/sched.h 中的 TASK_ 常量)是分开的。

在附加到任务并向进程发送 SIGCONT 信号后,停止状态不会立即修改(我相信),因为任务也在被跟踪。执行 PTRACE_SINGLESTEP 或 PTRACE_CONT 最终会调用 kernel/sched/core.c::try_to_wake_up(child, __TASK_TRACED, 0),这会更新任务状态,并将任务移动到运行队列。

现在,我还没有找到代码路径的复杂部分是,当下一次计划任务时,任务状态如何在内核中更新。我的测试表明,使用单步(这是另一个任务状态标志)时,只有任务状态会被更新,而单步标志会被清除。似乎 PTRACE_CONT 不那么可靠;我相信这是因为单步标志“强制”了任务状态的改变。也许有一个“竞争条件”。继续信号传递和状态变化?

(进一步编辑:内核开发人员肯定希望wait()被调用,例如看这个线程。)

换句话说,在注意到进程已经停止之后(注意你可以使用/proc/PID/stat或者/proc/PID/status如果进程不是子进程,并且还没有附加),我相信下面的过程是最健壮的一个:

/* Recommended sequence once the process has been seen to be stopped:
 * attach to every task, send SIGCONT and wait for the "continued" event,
 * then single-step every task once and read its registers so that all
 * task states have settled before any further ptrace request is made. */
pid_t  pid, p; /* Process owning the tasks */
pid_t *tid;    /* Task ID array */
size_t tids;   /* Tasks */
long   result;
int    status;
size_t i;

/* Attach to each task, retrying the errors that can be transient. */
for (i = 0; i < tids; i++) {
    while (1) {
        result = ptrace(PTRACE_ATTACH, tid[i], (void *)0, (void *)0);
        if (result == -1L && (errno == ESRCH || errno == EBUSY || errno == EFAULT || errno == EIO)) {
            /* To avoid burning up CPU for nothing: */
            sched_yield(); /* or nanosleep(), or usleep() */
            continue;
        }
        break;
    }
    if (result == -1L) {
        /*
         * Fatal error. First detach from tid[0..i-1], then exit.
        */
    }
}

/* Send SIGCONT to the process. */
if (kill(pid, SIGCONT)) {
    /*
     * Fatal error, see errno. Exit.
    */
}

/* Since we are attached to the process,
 * we can wait() on it. */
while (1) {
    errno = 0;
    status = 0;
    p = waitpid(pid, &status, WCONTINUED);
    if (p == (pid_t)-1) {
        if (errno == EINTR)
            continue;
        else
            break;
    } else
    if (p != pid) {
        errno = ESRCH;
        break;
    } else
    if (WIFCONTINUED(status)) {
        errno = 0;
        break;
    }
}
if (errno) {
    /*
     * Fatal error. First detach from tid[0..tids-1], then exit.
    */
}

/* Single-step each task to update the task states. */
for (i = 0; i < tids; i++) {
    while (1) {
        result = ptrace(PTRACE_SINGLESTEP, tid[i], (void *)0, (void *)0);
        if (result == -1L && errno == ESRCH) {
            /* To avoid burning up CPU for nothing: */
            sched_yield(); /* or nanosleep(), or usleep() */
            continue;
        }
        break;
    }
    if (result == -1L) {
        /*
         * Fatal error. Every task is attached by now, so first
         * detach from tid[0..tids-1], then exit.
        */
    }
}

/* Obtain task register structures, to make sure the single-steps
 * have completed and their states have stabilized. */
for (i = 0; i < tids; i++) {
    struct user_regs_struct regs;

    while (1) {
        result = ptrace(PTRACE_GETREGS, tid[i], &regs, &regs);
        if (result == -1L && (errno == ESRCH || errno == EBUSY || errno == EFAULT || errno == EIO)) {
            /* To avoid burning up CPU for nothing: */
            sched_yield(); /* or nanosleep(), or usleep() */
            continue;
        }
        break;
    }
    if (result == -1L) {
        /*
         * Fatal error. Every task is attached by now, so first
         * detach from tid[0..tids-1], then exit.
        */
    }
}

完成上述操作后,所有任务都应附加并处于预期状态,以便例如 PTRACE_CONT 无需其他技巧即可工作。

如果未来内核的行为发生变化——我确实相信 STOP/CONT 信号和 ptracing 之间的交互可能会发生变化;至少有必要向 LKML 开发人员提出有关此行为的问题!——,上述程序仍然可以正常工作。(谨慎起见,通过使用 PTRACE_SINGLESTEP 循环几次,也可能是一个好主意。)

与 PTRACE_CONT 的不同之处在于,如果将来行为发生变化,最初的 PTRACE_CONT 可能实际上会继续该过程,从而导致ptrace()随后的过程失败。使用 PTRACE_SINGLESTEP,进程停止,允许进一步ptrace()调用成功。

问题?

于 2013-09-04T00:56:28.240 回答
7

我可以附加到特定线程吗?

是的,至少在当前内核上。

这是否意味着单步只执行该线程的指令?它会停止所有进程的线程吗?

是的。它不会停止其他线程,只会停止附加的线程。

有没有办法只在一个线程中前进但保证其他线程保持停止?

是的。向进程发送 SIGSTOP(使用 waitpid(PID,,WUNTRACED) 等待进程停止),然后对进程中的每个线程执行 PTRACE_ATTACH。再发送 SIGCONT(使用 waitpid(PID,,WCONTINUED) 等待进程继续)。

由于在您附加时所有线程都已停止,并且附加会停止线程,因此在SIGCONT传递信号后所有线程都保持停止状态。您可以按您喜欢的任何顺序单步执行线程。


我发现这很有趣,可以创建一个测试用例。(好吧,实际上我怀疑没有人会相信我的话,所以我决定最好证明你可以自己复制。)

我的系统似乎遵循 Linux man-pages 项目的 man 2 ptrace 中的描述,而 Kerrisk 似乎非常擅长让它们与内核行为保持同步。一般来说,关于 Linux 内核,我更偏好 kernel.org 的资料,而不是其他来源。

概括:

  • 附加到进程本身 (TID==PID) 只会停止原始线程,而不是所有线程。

  • 附加到特定线程(使用来自 的 TID /proc/PID/task/)确实会停止该线程。(换句话说,TID == PID 的线程并不特殊。)

  • 向进程发送 SIGSTOP 将停止所有线程,但 ptrace() 仍然可以正常工作。

  • 如果您向进程发送了 SIGSTOP,请在分离之前不要调用 ptrace(PTRACE_CONT, TID)。PTRACE_CONT 似乎会干扰 SIGCONT 信号。

    您可以先发送 SIGSTOP,然后 PTRACE_ATTACH,然后发送 SIGCONT,没有任何问题;线程将保持停止状态(由于 ptrace)。换句话说,PTRACE_ATTACH 和 PTRACE_DETACH 与 SIGSTOP 和 SIGCONT 混合得很好,没有任何我能看到的副作用。

  • SIGSTOP 和 SIGCONT 影响整个进程,即使您尝试使用 tgkill()(或 pthread_kill())将信号发送到特定线程。

  • 要停止和继续一个特定的线程,对它执行 PTRACE_ATTACH;要停止和继续进程的所有线程,则分别向进程发送 SIGSTOP 和 SIGCONT 信号。

就个人而言,我相信这验证了我在另一个问题中建议的方法。

这是您可以编译并运行以自己测试的丑陋测试代码traces.c

#define  GNU_SOURCE
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/ptrace.h>
#include <sys/syscall.h>
#include <dirent.h>
#include <pthread.h>
#include <signal.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

#ifndef   THREADS
#define   THREADS  3
#endif

/* Sends signal sig to thread tid of thread group tgid using the raw
 * tgkill system call.
 *
 * Returns 0 on success, or -1 with errno set by the system call.
 * syscall() already reports failure as -1 plus errno, so its errno is
 * left untouched rather than being derived from the return value. */
static int tgkill(int tgid, int tid, int sig)
{
    if (syscall(SYS_tgkill, tgid, tid, sig) == -1)
        return -1;

    return 0;
}

/* One counter per worker thread (indices 0..THREADS-1), plus a final slot
 * at index THREADS that child_main() increments itself. */
volatile unsigned long counter[THREADS + 1] = { 0UL };

/* Start gate: workers spin until this becomes non-zero. */
volatile sig_atomic_t run = 0;

/* Zero until handle_done() stores the number of the received signal. */
volatile sig_atomic_t done = 0;

/* Signal handler: records which signal arrived. */
void handle_done(int signum)
{
    done = (sig_atomic_t)signum;
}

/* Installs handle_done() as the handler for signum, with an empty signal
 * mask and no flags. Returns 0 on success, or the errno value left by
 * sigaction() on failure. */
int install_done(int signum)
{
    struct sigaction action;
    int              failed;

    action.sa_handler = handle_done;
    action.sa_flags = 0;
    sigemptyset(&action.sa_mask);

    failed = sigaction(signum, &action, NULL);
    if (!failed)
        return 0;

    return errno;
}

/* Thread body: spins until 'run' is set, then increments the unsigned
 * long that data points to until 'done' is set. Returns the counter's
 * final value converted to a pointer. */
void *worker(void *data)
{
    volatile unsigned long *const slot = data;

    /* Hold at the start gate. */
    for (;;) {
        if (run)
            break;
    }

    /* Count until told to finish. */
    for (;;) {
        if (done)
            break;
        (*slot)++;
    }

    return (void *)(*slot);
}

/* Returns a newly allocated array holding the task (thread) IDs listed
 * in /proc/<pid>/task/, terminated with (pid_t)0. The caller frees it.
 *
 * On success errno is cleared and, if countptr is non-NULL, *countptr
 * receives the number of TIDs. On failure NULL is returned with errno
 * set and *countptr is left untouched. PIDs and TIDs below 2 are
 * rejected or skipped. */
pid_t *gettids(const pid_t pid, size_t *const countptr)
{
    char     path[128];
    DIR     *taskdir;
    pid_t   *tids = NULL;
    pid_t   *resized;
    size_t   capacity = 0;
    size_t   count = 0;

    if ((int)pid < 2) {
        errno = EINVAL;
        return NULL;
    }

    if (snprintf(path, sizeof path, "/proc/%d/task/", (int)pid) >= (int)sizeof path) {
        errno = ENAMETOOLONG;
        return NULL;
    }

    taskdir = opendir(path);
    if (taskdir == NULL)
        return NULL;    /* errno as set by opendir() */

    for (;;) {
        struct dirent *entry;
        int            number;
        char           trailing;

        /* readdir() returns NULL both at the end and on error;
         * only errno tells the two apart. */
        errno = 0;
        entry = readdir(taskdir);
        if (entry == NULL)
            break;

        /* Keep only names that are entirely one decimal number >= 2. */
        if (sscanf(entry->d_name, "%d%c", &number, &trailing) != 1 || number < 2)
            continue;

        if (count >= capacity) {
            capacity = (count | 127) + 129;
            resized = realloc(tids, capacity * sizeof tids[0]);
            if (resized == NULL) {
                free(tids);
                closedir(taskdir);
                errno = ENOMEM;
                return NULL;
            }
            tids = resized;
        }

        tids[count] = (pid_t)number;
        count++;
    }

    if (errno != 0) {
        free(tids);
        closedir(taskdir);
        errno = EIO;
        return NULL;
    }

    if (closedir(taskdir) != 0) {
        free(tids);
        errno = EIO;
        return NULL;
    }

    if (count < 1) {
        free(tids);
        errno = ENOENT;
        return NULL;
    }

    /* Resize to exactly the entries plus the terminating zero. */
    capacity = count + 1;
    resized = realloc(tids, capacity * sizeof tids[0]);
    if (resized == NULL) {
        free(tids);
        errno = ENOMEM;
        return NULL;
    }
    tids = resized;
    tids[count] = (pid_t)0;

    if (countptr != NULL)
        *countptr = count;

    errno = 0;
    return tids;
}

/* Body of the forked child: installs the SIGUSR1 handler, starts THREADS
 * worker threads (one counter each), opens the start gate, notifies the
 * parent with SIGUSR1, and increments the last counter itself until
 * SIGUSR1 arrives. Then joins the workers and prints all counters.
 *
 * Returns 0 on success, 1 if the handler or a thread could not be set up. */
int child_main(void)
{
    pthread_t   id[THREADS];
    int         i;
    int         err;

    if (install_done(SIGUSR1)) {
        fprintf(stderr, "Cannot set SIGUSR1 signal handler.\n");
        return 1;
    }

    for (i = 0; i < THREADS; i++) {
        /* pthread_create() returns the error number itself and does not
         * set errno, so report the returned value. */
        err = pthread_create(&id[i], NULL, worker, (void *)&counter[i]);
        if (err) {
            fprintf(stderr, "Cannot create thread %d of %d: %s.\n", i + 1, THREADS, strerror(err));
            return 1;
        }
    }

    /* Release the workers, then tell the parent that we are running. */
    run = 1;

    kill(getppid(), SIGUSR1);

    while (!done)
        counter[THREADS]++;

    for (i = 0; i < THREADS; i++)
        pthread_join(id[i], NULL);

    printf("Final counters:\n");
    for (i = 0; i < THREADS; i++)
        printf("\tThread %d: %lu\n", i + 1, counter[i]);
    printf("\tMain thread: %lu\n", counter[THREADS]);

    return 0;
}

/* Prints the child's value of every counter, read through ptrace from
 * task t. Each read is retried while it fails with EIO, EFAULT or ESRCH. */
static void peek_counters(const pid_t t)
{
    int i;

    for (i = 0; i <= THREADS; i++) {
        long v;
        do {
            /* -1 is also a valid peeked value; errno disambiguates. */
            errno = 0;
            v = ptrace(PTRACE_PEEKDATA, t, &counter[i], NULL);
        } while (v == -1L && (errno == EIO || errno == EFAULT || errno == ESRCH));
        fprintf(stderr, "\tcounter[%d] = %lu\n", i, (unsigned long)v);
    }
}

/* Attaches to task t, prints the child's counters twice with a 250 ms
 * pause in between, pauses again and detaches. If the attach fails,
 * reports it, sends SIGUSR1 to child and returns -1; otherwise returns 0. */
static int inspect_task(const pid_t child, const pid_t t)
{
    if (ptrace(PTRACE_ATTACH, t, (void *)0L, (void *)0L)) {
        fprintf(stderr, "Cannot attach to TID %d: %s.\n", (int)t, strerror(errno));
        kill(child, SIGUSR1);
        return -1;
    }

    fprintf(stderr, "Attached to TID %d.\n\n", (int)t);

    fprintf(stderr, "Peeking the counters in the child process:\n");
    peek_counters(t);
    fprintf(stderr, "Waiting a short moment ... ");
    fflush(stderr);

    usleep(250000);

    fprintf(stderr, "and another peek:\n");
    peek_counters(t);
    fprintf(stderr, "\n");
    fflush(stderr);

    usleep(250000);

    ptrace(PTRACE_DETACH, t, (void *)0L, (void *)0L);
    return 0;
}

/* Test driver: forks a multithreaded child, waits for its SIGUSR1
 * "ready" notification, then
 *   1. attaches to each of the child's tasks in turn, and
 *   2. attaches to one fixed task after sending SIGSTOP / SIGCONT to the
 *      whole process and then to the first listed task only,
 * printing the child's counters each time. Finally asks the child to
 * finish with SIGUSR1 and waits for it.
 *
 * Returns 0 on success, 1 on any setup or attach failure. */
int main(void)
{
    pid_t   *tid = NULL;
    size_t   tids = 0;
    int      k;
    pid_t    child, p;

    if (install_done(SIGUSR1)) {
        fprintf(stderr, "Cannot set SIGUSR1 signal handler.\n");
        return 1;
    }

    child = fork();
    if (!child)
        return child_main();

    if (child == (pid_t)-1) {
        fprintf(stderr, "Cannot fork.\n");
        return 1;
    }

    /* The child signals SIGUSR1 once all of its threads are running. */
    while (!done)
        usleep(1000);

    tid = gettids(child, &tids);
    if (!tid) {
        fprintf(stderr, "gettids(): %s.\n", strerror(errno));
        kill(child, SIGUSR1);
        return 1;
    }

    fprintf(stderr, "Child process %d has %d tasks.\n", (int)child, (int)tids);
    fflush(stderr);

    /* Part 1: attach to every task, one at a time. */
    for (k = 0; k < (int)tids; k++) {
        if (inspect_task(child, tid[k])) {
            free(tid);
            return 1;
        }
    }

    /* Part 2: stop/continue signals while inspecting one fixed task. */
    for (k = 0; k < 4; k++) {
        const pid_t t = tid[tids / 2];

        switch (k) {
        case 0:
            fprintf(stderr, "Sending SIGSTOP to child process ... ");
            fflush(stderr);
            kill(child, SIGSTOP);
            break;
        case 1:
            fprintf(stderr, "Sending SIGCONT to child process ... ");
            fflush(stderr);
            kill(child, SIGCONT);
            break;
        case 2:
            fprintf(stderr, "Sending SIGSTOP to TID %d ... ", (int)tid[0]);
            fflush(stderr);
            tgkill(child, tid[0], SIGSTOP);
            break;
        default:
            fprintf(stderr, "Sending SIGCONT to TID %d ... ", (int)tid[0]);
            fflush(stderr);
            tgkill(child, tid[0], SIGCONT);
            break;
        }
        usleep(250000);
        fprintf(stderr, "done.\n");
        fflush(stderr);

        if (inspect_task(child, t)) {
            free(tid);
            return 1;
        }
    }

    free(tid);

    kill(child, SIGUSR1);

    /* Reap the child, restarting the wait if a signal interrupts it. */
    do {
        p = waitpid(child, NULL, 0);
        if (p == -1 && errno != EINTR)
            break;
    } while (p != child);

    return 0;
}

使用例如编译和运行

gcc -DTHREADS=3 -W -Wall -O3 traces.c -pthread -o traces
./traces

输出是子进程计数器的转储(每个在单独的线程中递增,包括使用最终计数器的原始线程)。比较短暂等待期间的计数器。例如:

Child process 18514 has 4 tasks.
Attached to TID 18514.

Peeking the counters in the child process:
    counter[0] = 0
    counter[1] = 0
    counter[2] = 0
    counter[3] = 0
Waiting a short moment ... and another peek:
    counter[0] = 18771865
    counter[1] = 6435067
    counter[2] = 54247679
    counter[3] = 0

正如您在上面看到的,只有使用最终计数器的初始线程(其 TID == PID)被停止。其他三个线程也是如此,它们按顺序使用前三个计数器:

Attached to TID 18515.

Peeking the counters in the child process:
    counter[0] = 25385151
    counter[1] = 13459822
    counter[2] = 103763861
    counter[3] = 560872
Waiting a short moment ... and another peek:
    counter[0] = 25385151
    counter[1] = 69116275
    counter[2] = 120500164
    counter[3] = 9027691

Attached to TID 18516.

Peeking the counters in the child process:
    counter[0] = 25397582
    counter[1] = 105905400
    counter[2] = 155895025
    counter[3] = 17306682
Waiting a short moment ... and another peek:
    counter[0] = 32358651
    counter[1] = 105905400
    counter[2] = 199601078
    counter[3] = 25023231

Attached to TID 18517.

Peeking the counters in the child process:
    counter[0] = 40600813
    counter[1] = 111675002
    counter[2] = 235428637
    counter[3] = 32298929
Waiting a short moment ... and another peek:
    counter[0] = 48727731
    counter[1] = 143870702
    counter[2] = 235428637
    counter[3] = 39966259

接下来的两个案例检查 SIGSTOP / SIGCONT 对整个进程的作用:

Sending SIGSTOP to child process ... done.
Attached to TID 18516.

Peeking the counters in the child process:
    counter[0] = 56887263
    counter[1] = 170646440
    counter[2] = 235452621
    counter[3] = 48077803
Waiting a short moment ... and another peek:
    counter[0] = 56887263
    counter[1] = 170646440
    counter[2] = 235452621
    counter[3] = 48077803

Sending SIGCONT to child process ... done.
Attached to TID 18516.

Peeking the counters in the child process:
    counter[0] = 64536344
    counter[1] = 182359343
    counter[2] = 253660731
    counter[3] = 56422231
Waiting a short moment ... and another peek:
    counter[0] = 72029244
    counter[1] = 182359343
    counter[2] = 288014365
    counter[3] = 63797618

如您所见,发送 SIGSTOP 将停止所有线程,但不会妨碍 ptrace()。同样,在 SIGCONT 之后,线程继续正常运行。

最后两种情况检查在附加到另一个线程的同时,使用 tgkill() 向特定线程(对应于第一个计数器的线程)发送 SIGSTOP/SIGCONT 的效果:

Sending SIGSTOP to TID 18514 ... done.
Attached to TID 18516.

Peeking the counters in the child process:
    counter[0] = 77012930
    counter[1] = 183059526
    counter[2] = 344043770
    counter[3] = 71120227
Waiting a short moment ... and another peek:
    counter[0] = 77012930
    counter[1] = 183059526
    counter[2] = 344043770
    counter[3] = 71120227

Sending SIGCONT to TID 18514 ... done.
Attached to TID 18516.

Peeking the counters in the child process:
    counter[0] = 88082419
    counter[1] = 194059048
    counter[2] = 359342314
    counter[3] = 84887463
Waiting a short moment ... and another peek:
    counter[0] = 100420161
    counter[1] = 194059048
    counter[2] = 392540525
    counter[3] = 111770366

不幸的是,正如预期的那样,处置(停止/运行)是进程范围的,而不是线程特定的,正如您在上面看到的那样。这意味着要停止特定线程并让其他线程正常运行,您需要分别对希望停止的每个线程执行 PTRACE_ATTACH。

为了证明我上面的所有陈述,您可能必须添加测试用例;我最终得到了相当多的代码副本,都经过了轻微的编辑,以测试它们,我不确定我是否选择了最完整的集合。如果您发现遗漏,我很乐意扩展测试程序。

问题?

于 2013-09-03T06:33:24.687 回答
2

进程中的每个线程都被单独跟踪(并且每个线程都可能被不同的跟踪进程跟踪,或者未被跟踪)。当您调用 ptrace attach 时,您总是只附加到一个线程。只有那个线程会被停止 - 其他线程将继续照常运行。

手册页的最新版本ptrace()非常清楚地说明了这一点:

附加和后续命令是每个线程的:在多线程进程中,每个线程可以单独附加到(可能不同的)跟踪器,或者不附加,因此不进行调试。因此,“tracee”总是意味着“(一个)线程”,而不是“一个(可能是多线程的)进程”。Ptrace 命令始终使用以下形式的调用发送到特定的跟踪对象

ptrace(PTRACE_foo, pid, ...)

其中 pid 是对应 Linux 线程的线程 ID。

(请注意,在本页中,“多线程进程”是指由使用该clone(2) CLONE_THREAD标志创建的线程组成的线程组。)

单步执行仅影响您指向的线程。如果其他线程正在运行,它们将继续运行,如果它们处于跟踪停止状态,它们将保持跟踪停止状态。(这意味着如果您正在单步执行的线程试图获取另一个非运行线程持有的互斥锁或类似的同步资源,它将无法获取该互斥锁)。

如果要在单步执行一个线程时停止进程的所有线程,则需要附加到所有线程。还有一个复杂的情况是,如果进程在您尝试附加到它时正在运行,则可能会在您枚举它们时创建新线程。

于 2013-09-03T04:37:58.270 回答
-3

它会停止所有进程的线程吗?

是的,它跟踪进程,这个进程的所有线程都停止了。想象一下,你怎么能在你的 IDE 中看到不同的线程。

从手册:

ptrace() 系统调用提供了一种方法,通过该方法一个进程(“跟踪器”)可以观察和控制另一个进程(“跟踪者”)的执行

要附加的示例代码:

/* Attach to Tpid; keep the result in res so that the failure report
 * below prints the value ptrace() actually returned. */
printf("Attaching to process %d\n",Tpid);
if ((res = ptrace(PTRACE_ATTACH, Tpid, 0, 0)) != 0) {
    printf("Attach result %d\n",res);
}

所以是的,你被附加到一个线程,是的,它停止了进程的所有线程。

/* Request a single step of Tpid, passing signo as the data argument;
 * report the error and exit if the request fails. */
if ((res = ptrace(PTRACE_SINGLESTEP, Tpid, 0, signo)) < 0) {
perror("Ptrace singlestep error");
exit(1);
}
/* Wait for the next child status change (presumably the stop that ends
 * the step) and store the wait status in stat. */
res = wait(&stat);

也许在这里看到:http ://www.secretmango.com/jimb/Whitepapers/ptrace/ptrace.html

于 2013-09-02T17:10:14.307 回答