
如果我的程序在其创建的进程的整个生命周期内一直保持运行，那么一切正常；但如果我启动一个进程后杀死我的程序，再重新启动程序，之前创建的进程就会一直停留在 STOPPED 状态（似乎 ptrace(PTRACE_CONT, ...) 无法让它恢复运行）。代码片段如下：

/*
 * Shutdown request flag: set to 1 by sighandler() and polled by the
 * `while (!exitFlag)` loop in JsfNode::run().
 *
 * It is declared `volatile sig_atomic_t` because that is the object type the
 * C and C++ standards guarantee can be written from an asynchronous signal
 * handler and read from normal control flow; a plain `int` carries no such
 * guarantee.
 */
static volatile sig_atomic_t exitFlag = 0;

/*
 * Signal handler installed for SIGINT. It only records the request by
 * setting exitFlag; the signal number is intentionally unused.
 */
static void sighandler (int/* signum */)
{ exitFlag = 1; }

/*
 * Node main routine.
 *
 * 1. Reads a vector of JobInfo2 records through loadStruct() from jobFile()
 *    (presumably the list written by the save step at the end of a previous
 *    run -- confirm against loadStruct/saveStruct), adds every job whose name
 *    is not yet in m_jobs, and issues PTRACE_ATTACH for those whose saved
 *    state is js2Active, js2Canceling or js2Suspending.
 * 2. Installs sighandler for SIGINT and sleeps in one-second steps until
 *    exitFlag becomes non-zero.
 * 3. Issues PTRACE_DETACH for every job in m_jobs, collects their info
 *    records and passes them to saveStruct() with jobFile().
 *
 * Returns 0.
 *
 * NOTE(review): the braces of this function are missing from this excerpt,
 * so the exact scope boundaries below cannot be confirmed from the text.
 */
int JsfNode::run (void)
    /* load jobs */
        vector <JobInfo2> jobs;
        loadStruct <vector <JobInfo2> > (
                jobFile (), jobs);
        for (unsigned i=0 ; i<jobs.size () ; i++) {
            JobInfo2& info = jobs [i];
            string name = info.parm.name;
            /* jobs already present in m_jobs are left untouched */
            if (m_jobs.find (name) == m_jobs.end ()) {
                /* operator[] creates the entry; its info is then overwritten */
                Job2& job = m_jobs [name];
                job.info = info;
                /* trace it so that we can wait() it */
                switch (info.state) {
                /* the three labels below share the same attach statement */
                case js2Active:
                case js2Canceling:
                case js2Suspending:
                /*
                 * A failed attach is only logged; the job stays in m_jobs.
                 * NOTE(review): ptrace(2) documents that PTRACE_ATTACH sends
                 * SIGSTOP to the tracee. Nothing in this block waits for that
                 * stop or resumes the process -- confirm that monitorJob()
                 * (or another caller) does so from this same thread.
                 */
                if (ptrace (PTRACE_ATTACH, info.pid, 0, 0))
                        jdebug ("PTRACE_ATTACH failed for: %d (%s)\n", info.pid,
                                        strerror (errno));
                default: break;

    /* run until we are signaled to stop */
    signal (SIGINT, sighandler);
    while (!exitFlag)
        sleep (1);

    /* save jobs */
        vector <JobInfo2> jobs;
        for (map <string, Job2>::iterator it=m_jobs.begin () ;
                        it!=m_jobs.end () ; it++) {
            JobInfo2& info = it->second.info;
            /*
             * Detach is issued for every job regardless of its state and the
             * return value is not checked.
             * NOTE(review): ptrace(2) describes PTRACE_DETACH as operating on
             * a stopped tracee -- confirm the call succeeds for running jobs.
             */
            ptrace (PTRACE_DETACH, info.pid, NULL, NULL);
            jobs.push_back (info);
        saveStruct <vector <JobInfo2> > (
                jobFile (), jobs);

    return 0;

/*
 * Launches the process for one job.
 *
 * Steps, in order: resolve the run-as uid from parm.user, write the job
 * script and the machine file under m_workdir, optionally create two pipes
 * for stdin/stdout redirection, fork, and in the child request tracing,
 * switch uid, set up redirection and environment and execv() the script.
 * The parent keeps its pipe ends in job.redPipe, starts the nurse thread,
 * stores the child pid in job.info.pid and sets the job state to js2Active.
 *
 * job: the record to start; job.info.parm is read, and job.redPipe,
 *      job.nurseId, job.nurseActive and job.info.pid are written.
 *
 * On a pipe or fork failure the function backs out without changing the job
 * state (see the "try later" comment below).
 *
 * NOTE(review): the braces of this function and the targets of
 * `goto unlink` / `goto close` are missing from this excerpt.
 */
void JsfNode::startJob (Job2 & job)
    JobParm2 parm = job.info.parm;
    jdebug ("starting \"%s\"..\n", parm.name.c_str());

    /* get the uid of the run-as user */
    uid_t uid = 0;  /* run as root if the specified user is invalid */
    struct passwd * pwe = getpwnam (parm.user.c_str());
    if (pwe != NULL)
        uid = pwe->pw_uid;

    /* prepare the script file */
    /*
     * The script text is written to <m_workdir>/<name>_scriptfile. uid is
     * passed to chown() as both owner and group, and the mode grants
     * read/write/execute to user, group and others. The return values of
     * chown()/chmod() are not checked.
     * NOTE(review): ofscriptfile is never closed or flushed in this block,
     * unlike ofmachinefile below -- confirm the content is on disk before
     * the execv() in the child.
     */
    string scriptfile = m_workdir+"/"+parm.name+"_scriptfile";
    ofstream ofscriptfile (scriptfile.c_str());
    ofscriptfile << parm.script;
    chown (scriptfile.c_str(), uid, uid);
    chmod (scriptfile.c_str(), S_IRWXU|S_IRWXG|S_IRWXO);

    /* prepare the MPIMACHINEFILE */
    /* one "<resource>:<taskPerNode>" line per entry of parm.res */
    string machinefile = m_workdir+"/"+parm.name+"_machinefile";
    ofstream ofmachinefile (machinefile.c_str());
    for (Resource::iterator it=parm.res.begin () ; it!=parm.res.end () ; it++)
        ofmachinefile << *it << ':' << parm.taskPerNode << '\n';
    ofmachinefile.close ();
    chown (machinefile.c_str(), uid, uid);
    chmod (machinefile.c_str(), S_IRWXU|S_IRWXG|S_IRWXO);

    /* prepare the redirection channels */
    /*
     * ipipe feeds the child's stdin, opipe carries the child's stdout.
     * NOTE(review): pipes are created when parm.redio > 0, but the later
     * checks test parm.redio for non-zero -- confirm redio is never negative.
     */
    int ipipe [2] = {-1,-1};
    int opipe [2] = {-1,-1};
    if (parm.redio > 0) {
        if (pipe (ipipe) == -1) {
            unlink (machinefile.c_str());
            unlink (scriptfile.c_str());
            return; /* do not fail the job, just try later */
        if (pipe (opipe) == -1) {
            close (ipipe [0]);
            close (ipipe [1]);
            goto unlink;

    /* OK, fork it! -----------------> */

    pid_t pid;
    if ((pid = fork ()) == -1) {
        close (opipe [0]);
        close (opipe [1]);
        goto close;

    /* child process: everything up to execv()/exit() runs in the new process */
    if (pid == 0) {
        /* enable parent-tracing */
        ptrace (PTRACE_TRACEME, 0, NULL, NULL);

        /* drop the root privilege */
        /* NOTE(review): the setuid() result is not checked before exec */
        setuid (uid);

        /* redirect stdin/stdout */
        /* fd 0 reads from ipipe, fd 1 writes to opipe; originals are closed */
        if (parm.redio) {
            if (dup2 (ipipe [0],0)<0 ||
                dup2 (opipe [1],1)<0)
                exit (errno);
            close (ipipe [0]);
            close (ipipe [1]);
            close (opipe [0]);
            close (opipe [1]);

        /* prepare the arguments/environments */
        /* parm.args is passed as one single argv entry, not split */
        char * arg[] = {
                strdup (scriptfile.c_str()),
                strdup (parm.args.c_str()),
                NULL    /* the required null entry */
        setenv ("MPIMACHINEFILE", machinefile.c_str(), 1);
        setenv ("DISPLAY", parm.headNode.c_str(), 1);
        setenv ("JSF_JOBID", parm.name.c_str(), 1);

        /* execute it! ------> */
        /* execv() only returns on failure; errno becomes the exit status */
        execv (scriptfile.c_str(), arg);
        exit (errno);

    /* redirect stdin/stdout */
    /*
     * Parent side: close the ends used by the child and keep the other two;
     * redPipe[0] is the read end of the child's stdout, redPipe[1] is the
     * write end of the child's stdin.
     */
    if (parm.redio) {
        close (ipipe [0]);
        close (opipe [1]);
        job.redPipe [0] = opipe [0];
        job.redPipe [1] = ipipe [1];
    /* start the nurse thread */
    /*
     * nd is freed here only when thread creation fails; presumably
     * ::_jobnurse takes ownership otherwise -- TODO confirm.
     */
    NurseData * nd = new NurseData (this, job);
    if (pthread_create (&job.nurseId, NULL, ::_jobnurse, nd) == 0)
        job.nurseActive = true;
    else delete nd;

    job.info.pid = pid;
    setJobState (job, js2Active);

/*
 * Polls the process of one job without blocking and updates the job state.
 *
 * job: the record to check; job.info.pid is the process polled, and
 *      job.exitCode and the job state (through setJobState) may be written.
 *
 * Outcomes handled:
 *   - waitpid() fails with ECHILD: exit code 0, state js2Finished.
 *   - normal exit: exit code taken from the status, state js2Finished.
 *   - killed by a signal: state js2Canceled.
 *   - stopped: the process is resumed with PTRACE_CONT.
 * A return value of 0 from waitpid() (no state change) leaves the job as is.
 *
 * NOTE(review): per ptrace(2) the tracer relationship is per thread, so the
 * PTRACE_CONT below is expected to work only when this function runs on the
 * thread that attached to (or forked) the job -- confirm the calling thread.
 * NOTE(review): the braces of this function are missing from this excerpt.
 */
void JsfNode::monitorJob (Job2 & job)
    int status;
    /* WNOHANG: return at once when the process has not changed state */
    pid_t pid = waitpid (job.info.pid, &status, WNOHANG);
    if (pid < 0) {
        if (errno == ECHILD) {
            /* the job process has disappeared.. */
            job.exitCode = 0;
            setJobState (job, js2Finished);
    } else if (pid == job.info.pid) {
        if (WIFEXITED(status)) {
            job.exitCode = WEXITSTATUS(status);
            setJobState (job, js2Finished);
        } else if (WIFSIGNALED(status)) {
            setJobState (job, js2Canceled);
        } else if (WIFSTOPPED(status)) {
            /*
             * Resume the stopped tracee. The data argument is NULL, so no
             * signal is passed on to the process; a failure is only logged.
             */
            if (ptrace (PTRACE_CONT, pid, NULL, NULL))
                jdebug ("PTRACE_CONT failed for: %d (%s)\n", pid, strerror(errno));

    /* ... */

1 回答 1


是的，问题是由多线程引起的。ptrace 的跟踪关系是按线程建立的：被跟踪的进程只与调用 ptrace(PTRACE_ATTACH) 的那个线程绑定，其他线程对它发出的 ptrace 请求都会失败。因此，如果 monitorJob() 在单独的线程中运行，ptrace(PTRACE_CONT) 就会失败。将其移至主线程（即调用 ptrace(PTRACE_ATTACH) 的线程）后，一切正常。

于 2013-05-03T14:39:34.980 回答