3

我有一个用 Java 编写的服务器 - (在 REQ-REP 和 PUB-SUB 模式中使用 ZeroMQ。)我使用 Apache Commons Daemon (prunsrv/procrun) 将它包装为 Windows 服务

有时对 ZContext.destroy() 的调用会挂起。我已经设法在上下文未能被销毁时获取了堆栈跟踪。如果我理解正确,ZContext.destroy() 应该负责关闭并终止所有套接字。可能是什么问题呢?

下面粘贴了上下文未能被销毁时获取的堆栈跟踪——在这种情况下我们只能强制结束该进程。

import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.zeromq.ZContext;
import org.zeromq.ZMQ;

/**
 * This is the driver for starting and stopping the PowerLoggerService
 * 
 */
public class PowerLoggerService {

    static PreferenceManager prefMgr;
    static ModuleServer moduleServer;
    static DiscoveryServer discoveryServer;
    static LoggerRequestServer loggerRequestServer;
    static NotificationServer notificationServer;
    static ScheduledExecutorService scheduler;
    static ZContext zContext;

    private static final Logger log = LoggerFactory.getLogger("PowerLoggerService");
    private volatile static boolean shouldStop;
    private volatile static boolean contextWasDestroyed = false;

    public static void start(String [] args) {

        zContext = new ZContext();
        zContext.setLinger(0);

        scheduler = Executors.newScheduledThreadPool(1);
        log.trace("Entered startPMServer");
        notificationServer = new NotificationServer(zContext);
        notificationServer.publishStatus(MessageType.STARTED, "Server started");

        if(prefMgr.ensureValidDBPrefs()) {
            moduleServer = new ModuleServer();
            moduleServer.start();
            discoveryServer = new DiscoveryServer();
            discoveryServer.start();

            loggerRequestServer = new LoggerRequestServer(zContext);
            loggerRequestServer.start();                
            scheduler.scheduleAtFixedRate(notificationServer, 0, 10, TimeUnit.SECONDS);
        }
        else {
            log.error("Could not ensure that we have valid DB preferences ... need to exit.");
            System.exit(1);
        }

        while(!PowerLoggerService.shouldStop){
            log.trace("Value of boolean stop in start function: {}", PowerLoggerService.shouldStop);
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                log.error("PowerLoggerService main thread interrupted while sleeping");
            }
            log.trace("PowerLoggerService is running.");
            //printAllStackTraces();
        }
        //Sometimes the ZContext is not destroyed and the process goes to STOP_PENDING
        //the root cause needs to be found out and fixed - for now we are brute forcing
        //a System.exit
        if(contextWasDestroyed == false) {
            log.error("The context was not destroyed cleanly so we are doing a forced exit");
            printAllStackTraces();
            System.exit(-1);
        }

        log.debug("Stop was set to true and we are now exiting the start function!");
    }

    private static void printAllStackTraces() {
        StringBuilder sb = new StringBuilder();
        log.trace("=========================START TRACE");
        Map liveThreads = Thread.getAllStackTraces();
        sb.append("\n");

        for (Iterator i = liveThreads.keySet().iterator(); i.hasNext(); ) {
            Thread key = (Thread)i.next();
            sb.append("==============>Thread ").append(key.getName()).append("\n");
            StackTraceElement[] trace = (StackTraceElement[])liveThreads.get(key);
            for (StackTraceElement trace1 : trace) {
                sb.append("\tat ").append(trace1).append("\n");
            }
        }

        log.trace(sb.toString());
        log.trace("=========================END TRACE");
    }

    public static void stop() {
        try {
            log.info("Going to stop the PowerLoggerService");
            discoveryServer.shutDown();
            moduleServer.shutDown();
            loggerRequestServer.shutDown();
            notificationServer.shutDown();

            log.trace("In stop method after shutting down notification server");
            scheduler.shutdown();
            try {
                log.trace("Terminated ?: {}",scheduler.awaitTermination(3, TimeUnit.SECONDS));
            } catch (InterruptedException ex) {
                log.error("Interrupted when awaiting termination",ex);
            }

            List<Runnable> waiters = scheduler.shutdownNow();
            log.debug("Notification scheduled tasks that were waiting to be cancelled: {}", waiters.size());
            log.debug("Going to destroy the context");

            log.trace("going to set stop to true");
            PowerLoggerService.shouldStop = true;
            log.trace("stop is set to {}", PowerLoggerService.shouldStop);
            //printAllStackTraces();
            zContext.destroy();
            contextWasDestroyed = true;
            log.trace("AFTER context is destroyed");
        }
        catch (Exception e) {
            log.error("Unhandled exception when trying to shut down server ... We are going to have an unclean exit", e);
            PowerLoggerService.shouldStop = true;
        }
        log.trace("Value of stop is set to {}", PowerLoggerService.shouldStop);
    }


    static void main(String [] args) {
        String mode = args[0];
        prefMgr = PreferenceManager.getInstance();

        if(null != mode) switch (mode) {
            case "start":
                PowerLoggerService.shouldStop = false;
                start(args);
                log.trace("Exiting the service start method");
                break;
            case "stop":
                stop();
                log.debug("Got request for shutting down service");
                break;
        }        
    }

    public static void logErrorAndStop(String message) {
        notificationServer.publishStatus(MessageType.ERROR,message);
        stop();
    }
}

给出以下跟踪:

2014-08-07 14:02:27,TRACE,PowerLoggerService,PowerLoggerService.java,101,k.c.p.p.PowerLoggerService,printAllStackTraces
==============>Thread reaper-1
        at sun.nio.ch.WindowsSelectorImpl$SubSelector.poll0(Native Method)
        at sun.nio.ch.WindowsSelectorImpl$SubSelector.poll(Unknown Source)
        at sun.nio.ch.WindowsSelectorImpl$SubSelector.access$400(Unknown Source)
        at sun.nio.ch.WindowsSelectorImpl.doSelect(Unknown Source)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(Unknown Source)
        at sun.nio.ch.SelectorImpl.select(Unknown Source)
        at zmq.Poller.run(Poller.java:207)
        at java.lang.Thread.run(Unknown Source)
==============>Thread Finalizer
        at java.lang.Object.wait(Native Method)
        at java.lang.ref.ReferenceQueue.remove(Unknown Source)
        at java.lang.ref.ReferenceQueue.remove(Unknown Source)
        at java.lang.ref.Finalizer$FinalizerThread.run(Unknown Source)
==============>Thread iothread-2
        at sun.nio.ch.WindowsSelectorImpl$SubSelector.poll0(Native Method)
        at sun.nio.ch.WindowsSelectorImpl$SubSelector.poll(Unknown Source)
        at sun.nio.ch.WindowsSelectorImpl$SubSelector.access$400(Unknown Source)
        at sun.nio.ch.WindowsSelectorImpl.doSelect(Unknown Source)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(Unknown Source)
        at sun.nio.ch.SelectorImpl.select(Unknown Source)
        at zmq.Poller.run(Poller.java:207)
        at java.lang.Thread.run(Unknown Source)
==============>Thread main
        at java.lang.Thread.dumpThreads(Native Method)
        at java.lang.Thread.getAllStackTraces(Unknown Source)
        at kr.co.powermanager.powerlogger.PowerLoggerService.printAllStackTraces(PowerLoggerService.java:89)
        at kr.co.powermanager.powerlogger.PowerLoggerService.start(PowerLoggerService.java:79)
        at kr.co.powermanager.powerlogger.PowerLoggerService.main(PowerLoggerService.java:148)
==============>Thread Signal Dispatcher
==============>Thread Attach Listener
==============>Thread Thread-3
        at sun.nio.ch.WindowsSelectorImpl$SubSelector.poll0(Native Method)
        at sun.nio.ch.WindowsSelectorImpl$SubSelector.poll(Unknown Source)
        at sun.nio.ch.WindowsSelectorImpl$SubSelector.access$400(Unknown Source)
        at sun.nio.ch.WindowsSelectorImpl.doSelect(Unknown Source)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(Unknown Source)
        at sun.nio.ch.SelectorImpl.select(Unknown Source)
        at zmq.Signaler.wait_event(Signaler.java:135)
        at zmq.Mailbox.recv(Mailbox.java:105)
        at zmq.SocketBase.process_commands(SocketBase.java:793)
        at zmq.SocketBase.recv(SocketBase.java:714)
        at org.zeromq.ZMQ$Socket.recv(ZMQ.java:1247)
        at org.zeromq.ZMQ$Socket.recv(ZMQ.java:1235)
        at kr.co.powermanager.powerlogger.LoggerRequestServer.run(LoggerRequestServer.java:45)
==============>Thread Thread-4
        at sun.nio.ch.WindowsSelectorImpl$SubSelector.poll0(Native Method)
        at sun.nio.ch.WindowsSelectorImpl$SubSelector.poll(Unknown Source)
        at sun.nio.ch.WindowsSelectorImpl$SubSelector.access$400(Unknown Source)
        at sun.nio.ch.WindowsSelectorImpl.doSelect(Unknown Source)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(Unknown Source)
        at sun.nio.ch.SelectorImpl.select(Unknown Source)
        at zmq.Signaler.wait_event(Signaler.java:135)
        at zmq.Mailbox.recv(Mailbox.java:105)
        at zmq.Ctx.terminate(Ctx.java:190)
        at org.zeromq.ZMQ$Context.term(ZMQ.java:301)
        at org.zeromq.ZContext.destroy(ZContext.java:98)
        at kr.co.powermanager.powerlogger.PowerLoggerService.stop(PowerLoggerService.java:129)
        at kr.co.powermanager.powerlogger.PowerLoggerService.main(PowerLoggerService.java:152)
==============>Thread Reference Handler
        at java.lang.Object.wait(Native Method)
        at java.lang.Object.wait(Unknown Source)
        at java.lang.ref.Reference$ReferenceHandler.run(Unknown Source)

2014-08-07 14:02:27,TRACE,PowerLoggerService,PowerLoggerService.java,102,k.c.p.p.PowerLoggerService,printAllStackTraces =========================END TRACE
4

1 回答 1

1

请阅读以下有关如何正确终止上下文的文章:

http://zeromq.org/whitepapers:0mq-termination

这只是为了排除您一方的错误。

查看 jeromq 源代码,当前版本(0.3.6-SNAPSHOT)未能通过其自己的有关创建和拆除套接字的单元测试。

您可以尝试以前的版本(0.3.4 似乎没有相同的问题)或更改为 jzmq,它只是稳定的 ZeroMQ 库的包装器。

于 2016-03-18T10:09:19.357 回答