0

我正在编写一段代码,它使用 NIO/Selector 进行网页抓取。它基本可以工作:我确实收到了 OP_CONNECT,然后发送了 GET 请求,并取回了整个 html 页面。但是,在那之后,我没有收到 -1,因此无法知道传输已经完成。我确实看到了 </html>,这意味着整个页面已经发送,但是 SocketChannel.read 没有返回 -1 来指示流的结束。非常感谢任何帮助!

这是整个示例代码:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.StandardSocketOptions;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.channels.SelectionKey;
import java.nio.channels.Selector;
import java.nio.channels.SocketChannel;
import java.util.Iterator;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Minimal non-blocking HTTP client built on a {@link Selector} and a single
 * {@link SocketChannel}.
 *
 * <p>The client connects, writes one GET request, and accumulates the raw
 * response bytes (status line, headers and body) until the server closes the
 * connection. The request carries {@code Connection: close}; without it an
 * HTTP/1.1 server keeps the connection alive after the response and
 * {@link SocketChannel#read(ByteBuffer)} never returns {@code -1}.
 *
 * <p>Instances are not thread-safe: the read buffer, the response accumulator
 * and the pending request are plain mutable fields.
 */
public class HttpClientTest {
    private static final Logger logger = LoggerFactory.getLogger(HttpClientTest.class);
    private static final String BASE_URL_STR = "https://www.youtube.com/channel";
    private static final String CHANNEL_ID = "UCDm6kPZFCoT7altG4WNGy-A";

    /** Size used for the read buffer and for the socket send/receive buffers. */
    private static final int BUFFER_SIZE = 128 * 1024;

    /** Raw bytes of the response received so far. */
    private final ByteArrayOutputStream baHtmlPage = new ByteArrayOutputStream();

    /** Scratch buffer reused for every read; backed by an accessible array. */
    private final ByteBuffer buffer = ByteBuffer.allocate(BUFFER_SIZE);

    /**
     * Bytes of the request that still have to be written, or {@code null} when
     * no write is in progress. Kept across OP_WRITE events so that a partial
     * write resumes where it stopped instead of re-sending the whole request.
     */
    private ByteBuffer pendingRequest = null;

    /** Complete response decoded as UTF-8; set once end-of-stream is reached. */
    private String htmlPage = null;

    /**
     * Runs the whole exchange: connect, send the request, read until the
     * server closes the connection, then return.
     *
     * <p>I/O failures are logged and end the exchange; they are not rethrown.
     *
     * @throws InterruptedException if the calling thread is interrupted while
     *         waiting for I/O readiness
     */
    private void startHttpClient() throws InterruptedException {
        try (Selector selector = Selector.open();
                SocketChannel socketChannel = SocketChannel.open()) {

            if (!socketChannel.isOpen() || !selector.isOpen()) {
                System.out
                        .println("The server socket channel or selector cannot be opened!");
                return;
            }

            socketChannel.configureBlocking(false);
            socketChannel.setOption(StandardSocketOptions.SO_RCVBUF, BUFFER_SIZE);
            socketChannel.setOption(StandardSocketOptions.SO_SNDBUF, BUFFER_SIZE);
            // TCP-level keep-alive probes; unrelated to HTTP keep-alive.
            socketChannel.setOption(StandardSocketOptions.SO_KEEPALIVE, true);

            baHtmlPage.reset();
            pendingRequest = null;

            // A non-blocking connect may complete immediately (e.g. loopback);
            // in that case there is nothing to finish and we can write at once.
            boolean connected = socketChannel.connect(createSocketAddress(CHANNEL_ID));
            socketChannel.register(selector,
                    connected ? SelectionKey.OP_WRITE : SelectionKey.OP_CONNECT);

            boolean finished = false;
            while (!finished) {
                // Block until the channel is ready instead of polling and sleeping.
                selector.select();

                // select() returns early with the interrupt flag set when the
                // thread is interrupted; surface that to the caller.
                if (Thread.interrupted()) {
                    throw new InterruptedException("interrupted while waiting for I/O events");
                }

                Iterator<SelectionKey> keys = selector.selectedKeys().iterator();
                while (keys.hasNext() && !finished) {
                    SelectionKey key = keys.next();

                    // The selected-key set is never cleared by the selector itself.
                    keys.remove();

                    if (!key.isValid()) {
                        continue;
                    }

                    if (key.isConnectable() && socketChannel.finishConnect()) {
                        System.out.println("Key: OP_CONNECT");
                        // Connected --> send the HTTP request next.
                        key.interestOps(SelectionKey.OP_WRITE);

                    } else if (key.isReadable()) {
                        System.out.println("Key: OP_READ");
                        if (readResponse(key)) {
                            logger.info("finished reading, htmlpage:{}", htmlPage);
                            // Channel is closed and the key cancelled: nothing
                            // else will ever be selected, so leave the loop.
                            finished = true;
                        } else {
                            key.interestOps(SelectionKey.OP_READ);
                        }

                    } else if (key.isWritable()) {
                        System.out.println("Key: OP_WRITE");
                        if (writeHttpRequest(key)) {
                            // Request fully sent --> wait for the response.
                            key.interestOps(SelectionKey.OP_READ);
                        }
                    }
                }
            }
        } catch (IOException ex) {
            logger.error("HTTP exchange failed", ex);
        }
    }

    /**
     * Builds the address to connect to.
     *
     * <p>NOTE(review): the target is hard-coded to {@code http://www.google.com}
     * and {@code channelID} is currently unused, while the request written by
     * {@link #writeHttpRequest(SelectionKey)} names {@code www.youtube.com} in
     * its Host header. Plain sockets cannot speak HTTPS, so switching to
     * {@code BASE_URL_STR} would additionally require TLS.
     *
     * @param channelID channel identifier; not used by the current implementation
     * @return the resolved host and port (port 80 when the URL has none)
     * @throws MalformedURLException if the URL string cannot be parsed
     */
    private static InetSocketAddress createSocketAddress(String channelID) throws MalformedURLException {
        String urlStr = "http://www.google.com";

        URL url = new URL(urlStr);
        String host = url.getHost();
        int port = url.getPort();
        if (port == -1) {
            port = 80;
        }

        return new InetSocketAddress(host, port);
    }

    /**
     * Drains everything currently readable from the channel into
     * {@code baHtmlPage}.
     *
     * @param key selection key whose channel is the connected {@link SocketChannel}
     * @return {@code true} once end-of-stream was reached (the channel is then
     *         closed and {@code htmlPage} populated); {@code false} when more
     *         data may still arrive
     * @throws IOException if reading from or closing the channel fails
     */
    private boolean readResponse(SelectionKey key) throws IOException {
        SocketChannel socketChannel = (SocketChannel) key.channel();

        int numRead;
        do {
            buffer.clear();
            numRead = socketChannel.read(buffer);

            // read() yields 0 when nothing is available and -1 at end-of-stream;
            // neither is a valid length for write()/new String().
            if (numRead > 0) {
                baHtmlPage.write(buffer.array(), 0, numRead);
                System.out.println("Server sent:" + new String(buffer.array(), 0, numRead, "UTF-8"));
            }
        } while (numRead > 0);

        if (numRead != -1) {
            return false;
        }

        System.out.println("Connection closed by: " + socketChannel.getRemoteAddress());
        key.cancel();
        socketChannel.close();
        htmlPage = baHtmlPage.toString("UTF-8");
        return true;
    }

    /**
     * Writes the GET request, resuming a previous partial write if there is one.
     *
     * @param key selection key whose channel is the connected {@link SocketChannel}
     * @return {@code true} when the entire request has been written;
     *         {@code false} when the socket buffer filled up and the remainder
     *         must be written on the next OP_WRITE event
     * @throws IOException if writing to the channel fails
     */
    private boolean writeHttpRequest(SelectionKey key) throws IOException {
        SocketChannel socketChannel = (SocketChannel) key.channel();
        // "Connection: close" makes the server close the socket after the
        // response, which is what lets readResponse() observe end-of-stream.
        String request =
                "GET /channel/UCDm6kPZFCoT7altG4WNGy-A HTTP/1.1\r\n" +
                "Host: www.youtube.com\r\n" +
                "Cache-Control: no-cache\r\n" +
                "Connection: close\r\n\r\n";

        if (pendingRequest == null) {
            pendingRequest = ByteBuffer.wrap(request.getBytes("UTF-8"));
        }

        socketChannel.write(pendingRequest);
        if (pendingRequest.hasRemaining()) {
            return false;
        }

        pendingRequest = null;
        System.out.printf("Request written:%s\n", request);
        return true;
    }

    public static void main(String[] args) throws InterruptedException {
        HttpClientTest client = new HttpClientTest();
        client.startHttpClient();
    }
}
4

1 回答 1

2

您正在执行 HTTP/1.1 请求,而 HTTP/1.1 默认隐式启用 keep-alive(持久连接)。这意味着,一旦发送了完整的响应,服务器不必关闭连接,而是会将其保持打开一段时间,以期收到更多请求,从而节省再次建立 TCP 连接的开销。

虽然这有助于提高浏览器正常情况下的性能,但对您的情况没有帮助。我建议使用 HTTP/1.0 而不是 HTTP/1.1,这样您就不必处理 keep-alive 或其他 HTTP/1.1 功能,如分块编码。除此之外,建议使用已经处理所有这些问题的现有 HTTP 库。

于 2014-09-12T04:38:18.793 回答