0

我的 ServletContextListener:

@WebListener
public class RunServlet implements ServletContextListener {

    private ScheduledExecutorService scheduler;

    @Override
    public void contextInitialized(ServletContextEvent event) {
        System.out.println("ready");
        scheduler = Executors.newScheduledThreadPool(10);
        scheduler.execute(new RunThread("http://stackoverflow.com"));
    }

    @Override
    public void contextDestroyed(ServletContextEvent event) {
        scheduler.shutdownNow();
        System.out.println("removed");
    }
}

实现 Runnable 的类是 RunThread(从网页中获取所有链接,点击链接,解析网页并使用 jsoup 和 hibernate 将单词保存到数据库中):

public class RunThread implements Runnable{
    private Document html;
    private String url;

    private static final int threads_num = Runtime.getRuntime().availableProcessors()*4;
    private int links = 0;
    private int alinks = 0;

    public RunThread(String url){
        this.url = url;
        try {
            this.html = Jsoup.connect(url).get();
            this.links = html.select("a[href]").size();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }


    @Override
    public void run() {
        if(alinks != links){
            Elements collectedLinks = html.select("a[href]");
            ExecutorService executor = Executors.newFixedThreadPool(threads_num);
            for(Element link:collectedLinks){
                if(alinks == links) break;
                else{
                    String current = link.attr("abs:href");
                    if(!current.equals(url) && current.startsWith(url)&& !current.contains("#")){
                        executor.execute(new RunThread(current));
                        alinks++;
                    }
                }
            }
        }
        AnalyzePage(html, url);
    }

    private void AnalyzePage(Document doc,String url){
        String text = doc.body().text();
        SaveTextToDB(text,url);
    }

    public void SaveTextToDB(String text, String link){
        TreeMap<String, Integer> frequencyMap = new TreeMap<String, Integer>();
        StringTokenizer parser = 
            new StringTokenizer(text.replaceAll("[0-9]+","").replaceAll("[^a-zA-Zа-яА-Я]-[^a-zA-Zа-яА-Я]", " "), " \t\n\r\f.,;:!?%#+№/<←→↓@'\"—«»©“\\(\\)");
        while (parser.hasMoreTokens()) {
            String currentWord = parser.nextToken();
            Integer frequency = frequencyMap.get(currentWord);
        if (frequency == null) {
            frequency = 0;
        }
        frequencyMap.put(currentWord, frequency + 1);
    }

    for (Map.Entry<String,Integer> entry : frequencyMap.entrySet()){
        Indexation word = new Indexation();
        IndexationPK pk = new IndexationPK();
        pk.setLink(link);
        pk.setWord(entry.getKey());
        word.setFrequency(entry.getValue());
        word.setIndexationPK(pk);
        IndexationDAO indDAO = new IndexationDAOImpl();
        indDAO.AddRecord(word);
    }}
}

我收到下一个错误:

java.net.SocketTimeoutException: Read timed out
    at java.net.SocketInputStream.socketRead0(Native Method)
    at java.net.SocketInputStream.read(SocketInputStream.java:129)
    at java.io.BufferedInputStream.fill(BufferedInputStream.java:218)
    at java.io.BufferedInputStream.read1(BufferedInputStream.java:258)
    at java.io.BufferedInputStream.read(BufferedInputStream.java:317)
    at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:695)
    at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:640)
    at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1195)
    at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:379)
    at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:381)
    at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:364)
    at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:143)
    at org.jsoup.helper.HttpConnection.get(HttpConnection.java:132)
    at com.mstu.service.RunThread.<init>(RunThread.java:35)
    at com.mstu.service.RunThread.run(RunThread.java:53)
    at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
    at java.lang.Thread.run(Thread.java:662)
Exception in thread "pool-9-thread-1" java.lang.NullPointerException
    at com.mstu.service.RunThread.AnalyzePage(RunThread.java:63)
    at com.mstu.service.RunThread.run(RunThread.java:59)
    at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
    at java.lang.Thread.run(Thread.java:662)
java.net.SocketTimeoutException: Read timed out
    at java.net.SocketInputStream.socketRead0(Native Method)
    at java.net.SocketInputStream.read(SocketInputStream.java:129)
    at java.io.BufferedInputStream.fill(BufferedInputStream.java:218)
    at java.io.BufferedInputStream.read1(BufferedInputStream.java:258)
    at java.io.BufferedInputStream.read(BufferedInputStream.java:317)
    at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:695)
    at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:640)
    at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1195)
    at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:379)
    at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:381)
    at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:364)
    at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:143)
    at org.jsoup.helper.HttpConnection.get(HttpConnection.java:132)
    at com.mstu.service.RunThread.<init>(RunThread.java:35)
    at com.mstu.service.RunThread.run(RunThread.java:53)
    at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
    at java.lang.Thread.run(Thread.java:662)
java.net.SocketTimeoutException: Read timed out

怎么了?请帮我。

4

1 回答 1

1

我怀疑这与您的线程无关,但是

 this.html = Jsoup.connect(url).get();

无法连接到您给定的 URL。这是否作为独立组件运行?值得测试这个独立的(将其拉出Runnable并使其成为一个可以轻松测试的独立类),并检查是否需要设置 HTTP 代理等。

(我可能会在run()方法中运行上述内容,而不是在对象创建中获取连接并挂在它上面,直到执行程序准备好运行它)

于 2012-04-10T19:27:34.843 回答