3

我已经设置了一个Filter为我的 GWT Web 应用程序添加爬虫支持。这个想法是捕获所有包含“ _escaped_fragment_=”的请求并为爬虫提供快照。

我使用 Guice 按如下方式配置了这个 Filter:

filter("/*").through(CrawlerFilter.class);

以下是 CrawlerFilter 类的代码(非常感谢 Patrick):

/**
 * Servlet filter that serves static HTML snapshots to crawlers.
 *
 * <p>Requests whose query string contains {@code _escaped_fragment_=} are
 * rewritten back to their {@code #!} form, loaded in a headless HtmlUnit
 * browser against the local server, and the resulting DOM is written to the
 * response. All other requests pass down the filter chain; GWT
 * {@code .nocache.} resources additionally get cache-disabling headers.
 *
 * <p>The filter is bound as a singleton, so it keeps no per-request state in
 * fields: every snapshot uses its own {@link WebClient}, which is always closed
 * when rendering finishes or fails.
 */
@Singleton
public class CrawlerFilter implements Filter {
    private static final Logger logger = Logger.getLogger(CrawlerFilter.class.getName());

    /**
     * Special URL token that gets passed from the crawler to the servlet
     * filter. This form (no leading {@code &}) is what appears when the token
     * is the first thing in the query string, i.e. there are no other query
     * parameters before it.
     */
    private static final String ESCAPED_FRAGMENT_FORMAT1 = "_escaped_fragment_=";
    private static final int ESCAPED_FRAGMENT_LENGTH1 = ESCAPED_FRAGMENT_FORMAT1.length();
    /**
     * Special URL token that gets passed from the crawler to the servlet
     * filter. This form (with a leading {@code &}) is what appears when the
     * token follows already existing query parameters.
     */
    private static final String ESCAPED_FRAGMENT_FORMAT2 = "&" + ESCAPED_FRAGMENT_FORMAT1;
    private static final int ESCAPED_FRAGMENT_LENGTH2 = ESCAPED_FRAGMENT_FORMAT2.length();

    /** Timeout, in milliseconds, passed to the JavaScript engine's event-loop pump. */
    // NOTE(review): a front-end proxy with a shorter upstream timeout may give up
    // before the snapshot is ready -- confirm against the proxy configuration.
    private static final long PUMP_EVENT_LOOP_TIMEOUT_MILLIS = 30000;
    /** Timeout, in milliseconds, for each wait on background JavaScript. */
    private static final long JS_TIMEOUT_MILLIS = 1000;
    /** Pause, in milliseconds, between two background JavaScript checks. */
    private static final long PAGE_WAIT_MILLIS = 200;
    /** Maximum number of additional background JavaScript checks. */
    private static final int MAX_LOOP_CHECKS = 2;

    /**
     * Ajax controller that reports every request as one to be processed
     * synchronously. Static because it needs nothing from the enclosing filter.
     */
    private static class SyncAllAjaxController extends NicelyResynchronizingAjaxController {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) {
            return true;
        }
    }

    /**
     * Serves an HTML snapshot for crawler requests and delegates everything
     * else to the rest of the chain.
     *
     * @param request the incoming request; cast to {@link HttpServletRequest}
     * @param response the outgoing response; cast to {@link HttpServletResponse}
     * @param filterChain the remaining filters/servlet for non-crawler requests
     * @throws IOException if loading the page or writing the response fails
     * @throws ServletException if the downstream chain fails
     */
    @Override
    public void doFilter(ServletRequest request, ServletResponse response, FilterChain filterChain) throws IOException,
            ServletException {
        // Grab the request uri and query strings.
        final HttpServletRequest httpRequest = (HttpServletRequest) request;
        final HttpServletResponse httpResponse = (HttpServletResponse) response;
        final String requestURI = httpRequest.getRequestURI();
        final String queryString = httpRequest.getQueryString();

        if (queryString == null || !queryString.contains(ESCAPED_FRAGMENT_FORMAT1)) {
            if (requestURI.contains(".nocache.")) {
                applyNoCacheHeaders(httpResponse);
            }
            filterChain.doFilter(request, response);
            return;
        }

        // This is a crawler request: return a static, indexable html page
        // post javascript execution, as rendered in the browser.

        // Rewrite the URL back to the original #! version
        // -- basically remove _escaped_fragment_ from the query.
        final String pathWithHashFragment = requestURI + rewriteQueryString(queryString);
        final String scheme = httpRequest.getScheme();
        final int port = httpRequest.getServerPort();

        // The headless browser fetches the page from this very server over
        // loopback; the link shown in the snapshot must use the host name the
        // client asked for, otherwise visitors would be sent to 127.0.0.1.
        final URL loopbackUrl = new URL(scheme, "127.0.0.1", port, pathWithHashFragment);
        final URL publicUrl = new URL(scheme, httpRequest.getServerName(), port, pathWithHashFragment);

        final String staticSnapshotHtml = renderSnapshot(new WebRequest(loopbackUrl));
        writeSnapshot(httpResponse, publicUrl, staticSnapshotHtml);
    }

    /**
     * Loads the requested page in a fresh headless browser, lets its
     * JavaScript run, and returns the resulting DOM as XML.
     *
     * @param webRequest the request for the {@code #!} URL to render
     * @return the rendered page serialized with {@code HtmlPage.asXml()}
     * @throws IOException if the page cannot be loaded
     */
    private static String renderSnapshot(WebRequest webRequest) throws IOException {
        // One client per request: sharing a client between concurrent
        // requests would let them overwrite each other's state.
        final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_3_6);
        try {
            webClient.getCache().clear();
            webClient.setJavaScriptEnabled(true);
            webClient.setThrowExceptionOnScriptError(false);
            webClient.setRedirectEnabled(false);
            webClient.setAjaxController(new SyncAllAjaxController());
            webClient.setCssErrorHandler(new SilentCssErrorHandler());

            if (logger.isLoggable(Level.FINEST)) {
                logger.log(Level.FINEST, "HtmlUnit starting webClient.getPage(webRequest) where webRequest = "
                        + webRequest.toString());
            }
            final HtmlPage page = webClient.getPage(webRequest);

            // Important! Give the headless browser enough time to execute
            // JavaScript. The exact time to wait may depend on your application.
            webClient.getJavaScriptEngine().pumpEventLoop(PUMP_EVENT_LOOP_TIMEOUT_MILLIS);
            awaitBackgroundJavaScript(webClient, page);

            webClient.getAjaxController().processSynchron(page, webRequest, false);
            if (webClient.getJavaScriptEngine().isScriptRunning()) {
                logger.log(Level.WARNING, "HtmlUnit webClient.getJavaScriptEngine().shutdownJavaScriptExecutor()");
                webClient.getJavaScriptEngine().shutdownJavaScriptExecutor();
            }

            final String snapshotHtml = page.asXml();
            if (logger.isLoggable(Level.FINEST)) {
                logger.log(Level.FINEST, "HtmlUnit completed webClient.getPage(webRequest) where webRequest = "
                        + webRequest.toString());
            }
            return snapshotHtml;
        }
        finally {
            // Always release the browser windows, even when loading or
            // rendering failed.
            webClient.closeAllWindows();
        }
    }

    /**
     * Waits for background JavaScript, re-checking up to
     * {@link #MAX_LOOP_CHECKS} more times while the previous check still
     * reported a positive count. Stops early if the thread is interrupted.
     */
    private static void awaitBackgroundJavaScript(WebClient webClient, HtmlPage page) {
        int pendingJobs = webClient.waitForBackgroundJavaScript(JS_TIMEOUT_MILLIS);
        for (int loopCount = 1; pendingJobs > 0 && loopCount <= MAX_LOOP_CHECKS; loopCount++) {
            pendingJobs = webClient.waitForBackgroundJavaScript(JS_TIMEOUT_MILLIS);
            if (pendingJobs == 0) {
                if (logger.isLoggable(Level.FINEST)) {
                    logger.log(Level.FINEST, "HtmlUnit exits background javascript at loop counter " + loopCount);
                }
                return;
            }
            synchronized (page) {
                if (logger.isLoggable(Level.FINEST)) {
                    logger.log(Level.FINEST, "HtmlUnit waits for background javascript at loop counter "
                            + loopCount);
                }
                try {
                    page.wait(PAGE_WAIT_MILLIS);
                }
                catch (InterruptedException e) {
                    logger.log(Level.SEVERE, "HtmlUnit ERROR on page.wait at loop counter " + loopCount, e);
                    // Preserve the interrupt for the caller and stop waiting.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        }
    }

    /**
     * Writes the snapshot response: a short banner linking to the interactive
     * application, followed by the rendered page.
     *
     * @param httpResponse the response to write to
     * @param interactiveUrl the {@code #!} URL of the interactive application
     * @param staticSnapshotHtml the rendered page markup
     * @throws IOException if the response writer cannot be obtained
     */
    private static void writeSnapshot(HttpServletResponse httpResponse, URL interactiveUrl,
            String staticSnapshotHtml) throws IOException {
        httpResponse.setContentType("text/html;charset=UTF-8");
        final PrintWriter out = httpResponse.getWriter();
        out.println("<hr />");
        // print (not println) so that no line break ends up inside the href
        // value; the URL carries request-supplied text, so escape it.
        out.print("<center><h3>This is a non-interactive snapshot for crawlers. Follow <a href=\"");
        out.println(escapeHtml(interactiveUrl.toString())
                + "\">this link</a> for the interactive application.<br></h3></center>");
        out.println("<hr />");
        out.println(staticSnapshotHtml);
        out.println("");
        out.flush();
        out.close();
    }

    /**
     * Ensures the gwt nocache bootstrapping file is never cached.
     * References:
     * https://stackoverflow.com/questions/4274053/how-to-clear-cache-in-gwt
     * http://seewah.blogspot.com/2009/02/gwt-tips-2-nocachejs-getting-cached-in.html
     */
    private static void applyNoCacheHeaders(HttpServletResponse httpResponse) {
        final Date now = new Date();
        httpResponse.setDateHeader("Date", now.getTime());
        httpResponse.setDateHeader("Expires", now.getTime() - 86400000L); // One day old.
        httpResponse.setHeader("Pragma", "no-cache");
        httpResponse.setHeader("Cache-control", "no-cache, no-store, must-revalidate");
    }

    /**
     * Escapes the characters that are significant in HTML text and attribute
     * values.
     *
     * @param text the raw text
     * @return the text with {@code & < > " '} replaced by character references
     */
    private static String escapeHtml(String text) {
        final StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            final char c = text.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                case '\'':
                    escaped.append("&#39;");
                    break;
                default:
                    escaped.append(c);
            }
        }
        return escaped.toString();
    }

    /**
     * Maps from the query string that contains _escaped_fragment_ to one that
     * doesn't, but is instead followed by a hash fragment. It also unescapes
     * any characters that were escaped by the crawler. If the query string does
     * not contain _escaped_fragment_, it is not modified.
     *
     * @param queryString the raw query string of the crawler request
     * @return A modified query string followed by a hash fragment if
     *         applicable. The non-modified query string otherwise.
     * @throws UnsupportedEncodingException if UTF-8 decoding is unavailable
     */
    private static String rewriteQueryString(String queryString) throws UnsupportedEncodingException {
        // Seek the escaped fragment, preferring the form preceded by other
        // parameters so the separating '&' is dropped together with the token.
        int index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT2);
        int length = ESCAPED_FRAGMENT_LENGTH2;
        if (index == -1) {
            index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT1);
            length = ESCAPED_FRAGMENT_LENGTH1;
        }
        if (index == -1) {
            return queryString;
        }

        // Found the escaped fragment, so build back the original decoded one.
        final StringBuilder queryStringSb = new StringBuilder();
        // Add url parameters if any.
        if (index > 0) {
            queryStringSb.append("?");
            queryStringSb.append(queryString.substring(0, index));
        }
        // Add the hash fragment as a replacement for the escaped fragment.
        queryStringSb.append("#!");
        // Add the decoded token.
        final String token2Decode = queryString.substring(index + length);
        queryStringSb.append(URLDecoder.decode(token2Decode, "UTF-8"));
        return queryStringSb.toString();
    }

    @Override
    public void destroy() {
        // Nothing to release: each request closes its own WebClient.
    }

    @Override
    public void init(FilterConfig config) throws ServletException {
    }

}

它使用 HtmlUnit 创建快照。

然而,当我尝试用普通浏览器访问快照时,出现了错误。我输入的 URL 格式如下:

http://www.myapp.com/?_escaped_fragment_=myobject%3Bid%3D507ac730e4b0e2b7a73b1b81

但是请求经过该 Filter 处理后,返回了以下错误:

Proxy Error

The proxy server received an invalid response from an upstream server.
The proxy server could not handle the request GET /.

Reason: Error reading from remote server

Apache/2.2.22 (Amazon) Server at www.myapp.com Port 80

任何帮助,将不胜感激。

4

0 回答 0