0

我有一个需要针对 seo 优化的 gwt 应用程序(为 google 抓取内容),并且我一直在尝试许多无法满足我们需求的解决方案(返回 html 页面需要我们大量时间),试验是:

  1. 我尝试使用 HtmlUnit 作为无头浏览器来按需抓取页面,获取 html 内容大约需要 15 秒(在审核这个时间时,结果是这个时间的 80% 是由等待后台 JavaScript 的循环 "while (waitForBackgroundJavaScript > 0 && loopCount < _maxLoopChecks)" 占用的)
  2. 一种技术,包括在 google 请求之前预先抓取页面,然后在 google 请求时提供保存的快照(但这种解决方案绝对不方便,因为内容更改非常频繁,google 可能会将其视为“伪装”(cloaking))

有什么建议吗?

用于爬取的代码:

/**
 * Servlet filter implementing Google's AJAX-crawling scheme: when a request
 * carries the {@code _escaped_fragment_} query parameter (sent by the crawler
 * in place of a {@code #!} hash fragment), the filter renders the page with
 * HtmlUnit and returns a static HTML snapshot; all other requests pass through
 * untouched.
 */
public class CrawlFilter implements Filter {

    /**
     * AJAX controller that forces every XHR HtmlUnit encounters to run
     * synchronously, so the rendered snapshot includes the results of all
     * asynchronous calls. Declared static: it needs no enclosing-instance
     * reference.
     */
    private static class SyncAllAjaxController extends NicelyResynchronizingAjaxController {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) {
            return true;
        }
    }

    private final Logger log = Logger.getLogger(CrawlFilter.class.getName());

    /**
     * Special URL token that gets passed from the crawler to the servlet
     * filter. This token is used in case there are already existing query
     * parameters.
     */
    private static final String ESCAPED_FRAGMENT_FORMAT1 = "_escaped_fragment_=";
    private static final int ESCAPED_FRAGMENT_LENGTH1 = ESCAPED_FRAGMENT_FORMAT1.length();
    /**
     * Special URL token that gets passed from the crawler to the servlet
     * filter. This token is used in case there are not already existing query
     * parameters.
     */
    private static final String ESCAPED_FRAGMENT_FORMAT2 = "&" + ESCAPED_FRAGMENT_FORMAT1;
    private static final int ESCAPED_FRAGMENT_LENGTH2 = ESCAPED_FRAGMENT_FORMAT2.length();

    // Upper bound for pumping the HtmlUnit JavaScript event loop.
    private static final long _pumpEventLoopTimeoutMillis = 30000;
    // Per-call timeout handed to waitForBackgroundJavaScript().
    private static final long _jsTimeoutMillis = 1000;
    // Pause between polls of the background-JavaScript job count.
    private static final long _pageWaitMillis = 200;
    // Maximum number of extra polling iterations before giving up.
    private static final int _maxLoopChecks = 2;

    /**
     * Intercepts crawler requests (identified by the presence of
     * {@code _escaped_fragment_} in the query string), renders the
     * corresponding {@code #!} URL with HtmlUnit against the local server, and
     * writes the resulting static HTML to the response. Non-crawler requests
     * continue down the filter chain unchanged.
     */
    @Override
    public void doFilter(ServletRequest request, ServletResponse response,
                         FilterChain filterChain) throws IOException, ServletException {
        // Grab the request uri and query strings.
        final HttpServletRequest httpRequest = (HttpServletRequest) request;
        final String requestURI = httpRequest.getRequestURI();
        final String queryString = httpRequest.getQueryString();
        final HttpServletResponse httpResponse = (HttpServletResponse) response;

        if ((queryString != null) && (queryString.contains(ESCAPED_FRAGMENT_FORMAT1))) {
            final int port = httpRequest.getServerPort();
            final String urlStringWithHashFragment = requestURI + rewriteQueryString(queryString);
            final String scheme = httpRequest.getScheme();
            // Fetch from the loopback interface: the filter re-requests its own
            // application so HtmlUnit can execute the GWT JavaScript.
            final URL urlWithHashFragment = new URL(scheme, "127.0.0.1", port, urlStringWithHashFragment);
            final WebRequest webRequest = new WebRequest(urlWithHashFragment);

            log.fine("Crawl filter encountered escaped fragment, will open: " + webRequest.toString());

            httpResponse.setContentType("text/html;charset=UTF-8");
            final PrintWriter out = httpResponse.getWriter();
            out.println(renderPage(webRequest));
            out.flush();
            out.close();

            log.fine("HtmlUnit completed webClient.getPage(webRequest) where webRequest = " + webRequest.toString());
        } else {
            filterChain.doFilter(request, response);
        }
    }

    @Override
    public void destroy() {
        // Nothing to release: each request creates and closes its own
        // WebClient inside renderPage(). (Previously a shared WebClient field
        // was created per request, which was racy under concurrent requests
        // and leaked every client except the last.)
    }

    @Override
    public void init(FilterConfig config) throws ServletException {
    }

    /**
     * Renders the given request with HtmlUnit, waiting (bounded) for
     * background JavaScript to finish, and returns the page serialized as
     * static HTML prefixed with a banner linking to the interactive app.
     *
     * @param webRequest the URL (with restored {@code #!} fragment) to render
     * @return the snapshot HTML
     * @throws IOException if HtmlUnit fails to fetch the page
     */
    private StringBuilder renderPage(WebRequest webRequest) throws IOException {
        // A WebClient is not thread-safe; create one per request and always
        // close it so window/JS resources are released even on failure.
        final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17);
        try {
            webClient.getCache().clear();
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setRedirectEnabled(false);
            webClient.setAjaxController(new SyncAllAjaxController());
            webClient.setCssErrorHandler(new SilentCssErrorHandler());

            final HtmlPage page = webClient.getPage(webRequest);
            webClient.getJavaScriptEngine().pumpEventLoop(_pumpEventLoopTimeoutMillis);

            // waitForBackgroundJavaScript returns the number of jobs still
            // pending; poll a bounded number of times rather than forever.
            int pendingJobs = webClient.waitForBackgroundJavaScript(_jsTimeoutMillis);
            int loopCount = 0;

            while (pendingJobs > 0 && loopCount < _maxLoopChecks) {
                ++loopCount;
                pendingJobs = webClient.waitForBackgroundJavaScript(_jsTimeoutMillis);

                if (pendingJobs == 0) {
                    log.fine("HtmlUnit exits background javascript at loop counter " + loopCount);
                    break;
                }

                synchronized (page) {
                    log.fine("HtmlUnit waits for background javascript at loop counter " + loopCount);
                    try {
                        page.wait(_pageWaitMillis);
                    } catch (InterruptedException e) {
                        // Restore the interrupt flag so callers can observe it.
                        Thread.currentThread().interrupt();
                        log.log(Level.SEVERE, "HtmlUnit ERROR on page.wait at loop counter " + loopCount, e);
                    }
                }
            }

            webClient.getAjaxController().processSynchron(page, webRequest, false);
            if (webClient.getJavaScriptEngine().isScriptRunning()) {
                log.warning("HtmlUnit webClient.getJavaScriptEngine().shutdownJavaScriptExecutor()");
                webClient.getJavaScriptEngine().shutdownJavaScriptExecutor();
            }

            final String staticSnapshotHtml = page.asXml();
            StringBuilder stringBuilder = new StringBuilder();
            stringBuilder.append("<hr />\n");
            stringBuilder.append("<center><h3>This is a non-interactive snapshot for crawlers. Follow <a href=\"");
            stringBuilder.append(webRequest.getUrl());
            stringBuilder.append("\">this link</a> for the interactive application.<br></h3></center>");
            stringBuilder.append("<hr />");
            stringBuilder.append(staticSnapshotHtml);

            return stringBuilder;
        } finally {
            webClient.closeAllWindows();
        }
    }

    /**
     * Maps from the query string that contains _escaped_fragment_ to one that
     * doesn't, but is instead followed by a hash fragment. It also unescapes any
     * characters that were escaped by the crawler. If the query string does not
     * contain _escaped_fragment_, it is not modified.
     *
     * @param queryString the raw query string from the crawler request
     * @return A modified query string followed by a hash fragment if applicable.
     *         The non-modified query string otherwise.
     * @throws UnsupportedEncodingException if UTF-8 is unsupported (never on a
     *         conforming JVM)
     */
    private static String rewriteQueryString(String queryString) throws UnsupportedEncodingException {
        // Prefer the "&_escaped_fragment_=" form so any preceding real query
        // parameters are preserved ahead of the restored fragment.
        int index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT2);
        int length = ESCAPED_FRAGMENT_LENGTH2;

        if (index == -1) {
            index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT1);
            length = ESCAPED_FRAGMENT_LENGTH1;
        }

        if (index != -1) {
            StringBuilder queryStringSb = new StringBuilder();
            if (index > 0) {
                queryStringSb.append("?");
                queryStringSb.append(queryString.substring(0, index));
            }
            queryStringSb.append("#!");
            queryStringSb.append(URLDecoder.decode(queryString.substring(index
                    + length, queryString.length()), "UTF-8"));
            return queryStringSb.toString();
        }

        return queryString;
    }
}
4

1 回答 1

0

我建议让 HtmlUnit 离线生成静态 html。您可以控制更新频率。

然后,让您的 servlet 过滤器拦截爬虫请求返回已经生成的静态 html。

于 2013-11-04T03:43:19.610 回答