I have set up a Filter to add crawler support to my GWT web application. The idea is to catch every request that contains "_escaped_fragment_=" and serve the crawler a pre-rendered HTML snapshot.

The Filter is registered with Guice as follows:
filter("/*").through(CrawlerFilter.class);
Here is the code for the CrawlerFilter class (with many thanks to Patrick):
@Singleton
public class CrawlerFilter implements Filter {

    private static final Logger logger = Logger.getLogger(CrawlerFilter.class.getName());

    /**
     * Special URL token that gets passed from the crawler to the servlet
     * filter. This token is used in case there are already existing query
     * parameters.
     */
    private static final String ESCAPED_FRAGMENT_FORMAT1 = "_escaped_fragment_=";
    private static final int ESCAPED_FRAGMENT_LENGTH1 = ESCAPED_FRAGMENT_FORMAT1.length();

    /**
     * Special URL token that gets passed from the crawler to the servlet
     * filter. This token is used in case there are not already existing query
     * parameters.
     */
    private static final String ESCAPED_FRAGMENT_FORMAT2 = "&" + ESCAPED_FRAGMENT_FORMAT1;
    private static final int ESCAPED_FRAGMENT_LENGTH2 = ESCAPED_FRAGMENT_FORMAT2.length();

    private class SyncAllAjaxController extends NicelyResynchronizingAjaxController {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) {
            return true;
        }
    }

    private WebClient webClient = null;

    private static final long _pumpEventLoopTimeoutMillis = 30000;
    private static final long _jsTimeoutMillis = 1000;
    private static final long _pageWaitMillis = 200;
    final int _maxLoopChecks = 2;

    public void doFilter(ServletRequest request, ServletResponse response, FilterChain filterChain)
            throws IOException, ServletException {
        // Grab the request uri and query strings.
        final HttpServletRequest httpRequest = (HttpServletRequest) request;
        final String requestURI = httpRequest.getRequestURI();
        final String queryString = httpRequest.getQueryString();
        final HttpServletResponse httpResponse = (HttpServletResponse) response;

        if ((queryString != null) && (queryString.contains(ESCAPED_FRAGMENT_FORMAT1))) {
            // This is a Googlebot crawler request, let's return a static
            // indexable html page post javascript execution, as rendered in the browser.
            final String domain = httpRequest.getServerName();
            final int port = httpRequest.getServerPort();

            // Rewrite the URL back to the original #! version
            // -- basically remove _escaped_fragment_ from the query.
            // Unescape any %XX characters as need be.
            final String urlStringWithHashFragment = requestURI + rewriteQueryString(queryString);
            final String scheme = httpRequest.getScheme();
            final URL urlWithHashFragment = new URL(scheme, "127.0.0.1", port, urlStringWithHashFragment); // get from localhost
            final WebRequest webRequest = new WebRequest(urlWithHashFragment);

            // Use the headless browser to obtain an HTML snapshot.
            webClient = new WebClient(BrowserVersion.FIREFOX_3_6);
            webClient.getCache().clear();
            webClient.setJavaScriptEnabled(true);
            webClient.setThrowExceptionOnScriptError(false);
            webClient.setRedirectEnabled(false);
            webClient.setAjaxController(new SyncAllAjaxController());
            webClient.setCssErrorHandler(new SilentCssErrorHandler());

            if (logger.getLevel() == Level.FINEST)
                logger.log(Level.FINEST, "HtmlUnit starting webClient.getPage(webRequest) where webRequest = "
                        + webRequest.toString());
            final HtmlPage page = webClient.getPage(webRequest);

            // Important! Give the headless browser enough time to execute JavaScript.
            // The exact time to wait may depend on your application.
            webClient.getJavaScriptEngine().pumpEventLoop(_pumpEventLoopTimeoutMillis);

            int waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(_jsTimeoutMillis);
            int loopCount = 0;
            while (waitForBackgroundJavaScript > 0 && loopCount < _maxLoopChecks) {
                ++loopCount;
                waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(_jsTimeoutMillis);
                if (waitForBackgroundJavaScript == 0) {
                    if (logger.getLevel() == Level.FINEST)
                        logger.log(Level.FINEST, "HtmlUnit exits background javascript at loop counter " + loopCount);
                    break;
                }
                synchronized (page) {
                    if (logger.getLevel() == Level.FINEST)
                        logger.log(Level.FINEST, "HtmlUnit waits for background javascript at loop counter "
                                + loopCount);
                    try {
                        page.wait(_pageWaitMillis);
                    }
                    catch (InterruptedException e) {
                        logger.log(Level.SEVERE, "HtmlUnit ERROR on page.wait at loop counter " + loopCount);
                        e.printStackTrace();
                    }
                }
            }
            webClient.getAjaxController().processSynchron(page, webRequest, false);
            if (webClient.getJavaScriptEngine().isScriptRunning()) {
                logger.log(Level.WARNING, "HtmlUnit webClient.getJavaScriptEngine().shutdownJavaScriptExecutor()");
                webClient.getJavaScriptEngine().shutdownJavaScriptExecutor();
            }

            // Return the static snapshot.
            final String staticSnapshotHtml = page.asXml();
            httpResponse.setContentType("text/html;charset=UTF-8");
            final PrintWriter out = httpResponse.getWriter();
            out.println("<hr />");
            out.println("<center><h3>This is a non-interactive snapshot for crawlers. Follow <a href=\"");
            out.println(urlWithHashFragment + "\">this link</a> for the interactive application.<br></h3></center>");
            out.println("<hr />");
            out.println(staticSnapshotHtml);

            // Close web client.
            webClient.closeAllWindows();

            out.println("");
            out.flush();
            out.close();

            if (logger.getLevel() == Level.FINEST)
                logger.log(Level.FINEST, "HtmlUnit completed webClient.getPage(webRequest) where webRequest = "
                        + webRequest.toString());
        }
        else {
            if (requestURI.contains(".nocache.")) {
                // Ensure the gwt nocache bootstrapping file is never cached.
                // References:
                // https://stackoverflow.com/questions/4274053/how-to-clear-cache-in-gwt
                // http://seewah.blogspot.com/2009/02/gwt-tips-2-nocachejs-getting-cached-in.html
                //
                final Date now = new Date();
                httpResponse.setDateHeader("Date", now.getTime());
                httpResponse.setDateHeader("Expires", now.getTime() - 86400000L); // One day old.
                httpResponse.setHeader("Pragma", "no-cache");
                httpResponse.setHeader("Cache-control", "no-cache, no-store, must-revalidate");
            }
            filterChain.doFilter(request, response);
        }
    }

    /**
     * Maps from the query string that contains _escaped_fragment_ to one that
     * doesn't, but is instead followed by a hash fragment. It also unescapes
     * any characters that were escaped by the crawler. If the query string does
     * not contain _escaped_fragment_, it is not modified.
     *
     * @param queryString
     * @return A modified query string followed by a hash fragment if
     *         applicable. The non-modified query string otherwise.
     * @throws UnsupportedEncodingException
     */
    private static String rewriteQueryString(String queryString) throws UnsupportedEncodingException {
        // Seek the escaped fragment.
        int index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT2);
        int length = ESCAPED_FRAGMENT_LENGTH2;
        if (index == -1) {
            index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT1);
            length = ESCAPED_FRAGMENT_LENGTH1;
        }
        if (index != -1) {
            // Found the escaped fragment, so build back the original decoded one.
            final StringBuilder queryStringSb = new StringBuilder();
            // Add url parameters if any.
            if (index > 0) {
                queryStringSb.append("?");
                queryStringSb.append(queryString.substring(0, index));
            }
            // Add the hash fragment as a replacement for the escaped fragment.
            queryStringSb.append("#!");
            // Add the decoded token.
            final String token2Decode = queryString.substring(index + length, queryString.length());
            final String tokenDecoded = URLDecoder.decode(token2Decode, "UTF-8");
            queryStringSb.append(tokenDecoded);
            return queryStringSb.toString();
        }
        return queryString;
    }

    @Override
    public void destroy() {
        if (webClient != null)
            webClient.closeAllWindows();
    }

    @Override
    public void init(FilterConfig config) throws ServletException {
    }
}
It uses HtmlUnit to create the snapshot.
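
Boiled down, the snapshot step amounts to something like this standalone driver (the URL, port and timeout here are placeholders for wherever the application is served, not values taken from the filter):

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.SilentCssErrorHandler;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

// Standalone driver for the snapshot step, outside the servlet container.
public class SnapshotDriver {
    public static void main(String[] args) throws Exception {
        final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_3_6);
        webClient.setJavaScriptEnabled(true);
        webClient.setThrowExceptionOnScriptError(false);
        webClient.setCssErrorHandler(new SilentCssErrorHandler());
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());

        // Fetch the #! version of the page, as the filter does internally.
        final HtmlPage page = webClient.getPage("http://127.0.0.1:8080/#!myobject;id=507ac730e4b0e2b7a73b1b81");
        // Give background JavaScript some time to finish.
        webClient.waitForBackgroundJavaScript(1000);

        // Print the rendered snapshot.
        System.out.println(page.asXml());
        webClient.closeAllWindows();
    }
}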
However, when I try to access the snapshot with a regular browser, I get an error. The URL I enter has this format:
http://www.myapp.com/?_escaped_fragment_=myobject%3Bid%3D507ac730e4b0e2b7a73b1b81
But when the Filter processes it, the result is the following error:
Proxy Error
The proxy server received an invalid response from an upstream server.
The proxy server could not handle the request GET /.
Reason: Error reading from remote server
Apache/2.2.22 (Amazon) Server at www.myapp.com Port 80
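
In case it helps narrow things down, the query-string rewriting can be checked in isolation with a throwaway copy of rewriteQueryString; for the query string above it should produce #!myobject;id=507ac730e4b0e2b7a73b1b81, so I believe the rewriting step itself is not the problem:

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;

// Throwaway copy of the filter's rewriteQueryString, to check the rewriting
// in isolation. Class and method names are arbitrary.
public class RewriteCheck {
    private static final String ESCAPED_FRAGMENT_FORMAT1 = "_escaped_fragment_=";
    private static final String ESCAPED_FRAGMENT_FORMAT2 = "&" + ESCAPED_FRAGMENT_FORMAT1;

    static String rewriteQueryString(String queryString) throws UnsupportedEncodingException {
        int index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT2);
        int length = ESCAPED_FRAGMENT_FORMAT2.length();
        if (index == -1) {
            index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT1);
            length = ESCAPED_FRAGMENT_FORMAT1.length();
        }
        if (index == -1) {
            return queryString;
        }
        final StringBuilder sb = new StringBuilder();
        if (index > 0) {
            sb.append("?").append(queryString.substring(0, index));
        }
        sb.append("#!").append(URLDecoder.decode(queryString.substring(index + length), "UTF-8"));
        return sb.toString();
    }

    public static void main(String[] args) throws Exception {
        // Prints "#!myobject;id=507ac730e4b0e2b7a73b1b81".
        System.out.println(rewriteQueryString("_escaped_fragment_=myobject%3Bid%3D507ac730e4b0e2b7a73b1b81"));
    }
}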
Any help would be greatly appreciated.