我编写了一个网络爬虫。但是在抓取时，它下载的数据量过大（多达数 GB）。
我只想获取文本内容（不下载图片等其他资源）。
我使用 Boilerpipe 从 HTML 中提取内容。
这是我用来获取最终重定向网址的方法：
/**
 * Follows HTTP redirects manually, starting at {@code url}, and returns the
 * last URL reached.
 *
 * <p>Each hop is requested with automatic redirect handling disabled. The
 * walk stops at the first response whose status is not 3xx, or at a 3xx
 * response that carries no {@code Location} header. The response body is
 * never read, and every connection is disconnected before the next hop is
 * opened.
 *
 * @param url the URL to start from
 * @return the URL of the last hop reached
 * @throws java.net.ProtocolException if more than
 *         {@code Config.MAX_REDIRECT_COUNT} redirects are encountered
 * @throws IOException if a URL is malformed or a request fails
 */
public String getFinalRedirectedUrl(String url) throws IOException {
    String finalUrl = url;
    int redirectCount = 0;
    while (true) {
        final URL current = new URL(finalUrl);
        final HttpURLConnection connection = (HttpURLConnection) current.openConnection();
        try {
            connection.setConnectTimeout(Config.HTTP_CONNECTION_TIMEOUT_TIME);
            connection.setReadTimeout(Config.HTTP_READ_TIMEOUT_TIME);
            connection.setInstanceFollowRedirects(false);
            connection.setUseCaches(false);
            connection.setRequestMethod("GET");
            connection.connect();

            final int responseCode = connection.getResponseCode();
            if (responseCode < 300 || responseCode >= 400) {
                // Not a redirect: this hop is the final destination.
                return finalUrl;
            }

            final String location = connection.getHeaderField("Location");
            if (location == null) {
                // A 3xx response without a target cannot be followed any further.
                return finalUrl;
            }

            // The Location header may be relative; resolve it against the URL
            // that produced it so the next hop is always an absolute URL.
            finalUrl = new URL(current, location).toString();
            redirectCount++;
            if (redirectCount > Config.MAX_REDIRECT_COUNT) {
                throw new java.net.ProtocolException("Server redirected too many times ("+Config.MAX_REDIRECT_COUNT+")");
            }
        } finally {
            // Release this hop's connection on every path, including exceptions,
            // so intermediate connections are not left open.
            connection.disconnect();
        }
    }
}
这是我抓取网页内容的方式：
/**
 * Downloads the resource at {@code url} and wraps its bytes in an
 * {@link HTMLDocument}.
 *
 * <p>If the response declares a content type that does not contain
 * {@code text/html}, the body is not read and an empty document is returned.
 * The charset is taken from the Content-Type header when {@code PAT_CHARSET}
 * matches it and the name is supported; otherwise Cp1252 is used. A
 * gzip-encoded body is decompressed; any other Content-Encoding is reported
 * on stderr and the bytes are kept as received.
 *
 * @param url the page to download
 * @return the downloaded document, or an empty document for non-HTML content
 * @throws IOException if the request or the read fails
 */
private HTMLDocument fetch(URL url) throws IOException {
    final HttpURLConnection httpcon = (HttpURLConnection) url.openConnection();
    try {
        // setFollowRedirects is static and changes the JVM-wide default; the
        // per-connection setter is the one intended here.
        httpcon.setInstanceFollowRedirects(true);
        httpcon.setConnectTimeout(Config.HTTP_CONNECTION_TIMEOUT_TIME);
        httpcon.setReadTimeout(Config.HTTP_READ_TIMEOUT_TIME);
        httpcon.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2) Gecko/20100101 Firefox/10.0.2");
        // Ask for a compressed body; the gzip branch below already decodes it,
        // but without this header servers have no reason to compress.
        httpcon.addRequestProperty("Accept-Encoding", "gzip");

        final String ct = httpcon.getContentType();
        Charset cs = Charset.forName("Cp1252");
        if (ct != null) {
            if (!ct.contains("text/html")) {
                System.err.println("Content type is:"+ct);
                // Body is intentionally left unread; the finally block drops the connection.
                return new HTMLDocument("");
            }
            final Matcher m = PAT_CHARSET.matcher(ct);
            if (m.find()) {
                // presumably group(1) is the charset name captured by PAT_CHARSET — verify against its definition
                final String charset = m.group(1);
                try {
                    cs = Charset.forName(charset);
                } catch (UnsupportedCharsetException | IllegalCharsetNameException ignored) {
                    // Unknown or malformed charset name: keep the Cp1252 default.
                }
            }
        }

        final ByteArrayOutputStream bos = new ByteArrayOutputStream();
        InputStream in = httpcon.getInputStream();
        try {
            final String encoding = httpcon.getContentEncoding();
            if (encoding != null) {
                if ("gzip".equalsIgnoreCase(encoding)) {
                    in = new GZIPInputStream(in);
                } else {
                    System.err.println("WARN: unsupported Content-Encoding: "+encoding);
                }
            }
            final byte[] buf = new byte[4096];
            int r;
            while ((r = in.read(buf)) != -1) {
                bos.write(buf, 0, r);
            }
        } finally {
            // Close on every path so a failed read does not leak the stream.
            in.close();
        }
        return new HTMLDocument(bos.toByteArray(), cs);
    } finally {
        httpcon.disconnect();
    }
}
然后使用 Boilerpipe 提取正文：
// Download the page, then pass the document to ArticleExtractor to get its text.
final URL pageUrl = new URL(url);
HTMLDocument htmlDoc = fetch(pageUrl);
String body = ArticleExtractor.INSTANCE
        .getText(htmlDoc.toInputSource());
如何减少下载的数据量?