1

使用 Java,我需要读取一个 WARC 归档文件,根据 HTML 页面的内容对其中的记录进行过滤,并写出一个新的归档文件。

以下代码可以读取归档文件。如何从一个 org.archive.io.ArchiveRecord 重建一个 org.archive.io.warc.WARCRecordInfo?

import org.apache.commons.io.IOUtils;
import org.archive.io.ArchiveRecord;
import org.archive.io.warc.*;
import org.archive.wayback.resourcestore.resourcefile.WarcResource;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicInteger;

public class Test126b {
    public static void main() throws Exception {
        File out = new java.io.File("out.warc.gz");
        OutputStream bos = new BufferedOutputStream(new FileOutputStream(out));
        WARCWriterPoolSettings settings = ...
        WARCWriter writer = new WARCWriter(new AtomicInteger(), bos, out, settings);

        File in  = new java.io.File("in.warc.gz");
        WARCReader reader = WARCReaderFactory.get(in);
        Iterator<ArchiveRecord> it = reader.iterator();

        while (it.hasNext()) {
            ArchiveRecord archiveRecord = it.next();
            if (archiveRecord.getHeader().getHeaderValue("WARC-Type") == "response") {
                WARCRecord warcRecord = (WARCRecord) archiveRecord;
                WarcResource warcResource = new WarcResource(warcRecord, reader);
                warcResource.parseHeaders();
                String url = warcResource.getWarcHeaders().getUrl();
                System.out.println("+++ url: " + url);
                byte[] content = IOUtils.toByteArray(warcResource);

                String htmlPage = new String(content);
                if (htmlPage.contains("hello world")) {
                    writer.writeRecord(warcRecordInfo) // how to reconstruct the WARCRecordInfo
                }
            }
        }

        reader.close();
        writer.close();
    }
}
4

0 回答 0