-1

我需要处理这里捕获的异常:把引发异常的 PDF 移动到我在代码中指定的“失败文件夹”里。

package extractInfoFromPDF;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.itextpdf.text.exceptions.InvalidPdfException;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;

/**
 * Extracts a DOI from the first page of every matching PDF in a folder and
 * writes one result line per PDF to a text file.
 *
 * <p>PDFs whose first page cannot be parsed by iText (invalid PDF, or an index
 * error raised inside the text extractor) are moved to a separate "failed"
 * folder and skipped, so one broken file does not abort the whole run.
 */
public class Test {
    /** Writer for the result file; kept as a static field for backward compatibility. */
    static FileWriter output = null;

    /** Text file that receives one line per successfully processed PDF. */
    private static final String OUTPUT_PATH = "c:/write.txt";

    /** Folder that is scanned for PDFs. */
    private static final String ARTICLE_DIR = "D:\\AI\\failed1";

    /** Folder that receives the PDFs iText could not parse. */
    private static final String FAILED_DIR = "D:\\AI\\fail";

    /** Only files whose name contains this fragment are processed. */
    private static final String NAME_FILTER = "article.fulltext.000001";

    /** Placeholder written when no DOI is found (spelling kept: it is part of the output format). */
    private static final String DOI_NOT_AVAILABLE = "DOI-NOT-AVALIABLE";

    /** Matches a DOI such as {@code 10.1234/abc.def}; compiled once instead of per run. */
    private static final Pattern DOI_PATTERN = Pattern.compile(
            "\\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\\'<>])\\S)+)\\b", Pattern.CASE_INSENSITIVE);

    /** Matches a "DOI:" label and the rest of its line; compiled once instead of per PDF. */
    private static final Pattern DOI_LABEL_PATTERN = Pattern.compile("(DOI:).*", Pattern.CASE_INSENSITIVE);

    /**
     * Scans the article folder, extracts a DOI per PDF and writes the results.
     *
     * @param args unused
     * @throws IOException if the result file cannot be written, the article folder
     *                     cannot be listed, or a file cannot be read/moved
     */
    public static void main(String[] args) throws IOException {
        File failedFolder = new File(FAILED_DIR);

        // try-with-resources guarantees the result file is closed even when a PDF aborts the run.
        try (FileWriter writer = new FileWriter(new File(OUTPUT_PATH))) {
            output = writer;

            // listFiles() returns null when the path is not a readable directory.
            File[] listOfArticles = new File(ARTICLE_DIR).listFiles();
            if (listOfArticles == null) {
                throw new IOException("Cannot list PDF folder: " + ARTICLE_DIR);
            }

            int count = 0;
            for (File article : listOfArticles) {
                if (!article.getName().contains(NAME_FILTER)) {
                    continue;
                }

                String text = readFirstPage(article);
                if (text == null) {
                    // Unparseable PDF: move it away and skip DOI extraction entirely,
                    // otherwise the text of the previous PDF (or null) would be used.
                    moveToFailedFolder(article, failedFolder);
                    continue;
                }

                String doi = extractDoi(text);
                count = count + 1;

                System.out.println(count + "    TAN: " + article.getName() + "      "
                        + doi);
                writer.write(count + "  TAN: " + article.getName() + "      " + doi + "\n");
            }
        }
    }

    /**
     * Reads the text of page 1 of the given PDF.
     *
     * @param article the PDF file to read
     * @return the extracted text, or {@code null} if iText rejected the file as an
     *         invalid PDF or failed with an index error while extracting text
     * @throws IOException on any other I/O failure while opening the file
     */
    private static String readFirstPage(File article) throws IOException {
        PdfReader pdfArticle = null;
        try {
            // The constructor is inside the try because it is the call that
            // throws InvalidPdfException for corrupt files.
            pdfArticle = new PdfReader(article.getAbsolutePath());
            return PdfTextExtractor.getTextFromPage(pdfArticle, 1);
        } catch (InvalidPdfException | StringIndexOutOfBoundsException | ArrayIndexOutOfBoundsException e) {
            System.err.println("Cannot extract text from " + article.getName() + ": " + e);
            return null;
        } finally {
            // Release the reader before the caller tries to delete the file.
            if (pdfArticle != null) {
                pdfArticle.close();
            }
        }
    }

    /**
     * Moves a PDF into the failed folder by copying it there and deleting the original.
     *
     * @param article      the PDF that could not be processed
     * @param failedFolder destination folder; created if it does not exist
     * @throws IOException if the folder cannot be created or the copy fails
     */
    private static void moveToFailedFolder(File article, File failedFolder) throws IOException {
        if (!failedFolder.isDirectory() && !failedFolder.mkdirs()) {
            throw new IOException("Cannot create failed folder: " + failedFolder);
        }
        copyFile(article, new File(failedFolder, article.getName()));
        if (!delete(article)) {
            System.err.println("Copied but could not delete " + article.getAbsolutePath());
        }
    }

    /**
     * Finds a DOI in the given page text.
     *
     * <p>First looks for a DOI anywhere in the text. If none is found, takes the
     * first "DOI:" line, removes all whitespace from it (the DOI may have been
     * split by spaces) and searches that again.
     *
     * @param text text of the first PDF page
     * @return the DOI, or {@code "DOI-NOT-AVALIABLE"} when none is found
     */
    private static String extractDoi(String text) {
        String doi = null;

        Matcher direct = DOI_PATTERN.matcher(text);
        if (direct.find()) {
            doi = direct.group();
        }

        if (doi == null) {
            doi = DOI_NOT_AVAILABLE;
            Matcher labelled = DOI_LABEL_PATTERN.matcher(text);
            if (labelled.find()) {
                String compact = labelled.group().replaceAll("\\s+", "");
                Matcher retry = DOI_PATTERN.matcher(compact);
                if (retry.find()) {
                    doi = retry.group();
                }
            }
        }

        // Keep the last space-separated token that looks like a DOI.
        for (String token : doi.split(" ")) {
            if (token.contains("10.")) {
                doi = token;
            }
        }

        // NOTE(review): this regex only matches the literal sequence
        // "DOI:doi:<whitespace><punctuation>]", so in practice it removes nothing.
        // An alternation was probably intended; left unchanged to keep the output stable.
        return doi.replaceAll("(DOI:)(doi:)(\\s+)([\\.,;)]])", "").trim();
    }

    /**
     * Copies the contents of one file to another, overwriting the destination.
     *
     * @param source file to read
     * @param dest   file to write; if it is an existing directory, the copy is
     *               written inside it under the source file's name
     * @throws IOException if either file cannot be opened or the copy fails
     */
    public static void copyFile(File source, File dest) throws IOException {
        File target = dest.isDirectory() ? new File(dest, source.getName()) : dest;

        // try-with-resources closes both streams and never hits a null stream,
        // unlike a manual finally block when opening one of them fails.
        try (InputStream in = new FileInputStream(source);
             OutputStream out = new FileOutputStream(target)) {
            byte[] buf = new byte[8192];
            int len;
            while ((len = in.read(buf)) != -1) {
                out.write(buf, 0, len);
            }
        }
    }

    /**
     * Deletes a file, or a directory together with everything inside it.
     *
     * @param resource file or directory to delete
     * @return {@code true} if {@code resource} itself was deleted
     * @throws IOException declared for backward compatibility with existing callers
     */
    public static boolean delete(File resource) throws IOException {
        if (resource.isDirectory()) {
            File[] childFiles = resource.listFiles();
            // listFiles() returns null when the directory cannot be read.
            if (childFiles != null) {
                for (File child : childFiles) {
                    delete(child);
                }
            }
        }
        return resource.delete();
    }
}

以上是我的完整代码,下面是抛出异常的那一行。

s = new StringBuffer(PdfTextExtractor.getTextFromPage(pdfArticle, 1));

在成千上万个 PDF 中,有几百个会在这一行抛出 String index out of range(-1,有时是 0)。我在 Google 上搜索过,没有找到解决方案。下面的异常来自 iText,而不是来自我的代码。对于某些 PDF,同一行代码(PdfTextExtractor.getTextFromPage)还会抛出 ArrayIndexOutOfBoundsException(有时是 397,有时是 286,或其他三位数字)。

java.lang.StringIndexOutOfBoundsException: String index out of range: -1
    at java.lang.String.charAt(String.java:695)
    at com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy.getResultantText(LocationTextExtractionStrategy.java:121)
    at com.itextpdf.text.pdf.parser.PdfTextExtractor.getTextFromPage(PdfTextExtractor.java:73)
    at com.itextpdf.text.pdf.parser.PdfTextExtractor.getTextFromPage(PdfTextExtractor.java:88)
    at extractInfoFromPDF.Test.main(Test.java:41)

对于另一个 PDF,我得到了下面的 ArrayIndexOutOfBoundsException:

Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: 397
    at com.itextpdf.text.pdf.CMapAwareDocumentFont.getWidth(CMapAwareDocumentFont.java:182)
    at com.itextpdf.text.pdf.parser.TextRenderInfo.getStringWidth(TextRenderInfo.java:210)
    at com.itextpdf.text.pdf.parser.TextRenderInfo.getUnscaledWidth(TextRenderInfo.java:113)
    at com.itextpdf.text.pdf.parser.TextRenderInfo.getUnscaledBaselineWithOffset(TextRenderInfo.java:147)
    at com.itextpdf.text.pdf.parser.TextRenderInfo.getBaseline(TextRenderInfo.java:122)
    at com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy.renderText(LocationTextExtractionStrategy.java:154)
    at com.itextpdf.text.pdf.parser.PdfContentStreamProcessor.displayPdfString(PdfContentStreamProcessor.java:303)
    at com.itextpdf.text.pdf.parser.PdfContentStreamProcessor.access$2500(PdfContentStreamProcessor.java:74)
    at com.itextpdf.text.pdf.parser.PdfContentStreamProcessor$ShowText.invoke(PdfContentStreamProcessor.java:496)
    at com.itextpdf.text.pdf.parser.PdfContentStreamProcessor.invokeOperator(PdfContentStreamProcessor.java:246)
    at com.itextpdf.text.pdf.parser.PdfContentStreamProcessor.processContent(PdfContentStreamProcessor.java:366)
    at com.itextpdf.text.pdf.parser.PdfReaderContentParser.processContent(PdfReaderContentParser.java:79)
    at com.itextpdf.text.pdf.parser.PdfTextExtractor.getTextFromPage(PdfTextExtractor.java:73)
    at com.itextpdf.text.pdf.parser.PdfTextExtractor.getTextFromPage(PdfTextExtractor.java:88)
    at extractInfoFromPDF.Test.main(Test.java:41)
4

1 回答 1

0

经过多次尝试,我找到了异常的原因:这是 iText 5.1 版 API 中的一个 bug。当我用最新的 5.2 版重新构建应用程序后,不再出现任何异常,一切正常 :)

于 2012-05-30T10:45:32.837 回答