2

问题陈述:我无法使用 SAS 从 PDF 文件中读取数据。

可以正常工作的部分:我可以从网站上下载 PDF 并将其保存。

无法工作的部分(需要帮助):我无法使用 SAS 从 PDF 文件中读取数据。源文件的内容结构预计始终保持不变。预期输出已作为 jpg 图像附上。

如果有人知道如何使用 SAS 程序解决这个问题并能提供帮助,我将不胜感激,这对我也是一次很好的学习机会。下图是 PDF 格式的源文件,预期结果是内容相同的 SAS 数据集:

我试过这样的事情:

/*Proxy address*/
/* Macro variables consumed by the PROXYHOST= and PROXYPORT= options of PROC HTTP below. */
%let proxy_host=xxx.com;
%let port=123;

/*Output location*/
/* Fileref OUTPUT: PROC HTTP writes the response to it and the DATA step below reads it back. */
filename output "/desktop/Response.pdf";

/*Download the source file and save it in the desired location*/
proc http           
url="https://cdn.nar.realtor/sites/default/files/documents/ehs-10-2020-overview-2020-11-19_0.pdf"       
method="get"        
proxyhost="&proxy_host."        
proxyport=&port         
out=output;     
run;

/* Value used both as the INFILE LRECL= setting and as the width of text_line. */
%let lineSize = 2000;

/* Reads the downloaded file through the OUTPUT fileref as if it were plain text. */
/* NOTE(review): a PDF is not a plain-text file and its page content is normally */
/* stored compressed, so this step is not expected to yield the visible table text - confirm. */
data base;
   format text_line $&lineSize..;
   infile output lrecl=&lineSize;
   /* NOTE(review): list input stops at the first delimiter, so only the first token of each record is kept - confirm. */
   input text_line $;
run;

/* Issues an operating-system command through the X statement. */
/* NOTE(review): the quoted command spans two source lines, and the PS2ASCII utility */
/* must be installed on the host running SAS - verify both. */
DATA _NULL_ ;
X "PS2ASCII /desktop/Response.pdf
/desktop/flatfile.txt";
RUN;
4

1 回答 1

2

您可以使用 Apache PDFBox® 库,这是一个用于处理 PDF 文档的开源 Java 工具。可以在 SAS 的 Proc GROOVY 中通过 Java 代码调用该库,从 PDF 文档中提取文本及其在页面上的位置。

例子:

您还需要编写更多代码,才能根据提取出的文本创建数据集。

/*
 * Filerefs used by the rest of the program:
 *   overview - the PDF document that is downloaded below
 *   ov_text  - the text file that the Groovy step writes
 *   jar      - the Apache PDFBox application jar
 */
filename overview "overview.pdf";
filename ov_text  "overview.txt";
filename jar      'pdfbox.jar';

/*
 * Fetch the PDF document into the OVERVIEW fileref.
 * Behind a proxy, add the PROXYHOST= and PROXYPORT= options to this statement.
 */
proc http
  method = "get"
  url    = "https://cdn.nar.realtor/sites/default/files/documents/ehs-10-2020-overview-2020-11-19_0.pdf"
  out    = overview;
run;

/*
 * Fetch the Apache PDFBox library (a .jar file), but only when the JAR
 * fileref does not already point at an existing file.
 */
%if %sysfunc(fexist(jar)) ne 1 %then %do;
  proc http
    url = 'https://www.apache.org/dyn/closer.lua?filename=pdfbox/2.0.21/pdfbox-app-2.0.21.jar&action=download'
    out = jar;
  run;
%end;

/*
 * Use GROOVY to read the PDF, extract the text and its position on the page,
 * and write the result to a text file which SAS can read.
 *
 * The classpath is resolved from the JAR fileref with PATHNAME, the same way
 * the two file arguments are, so all three paths handed to the JVM are the
 * physical paths SAS itself uses rather than a relative literal.
 */

proc groovy classpath="%sysfunc(pathname(jar))"; 
  submit 
    "%sysfunc(pathname(overview))"  /* args[0]: the input, a pdf file */
    "%sysfunc(pathname(ov_text))"   /* args[1]: the output, a text file */
  ;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.io.FileWriter;
import java.io.PrintWriter;

/**
 * Text stripper that captures every text run PDFBox reports for a PDF, together
 * with the page coordinates of each character in the run, and writes the
 * captured lines to a text file.
 *
 * <p>The syntax is deliberately limited to constructs that older Groovy parsers
 * accept (no try-with-resources, no lambdas) because the class is submitted
 * through PROC GROOVY.
 */
public class GetLinesFromPDF extends PDFTextStripper {

    /** Lines captured by {@link #writeString}; static so that {@code main} can read them back. */
    static List<String> lines = new ArrayList<String>();

    /**
     * @throws IOException if the underlying {@code PDFTextStripper} cannot be initialised.
     */
    public GetLinesFromPDF() throws IOException {
    }

    /**
     * Extracts the text of a PDF and writes one line per text run to a text file.
     *
     * @param args {@code args[0]} is the path of the PDF to read,
     *             {@code args[1]} is the path of the text file to write.
     * @throws IllegalArgumentException if fewer than two arguments are supplied.
     * @throws IOException If there is an error parsing the document or writing the output.
     */
    public static void main( String[] args ) throws IOException {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException(
                "Expected two arguments: the input PDF path and the output text path");
        }
        String inPdf = args[0];
        String outTxt = args[1];

        // The list is static: drop anything left over from an earlier run of
        // main on the same loaded class so old lines never leak into this output.
        lines.clear();

        PDDocument document = null;
        PrintWriter out = null;

        try {
            document = PDDocument.load( new File(inPdf) );

            PDFTextStripper stripper = new GetLinesFromPDF();

            stripper.setSortByPosition( true );
            // PDFBox page numbers are 1-based; 1 is the first page.
            stripper.setStartPage( 1 );
            stripper.setEndPage( document.getNumberOfPages() );

            // writeText needs a Writer, but the useful output is captured by
            // writeString below, so whatever reaches this writer is discarded.
            Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
            stripper.writeText(document, dummy);

            out = new PrintWriter(new FileWriter(outTxt));

            // print lines to text file
            for (String line : lines) {
                out.println(line);
            }

            // PrintWriter never throws on write failures; surface them explicitly.
            if (out.checkError()) {
                throw new IOException("Failed to write extracted text to " + outTxt);
            }
        }
        finally {
            // Nested so the output file is closed even if closing the document throws.
            try {
                if( document != null ) {
                    document.close();
                }
            }
            finally {
                if( out != null ) {
                    out.close();
                }
            }
        }
    }

    /**
     * Override the default functionality of PDFTextStripper.writeString():
     * instead of writing to the output stream, record the text followed by the
     * (x,y) position of each of its characters.
     *
     * @param str the text run reported by the stripper.
     * @param textPositions the position of each character in {@code str}.
     * @throws IOException declared by the overridden method; not thrown here.
     */
    @Override
    protected void writeString(String str, List<TextPosition> textPositions) throws IOException {
        StringBuilder places = new StringBuilder();

        for (TextPosition tp : textPositions) {
            places.append("(").append(tp.getX()).append(",").append(tp.getY()).append(") ");
        }

        lines.add(str + " found @ " + places);
    }
}

  endsubmit;
quit;  /* ends the PROC GROOVY step */

* preview the stripped text that was saved;
/* Reads the file behind the OV_TEXT fileref record by record and echoes each */
/* record to the SAS log; DATA _NULL_ means no data set is created. */

data _null_;
  infile ov_text;
  input;             /* load the next record into the input buffer; no variables are assigned */
  putlog _infile_;   /* write the buffered record to the log */
run;

/*
 * additional SAS code will be needed to input the text as data 
 * and construct a data set that matches the original tabular content layout
 */
于 2020-12-11T06:08:14.227 回答