我正在尝试从 PDF 中提取数据,并将其划分到若干类别中。目前我已经能够从 PDF 中提取数据,并根据字体大小将其归类。例如:假设有 3 个类别——国家、首都和城市,我能够把所有国家、首都和城市分别归入各自的类别。但是我无法建立它们之间的对应关系,即哪个首都、哪个城市属于哪个国家,或者哪个国家对应哪个城市和首都。*数据似乎是以随机顺序读出的;我怎样才能在不打乱顺序的情况下从下到上读取数据,以便把第一个词放入第一个类别、第二个词放入第二个类别,依此类推?*
或者,有人知道更有效的方法吗?这样我就可以把文本归入各自的类别,并建立它们之间的映射关系。
我正在使用 Java,这是我的代码:
/**
 * Command-line entry point that passes every page of a PDF through a
 * {@link SemTextExtractionStrategy}.
 */
public class readPdfText {
    /**
     * Opens the PDF and runs the extraction strategy over each page in order.
     * Failures are reported on standard error instead of being silently dropped.
     *
     * @param args optional; {@code args[0]}, when present, is the path of the
     *             PDF to read, otherwise the built-in default path is used
     */
    public static void main(String[] args) {
        // NOTE(review): this variable used to be declared but unused while a
        // differently spelled literal ("pdfile.pdf") was opened; a single path
        // is used now - confirm "pdffile.pdf" is the intended file name.
        String src = (args != null && args.length > 0) ? args[0] : "pdffile.pdf";

        PdfReader reader;
        try {
            reader = new PdfReader(src);
        } catch (IOException e) {
            // Without a reader there is nothing to process; stop here rather
            // than continuing into a NullPointerException.
            System.err.println("Unable to open PDF: " + src);
            e.printStackTrace();
            return;
        }

        try {
            // One strategy instance is shared by all pages, as before.
            SemTextExtractionStrategy smt = new SemTextExtractionStrategy();
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                PdfTextExtractor.getTextFromPage(reader, i, smt);
            }
        } catch (IOException e) {
            System.err.println("Failed while extracting text from: " + src);
            e.printStackTrace();
        } finally {
            // Release the underlying file handle on success and on failure.
            reader.close();
        }
    }
}
SemTextExtractionStrategy 类:
/**
 * Text extraction strategy that groups consecutive text chunks of the same
 * rendered height into a run and prints each finished run to standard output,
 * labelled according to the height range it falls in.
 *
 * <p>A run ends when a chunk with a different height arrives, or when
 * {@link #getResultantText()} is called.
 */
public class SemTextExtractionStrategy implements TextExtractionStrategy {
    /** Heights closer together than this are treated as the same font size. */
    private static final float SIZE_TOLERANCE = 0.01f;

    /** The most recent chunk handed to {@link #renderText}; null until then. */
    private String text;
    /** Text of the run currently being accumulated. */
    StringBuffer str = new StringBuffer();
    // The next three buffers are not used by this class; they are kept because
    // they are package-visible and other code may reference them.
    StringBuffer item = new StringBuffer();
    StringBuffer cat = new StringBuffer();
    StringBuffer desc = new StringBuffer();
    /** Height of the run currently being accumulated; 0 before the first chunk. */
    float temp = 0;

    @Override
    public void beginTextBlock() {
    }

    /**
     * Records the chunk's text, derives its height, and passes both to
     * {@link #compare(String, float)}.
     *
     * @param renderInfo the chunk being rendered
     */
    @Override
    public void renderText(TextRenderInfo renderInfo) {
        text = renderInfo.getText();
        Vector baselineStart = renderInfo.getBaseline().getStartPoint();
        Vector ascentEnd = renderInfo.getAscentLine().getEndPoint();
        // Height = ascent-line y minus baseline y (index 1 is the y coordinate
        // pair previously fed to a rectangle purely to obtain this difference).
        float curFontSize = ascentEnd.get(1) - baselineStart.get(1);
        compare(text, curFontSize);
    }

    /**
     * Appends the chunk to the current run when its size matches the run's
     * size; otherwise prints the finished run and starts a new one.
     *
     * @param text2       text of the chunk
     * @param curFontSize height computed for the chunk
     */
    public void compare(String text2, float curFontSize) {
        // Heights come from float subtraction of page coordinates, so compare
        // with a tolerance rather than exact equality.
        if (Math.abs(temp - curFontSize) < SIZE_TOLERANCE) {
            str.append(text2);
            return;
        }
        flush();
        temp = curFontSize;
        str.append(text2);
    }

    /** Prints the buffered run with its label and empties the buffer; no-op when empty. */
    private void flush() {
        if (str.length() == 0) {
            // Nothing buffered (e.g. before the first chunk): print nothing.
            return;
        }
        String run = str.toString();
        String label = labelFor(temp);
        if (label != null) {
            System.out.println(label + ": " + run);
        } else {
            System.out.println("size: " + temp + " " + "str: " + run);
        }
        str.setLength(0);
    }

    /** Returns the output label for a height, or null when it is in no known range. */
    private static String labelFor(float size) {
        if (size > 9.8 && size < 10) {
            return "Contry";
        }
        if (size > 8 && size < 9) {
            return "itemPrice";
        }
        if (size > 7 && size < 7.2) {
            return "captial";
        }
        if (size > 7.2 && size < 8) {
            return "city";
        }
        return null;
    }

    @Override
    public void endTextBlock() {
    }

    @Override
    public void renderImage(ImageRenderInfo renderInfo) {
    }

    /**
     * Prints any run still buffered, so the last run is not lost, and returns
     * the most recent chunk.
     *
     * <p>NOTE(review): presumably the extractor calls this once a page has
     * been processed - confirm against the iText version in use. A run that
     * continues onto the next page would then be printed in two parts.
     *
     * @return the text of the last chunk rendered, or null if none was rendered
     */
    @Override
    public String getResultantText() {
        flush();
        return text;
    }
}