下面是一个函数,它使用 TessNet2(OCR 框架)扫描由 TessNet2 内置的 OCR 函数捕获的单词列表。由于我扫描的页面质量不够完美,因此单词的检测不是 100% 准确的。
因此,有时它会将“S”与“5”或“l”与“1”混淆。此外,它不考虑大小写。所以我必须搜索这两种情况。
它的工作方式是:我在页面上搜索某些彼此相邻的单词。第一组词 [i] 是“Abstracting Service Ordered”。如果页面包含彼此相邻的这些单词,则继续检查下一组单词 [j],然后是再下一组 [h]。如果页面包含全部 3 组单词,则返回 true。
这是我想到的最好的方法,但我希望这里有人可以给我另一种尝试的方法。
/// <summary>
/// Minimum <c>Word.Confidence</c> a word must report to take part in a phrase match.
/// NOTE(review): the comparison direction (<c>&gt;=</c>) is kept from the original code;
/// confirm against the tessnet2 documentation whether higher or lower values mean a
/// better recognition.
/// </summary>
private const int AbstractingMinConfidence = 50;

// Each phrase is a sequence of word slots; each slot lists the accepted spellings,
// including the OCR misreadings the original checks enumerated. Case is ignored when
// comparing, so only one casing per spelling is listed.
private static readonly string[][] AbstractingOrderedPhrase = new string[][]
{
    new string[] { "Abstracting", "Abstractmg" },
    new string[] { "Service", "5ervice" },
    new string[] { "Ordered" }
};

private static readonly string[][] AbstractingDueDatePhrase = new string[][]
{
    new string[] { "Due", "Oue" },
    new string[] { "Date", "Oate" },
    new string[] { "&" }
};

private static readonly string[][] AbstractingCommentsPhrase = new string[][]
{
    new string[] { "Additional" },
    new string[] { "Comments" },
    new string[] { "About" },
    new string[] { "This" }
};

/// <summary>
/// Decides whether the recognised words contain all three marker phrases:
/// "Abstracting Service Ordered", "Due Date &amp;" and "Additional Comments About This".
/// Each phrase must appear as consecutive words; the phrases themselves may appear
/// anywhere in the list and in any order relative to one another.
/// </summary>
/// <remarks>
/// Differences from the previous implementation:
/// the confidence threshold now applies to every accepted spelling (operator precedence
/// previously applied it only to the last alternative of each word); a phrase starting
/// too close to the end of the list no longer throws; and spellings are compared
/// case-insensitively rather than by listing two casings by hand.
/// </remarks>
/// <param name="wordList">Words produced by the OCR pass, in reading order.</param>
/// <returns><c>true</c> when all three phrases are found; otherwise <c>false</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="wordList"/> is null.</exception>
public Boolean isPageABSTRACTING(List<tessnet2.Word> wordList)
{
    if (wordList == null)
    {
        throw new ArgumentNullException("wordList");
    }

    // The three searches are independent, so they run one after another instead of
    // being nested; && short-circuits as soon as one phrase is missing.
    return ContainsAbstractingPhrase(wordList, AbstractingOrderedPhrase)
        && ContainsAbstractingPhrase(wordList, AbstractingDueDatePhrase)
        && ContainsAbstractingPhrase(wordList, AbstractingCommentsPhrase);
}

/// <summary>Returns true when <paramref name="phrase"/> occurs as consecutive words anywhere in <paramref name="words"/>.</summary>
private static bool ContainsAbstractingPhrase(List<tessnet2.Word> words, string[][] phrase)
{
    // Stop early enough that every slot of the phrase still has a word to read.
    int lastStart = words.Count - phrase.Length;
    for (int start = 0; start <= lastStart; start++)
    {
        if (MatchesAbstractingPhraseAt(words, start, phrase))
        {
            return true;
        }
    }

    return false;
}

/// <summary>Returns true when every slot of <paramref name="phrase"/> matches the word at the corresponding offset from <paramref name="start"/>.</summary>
private static bool MatchesAbstractingPhraseAt(List<tessnet2.Word> words, int start, string[][] phrase)
{
    for (int offset = 0; offset < phrase.Length; offset++)
    {
        if (!IsConfidentAbstractingMatch(words[start + offset], phrase[offset]))
        {
            return false;
        }
    }

    return true;
}

/// <summary>Returns true when <paramref name="word"/> meets the confidence threshold and its text equals one of <paramref name="spellings"/>, ignoring case.</summary>
private static bool IsConfidentAbstractingMatch(tessnet2.Word word, string[] spellings)
{
    if (word == null || word.Confidence < AbstractingMinConfidence)
    {
        return false;
    }

    foreach (string spelling in spellings)
    {
        // Static String.Equals tolerates a null Text.
        if (String.Equals(word.Text, spelling, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}