I am using iTextSharp to extract text from PDF documents, but some text files that are encoding ISO-8859-1 are not displayed correctly.
Below is my code, if anyone can help me I will be grateful.
public string ReadPdfFile(string fileName)
{
StringBuilder text = new StringBuilder();
PdfReader pdfReader = null;
try
{
if (File.Exists(fileName))
{
pdfReader = new PdfReader(fileName);
Encoding encoding = Encoding.GetEncoding("iso8859-2");
for (int page = 1; page <= pdfReader.NumberOfPages; page++)
{
string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, page, new LocationTextExtractionStrategy());
currentText = encoding.GetString(ASCIIEncoding.Convert(Encoding.UTF8, encoding, Encoding.Default.GetBytes(currentText)));
text.Append(currentText);
}
pdfReader.Close();
}
return text.ToString();
}
catch
{
return string.Empty;
}
finally
{
if (pdfReader != null) pdfReader.Close();
}
}