我需要从 XPS 文档中提取指定页面的文本,并将提取的文本写入一个字符串,以便使用 Microsoft 的 SpeechLib 朗读这些文本。请仅提供 C# 示例。
谢谢
我需要从 XPS 文档中提取指定页面的文本,并将提取的文本写入一个字符串,以便使用 Microsoft 的 SpeechLib 朗读这些文本。请仅提供 C# 示例。
谢谢
添加对 ReachFramework 和 WindowsBase 的引用,
并添加以下 using 语句:
using System.Windows.Xps.Packaging;
然后使用此代码:
// Collects the text of one fixed page of an XPS document into _fullPageText.
// "/path" is the original placeholder for the XPS file path.
XpsDocument _xpsDocument = new XpsDocument("/path", System.IO.FileAccess.Read);
string _fullPageText;
try
{
    IXpsFixedDocumentSequenceReader fixedDocSeqReader = _xpsDocument.FixedDocumentSequenceReader;
    IXpsFixedDocumentReader _document = fixedDocSeqReader.FixedDocuments[0];

    // NOTE(review): FixedPages is indexed from 0; if MasterPageNumber is 1-based
    // (as the DocumentViewer documentation suggests) this reads the following page
    // and overruns on the last one — confirm whether "- 1" is needed here.
    IXpsFixedPageReader _page = _document.FixedPages[documentViewerElement.MasterPageNumber];

    StringBuilder _currentText = new StringBuilder();
    System.Xml.XmlReader _pageContentReader = _page.XmlReader;
    if (_pageContentReader != null)
    {
        while (_pageContentReader.Read())
        {
            // Text is carried by the UnicodeString attribute of Glyphs elements.
            if (_pageContentReader.Name != "Glyphs" || !_pageContentReader.HasAttributes)
            {
                continue;
            }

            string unicodeString = _pageContentReader.GetAttribute("UnicodeString");
            if (unicodeString != null)
            {
                _currentText.Append(unicodeString);
            }
        }
    }

    _fullPageText = _currentText.ToString();
}
finally
{
    // Release the package (and its file handle) even if reading throws.
    _xpsDocument.Close();
}
文本存储在 Glyphs 元素的 UnicodeString 字符串属性中。
您必须对固定页面使用 XmlReader 来读取它。
下面是返回所有页面文本的方法(在 Amir 的代码基础上修改,希望没问题):
/// <summary>
/// Get all text strings from an XPS file.
/// </summary>
/// <param name="xpsFilePath">Path of the XPS file, opened read-only.</param>
/// <returns>
/// A list of lists, one per fixed page in enumeration order, each holding the
/// <c>UnicodeString</c> attribute values of that page's <c>Glyphs</c> elements.
/// A page that exposes no XML reader contributes an empty list, so list positions
/// stay aligned with page positions. Returns <c>null</c> when the package has no
/// fixed document sequence reader.
/// </returns>
private static List<List<string>> ExtractTextFromXps(string xpsFilePath)
{
    const string UnicodeString = "UnicodeString";
    const string GlyphsString = "Glyphs";

    var xpsDocument = new XpsDocument(xpsFilePath, FileAccess.Read);
    try
    {
        var fixedDocSeqReader = xpsDocument.FixedDocumentSequenceReader;
        if (fixedDocSeqReader == null)
        {
            return null;
        }

        var textLists = new List<List<string>>();
        foreach (IXpsFixedDocumentReader fixedDocumentReader in fixedDocSeqReader.FixedDocuments)
        {
            foreach (IXpsFixedPageReader pageReader in fixedDocumentReader.FixedPages)
            {
                var texts = new List<string>();
                var pageContentReader = pageReader.XmlReader;
                if (pageContentReader != null)
                {
                    while (pageContentReader.Read())
                    {
                        if (pageContentReader.Name != GlyphsString || !pageContentReader.HasAttributes)
                        {
                            continue;
                        }

                        string text = pageContentReader.GetAttribute(UnicodeString);
                        if (text != null)
                        {
                            texts.Add(text);
                        }
                    }
                }

                // Added even when empty so that index i always corresponds to page i.
                textLists.Add(texts);
            }
        }

        return textLists;
    }
    finally
    {
        // Runs on the early null return and on exceptions too, so the package
        // is never left open.
        xpsDocument.Close();
    }
}
用法:
// ExtractTextFromXps returns null when the package has no fixed document
// sequence reader; fall back to an empty list so the loop below is a no-op
// instead of throwing NullReferenceException.
var txtLists = ExtractTextFromXps(@"C:\myfile.xps") ?? new List<List<string>>();

// Print each page's text runs under a 1-based page heading.
int pageIdx = 0;
foreach (List<string> txtList in txtLists)
{
    pageIdx++;
    Console.WriteLine("== Page {0} ==", pageIdx);
    foreach (string txt in txtList)
    {
        Console.WriteLine(" "+txt);
    }
    Console.WriteLine();
}
/// <summary>
/// Reads the text of every fixed page in an XPS file and returns it as one string.
/// </summary>
/// <param name="fileName">Path of the XPS file, opened read-only.</param>
/// <returns>
/// The <c>UnicodeString</c> attribute values of all <c>Glyphs</c> elements,
/// concatenated in enumeration order with no separator; an empty string when the
/// package has no fixed document sequence reader.
/// </returns>
private string ReadXpsFile(string fileName)
{
    XpsDocument _xpsDocument = new XpsDocument(fileName, System.IO.FileAccess.Read);
    try
    {
        IXpsFixedDocumentSequenceReader fixedDocSeqReader = _xpsDocument.FixedDocumentSequenceReader;
        if (fixedDocSeqReader == null)
        {
            return "";
        }

        StringBuilder _fullPageText = new StringBuilder();

        // Walk the page readers directly rather than bounding the loop with the
        // sequence-wide DocumentPaginator.PageCount: that count covers every document
        // in the sequence, so using it to index FixedDocuments[0].FixedPages goes out
        // of range as soon as the package holds more than one fixed document.
        foreach (IXpsFixedDocumentReader _document in fixedDocSeqReader.FixedDocuments)
        {
            foreach (IXpsFixedPageReader _page in _document.FixedPages)
            {
                System.Xml.XmlReader _pageContentReader = _page.XmlReader;
                if (_pageContentReader == null)
                {
                    continue;
                }

                while (_pageContentReader.Read())
                {
                    if (_pageContentReader.Name != "Glyphs" || !_pageContentReader.HasAttributes)
                    {
                        continue;
                    }

                    string unicodeString = _pageContentReader.GetAttribute("UnicodeString");
                    if (unicodeString != null)
                    {
                        _fullPageText.Append(unicodeString);
                    }
                }
            }
        }

        return _fullPageText.ToString();
    }
    finally
    {
        // Release the package (and its file handle) on every exit path.
        _xpsDocument.Close();
    }
}
类的完整代码:
using System.Collections.Generic;
using System.Drawing;
using System.Windows.Forms;
using System.Windows.Xps.Packaging;
namespace XPS_Data_Transfer
{
    /// <summary>
    /// Helper for pulling the text runs out of an XPS file.
    /// </summary>
    internal static class XpsDataReader
    {
        /// <summary>
        /// Reads the text strings of one fixed page from an XPS file.
        /// </summary>
        /// <param name="address">Path of the XPS file, opened read-only.</param>
        /// <param name="pageNumber">
        /// 1-based selector; entry <c>pageNumber - 1</c> of the package's fixed
        /// documents is chosen and its first fixed page is read.
        /// </param>
        /// <returns>
        /// The <c>UnicodeString</c> attribute value of each <c>Glyphs</c> element on the
        /// selected page, each passed through <c>Dashboard.CleanReversedPersianText</c>;
        /// <c>null</c> when the package has no fixed document sequence reader or the
        /// page exposes no XML reader.
        /// </returns>
        public static List<string> ReadXps(string address, int pageNumber)
        {
            const string uniStr = "UnicodeString";
            const string glyphs = "Glyphs";

            var xpsDocument = new XpsDocument(address, System.IO.FileAccess.Read);
            try
            {
                var fixedDocSeqReader = xpsDocument.FixedDocumentSequenceReader;
                if (fixedDocSeqReader == null)
                {
                    return null;
                }

                // NOTE(review): despite its name, pageNumber indexes FixedDocuments and
                // the page index is fixed at 0. That only selects "page N" for packages
                // that store one page per fixed document — confirm against the files
                // this is used with before changing it.
                var document = fixedDocSeqReader.FixedDocuments[pageNumber - 1];
                var page = document.FixedPages[0];

                var pageContentReader = page.XmlReader;
                if (pageContentReader == null)
                {
                    return null;
                }

                var currentText = new List<string>();
                while (pageContentReader.Read())
                {
                    if (pageContentReader.Name != glyphs || !pageContentReader.HasAttributes)
                    {
                        continue;
                    }

                    string text = pageContentReader.GetAttribute(uniStr);
                    if (text != null)
                    {
                        currentText.Add(Dashboard.CleanReversedPersianText(text));
                    }
                }

                return currentText;
            }
            finally
            {
                // Release the package on every exit path, including the null
                // returns above and any exception from indexing or reading.
                xpsDocument.Close();
            }
        }
    }
}
返回指定文件中指定页面的字符串数据列表。