我使用以下代码(在 C# 中)将 .docx 单词文件(xml 内容)转换为文本:
private string ReadNode(XmlNode node)
{
if (node == null || node.NodeType != XmlNodeType.Element)
return string.Empty;
StringBuilder sb = new StringBuilder();
foreach (XmlNode child in node.ChildNodes)
{
if (child.NodeType != XmlNodeType.Element) continue;
switch (child.LocalName)
{
case "t": // Text
sb.Append(child.InnerText.TrimEnd());
string space = ((XmlElement)child).GetAttribute("xml:space");
if (!string.IsNullOrEmpty(space) && space == "preserve")
sb.Append(' ');
break;
case "tab":// Tab
sb.Append("\t");
break;
case "p":// Paragraph
if (ReadNode(child).Trim() != "")
{
sb.Append(ReadNode(child));
sb.Append(Environment.NewLine);
}
break;
default:
sb.Append(ReadNode(child));
break;
}
}
return sb.ToString();
}
如何在我的代码中读取页面内容的“行号”(类似读取“p”或“tab”)?
请查看图片文件(http://i.stack.imgur.com/OVx3O.jpg):