我有来自富文本格式 (RTF) 文件的以下数据:
{\rtf1\ansi\deff3\adeflang1025\n{\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial ;}{\f3\froman\fprq2\fcharset128 Times New Roman;}{\f4\fswiss\fprq2\fcharset128 Arial;}{\f5\fnil\fprq2\fcharset128 Droid Sans Fallback;}{\f6\fnil\fprq2\ fcharset128 DejaVu Sans;}{\f7\fswiss\fprq0\fcharset128 DejaVu Sans;}}\n{\colortbl;\red0\green0\blue0;\red128\green128\blue128;}\n{\stylesheet{\s0\snext0 \nowidctlpar{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\cf0\kerning1\hich\af5\langfe2052\dbch\af6\afs24\lang1081\loch\f3\fs24\lang1033 默认;}\n{\s15\sbasedon0 \snext16\sb240\sa120\keepn\hich\af5\dbch\af6\afs28\loch\f4\fs28 标题;}\n{\s16\sbasedon0\snext16\sb0\sa120 正文;}\n{\s17\ sbasedon16\snext17\sb0\sa120\dbch\af7 列表;}\n{\s18\sbasedon0\snext18\sb120\sa120\noline\i\dbch\af7\afs24\ai\fs24 标题;}\n{\s19\sbasedon0\snext19\noline\dbch\af7 索引;}\ n}{\info{\creatim\yr2018\mo7\dy15\hr11\min52}{\revtim\yr0\mo0\dy0\hr0\min0}{\printim\yr0\mo0\dy0\hr0\min0}{\comment OpenOffice}{\vern4140}}\deftab709\n\n{\*\pgdsctbl\n{\pgdsc0\pgdscuse195\pgwsxn12240\pghsxn15840\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 默认;}}\n\formshade\paperh15840\paperw12240\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\sectunlocked1\pgndec\pgwsxn12240\pghsxn15840\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont \aftnstart1\aftnnrlc\n\pgndec\pard\plain \s0\nowidctlpar{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\cf0\kerning1\hich\af5\langfe2052\dbch\af6\afs24\lang1081\loch\f3 \fs24\lang1033{\rtlch \ltrch\loch\n我喜欢阅读。}\n\par }
按照 Rob Miller 的“Text Processing with Ruby”中的示例,我有以下 Parslet Parser:
require "parslet"
class Rtf < Parslet::Parser
rule(:space) { str(" ") }
rule(:hypen) { str("-") }
rule(:integer) { match["0-9"].repeat(1) }
rule(:newline) { str("\n") }
rule(:slash) { str("\\") }
rule(:letter_sequence) { match["a-z"].repeat }
rule(:special_chars) { match["\\\\{}"] }
rule(:unformatted_text) { ( special_chars.absent? >> any ).repeat(1).as(:text) }
rule(:control_word) { ( slash >>
letter_sequence.as(:word) >>
control_delimiter.maybe.as(:delimiter)
).as(:control_word)
}
rule(:control_delimiter) { space | ( hypen.maybe >> integer ) | str(";") }
rule(:group) {
(
str("{") >>
newline.maybe >>
content >>
newline.maybe >>
str("}")
)
}
rule(:content) {
(
unformatted_text | control_word | group
).repeat
}
rule(:header) {
( slash >> str("rtf") >> integer.maybe.as(:version) ).as(:rtf) >>
( slash >> letter_sequence.as(:charset) ) >>
( slash >> str("deff") >> integer.maybe ).maybe.as(:deff) >>
color_table.maybe.as(:color_table) >>
newline.maybe
}
rule(:color_table) {
newline.maybe >>
str("{") >>
( slash >> str("colortabl;") ) >>
color_definition.repeat(1).as(:colors) >>
str("}") >>
newline.maybe
}
rule(:color_definition) {
slash >> str("red") >> (intger.as(:int)).as(:red) >>
slash >> str("green") >> (intger.as(:int)).as(:green) >>
slash >> str("blue") >> (intger.as(:int)).as(:blue) >>
str(";")
}
rule(:file) {
str("{") >>
header.as(:header) >>
content.as(:document) >>
str("}") >>
newline.maybe
}
root :file
end
使用上述 Parslet 解析 rtf 文件会产生:
(byebug) 解析 {:header=>{:rtf=>{:version=>"1"@5}, :charset=>"ansi"@7, :deff=>"\deff3"@11, :color_table= >nil}, :document=>[{:control_word=>{:word=>"adeflang"@18, :delimiter=>"1025"@26}}, {:text=>"\n"@30}, {:text=>"\n"@374}, {:text=>"\n"@431}, {:control_word=>{:word=>"deftab"@1050, :delimiter=>"709"@ 1056}}, {:text=>"\n\n"@1059}, {:text=>"\n"@1191}, {:control_word=>{:word=>"formshade"@1193, :delimiter =>nil}}, {:control_word=>{:word=>"paperh"@1203, :delimiter=>"15840"@1209}}, {:control_word=>{:word=>"paperw"@1215, :delimiter=>"12240"@1221}}, {:control_word=>{:word=>"margl"@1227, :delimiter=>"1134"@1232}}, {:control_word=>{:word=>"margr"@1237, :delimiter=>"1134"@1242}}, {:control_word=> {:word=>"margt"@1247, :delimiter=>"1134"@1252}}, {:control_word=>{:word=>"margb"@1257, :delimiter=>"1134"@1262}} , {:control_word=>{:word=>"sectd"@1267, :delimiter=>nil}}, {:control_word=>{:word=>"sbknone"@1273, :delimiter=>nil}}, { :control_word=>{:word=>"sectunlocked"@1281, :delimiter=>"1"@1293}}, {:control_word=>{:word=>"pgndec"@1295, :delimiter=>nil}} , {:control_word=>{:word=>"pgwsxn"@1302, :delimiter=>"12240"@1308}}, {:control_word=>{:word=>"pghsxn"@1314, :delimiter=>"15840"@1320}}, {:control_word=>{:word=>"marglsxn"@1326, :delimiter=>"1134"@1334}}, {:control_word=>{:word=>"margrsxn"@1339 , :delimiter=>"1134"@1347}}, {:control_word=>{:word=>"margtsxn"@1352, :delimiter=>"1134"@1360}}, {:control_word=>{:word= >"margbsxn"@1365, :delimiter=>"1134"@1373}}, {:control_word=>{:word=>"ftnbj"@1378, :delimiter=>nil}}, {:control_word=>{: word=>"ftnstart"@1384, :delimiter=>"1"@1392}}, {:control_word=>{:word=>"ftnrstcont"@1394, :delimiter=>nil}}, {:control_word=> {:word=>"ftnnar"@1405, :delimiter=>nil}}, {:control_word=>{:word=>"aenddoc"@1412, :delimiter=>nil}}, {:control_word=>{:word=>"aftnrstcont"@1420, :delimiter=>nil}}, {:control_word=>{:word=>"aftnstart"@1432, :delimiter=>"1"@1441}}, {:control_word=> {:word=>"aftnnrlc"@1443, :delimiter=>nil}}, {:text=>"\n"@1451}, {:control_word=>{:word=>"pgndec"@1453, :delimiter =>nil}}, {:control_word=>{:word=>"pard"@1460, :delimiter=>nil}}, {:control_word=>{:word=>"plain"@1465, :delimiter=> " "@1470}}, {:control_word=>{:word=>"s"@1472, :delimiter=>"0"@1473}}, {:control_word=>{:word=>"nowidctlpar"@1475 , :delimiter=>nil}}, {:control_word=>{:word=>"cf"@1529, :delimiter=>"0"@1531}}, {:control_word=>{:word=>"kerning" @1533,:分隔符=>"1"@1540}}, {:control_word=>{:word=>"hich"@1542, :delimiter=>nil}}, {:control_word=>{:word=>"af"@1547, :delimiter =>"5"@1549}}, {:control_word=>{:word=>"langfe"@1551, :delimiter=>"2052"@1557}}, {:control_word=>{:word=>"dbch "@1562, :delimiter=>nil}}, {:control_word=>{:word=>"af"@1567, :delimiter=>"6"@1569}}, {:control_word=>{:word=> "afs"@1571, :delimiter=>"24"@1574}}, {:control_word=>{:word=>"lang"@1577, :delimiter=>"1081"@1581}}, {:control_word= >{:word=>"loch"@1586, :delimiter=>nil}}, {:control_word=>{:word=>"f"@1591, :delimiter=>"3"@1592}}, {: control_word=>{:word=>"fs"@1594, :delimiter=>"24"@1596}}, {:control_word=>{:word=>"lang"@1599, :delimiter=>"1033"@1603}}, {:text=>"\n"@1643 }, {:control_word=>{:word=>"par"@1645, :delimiter=>" "@1648}}]}
RTF 文件中的任何文本,即“我喜欢阅读。”,都没有被解析,我不知道为什么。任何指导将不胜感激。