我有一组 POT 文件,想在 Haskell 中读取它们,以便对翻译进行后处理。由于 POT 文件非常大,我想使用 attoparsec 来获得良好的性能。
我尝试过 hgettext,但我的目的并不是读取译文来翻译我的程序,而只是解析 POT 文件。基本上,我想得到一个由 (comment, msgid, msgstr) 元组构成的列表。
我有一组 POT 文件,想在 Haskell 中读取它们,以便对翻译进行后处理。由于 POT 文件非常大,我想使用 attoparsec 来获得良好的性能。
我尝试过 hgettext,但我的目的并不是读取译文来翻译我的程序,而只是解析 POT 文件。基本上,我想得到一个由 (comment, msgid, msgstr) 元组构成的列表。
假设要读取的 POT 文件存储在 mypot.pot 中,您可以使用下面的代码,并可直接用 runghc 运行它。我已经用一些可汗学院的 PO 文件成功测试过这段代码;它目前仅适用于简单文件(不支持复数形式等),但很容易扩展:
{-# LANGUAGE OverloadedStrings #-}
import Prelude hiding (takeWhile)
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 hiding (takeWhile, skipWhile)
import qualified Data.Text.Encoding as TE
import Data.Text(Text)
import Data.Word
import Data.ByteString (unsnoc, ByteString)
import qualified Data.ByteString.Char8 as B
import Control.Applicative
import Control.Monad
import Data.Maybe
import Data.Either
import qualified Data.Text as T
-- | One parsed entry of a PO/POT file, as assembled by 'poRecord'.
data PORecord = PORecord
  { poComment :: Text
    -- ^ The entry's comment lines, joined with newlines.
  , poMsgid :: Text
    -- ^ The @msgid@ line plus any continuation lines, joined with newlines.
  , poMsgstr :: Text
    -- ^ The @msgstr@ line plus any continuation lines, joined with newlines.
  }
  deriving (Eq, Show)
-- | Consume every byte up to, but not including, the next end-of-line byte.
-- The line terminator itself is left in the input for the caller to consume.
takeTillEOL :: Parser ByteString
takeTillEOL = takeWhile notLineEnd
  where
    notLineEnd byte = not (isEndOfLine byte)
-- | Parse one @key "text"@ line and return the text between the quotes.
--
-- The line must start with @key@, followed by optional whitespace and an
-- opening double quote; the rest of the line is read up to its terminator.
-- Spaces and tabs after the closing quote are ignored, and the closing quote
-- is required: blindly dropping the last byte would corrupt lines that carry
-- trailing whitespace or that lack the closing quote.
--
-- The payload is decoded as UTF-8. Invalid byte sequences are reported as a
-- parse failure instead of an exception thrown from pure code.
parseMsgidMsgstrLine :: ByteString -> Parser Text
parseMsgidMsgstrLine key = do
  void (string key) <?> "Line key"
  skipSpace
  void (char '"') <?> "Opening Qutotation mark"
  rest <- takeTillEOL
  endOfLine <?> "EOL"
  payload <-
    maybe (fail "Missing closing quotation mark") pure (closeQuote rest)
  either (fail . show) pure (TE.decodeUtf8' payload)
  where
    -- Drop trailing spaces/tabs, then require and remove the closing quote.
    -- 34 is the byte value of the double-quote character.
    closeQuote bytes = case unsnoc bytes of
      Just (front, lastByte)
        | isHorizontalSpace lastByte -> closeQuote front
        | lastByte == 34 -> Just front
      _ -> Nothing
-- | Parse a @msgid "..."@ line and return the quoted text.
msgidLine :: Parser Text
msgidLine = parseMsgidMsgstrLine "msgid"

-- | Parse a @msgstr "..."@ line and return the quoted text.
msgstrLine :: Parser Text
msgstrLine = parseMsgidMsgstrLine "msgstr"
-- | Parse a continuation line of the form @"text"@ and return the text
-- between the quotes.
--
-- The closing quote is stripped (after ignoring trailing spaces and tabs) so
-- that continuation lines are handled consistently with the primary
-- @msgid@/@msgstr@ lines; a line without a closing quote is a parse failure.
-- Escape sequences inside the quotes are returned as written, not
-- interpreted.
--
-- The payload is decoded as UTF-8. Invalid byte sequences are reported as a
-- parse failure instead of an exception thrown from pure code.
escapedTextLine :: Parser Text
escapedTextLine = do
  void (char '"')
  rest <- takeTillEOL
  endOfLine
  payload <-
    maybe (fail "Missing closing quotation mark") pure (closeQuote rest)
  either (fail . show) pure (TE.decodeUtf8' payload)
  where
    -- Drop trailing spaces/tabs, then require and remove the closing quote.
    -- 34 is the byte value of the double-quote character.
    closeQuote bytes = case unsnoc bytes of
      Just (front, lastByte)
        | isHorizontalSpace lastByte -> closeQuote front
        | lastByte == 34 -> Just front
      _ -> Nothing
-- | Attach a descriptive name to a parser; prefix form of '<?>' with the
-- label given first.
nameP :: String -> Parser a -> Parser a
nameP = flip (<?>)
-- | Parse one comment line: a @#@, any number of spaces, then the remaining
-- text of the line, which is returned without the line terminator.
--
-- The text is decoded as UTF-8. Invalid byte sequences are reported as a
-- parse failure instead of an exception thrown from pure code.
commentLine :: Parser Text
commentLine = nameP "comment line" $ do
  void (char '#') <?> "Line start hash"
  -- Skip spaces only: the line terminator must stay for 'endOfLine' below.
  void $ many (char ' ')
  raw <- takeTillEOL
  -- This consumes the end of the line, so the failure label says "EOL".
  endOfLine <?> "EOL"
  either (fail . show) pure (TE.decodeUtf8' raw)
-- | Parse a line that contains nothing but optional spaces and tabs.
--
-- Only horizontal whitespace is skipped before the terminator. 'skipSpace'
-- would also swallow the newline itself, leaving nothing for 'endOfLine' to
-- match, so the parser could never succeed.
emptyLine :: Parser ()
emptyLine = skipWhile isHorizontalSpace *> endOfLine
-- | Parse one PO entry: one or more comment lines, a @msgid@ line with
-- optional continuation lines, and a @msgstr@ line with optional
-- continuation lines.
--
-- The entry is terminated by one or more blank lines or by the end of the
-- input. Accepting end of input keeps the last entry of a file that does not
-- finish with a blank line; accepting several newlines tolerates entries
-- separated by more than one blank line.
poRecord :: Parser PORecord
poRecord = do
  comments <- many1 commentLine <?> "Comments"
  msgidPrimary <- msgidLine <?> "msgid"
  extraMsgid <- many escapedTextLine <?> "Extra msgid"
  msgstrPrimary <- msgstrLine <?> "msgstr"
  extraMsgstr <- many escapedTextLine <?> "Extra msgstr"
  (skipMany1 endOfLine <|> endOfInput) <?> "Record separator"
  pure
    PORecord
      { poComment = joinLines comments
      , poMsgid = joinLines (msgidPrimary : extraMsgid)
      , poMsgstr = joinLines (msgstrPrimary : extraMsgstr)
      }
  where
    -- NOTE(review): the gettext PO format concatenates adjacent string
    -- segments with no separator. Joining with a newline is kept here
    -- because existing consumers may rely on it -- confirm before changing.
    joinLines = T.intercalate "\n"
-- | Parse one or more consecutive PO entries.
--
-- NOTE(review): this parser does not demand end of input, so anything left
-- after the last entry that parses is not reported by this parser -- confirm
-- whether callers want a hard failure instead.
poFile :: Parser [PORecord]
poFile = nameP "PO results" $ many1 poRecord
-- | Read @mypot.pot@ from the current directory, run 'poFile' over its
-- contents, and print the resulting 'Either' (error message or entries).
main :: IO ()
main = B.readFile "mypot.pot" >>= print . parseOnly poFile