我正在尝试实现一个简单的词典压缩算法,它使用确定性有限自动机作为数据结构(实际上它是确定性非循环有限状态自动机,请参阅Wikipedia entry)。当我对一个大型词典数据库运行程序时(我有两个数据集——一个包含 ~900.000 个唯一词,另一个包含 ~4.000.000 个唯一词)我得到一个堆溢出:
mindfa.exe: Heap exhausted;
Current maximum heap size is 1073741824 bytes (1024 MB);
use `+RTS -M<size>' to increase it.
6,881,239,544 bytes allocated in the heap
4,106,345,528 bytes copied during GC
1,056,362,696 bytes maximum residency (96 sample(s))
6,884,200 bytes maximum slop
1047 MB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 13140 colls, 0 par 2.14s 2.13s 0.0002s 0.0019s
Gen 1 96 colls, 0 par 197.37s 199.06s 2.0736s 3.3260s
INIT time 0.00s ( 0.00s elapsed)
MUT time 2.54s ( 12.23s elapsed)
GC time 190.09s (191.68s elapsed)
RP time 0.00s ( 0.00s elapsed)
PROF time 9.42s ( 9.51s elapsed)
EXIT time 0.00s ( 0.00s elapsed)
Total time 202.05s (203.91s elapsed)
%GC time 94.1% (94.0% elapsed)
Alloc rate 2,706,148,904 bytes per MUT second
Productivity 1.3% of total user, 1.2% of total elapsed
我的猜测是问题之一是懒惰addWord
和addWords
功能。
-- | Update the tree structure, starting from the current location.
addWord :: Zipper TnLabel -> B.ByteString -> Zipper TnLabel
addWord z s | B.null s = z
addWord (Zipper (DFA ts) parents) s = addWord z rest
where
ch = B.head s
rest = B.tail s
pack defaultFlag = packTransitionLabel ch (if B.null rest then bit bitWordStop else defaultFlag)
z = case break (\(w,_) -> getCh w == ch) ts of
(_, []) -> Zipper
{ _focus = DFA []
, _parents = (pack 0, [], ts) : parents
}
(left, (w, newFocus):right) -> Zipper
{ _focus = newFocus
, _parents = ((pack w), left, right) : parents
}
-- | Add a list of words to the DFA tree.
addWords :: Zipper TnLabel -> [B.ByteString] -> Zipper TnLabel
addWords z [] = z
addWords z (s:ss) = addWords z' ss
where
z' = addWord (root z) s
我已经阅读了seq
,$!
和!
,但仍然看不到如何在我的示例中使用它们。如何使代码严格(呃)?另一方面,也许我使用了错误的数据结构(树+拉链)?
这是我正在做的一个(不是很)简短、独立、正确(可编译)的示例。当你运行它时,它应该打印出状态数、转换数和整个 DFA 树,如下所示:
Lexicon
State# 16
Transition# 21
*
|
b--*
|
e--*
| |
| d!-*
| |
| s!-*
| |
| d--*
| |
| i--*
| | |
| | n--*
| | |
| | g!-*
| |
| e--*
| |
| d!-*
|
a--*
|
d!-*
|
n--*
| |
| e--*
| |
| s--*
| |
| s!-*
|
l--*
| |
| y!-*
|
a--*
|
s--*
|
s!-*
编码:
{-# LANGUAGE FlexibleInstances, OverloadedStrings #-}
module Main (main) where
import Prelude hiding (print)
import qualified Data.ByteString.Lazy as B hiding (unpack)
import qualified Data.ByteString.Lazy.Char8 as B (unpack)
import Data.Word (Word8, Word16)
import Data.Bits ((.|.), (.&.), bit, complement, testBit)
import Foreign.Storable (sizeOf)
import Text.Printf hiding (fromChar, toChar)
--------------------------------------------- Deterministic finite automaton
type TnLabel = Word16
bitsInWord :: Int
bitsInWord = sizeOf (0::TnLabel) * 8
bitWordStop :: Int
bitWordStop = bitsInWord-1 -- ^ marks the end of a word
packTransitionLabel :: Word8 -> TnLabel -> TnLabel
packTransitionLabel ch flags = (flags .&. complement 0xFF) .|. fromIntegral ch
getCh :: TnLabel -> Word8
getCh w = fromIntegral $ w .&. 0xFF
type Transition e = (e, DFA e)
data DFA e = DFA [Transition e]
deriving (Show, Eq)
-- DFA Zipper -----------------------------------------------------------------
data Zipper e = Zipper
{ _focus :: DFA e
, _parents :: [(e, [Transition e], [Transition e])]
}
deriving (Show)
-- Moving around ---------------------------------------------------------------
-- | The parent of the given location.
parent :: Zipper TnLabel -> Maybe (Zipper TnLabel)
parent (Zipper _ []) = Nothing
parent (Zipper focus ((event, left, right):parents)) = Just Zipper
{ _focus = DFA $ left++((event,focus):right)
, _parents = parents
}
-- | The top-most parent of the given location.
root :: Zipper TnLabel -> Zipper TnLabel
root z@(Zipper _ []) = z
root z = case parent z of
Nothing -> z
Just z2 -> root z2
-- Modification -----------------------------------------------------------------
-- | Update the tree structure, starting from the current location.
addWord :: Zipper TnLabel -> B.ByteString -> Zipper TnLabel
addWord z s | B.null s = z
addWord (Zipper (DFA ts) parents) s = addWord z rest
where
ch = B.head s
rest = B.tail s
pack defaultFlag = packTransitionLabel ch (if B.null rest then bit bitWordStop else defaultFlag)
z = case break (\(w,_) -> getCh w == ch) ts of
(_, []) -> Zipper
{ _focus = DFA []
, _parents = (pack 0, [], ts) : parents
}
(left, (w, newFocus):right) -> Zipper
{ _focus = newFocus
, _parents = ((pack w), left, right) : parents
}
-- | Add a list of words to the DFA tree.
addWords :: Zipper TnLabel -> [B.ByteString] -> Zipper TnLabel
addWords z [] = z
addWords z (s:ss) = addWords z' ss
where
z' = addWord (root z) s
-- Conversion ------------------------------------------------------------
empty :: Zipper TnLabel
empty = Zipper
{ _focus = DFA []
, _parents = []
}
toDFA :: Zipper TnLabel -> DFA TnLabel
toDFA (Zipper dfa _) = dfa
fromList :: [B.ByteString] -> DFA TnLabel
fromList = toDFA . root . addWords empty
-- Stats ------------------------------------------------------------------
-- | Number of states in the whole DFA tree.
stateCount :: DFA TnLabel -> Int
stateCount = go 0
where
go acc (DFA []) = acc
go acc (DFA ts) = go' (acc+1) ts
go' acc [] = acc
go' acc ((_,dfa):ts) = go 0 dfa + go' acc ts
-- | Number of transitions in the whole DFA tree.
transitionCount :: DFA TnLabel -> Int
transitionCount = go 0
where
go acc (DFA []) = acc
go acc (DFA ts) = go' acc ts
go' acc [] = acc
go' acc ((_,dfa):ts) = go 1 dfa + go' acc ts
-- DFA drawing ---------------------------------------------------------
draw' :: DFA TnLabel -> [String]
draw' (DFA ts) = "*" : drawSubTrees ts
where
drawSubTrees [] = []
drawSubTrees [(w, node)] = "|" : shift (toChar w : flagCh w : "-") " " (draw' node)
drawSubTrees ((w, node):xs) = "|" : shift (toChar w : flagCh w : "-") "| " (draw' node) ++ drawSubTrees xs
shift first other = zipWith (++) (first : repeat other)
flagCh flags = if testBit flags bitWordStop then '!' else '-'
toChar w = head . B.unpack . B.singleton $ getCh w
draw :: DFA TnLabel -> String
draw = unlines . draw'
print :: DFA TnLabel -> IO ()
print = putStr . draw
-- Main -----------------------------------------------------------------
main :: IO ()
main = do
let dfa = fromList ["bad", "badass", "badly", "badness", "bed", "bedded", "bedding", "beds"]
printf "Lexicon\n"
printf "\tState# %d\n" (stateCount dfa)
printf "\tTransition# %d\n" (transitionCount dfa)
print dfa