4

我有一些带有多行块的文本文件,例如

2011/01/01 13:13:13,<AB>, Some Certain Text,=,
[    
certain text
         [
                  0: 0 0 0 0 0 0 0 0 
                  8: 0 0 0 0 0 0 0 0 
                 16: 0 0 0 9 343 3938 9433 8756 
                 24: 6270 4472 3182 2503 1768 1140 836 496 
                 32: 326 273 349 269 144 121 94 82 
                 40: 64 80 66 59 56 47 50 46 
                 48: 64 35 42 53 42 40 41 34 
                 56: 35 41 39 39 47 30 30 39 
                 Total count: 12345
        ]
    certain text
]
some text
2011/01/01 14:14:14,<AB>, Some Certain Text,=,
[
 certain text
   [
              0: 0 0 0 0 0 0 0 0 
              8: 0 0 0 0 0 0 0 0 
             16: 0 0 0 4 212 3079 8890 8941 
             24: 6177 4359 3625 2420 1639 974 594 438 
             32: 323 286 318 296 206 132 96 85 
             40: 65 73 62 53 47 55 49 52 
             48: 29 44 44 41 43 36 50 36 
             56: 40 30 29 40 35 30 25 31 
             64: 47 31 25 29 24 30 35 31 
             72: 28 31 17 37 35 30 20 33 
             80: 28 20 37 25 21 23 25 36 
             88: 27 35 22 23 15 24 34 28
             Total count: 123456 
    ]
    certain text
some text
]

这些长度可变的块夹杂在其他文本之间。我想读出每个 : 之后的所有数字,并将每个块的数字保存到各自的数组中。对于上面的例子,将得到两个数组:

array1 = { 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 343 3938 9433 8756 6270 4472 3182 2503 1768 1140 836 496 326 273 349 269 144 121 94 82 64 80 66 59 56 47 50 46 64 35 42 53 42 40 41 34 35 41 39 39 47 30 30 39 12345 }

array2 = { 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 212 3079 8890 8941 6177 4359 3625 2420 1639 974 594 438 323 286 318 296 206 132 96 85 65 73 62 53 47 55 49 52 29 44 44 41 43 36 50 36 40 30 29 40 35 30 25 31 47 31 25 29 24 30 35 31 28 31 17 37 35 30 20 33 28 20 37 25 21 23 25 36 27 35 22 23 15 24 34 28 123456 }

我发现 lpeg 可能是实现它的一种轻量级方式。但我对 PEG 和 LPeg 完全陌生。请帮忙!

4

5 回答 5

5

LPEG版本:

local lpeg            = require "lpeg"
local lpegmatch       = lpeg.match
local C, Ct, P, R, S  = lpeg.C, lpeg.Ct, lpeg.P, lpeg.R, lpeg.S
local Cg              = lpeg.Cg

local data_to_arrays

do
  -- Character-level pieces. "blanks" covers horizontal whitespace only and
  -- never a line break, so each rule below stays confined to one line.
  local newline = P"\n\r" + P"\r\n" + P"\n" + P"\r"
  local blanks  = S" \t\v"^0
  local integer = R"09"^1

  -- One integer, captured and converted with tonumber, plus trailing blanks.
  local number  = C(integer) / tonumber * blanks

  -- Exactly eight captured numbers in a row.
  local octet = number
  for _ = 2, 8 do
    octet = octet * number
  end

  -- "<offset>: n n n n n n n n" -- the offset in front of the colon is
  -- matched but not captured.
  local row     = blanks * integer * P":" * blanks * octet * newline

  -- "Total count: <n>" line; its number is captured like any other value.
  local total   = blanks * P"Total count:" * blanks * number * newline

  -- All captures of one block are gathered into a single table.
  -- Alternative that stores the total under the key "count" instead:
  --   local body = Ct(row^1 * Cg(total, "count")^-1)
  local body    = Ct(row^1 * total^-1)

  -- "[" followed only by blanks up to the line end, the rows, then "]".
  local block   = P"[" * blanks * newline * body * blanks * P"]"

  -- Take a block wherever one starts; otherwise skip a single character.
  local scanner = Ct((block + 1)^0)

  --- Extract every numeric block found in `data`.
  -- @param data string to scan
  -- @return table holding one array of numbers per matched block
  function data_to_arrays(data)
    return lpegmatch(scanner, data)
  end
end

请注意,只有当数据块的每一行恰好包含八个整数时,这段代码才有效。视您的输入格式而定,这可能是缺点,也可能是优点 ;-)

和一个测试文件:

-- Sample input for the parser: two outer bracket blocks, each wrapping one
-- inner block of "<offset>: ..." rows. Only the first inner block ends with
-- a "Total count:" line. Note that `data` is assigned as a global.
data = [[
some text
[    
some text
         [
                  0: 0 0 0 0 0 0 0 0 
                  8: 0 0 0 0 0 0 0 0 
                 16: 0 0 0 9 343 3938 9433 8756 
                 24: 6270 4472 3182 2503 1768 1140 836 496 
                 32: 326 273 349 269 144 121 94 82 
                 40: 64 80 66 59 56 47 50 46 
                 48: 64 35 42 53 42 40 41 34 
                 56: 35 41 39 39 47 30 30 39 
                 Total count: 12345
        ]
    some text
]
some text
[
 some text
   [
              0: 0 0 0 0 0 0 0 0 
              8: 0 0 0 0 0 0 0 0 
             16: 0 0 0 4 212 3079 8890 8941 
             24: 6177 4359 3625 2420 1639 974 594 438 
             32: 323 286 318 296 206 132 96 85 
             40: 65 73 62 53 47 55 49 52 
             48: 29 44 44 41 43 36 50 36 
             56: 40 30 29 40 35 30 25 31 
             64: 47 31 25 29 24 30 35 31 
             72: 28 31 17 37 35 30 20 33 
             80: 28 20 37 25 21 23 25 36 
             88: 27 35 22 23 15 24 34 28 
    ]
    some text
some text
]
]]

-- Parse the sample input and pretty-print every extracted array.
local arrays = data_to_arrays(data)

for n, ar in ipairs(arrays) do
  local size = #ar
  io.write(("[%d] = { --[[size: %d items]]\n  "):format(n, size))

  for i = 1, size do
    -- Break the line after every fifth value; otherwise separate with a space.
    local sep
    if i % 5 == 0 then
      sep = "\n  "
    else
      sep = " "
    end
    io.write(("%d,%s"):format(ar[i], sep))
  end

  -- `count` is an optional named field on the array table; print it if set.
  local total = ar.count
  if total ~= nil then
    io.write(("\n  [\"count\"] = %d,"):format(total))
  end

  io.write("\n}\n")
end
于 2013-10-16T19:28:49.797 回答
3

我的纯 Lua 字符串库解决方案是这样的:

-- Collect the numbers of every balanced [...] block found in `file_content`
-- into `output_array`, one sub-array per block.
local bracket_pattern = "%b[]" -- balanced [...] span, nested brackets included
-- A run of digits immediately followed by whitespace or by a closing bracket.
-- Row offsets such as "16:" are skipped because a colon follows them; the
-- frontier test also keeps a number that sits directly before "]", which a
-- plain "%s+" suffix would silently drop.
local number_pattern = "(%d+)%f[%s%]]"
local output_array = {} -- output 2-dimensional array

for block_text in file_content:gmatch(bracket_pattern) do
    local numbers = {} -- values of the current block
    for digits in block_text:gmatch(number_pattern) do
        numbers[#numbers + 1] = tonumber(digits)
    end
    output_array[#output_array + 1] = numbers
end

编辑:这也适用于更新的文件格式。

于 2013-10-16T18:59:26.433 回答
3

试试这个不使用 LPEG 的代码:

-- assume T contains the text
-- Result: a[k] is the array of numbers found in the k-th balanced [...] span
-- of T; i ends up holding the number of spans.
local a = {}
local i = 0
for block in T:gmatch("%b[]") do
    -- Drop the "<offset>:" row labels so that only data values remain.
    local cleaned = block:gsub("%d+:", "")
    local values = {}
    for digits in cleaned:gmatch("%d+") do
        values[#values + 1] = tonumber(digits)
    end
    i = i + 1
    a[i] = values
end
于 2013-10-16T19:03:16.347 回答
2

phg 已经为您的问题提供了一个很好的 LPeg 解决方案,但这是另一个使用 LPeg 的 re 模块的解决方案。语法更接近 BNF,使用的运算符更像“正则表达式”,因此该解决方案可能更容易理解。

re = require 're'

--- Write the sequence part of `t` to the default output as "{v1,v2,...,}"
-- followed by a newline.
-- @param t table whose consecutive integer-keyed values are written in order
function dump(t)
  io.write("{")
  for _, value in ipairs(t) do
    io.write(value)
    io.write(",")
  end
  io.write("}\n")
end

-- Grammar in `re` syntax:
--   data_in       one or more blocks
--   block         leading text, then '[' block_content ']'
--   block_content either the data rows gathered into one table capture, or
--                 any mix of nested blocks and plain text
--   data_arr      one or more rows of the form: text ':' numbers
--   text          letters, digits, quotes and spaces, with optional
--                 surrounding whitespace
--   nums          space-separated digit runs, each captured and converted
--                 through `tonumber`
local textformat = [[
  data_in   <-  block+
  block     <-  text '[' block_content ']'
  block_content <- {| data_arr |} / (block / text)*
  data_arr  <- (text ':' nums whitesp)+
  text      <- whitesp [%w' ']+ whitesp
  nums      <- (' '+ {digits} -> tonumber)+
  digits    <- %d+
  whitesp   <- %s*
]]
-- The second argument makes `tonumber` available to the grammar by name.
local parser = re.compile(textformat, {tonumber = tonumber})
-- Only the first two captured tables are kept here.
-- NOTE(review): `data` is not defined in this snippet; presumably it is the
-- sample input string from the LPeg answer -- confirm.
local arr1, arr2 = parser:match(data)

dump(arr1)
dump(arr2)

每个数据数组块都被捕获到一个单独的表中,并作为 match 的返回值之一返回。

使用与上面相同的输入 data,可以匹配并捕获两个块,因此返回 2 个表。检查这两个表可得:

{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,343,3938,9433,8756,6270,4472,3182,2503,1768,1140,836,496,326,273,349,269,144,121,94,82,64,80,66,59,56,47,50,46,64,35,42,53,42,40,41,34,35,41,39,39,47,30,30,39,12345,}
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,212,3079,8890,8941,6177,4359,3625,2420,1639,974,594,438,323,286,318,296,206,132,96,85,65,73,62,53,47,55,49,52,29,44,44,41,43,36,50,36,40,30,29,40,35,30,25,31,47,31,25,29,24,30,35,31,28,31,17,37,35,30,20,33,28,20,37,25,21,23,25,36,27,35,22,23,15,24,34,28,}

于 2013-10-17T07:02:14.723 回答
1

我知道这个回复来得比较晚,但下面的语法定义要简短得多:该模式会找到开头的 [,捕获每个不带 : 后缀的数字,直到遇到闭合的 ] 为止。然后对整个 block 重复这一过程,直到不再匹配。

-- Compact `re` grammar (requires the `re` module to be in scope):
--   data   one or more blocks, each wrapped in its own table capture
--   block  either '[' ... ']' whose content is a mix of
--            * digits followed by ':'  (matched, not captured),
--            * digits                  (captured and converted via `int`),
--            * runs of characters that are neither ']' nor digits,
--          or a run of non-'[' characters followed by another block attempt.
-- `int` is bound to `tonumber` through the definitions table.
local patt = re.compile([=[
    data    <- {| block |}+
    block   <- ('[' ((%d+ ':') / { %d+ } -> int / [^]%d]+)+ ']') / ([^[]+ block)
]=], { int = tonumber })

您可以像这样一次性把所有返回的数组收集到一个表中:

-- The call is the only item in the table constructor, so every value that
-- match returns is stored in `a`. The long string "[=[ ... ]=]" is
-- presumably a placeholder for the real input text.
local a = { patt:match[=[ ... ]=] }
于 2015-12-28T00:16:29.573 回答