0

我有一个这样的日志文件:

some strings...
<FX>
another strings...
<FX>
 <TEG1>
  <TEG2>
  </TEG2>
 </TEG1>
</FX>
some strings...
<FX>
<FX>
 <TEG1>
 </TEG1>
</FX>

我需要解析它并得到这个结果:

<FX>
 <TEG1>
  <TEG2>
  </TEG2>
 </TEG1>
</FX>

<FX>
 <TEG1>
 </TEG1>
</FX>

我已经写过这样的正则表达式:

<FX>([\s\S]+?)</FX>

但它返回此匹配项:

<FX>
another strings...
<FX>
 <TEG1>
  <TEG2>
  </TEG2>
 </TEG1>
</FX>

<FX>
<FX>
 <TEG1>
 </TEG1>
</FX>

有人能帮我修正这个正则表达式吗?提前感谢。

4

2 回答 2

0

取决于 "another strings..." 背后实际隐藏的内容,您或许可以用下面的办法应付过去:

  ' Read the whole log into memory, echo it, then echo every innermost
  ' <FX>...</FX> block followed by a separator line.
  ' goFS is a FileSystemObject created outside this snippet.
  Dim sAll : sAll = goFS.OpenTextFile("..\data\15168620.txt").ReadAll()
  WScript.Echo sAll
  WScript.Echo "--------"
  Dim reX  : Set reX = New RegExp
  reX.Global  = True
  ' Tempered pattern: after the opening <FX>, each character is consumed only
  ' if another <FX> does not start at that position. A match therefore can
  ' never span a stray opener, so it works whether zero, one or many
  ' unmatched <FX> tags precede a block. [\s\S] is used because "." does not
  ' match line breaks.
  reX.Pattern = "<FX>(?:(?!<FX>)[\s\S])+?</FX>"
  Dim oMTS : Set oMTS = reX.Execute(sAll)
  Dim oMT
  For Each oMT in oMTS
      ' The whole match is the block itself; no capture group is needed.
      WScript.Echo oMT.Value
      WScript.Echo "--------"
  Next

输出:

some strings...
<FX>
another strings...
<FX>
 <TEG1>
  <TEG2>
  </TEG2>
 </TEG1>
</FX>
some strings...
<FX>
<FX>
 <TEG1>
 </TEG1>
</FX>

--------
<FX>
 <TEG1>
  <TEG2>
  </TEG2>
 </TEG1>
</FX>
--------
<FX>
 <TEG1>
 </TEG1>
</FX>
--------

更新:

我仍然希望我们可以避免使用笨办法(逐行处理):

  ' Read the whole log, echo it, then cut it at every "FX>" and echo the
  ' pieces that are the body of a closed block.
  ' goFS is a FileSystemObject created outside this snippet.
  Dim sText : sText = goFS.OpenTextFile("..\data\15168620-2.txt").ReadAll()
  WScript.Echo sText
  WScript.Echo "--------"
  Dim aPieces : aPieces = Split(sText, "FX>")
  Dim nIdx, sPiece
  For nIdx = 0 To UBound(aPieces)
      sPiece = aPieces(nIdx)
      ' A piece ending in "</" was immediately followed by "FX>", i.e. it
      ' stops right at a closing </FX>; re-attach the text removed by Split.
      If Right(sPiece, 2) = "</" Then
         WScript.Echo "<FX>" & sPiece & "FX>"
         WScript.Echo "--------"
      End If
  Next

输出:

some strings...
<FX>
another <FX> strings...
<FX><FX><FX><FX><FX>
<FX>
<FX>
 <TEG1>
  <TEG2>
  </TEG2>
 </TEG1>
</FX>
some strings...
<FX>
<FX>
 <TEG1>
 </TEG1>
</FX>

--------
<FX>
 <TEG1>
  <TEG2>
  </TEG2>
 </TEG1>
</FX>
--------
<FX>
 <TEG1>
 </TEG1>
</FX>
--------

更新二:

笨办法——逐行读取,每遇到一个 <FX> 就开始一个新的集合,遇到 </FX> 时处理/输出该集合:

  ' Line-by-line extraction: every line starting with <FX> restarts the
  ' buffer, and a line starting with </FX> emits the buffered block.
  ' goFS is a FileSystemObject created outside this snippet.
  Dim alLines : Set alLines = CreateObject("System.Collections.ArrayList")
  alLines.Capacity = 500
  ' True only between an opening <FX> line and its closing </FX> line.
  Dim bInBlock : bInBlock = False
  Dim sLine
  Dim oTS     : Set oTS     = goFS.OpenTextFile("..\data\15168620-2.txt")
  Do Until oTS.AtEndOfStream
     sLine = oTS.Readline()
     If "<FX>" = Left(sLine, 4) Then
        ' A new opener discards whatever an earlier, unclosed <FX> collected.
        alLines.Clear
        alLines.Add sLine
        bInBlock = True
     ElseIf bInBlock Then
        alLines.Add sLine
        If "</FX>" = Left(sLine, 5) Then
           WScript.Echo Join(alLines.ToArray(), vbCrLf)
           WScript.Echo "--------"
           ' Reset so a stray </FX> later on cannot re-emit this block
           ' together with the unrelated lines that followed it.
           alLines.Clear
           bInBlock = False
        End If
     End If
     ' Lines outside any block are skipped instead of being buffered.
  Loop
  oTS.Close

输出:

<FX>
 <TEG1>
  <TEG2>
  </TEG2>
 </TEG1>
</FX>
--------
<FX>
 <TEG1>
 </TEG1>
</FX>
--------
于 2013-03-02T15:13:11.067 回答
0

With such a huge file (10 GB), RegExp is worthless. Here is my idea:

' StripInvalidXML.vbs
' Streams input_log.xml line by line and appends every well-delimited
' <FX> ... </FX> block to output_log.xml. An <FX> line that is followed by
' another <FX> line before any </FX> is treated as a stray opener and dropped.
Option Explicit

Const ForReading = 1, ForWriting = 2, ForAppending = 8
Const TristateUseDefault = -2, TristateTrue = -1, TristateFalse = 0
Const TAG_OPEN = "<FX>", TAG_CLOSE = "</FX>"

Dim fso, fin, fout
Dim sLine, sBlock
Dim bInBlock    ' True only between a TAG_OPEN line and its TAG_CLOSE line

Set fso  = CreateObject("Scripting.FileSystemObject")
Set fin  = fso.OpenTextFile("input_log.xml",  ForReading,  False)
' Opened for appending (created if missing): repeated runs add to the file.
Set fout = fso.OpenTextFile("output_log.xml", ForAppending, True)

sBlock   = ""
bInBlock = False

Do Until fin.AtEndOfStream
    sLine = fin.ReadLine
    If sLine = TAG_OPEN Then
        ' Restart the block; anything collected after an earlier, unclosed
        ' opener is discarded.
        sBlock   = sLine & vbNewLine
        bInBlock = True
    ElseIf bInBlock Then
        sBlock = sBlock & sLine & vbNewLine
        If sLine = TAG_CLOSE Then
            ' sBlock already ends with a line break, so WriteLine leaves one
            ' empty line between consecutive blocks in the output.
            fout.WriteLine sBlock
            ' Reset so a stray closing tag cannot re-emit stale content and
            ' so text between blocks is never accumulated in memory.
            sBlock   = ""
            bInBlock = False
        End If
    End If
    ' Lines outside any block are skipped.
Loop

fin.Close
fout.Close
于 2013-03-03T13:27:32.033 回答