1

嗨,我有一个以下格式的输入文件。

    .....
    ......

    <TABLE COLS="3">
             <ROW>
                <R>data</R>
                <R>data</R>   
              </ROW>
              <ROW>
                <R>data</R>
                <R>data</R>
                <R>data</R>
              </ROW>
    </TABLE>
    <TABLE COLS="4">
             <ROW>
                <R>data</R>
                <R>data</R>
                <R>data</R>
                <R>data</R>
                <R>data</R>   
              </ROW>
              <ROW>
                <R>data</R>
                <R>data</R>
              </ROW>
    </TABLE> 
    .......
    .....
    .
    ...

输出文件应为:

....
....
.
..

<table ct="3">
<ent="1">
<ent="2">
<ent="3">

         <row>
            <rvn ="1">data</rvn>
            <rvn ="2">data</rvn>  
          </row>
          <row>
            <rvn ="1">data</rvn>
            <rvn ="2">data</rvn>
            <rvn ="3">data</rvn>  
          </row>
</table>
<table ct="4">
<ent="1">
<ent="2">
<ent="3">
<ent="4">
         <row>
            <rvn ="1">data</rvn>
            <rvn ="2">data</rvn> 
            <rvn ="3">data</rvn> 
            <rvn ="4">data</rvn>
            <rvn ="5">data</rvn>
          </row>
          <row>
            <rvn ="1">data</rvn>
            <rvn ="2">data</rvn>
          </row>
</table>
...
...
...

我已经编写了以下代码。运行后,每个表的 COLS 值都被最后一个表的 COLS 值替换了;另外,我也不知道如何让 <rvn> 的编号逐个递增。有人能帮我解决这个问题吗?

    import re

    def tblcnv( st, val ):
        """Rewrite every ``<table ...>`` opening tag in *st* as a ``ct`` header.

        Each opening tag is replaced by ``<table ct='VAL'>`` followed by one
        ``<colspec>`` line for every column number from 1 to VAL.

        Args:
            st: Text containing the table markup to rewrite.
            val: Column count as a decimal string.

        Returns:
            *st* with its opening table tags replaced.

        Raises:
            ValueError: If *val* cannot be converted with ``int()``.
        """
        Tcolspec = "".join(
            "<colspec col='" + str(i) + "' colwidth=''/>\n"
            for i in range(1, int(val) + 1)
        )
        # Substitute once, after the column list is complete. Doing this inside
        # the loop repeated identical work on every iteration and left the
        # result unbound when val was "0".
        Theader = re.sub(r"(?i)<table.*?>", "<table ct='" + val + "'>\n" + Tcolspec + "\n", st)
        return Theader

    # Open the input file for reading and the output file for writing.
    in_data = open("in.txt", "r")
    out_data = open("out.txt", "w")
    Rdata = in_data.read()
    # Replace every newline with a space so that `.` in the pattern below can
    # match across what were line breaks.
    Rval = Rdata.replace("\n", " ")

    # NOTE(review): the `.*` between `<TABLE` and `cols=` is greedy, so on input
    # with several tables a single match can run from the first `<TABLE` up to
    # the last `cols="N"`; group(2) is then the last table's column count, and
    # tblcnv applies it to every table tag inside the match.
    Rval = re.sub("(?i)(<TABLE.*cols=\"(\d+).*?</TABLE>)", lambda m: tblcnv(m.group(1), m.group(2)), Rval)
    out_data.write(Rval)
4

2 回答 2

1

使用 HTML/XML 解析器是一种操作 HTML/XML 的更简单且不易出错的方法。

它更容易,因为解析器允许您处理更高级别的概念:标签和属性,而不是任意字符串上的正则表达式。


这是使用lxml的示例:

import lxml.etree as ET
import itertools as IT

# Sample input: the two tables from the question, wrapped in a single <root>
# element.
content = '''\
<root>
<TABLE COLS="3">
         <ROW>
            <R>data</R>
            <R>data</R>   
          </ROW>
          <ROW>
            <R>data</R>
            <R>data</R>
            <R>data</R>
          </ROW>
</TABLE>
<TABLE COLS="4">
         <ROW>
            <R>data</R>
            <R>data</R>
            <R>data</R>
            <R>data</R>
            <R>data</R>   
          </ROW>
          <ROW>
            <R>data</R>
            <R>data</R>
          </ROW>
</TABLE>
</root>
'''

root = ET.fromstring(content)
# Fallback counter so that an <R> met before any <ROW> is still numbered
# instead of raising NameError.
count = IT.count(1)
for elt in root.iter():
    # Lower-case every tag name (TABLE -> table, ROW -> row, R -> r).
    elt.tag = elt.tag.lower()
    if elt.tag == 'table':
        # Rename the COLS attribute to ct.
        elt.attrib['ct'] = elt.attrib['COLS']
        del elt.attrib['COLS']
        # Add <ent> tags. Inserting at index 0 in descending order leaves the
        # children in ascending order (1, 2, ..., ct) ahead of the rows.
        for i in range(int(elt.attrib['ct']), 0, -1):
            elt.insert(0, ET.Element('ent', value=str(i)))
    # Restart count every time <row> is encountered
    if elt.tag == 'row':
        count = IT.count(1)
    # Change <R> to <rvn> and number it within the current row
    if elt.tag == 'r':
        elt.tag = 'rvn'
        elt.attrib['value'] = str(next(count))
# encoding='unicode' makes tostring() return text rather than bytes, so the
# markup itself is printed on Python 3 instead of a b'...' repr.
print(ET.tostring(root, pretty_print=True, encoding='unicode'))

输出结果:

<root>
<table ct="3">
         <ent value="1"/><ent value="2"/><ent value="3"/><row>
            <rvn value="1">data</rvn>
            <rvn value="2">data</rvn>   
          </row>
          <row>
            <rvn value="1">data</rvn>
            <rvn value="2">data</rvn>
            <rvn value="3">data</rvn>
          </row>
</table>
<table ct="4">
         <ent value="1"/><ent value="2"/><ent value="3"/><ent value="4"/><row>
            <rvn value="1">data</rvn>
            <rvn value="2">data</rvn>
            <rvn value="3">data</rvn>
            <rvn value="4">data</rvn>
            <rvn value="5">data</rvn>   
          </row>
          <row>
            <rvn value="1">data</rvn>
            <rvn value="2">data</rvn>
          </row>
</table>
</root>
于 2013-01-29T17:06:00.727 回答
1

这是修改后可以正常运行的代码...

注意:您不应该用正则表达式来处理这类问题...使用解析器总是更好的方法...

import re

# Per-row cell counter shared by the two functions below: rowcnv() resets it
# for each row and datacnv() draws the next number from it.
counter = None


def datacnv(st):
    """Wrap one cell's text in an ``<rvn="N">`` tag.

    N is the next value taken from the module-level ``counter``.

    Args:
        st: Text found between ``<R>`` and ``</R>``.

    Returns:
        ``<rvn="N">`` + *st* + ``</rvn>`` followed by a newline.

    Raises:
        TypeError: If ``counter`` is still ``None`` (rowcnv has not run yet).
    """
    # str() accepts both integer and string counters.
    return "<rvn=\"" + str(next(counter)) + "\">" + st + "</rvn>\n"


def rowcnv(st):
    """Convert the inside of one ``<ROW>`` element to ``<row>`` markup.

    Every ``<R>...</R>`` cell in *st* is passed through ``datacnv`` so that the
    cells are numbered from 1 within this row.

    Args:
        st: Text found between ``<ROW>`` and ``</ROW>``.

    Returns:
        The converted cells wrapped in ``<row>`` ... ``</row>`` lines.
    """
    global counter
    import itertools

    # Restart numbering for each row. An unbounded counter replaces the former
    # iter("123456789"), which raised StopIteration on a tenth cell.
    counter = itertools.count(1)

    st = re.sub("(?i)<R>(.*?)</R>", lambda m: datacnv(m.group(1)), st)

    return "<row>\n" + st + "</row>\n"

def tblcnv( st, val ):
    """Convert one table's markup: new header, colspec lines, converted rows.

    Every ``<table ...>`` opening tag in *st* is replaced by a
    ``<table ct='VAL'>`` line followed by one ``<colspec>`` line per column
    (1..VAL); each ``<ROW>...</ROW>`` section is then handed to ``rowcnv``.

    Args:
        st: Text of the table to convert.
        val: Column count as a decimal string.

    Returns:
        The converted table text.
    """
    column_total = int(val)
    colspec_lines = [
        "<colspec col='" + str(number) + "' colwidth=''/>\n"
        for number in range(1, column_total + 1)
    ]
    opening = "\n<table ct='" + val + "'>\n" + "".join(colspec_lines) + "\n"

    # First swap the opening tag(s), then convert the rows of the result.
    with_header = re.sub(r"(?i)<table.*?>", opening, st)
    return re.sub("(?i)<ROW>(.*?)</ROW>", lambda m: rowcnv(m.group(1)), with_header)

# Read the whole input first; the context manager closes the handle even if
# reading fails.
with open("in.txt", "r") as in_data:
    # NOTE: lower() is applied to the entire file content, not only to the
    # tag names.
    Rdata = in_data.read().lower()

# Replace newlines with spaces so that `.` in the patterns can match across
# what were line breaks.
Rval = Rdata.replace("\n", " ")

# The non-greedy `.*?` before `cols=` keeps each match inside a single table,
# so group(2) is that table's own column count. A raw string is used so that
# `\d` reaches the regex engine without an invalid-escape warning.
Rval = re.sub(r'(?i)(<TABLE.*?cols="(\d+).*?</TABLE>)', lambda m: tblcnv(m.group(1), m.group(2)), Rval)

# Open the output only after the conversion has succeeded, so a failure above
# does not leave a truncated out.txt behind.
with open("out.txt", "w") as out_data:
    out_data.write(Rval)

输出

<table ct='3'>
<colspec col='1' colwidth=''/>
<colspec col='2' colwidth=''/>
<colspec col='3' colwidth=''/>

              <row>
                 <rvn="1">data</rvn>
                 <rvn="2">data</rvn>
                  </row>
               <row>
                 <rvn="1">data</rvn>
                 <rvn="2">data</rvn>
                 <rvn="3">data</rvn>
               </row>
     </table>     
<table ct='4'>
<colspec col='1' colwidth=''/>
<colspec col='2' colwidth=''/>
<colspec col='3' colwidth=''/>
<colspec col='4' colwidth=''/>

              <row>
                 <rvn="1">data</rvn>
                 <rvn="2">data</rvn>
                 <rvn="3">data</rvn>
                 <rvn="4">data</rvn>
                 <rvn="5">data</rvn>
                  </row>
               <row>
                 <rvn="1">data</rvn>
                 <rvn="2">data</rvn>
               </row>
     </table>  
于 2013-01-29T14:22:37.227 回答