python - 用python按空格分割引用字符串的更有效方法？

Question

最近我正在处理nginx分析工作与python的访问日志。

我找到了shlex根据这个使用空格分割引用字符串的方法

但是真的很慢，分析2000行日志需要1.2秒多。我的 nginx 服务器每秒生成超过 2500 行。

所以我尝试过使用re或更多原生（和粗鲁）的方式来索引字符串。

这些代码在虚拟机中运行，对于 2000 行日志，两者都花费了大约 0.5 秒以上

我还有其他选择可以提高效率吗？

提前致谢

这是我的代码

import re
import time
import datetime
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080  [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'
def convert(line):
    line = re.split('\"', line)
    line_pre = re.split('\s+', line[0])

    r =re.compile(r"^((?:GET|POST|OPTIONS))\s+?.*HTTP\/1\.[10]$")
    http_method =r.findall(line[1])
    #http_method =re.findall(r"^((?:GET|POST|OPTIONS))\s+?.*HTTP\/1\.[10]$", line[1])
    if len(http_method):
        http_method = http_method[0]
    else:
        http_method = ''
    r = re.compile(r"^\s+(\d{1,3})\s+(\d+)")
    code_byte = r.findall(line[2])
    #code_byte = re.findall(r"^\s+(\d{1,3})\s+(\d+)", line[2])
    status = int(code_byte[0][0])
    bytes_sent = int(code_byte[0][1])
    r = re.compile(r":\d+$")
    upstream_addr = r.sub("", line_pre[4])
    request_time = int(float(line_pre[0])*1000)
    if line_pre[1] == '-':
        upstream_response_time = -1
    else:
        upstream_response_time = int(float(line_pre[1])*1000)
    remote_addr = line_pre[2]
    host = line_pre[7].replace(' ','')
    logdatetime = line_pre[5].replace('[','')
    dt = datetime.datetime.strptime(logdatetime, "%d/%b/%Y:%H:%M:%S")
    year = int(str(dt)[0:4])
    monthday = int(str(dt)[4:10].replace("-",""))
    hour = int(str(dt)[11:13])
    logtime = int(str(dt)[14:16])
    sec = time.mktime(dt.timetuple())
    r = re.compile(r"^[A-Z]+\s+?(.*)HTTP\/1\.[10]$")
    request_uri = r.findall(line[1])
    #request_uri = re.findall(r"^[A-Z]+\s+?(.*)HTTP\/1\.[10]$", line[1])
    http_referer = line[3]
    user_agent = line[5]
    gzip_ratio = line[7]
    http_x_forwarded_for = line[9]
    r = re.compile(r"^([0-9\.]+)\s+(.*)")
    serad_guid = r.findall(line[11])
    server_addr = serad_guid[0][0]
    guid = serad_guid[0][1]
    doc = {
                    "hour":hour,
                    "year":year,
                    "date":monthday,
                    "time":logtime,
                    "sec":sec,
                    "request_time":request_time,
                    "upstream_response_time":upstream_response_time,
                    "remote_addr":remote_addr,
                    "upstream_addr":upstream_addr,
                    "host":host,
                    "method":http_method,
                    "request_uri":request_uri,
                    #"request_protocal":"",
                    "status":status,
                    "bytes_sent":bytes_sent,
                    "http_referer":http_referer,
                    "user_agent":user_agent,
                    "gzip_ratio":gzip_ratio,
                    "http_x_forwarded_for":http_x_forwarded_for,
                    "server_addr":server_addr,
                    "guid":guid

    }
    return doc
t2 = time.time()
count =0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
    t1 = t2
        t2 = time.time()
        print str(t2-t1)

和

索引方式

import time
import datetime
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080  [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'

def pair(l):
    for i in range(0, len(l), 2):
        yield (l[i], l[i+1])

def convert(line):
    line = line.replace("  ", "")
    quotes_positions = allindices(line, "\"")
    if len(quotes_positions) <= 0 or len(quotes_positions)% 2 != 0:
        return None

    space_positions = allindices(line, " ")

    target_positions = []

    for s in space_positions:
        true_target = True
        for qs, qe in pair(quotes_positions):
            if s > qs and s < qe:
                true_target = False
                break
        if true_target:
            target_positions.append(s)

    ret = []
    for i in range(0, len(target_positions)):
        if i + 1 == len(target_positions):
            ret.append(line[target_positions[i] + 1:])
        else:
            ret.append(line[target_positions[i] + 1:target_positions[i + 1]])
    return ret


# def allindices(string, sub, listindex=[], offset=0):
def allindices(string, sub):
    listindex = list()
    i = string.find(sub)
    while i >= 0:
        listindex.append(i)
        i = string.find(sub, i + 1)
    return listindex

t2 = time.time()
count =0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
    t1 = t2
        t2 = time.time()
        print str(t2-t1)

score 3 · Accepted Answer

这看起来有点像 CSV；我想知道是否可以滥用 csv 模块来使用它？

>>> for row in csv.reader([line], delimiter=' '):
...     print repr(row)
... 
['0.278', '0.264', '113.116.52.174', '-', '10.10.3.41:20080', '', '[08/Apr/2012:23:59:08', '+0800]', 'shenzhen.anjuke.com', 'GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0', '200', '10914', 'http://shenzhen.anjuke.com/prop/view/104178677', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)', '-', '-', '-', '114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E']

score 3 · Accepted Answer

只是根据示例行写了一个正则表达式，我实际上并不知道某些字段的含义，所以我为它们使用了占位符名称，您可以将它们重命名为更有意义的名称。在我的机器上，这个片段比你的第一个片段快 4~5 倍。

log_line_re = re.compile(
r"""
(?P<float1>[0-9.]+)
\s
(?P<float2>[0-9.]+)
\s
(?P<ip1>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
\s
(?P<field1>.+?)
\s
(?P<ip_port_1>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5})
\s+
\[(?P<request_date>.+?)\]
\s
(?P<host>.+?)
\s
"
(?P<http_method>[A-Z]+)
\s
(?P<request_path>.+?)
\s
HTTP/(?P<http_version>[0-9.]+)
"
\s
(?P<status_code>\d{3})
\s
(?P<number>\d+)
\s
"
(?P<referer>.+?)
"
\s
"(?P<user_agent>.+?)"
\s
"(?P<field2>.+?)"
\s
"(?P<field3>.+?)"
\s
(?P<field4>.+?)
"
(?P<ip2>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
\s
(?P<request_guid>.+?)
"
""", re.VERBOSE)


def convert(line):
    return log_line_re.match(line).groupdict()

python - 用python按空格分割引用字符串的更有效方法？

2 回答 2

Related

Reference