最近我正在用 Python 分析 nginx 的访问日志。
我找到了 shlex,
并按照“用空格分割带引号的字符串”的方法来使用它。
但是真的很慢,分析2000行日志需要1.2秒多。我的 nginx 服务器每秒生成超过 2500 行。
所以我尝试改用 re,
或者用更原始(也更粗糙)的字符串索引方式来处理。
这些代码在虚拟机中运行,对于 2000 行日志,两者都花费了大约 0.5 秒以上
我还有其他选择可以提高效率吗?
提前致谢
这是我的代码
import re
import time
import datetime
# Sample nginx access-log entry; it is the input passed to convert() by the
# timing loop further down in this script.
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080 [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'
# The patterns are compiled once at import time so that the per-line hot path
# only performs matching, never pattern construction or cache lookups.
_METHOD_RE = re.compile(r"^((?:GET|POST|OPTIONS))\s+?.*HTTP\/1\.[10]$")
_STATUS_BYTES_RE = re.compile(r"^\s+(\d{1,3})\s+(\d+)")
_PORT_SUFFIX_RE = re.compile(r":\d+$")
_REQUEST_URI_RE = re.compile(r"^[A-Z]+\s+?(.*)HTTP\/1\.[10]$")
_SERVER_GUID_RE = re.compile(r"^([0-9\.]+)\s+(.*)")


def _to_millis(seconds_text):
    """Convert a decimal-seconds string such as '0.278' to whole milliseconds."""
    return int(float(seconds_text) * 1000)


def convert(line):
    """Parse one nginx access-log line into a dict of typed fields.

    The line is first cut on double quotes; the unquoted prefix is then cut
    on whitespace.  Positions are fixed, so the line must follow the layout
    of the sample entry in this script.

    Args:
        line: A single access-log line (str).

    Returns:
        dict with the keys "hour", "year", "date" (month * 100 + day),
        "time" (minute of the hour), "sec", "request_time",
        "upstream_response_time" (-1 when the log has '-'), "remote_addr",
        "upstream_addr" (port stripped), "host", "method" ('' when the
        request line is not GET/POST/OPTIONS over HTTP/1.x), "request_uri"
        (the list returned by ``findall``; the captured text keeps the
        space that precedes ``HTTP/``), "status", "bytes_sent",
        "http_referer", "user_agent", "gzip_ratio", "http_x_forwarded_for",
        "server_addr" and "guid".

    Raises:
        IndexError: If an expected field is missing or does not match.
        ValueError: If a numeric or timestamp field cannot be parsed.
    """
    quoted = line.split('"')
    # str.split() without arguments splits on whitespace runs and ignores
    # leading/trailing whitespace, so no regex is needed for the prefix.
    head = quoted[0].split()

    methods = _METHOD_RE.findall(quoted[1])
    http_method = methods[0] if methods else ''

    code_byte = _STATUS_BYTES_RE.findall(quoted[2])
    status = int(code_byte[0][0])
    bytes_sent = int(code_byte[0][1])

    upstream_addr = _PORT_SUFFIX_RE.sub("", head[4])
    request_time = _to_millis(head[0])
    upstream_response_time = -1 if head[1] == '-' else _to_millis(head[1])

    # Only the "[dd/Mon/yyyy:HH:MM:SS" token is parsed; the UTC-offset token
    # that follows it in the log is not read.
    dt = datetime.datetime.strptime(head[5].replace('[', ''),
                                    "%d/%b/%Y:%H:%M:%S")
    sec = time.mktime(dt.timetuple())

    serad_guid = _SERVER_GUID_RE.findall(quoted[11])

    return {
        "hour": dt.hour,
        "year": dt.year,
        "date": dt.month * 100 + dt.day,
        "time": dt.minute,
        "sec": sec,
        "request_time": request_time,
        "upstream_response_time": upstream_response_time,
        "remote_addr": head[2],
        "upstream_addr": upstream_addr,
        "host": head[7],
        "method": http_method,
        "request_uri": _REQUEST_URI_RE.findall(quoted[1]),
        "status": status,
        "bytes_sent": bytes_sent,
        "http_referer": quoted[3],
        "user_agent": quoted[5],
        "gzip_ratio": quoted[7],
        "http_x_forwarded_for": quoted[9],
        "server_addr": serad_guid[0][0],
        "guid": serad_guid[0][1],
    }
# Timing loop: parse the sample line 12000 times and print the elapsed
# wall-clock seconds for every batch of 2000 calls.
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        t1 = t2
        t2 = time.time()
        # Parenthesised form prints the same text on Python 2 and is also
        # valid on Python 3, where the bare print statement is a SyntaxError.
        print(str(t2 - t1))
以及下面的
字符串索引方式:
import time
import datetime
# Sample nginx access-log entry; it is the input passed to convert() by the
# timing loop further down in this script.
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080 [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'
def pair(l):
    """Yield the items of *l* two at a time as ``(l[0], l[1])``, ``(l[2], l[3])``, ...

    *l* must support ``len()`` and integer indexing.  When its length is odd,
    indexing past the end raises IndexError once the last, incomplete pair is
    reached.
    """
    total = len(l)
    left = 0
    while left < total:
        right = left + 1
        yield (l[left], l[right])
        left = right + 1
def convert(line):
    """Split a log line on spaces, keeping double-quoted sections intact.

    A space is a separator only when it lies outside a pair of double
    quotes.  Quoted fields are returned with their surrounding quotes, and
    their content is left untouched.  Runs of spaces outside quotes act as a
    single separator, so no empty fields are produced.

    Fixes over the previous version:
      * it removed every space from the line before looking for spaces, so
        the result was always an empty list;
      * the text before the first separator was never emitted;
      * every space was compared against every quote pair (quadratic); the
        line is now scanned once from left to right.

    Args:
        line: The log line to split (str).

    Returns:
        list of field strings, or None when the line contains no double
        quote or an odd number of them.
    """
    quote_total = line.count('"')
    if quote_total == 0 or quote_total % 2 != 0:
        return None

    fields = []
    in_quotes = False
    field_start = 0
    scan_from = 0
    while True:
        space = line.find(" ", scan_from)
        if space < 0:
            break
        # An odd number of quotes since the previous space flips whether we
        # are currently inside a quoted section.
        if line.count('"', scan_from, space) % 2:
            in_quotes = not in_quotes
        if not in_quotes:
            if space > field_start:
                fields.append(line[field_start:space])
            field_start = space + 1
        scan_from = space + 1

    if field_start < len(line):
        fields.append(line[field_start:])
    return fields
def allindices(string, sub):
    """Return the start index of every occurrence of *sub* in *string*.

    Indices are in ascending order.  The search resumes one character after
    each hit, so overlapping occurrences are all reported.
    """
    hits = []
    resume_at = 0
    while True:
        found = string.find(sub, resume_at)
        if found < 0:
            return hits
        hits.append(found)
        resume_at = found + 1
# Timing loop: split the sample line 12000 times and print the elapsed
# wall-clock seconds for every batch of 2000 calls.
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        t1 = t2
        t2 = time.time()
        # Parenthesised form prints the same text on Python 2 and is also
        # valid on Python 3, where the bare print statement is a SyntaxError.
        print(str(t2 - t1))