下面的代码在处理一个 5GB 的文件时会占用 99% 的 CPU。我想知道我是不是哪里做得非常不对,或者有没有什么办法可以缩短执行时间。
2013-04-03 08:54:19,989 信息 [记录器] 2013-04-03T08:54:19.987-04:00PCMC.common.manage.springUtil<log-message-body><headers><fedDKPLoggingContext id="DKP_DumpDocumentProperties " type="context.generated.FedDKPLoggingContext"><logFilter>7</logFilter><logSeverity>255</logSeverity><schemaType>PCMC.MRP.DocumentMetaData</schemaType><UID>073104c-4e -4ce-bda-694344ee62</UID><consumerSystemId>JTR</consumerSystemId><consumerLogin>jbserviceid</consumerLogin><logLocation>成功完成服务</logLocation></fedDKPLoggingContext></headers><有效载荷>0</有效载荷></log-message-body>
这是我正在使用的代码。我也尝试过使用 gz 格式的输入,但同样没有效果。我在 bash 中用下面的命令调用这个 awk 脚本。
# Decompress the scan file on the fly (process substitution), run mytest.awk over it, and gzip the script's output into tem.gz.
awk -f mytest.awk <(gzip -dc 扫描文件.$yesterday.gz)| gzip > tem.gz
cat mytest.awk
#!/bin/awk -f
# to_ms - convert a clock string into a millisecond count.
#
#   time : text that is split on ":" and ","; the four resulting pieces are
#          weighted as hours, minutes, seconds and milliseconds.
#
# The extra parameters (total_sec, part) are never passed by callers; they
# are the usual awk way of declaring function-local variables.
#
# Returns (h*3600 + m*60 + s) * 1000 + ms.
function to_ms (time,    total_sec, part) {
    split(time, part, /:|\,/ )
    total_sec = part[1] * 3600 + part[2] * 60 + part[3]
    return total_sec * 1000 + part[4]
}
# Rule 1 (every record): extract the UID that pairs a "start" record with
# its "end" record and store it in the global stid.
#
# match() with a capture array replaces the original whole-line gensub().
# The leading ".*" is kept on purpose so that, exactly as before, the LAST
# "UID&gt;" that is followed by a non-"&" character is the one captured.
#
# Records without any UID element are skipped. Previously gensub() returned
# the entire record when nothing matched, so the whole line became an array
# key, which wastes memory on large inputs and creates meaningless pairs.
{
    if (!match($0, /.*UID&gt;([^&]+)/, uid_parts))
        next
    stid = uid_parts[1]
}

# Rule 2: the UID has been seen before, so this record is treated as the
# end of the request. Record its time and the fields reported in END.
#
# gensub()'s third argument must be "g"/"G" or a positive match number;
# the original passed "", which gawk coerces to 1 (possibly with a warning).
# It is now spelled out as 1.
(stid in starttime) {
    etime = to_ms($2)
    endtime[stid] = etime
    docid[stid]    = gensub(/.*id="([^""]+).*/, "\\1", 1)
    consumer[stid] = gensub(/.*schemaType&gt;PNC.([^.]+).*/, "\\1", 1)
    state[stid]    = gensub(/.*lt;logLocation&gt;([^'' ]+).*/, "\\1", 1)
    next
}

# Rule 3: first time this UID is seen, so this record is treated as the
# start of the request. $1 and $2 are the first two whitespace-separated
# fields of the record; $2 is the clock time handed to to_ms().
{
    stime = to_ms($2)
    starttime[stid] = stime
    st_hour[stid]   = stime/(60*60*1000)   # start time expressed in hours
    timestamp[stid] = $1" "$2
}
# END: emit one row per UID for which both a start and an end record were
# seen. Rows are piped through "sort -k3"; the header line is written
# directly to standard output.
#
# Changes from the original:
#   * The nested "for (x in starttime) for (y in endtime) if (x == y)" scan
#     was a quadratic join. A direct "x in endtime" membership test visits
#     the same pairs in a single pass.
#   * Elements are no longer deleted from the arrays while those arrays are
#     being iterated; the program exits right after this block, so the
#     deletes served no purpose.
#   * The sort pipe is closed explicitly so the sorted rows are written out
#     before the block finishes.
END {
    print "Document,Consumer,Hour,ResponseTime,Timestamp,State"
    # Push the header out before the sort child writes to the same stream.
    fflush()

    sort_cmd = "sort -k3"
    for (x in starttime) {
        if (!(x in endtime))
            continue            # start seen but never finished: no row
        diff = endtime[x] - starttime[x]
        st = sprintf("%02d", st_hour[x])
        print docid[x], consumer[x], st":00", diff, timestamp[x], state[x] | sort_cmd
    }
    close(sort_cmd)
}