我的 python 脚本:#!/usr/bin/python import sys importdumbo import cgi, urlparse #from dumbo.lib import JoinReducer #from dumbo.decor import primary, secondary
def mapper(key, value):
line = value.split('\t')
line[1] = float(line[1]) # Unix timestamp
line[3] = int(line[3]) # Port
line[4] = float(line[4]) # Delay
url_part = urlparse.urlsplit(line[6])
if url_part.path not in ('/android-test.htm',
'/iphone-test.htm',
'/symbian-test.htm',
'/meego-test.htm',
'/mobile-test.htm',
'/showlog.php',
'/showlognew.php',):
qs_dict = cgi.parse_qs(url_part.query)
line[6] = qs_dict
line[7] = int(line[7]) # HTTP status code
yield line
def reducer(key, values):
yield key, values
if __name__ == "__main__":
dumbo.run(mapper)
执行命令:
dumbo start logparser.py -input analytics.log-20111209 -output analytics-log -python \ python2.6 -hadoop /usr/local/cloudera/hadoop-0.20.2-cdh3u3/
jobtracker 日志中的 hadoop 错误
2012-03-15 09:53:18,931 INFO org.apache.hadoop.mapred.TaskInProgress: Error from attempt_201203131446_0006_m_000003_2: java.lang.RuntimeException: java.lang.NullPointerException
at org.apache.hadoop.streaming.PipeMapRed.waitOutputThreads(PipeMapRed.java:376)
at org.apache.hadoop.streaming.PipeMapRed.mapRedFinished(PipeMapRed.java:572)
at org.apache.hadoop.streaming.PipeMapper.close(PipeMapper.java:136)
at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:57)
at org.apache.hadoop.streaming.PipeMapRunner.run(PipeMapRunner.java:34)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:391)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:325)
at org.apache.hadoop.mapred.Child$4.run(Child.java:270)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1157)
at org.apache.hadoop.mapred.Child.main(Child.java:264)
引起:org.apache.hadoop.streaming.io.TypedBytesOutputReader.readKeyValue(TypedBytesOutputReader.java:57) 的 java.lang.NullPointerException 提前感谢所有帮助。