So I ended up just writing a Python script to do this. It's not terrible. Here's the script in case anyone else wants to use it. Obviously it needs more error checking, the URLs shouldn't be hard-coded, and so on, but you get the idea. Note that you'll need to install Beautiful Soup to run it.
#!/usr/bin/python
import sys
import re

from bs4 import BeautifulSoup as BS
from urllib2 import urlopen

TRACKER_BASE_URL = 'http://my.tracker.com:50030/'
# Task list page for a job; type is "map" or "reduce"
trackerURLformat = TRACKER_BASE_URL + 'jobtasks.jsp?jobid=%s&type=%s&pagenum=1'

def findLogs(url):
    finalLog = ""

    print "Looking for Job: " + url
    html = urlopen(url).read()
    trackerSoup = BS(html)

    # Every task on the job page links to a taskdetails.jsp page
    taskURLs = [h.get('href') for h in trackerSoup.find_all(href=re.compile('taskdetails'))]

    # Now that we know where all the tasks are, go find their logs
    logURLs = []
    for taskURL in taskURLs:
        taskHTML = urlopen(TRACKER_BASE_URL + taskURL).read()
        taskSoup = BS(taskHTML)
        # The "All" link (all=true) points at the full task log on the tasktracker
        allLogURL = taskSoup.find(href=re.compile('all=true')).get('href')
        logURLs.append(allLogURL)

    # Now fetch the stdout log from each task and concatenate them
    for logURL in logURLs:
        logHTML = urlopen(logURL).read()
        logSoup = BS(logHTML)
        stdoutText = logSoup.body.pre.text.lstrip()
        finalLog += stdoutText

    return finalLog

def main(argv):
    with open(argv[1] + "-map-stdout.log", "w") as f:
        f.write(findLogs(trackerURLformat % (argv[1], "map")))
        print "Wrote mapper stdouts to " + f.name

    with open(argv[1] + "-reduce-stdout.log", "w") as f:
        f.write(findLogs(trackerURLformat % (argv[1], "reduce")))
        print "Wrote reducer stdouts to " + f.name

if __name__ == "__main__":
    main(sys.argv)
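Usage is just the job ID as the only argument, something like

python fetch_logs.py job_201301011234_0042

(the script name and job ID are made up here); it drops <jobid>-map-stdout.log and <jobid>-reduce-stdout.log into the current directory.

Also, the script is Python 2 only since it uses urllib2 and print statements. If you're on Python 3, here's a minimal sketch of what the fetching half would look like, assuming the tracker pages are laid out the same way; fetch_soup and find_logs are just names I picked:

#!/usr/bin/env python3
import re

from bs4 import BeautifulSoup as BS
from urllib.request import urlopen

TRACKER_BASE_URL = 'http://my.tracker.com:50030/'  # same placeholder as above

def fetch_soup(url):
    # urllib.request.urlopen replaces urllib2.urlopen on Python 3
    return BS(urlopen(url).read(), 'html.parser')

def find_logs(url):
    final_log = ""
    tracker_soup = fetch_soup(url)
    # Same scraping as above: job page -> task detail pages -> "all=true" log pages
    for h in tracker_soup.find_all(href=re.compile('taskdetails')):
        task_soup = fetch_soup(TRACKER_BASE_URL + h.get('href'))
        all_log_url = task_soup.find(href=re.compile('all=true')).get('href')
        log_soup = fetch_soup(all_log_url)
        final_log += log_soup.body.pre.text.lstrip()
    return final_log

The main() half ports over unchanged apart from turning the print statements into print() calls.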