老听众,第一次提问!我有一个用于解析 Google Base Feed 文本文件的 Python 脚本。它提取特定的数据字段,并生成一个可以上传到 Bing 购物(Bing Shopping)的格式化文件。好不容易让它跑起来之后,我发现它输出的只是空白文件,而不是我想要的清理后的数据。我到底漏掉了什么?非常感谢任何帮助!事先声明:我是个十足的 Python 新手,这个脚本是在很多人的帮助下才写出来的。
import sys,os
import pandas as pd
import datetime
def remove_quotes(data):
    """Return ``data`` with every double-quote character removed.

    Each cleaned line is also printed, as the original helper did.

    Args:
        data: Text whose lines are separated by '\\n'.

    Returns:
        The text with all '"' characters removed, lines re-joined with '\\n'.
    """
    lines = data.split('\n')
    for i, line in enumerate(lines):
        lines[i] = line.replace('"', '')
        # Parenthesised single-argument print runs on Python 2 and 3 alike.
        print(lines[i])
    # Return the cleaned text; returning ``data`` here would discard the
    # replacements made above.
    return '\n'.join(lines)
def tab_error(index, line, output, expected_fields=19):
    """Report whether ``line`` has the wrong number of tab-separated fields.

    When the count differs from ``expected_fields`` a message is printed and
    written to ``output``.

    Args:
        index: Row number to embed in the error message.
        line: Text of the row to inspect.
        output: Writable file-like object that receives the error message.
        expected_fields: Number of tab-separated fields a valid row has.

    Returns:
        True if the field count is wrong, otherwise False.
    """
    count = len(line.split('\t'))
    if count == expected_fields:
        return False
    # The reported number is the difference in field count; it is negative
    # when the row has too few fields.
    err = 'Tab issue at line {linenum} : {numtabs} extra tabs'.format(
        linenum=index, numtabs=(count - expected_fields))
    # Parenthesised single-argument print runs on Python 2 and 3 alike.
    print(err)
    output.write(err + '\n')
    return True
def html_error(index, line, output):
    """Report whether ``line`` contains any of the flagged HTML fragments.

    The first fragment found is enough: one message is printed and written
    to ``output``, and the remaining fragments are not checked.

    Args:
        index: Row number to embed in the error message.
        line: Text of the row to inspect.
        output: Writable file-like object that receives the error message.

    Returns:
        True if a flagged fragment occurs anywhere in ``line``, else False.
    """
    # NOTE(review): several entries look like HTML entities that were decoded
    # somewhere along the way (e.g. '&' may originally have been '&amp;').
    # As written, the bare '&', '<', '>' and '"' entries reject every row
    # containing those characters - confirm against the intended list.
    htmltags = ['&fract12', '&39', '&', '&qt;', '<', '&rt;', '"', '>', 'quot', '’']
    for tag in htmltags:
        # ``in`` also catches a fragment at position 0, which a
        # ``find(tag) > 0`` comparison silently skips.
        if tag in line:
            err = 'HTML issue at line {linenum}'.format(linenum=index)
            # Parenthesised single-argument print runs on Python 2 and 3.
            print(err)
            output.write(err + '\n')
            return True
    return False
def read_data(filename):
    """Return the full contents of the text file at ``filename``."""
    with open(filename, 'r') as handle:
        return handle.read()
def tabs_check(data, output, filename):
    """Copy the rows of ``data`` that have exactly 19 fields to ``filename``.

    The first line of ``data`` is dropped and replaced by a generated header
    ``x1`` .. ``x19``. Every following row is validated with ``tab_error``;
    rows that fail are reported to ``output`` and skipped, the rest are
    written with double quotes removed.

    Args:
        data: Full text of the feed, with any of '\\n', '\\r\\n' or '\\r' as
            line terminator.
        output: Writable file-like object that receives error messages.
        filename: Path of the cleaned file to create (overwritten if present).
    """
    # Normalise every line-ending convention before splitting. Splitting on
    # '\r' alone yields a single chunk for '\n'-terminated text, so no data
    # row is ever written and the cleaned file contains only the header.
    lines = data.replace('\r\n', '\n').replace('\r', '\n').split('\n')
    # A terminating newline leaves one empty trailing element; it is not a
    # row and must not be reported as a tab error.
    if lines[-1] == '':
        lines.pop()

    # Join rather than append so the header has no trailing tab (which would
    # declare a 20th, unnamed column).
    header = '\t'.join('x' + str(n) for n in range(1, 20))

    with open(filename, 'w') as cleanfile:
        cleanfile.write(header)
        # ``i`` is the zero-based index of the data row (header excluded).
        for i, line in enumerate(lines[1:]):
            data_error = tab_error(i, line, output)
            if data_error:
                continue
            # Trim spaces and stray newline characters only: tabs are field
            # delimiters, and stripping them would shift the columns of a
            # row whose first or last field is empty.
            newline = line.replace('"', '').strip(' \r\n')
            cleanfile.write('\n' + newline)
def html_check(data, output, filename):
    """Copy ``data`` to ``filename``, dropping rows flagged by ``html_error``.

    The first line is written unchanged. Each later row is checked with
    ``html_error``; rows that pass are written with double quotes removed
    and surrounding whitespace stripped, and rows that end up empty are
    skipped.

    Args:
        data: Text whose rows are separated by '\\n'.
        output: Writable file-like object that receives error messages.
        filename: Path of the cleaned file to create (overwritten if present).
    """
    with open(filename, 'w') as cleanfile:
        rows = data.split('\n')
        header, body = rows[0], rows[1:]
        cleanfile.write(header)
        # ``row_index`` is the zero-based index of the row after the header.
        for row_index, row in enumerate(body):
            # Always run the check first so every flagged row is reported.
            rejected = html_error(row_index, row, output)
            cleaned = row.replace('"', '').strip()
            if rejected or not cleaned:
                continue
            cleanfile.write('\n' + cleaned)
if __name__ == '__main__':
    # Pipeline: raw feed -> clean19.txt (rows with 19 fields) -> tmp.txt
    # (selected and renamed columns) -> BT1.txt (rows free of HTML
    # fragments). Rejected rows are logged to bing_errors.txt.
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit('Usage: python {0} <feed_file>'.format(sys.argv[0]))

    # Clean tabs
    filename = sys.argv[1]
    ts = datetime.datetime.now().isoformat()
    # Parenthesised single-argument print runs on Python 2 and 3 alike.
    print(ts)
    with open('bing_errors.txt', 'w') as output:
        # print 'Removing quotes within .. product description and ...'
        # data = remove_quotes(data)
        print('Removing lines with more than 19 tabs...')
        data = read_data(filename)
        tabs_check(data, output, 'clean19.txt')

        # Delete and reorder columns
        print('Deleting and reordering columns...')
        df = pd.read_table('clean19.txt')
        tmp = df[['x8', 'x2', 'x3', 'x4', 'x6', 'x1', 'x5']]
        tmp.columns = ['MPID',
                       'Brand (BrandorManufacturer)',
                       'Title',
                       'Item Description',
                       'Price',
                       'ProductURL',
                       'ImageURL']
        tmp.to_csv('tmp.txt', index=False, sep='\t')
        os.remove('clean19.txt')

        # HTML errors
        print('Checking for HTML errors...')
        data = read_data('tmp.txt')
        html_check(data, output, 'BT1.txt')
        os.remove('tmp.txt')
        # row = tmp[tmp['MPID'] == 8724]
        # print row