1

长期潜水,第一次发帖提问!我有一个用于解析 Google Base Feed 文本文件的 Python 脚本。它会提取特定的数据字段,并生成一个可以上传到 Bing 购物的格式化文件。终于让它跑起来之后,我发现它输出的只是空白文件,而不是我想要的清理后的数据。我漏掉了什么?非常感谢任何帮助!事先声明:我是个十足的 Python 新手,这个脚本是在很多人的帮助下才写出来的。

import sys,os
import pandas as pd
import datetime

def remove_quotes(data):
    """Return *data* with every double-quote character removed.

    The text is processed line by line (split on '\n'); each cleaned line
    is echoed to stdout and the cleaned lines are joined back with '\n'.

    Args:
        data: the full text to clean.

    Returns:
        The cleaned text. An empty string is returned unchanged.
    """
    cleaned = [line.replace('"', '') for line in data.split('\n')]
    for line in cleaned:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(line)
    # Return the cleaned text, not the untouched input.
    return '\n'.join(cleaned)

def tab_error(index, line, output, expected_columns=19):
    """Report a line whose tab-separated column count is not the expected one.

    Args:
        index: line number to mention in the error message.
        line: raw text of the line to inspect.
        output: writable file-like object that receives the error message.
        expected_columns: required number of tab-separated fields
            (defaults to 19).

    Returns:
        True if the line was reported as faulty, False if it is well-formed.
    """
    count = len(line.split('\t'))
    if count == expected_columns:
        return False

    # The reported number is the difference from the expected column count;
    # it is negative when the line has too few columns.
    err = 'Tab issue at line {linenum} : {numtabs} extra tabs'.format(
        linenum=index, numtabs=(count - expected_columns))
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(err)
    output.write(err + '\n')
    return True

def html_error(index, line, output):
    """Report a line that contains any of the known HTML/entity fragments.

    Args:
        index: line number to mention in the error message.
        line: raw text of the line to inspect.
        output: writable file-like object that receives the error message.

    Returns:
        True if a fragment was found (and reported), False otherwise.
    """
    # NOTE(review): the bare '&' entry matches any ampersand (e.g. in a URL
    # query string) and makes the other '&...' entries redundant -- confirm
    # that flagging every such line is intended.
    htmltags = ['&fract12', '&39','&', '&qt;', '<', '&rt;','"','>','quot','’']

    # Use a membership test so a fragment at the very start of the line
    # (index 0) is detected as well.
    if not any(tag in line for tag in htmltags):
        return False

    err = 'HTML issue at line {linenum}'.format(linenum=index)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(err)
    output.write(err + '\n')
    return True

def read_data(filename):
    """Return the entire contents of the text file *filename* as one string."""
    handle = open(filename, 'r')
    try:
        return handle.read()
    finally:
        # Always release the file handle, even if the read fails.
        handle.close()

def tabs_check(data, output, filename):
    """Write the 19-column rows of *data* to *filename* under a generic header.

    The first line of *data* is discarded and replaced by the header
    ``x1 .. x19``. Every following non-empty line is checked with
    ``tab_error``; lines that fail are reported to *output* and dropped,
    the rest are written with their double quotes removed.

    Args:
        data: full text of the feed, using any newline convention.
        output: writable file-like object that receives error messages.
        filename: path of the cleaned file to create (overwritten).
    """
    column_count = 19
    # Join rather than append so the header has no trailing tab, which
    # would otherwise declare a 20th, unnamed column.
    header = '\t'.join('x' + str(n) for n in range(1, column_count + 1))

    # Normalise every newline convention first: splitting on '\r' alone
    # yields no data rows at all for '\n'-terminated input.
    lines = data.replace('\r\n', '\n').replace('\r', '\n').split('\n')

    with open(filename, 'w') as cleanfile:
        cleanfile.write(header)
        # for each line in the file, skipping the feed's own header
        for i, line in enumerate(lines[1:]):
            if not line:
                # Empty lines (e.g. after a trailing newline) are not data.
                continue

            # check line for tabs error
            if tab_error(i, line, output):
                continue

            # Trim spaces only: stripping tabs would silently drop empty
            # first/last columns and shift the remaining fields.
            cleanfile.write('\n' + line.replace('"', '').strip(' '))

def html_check(data, output, filename):
    """Copy *data* to *filename*, dropping lines flagged by ``html_error``.

    The first line is written unchanged. Each later line is checked with
    ``html_error`` (which reports problems to *output*); lines that pass
    are written with double quotes removed and surrounding whitespace
    stripped, and lines that end up empty are omitted.
    """
    rows = data.split('\n')
    first_row, remaining = rows[0], rows[1:]

    with open(filename, 'w') as cleanfile:
        cleanfile.write(first_row)

        for position, raw in enumerate(remaining):
            # The check runs on the raw line, before any quote removal.
            flagged = html_error(position, raw, output)
            cleaned = raw.replace('"', '').strip()

            if flagged or not cleaned:
                continue
            cleanfile.write('\n' + cleaned)


if __name__ == '__main__':

    # The feed file to clean is the only command-line argument; fail with a
    # usage message instead of a bare IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: {0} <feed file>'.format(sys.argv[0]))
    filename = sys.argv[1]

    ts = datetime.datetime.now().isoformat()
    print(ts)

    # Every problem found by the checks below is logged to bing_errors.txt.
    with open('bing_errors.txt','w') as output:

        # Clean tabs: keep only rows with exactly 19 columns.
        print('Removing lines with more than 19 tabs...')
        data = read_data(filename)
        tabs_check(data, output, 'clean19.txt')

        # Delete and reorder columns
        print('Deleting and reordering columns...')
        df = pd.read_table('clean19.txt')
        tmp = df[['x8','x2','x3','x4','x6','x1','x5']]
        tmp.columns = ['MPID',
                       'Brand (BrandorManufacturer)',
                       'Title',
                       'Item Description',
                       'Price',
                       'ProductURL',
                       'ImageURL']

        tmp.to_csv('tmp.txt', index=False, sep='\t')
        # clean19.txt is only an intermediate file.
        os.remove('clean19.txt')

        #HTML errors
        print('Checking for HTML errors...')
        data = read_data('tmp.txt')
        html_check(data, output, 'BT1.txt')
        # tmp.txt is only an intermediate file; BT1.txt is the final result.
        os.remove('tmp.txt')
4

0 回答 0