不幸的是,我之前发现 Excel 只会导入元素的值,而不会导入这些元素的属性。例如,对于一个元素值为 stuff211、属性值为 stuff111 的元素,Excel 会导入 stuff211,但不会导入 stuff111。我认为这只是 Excel 中该功能工作方式的一个限制。导入必须在 Excel 中完成吗,还是可以使用 Python 之类的编程语言?我之前写过一个程序,用来从 xml 文件中提取特定的元素值和属性值;如果需要,我很乐意明天把它找出来分享给你。
更新
这是我之前编写的 Python 脚本,用于把 xml 文件中的数据提取到 csv 文件中。请注意,它目前还不能获取所有的属性和元素,因为我最初的目的只是从文件中获取特定数据。您需要编辑全局列表 search_items,把它改成您要搜索的项目。
您可以从命令行调用该脚本,并把 xml 文件路径作为唯一的参数传入;也可以不带参数运行,此时系统会提示您选择要解析的文件。如有任何问题,请告诉我:
#!/usr/bin/python
# Change Ideas
# ------------
# Add option to get all xml elements / attributes
# Add support for json?
import csv
import os
import sys
import Tkinter
import tkFileDialog as fd
import traceback
# Tkinter is only needed for the file dialog, so create the root window and
# hide it straight away to stop an empty Tk window from opening.
root = Tkinter.Tk()
root.withdraw()
# globals
# When True, find_vals_in_file() and main() print verbose tracing output.
debug_on = False
#get_all_elements = False
#get_all_attributes = False
# Search items must be defined each time you run, to identify the values to search for.
# Each item should have a search text, a type and optionally a heading, e.g.
# search_items = ['exact_search_text', 'item_type', 'column_heading(optional)']
# note: search items are case sensitive.
# The order of the entries is also the order of the columns in the output.
#
############################## E X A M P L E ##############################
search_items = [
['policyno=', 'xml_attribute', 'Policy No' ],
['transid=', 'xml_attribute', 'Trans ID' ],
['policyPremium=', 'xml_attribute', 'Pol Prem' ],
['outstandingBalance=', 'xml_attribute', 'Balance' ],
['APRCharge=', 'xml_attribute', 'APR Chrg' ],
['PayByAnnualDD=', 'xml_attribute', 'Annual DD' ],
['PayByDD=', 'xml_attribute', 'Pay by DD' ],
['mtaDebitAmount=', 'xml_attribute', 'MTA Amt' ],
['paymentMethod=', 'xml_attribute', 'Pmt Meth' ],
['ddFirstPaymentAmount=', 'xml_attribute', '1st Amt' ],
['ddRemainingPaymentsAmount=', 'xml_attribute', 'Other Amt' ],
['ddNumberOfPaymentsRemaining=', 'xml_attribute', 'Instl Rem' ],
]
# The item types that find_val_in_line() has a branch for.
# NOTE(review): this list is not read anywhere in the visible code.
item_types = ['xml_attribute', 'xml_element']
def get_heads():
    """Return the CSV column headings, one per entry in ``search_items``.

    Each search item is ``[search_text, item_type]`` with an optional third
    element holding the column heading.  When the heading is missing or
    empty, the search text itself is used as the heading.

    Returns:
        A list of headings in the same order as ``search_items``.
    """
    heads = []
    for item in search_items:
        # Explicit length check instead of ``assert``: assertions are removed
        # when Python runs with -O, which would let empty headings through.
        heading = item[2] if len(item) > 2 else None
        heads.append(heading if heading else item[0])
    return heads
def write_csv_file(path, heads, data):
    """
    Write ``data`` to a CSV file at ``path``.

    path:  location of the file to create (an existing file is overwritten).
    heads: sequence of column headings written as the first row; use None
           (or any empty value) if no headers are required.
    data:  iterable of rows, each row being a sequence of values.

    A row that cannot be written is reported on stdout together with its
    traceback and skipped; the remaining rows are still written.
    """
    # The Python 2 csv module expects the file to be opened in binary mode.
    with open(path, 'wb') as fileout:
        writer = csv.writer(fileout)
        if heads:
            writer.writerow(heads)
        for row in data:
            try:
                writer.writerow(row)
            except Exception:
                # Catch Exception rather than everything, so that Ctrl-C and
                # SystemExit still stop the program instead of being
                # reported as a bad row.
                print('...row failed in write to file: %s' % (row,))
                for line in traceback.format_exception(*sys.exc_info()):
                    print('!! ' + line)
    print('Data written to: %s \n' % (path,))
def find_val_in_line(item, item_type, line):
    """Extract the value that follows ``item`` in ``line``.

    item:      search text, e.g. ``'policyno='`` for an attribute or the tag
               name for an element.
    item_type: ``'xml_attribute'`` or ``'xml_element'`` (case-insensitive).
    line:      a single line of text from the xml file.

    For an attribute the value is the text between the first pair of double
    quotes after the start of ``item``.  For an element it is the text
    between the first ``>`` after the start of ``item`` and the next ``<``.

    Returns the extracted string, or None for an unsupported ``item_type``.

    Note: the caller is expected to have checked that ``item`` occurs in
    ``line``.  If the item or a boundary is missing, ``str.find`` returns -1
    and the returned slice is not meaningful.
    """
    kind = item_type.lower()
    if kind == 'xml_element':
        print('Testing still in progress for xml elements, please check output carefully')
        b1, b2 = ">", "<"
        tmp = line.find(item)  # find the starting point of the element
        # Step past the closing '>' of the opening tag.  This used to
        # reference an undefined name (``boundary``) and raised NameError.
        x = line.find(b1, tmp + 1) + len(b1)
        y = line.find(b2, x)  # next '<' marks the end of the element value
        return line[x:y]
    if kind == 'xml_attribute':
        b = '"'
        tmp = line.find(item)  # find the starting point of the attribute
        x = line.find(b, tmp + 1) + len(b)  # just after the opening quote
        y = line.find(b, x)  # closing quote marks the end of the value
        return line[x:y]  # value between the opening and closing quotes
    print('This program does not currently support type: %s' % (item_type,))
    print('Returning null')
    return None
def find_vals_in_file(file_path):
    """Scan ``file_path`` line by line and collect rows of matched values.

    Every line is checked against each entry of ``search_items``.  When an
    item's search text occurs in the line, its value is extracted with
    ``find_val_in_line`` and stored in the column belonging to that search
    item.  Once every search item has been seen, the row is added to the
    result and a new row is started.

    If an item is found again before the row is complete, the newer value
    replaces the earlier one and the item is not counted twice.

    Returns:
        A list of rows; each row is a list with one value per search item,
        in ``search_items`` order.  A trailing row that never became
        complete is not returned.
    """
    total = len(search_items)
    data = []
    # Store values by search-item position rather than by discovery order,
    # so a value cannot land in the wrong column when items appear out of
    # order or one is missing from a line.
    row, filled = [None] * total, set()
    if debug_on: print('\nsearch_items set to:\n' + str(search_items) + '\n')
    with open(file_path, "r") as f:
        # loop through the lines in the file...
        for line in f:
            if debug_on: print('\n..line set to:\n ' + line)
            # loop through search items on each line...
            for pos, i in enumerate(search_items):
                if debug_on: print('...item set to:\n ' + i[0])
                if i[0] not in line:
                    continue
                val = find_val_in_line(i[0], i[1], line)
                if pos in filled:
                    # already present in this row: overwrite, do not recount
                    row[pos] = val
                    if debug_on: print('.....repeat item found:- ' + i[0] + ':' + str(val))
                else:
                    filled.add(pos)
                    row[pos] = val
                    if debug_on: print('....item found, row set to:\n ' + str(row))
                # if we have a full row then add row to data and start again...
                if len(filled) == total:
                    if debug_on: print('......row complete, appending to data\n')
                    data.append(row)
                    row, filled = [None] * total, set()
    return data
def main():
    """Parse one xml file and write the matched values to a CSV file.

    The file to parse is taken from the first command line argument; when no
    argument is given, a file dialog is shown until a file has been chosen.
    Matches are written to ``tmp_matches.csv`` in the user's home directory
    (the working directory is changed to it first), and the file is then
    opened with the associated application where the platform supports it.
    """
    path, matches = None, []
    # 'userprofile' is normally only set on Windows; fall back to the home
    # directory so that os.chdir() is never handed None.
    os.chdir(os.getenv('userprofile') or os.path.expanduser('~'))
    # check cmd line args provided...
    if len(sys.argv) > 1:
        path = sys.argv[1]
    else:
        while not path:
            try:
                print('Please select a file to be parsed...')
                path = fd.askopenfilename()
            except Exception:
                # Exception, not a bare except: Ctrl-C must still be able to
                # break out of this prompt loop.
                print('Error selecting file, please try again.')
    # search for string in each file...
    try:
        matches = find_vals_in_file(path)
    except Exception:
        lines = traceback.format_exception(*sys.exc_info())
        print("An error occurred checking file: %s" % (path,))
        print(''.join('!! ' + line for line in lines))
    # write output to file...
    if not matches:
        print("No matches were found in the files reviewed.")
        return
    heads = get_heads()
    output_path = os.path.join(os.getcwd(), 'tmp_matches.csv')
    write_csv_file(output_path, heads, matches)
    print("Please rename the file if you wish to keep it as it will be overwritten...")
    print("\nOpening file...")
    # os.startfile only exists on Windows.
    if hasattr(os, 'startfile'):
        os.startfile(output_path)
    if debug_on:
        print("\nWriting output to screen\n" + "-" * 24)
        print(heads)
        for row in matches:
            print(row)
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
希望这对你有用。到目前为止,我只测试了几个不同的 xml 文件,但对我来说还可以。