最近有人问如何在 python 中执行文件 slurp,并且接受的答案建议如下:
with open('x.txt') as x: f = x.read()
我将如何执行此操作以读取文件并转换数据的字节序表示?
例如,我有一个 1GB 的二进制文件,它只是一堆打包为大端序的单精度浮点数,我想将它转换为小端序并转储到一个 numpy 数组中。下面是我为完成此任务而编写的函数以及调用它的一些真实代码。我使用struct.unpack
进行字节序转换并尝试使用mmap
.
那么我的问题是,我是否正确使用 slurp 和mmap
and struct.unpack
?有没有更清洁、更快的方法来做到这一点?现在我有什么作品,但我真的很想学习如何做得更好。
提前致谢!
#!/usr/bin/python
from struct import unpack
import mmap
import numpy as np
def mmapChannel(arrayName, fileName, channelNo, line_count, sample_count):
"""
We need to read in the asf internal file and convert it into a numpy array.
It is stored as a single row, and is binary. Thenumber of lines (rows), samples (columns),
and channels all come from the .meta text file
Also, internal format files are packed big endian, but most systems use little endian, so we need
to make that conversion as well.
Memory mapping seemed to improve the ingestion speed a bit
"""
# memory-map the file, size 0 means whole file
# length = line_count * sample_count * arrayName.itemsize
print "\tMemory Mapping..."
with open(fileName, "rb") as f:
map = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
map.seek(channelNo*line_count*sample_count*arrayName.itemsize)
for i in xrange(line_count*sample_count):
arrayName[0, i] = unpack('>f', map.read(arrayName.itemsize) )[0]
# Same method as above, just more verbose for the maintenance programmer.
# for i in xrange(line_count*sample_count): #row
# be_float = map.read(arrayName.itemsize) # arrayName.itemsize should be 4 for float32
# le_float = unpack('>f', be_float)[0] # > for big endian, < for little endian
# arrayName[0, i]= le_float
map.close()
return arrayName
print "Initializing the Amp HH HV, and Phase HH HV arrays..."
HHamp = np.ones((1, line_count*sample_count), dtype='float32')
HHphase = np.ones((1, line_count*sample_count), dtype='float32')
HVamp = np.ones((1, line_count*sample_count), dtype='float32')
HVphase = np.ones((1, line_count*sample_count), dtype='float32')
print "Ingesting HH_Amp..."
HHamp = mmapChannel(HHamp, 'ALPSRP042301700-P1.1__A.img', 0, line_count, sample_count)
print "Ingesting HH_phase..."
HHphase = mmapChannel(HHphase, 'ALPSRP042301700-P1.1__A.img', 1, line_count, sample_count)
print "Ingesting HV_AMP..."
HVamp = mmapChannel(HVamp, 'ALPSRP042301700-P1.1__A.img', 2, line_count, sample_count)
print "Ingesting HV_phase..."
HVphase = mmapChannel(HVphase, 'ALPSRP042301700-P1.1__A.img', 3, line_count, sample_count)
print "Reshaping...."
HHamp_orig = HHamp.reshape(line_count, -1)
HHphase_orig = HHphase.reshape(line_count, -1)
HVamp_orig = HVamp.reshape(line_count, -1)
HVphase_orig = HVphase.reshape(line_count, -1)