我正在尝试使用 hdf5 (pytables) 将 2 个具有内存限制的大矩阵相乘,但函数 numpy.dot 似乎给了我错误:
值错误:数组太大
我需要自己做矩阵乘法,可能是块状的,或者还有其他类似于 numpy.dot 的 python 函数?
import numpy as np
import time
import tables
import cProfile
import numexpr as ne
n_row=10000
n_col=100
n_batch=10
rows = n_row
cols = n_col
batches = n_batch
atom = tables.UInt8Atom() #?
filters = tables.Filters(complevel=9, complib='blosc') # tune parameters
fileName_a = 'C:\carray_a.h5'
shape_a = (rows*batches, cols) # predefined size
h5f_a = tables.open_file(fileName_a, 'w')
ca_a = h5f_a.create_carray(h5f_a.root, 'carray', atom, shape_a, filters=filters)
for i in range(batches):
data = np.random.rand(rows,cols)
ca_a[i*rows:(i+1)*rows]= data[:]
#h5f_0.close()
rows = n_col
cols = n_row
batches = n_batch
fileName_b = 'C:\carray_b.h5'
shape_b = (rows, cols*batches) # predefined size
h5f_b = tables.open_file(fileName_b, 'w')
ca_b = h5f_b.create_carray(h5f_b.root, 'carray', atom, shape_b, filters=filters)
#need to batch by cols
sz= rows/batches
for i in range(batches):
data = np.random.rand(sz, cols*batches)
ca_b[i*sz:(i+1)*sz]= data[:]
#h5f_1.close()
rows = n_batch*n_row
cols = n_batch*n_row
fileName_c = 'C:\carray_c.h5'
shape_c = (rows, cols) # predefined size
h5f_c = tables.open_file(fileName_c, 'w')
ca_c = h5f_c.create_carray(h5f_c.root, 'carray', atom, shape_c, filters=filters)
a= h5f_a.root.carray#[:]
b= h5f_b.root.carray#[:]
c= h5f_c.root.carray
t0= time.time()
c= np.dot(a,b) #error if aray is big
print (time.time()-t0)
更新:这里是代码。这很有趣,但使用 hdf5 运行得更快。
import numpy as np
import tables
import time
sz= 100 #chunk size
n_row=10000 #m
n_col=1000 #n
#for arbitrary size
A=np.random.rand(n_row,n_col)
B=np.random.rand(n_col,n_row)
# A=np.random.randint(5, size=(n_row,n_col))
# B=np.random.randint(5, size=(n_col,n_row))
#using numpy array
#C= np.zeros((n_row,n_row))
#using hdf5
fileName_C = 'CArray_C.h5'
atom = tables.Float32Atom()
shape = (A.shape[0], B.shape[1])
Nchunk = 128 # ?
chunkshape = (Nchunk, Nchunk)
chunk_multiple = 1
block_size = chunk_multiple * Nchunk
h5f_C = tables.open_file(fileName_C, 'w')
C = h5f_C.create_carray(h5f_C.root, 'CArray', atom, shape, chunkshape=chunkshape)
sz= block_size
t0= time.time()
for i in range(0, A.shape[0], sz):
for j in range(0, B.shape[1], sz):
for k in range(0, A.shape[1], sz):
C[i:i+sz,j:j+sz] += np.dot(A[i:i+sz,k:k+sz],B[k:k+sz,j:j+sz])
print (time.time()-t0)
t0= time.time()
res= np.dot(A,B)
print (time.time()-t0)
print (C== res)
h5f_C.close()