I need to read a large volume of data from Teradata (approx. 800M records). My code works fine for a million records, but for larger sets it spends a long time building the metadata. Can someone suggest how to make this faster? Below is the code snippet I use in my application.
import pandas as pd
from dask import delayed
from dask.dataframe import from_delayed
from sqlalchemy import create_engine

def get_partitions(num_partitions):
    # Split the 3240 Teradata AMPs into contiguous (start, end) ranges.
    list_range = []
    amp_range = 3240 // num_partitions  # size of each range; loop-invariant
    for i in range(num_partitions):
        # The first range starts at 0; each later range starts just past
        # the previous range's end.
        start = 0 if i == 0 else i * amp_range + 1
        end = (i + 1) * amp_range
        list_range.append((start, end))
    return list_range
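For example, with four partitions this produces:

>>> get_partitions(4)
[(0, 810), (811, 1620), (1621, 2430), (2431, 3240)]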
@delayed
def load(query, start, end, connString):
    # Each task opens its own SQLAlchemy engine, reads one AMP range,
    # and disposes of the engine so connections are not leaked across workers.
    engine = create_engine(connString)
    try:
        df = pd.read_sql(query.format(start, end), engine)
    finally:
        engine.dispose()
    return df
connString = "teradatasql://{user}:{password}@{hostname}/?logmech={logmech}&encryptdata=true"
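For reference, query is a plain SQL template with two positional placeholders; the database, table, and column names below are stand-ins for my actual SQL:

query = "SELECT * FROM mydb.mytable WHERE amp_id BETWEEN {} AND {}"  # hypothetical table/column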
results = from_delayed([load(query, start, end, connString) for start, end in get_partitions(num_partitions)])
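I suspect part of the slowdown is that, without an explicit meta, from_delayed executes one of the delayed partitions eagerly just to infer the column schema. A minimal sketch of what I am considering, assuming the schema is known up front (the column names and dtypes below are placeholders for my real ones):

# Empty frame describing the expected columns/dtypes
# (placeholder schema; substitute the real one).
meta = pd.DataFrame({"id": pd.Series(dtype="int64"),
                     "value": pd.Series(dtype="float64")})

# Passing meta= lets dask skip the schema-inference step, which otherwise
# runs one partition up front to discover the columns.
results = from_delayed(
    [load(query, start, end, connString)
     for start, end in get_partitions(num_partitions)],
    meta=meta,
)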