pandas.read_sql 比直接使用 teradatasql 驱动要慢。
这是一个简单的 Python 脚本,用于测试 500 万行、60 列的数据,其中 80% 的列值非 NULL、20% 的列值为 NULL:
# Benchmark: compare teradatasql cursor.fetchall() against pandas.read_sql()
# on a 5,000,000-row x 60-column table where 48 of the 60 columns (80%) are
# populated and the remaining 12 (20%) are left NULL.
import time

import pandas
import teradatasql

# Context managers guarantee the connection and cursor are closed even if an
# intermediate statement raises.
with teradatasql.connect(host="whomooz", user="guest", password="please") as con:
    with con.cursor() as cur:
        # Session-scoped volatile table with 60 integer columns c1..c60.
        cur.execute("create volatile table voltab (" + ",".join(["c{} integer".format(n) for n in range(1, 61)]) + ") on commit preserve rows")
        # Seed 62,500 rows, then double the row count with each self-insert.
        cur.execute("insert into voltab(c1) select row_number() over (order by calendar_date) as c1 from sys_calendar.calendar qualify c1 <= 62500")
        cur.execute("insert into voltab(c1) select c1 + 62500 from voltab")
        cur.execute("insert into voltab(c1) select c1 + 125000 from voltab")
        cur.execute("insert into voltab(c1) select c1 + 250000 from voltab")
        cur.execute("insert into voltab(c1) select c1 + 500000 from voltab")
        cur.execute("insert into voltab(c1) select c1 + 1000000 from voltab")
        cur.execute("insert into voltab(c1) select c1 + 2000000 from voltab")
        # Final partial doubling takes the table from 4M to exactly 5M rows.
        cur.execute("insert into voltab(c1) select c1 + 4000000 from voltab where c1 <= 1000000")
        # Copy c1 into c2..c48 so that 48 of the 60 columns are non-NULL;
        # c49..c60 stay NULL (the 20% NULL share described above).
        cur.execute("update voltab set " + ",".join(["c{} = c1".format(n) for n in range(2, 49)]))

        # --- Benchmark 1: raw driver cursor.fetchall() ---
        cur.execute("select * from voltab")
        print('beginning fetchall')
        dStartTime = time.time()
        rows = cur.fetchall()
        dElapsed = time.time() - dStartTime
        print("fetchall took {} seconds, or {} minutes, and returned {} rows".format(dElapsed, dElapsed / 60, len(rows)))

        # --- Benchmark 2: pandas.read_sql over the same open connection ---
        # NOTE(review): time.time() is wall-clock; time.perf_counter() would be
        # the preferred benchmark timer, left unchanged to keep output comparable.
        dStartTime = time.time()
        df = pandas.read_sql("select * from voltab", con)
        dElapsed = time.time() - dStartTime
        print("read_sql took {} seconds, or {} minutes, and returned {} rows".format(dElapsed, dElapsed / 60, len(df)))
我的结果是:
fetchall took 638.6090559959412 seconds, or 10.64348426659902 minutes, and returned 5000000 rows
read_sql took 2293.84486413002 seconds, or 38.23074773550034 minutes, and returned 5000000 rows