我正在使用 pyathena 库来查询模式并将其存储在 pandas 数据框中。我有一个包含至少 30,000 个项目的列表。
例如:l1 = [1, 2, 3, 4, ..., 29999, 30000]
现在我想在 sql 查询中传递这个列表项。由于我不能一次传递所有 30,000 个列表项,因此,我将列表分成 30 个块,并循环传递每个块,如下所示:
注意:我尝试将其分成更少的块,但每块 1000 个项目似乎是最佳选择。
#function to divide list into chunks
def divide_chunks(l, n):
    """Yield successive slices of *l*, each holding at most *n* items.

    Args:
        l: A sliceable sequence (anything supporting ``len`` and slicing).
        n: Maximum number of items per chunk; must be a positive integer.

    Yields:
        Consecutive slices ``l[i:i + n]``; the last one may be shorter.

    Raises:
        ValueError: If *n* is smaller than 1 (raised on first iteration).
    """
    if n < 1:
        raise ValueError("chunk size n must be a positive integer")
    # Step through the sequence n items at a time until its end
    for i in range(0, len(l), n):
        yield l[i:i + n]
# How many elements each sub-list (and therefore each IN (...) list) may have
n = 1000
# x is a list of sub-lists of l1, each holding at most n items
x = list(divide_chunks(l1, n))
# Gather every chunk's result and concatenate once after the loop:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every call.
# NOTE(review): finalResult is expected to exist already; it is not created
# in this excerpt.
frames = [finalResult]
for chunk in x:
    # Convert the sub-list to a comma-separated string for the IN clause
    y = str(chunk).replace("[", "").replace("]", "")
    # The values must be spliced into the SQL text; a bare `y` inside the
    # string literal would be sent to the database as an identifier.
    queryResult = pd.read_sql("SELECT * from abc where col1 IN (" + y + ")", conn)
    frames.append(queryResult)
# Append all query results to the final pandas dataframe in one step
finalResult = pd.concat(frames)
在第一次迭代中它工作正常,但对于剩余的迭代,我得到以下错误:(有时它也会进入第 2 和第 3 次迭代,但仅此而已)
OperationalError Traceback (most recent call last)
c:\users\my_user_name\appdata\local\programs\python\python37-32\lib\site-packages\pandas\io\sql.py in execute(self, *args, **kwargs)
1594 else:
-> 1595 cur.execute(*args)
1596 return cur
c:\users\my_user_name\appdata\local\programs\python\python37-32\lib\site-packages\pyathena\util.py in _wrapper(*args, **kwargs)
27 with _lock:
---> 28 return wrapped(*args, **kwargs)
29 return _wrapper
c:\users\my_user_name\appdata\local\programs\python\python37-32\lib\site-packages\pyathena\cursor.py in execute(self, operation, parameters, work_group, s3_staging_dir)
54 else:
---> 55 raise OperationalError(query_execution.state_change_reason)
56 return self
OperationalError: Query exhausted resources at this scale factor
During handling of the above exception, another exception occurred:
NotSupportedError Traceback (most recent call last)
c:\users\my_user_name\appdata\local\programs\python\python37-32\lib\site-packages\pandas\io\sql.py in execute(self, *args, **kwargs)
1598 try:
-> 1599 self.con.rollback()
1600 except Exception: # pragma: no cover
c:\users\my_user_name\appdata\local\programs\python\python37-32\lib\site-packages\pyathena\connection.py in rollback(self)
141 def rollback(self):
--> 142 raise NotSupportedError
NotSupportedError:
During handling of the above exception, another exception occurred:
DatabaseError Traceback (most recent call last)
<ipython-input-39-90403d399324> in <module>
290 start = time.localtime()
291
--> 292 weuData()
293 #print(weuResult)
294 naData()
<ipython-input-39-90403d399324> in weuData()
127
128
--> 129 queryResult = pd.read_sql("SELECT * from abc where col1 IN("+y+")", conn2)
130 finalResult= finalResult.append(queryResult)
c:\users\my_user_name\appdata\local\programs\python\python37-32\lib\site-packages\pandas\io\sql.py in read_sql(sql, con, index_col, coerce_float, params, parse_dates, columns, chunksize)
408 coerce_float=coerce_float,
409 parse_dates=parse_dates,
--> 410 chunksize=chunksize,
411 )
412
c:\users\my_user_name\appdata\local\programs\python\python37-32\lib\site-packages\pandas\io\sql.py in read_query(self, sql, index_col, coerce_float, params, parse_dates, chunksize)
1643
1644 args = _convert_params(sql, params)
-> 1645 cursor = self.execute(*args)
1646 columns = [col_desc[0] for col_desc in cursor.description]
1647
c:\users\my_user_name\appdata\local\programs\python\python37-32\lib\site-packages\pandas\io\sql.py in execute(self, *args, **kwargs)
1603 "to rollback".format(sql=args[0], exc=exc)
1604 )
-> 1605 raise_with_traceback(ex)
1606
1607 ex = DatabaseError(
c:\users\my_user_name\appdata\local\programs\python\python37-32\lib\site-packages\pandas\compat\__init__.py in raise_with_traceback(exc, traceback)
42 if traceback == Ellipsis:
43 _, _, traceback = sys.exc_info()
---> 44 raise exc.with_traceback(traceback)
45
46
c:\users\my_user_name\appdata\local\programs\python\python37-32\lib\site-packages\pandas\io\sql.py in execute(self, *args, **kwargs)
1597 except Exception as exc:
1598 try:
-> 1599 self.con.rollback()
1600 except Exception: # pragma: no cover
1601 ex = DatabaseError(
c:\users\my_user_name\appdata\local\programs\python\python37-32\lib\site-packages\pyathena\connection.py in rollback(self)
140
141 def rollback(self):
--> 142 raise NotSupportedError
Execution failed on sql: SELECT * from abc where col1 IN (1001,1002.......2000)
Query exhausted resources at this scale factor
unable to rollback