我有一个项目,我想在购物车中查找商品之间的关联规则。为此,我使用 Sql Server 中的 ML 服务(Python),并使用mlxtend
库查找关联规则。但是问题我有的是该fpgrowth
函数显然使用了大量内存,以至于它停止工作并给出错误。尽可能使用sql server进行数据预处理以提高效率。
部分代码:
-- =============================================
-- Author: Me
-- Create date: 2021-01-01
-- Description: Association Rule
-- =============================================
--pip install mlxtend==0.16.0 --no-cache-dir
CREATE PROCEDURE [Shopping].[CalculateAssociationRule]
@CompanyID UNIQUEIDENTIFIER,
@confidence DECIMAL(7,6) =0.5 --DEFAULT
AS
BEGIN
-- SET NOCOUNT ON added to prevent extra result sets from
-- interfering with SELECT statements.
SET NOCOUNT ON;
-- Declare Variable
DECLARE @input_query NVARCHAR(MAX) = N'SELECT Int_DocShoppingID [TID],Int_StuffID [SID] FROM ##DocDetails'
DECLARE @Pattern VARCHAR(150)=N'[0-9]+\.?[0-9,.]*'
-- Declare Python Code
DECLARE @pyScript NVARCHAR(max)='
# import lib
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori,fpgrowth
from mlxtend.frequent_patterns import association_rules
#Change Data Shape to TranEncoder
#dataset=[data.SID.tolist() for id, data in data.groupby("TID")]
l=data[''list''].tolist()
del data
dataset=[i.split('','') for i in l]
del l
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
del dataset
#Run Association Rule (FPG) Algorithm
Result =pd.DataFrame(columns=[''antecedents'',''consequents'',''antecedent support'',''consequent support'',''support'',''confidence'',''lift'' ,''leverage'',''conviction''])
frequent_itemsets = fpgrowth(df, min_support=0.02, use_colnames=True)
if frequent_itemsets.empty == False:
Result=association_rules(frequent_itemsets, metric="confidence", min_threshold='+CAST(@confidence AS VARCHAR(max))+')
del frequent_itemsets
Result[''antecedents'']= Result[''antecedents''].astype(str)
Result[''consequents'']= Result[''consequents''].astype(str)
#Result Output
OutputDataSet=Result
'
EXECUTE sys.sp_execute_external_script
@language = N'python37'
,@script = @pyScript
,@input_data_1 = @input_query
,@input_data_1_name = N'data'
--WITH result sets (([antecedents] VARCHAR(max),[consequents] VARCHAR(max),[antecedent support] VARCHAR(max),[consequent support] VARCHAR(max),[support] VARCHAR(max),[confidence] VARCHAR(max),[lift] VARCHAR(max),[leverage] VARCHAR(max),[conviction] VARCHAR(max)));
END
蟒蛇代码:
# import lib
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori,fpgrowth
from mlxtend.frequent_patterns import association_rules
#Change Data Shape to TranEncoder
#dataset=[data.SID.tolist() for id, data in data.groupby("TID")]
l=data['list'].tolist()
del data
dataset=[i.split(',') for i in l]
del l
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
del dataset
#Run Association Rule (FPG) Algorithm
Result =pd.DataFrame(columns=['antecedents','consequents','antecedent support','consequent support','support','confidence','lift' ,'leverage','conviction'])
frequent_itemsets = fpgrowth(df, min_support=0.02, use_colnames=True)
if frequent_itemsets.empty == False:
Result=association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
del frequent_itemsets
Result['antecedents']= Result['antecedents'].astype(str)
Result['consequents']= Result['consequents'].astype(str)
#Result Output
OutputDataSet=Result
错误:内存错误
Msg 39004, Level 16, State 20, Line 3
A 'python37' script error occurred during execution of 'sp_execute_external_script' with HRESULT 0x80004004.
STDOUT message(s) from external script:
2021-12-08 10:49:22.38 Error: Python error: <class 'MemoryError'>: File "<string>", line 16, in <module>
File "C:\Program Files\Python37\lib\site-packages\mlxtend\frequent_patterns\fpgrowth.py", line 72, in fpgrowth
File "C:\Program Files\Python37\lib\site-packages\mlxtend\frequent_patterns\fpcommon.py", line 38, in generate_itemsets
项目中使用的工具:
1. Sql Sever 2019-CU14
2.Python 3.7.9 (External Language in sql server)
3.Python Lib And Version
Package Version
--------------- -------
cycler 0.11.0
fonttools 4.28.3
joblib 1.1.0
kiwisolver 1.3.2
matplotlib 3.5.0
mlxtend 0.17.0
numpy 1.21.4
packaging 21.3
pandas 1.3.4
Pillow 8.4.0
pip 21.3.1
pyodbc 4.0.32
pyparsing 3.0.6
python-dateutil 2.8.2
pytz 2021.3
scikit-learn 1.0.1
scipy 1.7.3
setuptools 47.1.0
setuptools-scm 6.3.2
six 1.16.0
threadpoolctl 3.0.0
tomli 1.2.2
------------------------
4. 16GB Ram & Cpu core 12
有没有解决方案可以更有效地做到这一点并避免错误?