0

我正在尝试使用 Sklearn 的线性回归和 statsmodels.api 来构建线性模型。

方法是逐步删除 p 值或 VIF 值超过阈值的变量(保留标准:p 值 < 0.05,VIF < 5)。

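bike_train 的列为:August、December、February、January、July、June、March、May、November、October、September、Monday、Saturday、Sunday、Thursday、Tuesday、Wednesday、Light Snow & Rain、Mist & Cloudy、Spring、Summer、Winter、temp、humidity、windspeed、bike_count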

# Split the target from the predictors. pop() removes 'bike_count' from
# bike_train in place, so X_train ends up holding only the predictor columns.
y_train = bike_train.pop('bike_count')
X_train = bike_train

# Run RFE, keeping 15 predictors.
lm = LinearRegression()
lm.fit(X_train, y_train)
# n_features_to_select is passed by keyword: recent scikit-learn releases
# reject it as a positional argument.
rfe = RFE(lm, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)
# (column, selected?, rank) for every candidate predictor
list(zip(X_train.columns, rfe.support_, rfe.ranking_))
# Columns for which RFE support is True
col = X_train.columns[rfe.support_]
col

输出 :

Index(['December', 'January', 'July', 'June', 'November', 'October',
       'September', 'Sunday', 'Light Snow & Rain', 'Mist & Cloudy', 'Summer',
       'Winter', 'temp', 'humidity', 'windspeed'],
      dtype='object')

模型 1

# Model 1: OLS on the 15 predictors chosen by RFE.
X_train_rfe = X_train[col]
# statsmodels OLS has no implicit intercept, so prepend a 'const' column.
X_train_rfe = sm.add_constant(X_train_rfe)
lm = sm.OLS(y_train, X_train_rfe).fit()
# Strip the intercept column again so X_train_rfe holds only the predictors.
X_train_rfe = X_train_rfe.drop(columns=["const"])
print(lm.summary())

# VIF of every predictor in this model, highest first.
# NOTE(review): X carries no intercept column here and
# variance_inflation_factor uses exog as given - confirm that is intended.
X = X_train_rfe
vif = pd.DataFrame(
    {
        "Features": X.columns,
        "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    }
)
vif["VIF"] = vif["VIF"].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

模型 2

# Model 2: refit after removing "January" from the predictor set.
X_train_new = X_train_rfe.drop(columns=["January"])
X_train_lm = sm.add_constant(X_train_new)
lm_new = sm.OLS(y_train, X_train_lm).fit()
# Remove the intercept column again; the next model starts from X_train_lm.
X_train_lm = X_train_lm.drop(columns=["const"])
print(lm_new.summary())

# VIF of the remaining predictors (without "January"), highest first.
X = X_train_new
vif = pd.DataFrame(
    {
        "Features": X.columns,
        "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    }
)
vif["VIF"] = vif["VIF"].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

模型 3

# Model 3: refit after removing "humidity" from the predictor set.
X_train_new_1 = X_train_lm.drop(columns=["humidity"])
X_train_lm_1 = sm.add_constant(X_train_new_1)
lm_1 = sm.OLS(y_train, X_train_lm_1).fit()
# Remove the intercept column again; the next model starts from X_train_lm_1.
X_train_lm_1 = X_train_lm_1.drop(columns=["const"])
print(lm_1.summary())

# VIF of the remaining predictors (without "humidity"), highest first.
X = X_train_new_1
vif = pd.DataFrame(
    {
        "Features": X.columns,
        "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    }
)
vif["VIF"] = vif["VIF"].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

模型 4

# Model 4: refit after removing "Winter" from the predictor set.
X_train_new_2 = X_train_lm_1.drop(columns=["Winter"])
X_train_lm_2 = sm.add_constant(X_train_new_2)
lm_2 = sm.OLS(y_train, X_train_lm_2).fit()
# Remove the intercept column again; the next model starts from X_train_lm_2.
X_train_lm_2 = X_train_lm_2.drop(columns=["const"])
print(lm_2.summary())

# VIF of the remaining predictors (without "Winter"), highest first.
X = X_train_new_2
vif = pd.DataFrame(
    {
        "Features": X.columns,
        "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    }
)
vif["VIF"] = vif["VIF"].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

模型 5

# Model 5: refit after removing "June" from the predictor set.
X_train_new_3 = X_train_lm_2.drop(columns=["June"])
X_train_lm_3 = sm.add_constant(X_train_new_3)
lm_3 = sm.OLS(y_train, X_train_lm_3).fit()
# Remove the intercept column again; the next model starts from X_train_lm_3.
X_train_lm_3 = X_train_lm_3.drop(columns=["const"])
print(lm_3.summary())

# VIF of the remaining predictors (without "June"), highest first.
X = X_train_new_3
vif = pd.DataFrame(
    {
        "Features": X.columns,
        "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    }
)
vif["VIF"] = vif["VIF"].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

模型 6

# Model 6: refit after removing "July" from the predictor set.
X_train_new_4 = X_train_lm_3.drop(columns=["July"])
X_train_lm_4 = sm.add_constant(X_train_new_4)
lm_4 = sm.OLS(y_train, X_train_lm_4).fit()
# From here on X_train_lm_4 no longer contains the 'const' column that lm_4
# was fitted with.
X_train_lm_4 = X_train_lm_4.drop(columns=["const"])
print(lm_4.summary())

# VIF of the remaining predictors (without "July"), highest first.
X = X_train_new_4
vif = pd.DataFrame(
    {
        "Features": X.columns,
        "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    }
)
vif["VIF"] = vif["VIF"].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

# lm_4 was fitted on a design matrix that includes the 'const' intercept column,
# but X_train_lm_4 had that column dropped after fitting. Re-add it so the
# number of columns matches the number of fitted parameters.
y_train_pred = lm_4.predict(sm.add_constant(X_train_lm_4))

错误 :

ValueError                                Traceback (most recent call last)
<ipython-input-38-f48f554d210b> in <module>
----> 1 y_train_pred = lm_4.predict(X_train_lm_4)

C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\base\model.py in predict(self, exog, transform, *args, **kwargs)
   1097             exog = np.atleast_2d(exog)  # needed in count model shape[1]
   1098 
-> 1099         predict_results = self.model.predict(self.params, exog, *args,
   1100                                              **kwargs)
   1101 

C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\regression\linear_model.py in predict(self, params, exog)
    378             exog = self.exog
    379 
--> 380         return np.dot(exog, params)
    381 
    382     def get_distribution(self, params, scale, exog=None, dist_class=None):

<__array_function__ internals> in dot(*args, **kwargs)

ValueError: shapes (510,10) and (11,) not aligned: 10 (dim 1) != 11 (dim 0)

在创建模型之前,我对所有数值型变量进行了缩放,如下所示:

# Continuous columns to rescale with MinMaxScaler; the fitted scaler is kept
# in `scaler`.
num_vars = [
    "temp",
    "humidity",
    "windspeed",
    "bike_count",
]
scaler = MinMaxScaler()
# Fit on these columns and overwrite them in place with the scaled values.
bike_train[num_vars] = scaler.fit_transform(bike_train[num_vars])
bike_train.head()

请告诉我我哪里做错了,提前谢谢!!

4

0 回答 0