-3

我正在尝试预测“Full_Time_Home_Goals”列(特征)。我参照了 Kaggle 上的一个示例。该示例代码在示例中的数据维度下(测试数据 419 行,训练数据 892 行)可以正常运行。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# %matplotlib inline

# Widen pandas' display limits so printed frames are not truncated.
# If the data has more rows/columns than these limits, raise the numbers accordingly.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Load the training and test tables from absolute paths on the author's machine.
data_train = pd.read_csv(r"C:\Users\harsh\Documents\My Dream\Desktop\Machine Learning\Attempt 3\train.csv")
data_test = pd.read_csv(r"C:\Users\harsh\Documents\My Dream\Desktop\Machine Learning\Attempt 3\test.csv")


# `columns` is the training selection (identifier, teams, target);
# `col` is the test selection (same columns without the target).
columns = ['Id', 'HomeTeam', 'AwayTeam', 'Full_Time_Home_Goals']
col = ['Id', 'HomeTeam', 'AwayTeam']
data_test = data_test[col]
data_train = data_train[columns]

# Discard every row that has a missing value in any of the kept columns.
data_train = data_train.dropna()
data_test = data_test.dropna()

# Store the goal counts as integers.
data_train['Full_Time_Home_Goals'] = data_train['Full_Time_Home_Goals'].astype(int)

from sklearn import preprocessing


def encode_features(df_train, df_test):
    """Replace team names with integer codes in both frames.

    For each of ``HomeTeam`` and ``AwayTeam`` a ``LabelEncoder`` is fitted on
    the train and test values stacked together, so both frames share one
    name-to-integer mapping per column.

    The two frames are modified in place and also returned as a
    ``(df_train, df_test)`` tuple.
    """
    team_columns = ['HomeTeam', 'AwayTeam']
    # Snapshot the raw names of both frames before any column is overwritten.
    stacked = pd.concat([df_train[team_columns], df_test[team_columns]])

    for column in team_columns:
        encoder = preprocessing.LabelEncoder().fit(stacked[column])
        # Train first, then test, each column replaced by its integer codes.
        for frame in (df_train, df_test):
            frame[column] = encoder.transform(frame[column])

    return df_train, df_test


# Encode the team-name columns of both tables and print the first rows as a sanity check.
data_train, data_test = encode_features(data_train, data_test)
print(data_train.head())
print(data_test.head())

# X_all would contain all columns required for prediction and y_all would have that one column we want to predict

# NOTE(review): X_all is the whole training frame, so it still includes the
# target column 'Full_Time_Home_Goals' next to 'Id', 'HomeTeam' and 'AwayTeam'
# (four columns in total).
X_all = data_train

# Target: the home side's full-time goal count.
y_all = data_train['Full_Time_Home_Goals']

from sklearn.model_selection import train_test_split

num_test = 0.20  # 80-20 split
# Fixed random_state makes the train/validation split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Using Random Forest and using parameters that we defined

# Start from a default random forest; the grid search below chooses its hyper-parameters.
clf = RandomForestClassifier()

# Candidate values; GridSearchCV tries every combination of them.
# NOTE(review): 'auto' is not accepted for max_features by newer scikit-learn
# releases — confirm against the installed version.
parameters = {'n_estimators': [4, 6, 9],
              'max_features': ['log2', 'sqrt', 'auto'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1, 5, 8]
              }

# Wrap accuracy_score so it can be handed to GridSearchCV as the scoring function.
acc_scorer = make_scorer(accuracy_score)

# Search the grid on the training split, scoring each candidate by accuracy.
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Keep the best-scoring configuration found by the search.
clf = grid_obj.best_estimator_

# Fit the selected estimator on the training split.
clf.fit(X_train, y_train)

# Predict on the held-out split produced by train_test_split (not on test.csv).
predictions = clf.predict(X_test)

我得到的错误是:

  1. 代码如下:

    Traceback (most recent call last):
      File "C:/Users/harsh/PycharmProjects/Kaggle-Machine Learning from Start to Finish with Scikit-Learn/EPL Predicting.py", line 98, in <module>
        predictions = clf.predict(data_test.drop('Id', axis=1))
      File "C:\Users\harsh\PycharmProjects\GitHub\venv\lib\site-packages\sklearn\ensemble\_forest.py", line 629, in predict
    ValueError: Number of features of the model must match the input. Model n_features is 4 and input n_features is 2

  2. 将代码从 `predictions = clf.predict(data_test.drop('Id', axis=1))` 改为 `predictions = clf.predict(X_test)` 之后,错误变为:

     raise ValueError(msg) ValueError: array length 37921 does not match index length 380
    

我该如何解决这个问题?

我使用的数据集可以在这里找到

4

1 回答 1

0

以下是您的经过测试且可以正常工作的代码:

# Read both tables from the current working directory.
data_train = pd.read_csv(r"train.csv")
data_test = pd.read_csv(r"test.csv")

# Column selections: the test table has identifier and teams only,
# the training table additionally carries the target column.
col = ['Id', 'HomeTeam', 'AwayTeam']
columns = ['Id', 'HomeTeam', 'AwayTeam', 'Full_Time_Home_Goals']

# Trim each table to its selection and discard rows with missing values.
data_test = data_test[col].dropna()
# For the training table, also store the goal counts as integers.
data_train = data_train[columns].dropna().astype({'Full_Time_Home_Goals': int})

from sklearn import preprocessing


def encode_features(df_train, df_test):
    """Label-encode the team-name columns of the train and test frames.

    One ``LabelEncoder`` per column is fitted on the train and test values
    combined, so a given team name maps to the same integer in both frames
    and ``transform`` never meets a label it was not fitted on.

    Args:
        df_train: DataFrame with ``HomeTeam`` and ``AwayTeam`` columns.
        df_test: DataFrame with ``HomeTeam`` and ``AwayTeam`` columns.

    Returns:
        A ``(df_train, df_test)`` tuple of new DataFrames whose two team
        columns hold integer codes. The frames passed in are not modified.
    """
    features = ['HomeTeam', 'AwayTeam']

    # Work on copies: the encoded frames are returned, so overwriting the
    # caller's objects as well would be a surprising side effect.
    df_train = df_train.copy()
    df_test = df_test.copy()

    # Combine the raw names once, before any column is replaced by codes.
    df_combined = pd.concat([df_train[features], df_test[features]])

    for feature in features:
        le = preprocessing.LabelEncoder().fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return df_train, df_test


from sklearn.model_selection import train_test_split

# Encode the team-name columns of both tables, then print the first rows of each.
data_train, data_test = encode_features(data_train, data_test)
for frame in (data_train, data_test):
    print(frame.head())

# X_all holds every column except the one being predicted; y_all holds that column.
# NOTE(review): 'Id' stays in X_all, so the model also receives the row
# identifier as a feature — confirm that is intended.
X_all = data_train.drop(columns=['Full_Time_Home_Goals'])
y_all = data_train['Full_Time_Home_Goals']

# 80-20 split; the fixed random_state keeps it repeatable.
num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=num_test, random_state=23
)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Tune a random forest with an exhaustive grid search, scored by accuracy.

clf = RandomForestClassifier()

# Candidate hyper-parameter values; GridSearchCV tries every combination.
parameters = {'n_estimators': [4, 6, 9],
              # 'auto' is intentionally not listed: for classifiers it meant
              # the same as 'sqrt', and scikit-learn 1.3 removed it, so on
              # current releases every candidate using it fails to fit.
              'max_features': ['log2', 'sqrt'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1, 5, 8]
              }

# Wrap accuracy_score so it can be handed to GridSearchCV as the scoring function.
acc_scorer = make_scorer(accuracy_score)

grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# With GridSearchCV's default refit=True, best_estimator_ has already been
# refitted on the whole training split, so no further fit() call is needed.
clf = grid_obj.best_estimator_

# Predict on the held-out split for evaluation.
predictions = clf.predict(X_test)

# Report accuracy on the held-out split.
holdout_accuracy = accuracy_score(y_test, predictions)
print(holdout_accuracy)

# Predict for the real test table, using the same columns the model was trained on.
predictions = clf.predict(data_test)
ids = data_test['Id']

# Pair each test identifier with its prediction; the bare name on the last
# line displays the frame in a notebook.
df_preds = pd.DataFrame(dict(id=ids, predictions=predictions))
df_preds

   Id  HomeTeam  AwayTeam  Full_Time_Home_Goals
0   1        55       440                     3
1   2       158       493                     2
2   3       178       745                     1
3   4       185       410                     1
4   5       249        57                     2
       Id  HomeTeam  AwayTeam
0  190748       284        54
1  190749       124       441
2  190750       446        57
3  190751       185       637
4  190752       749       482
0.33213786556261704
id  predictions
0   190748  1
1   190749  1
2   190750  1
3   190751  1
4   190752  1
... ... ...
375 191123  1
376 191124  1
377 191125  1
378 191126  1
379 191127  1
380 rows × 2 columns
于 2020-09-26T15:53:44.030 回答