1
import pandas as pd
import numpy as np
import re
import seaborn as sns

import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
from sklearn import  metrics, model_selection
from xgboost.sklearn import XGBClassifier
import warnings
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import ensemble



warnings.filterwarnings('ignore')

train = pd.read_csv('E:/vishnu vardhan/EM19424AVN1007WRPYTH/data/data/train.csv (6)/train.csv')


test = pd.read_csv('E:/vishnu vardhan/EM19424AVN1007WRPYTH/data/data/test.csv (2)/test.csv')
test_labels=pd.read_csv('E:/vishnu vardhan/EM19424AVN1007WRPYTH/data/data/test_labels.csv/test_labels.csv')

print("\nTrain Data")
print("==========\n",train)
print("\nTest Data")
print("==========\n",test)
print("\nTest_labels Data")
print("================\n",test_labels)
sns.barplot(x='toxic', y='identity_hate', data=train);
plt.show()
print("\n\nTrain data shape:",train.shape)
print("\nTest data shape:",test.shape)
print("\nTestLabels data shape:",test_labels.shape)
print("\nCorrelation matrix")
print("==================")
plt.title('Correlation Matrix')

sns.heatmap(train.corr())
plt.show()

print("\n Data Descriptive")
print("================\n",train.describe())

xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
print("\nRegressor")
print("===========\n",xg_reg)


X = test_labels.iloc[:,1:6].values
Y = test_labels.iloc[:,6].values
#print("X value\n",X,"\n\nY value \n",Y)

# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

model = XGBClassifier()
model.fit(X_train, y_train)
print("\n Classifier")
print("============\n",model)
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

params = {
    'n_estimators': 1,
    'max_depth': 1,
    'learning_rate': 1,
    'criterion': 'mse'
}

gradient_boosting_regressor = ensemble.GradientBoostingRegressor(**params)

gradient_boosting_regressor.fit(X, Y)

plt.figure(figsize=(10, 5))
plt.title('Gradient Boosting model (1 estimators, Single tree split)')
plt.scatter(X, Y)
plt.plot(X, gradient_boosting_regressor.predict(X), color='r')
plt.show()

执行上述代码时会发生此错误。

"raise ValueError("x 和 y 的大小必须相同")"

我有.csv1398 行和 2 列的文件。我已将 40% 作为 y_test 集,因为它在上面的代码中可见。

4

0 回答 0