# Third-party imports, grouped per package. The warning filter and the
# matplotlib figure-count setting are kept from the original script.
# (Duplicate `Ridge` import removed; it was imported twice.)
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt

plt.rcParams.update({'figure.max_open_warning': 0})

import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RepeatedKFold,
    cross_val_predict,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler
from sklearn.svm import LinearSVR, SVR
from xgboost import XGBRegressor
# Load the raw features (x), targets (y) and difficulty labels (dif).
x = pd.read_csv('x.csv')
y = pd.read_csv('y.csv')
dif = pd.read_csv('difficulty.csv')

# Min-max scale every feature column onto [0, 1]; the pandas index column
# saved as 'Unnamed: 0' is excluded from scaling.
cols_numeric = list(x.columns)
cols_numeric.remove('Unnamed: 0')


def scale_minmax(col):
    """Linearly rescale one column onto the [0, 1] range."""
    span = col.max() - col.min()
    return (col - col.min()) / span


scale_cols = list(cols_numeric)
x[scale_cols] = x[scale_cols].apply(scale_minmax, axis=0)

# Box-Cox transform each scaled feature; the +1 shift moves the [0, 1]
# values into the strictly positive domain that stats.boxcox requires.
cols_transform = x.columns[1:]
for col in cols_transform:
    x.loc[:, col], _ = stats.boxcox(x.loc[:, col] + 1)
x = x.iloc[:, 1:3] Y = dif.iloc[:, 1] X = x[0:355] y = Y X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=100) y_train = np.array(y_train).reshape(-1, 1) y_valid = np.array(y_valid).reshape(-1, 1)
def get_test_data(row=355):
    """Return one row of the transformed feature table as a (1, n_features) array.

    Parameters
    ----------
    row : int, optional
        Positional index of the row to extract. Defaults to 355 — the single
        held-out sample this script predicts — so existing callers are
        unaffected; the hard-coded constant is now a parameter.
    """
    df_test = x.iloc[row]
    return np.array(df_test).reshape(1, -1)
from sklearn.metrics import make_scorer
def rmse(y_true, y_pred): diff = y_pred - y_true sum_sq = sum(diff ** 2) n = len(y_pred) return np.sqrt(sum_sq / n)
def mse(y_ture, y_pred): return mean_squared_error(y_ture, y_pred)
rmse_scorer = make_scorer(rmse, greater_is_better=False) mse_scorer = make_scorer(mse, greater_is_better=False)
from sklearn.preprocessing import StandardScaler def get_trainning_data_omitoutliers(): X1=X_train.copy() y1=y_train.copy() return X1,y1
def train_model(model, param_grid=None, X=None, y=None, splits=5, repeats=5):
    """Fit *model*, optionally grid-searching *param_grid*, and report CV error.

    Parameters
    ----------
    model : estimator
        sklearn-style regressor; replaced by the grid-search winner when a
        grid is given.
    param_grid : dict or list, optional
        Hyper-parameter grid. ``None`` / empty means: fit the model as-is and
        score it with plain cross-validation.
    X, y : array-like, optional
        Training data; when *y* is omitted both are pulled from
        ``get_trainning_data_omitoutliers()``.
    splits, repeats : int
        RepeatedKFold configuration.

    Returns
    -------
    (model, cv_score, grid_results)
        Fitted estimator, a Series with CV mean/std of the (positive) MSE,
        and the grid-search results DataFrame ([] when no grid was used).

    Note: the original signature used mutable defaults (``[]``); ``None``
    defaults are backward-compatible and avoid the shared-mutable pitfall.
    """
    if y is None or len(y) == 0:
        X, y = get_trainning_data_omitoutliers()
    if param_grid is None:
        param_grid = []

    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)

    if len(param_grid) > 0:
        # Exhaustive search; refit (the GridSearchCV default) leaves
        # best_estimator_ trained on the full X/y.
        gsearch = GridSearchCV(model, param_grid, cv=rkfold,
                               scoring="neg_mean_squared_error",
                               verbose=1, return_train_score=True)
        gsearch.fit(X, y)
        model = gsearch.best_estimator_
        best_idx = gsearch.best_index_
        grid_results = pd.DataFrame(gsearch.cv_results_)
        cv_mean = abs(grid_results.loc[best_idx, 'mean_test_score'])
        cv_std = grid_results.loc[best_idx, 'std_test_score']
    else:
        grid_results = []
        # BUG FIX: the original never fitted the model on this path, so the
        # model.score()/predict() calls below would fail on a fresh
        # estimator. All current call sites pass a grid, so existing
        # behavior is unchanged.
        model.fit(X, y)
        cv_results = cross_val_score(model, X, y,
                                     scoring="neg_mean_squared_error",
                                     cv=rkfold)
        cv_mean = abs(np.mean(cv_results))
        cv_std = np.std(cv_results)

    cv_score = pd.Series({'mean': cv_mean, 'std': cv_std})

    y_pred = model.predict(X).ravel()

    print('----------------------')
    print(model)
    print('----------------------')
    print('score=', model.score(X, y))
    print('mse=', mse(y, y_pred))
    print('cross_val: mean=', cv_mean, ', std=', cv_std)

    return model, cv_score, grid_results
# Registry of tuned models plus a summary table of their CV scores.
opt_models = {}
score_models = pd.DataFrame(columns=['mean', 'std'])

# Shared cross-validation configuration used by the model sections below.
splits = 5
repeats = 5
# --- Ridge regression -------------------------------------------------------
model = 'Ridge'
opt_models[model] = Ridge()

alph_range = np.arange(0.25, 6, 0.25)
param_grid = {'alpha': alph_range}

opt_models[model], cv_score, grid_results = train_model(
    opt_models[model], param_grid=param_grid, splits=splits, repeats=repeats)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same
# single named row.
cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])
# --- Lasso regression -------------------------------------------------------
model = 'Lasso'
opt_models[model] = Lasso()

alph_range = np.arange(1e-4, 1e-3, 4e-5)
param_grid = {'alpha': alph_range}

opt_models[model], cv_score, grid_results = train_model(
    opt_models[model], param_grid=param_grid, splits=splits, repeats=repeats)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])
# --- Linear support-vector regression ---------------------------------------
model = 'LinearSVR'
opt_models[model] = LinearSVR()

crange = np.arange(0.1, 1.0, 0.1)
param_grid = {'C': crange, 'max_iter': [1000]}

opt_models[model], cv_score, grid_results = train_model(
    opt_models[model], param_grid=param_grid, splits=splits, repeats=repeats)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])
# --- ElasticNet regression --------------------------------------------------
model = 'ElasticNet'
opt_models[model] = ElasticNet()

param_grid = {'alpha': np.arange(1e-4, 1e-3, 1e-4),
              'l1_ratio': np.arange(0.1, 1.0, 0.1),
              'max_iter': [100000]}

# repeats=1 here (and for the remaining models): the larger grids make
# repeated CV expensive.
opt_models[model], cv_score, grid_results = train_model(
    opt_models[model], param_grid=param_grid, splits=splits, repeats=1)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])
# --- K-nearest-neighbours regression ----------------------------------------
model = 'KNeighbors'
opt_models[model] = KNeighborsRegressor()

param_grid = {'n_neighbors': np.arange(3, 11, 1)}

opt_models[model], cv_score, grid_results = train_model(
    opt_models[model], param_grid=param_grid, splits=splits, repeats=1)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])
# --- Gradient boosting regression -------------------------------------------
model = 'GradientBoosting'
opt_models[model] = GradientBoostingRegressor()

param_grid = {'n_estimators': [150, 250, 350],
              'max_depth': [1, 2, 3],
              'min_samples_split': [5, 6, 7]}

opt_models[model], cv_score, grid_results = train_model(
    opt_models[model], param_grid=param_grid, splits=splits, repeats=1)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])
# --- XGBoost regression ------------------------------------------------------
model = 'XGB'
opt_models[model] = XGBRegressor()

param_grid = {'n_estimators': [100, 200, 300, 400, 500],
              'max_depth': [1, 2, 3]}

opt_models[model], cv_score, grid_results = train_model(
    opt_models[model], param_grid=param_grid, splits=splits, repeats=1)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])
def model_predict(test_data, test_y=None, stack=False):
    """Average the predictions of all tuned models except LinearSVR/KNeighbors.

    Parameters
    ----------
    test_data : array-like of shape (n_samples, n_features)
        Samples to predict.
    test_y : array-like, optional
        When supplied, per-model and ensemble MSE are printed. ``None``
        replaces the original mutable ``[]`` default with identical
        semantics.
    stack : bool
        Unused; kept only for interface compatibility.

    Returns
    -------
    np.ndarray when test_y is supplied, otherwise pd.Series
        Per-sample mean prediction rounded to 3 decimals (the original only
        wrapped the no-labels path in a Series; that asymmetry is preserved).
    """
    if test_y is None:
        test_y = []
    n_models = 0
    y_predict_total = np.zeros((test_data.shape[0],))
    for name in opt_models.keys():
        # LinearSVR and KNeighbors are deliberately excluded from the ensemble.
        if name != 'LinearSVR' and name != 'KNeighbors':
            y_predict = opt_models[name].predict(test_data).ravel()
            y_predict_total += y_predict
            n_models += 1
            if len(test_y) > 0:
                print("{}_mse:".format(name),
                      mean_squared_error(y_predict, test_y))
    y_predict_mean = np.round(y_predict_total / n_models, 3)
    if len(test_y) > 0:
        print("mean_mse:", mean_squared_error(y_predict_mean, test_y))
    else:
        y_predict_mean = pd.Series(y_predict_mean)
    return y_predict_mean


# Report ensemble error on the validation split, then predict the held-out
# row 355 with the two excluded models and average their outputs.
model_predict(X_valid, y_valid)

df_test = x.iloc[355]
df_test = np.array(df_test).reshape(1, -1)

model = opt_models['KNeighbors']
res1 = model.predict(df_test)
model = opt_models['XGB']
res2 = model.predict(df_test)
res = (res1 + res2) / 2
x_1 = X_train.iloc[:,0] x_2 = X_valid.iloc[:,0] y_train = y_train.ravel() y_valid = y_train.ravel() plt.scatter(x_1,y_train,s = 10*10,c='orange',edgecolor='k') res1=res1.ravel() res1=res1[0] df_test = x.iloc[355] df_test = df_test.iloc[0] plt.scatter(df_test,res1,s = 30*10,c='b',edgecolor='k') model = opt_models['KNeighbors'] z = model.predict(X_valid) z = z.ravel() plt.scatter(x_2,z,s = 20*10,c='k',edgecolor='k') plt.title('KNN Regressor') plt.show(block=True)
x_1 = X_train.iloc[:,0] x_2 = X_valid.iloc[:,0] y_train = y_train.ravel() y_valid = y_train.ravel() plt.scatter(x_1,y_train,s = 10*10,c='orange',edgecolor='k') res2=res2.ravel() res2=res2[0] df_test = x.iloc[355] df_test = df_test.iloc[0] plt.scatter(df_test,res2,s = 30*10,c='b',edgecolor='k') model = opt_models['XGB'] z = model.predict(X_valid) z = z.ravel() plt.scatter(x_2,z,s = 20*10,c='k',edgecolor='k') plt.title('XGB') plt.show(block=True)