References

This post draws on the problem walkthroughs from the Alibaba Cloud Tianchi competition.

Click here for the source code.

Source Code

model

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns

# modelling
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score,cross_val_predict,KFold
from sklearn.metrics import make_scorer,mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import LinearSVR, SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures,MinMaxScaler,StandardScaler


x = pd.read_csv('x.csv')
y = pd.read_csv('y.csv')
dif = pd.read_csv('difficulty.csv')
# normalise numeric columns ('Unnamed: 0' is the index column written by to_csv)
cols_numeric = list(x.columns)
cols_numeric.remove('Unnamed: 0')
def scale_minmax(col):
    return (col - col.min()) / (col.max() - col.min())

x[cols_numeric] = x[cols_numeric].apply(scale_minmax, axis=0)
# Box-Cox transform each feature (shift by 1 so all values are strictly positive)
cols_transform = x.columns[1:]
for col in cols_transform:
    x.loc[:, col], _ = stats.boxcox(x.loc[:, col] + 1)
# split features: keep the two engineered feature columns;
# rows 0-354 are labelled samples, row 355 is the held-out test sample
x = x.iloc[:, 1:3]
Y = dif.iloc[:, 1]
X = x[0:355]
y = Y  # note: this overwrites the y loaded from y.csv
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=100)
y_train = np.array(y_train).reshape(-1, 1)
y_valid = np.array(y_valid).reshape(-1, 1)

def get_test_data():
    df_test = x.iloc[355]
    df_test = np.array(df_test).reshape(1, -1)
    return df_test


# metrics for evaluation
def rmse(y_true, y_pred):
    diff = y_pred - y_true
    sum_sq = np.sum(diff ** 2)
    n = len(y_pred)
    return np.sqrt(sum_sq / n)

def mse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)

# scorers to be used in sklearn model fitting
rmse_scorer = make_scorer(rmse, greater_is_better=False)
mse_scorer = make_scorer(mse, greater_is_better=False)

def get_training_data_omit_outliers():
    # outlier removal was dropped; this simply returns a copy of the training data
    X1 = X_train.copy()
    y1 = y_train.copy()
    return X1, y1

def train_model(model, param_grid=[], X=[], y=[], splits=5, repeats=5):
    # get unmodified training data, unless data to use is already specified
    if len(y) == 0:
        X, y = get_training_data_omit_outliers()

    # create cross-validation method
    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)

    # perform a grid search if param_grid is given
    if len(param_grid) > 0:
        gsearch = GridSearchCV(model, param_grid, cv=rkfold,
                               scoring="neg_mean_squared_error",
                               verbose=1, return_train_score=True)

        # search the grid
        gsearch.fit(X, y)

        # extract the best model from the grid
        model = gsearch.best_estimator_
        best_idx = gsearch.best_index_

        # get cv-scores for the best model
        grid_results = pd.DataFrame(gsearch.cv_results_)
        cv_mean = abs(grid_results.loc[best_idx, 'mean_test_score'])
        cv_std = grid_results.loc[best_idx, 'std_test_score']

    # no grid search, just a cross-val score for the given model
    else:
        grid_results = []
        cv_results = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=rkfold)
        cv_mean = abs(np.mean(cv_results))
        cv_std = np.std(cv_results)

    # combine mean and std cv-score into a pandas Series
    cv_score = pd.Series({'mean': cv_mean, 'std': cv_std})

    # predict y using the fitted model
    y_pred = model.predict(X).ravel()

    # print stats on model performance
    print('----------------------')
    print(model)
    print('----------------------')
    print('score=', model.score(X, y))
    print('mse=', mse(y, y_pred))
    print('cross_val: mean=', cv_mean, ', std=', cv_std)

    return model, cv_score, grid_results

# places to store optimal models and scores
opt_models = dict()
score_models = pd.DataFrame(columns=['mean', 'std'])

# no. of k-fold splits
splits = 5
# no. of k-fold repeats
repeats = 5

model = 'Ridge'

opt_models[model] = Ridge()
alph_range = np.arange(0.25, 6, 0.25)
param_grid = {'alpha': alph_range}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
                                                        splits=splits, repeats=repeats)

cv_score.name = model
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
score_models = pd.concat([score_models, cv_score.to_frame().T])

model = 'Lasso'

opt_models[model] = Lasso()
alph_range = np.arange(1e-4, 1e-3, 4e-5)
param_grid = {'alpha': alph_range}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
                                                        splits=splits, repeats=repeats)

cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])

model = 'LinearSVR'
opt_models[model] = LinearSVR()

crange = np.arange(0.1, 1.0, 0.1)
param_grid = {'C': crange,
              'max_iter': [1000]}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
                                                        splits=splits, repeats=repeats)

cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])

model = 'ElasticNet'
opt_models[model] = ElasticNet()

param_grid = {'alpha': np.arange(1e-4, 1e-3, 1e-4),
              'l1_ratio': np.arange(0.1, 1.0, 0.1),
              'max_iter': [100000]}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
                                                        splits=splits, repeats=1)

cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])

model = 'KNeighbors'
opt_models[model] = KNeighborsRegressor()

param_grid = {'n_neighbors': np.arange(3, 11, 1)}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
                                                        splits=splits, repeats=1)

cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])

model = 'GradientBoosting'
opt_models[model] = GradientBoostingRegressor()

param_grid = {'n_estimators': [150, 250, 350],
              'max_depth': [1, 2, 3],
              'min_samples_split': [5, 6, 7]}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
                                                        splits=splits, repeats=1)

cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])

model = 'XGB'
opt_models[model] = XGBRegressor()

param_grid = {'n_estimators': [100, 200, 300, 400, 500],
              'max_depth': [1, 2, 3]}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
                                                        splits=splits, repeats=1)

cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])

def model_predict(test_data, test_y=[], stack=False):
    # average the predictions of the tuned models,
    # excluding LinearSVR and KNeighbors from the ensemble
    i = 0
    y_predict_total = np.zeros((test_data.shape[0],))
    for model in opt_models.keys():
        if model != 'LinearSVR' and model != 'KNeighbors':
            y_predict = opt_models[model].predict(test_data)
            y_predict = y_predict.ravel()
            y_predict_total += y_predict
            i += 1
            if len(test_y) > 0:
                print("{}_mse:".format(model), mean_squared_error(y_predict, test_y))
    y_predict_mean = np.round(y_predict_total / i, 3)
    if len(test_y) > 0:
        print("mean_mse:", mean_squared_error(y_predict_mean, test_y))
    else:
        y_predict_mean = pd.Series(y_predict_mean)
    return y_predict_mean

model_predict(X_valid, y_valid)
# predict the held-out sample (row 355) with KNN and XGB, then average the two
df_test = x.iloc[355]
df_test = np.array(df_test).reshape(1, -1)
model = opt_models['KNeighbors']
res1 = model.predict(df_test)
model = opt_models['XGB']
res2 = model.predict(df_test)
res = (res1 + res2) / 2
# plot KNN: training points (orange), validation predictions (black), test prediction (blue)
x_1 = X_train.iloc[:, 0]  # feature column 0 (or 1)
x_2 = X_valid.iloc[:, 0]
y_train = y_train.ravel()
y_valid = y_valid.ravel()
plt.scatter(x_1, y_train, s=100, c='orange', edgecolor='k')
res1 = res1.ravel()
res1 = res1[0]
df_test = x.iloc[355]
df_test = df_test.iloc[0]  # same feature column as above
plt.scatter(df_test, res1, s=300, c='b', edgecolor='k')
model = opt_models['KNeighbors']
z = model.predict(X_valid)
z = z.ravel()
plt.scatter(x_2, z, s=200, c='k', edgecolor='k')
plt.title('KNN Regressor')
plt.show(block=True)
# plot XGB: same layout as the KNN plot (y_train/y_valid were already flattened above)
x_1 = X_train.iloc[:, 0]  # feature column 0 (or 1)
x_2 = X_valid.iloc[:, 0]
plt.scatter(x_1, y_train, s=100, c='orange', edgecolor='k')
res2 = res2.ravel()
res2 = res2[0]
df_test = x.iloc[355]
df_test = df_test.iloc[0]  # same feature column as above
plt.scatter(df_test, res2, s=300, c='b', edgecolor='k')
model = opt_models['XGB']
z = model.predict(X_valid)
z = z.ravel()
plt.scatter(x_2, z, s=200, c='k', edgecolor='k')
plt.title('XGB')
plt.show(block=True)

Data Preprocessing

import operator
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv('data.csv')
difficulty = pd.read_csv('difficulty.csv')
y = pd.read_csv('y.csv')
word = data.iloc[:,3]

def count_each_char_sort_value(s):
    # count each character's occurrences, sorted by frequency (descending)
    counts = {}
    for ch in s:
        counts[ch] = counts.get(ch, 0) + 1
    return sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

x = []
for i in range(0, 355):
    x.append(count_each_char_sort_value(word[i]))
x_pd = pd.DataFrame(x)

first_char = []
for i in range(0, 355):
    first_char.append(ord(word[i][0]) - 97)  # first letter as an index: 'a' -> 0, ..., 'z' -> 25

# infer repeated characters from the number of distinct characters:
# a 5th frequency column means >=5 distinct chars (no repeats); a missing 5th
# but present 4th column means one repeat; otherwise two repeats
repeat_char = []
for i in range(0, 355):
    if pd.isnull(x_pd.iloc[i, 4]):
        if pd.isnull(x_pd.iloc[i, 3]):
            repeat_char.append(2)
        else:
            repeat_char.append(1)
    else:
        repeat_char.append(0)
df1 = pd.DataFrame(repeat_char, columns=['repeat'])
df1.insert(1, 'first_char', first_char, allow_duplicates=False)
df1.to_csv('x.csv')

Analysis

Multiple models are fit to the data, and the one with the best fit is selected as the final model, as the sketch below makes explicit.
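Given the score_models table built in the model script, a minimal way to make that selection concrete is to pick the entry with the lowest mean cross-validation MSE. This sketch assumes the model script above has already run in the same session:

    best_name = score_models['mean'].idxmin()  # model with the lowest mean CV MSE
    best_model = opt_models[best_name]
    print('best model:', best_name)
    print(score_models.loc[best_name])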

I did not implement a voting mechanism for stacked fusion; I simply averaged the individual models' predictions.
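For comparison, a proper stacked ensemble could be sketched with scikit-learn's StackingRegressor. The choice of base learners and the Ridge meta-learner below is illustrative only, not what this post actually ran:

    from sklearn.ensemble import StackingRegressor
    from sklearn.linear_model import RidgeCV

    # illustrative stacking sketch: three of the tuned base models feed a Ridge meta-learner,
    # whose out-of-fold predictions replace the plain average used above
    stack = StackingRegressor(
        estimators=[('ridge', opt_models['Ridge']),
                    ('gbdt', opt_models['GradientBoosting']),
                    ('xgb', opt_models['XGB'])],
        final_estimator=RidgeCV(),
        cv=5)
    stack.fit(X_train, np.ravel(y_train))
    print('stack mse:', mean_squared_error(stack.predict(X_valid), np.ravel(y_valid)))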

Another weakness is the choice of features: I know they are neither normally distributed nor generalizable (and they correlate weakly with the target), so I skipped the formal checks rather than embarrass myself, haha.
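That correlation claim is easy to quantify. A quick diagnostic (not part of the original pipeline) is to compute Pearson's r between each engineered feature and the difficulty target:

    import pandas as pd

    # load the engineered features and the target for the 355 labelled rows
    feats = pd.read_csv('x.csv', index_col=0).iloc[:355]
    target = pd.read_csv('difficulty.csv').iloc[:355, 1]
    for col in feats.columns:
        print(col, round(feats[col].corr(target), 3))  # Pearson's r per feature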