# Import some standard libraries for data analysis, visualization, and machine learning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
%matplotlib inline
from scipy.special import boxcox1p
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, recall_score
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.kernel_ridge import KernelRidge
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
import xgboost as xgb
import lightgbm as lgb
from bayes_opt import BayesianOptimization
<a id = "intro"></a>
The goal of a regression problem is to predict the value of some continuous random variable $Y$, called the target variable, given the values of some input random variables $X_i$, which we call the feature variables. While the target variable must be continuous, the features can be continuous, discrete, or a mix of the two. In this notebook the target variable we are interested in will be housing prices in Ames, Iowa, and our features will be extracted from 79 possibilities.
We will first do some exploratory data analysis on these features, to determine the shape of their distributions and how they interact with the target sales price variable. Then we will do some feature extraction and selection, to determine which variables to feed into our models. Next we fit our regression models to our chosen features. Finally, we use stacking methods to combine our models into a final "megazord" model.
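Concretely, since we will work with log prices, the score we report for our regression models below is the root mean squared error on the log scale,
$$\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\log y_i - \log \hat{y}_i\right)^2},$$
where $y_i$ is the observed sale price of house $i$ and $\hat{y}_i$ is our prediction.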
<a id = "EDA"></a>
In this section we perform some exploratory data analysis. First we look for any null values and outliers. Then we examine the distributions of our target variable and our features, and look at how the features interact with the target sale price. Throughout this section we also check whether there are any noticeable differences between the training set and the testing set.
First, let's read in the data and take a quick look at the first few rows.
train = pd.read_csv(r"C:\Users\ASUS\Documents\Data\kaggle\HousePrices\train.csv")
test = pd.read_csv(r"C:\Users\ASUS\Documents\Data\kaggle\HousePrices\test.csv")
train_id = train['Id']
test_id = test['Id']
n_train = train.shape[0]
n_test = test.shape[0]
data = pd.concat((train.drop(['Id', 'SalePrice'], axis = 1), test.drop('Id', axis = 1))).reset_index(drop=True)
y = train[['SalePrice']]
data.head().transpose()
We can already see some NaNs. Let's get a better look at the breakdown of null values.
train_null = data[:n_train].isnull().sum().sort_values(ascending = False)
test_null = data[n_train:].isnull().sum().sort_values(ascending = False)
f, ax = plt.subplots(figsize = (15,20), ncols = 2, nrows = 1)
sns.barplot(y= train_null[train_null >0].index, x = train_null[train_null >0], orient = 'h', ax = ax[0], palette="RdYlGn")
ax[0].set(title = 'Nulls in the training set')
sns.barplot(y= test_null[test_null >0].index, x = test_null[test_null >0], orient = 'h', ax = ax[1], palette="RdYlGn")
ax[1].set(title = 'Nulls in the testing set')
The breakdown looks pretty similar between the test set and training set, except there are a bunch of columns in the test set with just a couple of null values, which won't have a substantial effect on our predictions. We will consider the problem of replacing the null values in the modeling stage.
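In the modeling section we will simply drop the columns that contain nulls (dropna(axis = 1) before one-hot encoding), but imputation is a reasonable alternative. A minimal sketch of what that could look like, filling numeric columns with their median and categorical columns with a 'None' label; this is purely illustrative and not a step we apply later.
# Illustrative imputation sketch; the modeling section instead drops columns containing nulls.
data_imputed = data.copy()
for col in data_imputed:
    if data_imputed[col].dtype == 'object':
        data_imputed[col] = data_imputed[col].fillna('None')
    else:
        data_imputed[col] = data_imputed[col].fillna(data_imputed[col].median())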
Next we take a look at the distribution of the target value.
f, (axs1, axs2) = plt.subplots(figsize = (15,5), ncols = 2)
sns.distplot(y, ax = axs1, color = sns.xkcd_rgb['faded green'])
axs1.set_title('Histogram of Sale Price')
sns.violinplot(y, ax = axs2, color = sns.xkcd_rgb['faded green'])
axs2.set_title('Violin Plot of Sale Price')
We see that the SalePrice distribution has a heavy positive skew. This will negatively affect some of our models. To counteract this, we apply a log transform to y.
log_y = np.log(y)
f, (axs1, axs2) = plt.subplots(figsize = (15,5), ncols = 2, nrows = 1)
sns.distplot(log_y, ax = axs1,color = sns.xkcd_rgb['faded green'])
axs1.set(title = 'Histogram of Log Sale Price', yticks = [])
sns.violinplot(log_y, ax = axs2, color = sns.xkcd_rgb['faded green'])
axs2.set_title('Violin Plot of Log Sale Price')
That looks much more normal. Next we will look at the distributions of our features and see how they interact with SalePrice. We separate the continuous, ordinal, and categorical variables and visualize each group in a different way.
data.dtypes.value_counts()
continuous = [x for x in data if data[x].dtypes in ['float64', 'int64'] and data[x].value_counts().shape[0] > 20]
for x in continuous:
    f, ax = plt.subplots(figsize = (20, 5))
    sns.regplot(x = x, y = 'SalePrice', color = sns.xkcd_rgb['faded green'],
                data = pd.concat([data[:n_train][x], log_y], axis = 1).dropna(), ax = ax)
    ax.set(ylabel = 'Log of Sale Price')
    f, ax = plt.subplots(figsize = (20, 5), ncols = 2)
    sns.distplot(data[:n_train][x].dropna(), color = sns.xkcd_rgb['faded green'], ax = ax[0])
    ax[0].set(title = 'Training Set Distribution')
    sns.distplot(data[n_train:][x].dropna(), ax = ax[1])
    ax[1].set(title = 'Testing Set Distribution')
There are a few things we notice from these plots.
Some of the variables are time-based (year) variables rather than genuinely continuous ones. We separate these into their own category.
Many variables are mostly 0. For these we can create a new indicator variable recording whether the value is zero or not (sketched below, after we compute which features are mostly zero).
Several variables are skewed. We apply a Box-Cox transform to them to alleviate this defect.
temporal = ['YearBuilt',
'YearRemodAdd',
'GarageYrBlt',
]
continuous = [x for x in continuous if x not in temporal]
skewed = (data[continuous] == 0).sum()[(data[continuous] == 0).sum() > 500]
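The skewed series just computed picks out the features that are zero for more than 500 houses. A minimal sketch of the indicator variables described above could look like the following; these columns are purely illustrative and are not fed into the models later in the notebook.
# Illustrative only: binary flags marking whether each mostly-zero feature is nonzero.
zero_indicators = pd.DataFrame({feat + '_nonzero': (data[feat] > 0).astype(int) for feat in skewed.index})
zero_indicators.head()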
normalized = pd.DataFrame()
for feat in continuous:
    normalized[feat] = boxcox1p(data[feat].dropna(), .15)
for x in normalized:
    f, ax = plt.subplots(figsize = (20, 5))
    sns.regplot(x = x, y = 'SalePrice', color = sns.xkcd_rgb['faded green'],
                data = pd.concat([normalized[:n_train][normalized[:n_train] > 0][x], log_y], axis = 1).dropna(), ax = ax)
    ax.set(ylabel = 'Log of Sale Price')
    f, ax = plt.subplots(figsize = (20, 5), ncols = 2)
    # Training set distribution on the left, testing set distribution on the right.
    sns.distplot(normalized[:n_train][normalized[:n_train] > 0][x].dropna(), color = sns.xkcd_rgb['faded green'], ax = ax[0])
    sns.distplot(normalized[n_train:][normalized[n_train:] > 0][x].dropna(), ax = ax[1])
With those changes, we can see there is a linear relationship between our target variable and some of our features. Other relationships can be inferred from the following heatmap of correlations.
f, ax = plt.subplots(figsize = (20, 20))
corr = pd.concat((data[continuous][:n_train], y), axis = 1).dropna().corr()
sns.heatmap(corr,annot=True, cmap = "RdYlGn")
for i in range(len(temporal)):
    f, ax = plt.subplots(figsize = (20,3))
    sns.pointplot(x = data[temporal].iloc[:n_train, i], y = log_y.values.ravel(),
                  color = sns.xkcd_rgb['rust orange'], ax = ax)
    ax.set(ylabel = 'Log of Sale Price')
    # Only keep every fifth year label so the x axis stays readable.
    labels = ax.get_xticklabels()
    for j, l in enumerate(labels):
        if j % 5 != 0: labels[j] = ''
    ax.set_xticklabels(labels, rotation = 30)
ordinal = [x for x in data if data[x].dtypes in ['float64', 'int64'] and data[x].value_counts().shape[0] <= 20]
for x in ordinal:
    f, ax = plt.subplots(figsize = (20, 5))
    sns.pointplot(x = x, y = 'SalePrice',
                  data = pd.concat([data[:n_train][x], log_y], axis = 1).dropna())
    ax.set(ylabel = 'Log of Sale Price')
    f, ax = plt.subplots(figsize = (20, 5), ncols = 2)
    sns.countplot(data[:n_train][x].dropna(), color = sns.xkcd_rgb['faded green'], ax = ax[0])
    sns.countplot(data[n_train:][x].dropna(), ax = ax[1], color = sns.xkcd_rgb['faded blue'])
f, ax = plt.subplots(figsize = (20, 20))
corr = pd.concat((data[ordinal].iloc[:n_train, :], y), axis = 1).corr()
sns.heatmap(corr, annot = True, cmap = "RdYlGn")
categorical = [x for x in data if data[x].dtypes in ['object']]
for x in categorical:
    f, ax = plt.subplots(figsize = (20, 5))
    sns.lvplot(x = x, y = 'SalePrice',
               data = pd.concat([data[:n_train][x], log_y], axis = 1).dropna())
    ax.set(ylabel = 'Log of Sale Price')
    f, ax = plt.subplots(figsize = (20, 5), ncols = 2)
    sns.countplot(data[:n_train][x].dropna(), color = sns.xkcd_rgb['faded green'], ax = ax[0])
    sns.countplot(data[n_train:][x].dropna(), ax = ax[1], color = sns.xkcd_rgb['faded blue'])
<a id = "pred"> </a>
In this section we are going to make our predictions. We will fit a variety of models, using Bayesian Optimization to choose our parameters.
data_drop = pd.get_dummies(data.dropna(axis = 1))
X_train, X_test, y_train, y_test = train_test_split(data_drop[:n_train], log_y, test_size = .2)
def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 eta,
                 alpha):
    # Clip the suggested hyperparameters into valid ranges and run 5-fold CV.
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['eta'] = max(eta, 0)
    params['alpha'] = max(alpha, 0)
    cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
                       seed=random_state,
                       callbacks=[xgb.callback.early_stop(50)])
    # BayesianOptimization maximizes its objective, so return the negative RMSE.
    return -cv_result['test-rmse-mean'].values[-1]
def prepare_data():
    xgtrain = xgb.DMatrix(X_train, label=y_train)
    return xgtrain
xgtrain = prepare_data()
num_rounds = 300
random_state = 2016
num_iter = 25
init_points = 5
params = {
'silent': 1,
'eval_metric': 'rmse',
'verbose_eval': True,
'seed': random_state
}
xgbBO = BayesianOptimization(xgb_evaluate, {'min_child_weight': (1, 20),
'colsample_bytree': (0.1, 1),
'max_depth': (5, 15),
'subsample': (0.5, 1),
'gamma': (0, 10),
'eta': (.1,1),
'alpha': (0, 10),
})
xgbBO.maximize(init_points=init_points, n_iter=num_iter)
par = xgbBO.res['max']['max_params']
par['max_depth'] = int(par['max_depth'])
par['min_child_weight'] = int(par['min_child_weight'])
par
model_xgb = xgb.XGBRegressor(silent = 1, eval_metric = 'rmse', verbose_eval = True, **par)
# The hand-tuned parameter set below overrides this model and is the one we use from here on.
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
learning_rate=0.05, max_depth=3,
min_child_weight=1.7817, n_estimators=2200,
reg_alpha=0.4640, reg_lambda=0.8571,
subsample=0.5213, silent=1,
random_state =7, nthread = -1)
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
learning_rate=0.05, n_estimators=720,
max_bin = 55, bagging_fraction = 0.8,
bagging_freq = 5, feature_fraction = 0.2319,
feature_fraction_seed=9, bagging_seed=9,
min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
# Fit both base models on the training split and check cross-validated and holdout RMSE.
model_xgb.fit(X_train, y_train.values.ravel())
model_lgb.fit(X_train, y_train.values.ravel())
np.sqrt(np.mean(-cross_val_score(model_xgb, data_drop[:n_train], log_y.values.ravel(), scoring = 'neg_mean_squared_error', n_jobs = 5)))
np.sqrt(np.mean(-cross_val_score(model_lgb, X_train, y_train.values.ravel(), scoring = 'neg_mean_squared_error', n_jobs = 5)))
np.sqrt(mean_squared_error(model_xgb.predict(X_test), y_test.values.ravel()))
# Build stacking features: base-model predictions for the holdout set and for the Kaggle test set.
df = pd.DataFrame()
df['xgb'] = model_xgb.predict(X_test)
df['lgb'] = model_lgb.predict(X_test)
df_test = pd.DataFrame()
df_test['xgb'] = model_xgb.predict(data_drop[n_train:])
df_test['lgb'] = model_lgb.predict(data_drop[n_train:])
# Kernel ridge regression as the second-level ("stacking") model, fit on the holdout predictions.
model_krr = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
model_krr.fit(df, y_test.values.ravel())
np.sqrt(np.mean(-cross_val_score(model_krr, df, y_test.values.ravel(), scoring = 'neg_mean_squared_error', n_jobs = 5)))
prediction = model_krr.predict(df_test)
# Refit XGBoost on the full training set and predict the test set
# (this plain XGBoost prediction is the one we submit, overriding the stacked prediction above).
model_xgb.fit(data_drop[:n_train], log_y.values.ravel())
prediction = model_xgb.predict(data_drop[n_train:])
sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = np.exp(prediction)
sub.to_csv(r'C:\Users\ASUS\Documents\Data\kaggle\HousePrices\submission.csv', index=False)
# Exploratory scratch work: can we predict *where* the main model errs?
# The cells below rely on a second split of the training data (X_train_1,
# X_train_2, y_train_1, y_train_2), on the corresponding holdout errors
# (holdout_err_1, holdout_err_2), and on a random forest error classifier
# model_rfr (sketched in the commented line below); the cells defining those
# objects are not included in this notebook.
#model_rfr = RandomForestClassifier(n_estimators=100, class_weight={0:20, 1:1})
# Feature importances of the error classifier, sorted from most to least important.
x = [(feat, imp) for feat, imp in zip(X_train.columns, model_rfr.feature_importances_)]
x.sort(key = lambda pair: pair[1], reverse = True)
[feat for feat, imp in x[1:6]]
# Fit LightGBM on the most important features to predict the holdout errors.
model_lgb.fit(X_train[[feat for feat, imp in x[1:15]]], holdout_err_1)
#a = [np.mean(cross_val_score(model_rfr, X_train[[feat for feat, imp in x[1:15]]], y_1, scoring = 'recall', n_jobs = 5)) for i in range(10,20)]
sns.regplot(model_lgb.predict(X_test[[feat for feat, imp in x[1:15]]]), holdout_err_2)
sns.regplot(np.arange(10,20), a)
# Distribution of the holdout errors, split by the error classifier's prediction.
sns.distplot(holdout_err_2[model_rfr.predict(X_test) == 1])
sns.distplot(holdout_err_2[model_rfr.predict(X_test) == 0])
# Compare the actual errors of the main model with the predicted errors.
sns.distplot(model_rfr.predict(X_train_2))
sns.distplot((model_xgb.predict(X_train_2.drop('model1', axis = 1)) - y_train_1.values.ravel()))
sns.regplot((model_xgb.predict(X_train_2.drop('model1', axis = 1)) - y_train_1.values.ravel()), model_rfr.predict(X_train_2))
np.corrcoef(e, p)
sns.regplot((model_xgb.predict(X_test) - y_test.values.ravel()), model_rfr.predict(X_test))
df = pd.DataFrame()
df['yb'] = X_train_1['YearBuilt']
df['err'] = (model_xgb.predict(X_train_2) - y_train_2.values.ravel())
df['approx'] = model_rfr.predict(X_train_2)
# Fraction of predicted errors larger than 0.03 in absolute value.
np.sum(np.abs(model_rfr.predict(X_train_1)) > .03)/len(y_train_2.values.ravel())
err_prediction_1 = model_rfr.predict(X_train_2)
np.percentile(np.abs(err_prediction_1), q = 80)
# Refit both base models on the training split and compare them, and their simple average, on the holdout set.
model_xgb.fit(X_train, y_train.values.ravel())
model_lgb.fit(X_train, y_train.values.ravel())
xgb_prediction = model_xgb.predict(X_test)
lgb_prediction = model_lgb.predict(X_test)
np.sqrt(mean_squared_error(xgb_prediction, y_test))
np.sqrt(mean_squared_error(lgb_prediction, y_test))
np.sqrt(mean_squared_error((xgb_prediction + lgb_prediction)/2, y_test))
# Stack the two base models with a kernel ridge meta-model fit on the holdout predictions.
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
stack_X = pd.DataFrame()
stack_X['xgb_prediction'] = xgb_prediction
stack_X['lgb_prediction'] = lgb_prediction
KRR.fit(stack_X, y_test.values.ravel())
stack_prediction = KRR.predict(stack_X)
np.sqrt(mean_squared_error(stack_prediction, y_test))
# Error-prediction experiment: fit the main model on a holdout set, predict its
# relative error on the training split, and check whether a random forest can
# predict that error better than a constant baseline.
# (Assumption: we reuse the earlier X_test/y_test split as the holdout.)
X_holdout, y_holdout = X_test, y_test
X_holdout.shape
model_xgb.fit(X_holdout, y_holdout.values.ravel())
main_prediction = model_xgb.predict(X_train)
err_train = (main_prediction - y_train.values.ravel())/main_prediction
model_rfr = RandomForestRegressor()
model_rfr.fit(X_train, err_train)
np.sqrt(np.mean(-cross_val_score(model_rfr, X_train, err_train, scoring = 'neg_mean_squared_error', n_jobs = 5)))
# Baseline: always predict the mean relative error.
avg = np.repeat(np.mean(err_train), err_train.shape[0])
np.sqrt(mean_squared_error(avg, err_train))