House Prices Regression Analysis
Kaggle Competition: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview
I used regularized linear regressions (ridge and LASSO) for my analysis of this dataset, as I still need to work on my more advanced machine learning techniques
1. Data Handling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
X = pd.read_csv('house_train.csv')
X_test = pd.read_csv('house_test.csv')
y = X['SalePrice'].reset_index(drop=True)
# log-transform the target: the competition is scored on the RMSE of the log of SalePrice
y = np.log1p(y)
train_features = X.drop(['SalePrice'], axis=1)
# combine train and test features so the imputation and encoding below are applied to both consistently
features = pd.concat([train_features, X_test]).reset_index(drop=True)
#check for null values which may mess with our predictions later on
features.isnull().sum().sort_values(ascending = False).head(20)
PoolQC 2909
MiscFeature 2814
Alley 2721
Fence 2348
FireplaceQu 1420
LotFrontage 486
GarageCond 159
GarageQual 159
GarageYrBlt 159
GarageFinish 159
GarageType 157
BsmtCond 82
BsmtExposure 82
BsmtQual 81
BsmtFinType2 80
BsmtFinType1 79
MasVnrType 24
MasVnrArea 23
MSZoning 4
BsmtHalfBath 2
dtype: int64
#double check the data type of each variable
features.dtypes
Id int64
MSSubClass int64
MSZoning object
LotFrontage float64
LotArea int64
...
MiscVal int64
MoSold int64
YrSold int64
SaleType object
SaleCondition object
Length: 80, dtype: object
We need to replace the NA values in the dataset with either 0 for numeric variables, or the most common value/"None" for categorical ones, depending on how each variable is treated
for column in features:
    # numeric columns: a missing value means the feature is absent, so fill with 0
    if column in ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'TotalBsmtSF', 'Fireplaces', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea']:
        features[column] = features[column].fillna(0)
    # categorical columns where NA means "no such feature": fill with 'None'
    if column in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Neighborhood', 'BldgType', 'HouseStyle', 'MasVnrType', 'FireplaceQu', 'Fence', 'MiscFeature']:
        features[column] = features[column].fillna('None')
    # remaining categorical columns: fill with the most frequent value
    if column in ['Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'RoofStyle', 'Electrical', 'Functional', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'PavedDrive', 'SaleCondition']:
        features[column] = features[column].fillna(features[column].mode()[0])
features.head()
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | 0 | None | None | None | 0 | 2 | 2008 | WD | Normal |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | 0 | None | None | None | 0 | 5 | 2007 | WD | Normal |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | None | None | None | 0 | 9 | 2008 | WD | Normal |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | None | None | None | 0 | 2 | 2006 | WD | Abnorml |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | None | None | None | 0 | 12 | 2008 | WD | Normal |
5 rows × 80 columns
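As an aside, the same fill strategy could be driven by column dtypes instead of hand-maintained lists. The sketch below is only illustrative (it was not run in this notebook) and fills every categorical column with its mode rather than distinguishing the "None" columns:
# illustrative alternative: dtype-driven imputation (not run in this notebook)
num_cols = features.select_dtypes(include='number').columns
cat_cols = features.select_dtypes(include='object').columns
features[num_cols] = features[num_cols].fillna(0)
features[cat_cols] = features[cat_cols].apply(lambda col: col.fillna(col.mode()[0]))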
2. Feature Engineering
Now that the data is processed, it's time to engineer some additional independent variables to help the model make more accurate predictions
features['total_yrs'] = features['YearRemodAdd'] - features['YearBuilt']
features['total_sqrft'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
features['total_sqr_footage'] = (features['BsmtFinSF1'] + features['BsmtFinSF2'] + features['1stFlrSF'] + features['2ndFlrSF'])
features['total_bath'] = (features['FullBath'] + (0.5 * features['HalfBath']) + features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath']))
features['total_porch_sf'] = (features['OpenPorchSF'] + features['3SsnPorch'] + features['EnclosedPorch'] + features['ScreenPorch'] + features['WoodDeckSF'])
features['pool'] = features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
features['2nd_flr'] = features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
features['garage'] = features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
features['bsmt'] = features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
features['fireplace'] = features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
#handling the nulls not taken care of by the feature engineering
# fill MSSubClass before casting to string, otherwise NaN would become the literal string 'nan'
features['MSSubClass'] = features['MSSubClass'].fillna('Unknown').apply(str)
# fill MSZoning with the most common zoning for each MSSubClass
features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))
# fill LotFrontage with the median frontage of the house's neighborhood
features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
features['LotArea'] = features['LotArea'].astype(np.int64)
features['Alley'] = features['Alley'].fillna('Pave')
features['MasVnrArea'] = features['MasVnrArea'].astype(np.int64)
features.shape
features.head()
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | total_yrs | total_sqrft | total_sqr_footage | total_bath | total_porch_sf | pool | 2nd_flr | garage | bsmt | fireplace | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | Pave | Reg | Lvl | AllPub | ... | 0 | 2566.0 | 2416.0 | 3.5 | 61 | 0 | 1 | 1 | 1 | 0 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | Pave | Reg | Lvl | AllPub | ... | 0 | 2524.0 | 2240.0 | 2.5 | 298 | 0 | 0 | 1 | 1 | 1 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | Pave | IR1 | Lvl | AllPub | ... | 1 | 2706.0 | 2272.0 | 3.5 | 42 | 0 | 1 | 1 | 1 | 1 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | Pave | IR1 | Lvl | AllPub | ... | 55 | 2473.0 | 1933.0 | 2.0 | 307 | 0 | 1 | 1 | 1 | 1 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | Pave | IR1 | Lvl | AllPub | ... | 0 | 3343.0 | 2853.0 | 3.5 | 276 | 0 | 1 | 1 | 1 | 1 |
5 rows × 90 columns
# double check for null
features.isnull().sum().sort_values(ascending = False).head(20)
fireplace 0
RoofMatl 0
Exterior2nd 0
MasVnrType 0
MasVnrArea 0
ExterQual 0
ExterCond 0
Foundation 0
BsmtQual 0
BsmtCond 0
BsmtExposure 0
BsmtFinType1 0
BsmtFinSF1 0
BsmtFinType2 0
BsmtFinSF2 0
BsmtUnfSF 0
TotalBsmtSF 0
Heating 0
HeatingQC 0
CentralAir 0
dtype: int64
3. Model Building and Fitting
#one-hot encode the categorical variables; doing this on the combined train+test frame keeps the dummy columns consistent between the two sets
features_2 = pd.get_dummies(features).reset_index(drop=True)
#split the combined frame back into X and X_test so we have our train and test sets again, using the length of y to separate the two
X = features_2.iloc[:len(y), :]
X_test = features_2.iloc[len(X):, :]
print('Dimensions for each df')
print('X', X.shape, 'y', y.shape, 'X_test', X_test.shape)
Dimensions for each df
X (1460, 327) y (1460,) X_test (1459, 327)
X.dtypes
Id int64
LotFrontage float64
LotArea int64
OverallQual int64
OverallCond int64
...
SaleCondition_AdjLand uint8
SaleCondition_Alloca uint8
SaleCondition_Family uint8
SaleCondition_Normal uint8
SaleCondition_Partial uint8
Length: 327, dtype: object
For this iteration we only use regularized linear models (a ridge regression tuned with 10-fold cross-validation and a LASSO regression), but I will start to stack different models as my skill set grows
#import models
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import RidgeCV, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
def rmse_cv(model):
    # cross_val_score returns negative MSE, so negate it and take the square root to get RMSE
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5))
    return rmse
Ridge Regression
kfolds = KFold(n_splits = 10, random_state = 42, shuffle = True)
alphas_no = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_no, cv=kfolds))
ridge = ridge.fit(X,y)
rmse_cv(ridge).mean()
0.13773568754637133
LASSO Regression
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=42))
rmse_cv(lasso).mean()
0.1348839439429676
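Since the target was transformed with log1p, these cross-validation scores are errors in log-price: an RMSE of roughly 0.135 corresponds to a typical multiplicative error of about exp(0.135) ≈ 1.14, i.e. predictions that are off by roughly 14% of the sale price.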
We have a solid root mean squared error, but I'm confident that once I start using stacked models this error will drop even closer to 0.
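As a preview of that next step, below is a minimal sketch of how stacking could look with scikit-learn's StackingRegressor, reusing the ridge and LASSO pipelines defined above and ending with a Kaggle submission. This was not run in this notebook; the submission file name and the choice to stack only these two base models are illustrative assumptions.
from sklearn.ensemble import StackingRegressor
# sketch only: stack the two pipelines above; the default final estimator (RidgeCV) combines their predictions
stack = StackingRegressor(estimators=[('ridge', ridge), ('lasso', lasso)], cv=kfolds)
stack = stack.fit(X, y)
# predictions are in log1p space, so invert them with expm1 before writing the submission
preds = np.expm1(stack.predict(X_test))
submission = pd.DataFrame({'Id': X_test['Id'], 'SalePrice': preds})
submission.to_csv('submission.csv', index=False)  # 'submission.csv' is just an illustrative file name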