House Prices Regression Analysis

Kaggle Competition: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview

I used regularized linear regressions (ridge and LASSO) for my analysis of this dataset, as I am still working on my more advanced machine learning techniques.

1. Data Handling

import pandas as pd
  import numpy as np
  import matplotlib.pyplot as plt
  import seaborn as sns
  %matplotlib inline
  
X = pd.read_csv('house_train.csv')
  X_test = pd.read_csv('house_test.csv')

  y = X['SalePrice'].reset_index(drop=True)
  y = np.log1p(y)
  train_features = X.drop(['SalePrice'], axis=1)
  features = pd.concat([train_features, X_test]).reset_index(drop=True)

  #check for null values which may mess with our predictions later on
  features.isnull().sum().sort_values(ascending = False).head(20)
  

  PoolQC          2909
  MiscFeature     2814
  Alley           2721
  Fence           2348
  FireplaceQu     1420
  LotFrontage      486
  GarageCond       159
  GarageQual       159
  GarageYrBlt      159
  GarageFinish     159
  GarageType       157
  BsmtCond          82
  BsmtExposure      82
  BsmtQual          81
  BsmtFinType2      80
  BsmtFinType1      79
  MasVnrType        24
  MasVnrArea        23
  MSZoning           4
  BsmtHalfBath       2
  dtype: int64
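
Since the target SalePrice was log-transformed with np.log1p above, a quick look at its distribution helps confirm the skew has been reduced (a hypothetical check using the seaborn/matplotlib imports, which are otherwise unused here):

sns.histplot(y, kde=True)   # distribution of log1p(SalePrice)
  plt.title('log1p(SalePrice)')
  plt.show()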
  
#double check the data type of each variable
  features.dtypes
  

  Id                 int64
  MSSubClass         int64
  MSZoning          object
  LotFrontage      float64
  LotArea            int64
                    ...   
  MiscVal            int64
  MoSold             int64
  YrSold             int64
  SaleType          object
  SaleCondition     object
  Length: 80, dtype: object
  

We need to replace the NA values in the dataset with either 0 (for numeric columns) or the most common value / "None" (for categorical columns), depending on how the variable is being treated.

for column in features:

      # populating with 0
      if column in ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF','GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'TotalBsmtSF','Fireplaces', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea']:
          features[column] = features[column].fillna(0)

      # populate with 'None'
      if column in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', "PoolQC", 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2', 'Neighborhood', 'BldgType', 'HouseStyle', 'MasVnrType', 'FireplaceQu', 'Fence', 'MiscFeature']:
          features[column] = features[column].fillna('None')

      # populate with most frequent value for categorical data
      if column in ['Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'RoofStyle', 'Electrical', 'Functional', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'PavedDrive', 'SaleType', 'SaleCondition']:
          features[column] = features[column].fillna(features[column].mode()[0])
  
features.head()
  
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 0 None None None 0 2 2008 WD Normal
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 0 None None None 0 5 2007 WD Normal
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 0 None None None 0 9 2008 WD Normal
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 0 None None None 0 2 2006 WD Abnorml
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 0 None None None 0 12 2008 WD Normal

5 rows × 80 columns

2. Feature Engineering

Now that the data is processed, it's time to add additional independent variables to help the model make more accurate predictions.


  features['total_yrs'] = features['YearRemodAdd'] - features['YearBuilt']  
  features['total_sqrft'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']

  features['total_sqr_footage'] = (features['BsmtFinSF1'] + features['BsmtFinSF2'] + features['1stFlrSF'] + features['2ndFlrSF'])

  features['total_bath'] = (features['FullBath'] + (0.5 * features['HalfBath']) + features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath']))

  features['total_porch_sf'] = (features['OpenPorchSF'] + features['3SsnPorch'] + features['EnclosedPorch'] + features['ScreenPorch'] + features['WoodDeckSF'])
  
  features['pool'] = features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
  features['2nd_flr'] = features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
  features['garage'] = features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
  features['bsmt'] = features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
  features['fireplace'] = features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

  #handling the nulls not taken care of by the fill loop above
  features['MSSubClass'] = features['MSSubClass'].fillna("Unknown")  # fill before casting, otherwise NaN would become the string 'nan'
  features['MSSubClass'] = features['MSSubClass'].apply(str)

  features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

  features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

  features['LotArea'] = features['LotArea'].astype(np.int64)

  features['Alley'] = features['Alley'].fillna('Pave')

  features['MasVnrArea'] = features['MasVnrArea'].astype(np.int64)
  

  features.shape
  features.head()
  
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... total_yrs total_sqrft total_sqr_footage total_bath total_porch_sf pool 2nd_flr garage bsmt fireplace
0 1 60 RL 65.0 8450 Pave Pave Reg Lvl AllPub ... 0 2566.0 2416.0 3.5 61 0 1 1 1 0
1 2 20 RL 80.0 9600 Pave Pave Reg Lvl AllPub ... 0 2524.0 2240.0 2.5 298 0 0 1 1 1
2 3 60 RL 68.0 11250 Pave Pave IR1 Lvl AllPub ... 1 2706.0 2272.0 3.5 42 0 1 1 1 1
3 4 70 RL 60.0 9550 Pave Pave IR1 Lvl AllPub ... 55 2473.0 1933.0 2.0 307 0 1 1 1 1
4 5 60 RL 84.0 14260 Pave Pave IR1 Lvl AllPub ... 0 3343.0 2853.0 3.5 276 0 1 1 1 1

5 rows × 90 columns

# double check for null
  features.isnull().sum().sort_values(ascending = False).head(20)
  

  fireplace       0
  RoofMatl        0
  Exterior2nd     0
  MasVnrType      0
  MasVnrArea      0
  ExterQual       0
  ExterCond       0
  Foundation      0
  BsmtQual        0
  BsmtCond        0
  BsmtExposure    0
  BsmtFinType1    0
  BsmtFinSF1      0
  BsmtFinType2    0
  BsmtFinSF2      0
  BsmtUnfSF       0
  TotalBsmtSF     0
  Heating         0
  HeatingQC       0
  CentralAir      0
  dtype: int64
  

3. Model Building and Fitting

#one-hot encode the categorical variables so the model can work with them
  features_2 = pd.get_dummies(features).reset_index(drop=True)

  #go back to X and X_test so we have our train and test split, use length of y to help separate the two back
  X = features_2.iloc[:len(y), :]
  X_test = features_2.iloc[len(X):, :]
  print('Dimensions for each df')
  print('X', X.shape, 'y', y.shape, 'X_test', X_test.shape)
  
Dimensions for each df
  X (1460, 327) y (1460,) X_test (1459, 327)
  
X.dtypes
  

  Id                         int64
  LotFrontage              float64
  LotArea                    int64
  OverallQual                int64
  OverallCond                int64
                            ...   
  SaleCondition_AdjLand      uint8
  SaleCondition_Alloca       uint8
  SaleCondition_Family       uint8
  SaleCondition_Normal       uint8
  SaleCondition_Partial      uint8
  Length: 327, dtype: object
  

For this iteration of the regression I only fit linear models: a ridge regression whose alpha is chosen by 10-fold cross-validation, plus a LASSO with a fixed alpha, both scored with the 5-fold RMSE helper below. I will start to stack different models as my skill set grows.

#import models
  from sklearn.preprocessing import RobustScaler
  from sklearn.model_selection import KFold
  from sklearn.pipeline import make_pipeline
  from sklearn.linear_model import RidgeCV, Lasso
  from sklearn.metrics import mean_squared_error
  from sklearn.model_selection import cross_val_score

  def rmse_cv(model):
      rmse= np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv = 5))
      return(rmse)
  

Ridge Regression


  kfolds = KFold(n_splits = 10, random_state = 42, shuffle = True)
  alphas_no = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
  ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_no, cv=kfolds))

  ridge = ridge.fit(X,y)
  rmse_cv(ridge).mean()
  
0.13773568754637133
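
To see which alpha the RidgeCV step settled on, the fitted pipeline can be inspected (a small sketch; 'ridgecv' is the default step name that make_pipeline assigns):

# look up the alpha selected by RidgeCV inside the fitted pipeline
  best_alpha = ridge.named_steps['ridgecv'].alpha_
  print('Chosen ridge alpha:', best_alpha)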
  

LASSO Regression


  lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=42))
  rmse_cv(lasso).mean()
  
0.1348839439429676
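
Because the target was transformed with np.log1p at the start, test-set predictions have to be mapped back to dollars with np.expm1. A minimal sketch of how a submission file could be built from the LASSO pipeline (the file name is arbitrary; note the Id column is still sitting in the feature matrix here):

# fit on the full training data, predict the test set, and undo the log transform
  lasso_fit = lasso.fit(X, y)
  preds = np.expm1(lasso_fit.predict(X_test))

  submission = pd.DataFrame({'Id': X_test['Id'], 'SalePrice': preds})
  submission.to_csv('submission.csv', index=False)  # arbitrary file name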
  

We have a solid root mean squared error (on the log-transformed sale price), and I'm confident that once I start using stacked models this error will drop further.
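
As a first, minimal step toward stacking, the two pipelines above could simply be blended by averaging their log-space predictions (a sketch only; the equal weights are an arbitrary assumption rather than a tuned choice):

# naive blend: average the ridge and LASSO predictions in log space
  ridge_pred = ridge.fit(X, y).predict(X_test)
  lasso_pred = lasso.fit(X, y).predict(X_test)
  blended = np.expm1(0.5 * ridge_pred + 0.5 * lasso_pred)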