Load Required Libraries¶
In [1]:
### Import required libraries
import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from IPython.display import display # Allows the use of display() for DataFrames
import warnings
warnings.filterwarnings('ignore')
Load Train and Test Data¶
In [2]:
# Read train and test files
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')
Train Data¶
In [3]:
train_df.head()
Out[3]:
ID | target | 48df886f9 | 0deb4b6a8 | 34b15f335 | a8cb14b00 | 2f0771a37 | 30347e683 | d08d1fbe3 | 6ee66e115 | 20aa07010 | dc5a8f1d8 | 11d86fa6a | 77c9823f2 | 8d6c2a0b2 | 4681de4fd | adf119b9a | cff75dd09 | 96f83a237 | b8a716ebf | 6c7a4567c | 4fcfd2b4d | f3b9c0b95 | 71cebf11c | d966ac62c | 68b647452 | c88d108c9 | ff7b471cd | d5308d8bc | 0d866c3d7 | bc3f77679 | bd8f989f1 | 0eff5bf95 | 22ed6dba3 | 92b13ebba | c330f1a67 | 233c7c17c | 2cb4d123e | eeac16933 | 87ffda550 | ... | 969caa87a | 00302fe51 | 1189ee335 | ca04a07ca | f6f15ffa5 | 841704460 | ea5ed6ff7 | b1bb8eac3 | 8132d18b8 | c24ea6548 | cdfc2b069 | 2a879b4f7 | 6b119d8ce | 98dea9e42 | 9f2471031 | 88458cb21 | f40da20f4 | 7ad6b38bd | c901e7df1 | 8f55955dc | 85dcc913d | 5ca0b9b0c | eab8abf7a | 8d8bffbae | 2a1f6c7f9 | 9437d8b64 | 5831f4c76 | 2e84e09c5 | d45fd5508 | a165f5761 | 3ecc09859 | 9281abeea | 8675bec0b | 3a13ed79a | f677d4d13 | 71b203550 | 137efaa80 | fb36b89d9 | 7e293fbaf | 9fc776466 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000d6aaf2 | 38000000.0 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0 | 0 | 1300000.0 | ... | 0 | 0 | 1100000.0 | 0 | 0 | 0 | 0.0 | 0.0 | 14800000 | 0.0 | 1200000.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 4000000 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 000fbd867 | 600000.0 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 2200000.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | ... | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0027d6b71 | 10000000.0 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | ... | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0028cbf45 | 2000000.0 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | ... | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 002a68644 | 14400000.0 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 2000000.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | ... | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Test Data¶
In [4]:
test_df.head()
Out[4]:
ID | 48df886f9 | 0deb4b6a8 | 34b15f335 | a8cb14b00 | 2f0771a37 | 30347e683 | d08d1fbe3 | 6ee66e115 | 20aa07010 | dc5a8f1d8 | 11d86fa6a | 77c9823f2 | 8d6c2a0b2 | 4681de4fd | adf119b9a | cff75dd09 | 96f83a237 | b8a716ebf | 6c7a4567c | 4fcfd2b4d | f3b9c0b95 | 71cebf11c | d966ac62c | 68b647452 | c88d108c9 | ff7b471cd | d5308d8bc | 0d866c3d7 | bc3f77679 | bd8f989f1 | 0eff5bf95 | 22ed6dba3 | 92b13ebba | c330f1a67 | 233c7c17c | 2cb4d123e | eeac16933 | 87ffda550 | 822e49b95 | ... | 969caa87a | 00302fe51 | 1189ee335 | ca04a07ca | f6f15ffa5 | 841704460 | ea5ed6ff7 | b1bb8eac3 | 8132d18b8 | c24ea6548 | cdfc2b069 | 2a879b4f7 | 6b119d8ce | 98dea9e42 | 9f2471031 | 88458cb21 | f40da20f4 | 7ad6b38bd | c901e7df1 | 8f55955dc | 85dcc913d | 5ca0b9b0c | eab8abf7a | 8d8bffbae | 2a1f6c7f9 | 9437d8b64 | 5831f4c76 | 2e84e09c5 | d45fd5508 | a165f5761 | 3ecc09859 | 9281abeea | 8675bec0b | 3a13ed79a | f677d4d13 | 71b203550 | 137efaa80 | fb36b89d9 | 7e293fbaf | 9fc776466 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000137c73 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.528249e+06 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 00021489f | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0004d7953 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5000000.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 00056a333 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.0 | 0.0 | 20884000.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 36252000.0 | 0.0 | 20000000.0 | 0.0 | ... | 0.0 | 0.0 | 25010000.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 00056d8eb | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Train and Test Data Info¶
In [5]:
train_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4459 entries, 0 to 4458 Columns: 4993 entries, ID to 9fc776466 dtypes: float64(1845), int64(3147), object(1) memory usage: 169.9+ MB
So there are a total of 4993 columns out of which 1845 are of type float64, 3147 are int64 and 1 is object (ID is the object column)
In [6]:
test_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 49342 entries, 0 to 49341 Columns: 4992 entries, ID to 9fc776466 dtypes: float64(4991), object(1) memory usage: 1.8+ GB
So there are a total of 4992 columns in the test set out of which 4991 are of type float64 and 1 is object (ID is the object column)
Check for Missing Values¶
In [7]:
#### Check if there are any NULL values in Train Data
print("Total Train Features with NaN Values = " + str(train_df.columns[train_df.isnull().sum() != 0].size))
if (train_df.columns[train_df.isnull().sum() != 0].size):
print("Features with NaN => {}".format(list(train_df.columns[train_df.isnull().sum() != 0])))
train_df[train_df.columns[train_df.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False)
Total Train Features with NaN Values = 0
In [8]:
#### Check if there are any NULL values in Test Data
print("Total Test Features with NaN Values = " + str(test_df.columns[test_df.isnull().sum() != 0].size))
if (test_df.columns[test_df.isnull().sum() != 0].size):
print("Features with NaN => {}".format(list(test_df.columns[test_df.isnull().sum() != 0])))
test_df[test_df.columns[test_df.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False)
Total Test Features with NaN Values = 0
Check and Remove Constant Features¶
In [9]:
# check and remove constant columns
colsToRemove = []
for col in train_df.columns:
if col != 'ID' and col != 'target':
if train_df[col].std() == 0:
colsToRemove.append(col)
# remove constant columns in the training set
train_df.drop(colsToRemove, axis=1, inplace=True)
# remove constant columns in the test set
test_df.drop(colsToRemove, axis=1, inplace=True)
print("Removed `{}` Constant Columns\n".format(len(colsToRemove)))
print(colsToRemove)
Removed `256` Constant Columns ['d5308d8bc', 'c330f1a67', 'eeac16933', '7df8788e8', '5b91580ee', '6f29fbbc7', '46dafc868', 'ae41a98b6', 'f416800e9', '6d07828ca', '7ac332a1d', '70ee7950a', '833b35a7c', '2f9969eab', '8b1372217', '68322788b', '2288ac1a6', 'dc7f76962', '467044c26', '39ebfbfd9', '9a5ff8c23', 'f6fac27c8', '664e2800e', 'ae28689a2', 'd87dcac58', '4065efbb6', 'f944d9d43', 'c2c4491d5', 'a4346e2e2', '1af366d4f', 'cfff5b7c8', 'da215e99e', '5acd26139', '9be9c6cef', '1210d0271', '21b0a54cb', 'da35e792b', '754c502dd', '0b346adbd', '0f196b049', 'b603ed95d', '2a50e001c', '1e81432e7', '10350ea43', '3c7c7e24c', '7585fce2a', '64d036163', 'f25d9935c', 'd98484125', '95c85e227', '9a5273600', '746cdb817', '6377a6293', '7d944fb0c', '87eb21c50', '5ea313a8c', '0987a65a1', '2fb7c2443', 'f5dde409b', '1ae50d4c3', '2b21cd7d8', '0db8a9272', '804d8b55b', '76f135fa6', '7d7182143', 'f88e61ae6', '378ed28e0', 'ca4ba131e', '1352ddae5', '2b601ad67', '6e42ff7c7', '22196a84c', '0e410eb3d', '992e6d1d3', '90a742107', '08b9ec4ae', 'd95203ded', '58ad51def', '9f69ae59f', '863de8a31', 'be10df47c', 'f006d9618', 'a7e39d23d', '5ed0abe85', '6c578fe94', '7fa4fcee9', '5e0571f07', 'fd5659511', 'e06b9f40f', 'c506599c8', '99de8c2dc', 'b05f4b229', '5e0834175', 'eb1cc0d9c', 'b281a62b9', '00fcf67e4', 'e37b65992', '2308e2b29', 'c342e8709', '708471ebf', 'f614aac15', '15ecf7b68', '3bfe540f1', '7a0d98f3c', 'e642315a5', 'c16d456a7', '0c9b5bcfa', 'b778ab129', '2ace87cdd', '697a566f0', '97b1f84fc', '34eff114b', '5281333d7', 'c89f3ba7e', 'cd6d3c7e6', 'fc7c8f2e8', 'abbbf9f82', '24a233e8f', '8e26b560e', 'a28ac1049', '504502ce1', 'd9a8615f3', '4efd6d283', '34cc56e83', '93e98252a', '2b6cef19e', 'c7f70a49b', '0d29ab7eb', 'e4a0d39b7', 'a4d1a8409', 'bc694fc8f', '3a36fc3a2', '4ffba44d3', '9bfdec4bc', '66a866d2f', 'f941e9df7', 'e7af4dbf3', 'dc9a54a3e', '748168a04', 'bba8ce4bb', 'ff6f62aa4', 'b06fe66ba', 'ae87ebc42', 'f26589e57', '963bb53b1', 'a531a4bf0', '9fc79985d', '9350d55c1', 'de06e884c', 'fc10bdf18', 'e0907e883', 'c586d79a1', 'e15e1513d', 'a06067897', '643e42fcb', '217cd3838', '047ebc242', '9b6ce40cf', '3b2c972b3', '17a7bf25a', 'c9028d46b', '9e0473c91', '6b041d374', '783c50218', '19122191d', 'ce573744f', '1c4ea481e', 'fbd6e0a0b', '69831c049', 'b87e3036b', '54ba515ee', 'a09ba0b15', '90f77ec55', 'fb02ef0ea', '3b0cccd29', 'fe9ed417c', '589e8bd6f', '17b5a03fd', '80e16b49a', 'a3d5c2c2a', '1bd3a4e92', '611d81daa', '3d7780b1c', '113fd0206', '5e5894826', 'cb36204f9', 'bc4e3d600', 'c66e2deb0', 'c25851298', 'a7f6de992', '3f93a3272', 'c1b95c2ec', '6bda21fee', '4a64e56e7', '943743753', '20854f8bf', 'ac2e428a9', '5ee7de0be', '316423a21', '2e52b0c6a', '8bdf6bc7e', '8f523faf2', '4758340d5', '8411096ec', '9678b95b7', 'a185e35cc', 'fa980a778', 'c8d90f7d7', '080540c81', '32591c8b4', '5779da33c', 'bb425b41e', '01599af81', '1654ab770', 'd334a588e', 'b4353599c', '51b53eaec', '2cc0fbc52', '45ffef194', 'c15ac04ee', '5b055c8ea', 'd0466eb58', 'a80633823', 'a117a5409', '7ddac276f', '8c32df8b3', 'e5649663e', '6c16efbb8', '9118fd5ca', 'ca8d565f1', '16a5bb8d2', 'fd6347461', 'f5179fb9c', '97428b646', 'f684b0a96', 'e4b2caa9f', '2c2d9f267', '96eb14eaf', 'cb2cb460c', '86f843927', 'ecd16fc60', '801c6dc8e', 'f859a25b8', 'ae846f332', '2252c7403', 'fb9e07326', 'd196ca1fd', 'a8e562e8e', 'eb6bb7ce1', '5beff147e', '52b347cdc', '4600aadcf', '6fa0b9dab', '43d70cc4d', '408021ef8', 'e29d22b59']
Remove Duplicate Columns¶
In [10]:
%%time
def duplicate_columns(frame):
groups = frame.columns.to_series().groupby(frame.dtypes).groups
dups = []
for t, v in groups.items():
cs = frame[v].columns
vs = frame[v]
lcs = len(cs)
for i in range(lcs):
ia = vs.iloc[:,i].values
for j in range(i+1, lcs):
ja = vs.iloc[:,j].values
if np.array_equal(ia, ja):
dups.append(cs[i])
break
return dups
colsToRemove = duplicate_columns(train_df)
print(colsToRemove)
['34ceb0081', '8d57e2749', '168b3e5bc', 'a765da8bc', 'acc5b709d'] CPU times: user 8min 50s, sys: 232 ms, total: 8min 51s Wall time: 8min 50s
In [11]:
# remove duplicate columns in the training set
train_df.drop(colsToRemove, axis=1, inplace=True)
# remove duplicate columns in the testing set
test_df.drop(colsToRemove, axis=1, inplace=True)
print("Removed `{}` Duplicate Columns\n".format(len(colsToRemove)))
print(colsToRemove)
Removed `5` Duplicate Columns ['34ceb0081', '8d57e2749', '168b3e5bc', 'a765da8bc', 'acc5b709d']
Drop Sparse Data¶
In [12]:
def drop_sparse(train, test):
flist = [x for x in train.columns if not x in ['ID','target']]
for f in flist:
if len(np.unique(train[f]))<2:
train.drop(f, axis=1, inplace=True)
test.drop(f, axis=1, inplace=True)
return train, test
In [13]:
%%time
train_df, test_df = drop_sparse(train_df, test_df)
CPU times: user 712 ms, sys: 0 ns, total: 712 ms Wall time: 712 ms
In [14]:
gc.collect()
print("Train set size: {}".format(train_df.shape))
print("Test set size: {}".format(test_df.shape))
Train set size: (4459, 4732) Test set size: (49342, 4731)
Build Train and Test Data for Modeling¶
In [15]:
X_train = train_df.drop(["ID", "target"], axis=1)
y_train = np.log1p(train_df["target"].values)
X_test = test_df.drop(["ID"], axis=1)
In [16]:
dev_X, val_X, dev_y, val_y = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)
LightGBM¶
In [17]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
params = {
"objective" : "regression",
"metric" : "rmse",
"num_leaves" : 40,
"learning_rate" : 0.004,
"bagging_fraction" : 0.6,
"feature_fraction" : 0.6,
"bagging_frequency" : 6,
"bagging_seed" : 42,
"verbosity" : -1,
"seed": 42
}
lgtrain = lgb.Dataset(train_X, label=train_y)
lgval = lgb.Dataset(val_X, label=val_y)
evals_result = {}
model = lgb.train(params, lgtrain, 5000,
valid_sets=[lgtrain, lgval],
early_stopping_rounds=100,
verbose_eval=150,
evals_result=evals_result)
pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
return pred_test_y, model, evals_result
In [18]:
# Training LGB
pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, X_test)
print("LightGBM Training Completed...")
Training until validation scores don't improve for 100 rounds. [150] training's rmse: 1.50793 valid_1's rmse: 1.53931 [300] training's rmse: 1.34402 valid_1's rmse: 1.46602 [450] training's rmse: 1.23291 valid_1's rmse: 1.43402 [600] training's rmse: 1.14927 valid_1's rmse: 1.41918 [750] training's rmse: 1.08343 valid_1's rmse: 1.41347 [900] training's rmse: 1.03011 valid_1's rmse: 1.41164 [1050] training's rmse: 0.985188 valid_1's rmse: 1.4116 Early stopping, best iteration is: [976] training's rmse: 1.00631 valid_1's rmse: 1.41125 LightGBM Training Completed...
In [19]:
# feature importance
print("Features Importance...")
gain = model.feature_importance('gain')
featureimp = pd.DataFrame({'feature':model.feature_name(),
'split':model.feature_importance('split'),
'gain':100 * gain / gain.sum()}).sort_values('gain', ascending=False)
print(featureimp[:50])
Features Importance... feature split gain 4130 f190486d6 803 9.017404 2375 58e2e02e6 715 5.294145 3465 eeb9cd3aa 675 4.294044 4020 15ace8c9f 541 3.095043 2614 9fd594eec 379 2.850291 8 20aa07010 409 2.112768 834 6eef030c1 342 1.400379 3571 58232a6fb 380 1.400253 1457 b43a7cfd5 399 1.187425 3661 491b9ee45 297 1.048269 2687 fb0f5dbfe 410 0.997311 1482 024c577b9 258 0.931900 3867 2288333b4 195 0.860454 2079 58e056e12 347 0.857672 4185 f74e8f13d 352 0.849962 4343 1702b5bf0 291 0.836605 4508 c47340d97 313 0.815153 828 6786ea46d 184 0.798005 566 66ace2992 276 0.778013 3722 d6bb78916 313 0.770098 3791 ed8ff54b5 169 0.764647 3220 ced6a7e91 240 0.720281 4028 5c6487af1 180 0.712760 3886 50e4f96cf 152 0.686449 863 fc99f9426 238 0.667129 1378 6cf7866c1 160 0.621800 34 87ffda550 162 0.604544 853 bc70cbc26 139 0.599770 3811 adb64ff71 235 0.597324 213 186b87c05 68 0.579914 3983 45f6d00da 229 0.575398 2616 fb387ea33 122 0.546694 2134 241f0f867 185 0.530336 624 0c9462c08 217 0.505927 4453 190db8488 190 0.490855 3509 13bdd610a 177 0.480766 1067 17b81a716 143 0.444314 537 26fc93eb7 184 0.442125 3779 70feb1494 160 0.441275 3849 73687e512 172 0.436898 1007 1c71183bb 201 0.426290 4341 e176a204a 176 0.418576 2211 1931ccfdd 171 0.412802 1044 edc84139a 201 0.412291 1748 5f341a818 147 0.394860 3150 f1e0ada11 168 0.381402 4316 c5a231d81 132 0.369029 1548 26ab20ff9 137 0.368083 4290 9280f3d04 190 0.363643 1848 5a1589f1a 196 0.360180
XGB Modeling¶
In [20]:
def run_xgb(train_X, train_y, val_X, val_y, test_X):
params = {'objective': 'reg:linear',
'eval_metric': 'rmse',
'eta': 0.001,
'max_depth': 10,
'subsample': 0.6,
'colsample_bytree': 0.6,
'alpha':0.001,
'random_state': 42,
'silent': True}
tr_data = xgb.DMatrix(train_X, train_y)
va_data = xgb.DMatrix(val_X, val_y)
watchlist = [(tr_data, 'train'), (va_data, 'valid')]
model_xgb = xgb.train(params, tr_data, 2000, watchlist, maximize=False, early_stopping_rounds = 100, verbose_eval=100)
dtest = xgb.DMatrix(test_X)
xgb_pred_y = np.expm1(model_xgb.predict(dtest, ntree_limit=model_xgb.best_ntree_limit))
return xgb_pred_y, model_xgb
In [21]:
# Training XGB
pred_test_xgb, model_xgb = run_xgb(dev_X, dev_y, val_X, val_y, X_test)
print("XGB Training Completed...")
[0] train-rmse:14.0877 valid-rmse:14.0768 Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping. Will train until valid-rmse hasn't improved in 100 rounds. [100] train-rmse:12.7683 valid-rmse:12.756 [200] train-rmse:11.5765 valid-rmse:11.5629 [300] train-rmse:10.4999 valid-rmse:10.4854 [400] train-rmse:9.52802 valid-rmse:9.51304 [500] train-rmse:8.65065 valid-rmse:8.6361 [600] train-rmse:7.85865 valid-rmse:7.84473 [700] train-rmse:7.14388 valid-rmse:7.13201 [800] train-rmse:6.49911 valid-rmse:6.48993 [900] train-rmse:5.91691 valid-rmse:5.91106 [1000] train-rmse:5.3925 valid-rmse:5.39137 [1100] train-rmse:4.9196 valid-rmse:4.92423 [1200] train-rmse:4.49343 valid-rmse:4.50457 [1300] train-rmse:4.10977 valid-rmse:4.12874 [1400] train-rmse:3.76474 valid-rmse:3.79266 [1500] train-rmse:3.45493 valid-rmse:3.49292 [1600] train-rmse:3.17654 valid-rmse:3.22512 [1700] train-rmse:2.92698 valid-rmse:2.98712 [1800] train-rmse:2.70352 valid-rmse:2.77675 [1900] train-rmse:2.50368 valid-rmse:2.59047 [1999] train-rmse:2.32698 valid-rmse:2.42818 XGB Training Completed...
Catboost¶
In [22]:
cb_model = CatBoostRegressor(iterations=500,
learning_rate=0.05,
depth=10,
eval_metric='RMSE',
random_seed = 42,
bagging_temperature = 0.2,
od_type='Iter',
metric_period = 50,
od_wait=20)
In [23]:
cb_model.fit(dev_X, dev_y,
eval_set=(val_X, val_y),
use_best_model=True,
verbose=50)
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: learn: 13.8879486 test: 13.8753251 best: 13.8753251 (0) total: 12.6s remaining: 1h 45m 6s 50: learn: 2.0469297 test: 2.0667457 best: 2.0667457 (50) total: 10m 51s remaining: 1h 35m 35s 100: learn: 1.6177240 test: 1.6331471 best: 1.6331471 (100) total: 21m 29s remaining: 1h 24m 54s 150: learn: 1.5794666 test: 1.6037148 best: 1.6037148 (150) total: 31m 55s remaining: 1h 13m 46s 200: learn: 1.5547877 test: 1.5898497 best: 1.5898497 (200) total: 42m 33s remaining: 1h 3m 19s 250: learn: 1.4868878 test: 1.5562798 best: 1.5562568 (249) total: 53m 9s remaining: 52m 43s 300: learn: 1.4428643 test: 1.5388335 best: 1.5388335 (300) total: 1h 3m 45s remaining: 42m 9s 350: learn: 1.3926213 test: 1.5243028 best: 1.5241373 (348) total: 1h 14m 26s remaining: 31m 35s 400: learn: 1.3574712 test: 1.5164978 best: 1.5164978 (400) total: 1h 25m 7s remaining: 21m 450: learn: 1.3343425 test: 1.5112868 best: 1.5112868 (450) total: 1h 35m 42s remaining: 10m 23s 499: learn: 1.3111233 test: 1.5077663 best: 1.5077360 (498) total: 1h 46m 9s remaining: 0us bestTest = 1.50773601 bestIteration = 498 Shrink model to first 499 iterations.
Out[23]:
<catboost.core.CatBoostRegressor at 0x7fd599df12b0>
In [24]:
pred_test_cat = np.expm1(cb_model.predict(X_test))
Combine Predictions¶
In [25]:
sub = pd.read_csv('../input/sample_submission.csv')
sub_lgb = pd.DataFrame()
sub_lgb["target"] = pred_test
sub_xgb = pd.DataFrame()
sub_xgb["target"] = pred_test_xgb
sub_cat = pd.DataFrame()
sub_cat["target"] = pred_test_cat
sub["target"] = (sub_lgb["target"] * 0.5 + sub_xgb["target"] * 0.3 + sub_cat["target"] * 0.2)
In [26]:
print(sub.head())
sub.to_csv('sub_lgb_xgb_cat.csv', index=False)
ID target 0 000137c73 1.295943e+06 1 00021489f 1.282743e+06 2 0004d7953 1.782904e+06 3 00056a333 3.562641e+06 4 00056d8eb 1.354023e+06