Load Required Libraries¶

In [1]:
### Import required libraries

import numpy as np
import pandas as pd
import gc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from IPython.display import display # Allows the use of display() for DataFrames

import warnings
warnings.filterwarnings('ignore')

Load Train and Test Data¶

In [2]:
# Read train and test files
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

Train Data¶

In [3]:
train_df.head()
Out[3]:
ID target 48df886f9 0deb4b6a8 34b15f335 a8cb14b00 2f0771a37 30347e683 d08d1fbe3 6ee66e115 20aa07010 dc5a8f1d8 11d86fa6a 77c9823f2 8d6c2a0b2 4681de4fd adf119b9a cff75dd09 96f83a237 b8a716ebf 6c7a4567c 4fcfd2b4d f3b9c0b95 71cebf11c d966ac62c 68b647452 c88d108c9 ff7b471cd d5308d8bc 0d866c3d7 bc3f77679 bd8f989f1 0eff5bf95 22ed6dba3 92b13ebba c330f1a67 233c7c17c 2cb4d123e eeac16933 87ffda550 ... 969caa87a 00302fe51 1189ee335 ca04a07ca f6f15ffa5 841704460 ea5ed6ff7 b1bb8eac3 8132d18b8 c24ea6548 cdfc2b069 2a879b4f7 6b119d8ce 98dea9e42 9f2471031 88458cb21 f40da20f4 7ad6b38bd c901e7df1 8f55955dc 85dcc913d 5ca0b9b0c eab8abf7a 8d8bffbae 2a1f6c7f9 9437d8b64 5831f4c76 2e84e09c5 d45fd5508 a165f5761 3ecc09859 9281abeea 8675bec0b 3a13ed79a f677d4d13 71b203550 137efaa80 fb36b89d9 7e293fbaf 9fc776466
0 000d6aaf2 38000000.0 0.0 0 0.0 0 0 0 0 0 0.0 0.0 0.0 0 0 0 0 0.0 0 0.0 0 0 0.0 0 0 0.0 0 0 0 0.0 0 0.0 0 0.0 0.0 0 0.0 0 0 1300000.0 ... 0 0 1100000.0 0 0 0 0.0 0.0 14800000 0.0 1200000.0 0.0 0.0 0 0 0 0 0.0 4000000 0 0.0 0 0 0.0 0 0 0.0 0 0 0.0 0.0 0.0 0.0 0 0 0 0 0 0 0
1 000fbd867 600000.0 0.0 0 0.0 0 0 0 0 0 2200000.0 0.0 0.0 0 0 0 0 0.0 0 0.0 0 0 0.0 0 0 0.0 0 0 0 0.0 0 0.0 0 0.0 0.0 0 0.0 0 0 0.0 ... 0 0 0.0 0 0 0 0.0 0.0 0 0.0 0.0 0.0 0.0 0 0 0 0 0.0 0 0 0.0 0 0 0.0 0 0 0.0 0 0 0.0 0.0 0.0 0.0 0 0 0 0 0 0 0
2 0027d6b71 10000000.0 0.0 0 0.0 0 0 0 0 0 0.0 0.0 0.0 0 0 0 0 0.0 0 0.0 0 0 0.0 0 0 0.0 0 0 0 0.0 0 0.0 0 0.0 0.0 0 0.0 0 0 0.0 ... 0 0 0.0 0 0 0 0.0 0.0 0 0.0 0.0 0.0 0.0 0 0 0 0 0.0 0 0 0.0 0 0 0.0 0 0 0.0 0 0 0.0 0.0 0.0 0.0 0 0 0 0 0 0 0
3 0028cbf45 2000000.0 0.0 0 0.0 0 0 0 0 0 0.0 0.0 0.0 0 0 0 0 0.0 0 0.0 0 0 0.0 0 0 0.0 0 0 0 0.0 0 0.0 0 0.0 0.0 0 0.0 0 0 0.0 ... 0 0 0.0 0 0 0 0.0 0.0 0 0.0 0.0 0.0 0.0 0 0 0 0 0.0 0 0 0.0 0 0 0.0 0 0 0.0 0 0 0.0 0.0 0.0 0.0 0 0 0 0 0 0 0
4 002a68644 14400000.0 0.0 0 0.0 0 0 0 0 0 2000000.0 0.0 0.0 0 0 0 0 0.0 0 0.0 0 0 0.0 0 0 0.0 0 0 0 0.0 0 0.0 0 0.0 0.0 0 0.0 0 0 0.0 ... 0 0 0.0 0 0 0 0.0 0.0 0 0.0 0.0 0.0 0.0 0 0 0 0 0.0 0 0 0.0 0 0 0.0 0 0 0.0 0 0 0.0 0.0 0.0 0.0 0 0 0 0 0 0 0

Test Data¶

In [4]:
test_df.head()
Out[4]:
ID 48df886f9 0deb4b6a8 34b15f335 a8cb14b00 2f0771a37 30347e683 d08d1fbe3 6ee66e115 20aa07010 dc5a8f1d8 11d86fa6a 77c9823f2 8d6c2a0b2 4681de4fd adf119b9a cff75dd09 96f83a237 b8a716ebf 6c7a4567c 4fcfd2b4d f3b9c0b95 71cebf11c d966ac62c 68b647452 c88d108c9 ff7b471cd d5308d8bc 0d866c3d7 bc3f77679 bd8f989f1 0eff5bf95 22ed6dba3 92b13ebba c330f1a67 233c7c17c 2cb4d123e eeac16933 87ffda550 822e49b95 ... 969caa87a 00302fe51 1189ee335 ca04a07ca f6f15ffa5 841704460 ea5ed6ff7 b1bb8eac3 8132d18b8 c24ea6548 cdfc2b069 2a879b4f7 6b119d8ce 98dea9e42 9f2471031 88458cb21 f40da20f4 7ad6b38bd c901e7df1 8f55955dc 85dcc913d 5ca0b9b0c eab8abf7a 8d8bffbae 2a1f6c7f9 9437d8b64 5831f4c76 2e84e09c5 d45fd5508 a165f5761 3ecc09859 9281abeea 8675bec0b 3a13ed79a f677d4d13 71b203550 137efaa80 fb36b89d9 7e293fbaf 9fc776466
0 000137c73 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.528249e+06 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 00021489f 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000e+00 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0004d7953 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000e+00 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5000000.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 00056a333 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000e+00 0.0 0.0 20884000.0 0.0 0.0 0.0 0.0 0.0 36252000.0 0.0 20000000.0 0.0 ... 0.0 0.0 25010000.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 00056d8eb 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000e+00 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

Train and Test Data Info¶

In [5]:
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4993 entries, ID to 9fc776466
dtypes: float64(1845), int64(3147), object(1)
memory usage: 169.9+ MB

So there are a total of 4993 columns out of which 1845 are of type float64, 3147 are int64 and 1 is object (ID is the object column)

In [6]:
test_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49342 entries, 0 to 49341
Columns: 4992 entries, ID to 9fc776466
dtypes: float64(4991), object(1)
memory usage: 1.8+ GB

So there are a total of 4992 columns in the test set out of which 4991 are of type float64 and 1 is object (ID is the object column)

Check for Missing Values¶

In [7]:
#### Check if there are any NULL values in Train Data
print("Total Train Features with NaN Values = " + str(train_df.columns[train_df.isnull().sum() != 0].size))
if (train_df.columns[train_df.isnull().sum() != 0].size):
    print("Features with NaN => {}".format(list(train_df.columns[train_df.isnull().sum() != 0])))
    train_df[train_df.columns[train_df.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False)
Total Train Features with NaN Values = 0
In [8]:
#### Check if there are any NULL values in Test Data
print("Total Test Features with NaN Values = " + str(test_df.columns[test_df.isnull().sum() != 0].size))
if (test_df.columns[test_df.isnull().sum() != 0].size):
    print("Features with NaN => {}".format(list(test_df.columns[test_df.isnull().sum() != 0])))
    test_df[test_df.columns[test_df.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False)
Total Test Features with NaN Values = 0

Check and Remove Constant Features¶

In [9]:
# check and remove constant columns
colsToRemove = []
for col in train_df.columns:
    if col != 'ID' and col != 'target':
        if train_df[col].std() == 0: 
            colsToRemove.append(col)
        
# remove constant columns in the training set
train_df.drop(colsToRemove, axis=1, inplace=True)

# remove constant columns in the test set
test_df.drop(colsToRemove, axis=1, inplace=True) 

print("Removed `{}` Constant Columns\n".format(len(colsToRemove)))
print(colsToRemove)
Removed `256` Constant Columns

['d5308d8bc', 'c330f1a67', 'eeac16933', '7df8788e8', '5b91580ee', '6f29fbbc7', '46dafc868', 'ae41a98b6', 'f416800e9', '6d07828ca', '7ac332a1d', '70ee7950a', '833b35a7c', '2f9969eab', '8b1372217', '68322788b', '2288ac1a6', 'dc7f76962', '467044c26', '39ebfbfd9', '9a5ff8c23', 'f6fac27c8', '664e2800e', 'ae28689a2', 'd87dcac58', '4065efbb6', 'f944d9d43', 'c2c4491d5', 'a4346e2e2', '1af366d4f', 'cfff5b7c8', 'da215e99e', '5acd26139', '9be9c6cef', '1210d0271', '21b0a54cb', 'da35e792b', '754c502dd', '0b346adbd', '0f196b049', 'b603ed95d', '2a50e001c', '1e81432e7', '10350ea43', '3c7c7e24c', '7585fce2a', '64d036163', 'f25d9935c', 'd98484125', '95c85e227', '9a5273600', '746cdb817', '6377a6293', '7d944fb0c', '87eb21c50', '5ea313a8c', '0987a65a1', '2fb7c2443', 'f5dde409b', '1ae50d4c3', '2b21cd7d8', '0db8a9272', '804d8b55b', '76f135fa6', '7d7182143', 'f88e61ae6', '378ed28e0', 'ca4ba131e', '1352ddae5', '2b601ad67', '6e42ff7c7', '22196a84c', '0e410eb3d', '992e6d1d3', '90a742107', '08b9ec4ae', 'd95203ded', '58ad51def', '9f69ae59f', '863de8a31', 'be10df47c', 'f006d9618', 'a7e39d23d', '5ed0abe85', '6c578fe94', '7fa4fcee9', '5e0571f07', 'fd5659511', 'e06b9f40f', 'c506599c8', '99de8c2dc', 'b05f4b229', '5e0834175', 'eb1cc0d9c', 'b281a62b9', '00fcf67e4', 'e37b65992', '2308e2b29', 'c342e8709', '708471ebf', 'f614aac15', '15ecf7b68', '3bfe540f1', '7a0d98f3c', 'e642315a5', 'c16d456a7', '0c9b5bcfa', 'b778ab129', '2ace87cdd', '697a566f0', '97b1f84fc', '34eff114b', '5281333d7', 'c89f3ba7e', 'cd6d3c7e6', 'fc7c8f2e8', 'abbbf9f82', '24a233e8f', '8e26b560e', 'a28ac1049', '504502ce1', 'd9a8615f3', '4efd6d283', '34cc56e83', '93e98252a', '2b6cef19e', 'c7f70a49b', '0d29ab7eb', 'e4a0d39b7', 'a4d1a8409', 'bc694fc8f', '3a36fc3a2', '4ffba44d3', '9bfdec4bc', '66a866d2f', 'f941e9df7', 'e7af4dbf3', 'dc9a54a3e', '748168a04', 'bba8ce4bb', 'ff6f62aa4', 'b06fe66ba', 'ae87ebc42', 'f26589e57', '963bb53b1', 'a531a4bf0', '9fc79985d', '9350d55c1', 'de06e884c', 'fc10bdf18', 'e0907e883', 'c586d79a1', 'e15e1513d', 'a06067897', '643e42fcb', '217cd3838', '047ebc242', '9b6ce40cf', '3b2c972b3', '17a7bf25a', 'c9028d46b', '9e0473c91', '6b041d374', '783c50218', '19122191d', 'ce573744f', '1c4ea481e', 'fbd6e0a0b', '69831c049', 'b87e3036b', '54ba515ee', 'a09ba0b15', '90f77ec55', 'fb02ef0ea', '3b0cccd29', 'fe9ed417c', '589e8bd6f', '17b5a03fd', '80e16b49a', 'a3d5c2c2a', '1bd3a4e92', '611d81daa', '3d7780b1c', '113fd0206', '5e5894826', 'cb36204f9', 'bc4e3d600', 'c66e2deb0', 'c25851298', 'a7f6de992', '3f93a3272', 'c1b95c2ec', '6bda21fee', '4a64e56e7', '943743753', '20854f8bf', 'ac2e428a9', '5ee7de0be', '316423a21', '2e52b0c6a', '8bdf6bc7e', '8f523faf2', '4758340d5', '8411096ec', '9678b95b7', 'a185e35cc', 'fa980a778', 'c8d90f7d7', '080540c81', '32591c8b4', '5779da33c', 'bb425b41e', '01599af81', '1654ab770', 'd334a588e', 'b4353599c', '51b53eaec', '2cc0fbc52', '45ffef194', 'c15ac04ee', '5b055c8ea', 'd0466eb58', 'a80633823', 'a117a5409', '7ddac276f', '8c32df8b3', 'e5649663e', '6c16efbb8', '9118fd5ca', 'ca8d565f1', '16a5bb8d2', 'fd6347461', 'f5179fb9c', '97428b646', 'f684b0a96', 'e4b2caa9f', '2c2d9f267', '96eb14eaf', 'cb2cb460c', '86f843927', 'ecd16fc60', '801c6dc8e', 'f859a25b8', 'ae846f332', '2252c7403', 'fb9e07326', 'd196ca1fd', 'a8e562e8e', 'eb6bb7ce1', '5beff147e', '52b347cdc', '4600aadcf', '6fa0b9dab', '43d70cc4d', '408021ef8', 'e29d22b59']

Remove Duplicate Columns¶

In [10]:
%%time
def duplicate_columns(frame):
    groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []

    for t, v in groups.items():

        cs = frame[v].columns
        vs = frame[v]
        lcs = len(cs)

        for i in range(lcs):
            ia = vs.iloc[:,i].values
            for j in range(i+1, lcs):
                ja = vs.iloc[:,j].values
                if np.array_equal(ia, ja):
                    dups.append(cs[i])
                    break

    return dups

colsToRemove = duplicate_columns(train_df)
print(colsToRemove)
['34ceb0081', '8d57e2749', '168b3e5bc', 'a765da8bc', 'acc5b709d']
CPU times: user 8min 50s, sys: 232 ms, total: 8min 51s
Wall time: 8min 50s
In [11]:
# remove duplicate columns in the training set
train_df.drop(colsToRemove, axis=1, inplace=True) 

# remove duplicate columns in the testing set
test_df.drop(colsToRemove, axis=1, inplace=True)

print("Removed `{}` Duplicate Columns\n".format(len(colsToRemove)))
print(colsToRemove)
Removed `5` Duplicate Columns

['34ceb0081', '8d57e2749', '168b3e5bc', 'a765da8bc', 'acc5b709d']

Drop Sparse Data¶

In [12]:
def drop_sparse(train, test):
    flist = [x for x in train.columns if not x in ['ID','target']]
    for f in flist:
        if len(np.unique(train[f]))<2:
            train.drop(f, axis=1, inplace=True)
            test.drop(f, axis=1, inplace=True)
    return train, test
In [13]:
%%time
train_df, test_df = drop_sparse(train_df, test_df)
CPU times: user 712 ms, sys: 0 ns, total: 712 ms
Wall time: 712 ms
In [14]:
gc.collect()
print("Train set size: {}".format(train_df.shape))
print("Test set size: {}".format(test_df.shape))
Train set size: (4459, 4732)
Test set size: (49342, 4731)

Build Train and Test Data for Modeling¶

In [15]:
X_train = train_df.drop(["ID", "target"], axis=1)
y_train = np.log1p(train_df["target"].values)

X_test = test_df.drop(["ID"], axis=1)
In [16]:
dev_X, val_X, dev_y, val_y = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

LightGBM¶

In [17]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.004,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 5000, 
                      valid_sets=[lgtrain, lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=150, 
                      evals_result=evals_result)
    
    pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
    return pred_test_y, model, evals_result
In [18]:
# Training LGB
pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, X_test)
print("LightGBM Training Completed...")
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.50793	valid_1's rmse: 1.53931
[300]	training's rmse: 1.34402	valid_1's rmse: 1.46602
[450]	training's rmse: 1.23291	valid_1's rmse: 1.43402
[600]	training's rmse: 1.14927	valid_1's rmse: 1.41918
[750]	training's rmse: 1.08343	valid_1's rmse: 1.41347
[900]	training's rmse: 1.03011	valid_1's rmse: 1.41164
[1050]	training's rmse: 0.985188	valid_1's rmse: 1.4116
Early stopping, best iteration is:
[976]	training's rmse: 1.00631	valid_1's rmse: 1.41125
LightGBM Training Completed...
In [19]:
# feature importance
print("Features Importance...")
gain = model.feature_importance('gain')
featureimp = pd.DataFrame({'feature':model.feature_name(), 
                   'split':model.feature_importance('split'), 
                   'gain':100 * gain / gain.sum()}).sort_values('gain', ascending=False)
print(featureimp[:50])
Features Importance...
        feature  split      gain
4130  f190486d6    803  9.017404
2375  58e2e02e6    715  5.294145
3465  eeb9cd3aa    675  4.294044
4020  15ace8c9f    541  3.095043
2614  9fd594eec    379  2.850291
8     20aa07010    409  2.112768
834   6eef030c1    342  1.400379
3571  58232a6fb    380  1.400253
1457  b43a7cfd5    399  1.187425
3661  491b9ee45    297  1.048269
2687  fb0f5dbfe    410  0.997311
1482  024c577b9    258  0.931900
3867  2288333b4    195  0.860454
2079  58e056e12    347  0.857672
4185  f74e8f13d    352  0.849962
4343  1702b5bf0    291  0.836605
4508  c47340d97    313  0.815153
828   6786ea46d    184  0.798005
566   66ace2992    276  0.778013
3722  d6bb78916    313  0.770098
3791  ed8ff54b5    169  0.764647
3220  ced6a7e91    240  0.720281
4028  5c6487af1    180  0.712760
3886  50e4f96cf    152  0.686449
863   fc99f9426    238  0.667129
1378  6cf7866c1    160  0.621800
34    87ffda550    162  0.604544
853   bc70cbc26    139  0.599770
3811  adb64ff71    235  0.597324
213   186b87c05     68  0.579914
3983  45f6d00da    229  0.575398
2616  fb387ea33    122  0.546694
2134  241f0f867    185  0.530336
624   0c9462c08    217  0.505927
4453  190db8488    190  0.490855
3509  13bdd610a    177  0.480766
1067  17b81a716    143  0.444314
537   26fc93eb7    184  0.442125
3779  70feb1494    160  0.441275
3849  73687e512    172  0.436898
1007  1c71183bb    201  0.426290
4341  e176a204a    176  0.418576
2211  1931ccfdd    171  0.412802
1044  edc84139a    201  0.412291
1748  5f341a818    147  0.394860
3150  f1e0ada11    168  0.381402
4316  c5a231d81    132  0.369029
1548  26ab20ff9    137  0.368083
4290  9280f3d04    190  0.363643
1848  5a1589f1a    196  0.360180

XGB Modeling¶

In [20]:
def run_xgb(train_X, train_y, val_X, val_y, test_X):
    params = {'objective': 'reg:linear', 
          'eval_metric': 'rmse',
          'eta': 0.001,
          'max_depth': 10, 
          'subsample': 0.6, 
          'colsample_bytree': 0.6,
          'alpha':0.001,
          'random_state': 42, 
          'silent': True}
    
    tr_data = xgb.DMatrix(train_X, train_y)
    va_data = xgb.DMatrix(val_X, val_y)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model_xgb = xgb.train(params, tr_data, 2000, watchlist, maximize=False, early_stopping_rounds = 100, verbose_eval=100)
    
    dtest = xgb.DMatrix(test_X)
    xgb_pred_y = np.expm1(model_xgb.predict(dtest, ntree_limit=model_xgb.best_ntree_limit))
    
    return xgb_pred_y, model_xgb
In [21]:
# Training XGB
pred_test_xgb, model_xgb = run_xgb(dev_X, dev_y, val_X, val_y, X_test)
print("XGB Training Completed...")
[0]	train-rmse:14.0877	valid-rmse:14.0768
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:12.7683	valid-rmse:12.756
[200]	train-rmse:11.5765	valid-rmse:11.5629
[300]	train-rmse:10.4999	valid-rmse:10.4854
[400]	train-rmse:9.52802	valid-rmse:9.51304
[500]	train-rmse:8.65065	valid-rmse:8.6361
[600]	train-rmse:7.85865	valid-rmse:7.84473
[700]	train-rmse:7.14388	valid-rmse:7.13201
[800]	train-rmse:6.49911	valid-rmse:6.48993
[900]	train-rmse:5.91691	valid-rmse:5.91106
[1000]	train-rmse:5.3925	valid-rmse:5.39137
[1100]	train-rmse:4.9196	valid-rmse:4.92423
[1200]	train-rmse:4.49343	valid-rmse:4.50457
[1300]	train-rmse:4.10977	valid-rmse:4.12874
[1400]	train-rmse:3.76474	valid-rmse:3.79266
[1500]	train-rmse:3.45493	valid-rmse:3.49292
[1600]	train-rmse:3.17654	valid-rmse:3.22512
[1700]	train-rmse:2.92698	valid-rmse:2.98712
[1800]	train-rmse:2.70352	valid-rmse:2.77675
[1900]	train-rmse:2.50368	valid-rmse:2.59047
[1999]	train-rmse:2.32698	valid-rmse:2.42818
XGB Training Completed...

Catboost¶

In [22]:
cb_model = CatBoostRegressor(iterations=500,
                             learning_rate=0.05,
                             depth=10,
                             eval_metric='RMSE',
                             random_seed = 42,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             metric_period = 50,
                             od_wait=20)
In [23]:
cb_model.fit(dev_X, dev_y,
             eval_set=(val_X, val_y),
             use_best_model=True,
             verbose=50)
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:	learn: 13.8879486	test: 13.8753251	best: 13.8753251 (0)	total: 12.6s	remaining: 1h 45m 6s
50:	learn: 2.0469297	test: 2.0667457	best: 2.0667457 (50)	total: 10m 51s	remaining: 1h 35m 35s
100:	learn: 1.6177240	test: 1.6331471	best: 1.6331471 (100)	total: 21m 29s	remaining: 1h 24m 54s
150:	learn: 1.5794666	test: 1.6037148	best: 1.6037148 (150)	total: 31m 55s	remaining: 1h 13m 46s
200:	learn: 1.5547877	test: 1.5898497	best: 1.5898497 (200)	total: 42m 33s	remaining: 1h 3m 19s
250:	learn: 1.4868878	test: 1.5562798	best: 1.5562568 (249)	total: 53m 9s	remaining: 52m 43s
300:	learn: 1.4428643	test: 1.5388335	best: 1.5388335 (300)	total: 1h 3m 45s	remaining: 42m 9s
350:	learn: 1.3926213	test: 1.5243028	best: 1.5241373 (348)	total: 1h 14m 26s	remaining: 31m 35s
400:	learn: 1.3574712	test: 1.5164978	best: 1.5164978 (400)	total: 1h 25m 7s	remaining: 21m
450:	learn: 1.3343425	test: 1.5112868	best: 1.5112868 (450)	total: 1h 35m 42s	remaining: 10m 23s
499:	learn: 1.3111233	test: 1.5077663	best: 1.5077360 (498)	total: 1h 46m 9s	remaining: 0us

bestTest = 1.50773601
bestIteration = 498

Shrink model to first 499 iterations.
Out[23]:
<catboost.core.CatBoostRegressor at 0x7fd599df12b0>
In [24]:
pred_test_cat = np.expm1(cb_model.predict(X_test))

Combine Predictions¶

In [25]:
sub = pd.read_csv('../input/sample_submission.csv')

sub_lgb = pd.DataFrame()
sub_lgb["target"] = pred_test

sub_xgb = pd.DataFrame()
sub_xgb["target"] = pred_test_xgb

sub_cat = pd.DataFrame()
sub_cat["target"] = pred_test_cat

sub["target"] = (sub_lgb["target"] * 0.5 + sub_xgb["target"] * 0.3 + sub_cat["target"] * 0.2)
In [26]:
print(sub.head())
sub.to_csv('sub_lgb_xgb_cat.csv', index=False)
          ID        target
0  000137c73  1.295943e+06
1  00021489f  1.282743e+06
2  0004d7953  1.782904e+06
3  00056a333  3.562641e+06
4  00056d8eb  1.354023e+06