### Import required libraries

import numpy as np
import pandas as pd
import gc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from IPython.display import display # Allows the use of display() for DataFrames

import warnings
warnings.filterwarnings('ignore')

# Read train and test files
# Both CSVs are loaded fully into memory from the relative ../input/ directory.
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

train_df.head()

test_df.head()

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4993 entries, ID to 9fc776466
dtypes: float64(1845), int64(3147), object(1)
memory usage: 169.9+ MB

test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49342 entries, 0 to 49341
Columns: 4992 entries, ID to 9fc776466
dtypes: float64(4991), object(1)
memory usage: 1.8+ GB

#### Check if there are any NULL values in Train Data
# Count missing values per column once and reuse the result for every report line.
train_nan_counts = train_df.isnull().sum()
train_nan_cols = train_df.columns[train_nan_counts != 0]

print("Total Train Features with NaN Values = " + str(train_nan_cols.size))
if train_nan_cols.size:
    print("Features with NaN => {}".format(list(train_nan_cols)))
    # A bare expression inside an `if` body is not echoed by the notebook,
    # so the per-column NaN counts have to be printed explicitly.
    print(train_nan_counts[train_nan_cols].sort_values(ascending=False))

Total Train Features with NaN Values = 0

#### Check if there are any NULL values in Test Data
# Count missing values per column once and reuse the result for every report line.
test_nan_counts = test_df.isnull().sum()
test_nan_cols = test_df.columns[test_nan_counts != 0]

print("Total Test Features with NaN Values = " + str(test_nan_cols.size))
if test_nan_cols.size:
    print("Features with NaN => {}".format(list(test_nan_cols)))
    # A bare expression inside an `if` body is not echoed by the notebook,
    # so the per-column NaN counts have to be printed explicitly.
    print(test_nan_counts[test_nan_cols].sort_values(ascending=False))

Total Test Features with NaN Values = 0

# check and remove constant columns
# A feature is treated as constant when its standard deviation over the
# training rows is exactly zero; 'ID' and 'target' are never candidates.
colsToRemove = [
    name
    for name in train_df.columns
    if name not in ('ID', 'target') and train_df[name].std() == 0
]

# The same columns are removed from both frames so their feature sets stay aligned.
train_df.drop(columns=colsToRemove, inplace=True)
test_df.drop(columns=colsToRemove, inplace=True)

print("Removed `{}` Constant Columns\n".format(len(colsToRemove)))
print(colsToRemove)

Removed `256` Constant Columns

['d5308d8bc', 'c330f1a67', 'eeac16933', '7df8788e8', '5b91580ee', '6f29fbbc7', '46dafc868', 'ae41a98b6', 'f416800e9', '6d07828ca', '7ac332a1d', '70ee7950a', '833b35a7c', '2f9969eab', '8b1372217', '68322788b', '2288ac1a6', 'dc7f76962', '467044c26', '39ebfbfd9', '9a5ff8c23', 'f6fac27c8', '664e2800e', 'ae28689a2', 'd87dcac58', '4065efbb6', 'f944d9d43', 'c2c4491d5', 'a4346e2e2', '1af366d4f', 'cfff5b7c8', 'da215e99e', '5acd26139', '9be9c6cef', '1210d0271', '21b0a54cb', 'da35e792b', '754c502dd', '0b346adbd', '0f196b049', 'b603ed95d', '2a50e001c', '1e81432e7', '10350ea43', '3c7c7e24c', '7585fce2a', '64d036163', 'f25d9935c', 'd98484125', '95c85e227', '9a5273600', '746cdb817', '6377a6293', '7d944fb0c', '87eb21c50', '5ea313a8c', '0987a65a1', '2fb7c2443', 'f5dde409b', '1ae50d4c3', '2b21cd7d8', '0db8a9272', '804d8b55b', '76f135fa6', '7d7182143', 'f88e61ae6', '378ed28e0', 'ca4ba131e', '1352ddae5', '2b601ad67', '6e42ff7c7', '22196a84c', '0e410eb3d', '992e6d1d3', '90a742107', '08b9ec4ae', 'd95203ded', '58ad51def', '9f69ae59f', '863de8a31', 'be10df47c', 'f006d9618', 'a7e39d23d', '5ed0abe85', '6c578fe94', '7fa4fcee9', '5e0571f07', 'fd5659511', 'e06b9f40f', 'c506599c8', '99de8c2dc', 'b05f4b229', '5e0834175', 'eb1cc0d9c', 'b281a62b9', '00fcf67e4', 'e37b65992', '2308e2b29', 'c342e8709', '708471ebf', 'f614aac15', '15ecf7b68', '3bfe540f1', '7a0d98f3c', 'e642315a5', 'c16d456a7', '0c9b5bcfa', 'b778ab129', '2ace87cdd', '697a566f0', '97b1f84fc', '34eff114b', '5281333d7', 'c89f3ba7e', 'cd6d3c7e6', 'fc7c8f2e8', 'abbbf9f82', '24a233e8f', '8e26b560e', 'a28ac1049', '504502ce1', 'd9a8615f3', '4efd6d283', '34cc56e83', '93e98252a', '2b6cef19e', 'c7f70a49b', '0d29ab7eb', 'e4a0d39b7', 'a4d1a8409', 'bc694fc8f', '3a36fc3a2', '4ffba44d3', '9bfdec4bc', '66a866d2f', 'f941e9df7', 'e7af4dbf3', 'dc9a54a3e', '748168a04', 'bba8ce4bb', 'ff6f62aa4', 'b06fe66ba', 'ae87ebc42', 'f26589e57', '963bb53b1', 'a531a4bf0', '9fc79985d', '9350d55c1', 'de06e884c', 'fc10bdf18', 'e0907e883', 'c586d79a1', 'e15e1513d', 
'a06067897', '643e42fcb', '217cd3838', '047ebc242', '9b6ce40cf', '3b2c972b3', '17a7bf25a', 'c9028d46b', '9e0473c91', '6b041d374', '783c50218', '19122191d', 'ce573744f', '1c4ea481e', 'fbd6e0a0b', '69831c049', 'b87e3036b', '54ba515ee', 'a09ba0b15', '90f77ec55', 'fb02ef0ea', '3b0cccd29', 'fe9ed417c', '589e8bd6f', '17b5a03fd', '80e16b49a', 'a3d5c2c2a', '1bd3a4e92', '611d81daa', '3d7780b1c', '113fd0206', '5e5894826', 'cb36204f9', 'bc4e3d600', 'c66e2deb0', 'c25851298', 'a7f6de992', '3f93a3272', 'c1b95c2ec', '6bda21fee', '4a64e56e7', '943743753', '20854f8bf', 'ac2e428a9', '5ee7de0be', '316423a21', '2e52b0c6a', '8bdf6bc7e', '8f523faf2', '4758340d5', '8411096ec', '9678b95b7', 'a185e35cc', 'fa980a778', 'c8d90f7d7', '080540c81', '32591c8b4', '5779da33c', 'bb425b41e', '01599af81', '1654ab770', 'd334a588e', 'b4353599c', '51b53eaec', '2cc0fbc52', '45ffef194', 'c15ac04ee', '5b055c8ea', 'd0466eb58', 'a80633823', 'a117a5409', '7ddac276f', '8c32df8b3', 'e5649663e', '6c16efbb8', '9118fd5ca', 'ca8d565f1', '16a5bb8d2', 'fd6347461', 'f5179fb9c', '97428b646', 'f684b0a96', 'e4b2caa9f', '2c2d9f267', '96eb14eaf', 'cb2cb460c', '86f843927', 'ecd16fc60', '801c6dc8e', 'f859a25b8', 'ae846f332', '2252c7403', 'fb9e07326', 'd196ca1fd', 'a8e562e8e', 'eb6bb7ce1', '5beff147e', '52b347cdc', '4600aadcf', '6fa0b9dab', '43d70cc4d', '408021ef8', 'e29d22b59']

%%time
def duplicate_columns(frame):
    """Return the labels of columns that are repeated by a later column.

    Columns are only compared with other columns of the same dtype. A column
    is reported when some column further right in its dtype group holds the
    same values according to ``np.array_equal``. The right-most member of a
    set of identical columns is therefore never reported, so dropping the
    returned labels keeps exactly one copy of each.

    Each column is hashed once and only columns whose hashes collide are
    compared element-wise, instead of comparing every pair of columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to scan; it is not modified.

    Returns
    -------
    list
        Column labels, ordered by dtype group and then by column position.
    """
    groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []

    for labels in groups.values():
        block = frame[labels]
        names = block.columns

        # hash of the column's values -> arrays of the columns (further right)
        # that produced that hash
        buckets = {}
        group_dups = []

        # Walk right-to-left so that, when a column is visited, every column
        # to its right has already been filed in a bucket.
        for pos in range(len(names) - 1, -1, -1):
            values = block.iloc[:, pos].values
            try:
                key = hash(tuple(values.tolist()))
            except TypeError:
                # Unhashable cells (e.g. lists): share one bucket and rely on
                # the element-wise comparison below.
                key = None

            later = buckets.setdefault(key, [])
            # The hash is only a pre-filter; np.array_equal has the final say,
            # which keeps the original semantics (NaN never equals NaN).
            if any(np.array_equal(values, other) for other in later):
                group_dups.append(names[pos])
            later.append(values)

        # Restore left-to-right order within the dtype group.
        dups.extend(reversed(group_dups))

    return dups

colsToRemove = duplicate_columns(train_df)
print(colsToRemove)

['34ceb0081', '8d57e2749', '168b3e5bc', 'a765da8bc', 'acc5b709d']
CPU times: user 8min 50s, sys: 232 ms, total: 8min 51s
Wall time: 8min 50s

# Drop the duplicate columns from both frames so train and test keep the
# same feature set.
for frame_with_dups in (train_df, test_df):
    frame_with_dups.drop(columns=colsToRemove, inplace=True)

n_duplicates = len(colsToRemove)
print("Removed `{}` Duplicate Columns\n".format(n_duplicates))
print(colsToRemove)

Removed `5` Duplicate Columns

['34ceb0081', '8d57e2749', '168b3e5bc', 'a765da8bc', 'acc5b709d']

def drop_sparse(train, test):
    """Drop, in place, features with fewer than two distinct values in ``train``.

    Every column of ``train`` other than 'ID' and 'target' is inspected; those
    for which ``np.unique`` finds fewer than two values are removed from both
    ``train`` and ``test``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame; modified in place.
    test : pandas.DataFrame
        Test frame; modified in place. It must contain every column that gets
        dropped from ``train``, otherwise ``DataFrame.drop`` raises ``KeyError``.

    Returns
    -------
    tuple
        The same ``(train, test)`` objects that were passed in.
    """
    sparse_cols = [
        col
        for col in train.columns
        if col not in ('ID', 'target') and len(np.unique(train[col])) < 2
    ]

    # One batched drop per frame instead of one in-place drop per column.
    train.drop(sparse_cols, axis=1, inplace=True)
    test.drop(sparse_cols, axis=1, inplace=True)
    return train, test

%%time
train_df, test_df = drop_sparse(train_df, test_df)

CPU times: user 712 ms, sys: 0 ns, total: 712 ms
Wall time: 712 ms

# Force a garbage-collection pass after the column drops above, then report
# the resulting (rows, columns) shape of each frame.
gc.collect()
print("Train set size: {}".format(train_df.shape))
print("Test set size: {}".format(test_df.shape))

Train set size: (4459, 4732)
Test set size: (49342, 4731)

# Feature matrix: every remaining column except the row identifier and the label.
X_train = train_df.drop(columns=["ID", "target"])
# The models are fitted on log1p(target); predictions are mapped back with expm1.
target_values = train_df["target"].values
y_train = np.log1p(target_values)

X_test = test_df.drop(columns=["ID"])

# Hold out 20% of the training rows as a validation set (fixed seed).
dev_X, val_X, dev_y, val_y = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

def run_lgb(train_X, train_y, val_X, val_y, test_X):
    """Train a LightGBM regressor with early stopping and predict ``test_X``.

    The model is trained for at most 5000 rounds and stops once the RMSE on
    the validation set has not improved for 100 rounds. Predictions for
    ``test_X`` are taken at the best iteration and passed through
    ``np.expm1`` (the caller in this file supplies ``log1p``-transformed
    labels).

    Parameters
    ----------
    train_X, train_y
        Training features and labels.
    val_X, val_y
        Validation features and labels used for early stopping.
    test_X
        Features to predict.

    Returns
    -------
    tuple
        ``(pred_test_y, model, evals_result)``: the ``expm1``-transformed
        predictions, the trained booster, and the per-round metric history
        recorded for both evaluation sets.
    """
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.004,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        # LightGBM's parameter is "bagging_freq" (alias "subsample_freq").
        # The previous key "bagging_frequency" is not recognised, which left
        # bagging disabled and made "bagging_fraction" a no-op.
        "bagging_freq" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    # With several valid_sets, the training set itself is ignored for early
    # stopping, so stopping is driven by lgval.
    model = lgb.train(params, lgtrain, 5000,
                      valid_sets=[lgtrain, lgval],
                      early_stopping_rounds=100,
                      verbose_eval=150,
                      evals_result=evals_result)

    # Undo the log1p transform applied to the labels.
    pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
    return pred_test_y, model, evals_result

# Training LGB
pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, X_test)
print("LightGBM Training Completed...")

Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.50793	valid_1's rmse: 1.53931
[300]	training's rmse: 1.34402	valid_1's rmse: 1.46602
[450]	training's rmse: 1.23291	valid_1's rmse: 1.43402
[600]	training's rmse: 1.14927	valid_1's rmse: 1.41918
[750]	training's rmse: 1.08343	valid_1's rmse: 1.41347
[900]	training's rmse: 1.03011	valid_1's rmse: 1.41164
[1050]	training's rmse: 0.985188	valid_1's rmse: 1.4116
Early stopping, best iteration is:
[976]	training's rmse: 1.00631	valid_1's rmse: 1.41125
LightGBM Training Completed...

# feature importance
# Rank features by their share of total gain (in percent); the raw split
# counts are kept alongside for reference.
print("Features Importance...")
gain = model.feature_importance(importance_type='gain')
split_counts = model.feature_importance(importance_type='split')
gain_share = 100 * gain / gain.sum()

featureimp = pd.DataFrame({
    'feature': model.feature_name(),
    'split': split_counts,
    'gain': gain_share,
})
featureimp = featureimp.sort_values(by='gain', ascending=False)
print(featureimp.head(50))

Features Importance...
        feature  split      gain
4130  f190486d6    803  9.017404
2375  58e2e02e6    715  5.294145
3465  eeb9cd3aa    675  4.294044
4020  15ace8c9f    541  3.095043
2614  9fd594eec    379  2.850291
8     20aa07010    409  2.112768
834   6eef030c1    342  1.400379
3571  58232a6fb    380  1.400253
1457  b43a7cfd5    399  1.187425
3661  491b9ee45    297  1.048269
2687  fb0f5dbfe    410  0.997311
1482  024c577b9    258  0.931900
3867  2288333b4    195  0.860454
2079  58e056e12    347  0.857672
4185  f74e8f13d    352  0.849962
4343  1702b5bf0    291  0.836605
4508  c47340d97    313  0.815153
828   6786ea46d    184  0.798005
566   66ace2992    276  0.778013
3722  d6bb78916    313  0.770098
3791  ed8ff54b5    169  0.764647
3220  ced6a7e91    240  0.720281
4028  5c6487af1    180  0.712760
3886  50e4f96cf    152  0.686449
863   fc99f9426    238  0.667129
1378  6cf7866c1    160  0.621800
34    87ffda550    162  0.604544
853   bc70cbc26    139  0.599770
3811  adb64ff71    235  0.597324
213   186b87c05     68  0.579914
3983  45f6d00da    229  0.575398
2616  fb387ea33    122  0.546694
2134  241f0f867    185  0.530336
624   0c9462c08    217  0.505927
4453  190db8488    190  0.490855
3509  13bdd610a    177  0.480766
1067  17b81a716    143  0.444314
537   26fc93eb7    184  0.442125
3779  70feb1494    160  0.441275
3849  73687e512    172  0.436898
1007  1c71183bb    201  0.426290
4341  e176a204a    176  0.418576
2211  1931ccfdd    171  0.412802
1044  edc84139a    201  0.412291
1748  5f341a818    147  0.394860
3150  f1e0ada11    168  0.381402
4316  c5a231d81    132  0.369029
1548  26ab20ff9    137  0.368083
4290  9280f3d04    190  0.363643
1848  5a1589f1a    196  0.360180

def run_xgb(train_X, train_y, val_X, val_y, test_X):
    """Train an XGBoost regressor with early stopping and predict ``test_X``.

    Training runs for at most 2000 rounds and stops early when the RMSE of
    the last watchlist entry ('valid') has not improved for 100 rounds.
    Predictions for ``test_X`` are limited to the best number of trees and
    passed through ``np.expm1``.

    Returns
    -------
    tuple
        ``(xgb_pred_y, model_xgb)``: the ``expm1``-transformed predictions and
        the trained booster.
    """
    # NOTE(review): in the run recorded in this notebook, valid-rmse was still
    # falling at the final round (1999), so eta=0.001 with a 2000-round budget
    # may leave this model under-trained — confirm before relying on it.
    params = dict([
        ('objective', 'reg:linear'),
        ('eval_metric', 'rmse'),
        ('eta', 0.001),
        ('max_depth', 10),
        ('subsample', 0.6),
        ('colsample_bytree', 0.6),
        ('alpha', 0.001),
        ('random_state', 42),
        ('silent', True),
    ])

    dtrain = xgb.DMatrix(train_X, label=train_y)
    dvalid = xgb.DMatrix(val_X, label=val_y)

    # Early stopping monitors the last entry of the evaluation list.
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    model_xgb = xgb.train(
        params,
        dtrain,
        num_boost_round=2000,
        evals=watchlist,
        maximize=False,
        early_stopping_rounds=100,
        verbose_eval=100,
    )

    dtest = xgb.DMatrix(test_X)
    raw_pred = model_xgb.predict(dtest, ntree_limit=model_xgb.best_ntree_limit)
    # Undo the log1p transform applied to the labels.
    xgb_pred_y = np.expm1(raw_pred)

    return xgb_pred_y, model_xgb

# Training XGB
pred_test_xgb, model_xgb = run_xgb(dev_X, dev_y, val_X, val_y, X_test)
print("XGB Training Completed...")

[0]	train-rmse:14.0877	valid-rmse:14.0768
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:12.7683	valid-rmse:12.756
[200]	train-rmse:11.5765	valid-rmse:11.5629
[300]	train-rmse:10.4999	valid-rmse:10.4854
[400]	train-rmse:9.52802	valid-rmse:9.51304
[500]	train-rmse:8.65065	valid-rmse:8.6361
[600]	train-rmse:7.85865	valid-rmse:7.84473
[700]	train-rmse:7.14388	valid-rmse:7.13201
[800]	train-rmse:6.49911	valid-rmse:6.48993
[900]	train-rmse:5.91691	valid-rmse:5.91106
[1000]	train-rmse:5.3925	valid-rmse:5.39137
[1100]	train-rmse:4.9196	valid-rmse:4.92423
[1200]	train-rmse:4.49343	valid-rmse:4.50457
[1300]	train-rmse:4.10977	valid-rmse:4.12874
[1400]	train-rmse:3.76474	valid-rmse:3.79266
[1500]	train-rmse:3.45493	valid-rmse:3.49292
[1600]	train-rmse:3.17654	valid-rmse:3.22512
[1700]	train-rmse:2.92698	valid-rmse:2.98712
[1800]	train-rmse:2.70352	valid-rmse:2.77675
[1900]	train-rmse:2.50368	valid-rmse:2.59047
[1999]	train-rmse:2.32698	valid-rmse:2.42818
XGB Training Completed...

# CatBoost regressor configuration. The overfitting detector (od_type/od_wait)
# stops training once the eval metric has not improved for 20 iterations.
cb_params = {
    'iterations': 500,
    'learning_rate': 0.05,
    'depth': 10,
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'bagging_temperature': 0.2,
    'od_type': 'Iter',
    'metric_period': 50,
    'od_wait': 20,
}
cb_model = CatBoostRegressor(**cb_params)

# Fit on the development split and keep the iteration that scored best on the
# validation split.
cb_model.fit(
    dev_X,
    dev_y,
    eval_set=(val_X, val_y),
    use_best_model=True,
    verbose=50,
)

Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.

0:	learn: 13.8879486	test: 13.8753251	best: 13.8753251 (0)	total: 12.6s	remaining: 1h 45m 6s
50:	learn: 2.0469297	test: 2.0667457	best: 2.0667457 (50)	total: 10m 51s	remaining: 1h 35m 35s
100:	learn: 1.6177240	test: 1.6331471	best: 1.6331471 (100)	total: 21m 29s	remaining: 1h 24m 54s
150:	learn: 1.5794666	test: 1.6037148	best: 1.6037148 (150)	total: 31m 55s	remaining: 1h 13m 46s
200:	learn: 1.5547877	test: 1.5898497	best: 1.5898497 (200)	total: 42m 33s	remaining: 1h 3m 19s
250:	learn: 1.4868878	test: 1.5562798	best: 1.5562568 (249)	total: 53m 9s	remaining: 52m 43s
300:	learn: 1.4428643	test: 1.5388335	best: 1.5388335 (300)	total: 1h 3m 45s	remaining: 42m 9s
350:	learn: 1.3926213	test: 1.5243028	best: 1.5241373 (348)	total: 1h 14m 26s	remaining: 31m 35s
400:	learn: 1.3574712	test: 1.5164978	best: 1.5164978 (400)	total: 1h 25m 7s	remaining: 21m
450:	learn: 1.3343425	test: 1.5112868	best: 1.5112868 (450)	total: 1h 35m 42s	remaining: 10m 23s
499:	learn: 1.3111233	test: 1.5077663	best: 1.5077360 (498)	total: 1h 46m 9s	remaining: 0us

bestTest = 1.50773601
bestIteration = 498

Shrink model to first 499 iterations.

<catboost.core.CatBoostRegressor at 0x7fd599df12b0>

pred_test_cat = np.expm1(cb_model.predict(X_test))

# The sample submission supplies the ID column; its row order is assumed to
# match the order of the test predictions — TODO confirm.
sub = pd.read_csv('../input/sample_submission.csv')

sub_lgb = pd.DataFrame({"target": pred_test})
sub_xgb = pd.DataFrame({"target": pred_test_xgb})
sub_cat = pd.DataFrame({"target": pred_test_cat})

# Weighted blend of the three models: 50% LightGBM, 30% XGBoost, 20% CatBoost.
weighted_lgb = sub_lgb["target"] * 0.5
weighted_xgb = sub_xgb["target"] * 0.3
weighted_cat = sub_cat["target"] * 0.2
sub["target"] = weighted_lgb + weighted_xgb + weighted_cat

print(sub.head())
sub.to_csv('sub_lgb_xgb_cat.csv', index=False)

          ID        target
0  000137c73  1.295943e+06
1  00021489f  1.282743e+06
2  0004d7953  1.782904e+06
3  00056a333  3.562641e+06
4  00056d8eb  1.354023e+06

Load Required Libraries

Load Train and Test Data

Train Data

Test Data

Train and Test Data Info

Check for Missing Values

Check and Remove Constant Features

Remove Duplicate Columns

Drop Sparse Data

Build Train and Test Data for Modeling

LightGBM

XGB Modeling

Catboost

Combine Predictions

	ID	target	20aa07010	87ffda550	...	1189ee335	8132d18b8	cdfc2b069	c901e7df1
0	000d6aaf2	38000000.0	0.0	1300000.0	...	1100000.0	14800000	1200000.0	4000000
1	000fbd867	600000.0	2200000.0	0.0	...	0.0	0	0.0	0
2	0027d6b71	10000000.0	0.0	0.0	...	0.0	0	0.0	0
3	0028cbf45	2000000.0	0.0	0.0	...	0.0	0	0.0	0
4	002a68644	14400000.0	2000000.0	0.0	...	0.0	0	0.0	0

	ID	d5308d8bc	bd8f989f1	2cb4d123e	87ffda550	...	1189ee335	5831f4c76
0	000137c73	5.528249e+06	0.0	0.0	0.0	...	0.0	0.0
1	00021489f	0.000000e+00	0.0	0.0	0.0	...	0.0	0.0
2	0004d7953	0.000000e+00	0.0	0.0	0.0	...	0.0	5000000.0
3	00056a333	0.000000e+00	20884000.0	36252000.0	20000000.0	...	25010000.0	0.0
4	00056d8eb	0.000000e+00	0.0	0.0	0.0	...	0.0	0.0