A Reference Guide to Feature Engineering Methods¶
Hello friends,
Feature engineering is the heart of any machine learning model. The success of any machine learning model depends on the application of various feature engineering techniques. So, in this kernel, I will discuss various feature engineering techniques that will help us to properly extract, prepare and engineer features from our dataset.
So, let's get started.
This kernel is based on Soledad Galli's course - Feature Engineering for Machine Learning and her article - Feature Engineering for Machine Learning ; A Comprehensive Overview.
She has done a fabulous job in the above course, putting all the major feature engineering techniques together in one place. I have adapted code and instructions from her course and article in this kernel. I would like to congratulate her on her excellent work.
I hope you find this kernel useful and your UPVOTES would be very much appreciated
1. Introduction to Feature Engineering ¶
According to the Wikipedia website:
Feature engineering is the process of using domain knowledge to extract features from raw data via data mining techniques. These features can be used to improve the performance of machine learning algorithms. Feature engineering can be considered as applied machine learning itself
Source : https://en.wikipedia.org/wiki/Feature_engineering
Another important definition of Feature Engineering is as follows:-
Coming up with features is difficult, time-consuming, requires expert knowledge. "Applied machine learning" is basically feature engineering.
— Andrew Ng, Machine Learning and AI via Brain simulations
- So, feature engineering is the process of creating useful features in a machine learning model. We can see that the success of any machine-learning model depends on the application of various feature engineering techniques.
2. Overview of Feature Engineering techniques ¶
Feature engineering is a very broad term that consists of different techniques to process data. These techniques help us to process our raw data into processed data ready to be fed into a machine learning algorithm. These techniques include filling missing values, encoding categorical variables, transforming variables, creating new variables from existing ones, and others.
In this section, I will list the main feature engineering techniques to process the data. In the following sections, I will describe each technique and its applications.
The feature engineering techniques that we will discuss in this kernel are as follows:-
- Missing data imputation
- Categorical encoding
- Variable transformation
- Discretization
- Outlier engineering
- Date and time engineering
3. Missing data imputation ¶
Missing data, or Missing values, occur when no data / no value is stored for a certain observation within a variable.
Missing data are a common occurrence and can have a significant effect on the conclusions that can be drawn from the data. Incomplete data is an unavoidable problem in dealing with most data sources.
Imputation is the act of replacing missing data with statistical estimates of the missing values. The goal of any imputation technique is to produce a complete dataset that can be used to train machine learning models.
There are multiple techniques for missing data imputation. These are as follows:-
Complete case analysis
Mean / Median / Mode imputation
Random Sample Imputation
Replacement by Arbitrary Value
End of Distribution Imputation
Missing Value Indicator
Multivariate imputation
Missing Data Mechanisms¶
- There are 3 mechanisms that lead to missing data, 2 of them involve missing data randomly or almost-randomly, and the third one involves a systematic loss of data.
Missing Completely at Random, MCAR¶
A variable is missing completely at random (MCAR) if the probability of being missing is the same for all the observations. When data is MCAR, there is absolutely no relationship between the data missing and any other values, observed or missing, within the dataset. In other words, those missing data points are a random subset of the data. There is nothing systematic going on that makes some data more likely to be missing than others.
If values for observations are missing completely at random, then disregarding those cases would not bias the inferences made.
Missing at Random, MAR¶
MAR occurs when there is a systematic relationship between the propensity of missing values and the observed data. In other words, the probability of an observation being missing depends only on available information (other variables in the dataset). For example, if men are more likely to disclose their weight than women, weight is MAR. The weight information will be missing at random for those men and women who decided not to disclose their weight, but as men are more prone to disclose it, there will be more missing values for women than for men.
In a situation like the above, if we decide to proceed with the variable with missing values (in this case weight), we might benefit from including gender to control the bias in weight for the missing observations.
Missing Not at Random, MNAR¶
- Values are missing not at random (MNAR) if their being missing depends on information not recorded in the dataset. In other words, there is a mechanism or a reason why missing values are introduced into the dataset.
3.1 Complete Case Analysis (CCA) ¶
Complete case analysis implies analysing only those observations in the dataset that contain values in all the variables. In other words, in complete case analysis we remove all observations with missing values. This procedure is suitable when there are few observations with missing data in the dataset.
So complete-case analysis (CCA), also called list-wise deletion of cases, consists in simply discarding observations where values in any of the variables are missing. Complete Case Analysis means literally analysing only those observations for which there is information in all of the variables (Xs).
But, if the dataset contains missing data across multiple variables, or some variables contain a high proportion of missing observations, we can easily remove a big chunk of the dataset, and this is undesirable.
CCA can be applied to both categorical and numerical variables.
In practice, CCA may be an acceptable method when the amount of missing information is small. In many real life datasets, the amount of missing data is never small, and therefore CCA is typically never an option.
CCA on Titanic dataset¶
- Now, I will demonstrate the application of CCA on titanic dataset.
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualization
import seaborn as sns # for statistical data visualization
import pylab
import scipy.stats as stats
import datetime
%matplotlib inline
pd.set_option('display.max_columns', None)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# Any results you write to the current directory are saved as output.
/kaggle/input/titanic/gender_submission.csv /kaggle/input/titanic/test.csv /kaggle/input/titanic/train.csv /kaggle/input/lending-club-loan-data/LCDataDictionary.xlsx /kaggle/input/lending-club-loan-data/loan.csv /kaggle/input/lending-club-loan-data/database.sqlite /kaggle/input/mercedesbenz-greener-manufacturing/test.csv /kaggle/input/mercedesbenz-greener-manufacturing/train.csv
# ignore warnings
import warnings
warnings.filterwarnings('ignore')
# load the dataset
titanic = pd.read_csv('/kaggle/input/titanic/train.csv')
# make a copy of titanic dataset
data1 = titanic.copy()
# check the percentage of missing values per variable
data1.isnull().mean()
PassengerId 0.000000 Survived 0.000000 Pclass 0.000000 Name 0.000000 Sex 0.000000 Age 0.198653 SibSp 0.000000 Parch 0.000000 Ticket 0.000000 Fare 0.000000 Cabin 0.771044 Embarked 0.002245 dtype: float64
- Now, if we chose to remove all the missing observations, we would end up with a very small dataset, given that Cabin is missing for 77% of the observations.
# check how many observations we would drop
print('total passengers with values in all variables: ', data1.dropna().shape[0])
print('total passengers in the Titanic: ', data1.shape[0])
print('percentage of data without missing values: ', data1.dropna().shape[0] / float(data1.shape[0]))
total passengers with values in all variables: 183 total passengers in the Titanic: 891 percentage of data without missing values: 0.2053872053872054
- So, we have complete information for only 20% of our observations in the Titanic dataset. Thus, CCA would not be an option for this dataset.
- So, in datasets with many variables that contain missing data, CCA will typically not be an option, as it will produce a drastically reduced dataset of complete observations. However, if only a subset of the variables from the dataset will be used, we could evaluate variable by variable whether we choose to discard the observations with NA, or to replace them with other methods.
3.2 Mean / Median / Mode Imputation ¶
We can replace missing values with the mean, median or mode of the variable. Mean / median / mode imputation is widely adopted in organisations and data competitions. Although in practice this technique is used in almost every situation, the procedure is suitable if data is missing at random and in small proportions. If there are a lot of missing observations, however, we will distort the distribution of the variable, as well as its relationship with other variables in the dataset. Distortion in the variable distribution may affect the performance of linear models.
Mean/median imputation consists of replacing all occurrences of missing values (NA) within a variable by the mean (if the variable has a Gaussian distribution) or median (if the variable has a skewed distribution).
For categorical variables, replacement by the mode is also known as replacement by the most frequent category.
Mean/median imputation has the assumption that the data are missing completely at random (MCAR). If this is the case, we can think of replacing the NA with the most frequent occurrence of the variable, which is the mean if the variable has a Gaussian distribution, or the median otherwise.
The rationale is to replace the population of missing values with the most frequent value, since this is the most likely occurrence.
When replacing NA with the mean or median, the variance of the variable will be distorted if the number of NA is large with respect to the total number of observations (since the imputed values do not differ from the mean/median or from each other), therefore leading to an underestimation of the variance.
In addition, estimates of covariance and correlations with other variables in the dataset may also be affected. This is because we may be destroying intrinsic correlations since the mean/median that now replace NA will not preserve the relation with the remaining variables.
Mean / Median / Mode Imputation on Titanic dataset¶
# make a copy of titanic dataset
data2 = titanic.copy()
# check the percentage of NA values in dataset
data2.isnull().mean()
PassengerId 0.000000 Survived 0.000000 Pclass 0.000000 Name 0.000000 Sex 0.000000 Age 0.198653 SibSp 0.000000 Parch 0.000000 Ticket 0.000000 Fare 0.000000 Cabin 0.771044 Embarked 0.002245 dtype: float64
Important Note¶
Imputation should be done over the training set, and then propagated to the test set. This means that the mean/median used to fill missing values in both the train and test sets should be extracted from the train set only. This is to avoid overfitting.
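As an aside (not part of the original kernel), the same train-then-propagate pattern can be expressed with scikit-learn's SimpleImputer, which learns the statistic from the training data and reuses it on the test data. A minimal sketch, assuming the Titanic dataframe loaded above; the variable names are my own:
# minimal sketch (illustrative): learn the median of Age on the training split only
# and reuse that same median on the test split
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
tmp = titanic.copy()
tmp_train, tmp_test = train_test_split(tmp, test_size=0.3, random_state=0)
imputer = SimpleImputer(strategy='median')
imputer.fit(tmp_train[['Age']])  # median computed on the training data only
tmp_train['Age'] = imputer.transform(tmp_train[['Age']]).ravel()
tmp_test['Age'] = imputer.transform(tmp_test[['Age']]).ravel()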
In the Titanic dataset, we can see that Age contains 19.8653%, Cabin contains 77.10% and Embarked contains 0.22% of missing values.
Imputation of the Age variable¶
Age is a continuous variable. First, we will check the distribution of the Age variable.
# plot the distribution of age to find out if they are Gaussian or skewed.
plt.figure(figsize=(12,8))
fig = data2.Age.hist(bins=10)
fig.set_ylabel('Number of passengers')
fig.set_xlabel('Age')
Text(0.5, 0, 'Age')
- We can see that the Age distribution is skewed. So, we will use median imputation.
# separate dataset into training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data2, data2.Survived, test_size=0.3,
random_state=0)
X_train.shape, X_test.shape
((623, 12), (268, 12))
# calculate median of Age
median = X_train.Age.median()
median
29.0
# impute missing values in age in train and test set
for df in [X_train, X_test]:
df['Age'].fillna(median, inplace=True)
Check for missing values in the Age variable¶
X_train['Age'].isnull().sum()
0
X_test['Age'].isnull().sum()
0
- We can see that there are no missing values in the Age variable in the train and test set.
We can follow the same approach and fill the missing values in Cabin and Embarked with the most frequent value (the mode), as sketched below. Mean/median/mode imputation is the most common method to impute missing values.
3.3 Random Sample imputation ¶
Random sample imputation refers to randomly selecting values from the variable to replace the missing data. This technique preserves the variable distribution, and is well suited for data missing at random. But, we need to account for randomness by adequately setting a seed. Otherwise, the same missing observation could be replaced by different values in different code runs, and therefore lead to different model predictions. This is not desirable when using our models within an organisation.
Replacing NA by random sampling works for categorical variables in exactly the same way as for numerical variables.
Random sampling consists of taking a random observation from the pool of available observations of the variable, that is, from the pool of available categories, and using that randomly extracted value to fill the NA. In random sampling, one takes as many random observations as there are missing values in the variable.
By random sampling observations of the present categories, we guarantee that the frequency of the different categories/labels within the variable is preserved.
Assumptions¶
- Random sample imputation has the assumption that the data are missing completely at random (MCAR). If this is the case, it makes sense to substitute the missing values, by values extracted from the original variable distribution/ category frequency.
Random Sample imputation on Titanic dataset¶
# make a copy of titanic dataset
data3 = titanic.copy()
# check the percentage of NA values
data3.isnull().mean()
PassengerId 0.000000 Survived 0.000000 Pclass 0.000000 Name 0.000000 Sex 0.000000 Age 0.198653 SibSp 0.000000 Parch 0.000000 Ticket 0.000000 Fare 0.000000 Cabin 0.771044 Embarked 0.002245 dtype: float64
Important Note¶
Imputation should be done over the training set, and then propagated to the test set. This means that the random sample to be used to fill missing values both in train and test set, should be extracted from the train set.
# separate dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(data3, data3.Survived, test_size=0.3,
random_state=0)
X_train.shape, X_test.shape
((623, 12), (268, 12))
# write a function to create several imputed versions of Age (median, zero and random-sample imputed):
def impute_na(df, variable, median):
df[variable+'_median'] = df[variable].fillna(median)
df[variable+'_zero'] = df[variable].fillna(0)
# random sampling
df[variable+'_random'] = df[variable]
# extract the random sample to fill the na
random_sample = X_train[variable].dropna().sample(df[variable].isnull().sum(), random_state=0)
# pandas needs to have the same index in order to merge datasets
random_sample.index = df[df[variable].isnull()].index
df.loc[df[variable].isnull(), variable+'_random'] = random_sample
# fill with random-sample
df[variable+'_random_sample'] = df[variable].fillna(random_sample)
impute_na(X_train, 'Age', median)
impute_na(X_test, 'Age', median)
3.4 Replacement by Arbitrary Value ¶
Replacement by an arbitrary value, as its name indicates, refers to replacing missing data by any arbitrarily determined value, but the same value for all missing data. Replacement by an arbitrary value is suitable if data is not missing at random, or if there is a huge proportion of missing values. If all values are positive, a typical replacement is -1. Alternatively, replacing by 999 or -999 is common practice. We need to anticipate that these arbitrary values are not a common occurrence in the variable. Replacement by arbitrary values, however, may not be suited for linear models, as it most likely will distort the distribution of the variables, and therefore model assumptions may not be met.
For categorical variables, this is the equivalent of replacing missing observations with the label “Missing” which is a widely adopted procedure.
Replacing the NA by arbitrary values should be used when there are reasons to believe that the NA are not missing at random. In situations like this, we would not like to replace with the median or the mean, and therefore make the NA look like the majority of our observations.
Instead, we want to flag them. We want to capture the missingness somehow.
Replacement by Arbitrary Value on Titanic dataset¶
# make a copy of titanic dataset
data4 = titanic.copy()
# let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(data4, data4.Survived, test_size=0.3,
random_state=0)
X_train.shape, X_test.shape
((623, 12), (268, 12))
def impute_na(df, variable):
df[variable+'_zero'] = df[variable].fillna(0)
df[variable+'_hundred']= df[variable].fillna(100)
# replace NA with the arbitrary values in the training and test set
impute_na(X_train, 'Age')
impute_na(X_test, 'Age')
The arbitrary value has to be determined for each variable specifically. For example, for this dataset, the choice of replacing NA in age by 0 or 100 is valid, because neither of those values is frequent in the original distribution of the variable, and they lie at the tails of the distribution.
However, if we were to replace NA in fare, those values are not good any more, because we can see that fare can take values of up to 500. So we might want to consider using 500 or 1000 to replace NA instead of 100.
We can see that this is totally arbitrary. But, it is used in the industry. Typical values chosen by companies are -9999 or 9999, or similar.
3.5 End of Distribution Imputation ¶
End of tail imputation involves replacing missing values by a value at the far end of the tail of the variable distribution. This technique is similar in essence to imputing by an arbitrary value. However, by placing the value at the end of the distribution, we need not look at each variable distribution individually, as the algorithm does it automatically for us. This imputation technique tends to work well with tree-based algorithms, but it may affect the performance of linear models, as it distorts the variable distribution.
On occasions, one has reasons to suspect that missing values are not missing at random. And if the value is missing, there has to be a reason for it. Therefore, we would like to capture this information.
Adding an additional variable indicating missingness may help with this task. However, the values are still missing in the original variable, and they need to be replaced if we plan to use the variable in machine learning.
So, we will replace the NA, by values that are at the far end of the distribution of the variable.
The rationale is that if the value is missing, it has to be for a reason, therefore, we would not like to replace missing values for the mean and make that observation look like the majority of our observations. Instead, we want to flag that observation as different, and therefore we assign a value that is at the tail of the distribution, where observations are rarely represented in the population.
End of Distribution Imputation on Titanic dataset¶
# make a copy of titanic dataset
data5 = titanic.copy()
# let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(data5, data5.Survived, test_size=0.3,
random_state=0)
X_train.shape, X_test.shape
((623, 12), (268, 12))
plt.figure(figsize=(12,8))
X_train.Age.hist(bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x7f2d0b026b38>
# at far end of the distribution
X_train.Age.mean()+3*X_train.Age.std()
73.43632005918366
# we can see that there are a few outliers for Age
# according to its distribution, these outliers will be masked when we replace NA by values at the far end
plt.figure(figsize=(12,8))
sns.boxplot('Age', data=data5)
<matplotlib.axes._subplots.AxesSubplot at 0x7f2d0af709e8>
def impute_na(df, variable, median, extreme):
df[variable+'_far_end'] = df[variable].fillna(extreme)
df[variable].fillna(median, inplace=True)
# replace the NA in Age with the median, and create Age_far_end filled with the far-end value, in the training and testing sets
impute_na(X_train, 'Age', X_train.Age.median(), X_train.Age.mean()+3*X_train.Age.std())
impute_na(X_test, 'Age', X_train.Age.median(), X_train.Age.mean()+3*X_train.Age.std())
3.6 Missing Value Indicator ¶
The missing indicator technique involves adding a binary variable to indicate whether the value is missing for a certain observation. This variable takes the value 1 if the observation is missing, or 0 otherwise. One thing to notice is that we still need to replace the missing values in the original variable, which we tend to do with mean or median imputation. By using these 2 techniques together, if the missing value has predictive power, it will be captured by the missing indicator, and if it doesn’t it will be masked by the mean / median imputation.
These 2 techniques in combination tend to work well with linear models. But, adding a missing indicator expands the feature space and, as multiple variables tend to have missing values for the same observations, many of these newly created binary variables could be identical or highly correlated.
Missing Value Indicator on Titanic dataset¶
# make a copy of titanic dataset
data6 = titanic.copy()
# let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(data6, data6.Survived, test_size=0.3,
random_state=0)
X_train.shape, X_test.shape
((623, 12), (268, 12))
# create variable indicating missingness
X_train['Age_NA'] = np.where(X_train['Age'].isnull(), 1, 0)
X_test['Age_NA'] = np.where(X_test['Age'].isnull(), 1, 0)
X_train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_NA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
857 | 858 | 1 | 1 | Daly, Mr. Peter Denis | male | 51.0 | 0 | 0 | 113055 | 26.5500 | E17 | S | 0 |
52 | 53 | 1 | 1 | Harper, Mrs. Henry Sleeper (Myna Haxtun) | female | 49.0 | 1 | 0 | PC 17572 | 76.7292 | D33 | C | 0 |
386 | 387 | 0 | 3 | Goodwin, Master. Sidney Leonard | male | 1.0 | 5 | 2 | CA 2144 | 46.9000 | NaN | S | 0 |
124 | 125 | 0 | 1 | White, Mr. Percival Wayland | male | 54.0 | 0 | 1 | 35281 | 77.2875 | D26 | S | 0 |
578 | 579 | 0 | 3 | Caram, Mrs. Joseph (Maria Elias) | female | NaN | 1 | 0 | 2689 | 14.4583 | NaN | C | 1 |
# we can see that mean and median are similar. So I will replace with the median
X_train.Age.mean(), X_train.Age.median()
(29.915338645418327, 29.0)
# let's replace the NA with the median value in the training set
X_train['Age'].fillna(X_train.Age.median(), inplace=True)
X_test['Age'].fillna(X_train.Age.median(), inplace=True)
X_train.head(10)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age_NA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
857 | 858 | 1 | 1 | Daly, Mr. Peter Denis | male | 51.0 | 0 | 0 | 113055 | 26.5500 | E17 | S | 0 |
52 | 53 | 1 | 1 | Harper, Mrs. Henry Sleeper (Myna Haxtun) | female | 49.0 | 1 | 0 | PC 17572 | 76.7292 | D33 | C | 0 |
386 | 387 | 0 | 3 | Goodwin, Master. Sidney Leonard | male | 1.0 | 5 | 2 | CA 2144 | 46.9000 | NaN | S | 0 |
124 | 125 | 0 | 1 | White, Mr. Percival Wayland | male | 54.0 | 0 | 1 | 35281 | 77.2875 | D26 | S | 0 |
578 | 579 | 0 | 3 | Caram, Mrs. Joseph (Maria Elias) | female | 29.0 | 1 | 0 | 2689 | 14.4583 | NaN | C | 1 |
549 | 550 | 1 | 2 | Davies, Master. John Morgan Jr | male | 8.0 | 1 | 1 | C.A. 33112 | 36.7500 | NaN | S | 0 |
118 | 119 | 0 | 1 | Baxter, Mr. Quigg Edmond | male | 24.0 | 0 | 1 | PC 17558 | 247.5208 | B58 B60 | C | 0 |
12 | 13 | 0 | 3 | Saundercock, Mr. William Henry | male | 20.0 | 0 | 0 | A/5. 2151 | 8.0500 | NaN | S | 0 |
157 | 158 | 0 | 3 | Corn, Mr. Harry | male | 30.0 | 0 | 0 | SOTON/OQ 392090 | 8.0500 | NaN | S | 0 |
127 | 128 | 1 | 3 | Madsen, Mr. Fridtjof Arne | male | 24.0 | 0 | 0 | C 17369 | 7.1417 | NaN | S | 0 |
- We can see that another variable
Age_NA
is created to capture the missingness.
Conclusion - When to use each imputation method¶
If missing values are less than 5% of the variable, go for mean/median imputation or random sample replacement (and, for categorical variables, imputation by the most frequent category). If missing values are more than 5% of the variable, do mean/median imputation plus an additional binary variable to capture the missingness, or add a 'Missing' label in categorical variables.
If the number of NA in a variable is small, they are unlikely to have a strong impact on the variable / target that you are trying to predict. Therefore, treating them specially will most certainly add noise to the variables. It is more useful to replace them by the mean or a random sample to preserve the variable distribution.
If the variable / target you are trying to predict is however highly unbalanced, then it might be the case that this small number of NA are indeed informative.
Exceptions¶
- If we suspect that NAs are not missing at random and do not want to attribute the most common occurrence to NA, and if we don't want to increase the feature space by adding an additional variable to indicate missingness - in these cases, replace by a value at the far end of the distribution or an arbitrary value.
4. Categorical Encoding ¶
Categorical data is data that takes only a limited number of values.
For example, if people responded to a survey about what brand of car they owned, the result would be categorical (because the answers would be things like Honda, Toyota, Ford, None, etc.). Responses fall into a fixed set of categories.
You will get an error if you try to plug these variables into most machine learning models in Python without "encoding" them first. Here we'll show the most popular methods for encoding categorical variables.
Categorical variable encoding is a broad term for the collection of techniques used to transform the strings or labels of categorical variables into numbers. There are multiple techniques under this method:
One-Hot encoding (OHE)
Ordinal encoding
Count and Frequency encoding
Target encoding / Mean encoding
Weight of Evidence
Rare label encoding
4.1 One-Hot Encoding (OHE) ¶
OHE is the standard approach to encode categorical data.
One hot encoding (OHE) creates a binary variable for each one of the different categories present in a variable. These binary variables take 1 if the observation shows a certain category or 0 otherwise. OHE is suitable for linear models. But, OHE expands the feature space quite dramatically if the categorical variables are highly cardinal, or if there are many categorical variables. In addition, many of the derived dummy variables could be highly correlated.
OHE consists of replacing the categorical variable by different boolean variables, which take the value 0 or 1 to indicate whether or not a certain category / label of the variable was present for that observation. Each one of the boolean variables is also known as a dummy variable or binary variable.
For example, from the categorical variable "Gender", with labels 'female' and 'male', we can generate the boolean variable "female", which takes 1 if the person is female or 0 otherwise. We can also generate the variable male, which takes 1 if the person is "male" and 0 otherwise.
# make a copy of titanic dataset
data7 = titanic.copy()
data7['Sex'].head()
0 male 1 female 2 female 3 female 4 male Name: Sex, dtype: object
# one hot encoding
pd.get_dummies(data7['Sex']).head()
female | male | |
---|---|---|
0 | 0 | 1 |
1 | 1 | 0 |
2 | 1 | 0 |
3 | 1 | 0 |
4 | 0 | 1 |
# for better visualisation
pd.concat([data7['Sex'], pd.get_dummies(data7['Sex'])], axis=1).head()
Sex | female | male | |
---|---|---|---|
0 | male | 0 | 1 |
1 | female | 1 | 0 |
2 | female | 1 | 0 |
3 | female | 1 | 0 |
4 | male | 0 | 1 |
We can see that we only need 1 of the 2 dummy variables to represent the original categorical variable Sex. Either of the 2 will do the job, and it doesn't matter which one we select, since they are equivalent. Therefore, to encode a categorical variable with 2 labels, we need only 1 dummy variable.
To extend this concept, to encode a categorical variable with k labels, we need k-1 dummy variables. We can achieve this task as follows:
# obtaining k-1 labels
pd.get_dummies(data7['Sex'], drop_first=True).head()
male | |
---|---|
0 | 1 |
1 | 0 |
2 | 0 |
3 | 0 |
4 | 1 |
# Let's now look at an example with more than 2 labels
data7['Embarked'].head()
0 S 1 C 2 S 3 S 4 S Name: Embarked, dtype: object
# check the number of different labels
data7.Embarked.unique()
array(['S', 'C', 'Q', nan], dtype=object)
# get whole set of dummy variables
pd.get_dummies(data7['Embarked']).head()
C | Q | S | |
---|---|---|---|
0 | 0 | 0 | 1 |
1 | 1 | 0 | 0 |
2 | 0 | 0 | 1 |
3 | 0 | 0 | 1 |
4 | 0 | 0 | 1 |
# get k-1 dummy variables
pd.get_dummies(data7['Embarked'], drop_first=True).head()
Q | S | |
---|---|---|
0 | 0 | 1 |
1 | 0 | 0 |
2 | 0 | 1 |
3 | 0 | 1 |
4 | 0 | 1 |
The Scikit-learn API provides a class for one-hot encoding.
Also, I will introduce you to a wide range of encoding options from the Category Encoders package for use with scikit-learn in Python.
Both of the above options can also be used for One-Hot Encoding.
Important Note regarding OHE¶
Scikit-learn's one hot encoder class only takes numerical categorical values. So, any value of string type should be label encoded first before one hot encoded.
In the titanic example, the gender of the passengers has to be label encoded first before being one-hot encoded using Scikit-learn's one hot encoder class.
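A minimal sketch of that two-step approach on the Sex variable (illustrative only; the encoder objects and variable names are my own, not part of the original kernel):
# minimal sketch: label encode Sex first, then one-hot encode it with scikit-learn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# step 1: convert the string labels 'female' / 'male' into integer codes
sex_labels = LabelEncoder().fit_transform(data7['Sex'])
# step 2: one-hot encode the integer codes (OneHotEncoder expects a 2D array)
sex_dummies = OneHotEncoder().fit_transform(sex_labels.reshape(-1, 1)).toarray()
sex_dummies[:5]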
4.2 Ordinal encoding ¶
Categorical variables whose categories can be meaningfully ordered are called ordinal. For example:
- Student's grade in an exam (A, B, C or Fail).
- Days of the week can be ordinal with Monday = 1, and Sunday = 7.
- Educational level, with the categories: Elementary school, High school, College graduate, PhD ranked from 1 to 4.
When the categorical variable is ordinal, the most straightforward approach is to replace the labels by some ordinal number.
In ordinal encoding we replace the categories by digits, either arbitrarily or in an informed manner. If we encode categories arbitrarily, we assign an integer per category from 1 to n, where n is the number of unique categories. If instead, we assign the integers in an informed manner, we observe the target distribution: we order the categories from 1 to n, assigning 1 to the category for which the observations show the highest mean of target value, and n to the category with the lowest target mean value.
- We can use the Category Encoders package to perform ordinal encoding. Please consult its documentation for more information.
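Since there is no code cell for this technique in the kernel, here is a minimal sketch of informed ordinal encoding using plain pandas instead of the Category Encoders package (the choice of the Embarked variable and all names are for illustration only):
# minimal sketch: informed ordinal encoding of Embarked
# categories are ordered by the mean of the target and then mapped to integers,
# assigning 1 to the category with the highest mean target value
df = titanic.copy()
df['Embarked'] = df['Embarked'].fillna('Missing')
ordered = df.groupby('Embarked')['Survived'].mean().sort_values(ascending=False)
embarked_map = {category: rank for rank, category in enumerate(ordered.index, start=1)}
df['Embarked_ordinal'] = df['Embarked'].map(embarked_map)
df[['Embarked', 'Embarked_ordinal']].head()
# in a real workflow the mapping should be learned on the training set only
# and then applied to the test set, as in the other sections of this kernel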
4.3 Count and Frequency Encoding ¶
In count encoding we replace the categories by the count of the observations that show that category in the dataset. Similarly, we can replace the category by the frequency -or percentage- of observations in the dataset. That is, if 10 of our 100 observations show the colour blue, we would replace blue by 10 if doing count encoding, or by 0.1 if replacing by the frequency. These techniques capture the representation of each label in a dataset, but the encoding may not necessarily be predictive of the outcome.
This approach is heavily used in Kaggle competitions, wherein we replace each label of the categorical variable by the count, that is, the number of times each label appears in the dataset, or by the frequency, that is, the percentage of observations within that category. The two methods are equivalent.
#import dataset
df_train = pd.read_csv('/kaggle/input/mercedesbenz-greener-manufacturing/train.csv')
df_test = pd.read_csv('/kaggle/input/mercedesbenz-greener-manufacturing/test.csv')
df_train.head()
(Only the ID, target and categorical columns X0-X8 of df_train.head() are shown; the remaining binary columns X10-X385 are omitted for readability.)
ID | y | X0 | X1 | X2 | X3 | X4 | X5 | X6 | X8 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 130.81 | k | v | at | a | d | u | j | o |
1 | 6 | 88.53 | k | t | av | e | d | y | l | o |
2 | 7 | 76.26 | az | w | n | c | d | x | j | x |
3 | 9 | 80.62 | az | t | n | f | d | x | l | e |
4 | 13 | 78.02 | az | v | n | f | d | h | d | n |
# let's have a look at how many labels
for col in df_train.columns[3:9]:
print(col, ': ', len(df_train[col].unique()), ' labels')
X1 : 27 labels X2 : 44 labels X3 : 7 labels X4 : 4 labels X5 : 29 labels X6 : 12 labels
When doing count transformation of categorical variables, it is important to calculate the count (or frequency = count/total observations) over the training set, and then use those numbers to replace the labels in the test set.
X_train, X_test, y_train, y_test = train_test_split(df_train[['X1', 'X2', 'X3', 'X4', 'X5', 'X6']], df_train.y,
test_size=0.3,
random_state=0)
X_train.shape, X_test.shape
((2946, 6), (1263, 6))
# let's obtain the counts for each one of the labels in variable X2
# let's capture this in a dictionary that we can use to re-map the labels
X_train.X2.value_counts().to_dict()
{'as': 1155, 'ae': 342, 'ai': 289, 'm': 284, 'ak': 188, 'r': 101, 'n': 97, 's': 63, 'e': 61, 'f': 59, 'aq': 46, 'ay': 40, 'a': 34, 't': 17, 'k': 16, 'i': 15, 'ag': 15, 'z': 14, 'd': 12, 'b': 12, 'ac': 10, 'ao': 10, 'g': 10, 'y': 8, 'x': 8, 'at': 5, 'ap': 5, 'h': 4, 'q': 3, 'ah': 3, 'al': 3, 'an': 3, 'au': 3, 'av': 2, 'aw': 2, 'aa': 1, 'l': 1, 'p': 1, 'c': 1, 'af': 1, 'o': 1, 'am': 1}
# lets look at X_train so we can compare then the variable re-coding
X_train.head()
X1 | X2 | X3 | X4 | X5 | X6 | |
---|---|---|---|---|---|---|
3059 | aa | ai | c | d | q | g |
3014 | b | m | c | d | q | i |
3368 | o | f | f | d | s | l |
2772 | aa | as | d | d | p | j |
3383 | v | e | c | d | s | g |
# now let's replace each label in X2 by its count
# first we make a dictionary that maps each label to the counts
X_frequency_map = X_train.X2.value_counts().to_dict()
# and now we replace X2 labels both in train and test set with the same map
X_train.X2 = X_train.X2.map(X_frequency_map)
X_test.X2 = X_test.X2.map(X_frequency_map)
X_train.head()
X1 | X2 | X3 | X4 | X5 | X6 | |
---|---|---|---|---|---|---|
3059 | aa | 289 | c | d | q | g |
3014 | b | 284 | c | d | q | i |
3368 | o | 59 | f | d | s | l |
2772 | aa | 1155 | d | d | p | j |
3383 | v | 61 | c | d | s | g |
In the original dataset, the first observation of the variable X2 was 'ai'; it has now been replaced by its count, 289. The same applies to the rest of the categories.
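The frequency variant mentioned above works the same way; here is a minimal sketch (not in the original kernel) that reuses the same train/test split, encoding X1 purely for illustration:
# minimal sketch: frequency encoding of X1
# replace each label by the fraction of training observations showing that label
X1_frequency_map = (X_train.X1.value_counts() / len(X_train)).to_dict()
X_train.X1 = X_train.X1.map(X1_frequency_map)
X_test.X1 = X_test.X1.map(X1_frequency_map)
X_train.head()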
4.4 Target / Mean Encoding ¶
In target encoding, also called mean encoding, we replace each category of a variable, by the mean value of the target for the observations that show a certain category. For example, we have the categorical variable “city”, and we want to predict if the customer will buy a TV provided we send a letter. If 30 percent of the people in the city “London” buy the TV, we would replace London by 0.3.
This technique has 3 advantages:
it does not expand the feature space,
it captures some information regarding the target at the time of encoding the category, and
it creates a monotonic relationship between the variable and the target.
Monotonic relationships between variable and target tend to improve linear model performance.
# let's load again the titanic dataset
data = pd.read_csv('/kaggle/input/titanic/train.csv', usecols=['Cabin', 'Survived'])
data.head()
Survived | Cabin | |
---|---|---|
0 | 0 | NaN |
1 | 1 | C85 |
2 | 1 | NaN |
3 | 1 | C123 |
4 | 0 | NaN |
# let's fill NA values with an additional label
data.Cabin.fillna('Missing', inplace=True)
data.head()
Survived | Cabin | |
---|---|---|
0 | 0 | Missing |
1 | 1 | C85 |
2 | 1 | Missing |
3 | 1 | C123 |
4 | 0 | Missing |
# check number of different labels in Cabin
len(data.Cabin.unique())
148
# Now we extract the first letter of the cabin
data['Cabin'] = data['Cabin'].astype(str).str[0]
data.head()
Survived | Cabin | |
---|---|---|
0 | 0 | M |
1 | 1 | C |
2 | 1 | M |
3 | 1 | C |
4 | 0 | M |
# check the labels
data.Cabin.unique()
array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)
Important¶
- The risk factor should be calculated per label using only the training set, and then propagated to the test set.
# Let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(data[['Cabin', 'Survived']], data.Survived, test_size=0.3,
random_state=0)
X_train.shape, X_test.shape
((623, 2), (268, 2))
# let's calculate the target frequency for each label
X_train.groupby(['Cabin'])['Survived'].mean()
Cabin A 0.428571 B 0.774194 C 0.571429 D 0.692308 E 0.740741 F 0.666667 G 0.500000 M 0.303609 T 0.000000 Name: Survived, dtype: float64
# and now let's do the same but capturing the result in a dictionary
ordered_labels = X_train.groupby(['Cabin'])['Survived'].mean().to_dict()
ordered_labels
{'A': 0.42857142857142855, 'B': 0.7741935483870968, 'C': 0.5714285714285714, 'D': 0.6923076923076923, 'E': 0.7407407407407407, 'F': 0.6666666666666666, 'G': 0.5, 'M': 0.3036093418259023, 'T': 0.0}
# replace the labels with the 'risk' (target frequency)
# note that we calculated the frequencies based on the training set only
X_train['Cabin_ordered'] = X_train.Cabin.map(ordered_labels)
X_test['Cabin_ordered'] = X_test.Cabin.map(ordered_labels)
# view results
X_train.head()
Cabin | Survived | Cabin_ordered | |
---|---|---|---|
857 | E | 1 | 0.740741 |
52 | D | 1 | 0.692308 |
386 | M | 0 | 0.303609 |
124 | D | 0 | 0.692308 |
578 | M | 0 | 0.303609 |
# plot the original variable
fig = plt.figure(figsize=(8,6))
fig = X_train.groupby(['Cabin'])['Survived'].mean().plot()
fig.set_title('Normal relationship between variable and target')
fig.set_ylabel('Survived')
Text(0, 0.5, 'Survived')
# plot the transformed result: the monotonic variable
fig = plt.figure(figsize=(8,6))
fig = X_train.groupby(['Cabin_ordered'])['Survived'].mean().plot()
fig.set_title('Monotonic relationship between variable and target')
fig.set_ylabel('Survived')
Text(0, 0.5, 'Survived')
4.5 Weight of evidence ¶
Weight of evidence (WOE) is a technique used to encode categorical variables for classification. WOE is the natural logarithm of the probability of the target being 1 divided by the probability of the target being 0. WOE has the property that its value will be 0 if the phenomenon is random; it will be greater than 0 if the probability of the target being 1 is higher, and it will be smaller than 0 when the probability of the target being 0 is higher.
WOE transformation creates a nice visual representation of the variable, because by looking at the WOE encoded variable, we can see, category by category, whether it favours the outcome of 0, or of 1. In addition, WOE creates a monotonic relationship between variable and target, and leaves all the variables within the same value range.
# preview X_train
X_train.head()
Cabin | Survived | Cabin_ordered | |
---|---|---|---|
857 | E | 1 | 0.740741 |
52 | D | 1 | 0.692308 |
386 | M | 0 | 0.303609 |
124 | D | 0 | 0.692308 |
578 | M | 0 | 0.303609 |
# now we calculate the probability of target=1
X_train.groupby(['Cabin'])['Survived'].mean()
Cabin A 0.428571 B 0.774194 C 0.571429 D 0.692308 E 0.740741 F 0.666667 G 0.500000 M 0.303609 T 0.000000 Name: Survived, dtype: float64
# let's make a dataframe with the above calculation
prob_df = X_train.groupby(['Cabin'])['Survived'].mean()
prob_df = pd.DataFrame(prob_df)
prob_df
Survived | |
---|---|
Cabin | |
A | 0.428571 |
B | 0.774194 |
C | 0.571429 |
D | 0.692308 |
E | 0.740741 |
F | 0.666667 |
G | 0.500000 |
M | 0.303609 |
T | 0.000000 |
# and now the probability of target = 0
# and we add it to the dataframe
prob_df = X_train.groupby(['Cabin'])['Survived'].mean()
prob_df = pd.DataFrame(prob_df)
prob_df['Died'] = 1-prob_df.Survived
prob_df
Survived | Died | |
---|---|---|
Cabin | ||
A | 0.428571 | 0.571429 |
B | 0.774194 | 0.225806 |
C | 0.571429 | 0.428571 |
D | 0.692308 | 0.307692 |
E | 0.740741 | 0.259259 |
F | 0.666667 | 0.333333 |
G | 0.500000 | 0.500000 |
M | 0.303609 | 0.696391 |
T | 0.000000 | 1.000000 |
# since the log of zero is not defined, let's set this number to something small and non-zero
prob_df.loc[prob_df.Survived == 0, 'Survived'] = 0.00001
prob_df
Survived | Died | |
---|---|---|
Cabin | ||
A | 0.428571 | 0.571429 |
B | 0.774194 | 0.225806 |
C | 0.571429 | 0.428571 |
D | 0.692308 | 0.307692 |
E | 0.740741 | 0.259259 |
F | 0.666667 | 0.333333 |
G | 0.500000 | 0.500000 |
M | 0.303609 | 0.696391 |
T | 0.000010 | 1.000000 |
# now we calculate the WoE
prob_df['WoE'] = np.log(prob_df.Survived/prob_df.Died)
prob_df
Survived | Died | WoE | |
---|---|---|---|
Cabin | |||
A | 0.428571 | 0.571429 | -0.287682 |
B | 0.774194 | 0.225806 | 1.232144 |
C | 0.571429 | 0.428571 | 0.287682 |
D | 0.692308 | 0.307692 | 0.810930 |
E | 0.740741 | 0.259259 | 1.049822 |
F | 0.666667 | 0.333333 | 0.693147 |
G | 0.500000 | 0.500000 | 0.000000 |
M | 0.303609 | 0.696391 | -0.830169 |
T | 0.000010 | 1.000000 | -11.512925 |
# and we create a dictionary to re-map the variable
prob_df['WoE'].to_dict()
{'A': -0.2876820724517809, 'B': 1.232143681292632, 'C': 0.28768207245178085, 'D': 0.8109302162163288, 'E': 1.0498221244986774, 'F': 0.6931471805599452, 'G': 0.0, 'M': -0.8301689781242366, 'T': -11.512925464970229}
# and we make a dictionary to map the orignal variable to the WoE
# same as above but we capture the dictionary in a variable
ordered_labels = prob_df['WoE'].to_dict()
# replace the labels with the WoE
X_train['Cabin_ordered'] = X_train.Cabin.map(ordered_labels)
X_test['Cabin_ordered'] = X_test.Cabin.map(ordered_labels)
# check the results
X_train.head()
Cabin | Survived | Cabin_ordered | |
---|---|---|---|
857 | E | 1 | 1.049822 |
52 | D | 1 | 0.810930 |
386 | M | 0 | -0.830169 |
124 | D | 0 | 0.810930 |
578 | M | 0 | -0.830169 |
# plot the original variable
fig = plt.figure(figsize=(8,6))
fig = X_train.groupby(['Cabin'])['Survived'].mean().plot()
fig.set_title('Normal relationship between variable and target')
fig.set_ylabel('Survived')
Text(0, 0.5, 'Survived')
# plot the transformed result: the monotonic variable
fig = plt.figure(figsize=(8,6))
fig = X_train.groupby(['Cabin_ordered'])['Survived'].mean().plot()
fig.set_title('Monotonic relationship between variable and target')
fig.set_ylabel('Survived')
Text(0, 0.5, 'Survived')
We can see in the above plot that there is now a monotonic relationship between the encoded Cabin variable and the probability of survival: the higher the encoded (WoE) value of Cabin, the more likely the person was to survive.
5. Variable Transformation ¶
Some machine learning models like linear and logistic regression assume that the variables are normally distributed. Others benefit from Gaussian-like distributions, as in such distributions the observations of X available to predict Y vary across a greater range of values. Thus, Gaussian distributed variables may boost the machine learning algorithm performance.
If a variable is not normally distributed, sometimes it is possible to find a mathematical transformation so that the transformed variable is Gaussian. Typically used mathematical transformations are:
Logarithm transformation - log(x)
Reciprocal transformation - 1 / x
Square root transformation - sqrt(x)
Exponential transformation - exp(x)
Box-Cox transformation
Now, let's demonstrate the above transformations on the titanic dataset.
# load the numerical variables of the Titanic Dataset
data = pd.read_csv('/kaggle/input/titanic/train.csv', usecols = ['Age', 'Fare', 'Survived'])
data.head()
Survived | Age | Fare | |
---|---|---|---|
0 | 0 | 22.0 | 7.2500 |
1 | 1 | 38.0 | 71.2833 |
2 | 1 | 26.0 | 7.9250 |
3 | 1 | 35.0 | 53.1000 |
4 | 0 | 35.0 | 8.0500 |
Fill missing data with random sample¶
# first I will fill the missing data of the variable age, with a random sample of the variable
def impute_na(data, variable):
# function to fill na with a random sample
df = data.copy()
# random sampling
df[variable+'_random'] = df[variable]
# extract the random sample to fill the na
random_sample = df[variable].dropna().sample(df[variable].isnull().sum(), random_state=0)
# pandas needs to have the same index in order to merge datasets
random_sample.index = df[df[variable].isnull()].index
df.loc[df[variable].isnull(), variable+'_random'] = random_sample
return df[variable+'_random']
# fill na
data['Age'] = impute_na(data, 'Age')
# plot the histograms to have a quick look at the distributions
# we can plot Q-Q plots to visualise if the variable is normally distributed
def diagnostic_plots(df, variable):
# function to plot a histogram and a Q-Q plot
# side by side, for a certain variable
plt.figure(figsize=(15,6))
plt.subplot(1, 2, 1)
df[variable].hist()
plt.subplot(1, 2, 2)
stats.probplot(df[variable], dist="norm", plot=pylab)
plt.show()
diagnostic_plots(data, 'Age')
The variable Age is almost normally distributed, except for some observations on the lower tail of the distribution. Note the slight skew to the left in the histogram, and the deviation from the straight line towards the lower values in the Q-Q plot. In the following cells, I will apply the above mentioned transformations and compare the distributions of the transformed Age variable.
5.1 Logarithmic transformation ¶
### Logarithmic transformation
data['Age_log'] = np.log(data.Age)
diagnostic_plots(data, 'Age_log')
- The logarithmic transformation did not produce a Gaussian-like distribution for Age.
5.2 Reciprocal transformation ¶
### Reciprocal transformation
data['Age_reciprocal'] = 1 / data.Age
diagnostic_plots(data, 'Age_reciprocal')
The reciprocal transformation was also not useful to transform Age into a normally distributed variable.
5.3 Square root transformation ¶
data['Age_sqr'] =data.Age**(1/2)
diagnostic_plots(data, 'Age_sqr')
The square root transformation is a bit more successful than the previous two transformations. However, the variable is still not Gaussian, and this does not represent an improvement towards normality with respect to the original distribution of Age.
5.4 Exponential Transformation ¶
data['Age_exp'] = data.Age**(1/1.2)
diagnostic_plots(data, 'Age_exp')
The exponential transformation is the best of the transformations above at generating a normally distributed variable. Comparing the histogram and Q-Q plot of the exponentially transformed Age with the original distribution, we can say that the transformed variable follows a Gaussian distribution more closely.
5.5 BoxCox transformation ¶
The Box-Cox transformation is defined as:
T(Y) = (Y^λ − 1) / λ
where Y is the response variable and λ is the transformation parameter. λ varies from -5 to 5. In the transformation, all values of λ are considered and the optimal value for a given variable is selected.
Briefly, for each λ (the transformation tests several λs), the correlation coefficient of the Probability Plot (Q-Q plot below, correlation between ordered values and theoretical quantiles) is calculated.
The value of λ corresponding to the maximum correlation on the plot is then the optimal choice for λ.
In python, we can evaluate and obtain the best λ with the stats.boxcox function from the package scipy.
We can proceed as follows -
data['Age_boxcox'], param = stats.boxcox(data.Age)
print('Optimal λ: ', param)
diagnostic_plots(data, 'Age_boxcox')
Optimal λ: 0.7648522500282625
The Box-Cox transformation was as good as the exponential transformation we performed above at making Age look more Gaussian. Whether we proceed with the original variable or the transformed variable will depend on the purpose of the exercise.
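As an alternative (my addition, not part of the original course material), scikit-learn (already used in this kernel for train_test_split) provides a PowerTransformer that finds the optimal λ and applies the transformation in one step. A minimal sketch, noting that it standardises the output by default:
# a minimal sketch using scikit-learn's PowerTransformer (not from the course);
# method='box-cox' requires strictly positive values, while 'yeo-johnson' also
# accepts zeros and negatives; the output is standardised (zero mean, unit variance)
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer(method='box-cox')
data['Age_boxcox_sk'] = pt.fit_transform(data[['Age']]).ravel()
print('Fitted λ:', pt.lambdas_)
diagnostic_plots(data, 'Age_boxcox_sk')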
6. Discretization ¶
- Discretisation is the process of transforming continuous variables into discrete variables by creating a set of contiguous intervals that spans the range of the variable's values.
Discretisation helps handle outliers and highly skewed variables¶
Discretisation helps handle outliers by placing these values into the lower or higher intervals together with the remaining inlier values of the distribution. Thus, these outlier observations no longer differ from the rest of the values at the tails of the distribution, as they are now all together in the same interval / bucket. In addition, by creating appropriate bins or intervals, discretisation can help spread the values of a skewed variable across a set of bins with equal number of observations.
There are several approaches to transform continuous variables into discrete ones. This process is also known as binning, with each bin being each interval.
Discretisation refers to sorting the values of the variable into bins or intervals, also called buckets. There are multiple ways to discretise variables:
Equal width discretisation
Equal Frequency discretisation
Domain knowledge discretisation
Discretisation using decision trees (a short sketch follows this list)
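Since tree-based discretisation is listed above but not demonstrated later in this kernel, here is a minimal, self-contained sketch of my own, reusing the Titanic file path and columns used throughout: a shallow decision tree is fit with the variable as the only predictor of the target, and the probability predicted at each leaf becomes the discrete value of the variable.
# a minimal sketch of discretisation using a decision tree (my illustration,
# not code from the course); each tree leaf defines one bin
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

df = pd.read_csv('/kaggle/input/titanic/train.csv', usecols=['Age', 'Survived']).dropna()

tree = DecisionTreeClassifier(max_depth=2, random_state=0)   # shallow tree => few bins
tree.fit(df[['Age']], df['Survived'])

# the predicted probability of survival at each leaf is the discrete value
df['Age_tree'] = tree.predict_proba(df[['Age']])[:, 1]
print(sorted(df['Age_tree'].unique()))   # at most 2**max_depth distinct values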
Discretising data with pandas cut and qcut functions¶
When dealing with continuous numeric data, it is often helpful to bin the data into multiple buckets for further analysis. Pandas supports these approaches using the cut and qcut functions.
The cut command creates equal-width bins, but the number of samples may differ from bin to bin.
The qcut command creates bins of unequal width, but each bin holds (approximately) the same number of samples.
6.1 Equal width discretisation with pandas cut function ¶
Equal width binning divides the range of possible values into N bins of the same width. The width is determined by the range of values in the variable and the number of bins we wish to use to divide the variable.
width = (max value - min value) / N
For example, if the values of the variable vary between 0 and 100, we create 5 bins like this: width = (100 - 0) / 5 = 20. The bins thus are 0-20, 20-40, 40-60, 60-80 and 80-100. The first and final bins (0-20 and 80-100) can be expanded to accommodate outliers (that is, values under 0 or greater than 100 would be placed in those bins as well).
There is no rule of thumb to define N. Typically, we would not want more than 10.
Source : https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.cut.html
# define x
x = np.array([24, 7, 2, 25, 22, 29])
x
array([24, 7, 2, 25, 22, 29])
# equal width discretisation with cut
pd.cut(x, bins = 3, labels = ["bad", "medium", "good"]).value_counts() # each bin spans an equal width of 9
bad 2 medium 0 good 4 dtype: int64
6.2 Equal frequency discretisation with pandas qcut function ¶
Equal frequency binning divides the scope of possible values of the variable into N bins, where each bin carries the same amount of observations. This is particularly useful for skewed variables as it spreads the observations over the different bins equally. Typically, we find the interval boundaries by determining the quantiles.
Equal frequency discretisation using quantiles consists of dividing the continuous variable into N quantiles, N to be defined by the user. There is no rule of thumb to define N. However, if we think of the discrete variable as a categorical variable, where each bin is a category, we would like to keep N (the number of categories) low (typically no more than 10).
Source : https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.qcut.html
# define x
x = np.array([24, 7, 2, 25, 22, 29])
x
array([24, 7, 2, 25, 22, 29])
# equal frequency discretisation with qcut
pd.qcut(x, q = 3, labels = ["bad", "medium", "good"]).value_counts() # equal frequency of 2 observations in each bin
bad 2 medium 2 good 2 dtype: int64
6.3 Domain knowledge discretisation ¶
Frequently, when engineering variables in a business setting, the business experts determine the intervals into which they think the variable should be divided so that it makes sense for the business. These intervals may be defined arbitrarily or following some criteria of use to the business. Typical examples are the discretisation of variables like Age and Income.
Income, for example, is usually capped at a certain maximum value, and all incomes above that value fall into the last bucket. Age is usually divided into groups according to the business need; for example, a division into 0-20 (minors), 20-30 (young adults), 30-40, 40-60, and > 60 (retired or close to retirement) is frequent.
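For illustration only, the income example can be sketched as follows; the income values and bucket thresholds below are hypothetical, and np.inf is used as the upper edge so that all very high incomes fall into the last bucket.
# a hypothetical illustration of business-defined income buckets;
# the values and thresholds are made up for the example
import numpy as np
import pandas as pd

income = pd.Series([12000, 35000, 58000, 91000, 250000])
income_buckets = [0, 25000, 50000, 100000, np.inf]   # np.inf captures all high incomes
income_labels = ['0-25k', '25-50k', '50-100k', '>100k']
pd.cut(income, bins=income_buckets, labels=income_labels)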
# load the numerical variables of the Titanic Dataset
data = pd.read_csv('/kaggle/input/titanic/train.csv', usecols = ['Age', 'Survived'])
data.head()
Survived | Age | |
---|---|---|
0 | 0 | 22.0 |
1 | 1 | 38.0 |
2 | 1 | 26.0 |
3 | 1 | 35.0 |
4 | 0 | 35.0 |
The variable Age contains missing data, which I will fill by extracting a random sample of the variable.
def impute_na(data, variable):
df = data.copy()
# random sampling
df[variable+'_random'] = df[variable]
# extract the random sample to fill the na
random_sample = data[variable].dropna().sample(df[variable].isnull().sum(), random_state=0)
# pandas needs to have the same index in order to merge datasets
random_sample.index = df[df[variable].isnull()].index
df.loc[df[variable].isnull(), variable+'_random'] = random_sample
return df[variable+'_random']
# let's fill the missing data
data['Age'] = impute_na(data, 'Age')
data['Age'].isnull().sum()
0
# let's divide age into the buckets
# bucket boundaries
buckets = [0,20,40,60,100]
# bucket labels
labels = ['0-20', '20-40', '40-60', '>60']
# discretisation
pd.cut(data.Age, bins = buckets, labels = labels, include_lowest=True).value_counts()
20-40 486 0-20 221 40-60 156 >60 28 Name: Age, dtype: int64
# create two new columns after discretisation
data['Age_buckets_labels'] = pd.cut(data.Age, bins=buckets, labels = labels, include_lowest=True)
data['Age_buckets'] = pd.cut(data.Age, bins=buckets, include_lowest=True)
data.head()
Survived | Age | Age_buckets_labels | Age_buckets | |
---|---|---|---|---|
0 | 0 | 22.0 | 20-40 | (20.0, 40.0] |
1 | 1 | 38.0 | 20-40 | (20.0, 40.0] |
2 | 1 | 26.0 | 20-40 | (20.0, 40.0] |
3 | 1 | 35.0 | 20-40 | (20.0, 40.0] |
4 | 0 | 35.0 | 20-40 | (20.0, 40.0] |
data.tail()
Survived | Age | Age_buckets_labels | Age_buckets | |
---|---|---|---|---|
886 | 0 | 27.0 | 20-40 | (20.0, 40.0] |
887 | 1 | 19.0 | 0-20 | (-0.001, 20.0] |
888 | 0 | 15.0 | 0-20 | (-0.001, 20.0] |
889 | 1 | 26.0 | 20-40 | (20.0, 40.0] |
890 | 0 | 32.0 | 20-40 | (20.0, 40.0] |
- We can observe the buckets into which each Age observation was placed. For example, age 27 was placed into the 20-40 bucket.
# number of passengers per age bucket
plt.figure(figsize=(12,8))
data.groupby('Age_buckets_labels')['Age'].count().plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x7f2d06d67b70>
- We can see that each age bucket contains a different number of passengers.
7. Outlier Engineering ¶
Outliers are values that are unusually high or unusually low with respect to the rest of the observations of the variable. There are a few techniques for outlier handling:
Outlier removal
Treating outliers as missing values
Discretisation
Top / bottom / zero coding
Identifying outliers¶
Extreme Value Analysis¶
The most basic form of outlier detection is Extreme Value Analysis of 1-dimensional data. The key for this method is to determine the statistical tails of the underlying distribution of the variable, and then find the values that sit at the very end of those tails.
In the typical scenario, the distribution of the variable is Gaussian and thus outliers will lie outside the mean plus or minus 3 times the standard deviation of the variable.
If the variable is not normally distributed, a general approach is to calculate the quantiles, and then the interquantile range (IQR), as follows:
IQR = 75th quantile - 25th quantile
An outlier will sit outside the following upper and lower boundaries:
Upper boundary = 75th quantile + (IQR * 1.5)
Lower boundary = 25th quantile - (IQR * 1.5)
or for extreme cases:
Upper boundary = 75th quantile + (IQR * 3)
Lower boundary = 25th quantile - (IQR * 3)
7.1 Outlier removal ¶
- Outlier removal refers to removing outlier observations from the dataset. Outliers, by nature, are not abundant, so this procedure should not distort the dataset dramatically. However, if there are outliers across multiple variables, we may end up removing a large portion of the dataset.
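A minimal sketch of outlier removal, using the interquantile range rule described above on the Titanic Fare variable (reloading the file used throughout this kernel):
# a minimal sketch of outlier removal with the interquantile range rule
import pandas as pd

df = pd.read_csv('/kaggle/input/titanic/train.csv', usecols=['Fare'])

IQR = df.Fare.quantile(0.75) - df.Fare.quantile(0.25)
lower_fence = df.Fare.quantile(0.25) - (IQR * 3)
upper_fence = df.Fare.quantile(0.75) + (IQR * 3)

# keep only the observations that fall within the boundaries
df_trimmed = df[(df.Fare >= lower_fence) & (df.Fare <= upper_fence)]
print(len(df), len(df_trimmed))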
7.2 Treating outliers as missing values ¶
- We can treat outliers as missing information and apply any of the imputation methods described earlier in this kernel.
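A minimal sketch, reusing the same Fare boundaries: flag the outliers as NaN and then impute them, here with the median for brevity (any of the imputation techniques from section 3 could be used instead):
# a minimal sketch: treat Fare outliers as missing values and impute them
import numpy as np
import pandas as pd

df = pd.read_csv('/kaggle/input/titanic/train.csv', usecols=['Fare'])

IQR = df.Fare.quantile(0.75) - df.Fare.quantile(0.25)
upper_fence = df.Fare.quantile(0.75) + (IQR * 3)

df.loc[df.Fare > upper_fence, 'Fare'] = np.nan          # flag outliers as missing
df['Fare'] = df['Fare'].fillna(df['Fare'].median())     # impute the flagged values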
7.3 Discretisation ¶
- Discretisation handles outliers automatically, as outliers are sorted into the terminal bins, together with the other higher or lower value observations. The best approaches are equal frequency and tree based discretisation.
7.4 Top /bottom / zero coding ¶
Top- and bottom-coding are also known as Winsorisation or outlier capping. The procedure involves capping the maximum and minimum values at a predefined value. This predefined value can be arbitrary, or it can be derived from the variable distribution.
If the variable is normally distributed we can cap the maximum and minimum values at the mean plus or minus 3 times the standard deviation. If the variable is skewed, we can use the inter-quantile range proximity rule or cap at the top and bottom percentiles.
This is demonstrated using the titanic dataset below:-
# load the numerical variables of the Titanic Dataset
data = pd.read_csv('/kaggle/input/titanic/train.csv', usecols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Survived'])
data.head()
Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|
0 | 0 | 3 | 22.0 | 1 | 0 | 7.2500 |
1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 |
2 | 1 | 3 | 26.0 | 0 | 0 | 7.9250 |
3 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 |
4 | 0 | 3 | 35.0 | 0 | 0 | 8.0500 |
Important note on top-coding¶
Top-coding and bottom-coding, like any other feature pre-processing step, should be determined using the training set only and then transferred onto the test set. This means that we should find the upper and lower bounds in the training set, and use those bounds to cap the values in the test set.
# divide dataset into train and test set
X_train, X_test, y_train, y_test = train_test_split(data, data.Survived,
test_size=0.3,
random_state=0)
X_train.shape, X_test.shape
((623, 6), (268, 6))
Outliers in continuous variables¶
- We can see that Age and Fare are continuous variables. So, first I will cap the outliers in those variables.
# let's make boxplots to visualise outliers in the continuous variables
# Age and Fare
plt.figure(figsize=(15,6))
plt.subplot(1, 2, 1)
fig = data.boxplot(column='Age')
fig.set_title('')
fig.set_ylabel('Age')
plt.subplot(1, 2, 2)
fig = data.boxplot(column='Fare')
fig.set_title('')
fig.set_ylabel('Fare')
Text(0, 0.5, 'Fare')
- Both Age and Fare contain outliers. Let's find which values are the outliers.
# first we plot the distributions to find out if they are Gaussian or skewed.
# Depending on the distribution, we will use the normal assumption or the interquantile
# range to find outliers
plt.figure(figsize=(15,6))
plt.subplot(1, 2, 1)
fig = data.Age.hist(bins=20)
fig.set_ylabel('Number of passengers')
fig.set_xlabel('Age')
plt.subplot(1, 2, 2)
fig = data.Fare.hist(bins=20)
fig.set_ylabel('Number of passengers')
fig.set_xlabel('Fare')
Text(0.5, 0, 'Fare')
Age is quite Gaussian and Fare is skewed, so I will use the Gaussian assumption for Age, and the interquantile range for Fare.
# find outliers
# Age
Upper_boundary = data.Age.mean() + 3* data.Age.std()
Lower_boundary = data.Age.mean() - 3* data.Age.std()
print('Age outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_boundary, upperboundary=Upper_boundary))
# Fare
IQR = data.Fare.quantile(0.75) - data.Fare.quantile(0.25)
Lower_fence = data.Fare.quantile(0.25) - (IQR * 3)
Upper_fence = data.Fare.quantile(0.75) + (IQR * 3)
print('Fare outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))
Age outliers are values < -13.88037434994331 or > 73.27860964406095 Fare outliers are values < -61.358399999999996 or > 100.2688
Age¶
- For the Age variable, the outliers lie only on the right of the distribution. Therefore, we only need to introduce top-coding.
# view the statistical summary of Age
data.Age.describe()
count 714.000000 mean 29.699118 std 14.526497 min 0.420000 25% 20.125000 50% 28.000000 75% 38.000000 max 80.000000 Name: Age, dtype: float64
# Assuming normality
Upper_boundary = X_train.Age.mean() + 3* X_train.Age.std()
Upper_boundary
73.43632005918366
# top-coding the Age variable
X_train.loc[X_train.Age>73, 'Age'] = 73
X_test.loc[X_test.Age>73, 'Age'] = 73
X_train.Age.max(), X_test.Age.max()
(73.0, 70.5)
Fare¶
- The outliers, according to the above plot, all lie on the right side of the distribution. That is, some people paid extremely high prices for their tickets. Therefore, in this variable, only extremely high values will affect the performance of our machine learning models, so we only need top-coding.
# view statistical properties of Fare
X_train.Fare.describe()
count 623.000000 mean 32.458273 std 48.257658 min 0.000000 25% 7.925000 50% 15.000000 75% 31.387500 max 512.329200 Name: Fare, dtype: float64
# top coding: upper boundary for outliers according to interquantile proximity rule
IQR = data.Fare.quantile(0.75) - data.Fare.quantile(0.25)
Upper_fence = X_train.Fare.quantile(0.75) + (IQR * 3)
Upper_fence
100.6563
The upper boundary, above which every value is considered an outlier, is a Fare of roughly 100 dollars.
# top-coding: capping the variable Fare at 100
X_train.loc[X_train.Fare>100, 'Fare'] = 100
X_test.loc[X_test.Fare>100, 'Fare'] = 100
X_train.Fare.max(), X_test.Fare.max()
(100.0, 100.0)
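Capping at the top and bottom percentiles, mentioned in section 7.4 as an alternative for skewed variables, can be sketched as follows; the 1st and 99th percentiles are an arbitrary choice for illustration, and the boundaries are learnt on the training set only.
# a minimal sketch of percentile capping (an alternative to the rules used above);
# the 1st / 99th percentiles are arbitrary choices for illustration
fare_lower = X_train.Fare.quantile(0.01)
fare_upper = X_train.Fare.quantile(0.99)

X_train['Fare_capped'] = X_train.Fare.clip(lower=fare_lower, upper=fare_upper)
X_test['Fare_capped'] = X_test.Fare.clip(lower=fare_lower, upper=fare_upper)
X_train.Fare_capped.max(), X_test.Fare_capped.max()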
Thus we deal with outliers from a machine learning perspective.
8. Date and Time Engineering ¶
Date variables are a special type of categorical variable. By their nature, date variables contain a multitude of different labels, each one corresponding to a specific date and sometimes a time. Date variables, when preprocessed properly, can greatly enrich a dataset. For example, from a date variable we can extract:
- Month
- Quarter
- Semester
- Day (number)
- Day of the week
- Is Weekend?
- Hour
- Time differences in years, months, days, hours, etc.
It is important to understand that date variables should not be used as the categorical variables we have been working with so far when building a machine learning model. Not only because they have a multitude of categories, but also because when we actually use the model to score a new observation, this observation will most likely be in the future, and therefore its date label will be different from the ones contained in the training set and used to train the machine learning algorithm.
- I will use the Lending Club dataset for the demonstration -
# let's load the Lending Club dataset with selected columns and rows
use_cols = ['issue_d', 'last_pymnt_d']
data = pd.read_csv('/kaggle/input/lending-club-loan-data/loan.csv', usecols=use_cols, nrows=10000)
data.head()
issue_d | last_pymnt_d | |
---|---|---|
0 | Dec-2018 | Feb-2019 |
1 | Dec-2018 | Feb-2019 |
2 | Dec-2018 | Feb-2019 |
3 | Dec-2018 | Feb-2019 |
4 | Dec-2018 | Feb-2019 |
# now let's parse the dates, currently coded as strings, into datetime format
data['issue_dt'] = pd.to_datetime(data.issue_d)
data['last_pymnt_dt'] = pd.to_datetime(data.last_pymnt_d)
data[['issue_d','issue_dt','last_pymnt_d', 'last_pymnt_dt']].head()
issue_d | issue_dt | last_pymnt_d | last_pymnt_dt | |
---|---|---|---|---|
0 | Dec-2018 | 2018-12-01 | Feb-2019 | 2019-02-01 |
1 | Dec-2018 | 2018-12-01 | Feb-2019 | 2019-02-01 |
2 | Dec-2018 | 2018-12-01 | Feb-2019 | 2019-02-01 |
3 | Dec-2018 | 2018-12-01 | Feb-2019 | 2019-02-01 |
4 | Dec-2018 | 2018-12-01 | Feb-2019 | 2019-02-01 |
# Extracting Month from date
data['issue_dt_month'] = data['issue_dt'].dt.month
data[['issue_dt', 'issue_dt_month']].head()
issue_dt | issue_dt_month | |
---|---|---|
0 | 2018-12-01 | 12 |
1 | 2018-12-01 | 12 |
2 | 2018-12-01 | 12 |
3 | 2018-12-01 | 12 |
4 | 2018-12-01 | 12 |
data[['issue_dt', 'issue_dt_month']].tail()
issue_dt | issue_dt_month | |
---|---|---|
9995 | 2018-12-01 | 12 |
9996 | 2018-12-01 | 12 |
9997 | 2018-12-01 | 12 |
9998 | 2018-12-01 | 12 |
9999 | 2018-12-01 | 12 |
# Extract quarter from date variable
data['issue_dt_quarter'] = data['issue_dt'].dt.quarter
data[['issue_dt', 'issue_dt_quarter']].head()
issue_dt | issue_dt_quarter | |
---|---|---|
0 | 2018-12-01 | 4 |
1 | 2018-12-01 | 4 |
2 | 2018-12-01 | 4 |
3 | 2018-12-01 | 4 |
4 | 2018-12-01 | 4 |
data[['issue_dt', 'issue_dt_quarter']].tail()
issue_dt | issue_dt_quarter | |
---|---|---|
9995 | 2018-12-01 | 4 |
9996 | 2018-12-01 | 4 |
9997 | 2018-12-01 | 4 |
9998 | 2018-12-01 | 4 |
9999 | 2018-12-01 | 4 |
# We could also extract semester
data['issue_dt_semester'] = np.where(data.issue_dt_quarter.isin([1,2]),1,2)
data.head()
issue_d | last_pymnt_d | issue_dt | last_pymnt_dt | issue_dt_month | issue_dt_quarter | issue_dt_semester | |
---|---|---|---|---|---|---|---|
0 | Dec-2018 | Feb-2019 | 2018-12-01 | 2019-02-01 | 12 | 4 | 2 |
1 | Dec-2018 | Feb-2019 | 2018-12-01 | 2019-02-01 | 12 | 4 | 2 |
2 | Dec-2018 | Feb-2019 | 2018-12-01 | 2019-02-01 | 12 | 4 | 2 |
3 | Dec-2018 | Feb-2019 | 2018-12-01 | 2019-02-01 | 12 | 4 | 2 |
4 | Dec-2018 | Feb-2019 | 2018-12-01 | 2019-02-01 | 12 | 4 | 2 |
# day - numeric from 1-31
data['issue_dt_day'] = data['issue_dt'].dt.day
data[['issue_dt', 'issue_dt_day']].head()
issue_dt | issue_dt_day | |
---|---|---|
0 | 2018-12-01 | 1 |
1 | 2018-12-01 | 1 |
2 | 2018-12-01 | 1 |
3 | 2018-12-01 | 1 |
4 | 2018-12-01 | 1 |
# day of the week - from 0 to 6
data['issue_dt_dayofweek'] = data['issue_dt'].dt.dayofweek
data[['issue_dt', 'issue_dt_dayofweek']].head()
issue_dt | issue_dt_dayofweek | |
---|---|---|
0 | 2018-12-01 | 5 |
1 | 2018-12-01 | 5 |
2 | 2018-12-01 | 5 |
3 | 2018-12-01 | 5 |
4 | 2018-12-01 | 5 |
data[['issue_dt', 'issue_dt_dayofweek']].tail()
issue_dt | issue_dt_dayofweek | |
---|---|---|
9995 | 2018-12-01 | 5 |
9996 | 2018-12-01 | 5 |
9997 | 2018-12-01 | 5 |
9998 | 2018-12-01 | 5 |
9999 | 2018-12-01 | 5 |
# day of the week - name
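# note: dt.weekday_name was removed in pandas 1.0; in newer versions use dt.day_name() instead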
data['issue_dt_dayofweek'] = data['issue_dt'].dt.weekday_name
data[['issue_dt', 'issue_dt_dayofweek']].head()
issue_dt | issue_dt_dayofweek | |
---|---|---|
0 | 2018-12-01 | Saturday |
1 | 2018-12-01 | Saturday |
2 | 2018-12-01 | Saturday |
3 | 2018-12-01 | Saturday |
4 | 2018-12-01 | Saturday |
data[['issue_dt', 'issue_dt_dayofweek']].tail()
issue_dt | issue_dt_dayofweek | |
---|---|---|
9995 | 2018-12-01 | Saturday |
9996 | 2018-12-01 | Saturday |
9997 | 2018-12-01 | Saturday |
9998 | 2018-12-01 | Saturday |
9999 | 2018-12-01 | Saturday |
# was the application done on the weekend?
data['issue_dt_is_weekend'] = np.where(data['issue_dt_dayofweek'].isin(['Sunday', 'Saturday']), 1,0)
data[['issue_dt', 'issue_dt_dayofweek','issue_dt_is_weekend']].head()
issue_dt | issue_dt_dayofweek | issue_dt_is_weekend | |
---|---|---|---|
0 | 2018-12-01 | Saturday | 1 |
1 | 2018-12-01 | Saturday | 1 |
2 | 2018-12-01 | Saturday | 1 |
3 | 2018-12-01 | Saturday | 1 |
4 | 2018-12-01 | Saturday | 1 |
data[data.issue_dt_is_weekend==1][['issue_dt', 'issue_dt_dayofweek','issue_dt_is_weekend']].head()
issue_dt | issue_dt_dayofweek | issue_dt_is_weekend | |
---|---|---|---|
0 | 2018-12-01 | Saturday | 1 |
1 | 2018-12-01 | Saturday | 1 |
2 | 2018-12-01 | Saturday | 1 |
3 | 2018-12-01 | Saturday | 1 |
4 | 2018-12-01 | Saturday | 1 |
# extract year
data['issue_dt_year'] = data['issue_dt'].dt.year
data[['issue_dt', 'issue_dt_year']].head()
issue_dt | issue_dt_year | |
---|---|---|
0 | 2018-12-01 | 2018 |
1 | 2018-12-01 | 2018 |
2 | 2018-12-01 | 2018 |
3 | 2018-12-01 | 2018 |
4 | 2018-12-01 | 2018 |
# extract the date difference between 2 dates
data['issue_dt'] - data['last_pymnt_dt']
0 -62 days 1 -62 days 2 -62 days 3 -62 days 4 -62 days ... 9995 -62 days 9996 -62 days 9997 -62 days 9998 -62 days 9999 -62 days Length: 10000, dtype: timedelta64[ns]
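The raw difference above is a timedelta. To use it as a numeric feature, it can be converted into days and then approximated in months or years; a minimal sketch, where 30.44 and 365.25 are the usual average month and year lengths in days:
# a minimal sketch: convert the timedelta into numeric features
data['days_to_last_pymnt'] = (data['last_pymnt_dt'] - data['issue_dt']).dt.days
data['months_to_last_pymnt'] = data['days_to_last_pymnt'] / 30.44    # approximate months
data['years_to_last_pymnt'] = data['days_to_last_pymnt'] / 365.25    # approximate years
data[['issue_dt', 'last_pymnt_dt', 'days_to_last_pymnt', 'months_to_last_pymnt']].head()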
9. References ¶
This kernel is based on -
Soledad Galli's course - Feature Engineering for Machine Learning, and
Her article - Feature Engineering for Machine Learning: A Comprehensive Overview.