I will analyse a dataset containing demographic and credit card status information on 10,127 customers of a bank (source) to build a model for predicting churn and to identify the crucial features, so that the bank can devise more effective retention strategies. This analysis is thus a supervised classification problem.
This analysis demonstrates my skills in data exploration, manipulation and visualisation, in machine learning model training, and in telling stories with data to solve business problems.
I will use pandas and numpy for data exploration and manipulation, matplotlib and seaborn for data visualisation, and scikit-learn and catboost for data preprocessing, machine learning model building and evaluation.
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
plt.rcParams['figure.figsize'] = [20, 6]
sns.set_theme(style='whitegrid')
warnings.simplefilter(action='ignore')
credit_card = pd.read_csv('BankChurners.csv')
credit_card.head()
 | CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 | 0.000093 | 0.99991 |
1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 | 0.000057 | 0.99994 |
2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 | 0.000021 | 0.99998 |
3 | 769911858 | Existing Customer | 40 | F | 4 | High School | Unknown | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 | 0.000134 | 0.99987 |
4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 | 0.000022 | 0.99998 |
First of all, let's look at the data types of the features and check whether any missing values are present in the dataset. No missing values are observed, and all features have sensible types. Attrition_Flag will be the target variable, indicating whether a customer has churned or not.
Moreover, the last two columns of the dataset appear to be the outputs of a previous attempt at building a naive Bayes model to predict churn. Let's drop these two columns, since we are more interested in how the existing features can help predict customer churn. The CLIENTNUM column, which merely holds a unique client identifier, should also be dropped since it will not be a useful feature for the classification task.
# Dropping the redundant columns
credit_card.drop(columns=credit_card.columns[[0, 21, 22]], inplace=True)
credit_card.head()
credit_card.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Attrition_Flag            10127 non-null  object
 1   Customer_Age              10127 non-null  int64
 2   Gender                    10127 non-null  object
 3   Dependent_count           10127 non-null  int64
 4   Education_Level           10127 non-null  object
 5   Marital_Status            10127 non-null  object
 6   Income_Category           10127 non-null  object
 7   Card_Category             10127 non-null  object
 8   Months_on_book            10127 non-null  int64
 9   Total_Relationship_Count  10127 non-null  int64
 10  Months_Inactive_12_mon    10127 non-null  int64
 11  Contacts_Count_12_mon     10127 non-null  int64
 12  Credit_Limit              10127 non-null  float64
 13  Total_Revolving_Bal       10127 non-null  int64
 14  Avg_Open_To_Buy           10127 non-null  float64
 15  Total_Amt_Chng_Q4_Q1      10127 non-null  float64
 16  Total_Trans_Amt           10127 non-null  int64
 17  Total_Trans_Ct            10127 non-null  int64
 18  Total_Ct_Chng_Q4_Q1       10127 non-null  float64
 19  Avg_Utilization_Ratio     10127 non-null  float64
dtypes: float64(5), int64(9), object(6)
memory usage: 1.5+ MB
The next step is to look at the summary statistics of the columns. Starting with the numerical features, there do not seem to be any abnormal values caused by data entry or other human errors, and all values fall within reasonable ranges. The tables below show the overall summary statistics as well as the statistics broken down by churning status.
print('Overall summary statistics of the numerical features:')
display(credit_card.describe())
print('\nSummary statistics of the numerical features by churning status:')
credit_card.groupby('Attrition_Flag').describe()
Overall summary statistics of the numerical features:
 | Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
count | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 |
mean | 46.325960 | 2.346203 | 35.928409 | 3.812580 | 2.341167 | 2.455317 | 8631.953698 | 1162.814061 | 7469.139637 | 0.759941 | 4404.086304 | 64.858695 | 0.712222 | 0.274894 |
std | 8.016814 | 1.298908 | 7.986416 | 1.554408 | 1.010622 | 1.106225 | 9088.776650 | 814.987335 | 9090.685324 | 0.219207 | 3397.129254 | 23.472570 | 0.238086 | 0.275691 |
min | 26.000000 | 0.000000 | 13.000000 | 1.000000 | 0.000000 | 0.000000 | 1438.300000 | 0.000000 | 3.000000 | 0.000000 | 510.000000 | 10.000000 | 0.000000 | 0.000000 |
25% | 41.000000 | 1.000000 | 31.000000 | 3.000000 | 2.000000 | 2.000000 | 2555.000000 | 359.000000 | 1324.500000 | 0.631000 | 2155.500000 | 45.000000 | 0.582000 | 0.023000 |
50% | 46.000000 | 2.000000 | 36.000000 | 4.000000 | 2.000000 | 2.000000 | 4549.000000 | 1276.000000 | 3474.000000 | 0.736000 | 3899.000000 | 67.000000 | 0.702000 | 0.176000 |
75% | 52.000000 | 3.000000 | 40.000000 | 5.000000 | 3.000000 | 3.000000 | 11067.500000 | 1784.000000 | 9859.000000 | 0.859000 | 4741.000000 | 81.000000 | 0.818000 | 0.503000 |
max | 73.000000 | 5.000000 | 56.000000 | 6.000000 | 6.000000 | 6.000000 | 34516.000000 | 2517.000000 | 34516.000000 | 3.397000 | 18484.000000 | 139.000000 | 3.714000 | 0.999000 |
Summary statistics of the numerical features by churning status:
Feature | Attrition_Flag | count | mean | std | min | 25% | 50% | 75% | max
---|---|---|---|---|---|---|---|---|---
Customer_Age | Attrited Customer | 1627 | 46.659496 | 7.665652 | 26.0 | 41.0 | 47.0 | 52.0 | 68.0
Customer_Age | Existing Customer | 8500 | 46.262118 | 8.081157 | 26.0 | 41.0 | 46.0 | 52.0 | 73.0
Dependent_count | Attrited Customer | 1627 | 2.402581 | 1.275010 | 0.0 | 2.0 | 2.0 | 3.0 | 5.0
Dependent_count | Existing Customer | 8500 | 2.335412 | 1.303229 | 0.0 | 1.0 | 2.0 | 3.0 | 5.0
Months_on_book | Attrited Customer | 1627 | 36.178242 | 7.796548 | 13.0 | 32.0 | 36.0 | 40.0 | 56.0
Months_on_book | Existing Customer | 8500 | 35.880588 | 8.021810 | 13.0 | 31.0 | 36.0 | 40.0 | 56.0
Total_Relationship_Count | Attrited Customer | 1627 | 3.279656 | 1.577782 | 1.0 | 2.0 | 3.0 | 5.0 | 6.0
Total_Relationship_Count | Existing Customer | 8500 | 3.914588 | 1.528949 | 1.0 | 3.0 | 4.0 | 5.0 | 6.0
Months_Inactive_12_mon | Attrited Customer | 1627 | 2.693301 | 0.899623 | 0.0 | 2.0 | 3.0 | 3.0 | 6.0
Months_Inactive_12_mon | Existing Customer | 8500 | 2.273765 | 1.016741 | 0.0 | 1.0 | 2.0 | 3.0 | 6.0
Contacts_Count_12_mon | Attrited Customer | 1627 | 2.972342 | 1.090537 | 0.0 | 2.0 | 3.0 | 4.0 | 6.0
Contacts_Count_12_mon | Existing Customer | 8500 | 2.356353 | 1.081436 | 0.0 | 2.0 | 2.0 | 3.0 | 5.0
Credit_Limit | Attrited Customer | 1627 | 8136.039459 | 9095.334105 | 1438.3 | 2114.0 | 4178.0 | 9933.50 | 34516.0
Credit_Limit | Existing Customer | 8500 | 8726.877518 | 9084.969807 | 1438.3 | 2602.0 | 4643.5 | 11252.75 | 34516.0
Total_Revolving_Bal | Attrited Customer | 1627 | 672.822987 | 921.385582 | 0.0 | 0.0 | 0.0 | 1303.5 | 2517.0
Total_Revolving_Bal | Existing Customer | 8500 | 1256.604118 | 757.745354 | 0.0 | 800.0 | 1364.0 | 1807.0 | 2517.0
Avg_Open_To_Buy | Attrited Customer | 1627 | 7463.216472 | 9109.208129 | 3.0 | 1587.0 | 3488.0 | 9257.50 | 34516.0
Avg_Open_To_Buy | Existing Customer | 8500 | 7470.273400 | 9087.671862 | 15.0 | 1184.5 | 3469.5 | 9978.25 | 34516.0
Total_Amt_Chng_Q4_Q1 | Attrited Customer | 1627 | 0.694277 | 0.214924 | 0.000 | 0.5445 | 0.701 | 0.856 | 1.492
Total_Amt_Chng_Q4_Q1 | Existing Customer | 8500 | 0.772510 | 0.217783 | 0.256 | 0.6430 | 0.743 | 0.860 | 3.397
Total_Trans_Amt | Attrited Customer | 1627 | 3095.025814 | 2308.227629 | 510.0 | 1903.50 | 2329.0 | 2772.00 | 10583.0
Total_Trans_Amt | Existing Customer | 8500 | 4654.655882 | 3512.772635 | 816.0 | 2384.75 | 4100.0 | 4781.25 | 18484.0
Total_Trans_Ct | Attrited Customer | 1627 | 44.933620 | 14.568429 | 10.0 | 37.0 | 43.0 | 51.0 | 94.0
Total_Trans_Ct | Existing Customer | 8500 | 68.672588 | 22.919011 | 11.0 | 54.0 | 71.0 | 82.0 | 139.0
Total_Ct_Chng_Q4_Q1 | Attrited Customer | 1627 | 0.554386 | 0.226854 | 0.000 | 0.400 | 0.531 | 0.692 | 2.500
Total_Ct_Chng_Q4_Q1 | Existing Customer | 8500 | 0.742434 | 0.228054 | 0.028 | 0.617 | 0.721 | 0.833 | 3.714
Avg_Utilization_Ratio | Attrited Customer | 1627 | 0.162475 | 0.264458 | 0.0 | 0.000 | 0.000 | 0.23100 | 0.999
Avg_Utilization_Ratio | Existing Customer | 8500 | 0.296412 | 0.272568 | 0.0 | 0.055 | 0.211 | 0.52925 | 0.994
As for the categorical features, there are some interesting patterns exhibited as well:

- The churn rate, given by the frequency of 'Attrited Customer' in the Attrition_Flag column in the table above, is only about 16% (1,627 of 10,127 customers). This may present a class imbalance problem for the later machine learning tasks, which I should bear in mind.
- As the frequency tables below show, Card_Category is heavily dominated by the Blue tier (9,436 of 10,127 customers), so I will collapse the rarer tiers into a single 'Others' category.

print('Overall summary statistics of the categorical features:')
display(credit_card.describe(exclude='number'))
print('\nSummary statistics of the categorical features by churning status:')
display(credit_card.groupby('Attrition_Flag').describe(exclude='number'))
Overall summary statistics of the categorical features:
 | Attrition_Flag | Gender | Education_Level | Marital_Status | Income_Category | Card_Category
---|---|---|---|---|---|---
count | 10127 | 10127 | 10127 | 10127 | 10127 | 10127 |
unique | 2 | 2 | 7 | 4 | 6 | 4 |
top | Existing Customer | F | Graduate | Married | Less than $40K | Blue |
freq | 8500 | 5358 | 3128 | 4687 | 3561 | 9436 |
Summary statistics of the categorical features by churning status:
Feature | Attrition_Flag | count | unique | top | freq
---|---|---|---|---|---
Gender | Attrited Customer | 1627 | 2 | F | 930
Gender | Existing Customer | 8500 | 2 | F | 4428
Education_Level | Attrited Customer | 1627 | 7 | Graduate | 487
Education_Level | Existing Customer | 8500 | 7 | Graduate | 2641
Marital_Status | Attrited Customer | 1627 | 4 | Married | 709
Marital_Status | Existing Customer | 8500 | 4 | Married | 3978
Income_Category | Attrited Customer | 1627 | 6 | Less than $40K | 612
Income_Category | Existing Customer | 8500 | 6 | Less than $40K | 2949
Card_Category | Attrited Customer | 1627 | 4 | Blue | 1519
Card_Category | Existing Customer | 8500 | 4 | Blue | 7917
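To double-check the class imbalance flagged above, a quick sketch (not part of the original analysis) computing the churn rate directly from the target column:
# Proportion of each class in the target variable;
# 'Attrited Customer' should come out to roughly 16%
print(credit_card['Attrition_Flag'].value_counts(normalize=True))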
# Recoding the Card_Category variable
credit_card['Card_Category'] = np.where(
credit_card.Card_Category == 'Blue', 'Blue', 'Others')
assert len(credit_card.Card_Category.unique()) == 2
Let's also visualise the features by the churning status of customers. For the categorical features, the two groups of customers do not appear to differ significantly based on chi-squared tests at the $\alpha = 0.05$ level, except for Gender and Income_Category. Whether and how strongly these features are related to customer churning can be examined later by looking at their importance in the trained classification model.
from scipy.stats import chi2_contingency
# Visualising the categorical variables
fig, axes = plt.subplots(2, 3)
cat_columns = [col for col in credit_card.select_dtypes(
'object').columns if col != 'Attrition_Flag']
for col, ax in zip(cat_columns, axes.ravel()):
col_chi2_p = chi2_contingency(pd.crosstab(
credit_card.Attrition_Flag, credit_card[col]))[1]
sns.countplot(y=col, hue='Attrition_Flag', data=credit_card, ax=ax)
sns.despine()
ax.set(xlabel='', ylabel='')
ax.set_title(
f"{col.replace('_', ' ')}, chi-squared test p value: {round(col_chi2_p, 6)}")
handles, labels = ax.get_legend_handles_labels()
ax.get_legend().remove()  # drop per-axes legends; a single shared legend is added below
fig.suptitle(
'Bar plots of the frequencies of each category in the categorical features')
fig.legend(handles, labels)
plt.delaxes(axes[1, 2])
plt.tight_layout()
plt.show()
As for the numerical features, according to the Kruskal-Wallis test, which checks whether the medians of churning and non-churning customers differ, with $\alpha$ at the 0.05 level significant differences do exist for all features except Customer_Age and Months_on_book. That the median of Months_on_book does not differ significantly between churning and non-churning customers suggests that how long a customer has used the bank's services may not be closely related to the probability of churning.
In terms of the more substantive differences, here are some observations from the violin plots of these numerical features:

- The total transaction amounts (Total_Trans_Amt) and counts (Total_Trans_Ct) of existing customers are higher than those of churning ones.

from scipy.stats import kruskal
num_columns = list(set(credit_card.drop(columns='Attrition_Flag').columns).difference(
cat_columns))
# Visualising numerical features
fig, axes = plt.subplots(3, 5)
for column, ax in zip(num_columns, axes.ravel()):
kruskal_p = kruskal(credit_card.query("`Attrition_Flag` == 'Existing Customer'")[
column], credit_card.query("`Attrition_Flag` == 'Attrited Customer'")[column],
nan_policy='omit')[1]
sns.violinplot(x='Attrition_Flag', y=column,
data=credit_card, ax=ax)  # split=True dropped: it only applies with a two-level hue
ax.set(xlabel='', ylabel='',
title=f"{column.replace('_', ' ')}, Kruskal p value: {round(kruskal_p, 6)}")
sns.despine()
fig.suptitle(
'Distribution of numerical features by churning status of customers')
fig.delaxes(axes[2, 4])
plt.tight_layout()
plt.show()
Now that I have explored the data and performed the necessary transformations, it is time to build models to predict customer churn. Specifically, I will try one linear classifier (logistic regression) and two ensemble methods (random forest and CatBoost).
I picked CatBoost as the gradient boosting method for this analysis because 5 of the 19 features are categorical, and CatBoost offers both one-hot encoding for binary features and ordered target encoding for high-cardinality ones. Ordered target encoding essentially permutes the dataset and then target-encodes each sample using only the samples that come before it (source). This avoids the excessive sparsity that one-hot encoding introduces for high-cardinality features, which would hamper the performance of tree-based models.
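To make ordered target encoding concrete, here is a minimal sketch of the idea (my own illustration, not CatBoost's internal implementation; the smoothing prior of 0.5 and the single permutation are assumptions):
# Illustrative sketch of ordered target encoding: each sample is encoded with
# the target statistics of the samples that precede it in a random permutation,
# so a row never "sees" its own label (avoiding target leakage)
def ordered_target_encode(cat_series, target, prior=0.5, seed=1):
    rng = np.random.default_rng(seed)
    encoded = np.empty(len(cat_series))
    sums, counts = {}, {}
    for i in rng.permutation(len(cat_series)):
        c = cat_series.iloc[i]
        # Encode with (sum of earlier targets + prior) / (earlier count + 1)
        encoded[i] = (sums.get(c, 0.0) + prior) / (counts.get(c, 0) + 1)
        sums[c] = sums.get(c, 0.0) + target.iloc[i]
        counts[c] = counts.get(c, 0) + 1
    return encoded

# e.g. encoding Education_Level against a 0/1 churn indicator:
# ordered_target_encode(credit_card['Education_Level'],
#                       (credit_card.Attrition_Flag == 'Attrited Customer').astype(int))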
First of all, one-third of the dataset will be split off as the test set for model validation, with both the train and test sets stratified on the target variable (Attrition_Flag) because of the class imbalance mentioned before. Let's also recode the target variable to 1 or 0, with 1 meaning a customer has churned and 0 otherwise.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
# Recoding the target variable
credit_card['Has_Churned'] = np.where(
credit_card.Attrition_Flag == 'Attrited Customer', 1, 0)
assert credit_card['Has_Churned'].dtype == 'int'
# Splitting the data set into train and test sets
X = credit_card.drop(columns=['Attrition_Flag', 'Has_Churned'])
y = credit_card['Has_Churned']
X_train, X_test, y_train, y_test = train_test_split(
X, y, stratify=y, test_size=0.33, random_state=1)
# Preprocessing steps of the data
df_preprocess = ColumnTransformer([('standardise', StandardScaler(), num_columns),
('ohe', OneHotEncoder(), cat_columns)], remainder='passthrough')
I will first test a few classification models with their default parameters to obtain baseline performance and decide how to proceed. Note that the class weights are adjusted in the model parameters (class_weight in logistic regression and random forest, scale_pos_weight in CatBoost) to the ratio of the size of the majority class (existing customers) to that of the minority class (churned customers).
As for the metrics used to judge model performance, apart from log loss, which is the loss function minimised when training the classifiers, I will also use the F1 score: the harmonic mean of precision (the proportion of customers predicted as churning who actually churn) and recall (the proportion of actually churning customers that the model correctly predicts as churning). By definition, the cost of wrongly predicting a churning customer as not churning (a false negative) is larger in this context than that of predicting a non-churning customer as churning (a false positive). Moreover, under class imbalance the F1 score is more sensitive to false positives than the ROC AUC score, thereby reflecting the model's actual performance on positive cases more realistically (source).
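To make these metric definitions concrete, a quick worked example with made-up confusion-matrix counts (illustration only, not results from this analysis):
# Hypothetical counts: 80 true positives, 20 false positives, 10 false negatives
tp, fp, fn = 80, 20, 10
precision = tp / (tp + fp)  # 0.8: share of predicted churners who actually churn
recall = tp / (tp + fn)     # ~0.889: share of actual churners the model catches
f1 = 2 * precision * recall / (precision + recall)  # harmonic mean, ~0.842
print(round(precision, 3), round(recall, 3), round(f1, 3))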
from sklearn.metrics import f1_score, log_loss
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier, cv, Pool
# Building model evaluation function
def evaluate_model(model, cat_columns=None, X_test=X_test, y_test=y_test):
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)
if isinstance(model, CatBoostClassifier):
pool = Pool(data=X_train, label=y_train,
cat_features=cat_columns)
catboost_cv = cv(pool, params=model.get_params(),
nfold=5, logging_level='Silent', early_stopping_rounds=int(model.tree_count_ * 0.1))
cv_f1_score = np.max(catboost_cv['test-F1-mean'])
cv_log_loss = np.min(catboost_cv['test-Logloss-mean'])
else:
cv_f1_score = np.mean(cross_val_score(
model, X_train, y_train, cv=5, scoring='f1', verbose=1))
cv_log_loss = - np.mean(cross_val_score(
model, X_train, y_train, cv=5, scoring='neg_log_loss', verbose=1))
test_f1_score = f1_score(y_test, y_pred)
test_log_loss = log_loss(y_test, y_pred_prob)
return {'5-fold CV F1 score': cv_f1_score,
'5-fold CV log loss': cv_log_loss,
'Test F1 score': test_f1_score,
'Test log loss': test_log_loss}
After running the models, the CatBoost model performs significantly better than the rest on both F1 score and log loss, in 5-fold cross-validation as well as on the test set. Even without feature selection and hyperparameter tuning, the CatBoost model already achieves an F1 score of 0.9078 and a log loss of 0.0908 on the test data. I will now move on with the CatBoost model to see if I can further improve its performance.
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
# Building the logistic regression model
logreg = Pipeline([('preprocess', df_preprocess),
('logreg_clf', LogisticRegressionCV(random_state=1, class_weight='balanced'))])
logreg.fit(X_train, y_train)
print('Performance on metrics for the logistic regression model:')
display(evaluate_model(logreg))
# Building the random forest model
print('\nPerformance on metrics for the random forest model:')
rfc = Pipeline([('preprocess', df_preprocess),
('rf_clf', RandomForestClassifier(random_state=1, oob_score=True, class_weight='balanced'))])
rfc.fit(X_train, y_train)
display(evaluate_model(rfc))
# Building the catboost model
cat_column_indices = [idx for idx, col in enumerate(
X.columns) if col in cat_columns]
class_weight = len(y[y == 0]) / len(y[y == 1])
catboost = CatBoostClassifier(cat_features=cat_column_indices,
verbose=0,
random_state=1,
eval_metric='F1',
objective='Logloss',
scale_pos_weight=class_weight)
catboost.fit(X_train,
y_train,
eval_set=(X_test, y_test),
use_best_model=True,
early_stopping_rounds=100) # which is 10% of the default n_estimators of 1000 in CatBoost
print('\nPerformance on metrics for the catboost model:')
evaluate_model(catboost, cat_column_indices)
Performance on metrics for the logistic regression model:
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 5.4s finished [Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 4.1s finished
{'5-fold CV F1 score': 0.6432873253732959, '5-fold CV log loss': 0.3533421626561224, 'Test F1 score': 0.6324549237170596, 'Test log loss': 0.36175294960102994}
Performance on metrics for the random forest model:
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 4.8s finished [Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 4.7s finished
{'5-fold CV F1 score': 0.7959413443027146, '5-fold CV log loss': 0.15078832367837472, 'Test F1 score': 0.805439330543933, 'Test log loss': 0.14307530100487836}
Performance on metrics for the catboost model:
{'5-fold CV F1 score': 0.9550446589783407, '5-fold CV log loss': 0.1372564268050963, 'Test F1 score': 0.9078014184397163, 'Test log loss': 0.09082103310583552}
The next step is to eliminate insignificant features to make the model more parsimonious, but how many should be removed? This can be decided by comparing the log loss of the model with a given number of features removed against that of the model trained with all features. The catboost module provides the convenient select_features method for exactly this. Let's see how the log loss of the model would change if I removed up to 9 features in the dataset.
catboost_feature_select = catboost.select_features(X_train, y_train, eval_set=(X_test, y_test), features_for_select=list(
X_train.columns), num_features_to_select=10, logging_level='Silent', train_final_model=False) # set plot=True to generate the plot below
rfe_plot = plt.imread('RFE.png')
plt.imshow(rfe_plot)
plt.axis('off')
plt.tight_layout()
plt.show()
Looking at the above plot, it seems I can remove up to 5 features without the model incurring a higher log loss than when it is trained with all available features. Thus, I will create the parsimonious model with the remaining 14 features as input.
Consistent with the line plot above tracking the change in log loss as features are removed, the simplified CatBoost model performs very similarly to the full model on all evaluation metrics: the test F1 score drops by only 0.0005, while the test log loss increases by only 0.0002.
# Extracting the five features to be removed
remove_features = {idx: feature for idx, feature in enumerate(
X_train.columns) if idx in catboost_feature_select['eliminated_features'][:5]}
# Dropping the removed features
X_train.drop(columns=remove_features.values(), inplace=True)
X_test.drop(columns=remove_features.values(), inplace=True)
assert X_train.shape[1] == 14 and X_test.shape[1] == 14
# Specifying indices of cat features
cat_simple_indices = [idx for idx, feature in enumerate(
X_train.columns) if X_train[feature].dtype == 'object']
# Fitting the parsimonious model
catboost_simple = CatBoostClassifier(cat_features=cat_simple_indices,
verbose=0,
random_state=1,
eval_metric='F1',
objective='Logloss',
scale_pos_weight=class_weight)
catboost_simple.fit(X_train,
y_train,
eval_set=(X_test, y_test),
use_best_model=True,
early_stopping_rounds=100)
print("The performance on the metrics of the parsimonious catboost model is:")
display(evaluate_model(catboost_simple, cat_simple_indices))
print(
f"\nThe eliminated features in the parsimonious model are: {list(remove_features.values())}.")
The performance on the metrics of the parsimonious catboost model is:
{'5-fold CV F1 score': 0.9468290199129157, '5-fold CV log loss': 0.1619717375362939, 'Test F1 score': 0.9073426573426574, 'Test log loss': 0.09101184018984058}
The eliminated features in the parsimonious model are: ['Dependent_count', 'Education_Level', 'Income_Category', 'Card_Category', 'Months_on_book'].
The next step I can take to improve the performance of the CatBoost model is hyperparameter tuning. I will use the hyperopt library, which applies Bayesian optimisation to search for optimal hyperparameter values of machine learning models.
from hyperopt import hp, tpe, fmin, Trials
catboost_space = {
'n_estimators': hp.quniform('n_estimators', 100, 2000, 50),
'max_depth': hp.quniform('max_depth', 2, 10, 1),
'learning_rate': hp.quniform('learning_rate', 0.01, 0.3, 0.01),
'colsample_bylevel': hp.quniform('colsample_bylevel', 0.5, 0.9, 0.01),
'subsample': hp.quniform('subsample', 0.5, 0.9, 0.01),
'reg_lambda': hp.quniform('reg_lambda', 1, 200, 1),
'random_strength': hp.quniform('random_strength', 0, 10, 0.1)
}
def catboost_objective(params):
catboost_params = {
'n_estimators': int(params['n_estimators']),
'max_depth': int(params['max_depth']),
'learning_rate': params['learning_rate'],
'colsample_bylevel': params['colsample_bylevel'],
'subsample': params['subsample'],
'reg_lambda': params['reg_lambda'],
'random_strength': params['random_strength']
}
pool = Pool(data=X_train,
label=y_train,
cat_features=cat_simple_indices)
catboost = CatBoostClassifier(cat_features=cat_simple_indices,
verbose=0,
random_state=1,
eval_metric='F1',
objective='Logloss',
scale_pos_weight=class_weight)
catboost_cv = cv(pool,
params=catboost.set_params(
**catboost_params).get_params(),
nfold=5,
logging_level='Silent',
early_stopping_rounds=int(params['n_estimators'] * 0.1))  # 10% of the total number of trees in a model
loss = np.min(catboost_cv['test-Logloss-mean'])
return loss
trials = Trials()
catboost_best_params = fmin(
catboost_objective, catboost_space, algo=tpe.suggest, max_evals=500,
trials=trials, rstate=np.random.default_rng(1))  # np.random.seed(1) returns None, not a usable rstate
print(catboost_best_params)
100%|██████████| 500/500 [13:55:58<00:00, 100.32s/trial, best loss: 0.11053249502939395] {'colsample_bylevel': 0.61, 'learning_rate': 0.15, 'max_depth': 4.0, 'n_estimators': 1950.0, 'random_strength': 1.4000000000000001, 'reg_lambda': 5.0, 'subsample': 0.72}
With some hyperparameter tuning, the CatBoost model improves its performance further, achieving both a lower log loss and a higher F1 score on the test set: the log loss decreased by 0.007 while the F1 score increased by 0.004 compared with the untuned parsimonious model. We are now ready to look deeper into the model's performance and identify the features closely related to customer churning.
# hyperopt returns floats for quniform-sampled values, so cast the integer
# hyperparameters back to int before passing them on to CatBoost
catboost_best_params['n_estimators'] = int(catboost_best_params['n_estimators'])
catboost_best_params['max_depth'] = int(catboost_best_params['max_depth'])
catboost_best_stopping_rounds = int(catboost_best_params['n_estimators'] * 0.1)
catboost_tuned = CatBoostClassifier(cat_features=cat_simple_indices,
verbose=0,
random_state=1,
eval_metric='F1',
objective='Logloss',
scale_pos_weight=class_weight, **catboost_best_params)
catboost_tuned.fit(X_train,
y_train,
eval_set=(X_test, y_test),
use_best_model=True,
early_stopping_rounds=catboost_best_stopping_rounds)
print('The performance of the tuned CatBoost model is: ')
display(evaluate_model(catboost_tuned, cat_simple_indices))
# Saving the tuned model
catboost_pool = Pool(
X_train, y_train, cat_features=cat_simple_indices)
catboost_tuned.save_model('credit_card_catboost_best', pool=catboost_pool)
The performance of the tuned CatBoost model is:
{'5-fold CV F1 score': 0.9555680735587438, '5-fold CV log loss': 0.13063632990296833, 'Test F1 score': 0.9110132158590308, 'Test log loss': 0.0840395210369782}
With the primary business goal of finding potentially churning customers so that the bank can reach out in advance for retention, it is necessary to look deeper into the recall of the model, the most relevant metric in this context. The classification report below shows that the model performs really well at identifying churning customers: the recall is about 0.963, only about 0.04 below the maximum possible value of 1.
Even though the precision of the model is only about 0.865, a model that generates more false positives (i.e. predicts that existing customers will churn) is likely more desirable in this context than one that is worse at correctly flagging customers who are about to churn, because failing to identify churning customers means the bank loses sources of revenue.
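If recall ever needed a further boost at the expense of precision, one simple option (a sketch of the idea, not a step taken in this analysis) is to lower the decision threshold applied to the predicted churn probabilities instead of using the default of 0.5:
# Sketch: trade precision for recall by lowering the decision threshold
from sklearn.metrics import precision_score, recall_score
churn_prob = catboost_tuned.predict_proba(X_test)[:, 1]
for threshold in [0.5, 0.4, 0.3]:
    y_pred_t = (churn_prob >= threshold).astype(int)
    print(threshold,
          round(precision_score(y_test, y_pred_t), 3),
          round(recall_score(y_test, y_pred_t), 3))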
from sklearn.metrics import classification_report
y_pred = catboost_tuned.predict(X_test)
catboost_clf_report = pd.DataFrame(classification_report(
y_test, y_pred, output_dict=True)).transpose()
display(catboost_clf_report)
 | precision | recall | f1-score | support
---|---|---|---|---
0 | 0.992711 | 0.971123 | 0.981799 | 2805.000000 |
1 | 0.864548 | 0.962756 | 0.911013 | 537.000000 |
accuracy | 0.969779 | 0.969779 | 0.969779 | 0.969779 |
macro avg | 0.928630 | 0.966940 | 0.946406 | 3342.000000 |
weighted avg | 0.972118 | 0.969779 | 0.970425 | 3342.000000 |
But how should the bank reach out to customers? In other words, around which customer features should the bank design its retention strategies? We can identify the features most crucial for predicting customer churn using SHAP values, which gauge the impact on the model's prediction of a feature taking a certain value compared with the feature being at its baseline value.
In terms of the magnitude of each feature's effect on the model's predictions, the bar plot below of the mean absolute SHAP value of each feature indicates that Total_Trans_Ct and Total_Trans_Amt are the two most important features in predicting whether a customer will churn, with Total_Revolving_Bal ranking third. But what about the direction of each feature's impact on the model?
import shap
catboost_explainer = shap.Explainer(catboost_tuned)
shap_values = catboost_explainer(X_train)
shap.plots.bar(shap_values)
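As a quick sanity check on the additivity property that SHAP values are built on, the base value plus the per-sample feature contributions should reconstruct the model's raw (log-odds) output (a sketch; the tolerance is an arbitrary choice):
# SHAP additivity: base value + sum of per-feature contributions should equal
# the model's raw log-odds prediction for every sample in X_train
raw_pred = catboost_tuned.predict(X_train, prediction_type='RawFormulaVal')
reconstructed = shap_values.base_values + shap_values.values.sum(axis=1)
assert np.allclose(raw_pred, reconstructed, atol=1e-4)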
We can look deeper into how each feature's values are associated with the model's output using the shap module's beeswarm plot. Blue represents low feature values, whereas red represents high feature values; the model output here is the predicted likelihood of a customer churning. From the plot below, several interesting observations can be drawn.
shap.plots.beeswarm(shap_values)
Based on the above SHAP value plots, I suggest that the bank design its customer retention strategies with the following points in mind: