Here I analyse a dataset on 1987 travel insurance customers (source) with information such as annual income, age, whether they have chronic diseases, whether they are frequent flyers, and so on. The analysis covers data manipulation and visualisation, supervised machine learning for classification, and model evaluation, all in Python.
I will use pandas, scipy and numpy for data manipulation and exploration, matplotlib and seaborn for data visualisation, scikit-learn, sklearn_pandas and xgboost for the classification models, and hyperopt for model tuning.
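If these packages are not already installed, they can be added from PyPI (a minimal sketch; versions are unpinned, so minor API differences are possible across releases):
# Install the required packages from within the notebook (run once; unpinned versions)
%pip install pandas scipy numpy matplotlib seaborn scikit-learn sklearn_pandas xgboost hyperopt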
# Load packages
from warnings import simplefilter
import pandas as pd
import numpy as np
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = [20, 8]
sns.set_theme(context="notebook", style="whitegrid")
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
The dataset contains no missing values, and all features seem to fall within reasonable ranges.
# Load data from the csv file
insurance = pd.read_csv('data.csv', index_col=0)
display(insurance.head())
insurance.info()
 | Age | EmploymentType | GraduateOrNot | AnnualIncome | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | TravelInsurance |
---|---|---|---|---|---|---|---|---|---
0 | 31 | Government Sector | Yes | 400000 | 6 | 1 | No | No | 0 |
1 | 31 | Private Sector/Self Employed | Yes | 1250000 | 7 | 0 | No | No | 0 |
2 | 34 | Private Sector/Self Employed | Yes | 500000 | 4 | 1 | No | No | 1 |
3 | 28 | Private Sector/Self Employed | Yes | 700000 | 3 | 1 | No | No | 0 |
4 | 28 | Private Sector/Self Employed | Yes | 700000 | 8 | 1 | Yes | No | 0 |
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1987 entries, 0 to 1986
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Age                  1987 non-null   int64
 1   EmploymentType       1987 non-null   object
 2   GraduateOrNot        1987 non-null   object
 3   AnnualIncome         1987 non-null   int64
 4   FamilyMembers        1987 non-null   int64
 5   ChronicDiseases      1987 non-null   int64
 6   FrequentFlyer        1987 non-null   object
 7   EverTravelledAbroad  1987 non-null   object
 8   TravelInsurance      1987 non-null   int64
dtypes: int64(5), object(4)
memory usage: 155.2+ KB
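As a quick sanity check on the missing-data claim above (a minimal sketch; info() already reports 1987 non-null entries for every column):
# Confirm there are no missing values anywhere in the dataset
assert insurance.isna().sum().sum() == 0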
I will recode the GraduateOrNot, FrequentFlyer and EverTravelledAbroad columns with 1 being Yes and 0 otherwise.
# Recoding the Yes/No columns to 1/0
recode_cols = ["GraduateOrNot", "FrequentFlyer", "EverTravelledAbroad"]
for col in recode_cols:
    insurance[col] = np.where(insurance[col] == "Yes", 1, 0)
insurance.head()
 | Age | EmploymentType | GraduateOrNot | AnnualIncome | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | TravelInsurance |
---|---|---|---|---|---|---|---|---|---
0 | 31 | Government Sector | 1 | 400000 | 6 | 1 | 0 | 0 | 0 |
1 | 31 | Private Sector/Self Employed | 1 | 1250000 | 7 | 0 | 0 | 0 | 0 |
2 | 34 | Private Sector/Self Employed | 1 | 500000 | 4 | 1 | 0 | 0 | 1 |
3 | 28 | Private Sector/Self Employed | 1 | 700000 | 3 | 1 | 0 | 0 | 0 |
4 | 28 | Private Sector/Self Employed | 1 | 700000 | 8 | 1 | 1 | 0 | 0 |
Below are summary statistics of the customer base. Most customers have college degrees, work in the private sector or are self-employed, and are relatively young, as the maximum age in the dataset is only 35. Note that the findings of this analysis may not generalise well beyond this age range, and data covering a wider range of ages should be collected in the future.
Only about a quarter of the customers have chronic diseases, frequent flyers and customers who have travelled abroad each make up roughly a fifth of the customer base, and about a third have bought travel insurance.
# Summary statistics
print("Summary statistics on all features but EmploymentType:")
display(insurance.describe())
print("\nThe number of customers in each employment type:")
insurance.EmploymentType.value_counts()
Summary statistics on all features but EmploymentType:
 | Age | GraduateOrNot | AnnualIncome | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | TravelInsurance |
---|---|---|---|---|---|---|---|---
count | 1987.000000 | 1987.000000 | 1.987000e+03 | 1987.000000 | 1987.000000 | 1987.000000 | 1987.000000 | 1987.000000 |
mean | 29.650226 | 0.851535 | 9.327630e+05 | 4.752894 | 0.277806 | 0.209864 | 0.191243 | 0.357323 |
std | 2.913308 | 0.355650 | 3.768557e+05 | 1.609650 | 0.448030 | 0.407314 | 0.393379 | 0.479332 |
min | 25.000000 | 0.000000 | 3.000000e+05 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 28.000000 | 1.000000 | 6.000000e+05 | 4.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 29.000000 | 1.000000 | 9.000000e+05 | 5.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 32.000000 | 1.000000 | 1.250000e+06 | 6.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 |
max | 35.000000 | 1.000000 | 1.800000e+06 | 9.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
The number of customers in each employment type:
Private Sector/Self Employed    1417
Government Sector                570
Name: EmploymentType, dtype: int64
Below are the overall distributions of each numerical feature in the dataset, drawn with violin plots. The age of the customers peaks at about 28 years old, annual incomes mostly lie between 0.5 and 1.5 million USD, and most customers have 3 to 5 family members.
# Overall distributions of numerical features
num_cols = ["Age", "AnnualIncome", "FamilyMembers"]
fig, axes = plt.subplots(1, 3)
for col, ax in zip(num_cols, axes.ravel()):
    sns.violinplot(y=col, data=insurance, ax=ax)
    ax.set(xlabel="", ylabel="", title=col)
    sns.despine()
axes[1].set(title="Annual Income (in million dollars)")
fig.suptitle("Overall distribution of numerical features")
plt.show()
In terms of numerical features, significant differences do exist between customers who bought travel insurance and those who did not, according to the Kruskal–Wallis test, which gauges whether the median of a given feature differs between two (or more) groups.
# Testing whether the medians of the two groups differ on the numerical features
from scipy.stats import kruskal
insurance_yes = insurance.query("TravelInsurance == 1")
insurance_no = insurance.query("TravelInsurance == 0")
kruskal_results = {}
for col in num_cols:
    stat, pvalue = kruskal(
        insurance_yes[col], insurance_no[col], nan_policy="omit")
    kruskal_results.update({col: {"kruskal_stats": stat, "p_value": pvalue}})
kruskal_df = pd.DataFrame(kruskal_results).transpose()
kruskal_df["significance_95%"] = kruskal_df["p_value"] <= 0.05
kruskal_df.sort_values("kruskal_stats", ascending=False)
 | kruskal_stats | p_value | significance_95% |
---|---|---|---
AnnualIncome | 313.945661 | 3.017909e-70 | True |
FamilyMembers | 11.172750 | 8.300750e-04 | True |
Age | 4.671026 | 3.067559e-02 | True |
From the violin plots below, we can see that customers who did not buy travel insurance tend to be younger, whereas those who bought it have, on average, higher annual incomes and more family members.
# Visualising the difference on numerical features
fig, axes = plt.subplots(1, 3)
for col, ax in zip(num_cols, axes.ravel()):
    sns.violinplot(x="TravelInsurance", y=col, data=insurance, ax=ax)
    ax.set(xlabel="", ylabel="", title=col,
           xticks=[0, 1], xticklabels=["No", "Yes"])
    sns.despine()
axes[1].set(title="Annual Income (in million dollars)")
fig.suptitle(
"Distribution of numerical features by purchase of travel insurance")
fig.text(0.5, 0.04, "Travel Insurance")
plt.show()
As for the categorical features, we can use the chi-squared test to see whether a customer's purchase of travel insurance and a given feature are independent of each other. According to the results with $\alpha$ = 0.05, only the differences in EverTravelledAbroad, FrequentFlyer and EmploymentType are statistically significant, meaning customers who bought travel insurance differ significantly from those who did not on these features.
# Chi-squared test for categorical features
from scipy.stats import chi2_contingency
chi2_results = {}
cat_cols = [
    col for col in insurance.columns if col not in num_cols and col != "TravelInsurance"]
for col in cat_cols:
    cross_table = pd.crosstab(insurance.TravelInsurance, insurance[col])
    stat, pvalue = chi2_contingency(cross_table)[:2]
    chi2_results.update({col: {"chi2_stats": stat, "p_value": pvalue}})
chi2_df = pd.DataFrame(chi2_results).transpose()
chi2_df["significance_95%"] = chi2_df["p_value"] <= 0.05
chi2_df.sort_values("chi2_stats", ascending=False)
 | chi2_stats | p_value | significance_95% |
---|---|---|---
EverTravelledAbroad | 370.559928 | 1.413451e-82 | True |
FrequentFlyer | 105.857231 | 7.924360e-25 | True |
EmploymentType | 42.753803 | 6.208107e-11 | True |
GraduateOrNot | 0.605510 | 4.364834e-01 | False |
ChronicDiseases | 0.575411 | 4.481165e-01 | False |
Following up on these results with count plots, it seems that a customer is more likely to buy travel insurance if they work in the private sector, fly frequently and/or have previously travelled abroad.
# Visualising differences in the significant categorical features by TravelInsurance
significant_cat_cols = chi2_df.query("`significance_95%` == True").index.to_list()
fig, axes = plt.subplots(1, 3)
for col, ax in zip(significant_cat_cols, axes.ravel()):
    sns.countplot(x=col, hue="TravelInsurance", data=insurance, ax=ax)
    ax.set(xlabel="", ylabel="", title=col)
    sns.despine()
    handles, labels = ax.get_legend_handles_labels()
    ax.get_legend().remove()  # use a single figure-level legend instead
axes[0].set(xticks=[0, 1], xticklabels=["Government", "Private Sector"])
axes[1].set(xticklabels=["No", "Yes"])
axes[2].set(xticklabels=["No", "Yes"])
fig.suptitle("Purchase status of travel insurance by categorical features")
fig.text(0.08, 0.5, "count", rotation=90)
fig.legend(handles, ["No", "Yes"], loc='upper right', title="Travel Insurance")
plt.show()
# Splitting the data into train and test sets
from sklearn.model_selection import train_test_split
X = insurance.drop(columns="TravelInsurance")
y = insurance["TravelInsurance"]
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=.33,
stratify=y,
random_state=1)
Preprocessing needed for this dataset includes standardising the numeric features and one-hot encoding the EmploymentType feature.
# Building preprocessing steps
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, OneHotEncoder
transform_list = [([col], StandardScaler())
for col in num_cols] + [(["EmploymentType"], OneHotEncoder())]
df_preprocess = DataFrameMapper(
transform_list, default=None, input_df=True, df_out=True)
I will first fit a few models with their default hyperparameters and then compare their accuracy and ROC AUC scores to see which one performs best at predicting whether a customer purchased travel insurance from the available features. The log loss is also computed for later model comparisons.
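For reference, the log loss of a binary classifier is $-\frac{1}{N}\sum_{i=1}^{N}\left[y_i \log \hat{p}_i + (1 - y_i)\log(1 - \hat{p}_i)\right]$, where $\hat{p}_i$ is the predicted probability that customer $i$ bought travel insurance; lower values indicate better-calibrated probability estimates.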
# Building the pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
# Defining a helper function to build and fit a pipeline
def fit_pipeline(name, Classifier, X_train=X_train, y_train=y_train, *args, **kwargs):
    pipeline = Pipeline([("preprocess", df_preprocess),
                         (name, Classifier(*args,
                                           **kwargs,
                                           random_state=123))])
    fitted_pipeline = pipeline.fit(X_train, y_train)
    return fitted_pipeline
# Fitting the Logistic Regression model
logit_pipeline = fit_pipeline("logit", LogisticRegression)
# Fitting the SVM classifier
svm_pipeline = fit_pipeline("svm_clf", SVC, probability=True)
# Fitting xgboost classifier model
xgb_pipeline = fit_pipeline("xgb_clf", xgb.XGBClassifier, verbosity=0,
use_label_encoder=False)
It seems that with the default hyperparameter settings, the SVM and XGBoost models perform better on the test data than the logit model. Let's move on with these two models for the rest of the analysis.
# Function for evaluating models
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.model_selection import cross_val_score, KFold
five_fold = KFold(n_splits=5, shuffle=True, random_state=123)
def evaluate_model(model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    cv_score = np.mean(cross_val_score(
        model, X_train, y_train, cv=five_fold, scoring='accuracy'))
    y_test_pred = model.predict(X_test)
    y_test_pred_proba = model.predict_proba(X_test)
    cv_accuracy = round(cv_score, 4)
    test_accuracy = round(accuracy_score(y_test, y_test_pred), 4)
    test_roc_auc = round(roc_auc_score(y_test, y_test_pred_proba[:, 1]), 4)
    test_log_loss = log_loss(y_test, y_test_pred_proba)
    return {"5-fold cross validation accuracy score": cv_accuracy,
            "test accuracy score": test_accuracy,
            "test ROC AUC score": test_roc_auc,
            "log loss": test_log_loss}
# Model Evaluation
print("Metrics of the logit model:")
display(evaluate_model(logit_pipeline))
print("\nMetrics of the SVM model:")
display(evaluate_model(svm_pipeline))
print("\nMetrics of the XGBoost model:")
evaluate_model(xgb_pipeline)
Metrics of the logit model:
{'5-fold cross validation accuracy score': 0.7648, 'test accuracy score': 0.7744, 'test ROC AUC score': 0.7676, 'log loss': 0.5176984926727463}
Metrics of the SVM model:
{'5-fold cross validation accuracy score': 0.8077, 'test accuracy score': 0.8354, 'test ROC AUC score': 0.8103, 'log loss': 0.4452038976484994}
Metrics of the XGBoost model:
{'5-fold cross validation accuracy score': 0.7821, 'test accuracy score': 0.7927, 'test ROC AUC score': 0.8083, 'log loss': 0.5148757110236613}
Let's now try to improve performance by creating new features. First, families whose members have more income per capita may be more likely to buy insurance, since they have more disposable income. Second, let's see whether separating the customers into 4 distinct income groups (low, middle-low, middle-high, high) according to the income quartiles brings more information to the model for better prediction.
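The bin edges hard-coded below are the income quartiles reported by describe() earlier; as a sketch, they could equivalently be derived programmatically:
# Hypothetical alternative: derive the bin edges from the income quartiles
income_bins = insurance["AnnualIncome"].quantile([0, .25, .5, .75]).to_list() + [np.inf]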
# Binning the income groups
bins = [3e5, 6e5, 9e5, 1.25e6, np.inf]
labels = ["low income", "middle low income",
"middle high income", "high income"]
# Creating the new features in the train and test sets
X_train_new = X_train.assign(AnnualIncomePerMember=X_train["AnnualIncome"] / X_train["FamilyMembers"],
IncomeGroup=pd.cut(X_train["AnnualIncome"],
bins=bins,
labels=labels,
include_lowest=True))
X_test_new = X_test.assign(AnnualIncomePerMember=X_test["AnnualIncome"] / X_test["FamilyMembers"],
IncomeGroup=pd.cut(X_test["AnnualIncome"],
bins=bins,
labels=labels,
include_lowest=True))
# New feature preprocessing steps
df_new_transform_list = transform_list + [(["AnnualIncomePerMember"], StandardScaler()),
(["IncomeGroup"], OneHotEncoder())]
df_new_preprocess = DataFrameMapper(
df_new_transform_list, default=None, input_df=True, df_out=True)
These two new features seem to improve both models' log losses. As for the other metrics, the SVM's test accuracy decreased marginally while its ROC AUC score rose, and the XGBoost model's ROC AUC score also improved.
# SVM
svm_new_feature_pipeline = Pipeline([("preprocess", df_new_preprocess),
("svm", SVC(probability=True, random_state=123))])
svm_new_feature_pipeline.fit(X_train_new, y_train)
print("SVM baseline:")
display(evaluate_model(svm_pipeline))
print("\nSVM performance with new features:")
display(evaluate_model(svm_new_feature_pipeline,
X_train=X_train_new, X_test=X_test_new))
# XGBoost
xgb_new_feature_pipeline = Pipeline([("preprocess", df_new_preprocess),
("xgb_clf", xgb.XGBClassifier(verbosity=0,
random_state=123,
use_label_encoder=False))])
xgb_new_feature_pipeline.fit(X_train_new, y_train)
print("\nXGBoost baseline:")
display(evaluate_model(xgb_pipeline))
print("\nXGBoost performance with new features:")
evaluate_model(xgb_new_feature_pipeline,
X_train=X_train_new, X_test=X_test_new)
SVM baseline:
{'5-fold cross validation accuracy score': 0.8077, 'test accuracy score': 0.8354, 'test ROC AUC score': 0.8103, 'log loss': 0.4452038976484994}
SVM performance with new features:
{'5-fold cross validation accuracy score': 0.8114, 'test accuracy score': 0.8323, 'test ROC AUC score': 0.8145, 'log loss': 0.43535750819133245}
XGBoost baseline:
{'5-fold cross validation accuracy score': 0.7821, 'test accuracy score': 0.7927, 'test ROC AUC score': 0.8083, 'log loss': 0.5148757110236613}
XGBoost performance with new features:
{'5-fold cross validation accuracy score': 0.7844, 'test accuracy score': 0.7927, 'test ROC AUC score': 0.8182, 'log loss': 0.5063971312975344}
By looking at the permutation importance, which gauges how much randomly shuffling a feature's values affects the model's accuracy, we can see that the XGBoost and SVM models place rather different emphases on which features contribute most to their respective fits. We can also start removing the features with the smallest permutation importance.
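Formally, the permutation importance of feature $j$ is $PI_j = s - \frac{1}{K}\sum_{k=1}^{K} s_{k,j}$, where $s$ is the model's baseline accuracy on the test set and $s_{k,j}$ is the accuracy after the $k$-th random shuffle of feature $j$ (with $K$ = 100 repeats below).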
# Defining a function to plot permutation importance
from sklearn.inspection import permutation_importance
def plot_permutation_importance(fitted_model, X_test=X_test_new, y_test=y_test, ax=None):
    # Take the absolute value because we only care about the extent to which
    # permuting a feature affects the model's accuracy
    permutation = np.abs(permutation_importance(fitted_model,
                                                X_test,
                                                y_test,
                                                scoring="accuracy",
                                                n_repeats=100,
                                                n_jobs=2,
                                                random_state=123)["importances_mean"])
    model_features = list(X_test.columns)
    model_feature_importance = pd.Series(
        permutation, index=model_features).sort_values(ascending=False)
    plot = sns.barplot(x=model_feature_importance,
                       y=model_feature_importance.index, color="tab:blue", ax=ax)
    return plot
# Plotting the permutation importance of the two models
fig, axes = plt.subplots(2, 1, sharex=True)
plot_permutation_importance(svm_new_feature_pipeline, ax=axes[0])
axes[0].set(title="SVM permutation importance")
plot_permutation_importance(xgb_new_feature_pipeline, ax=axes[1])
axes[1].set(title="XGBoost permutation importance")
sns.despine()
fig.text(0.3, 0.04, "Difference in accuracy between baseline and permuted models")
plt.show()
From the results below, even dropping the least important feature, ChronicDiseases, increased the SVM model's log loss, so I will keep all the features for this model.
# SVM
X_train_svm_drop = X_train_new.drop(
columns=["ChronicDiseases"])
X_test_svm_drop = X_test_new.drop(
columns=["ChronicDiseases"])
svm_dropped_pipeline = Pipeline([("preprocess", df_new_preprocess),
("svm", SVC(probability=True, random_state=123))])
svm_dropped_pipeline.fit(X_train_svm_drop, y_train)
print("SVM performance with new features:")
display(evaluate_model(svm_new_feature_pipeline,
X_train=X_train_new, X_test=X_test_new))
print("\nSVM after dropping features:")
evaluate_model(svm_dropped_pipeline, X_train=X_train_svm_drop,
X_test=X_test_svm_drop)
SVM performance with new features:
{'5-fold cross validation accuracy score': 0.8114, 'test accuracy score': 0.8323, 'test ROC AUC score': 0.8145, 'log loss': 0.43535750819133245}
SVM after dropping features:
{'5-fold cross validation accuracy score': 0.8129, 'test accuracy score': 0.8308, 'test ROC AUC score': 0.817, 'log loss': 0.435864466043114}
Meanwhile, removing ChronicDiseases, FrequentFlyer, EverTravelledAbroad and IncomeGroup yielded better accuracy and log loss for the XGBoost model.
# XGBoost
X_train_xgb_drop = X_train_new.drop(columns=[
"ChronicDiseases", "FrequentFlyer", "EverTravelledAbroad", "IncomeGroup"])
X_test_xgb_drop = X_test_new.drop(columns=[
"ChronicDiseases", "FrequentFlyer", "EverTravelledAbroad", "IncomeGroup"])
# Dropping the IncomeGroup transform (index 5) from the preprocessing list
xgb_transform_list = [t for i, t in enumerate(df_new_transform_list)
                      if i != 5]
xgb_drop_preprocess = DataFrameMapper(xgb_transform_list, default=None,
input_df=True, df_out=True)
xgb_dropped_pipeline = Pipeline([("preprocess", xgb_drop_preprocess),
("xgb", xgb.XGBClassifier(verbosity=0,
random_state=123,
use_label_encoder=False,
n_jobs=-1))])
xgb_dropped_pipeline.fit(X_train_xgb_drop, y_train)
print("XGBoost performance with new features:")
display(evaluate_model(xgb_new_feature_pipeline,
X_train=X_train_new, X_test=X_test_new))
print("\nXGBoost performance after dropping features:")
evaluate_model(xgb_dropped_pipeline, X_train=X_train_xgb_drop,
X_test=X_test_xgb_drop)
XGBoost performance with new features:
{'5-fold cross validation accuracy score': 0.7844, 'test accuracy score': 0.7927, 'test ROC AUC score': 0.8182, 'log loss': 0.5063971312975344}
XGBoost performance after dropping features:
{'5-fold cross validation accuracy score': 0.7844, 'test accuracy score': 0.8216, 'test ROC AUC score': 0.8178, 'log loss': 0.48032051994551894}
Let's now proceed to hyperparameter tuning with these more parsimonious models. I will use Bayesian optimisation to tune the hyperparameters of the SVM and XGBoost models with the hyperopt package. Let's start with the SVM model.
# Bayesian optimisation of the SVM model
from hyperopt import hp, tpe, fmin, Trials
svm_space = {
    "svm__C": hp.uniform("svm__C", 0.00001, 1000),
    "svm__gamma": hp.uniform("svm__gamma", 0.00001, 100)
}
def svm_objective(params):
    params = {
        "svm__C": params["svm__C"],
        "svm__gamma": params["svm__gamma"]
    }
    loss = -np.mean(cross_val_score(
        svm_dropped_pipeline.set_params(**params), X_train_svm_drop, y_train,
        cv=5, scoring="neg_log_loss"))
    return loss
svm_trials = Trials()
# Note: np.random.seed(123) returns None, so it does not actually seed fmin;
# pass a generator instead (np.random.RandomState(123) on older hyperopt versions)
svm_best_params = fmin(svm_objective, svm_space, algo=tpe.suggest,
                       rstate=np.random.default_rng(123), max_evals=500,
                       trials=svm_trials)
100%|██████████████████████████████████████████████| 500/500 [17:42<00:00, 2.13s/trial, best loss: 0.4738539663826099]
After tuning the gamma and C hyperparameters, the SVM model only performs marginally better than with the default hyperparameter settings.
# Performance of the tuned SVM model
svm_best_pipeline = Pipeline([("preprocess", df_new_preprocess),
("svm", SVC(probability=True,
random_state=123))]).set_params(**svm_best_params)
svm_best_pipeline.fit(X_train_svm_drop, y_train)
print("The best hyperparameters of the SVM model are:")
display(svm_best_params)
print("\nThe performance of the SVM model with default hyperparameters on the test set is: ")
display(evaluate_model(svm_dropped_pipeline,
X_train=X_train_svm_drop, X_test=X_test_svm_drop))
print("\nThe performance of the tuned SVM model on the test set is: ")
evaluate_model(svm_best_pipeline, X_train=X_train_svm_drop,
X_test=X_test_svm_drop)
The best hyperparameters of the SVM model are:
{'svm__C': 1.5427598485101885, 'svm__gamma': 0.23641584701059065}
The performance of the SVM model with default hyperparameters on the test set is:
{'5-fold cross validation accuracy score': 0.7678, 'test accuracy score': 0.8308, 'test ROC AUC score': 0.817, 'log loss': 0.435864466043114}
The performance of the tuned SVM model on the test set is:
{'5-fold cross validation accuracy score': 0.8152, 'test accuracy score': 0.8354, 'test ROC AUC score': 0.8125, 'log loss': 0.4352559894428388}
Now I will tune the hyperparameters of the XGBoost model with hyperopt as well.
# Tuning the XGBoost classifier with Bayesian optimisation
from hyperopt import hp, fmin, tpe, Trials
xgb_space = {
    "xgb__n_estimators": hp.quniform("xgb__n_estimators", 50, 500, 50),
    "xgb__max_depth": hp.quniform("xgb__max_depth", 2, 10, 1),
    "xgb__learning_rate": hp.quniform("xgb__learning_rate", 0.1, 0.3, 0.01),
    "xgb__reg_lambda": hp.quniform("xgb__reg_lambda", 0, 100, 1),
    "xgb__gamma": hp.quniform("xgb__gamma", 0, 10, 0.1),
    "xgb__colsample_bytree": hp.quniform("xgb__colsample_bytree", 0.5, 0.9, 0.01),
    "xgb__subsample": hp.quniform("xgb__subsample", 0.5, 0.9, 0.01),
    "xgb__min_child_weight": hp.quniform("xgb__min_child_weight", 1, 10, 1)
}
def xgb_objective(params):
    # hp.quniform returns floats, so cast the integer-valued hyperparameters
    params = {"xgb__n_estimators": int(params["xgb__n_estimators"]),
              "xgb__max_depth": int(params["xgb__max_depth"]),
              "xgb__learning_rate": params["xgb__learning_rate"],
              "xgb__reg_lambda": params["xgb__reg_lambda"],
              "xgb__gamma": params["xgb__gamma"],
              "xgb__colsample_bytree": params["xgb__colsample_bytree"],
              "xgb__subsample": params["xgb__subsample"],
              "xgb__min_child_weight": int(params["xgb__min_child_weight"])}
    loss = -np.mean(cross_val_score(
        xgb_dropped_pipeline.set_params(**params), X_train_xgb_drop, y_train,
        cv=5, scoring="neg_log_loss"))
    return loss
xgb_trials = Trials()
# As above, seed via a generator rather than np.random.seed (which returns None)
best_xgb_params = fmin(xgb_objective, xgb_space, algo=tpe.suggest,
                       max_evals=500, rstate=np.random.default_rng(123),
                       trials=xgb_trials)
100%|██████████████████████████████████████████████| 500/500 [16:23<00:00, 1.97s/trial, best loss: 0.4557397373304942]
After tuning its hyperparameters, the XGBoost model now outperforms the SVM on every evaluation metric. Specifically, it reaches a test accuracy of 0.8415 and an ROC AUC score of 0.8271, and its log loss is also lower than that of the SVM model. Let's look deeper into the XGBoost model in the next section.
# Evaluating the best xgboost model
best_xgb_params["xgb__max_depth"] = int(best_xgb_params["xgb__max_depth"])
best_xgb_params['xgb__n_estimators'] = int(
best_xgb_params['xgb__n_estimators'])
xgb_best_pipeline = Pipeline([("preprocess", xgb_drop_preprocess),
("xgb", xgb.XGBClassifier(random_state=123, use_label_encoder=False, verbosity=0))]).set_params(**best_xgb_params)
xgb_best_pipeline.fit(X_train_xgb_drop, y_train)
print("The optimal hyperparameters of the XGBoost model are: ")
display(best_xgb_params)
print("\nThe performance of the XGBoost model with default hyperparameters on the test data is: ")
display(evaluate_model(xgb_dropped_pipeline, X_train=X_train_xgb_drop,
X_test=X_test_xgb_drop))
print("\nThe performance of the tuned XGBoost model on the test data is: ")
evaluate_model(xgb_best_pipeline, X_train=X_train_xgb_drop,
X_test=X_test_xgb_drop)
The optimal hyperparameters of the XGBoost model are:
{'xgb__colsample_bytree': 0.87, 'xgb__gamma': 6.300000000000001, 'xgb__learning_rate': 0.21, 'xgb__max_depth': 3, 'xgb__min_child_weight': 1.0, 'xgb__n_estimators': 350, 'xgb__reg_lambda': 4.0, 'xgb__subsample': 0.85}
The performance of the XGBoost model with default hyperparameters on the test data is:
{'5-fold cross validation accuracy score': 0.8234, 'test accuracy score': 0.8216, 'test ROC AUC score': 0.8178, 'log loss': 0.48032051994551894}
The performance of the tuned XGBoost model on the test data is:
{'5-fold cross validation accuracy score': 0.8234, 'test accuracy score': 0.8415, 'test ROC AUC score': 0.8271, 'log loss': 0.4265397166515269}
Starting with the classification report, the recall per label shows that the model is much better at correctly predicting customers who did not buy insurance than those who did. Consistent with this observation, the confusion matrix also shows that the model produces quite a lot of false negatives (i.e. predicting that customers did not buy insurance when in reality they did).
# Printing the classification report
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
y_pred = xgb_best_pipeline.predict(X_test_xgb_drop)
clf_report = pd.DataFrame(classification_report(
y_test, y_pred, target_names=["No", "Yes"], output_dict=True)).T
print("""Classification report of the XGBoost model:""")
display(clf_report)
# Confusion matrix
xgb_cm = confusion_matrix(y_test, y_pred)
cm_disp = ConfusionMatrixDisplay(xgb_cm, display_labels=["No", "Yes"])
cm_disp.plot(cmap="Blues", colorbar=False)
plt.gca().set(title="Confusion matrix of the XGBoost model")
plt.gca().grid(False)
plt.show()
Classification report of the XGBoost model:
 | precision | recall | f1-score | support |
---|---|---|---|---
No | 0.819277 | 0.966825 | 0.886957 | 422.000000 |
Yes | 0.911392 | 0.615385 | 0.734694 | 234.000000 |
accuracy | 0.841463 | 0.841463 | 0.841463 | 0.841463 |
macro avg | 0.865335 | 0.791105 | 0.810825 | 656.000000 |
weighted avg | 0.852135 | 0.841463 | 0.832643 | 656.000000 |
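To quantify the false negatives discussed above, we can unpack the four cells of the confusion matrix directly (a minimal sketch; for binary labels, confusion_matrix returns the cells in the order TN, FP, FN, TP when flattened):
# Break the confusion matrix into its four cells
tn, fp, fn, tp = xgb_cm.ravel()
print(f"False negatives: {fn} of {fn + tp} actual buyers "
      f"({fn / (fn + tp):.1%}) were predicted as non-buyers.")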
Finally, it is worth seeing which features now contribute most to the model's fit. We can judge by the gain, which measures the average improvement a feature brings to the splits in which it is used. From the plot below, AnnualIncome, FamilyMembers and Age are among the more important features, so future promotion campaigns should develop strategies based on these characteristics.
# Feature gains of the XGBoost model
xgb.plot_importance(xgb_best_pipeline.named_steps["xgb"], importance_type="gain",
height=0.5, title="Gain of features in XGBoost model", ylabel="", show_values=False)
plt.show()
Even though the XGBoost model achieved a rather high accuracy of 84.15%, caution is warranted when generalising. The ages of the customers are concentrated between 25 and 35, meaning the age groups in this dataset are far from representative of the whole population (especially older customers). Nevertheless, this analysis can still help inform which types of customers aged between 25 and 35 should be targeted in travel insurance promotion campaigns.
Moreover, the XGBoost model's high number of false negatives might be a signal that more customer information should be collected to better identify those who would be likely to buy travel insurance, such as whether they frequently travel alone or whether they have previously had accidents while travelling overseas.