import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
# Suppress every warning category for the rest of the session so that cell
# output stays uncluttered.
# NOTE(review): this also hides deprecation notices emitted by the plotting
# and modelling libraries used below - consider narrowing the filter.
warnings.filterwarnings('ignore')


# Read the clinical records CSV into a DataFrame.
data_df = pd.read_csv(filepath_or_buffer='heart_failure_clinical_records_dataset.csv')


# Preview the first five rows.
data_df.head(n=5)


# Summarise column dtypes, non-null counts and memory usage.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


# Count the missing entries in every column.
data_df.isna().sum()

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64


# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data_df.describe()


# Bar chart of how many records fall into each DEATH_EVENT class.
plt.figure(figsize=(9, 7))
target_ax = sns.countplot(x=data_df['DEATH_EVENT'])
target_ax.set_title('Distribution of target variable', fontsize=15)
plt.show()


# Annotated heatmap of the pairwise correlations between all columns.
corr = data_df.corr()
plt.figure(figsize=(14, 10))
sns.heatmap(data=corr, cmap='YlGnBu', annot=True)
plt.show()


# Age distribution based on death event
# Box plot of patient age for each DEATH_EVENT class.
plt.figure(figsize=(12, 9))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the positional form raises a TypeError on current releases
# (and only emitted a FutureWarning, silenced above, on older ones).
sns.boxplot(x=data_df['DEATH_EVENT'], y=data_df['age'])
plt.title('Age distribution based on the target variable', fontsize=22)
plt.xlabel('Death Event', fontsize=18, color='Black')
plt.ylabel('AGE', fontsize=18, color='Black')
plt.xticks(fontsize=15, color='Black')
plt.yticks(fontsize=15, color='Black')
plt.show()


# Pie chart of the share of patients in each sex / outcome combination.
plt.figure(figsize=(12, 9))

# Build each boolean mask on its own line. `&` binds tighter than `==`, so an
# unparenthesised `sex == 0 & (...)` is evaluated as `sex == (0 & (...))` and
# silently selects every female instead of only the intended subgroup.
is_male = data_df['sex'] == 1
is_female = data_df['sex'] == 0
event_occurred = data_df['DEATH_EVENT'] == 1
event_absent = data_df['DEATH_EVENT'] == 0

# NOTE: the historical variable names are kept for compatibility, but the
# `*_survival` frames hold rows with DEATH_EVENT == 1 (the patient died) and
# the `*_not` frames hold rows with DEATH_EVENT == 0 (the patient survived).
male_survival = data_df[is_male & event_occurred]
female_survival = data_df[is_female & event_occurred]
male_not = data_df[is_male & event_absent]
female_not = data_df[is_female & event_absent]

# Labels follow the DEATH_EVENT meaning above; the previous labels called the
# DEATH_EVENT == 1 groups "survived", which inverted the chart.
labels = ['male_died', 'female_died', 'male_survived', 'female_survived']
values = [len(male_survival), len(female_survival), len(male_not), len(female_not)]
plt.pie(x=values, autopct="%.1f%%", explode=[0.05] * 4, labels=labels, pctdistance=0.5)
plt.show()


# Scatter plot of creatinine phosphokinase against platelet count.
plt.figure(figsize=(12, 9))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the positional form raises a TypeError on current releases.
sns.scatterplot(x=data_df['creatinine_phosphokinase'], y=data_df['platelets'])
plt.xlabel('creatinine_phosphokinase', fontsize=18, color='Black')
plt.ylabel('platelets', fontsize=18, color='Black')
plt.xticks(fontsize=12, color='Black')
plt.yticks(fontsize=12, color='Black')
plt.show()


# Separate the predictors from the DEATH_EVENT target and report both shapes.
target_column = 'DEATH_EVENT'
X = data_df.drop(columns=[target_column])
Y = data_df[target_column]
for part in (X, Y):
    print(part.shape)

(299, 12)
(299,)


from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class ratio roughly equal in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    stratify=Y,
    random_state=45,
)


# Report the dimensions of each split.
print(f'Train X set shape: {X_train.shape}')
print(f'Train Y set shape: {y_train.shape}')
print(f'Test X set shape: {X_test.shape}')
print(f'Test Y set shape: {y_test.shape}')

Train X set shape: (239, 12)
Train Y set shape: (239,)
Test X set shape: (60, 12)
Test Y set shape: (60,)


# Class counts of the target in each split.
train_counts = y_train.value_counts()
test_counts = y_test.value_counts()
print(f'Training distribution:\n{train_counts}')
print(f'Test distribution:\n{test_counts}')

Training distribution:
0    162
1     77
Name: DEATH_EVENT, dtype: int64
Test distribution:
0    41
1    19
Name: DEATH_EVENT, dtype: int64


from sklearn.preprocessing import StandardScaler
# Standardise the features to zero mean and unit variance.
scale = StandardScaler()
# Learn the per-column mean and standard deviation from the training split only.
train_X_std = scale.fit_transform(X_train)
# Apply the training statistics to the test split. Calling fit_transform here
# would re-fit the scaler on the test rows, so the two splits would be scaled
# with different parameters and the model would be evaluated on inputs that do
# not match what it was trained on.
X_test_std = scale.transform(X_test)


from sklearn.linear_model import LogisticRegression
# Fit a logistic regression on the standardised training data and chart its
# coefficients, largest first, as a rough per-feature importance.
model_lr_imp = LogisticRegression()
model_lr_imp.fit(train_X_std, y_train)

importances = pd.DataFrame(
    {
        'features': X.columns,
        'Importance': model_lr_imp.coef_[0],
    }
).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 9))
plt.bar(
    x=importances['features'],
    height=importances['Importance'],
    color='#087E8B',
)
plt.title('Features importance based on coefficients', size=18)
plt.xticks(rotation='vertical')
plt.show()


# Preview the full feature matrix.
X.head(n=5)


# Keep only the nine features chosen for the reduced model and preview them.
selected_features = [
    'age',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'smoking',
    'time',
]
X_imp = X[selected_features]
X_imp.head(n=5)


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_roc_curve, roc_auc_score, classification_report
# Repeat the stratified 80/20 split (same seed) on the reduced feature set.
X_train_imp, X_test_imp, y_train_imp, y_test_imp = train_test_split(
    X_imp, Y, train_size=0.8, random_state=45, stratify=Y
)
# Use a dedicated scaler so that `scale`, already fitted on the full feature
# set, is not silently re-fitted on a different set of columns.
scale_imp = StandardScaler()
train_X_imp = scale_imp.fit_transform(X_train_imp)
# transform (not fit_transform): the test split must be scaled with the
# statistics learned from the training split, otherwise the scaler is re-fitted
# on test data and the two splits end up on different scales.
X_test_imp = scale_imp.transform(X_test_imp)


# Logistic regression trained and evaluated on the reduced, scaled feature set.
model_lr0 = LogisticRegression(random_state=45).fit(train_X_imp, y_train_imp)
y_pred_lr0 = model_lr0.predict(X_test_imp)
report_lr0 = classification_report(y_test_imp, y_pred_lr0)
print(report_lr0)

              precision    recall  f1-score   support

           0       0.80      0.78      0.79        41
           1       0.55      0.58      0.56        19

    accuracy                           0.72        60
   macro avg       0.68      0.68      0.68        60
weighted avg       0.72      0.72      0.72        60


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_roc_curve, roc_auc_score, classification_report
# Logistic regression trained and evaluated on all standardised features.
model_lr = LogisticRegression(random_state=45).fit(train_X_std, y_train)
y_pred_lr = model_lr.predict(X_test_std)
report_lr = classification_report(y_test, y_pred_lr)
print(report_lr)

              precision    recall  f1-score   support

           0       0.80      0.80      0.80        41
           1       0.58      0.58      0.58        19

    accuracy                           0.73        60
   macro avg       0.69      0.69      0.69        60
weighted avg       0.73      0.73      0.73        60


from sklearn.tree import DecisionTreeClassifier
# Shallow decision tree on the unscaled features, evaluated on the test split.
model_dt = DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf=25,
    random_state=45,
)
y_pred_dt = model_dt.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

           0       0.83      0.73      0.78        41
           1       0.54      0.68      0.60        19

    accuracy                           0.72        60
   macro avg       0.69      0.71      0.69        60
weighted avg       0.74      0.72      0.72        60


from sklearn.ensemble import RandomForestClassifier
# Baseline random forest on the unscaled features, evaluated on the test split.
# random_state is fixed (the same seed as the other models in this file) so
# that the bootstrap samples and feature draws - and therefore the reported
# metrics - are reproducible between runs.
model_rfc = RandomForestClassifier(
    n_estimators=200,
    max_depth=3,
    min_samples_leaf=25,
    random_state=45,
)
model_rfc.fit(X_train, y_train)
y_pred_rfc = model_rfc.predict(X_test)
print(classification_report(y_test, y_pred_rfc))

              precision    recall  f1-score   support

           0       0.80      0.95      0.87        41
           1       0.82      0.47      0.60        19

    accuracy                           0.80        60
   macro avg       0.81      0.71      0.73        60
weighted avg       0.80      0.80      0.78        60


from sklearn.model_selection import GridSearchCV
# Exhaustive grid search with cv=5 over tree depth, leaf size and split
# criterion, ranking the candidates by accuracy.
modeldt = DecisionTreeClassifier(random_state=45)
params = {
    'max_depth': [3, 5, 7, 9, 12],
    'min_samples_leaf': [10, 25, 30, 60, 90],
    'criterion': ["gini", "entropy"],
}
cv_model = GridSearchCV(
    estimator=modeldt,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=8,
    verbose=1,
)
cv_model.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=45), n_jobs=8,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 5, 7, 9, 12],
                         'min_samples_leaf': [10, 25, 30, 60, 90]},
             scoring='accuracy', verbose=1)


# Show the decision tree configured with the best-scoring parameter combination
cv_model.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=10,
                       random_state=45)


# Train a decision tree with the parameters reported by the grid search and
# evaluate it on the test split.
modeldt2 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=7,
    min_samples_leaf=10,
    random_state=45,
)
y_pred_dt2 = modeldt2.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred_dt2))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84        41
           1       0.67      0.53      0.59        19

    accuracy                           0.77        60
   macro avg       0.73      0.70      0.71        60
weighted avg       0.76      0.77      0.76        60


# Exhaustive grid search with cv=5 over forest size, tree depth, leaf size and
# split criterion, ranking the candidates by accuracy.
model_rfc2 = RandomForestClassifier(random_state=45)
params_2 = {
    'n_estimators': [50, 100, 150, 170],
    'max_depth': [3, 5, 7, 9, 12],
    'min_samples_leaf': [10, 25, 30, 60, 90],
    'criterion': ["gini", "entropy"],
}
cv_model_2 = GridSearchCV(
    estimator=model_rfc2,
    param_grid=params_2,
    scoring='accuracy',
    cv=5,
    n_jobs=8,
    verbose=1,
)
cv_model_2.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=45), n_jobs=8,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 5, 7, 9, 12],
                         'min_samples_leaf': [10, 25, 30, 60, 90],
                         'n_estimators': [50, 100, 150, 170]},
             scoring='accuracy', verbose=1)


# Show the random forest configured with the best-scoring parameter combination
cv_model_2.best_estimator_

RandomForestClassifier(max_depth=5, min_samples_leaf=10, n_estimators=150,
                       random_state=45)


# Train a random forest with the parameters reported by the grid search and
# evaluate it on the test split.
model_rfc_3 = RandomForestClassifier(
    n_estimators=150,
    max_depth=5,
    min_samples_leaf=10,
    random_state=45,
)
y_pred_rfc_3 = model_rfc_3.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred_rfc_3))

              precision    recall  f1-score   support

           0       0.83      0.93      0.87        41
           1       0.79      0.58      0.67        19

    accuracy                           0.82        60
   macro avg       0.81      0.75      0.77        60
weighted avg       0.81      0.82      0.81        60


import pickle
# Persist the tuned random forest. The context manager flushes and closes the
# file, so the pickle is guaranteed to be complete on disk before it is read
# back below (an unclosed handle can leave buffered bytes unwritten).
with open('RF_model.pkl', 'wb') as model_file:
    pickle.dump(model_rfc_3, model_file)


# Test
# Reload the model from disk to check the save/load round trip.
# NOTE: only unpickle files from a trusted source - loading a pickle can
# execute arbitrary code.
model_name = 'RF_model.pkl'
with open(model_name, 'rb') as saved_model:
    classifier = pickle.load(saved_model)


# Score the reloaded model on the held-out test split.
test_pred = classifier.predict(X_test)
reloaded_report = classification_report(y_test, test_pred)
print(reloaded_report)

              precision    recall  f1-score   support

           0       0.83      0.93      0.87        41
           1       0.79      0.58      0.67        19

    accuracy                           0.82        60
   macro avg       0.81      0.75      0.77        60
weighted avg       0.81      0.82      0.81        60

	age	anaemia	creatinine_phosphokinase	diabetes	ejection_fraction	high_blood_pressure	platelets	serum_creatinine	serum_sodium	sex	smoking	time	DEATH_EVENT
0	75.0	0	582	0	20	1	265000.00	1.9	130	1	0	4	1
1	55.0	0	7861	0	38	0	263358.03	1.1	136	1	0	6	1
2	65.0	0	146	0	20	0	162000.00	1.3	129	1	1	7	1
3	50.0	1	111	0	20	0	210000.00	1.9	137	1	0	7	1
4	65.0	1	160	1	20	0	327000.00	2.7	116	0	0	8	1

	age	anaemia	creatinine_phosphokinase	diabetes	ejection_fraction	high_blood_pressure	platelets	serum_creatinine	serum_sodium	sex	smoking	time	DEATH_EVENT
count	299.000000	299.000000	299.000000	299.000000	299.000000	299.000000	299.000000	299.00000	299.000000	299.000000	299.00000	299.000000	299.00000
mean	60.833893	0.431438	581.839465	0.418060	38.083612	0.351171	263358.029264	1.39388	136.625418	0.648829	0.32107	130.260870	0.32107
std	11.894809	0.496107	970.287881	0.494067	11.834841	0.478136	97804.236869	1.03451	4.412477	0.478136	0.46767	77.614208	0.46767
min	40.000000	0.000000	23.000000	0.000000	14.000000	0.000000	25100.000000	0.50000	113.000000	0.000000	0.00000	4.000000	0.00000
25%	51.000000	0.000000	116.500000	0.000000	30.000000	0.000000	212500.000000	0.90000	134.000000	0.000000	0.00000	73.000000	0.00000
50%	60.000000	0.000000	250.000000	0.000000	38.000000	0.000000	262000.000000	1.10000	137.000000	1.000000	0.00000	115.000000	0.00000
75%	70.000000	1.000000	582.000000	1.000000	45.000000	1.000000	303500.000000	1.40000	140.000000	1.000000	1.00000	203.000000	1.00000
max	95.000000	1.000000	7861.000000	1.000000	80.000000	1.000000	850000.000000	9.40000	148.000000	1.000000	1.00000	285.000000	1.00000

	age	anaemia	creatinine_phosphokinase	diabetes	ejection_fraction	high_blood_pressure	platelets	serum_creatinine	serum_sodium	sex	smoking	time
0	75.0	0	582	0	20	1	265000.00	1.9	130	1	0	4
1	55.0	0	7861	0	38	0	263358.03	1.1	136	1	0	6
2	65.0	0	146	0	20	0	162000.00	1.3	129	1	1	7
3	50.0	1	111	0	20	0	210000.00	1.9	137	1	0	7
4	65.0	1	160	1	20	0	327000.00	2.7	116	0	0	8

	age	creatinine_phosphokinase	diabetes	ejection_fraction	platelets	serum_creatinine	serum_sodium	smoking	time
0	75.0	582	0	20	265000.00	1.9	130	0	4
1	55.0	7861	0	38	263358.03	1.1	136	0	6
2	65.0	146	0	20	162000.00	1.3	129	1	7
3	50.0	111	0	20	210000.00	1.9	137	0	7
4	65.0	160	1	20	327000.00	2.7	116	0	8

About the data:

Based on the above heatmap

Based on domain knowledge

Analysis of survival based on gender

Split data into train and test sets

Standardization

Feature importance

Modeling after feature selection

Modeling with all features

Hyperparameter tuning

Best model selection (Random Forest Classifier after hyperparameter tuning)