# import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Load the Telco customer-churn dataset from a CSV file in the working directory.
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')


# Dimensions of the loaded frame as (rows, columns).
df.shape

(7043, 21)


# Preview the first three rows.
df.head(3)


# Column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB


# Check for missing values (per-column count of NaN/None entries).
# Note: isnull() does not count blank strings, so placeholder values such as
# ' ' in a text-typed column are not reported here.
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


# Clean the TotalCharges column: it is read as text because some records hold
# a blank string instead of a number.
#
# pd.to_numeric(errors='coerce') converts the column to float in one vectorized
# step and turns every non-numeric entry (the blank placeholders, whatever
# their exact whitespace) into NaN, so those rows can be dropped afterwards.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')


# Drop the rows whose TotalCharges could not be parsed. The explicit copy makes
# df an independent frame, so later column assignments do not trigger pandas'
# SettingWithCopy warning. The original row index labels are preserved.
df = df.dropna(subset=['TotalCharges']).copy()


# Point plot of tenure (x) against total charges (y), one marker per row.
plt.figure(figsize=(9, 7))
plt.style.use('ggplot')
plt.plot(
    df['tenure'],
    df['TotalCharges'],
    marker='.',
    linestyle='None',  # markers only, no connecting line
    color='#2ca02c',
)
plt.title('Relationship of tenure with total charges')
plt.xlabel('Tenure')
plt.ylabel('Total Charges')
plt.show()


# Churn distribution: bar chart of the number of customers per Churn value.
plt.figure(figsize=(9,7))
# The bundled seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix
# in matplotlib 3.6 and the old names were later removed. Use the original
# name where it is still available and fall back to the renamed one otherwise,
# so the cell runs on both old and new matplotlib releases.
churn_style = 'seaborn-dark' if 'seaborn-dark' in plt.style.available else 'seaborn-v0_8-dark'
plt.style.use(churn_style)
df.Churn.value_counts().plot.bar()
plt.xlabel('Churn')
plt.ylabel('Number of customers')
plt.show()


# Non-null counts of every remaining column for each (gender, Churn) pair.
df.groupby(by=['gender','Churn']).count()


# Preview the first five rows.
df.head()


# Heatmap of pairwise correlations among the numerical features.
plt.figure(figsize=(9,7))
# Restrict to numeric (and boolean) columns explicitly. Older pandas silently
# ignored text columns in DataFrame.corr(), but pandas >= 2.0 raises on them,
# and most columns of this frame are text at this point.
corr = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr, annot=True, linewidths=1, cmap="YlGnBu")
plt.title("Correlation among numerical features")
plt.show()


# Point plot of monthly charges (x) against total charges (y), one marker per row.
plt.figure(figsize=(9, 7))
plt.style.use('ggplot')
plt.plot(
    df['MonthlyCharges'],
    df['TotalCharges'],
    marker='.',
    linestyle='None',  # markers only, no connecting line
    color='#9467bd',
)
plt.title('Relationship of Monthly with total charges')
plt.xlabel('Monthly charges')
plt.ylabel('Total charges')
plt.show()


# Discard the MonthlyCharges and tenure columns; TotalCharges is kept.
# NOTE(review): presumably dropped because they are strongly related to
# TotalCharges in the plots above -- confirm the intent.
df = df.drop(columns=['MonthlyCharges', 'tenure'])


# Remaining column labels.
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'TotalCharges', 'Churn'],
      dtype='object')


# Re-check dtypes and non-null counts after the row filtering and column drops.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  Contract          7032 non-null   object 
 15  PaperlessBilling  7032 non-null   object 
 16  PaymentMethod     7032 non-null   object 
 17  TotalCharges      7032 non-null   float64
 18  Churn             7032 non-null   object 
dtypes: float64(1), int64(1), object(17)
memory usage: 1.1+ MB


# Dimensions of the cleaned frame as (rows, columns).
df.shape

(7032, 19)


# Preview the first five rows.
df.head()


# Distinct values present in the MultipleLines column.
df.MultipleLines.unique()

array(['No phone service', 'No', 'Yes'], dtype=object)


df.head()


# Names of the text-valued columns (including the Churn target) that are
# one-hot encoded later in the notebook.
categorical_cols = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'Churn',
]


# Min-max normalization of TotalCharges: (x - min) / (max - min), computed on
# the underlying NumPy array and written back over the column.
total_charge_array = df.TotalCharges.values
charge_min = total_charge_array.min()
charge_max = total_charge_array.max()
df.TotalCharges = (total_charge_array - charge_min) / (charge_max - charge_min)


df.head()


# One-hot encode every categorical column. drop_first=True drops one level per
# feature, so the two-valued Churn target ends up as the single indicator
# column 'Churn_Yes'.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


df.head()


# customerID is removed before the feature matrix is built.
df = df.drop(['customerID'], axis=1)


# Select the target by name rather than by position: this does not depend on
# the column order produced by get_dummies, and it yields a 1-D Series, which
# is the shape scikit-learn estimators expect for y (a one-column DataFrame
# triggers a column-vector conversion warning on fit).
Y = df['Churn_Yes']
X = df.drop(columns=['Churn_Yes'])


Y.head()


from sklearn.model_selection import train_test_split


# Hold out 25% of the rows for testing, stratified on the target so both
# partitions keep the same class proportions. A fixed random_state makes the
# split, and therefore every metric reported below, reproducible across runs.
train_X, test_X, train_y, test_y = train_test_split(
    X, Y, test_size=0.25, stratify=Y, random_state=42
)


import warnings
# Suppress all warnings from here on.
# NOTE(review): this is a blanket filter; it also hides deprecation and
# convergence warnings raised by the models below.
warnings.filterwarnings('ignore')


from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_roc_curve, roc_auc_score

# Logistic regression with default hyper-parameters.
model0 = LogisticRegression()
model0.fit(train_X, train_y)
ypred0 = model0.predict(test_X)

print(classification_report(test_y, ypred0))
# ROC AUC is a ranking metric: score it on the predicted probability of the
# positive class (column 1 of predict_proba), not on the thresholded 0/1
# labels, which reduce the ROC curve to a single operating point.
auc_lr = roc_auc_score(test_y, model0.predict_proba(test_X)[:, 1])
# Draw the curve on the axes of the 9x7 figure created here; without ax=...
# plot_roc_curve opens its own figure and this one is left empty.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2 in favour of
# RocCurveDisplay.from_estimator -- migrate when upgrading.
plt.figure(figsize=(9,7))
plot_roc_curve(model0, test_X, test_y, ax=plt.gca())
plt.show()

              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1291
           1       0.65      0.53      0.58       467

    accuracy                           0.80      1758
   macro avg       0.74      0.71      0.72      1758
weighted avg       0.79      0.80      0.79      1758

<Figure size 648x504 with 0 Axes>


from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyper-parameters.
model1 = DecisionTreeClassifier()
model1.fit(train_X, train_y)
ypred1 = model1.predict(test_X)

print(classification_report(test_y, ypred1))

# ROC AUC is a ranking metric: score it on the predicted probability of the
# positive class (column 1 of predict_proba), not on the thresholded 0/1
# labels, which reduce the ROC curve to a single operating point.
auc_dt = roc_auc_score(test_y, model1.predict_proba(test_X)[:, 1])
plot_roc_curve(model1, test_X, test_y)
plt.show()

              precision    recall  f1-score   support

           0       0.83      0.80      0.81      1291
           1       0.49      0.54      0.51       467

    accuracy                           0.73      1758
   macro avg       0.66      0.67      0.66      1758
weighted avg       0.74      0.73      0.73      1758


from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters.
model2 = RandomForestClassifier()
model2.fit(train_X, train_y)
ypred2 = model2.predict(test_X)

print(classification_report(test_y, ypred2))

# ROC AUC is a ranking metric: score it on the predicted probability of the
# positive class (column 1 of predict_proba), not on the thresholded 0/1
# labels, which reduce the ROC curve to a single operating point.
auc_rf = roc_auc_score(test_y, model2.predict_proba(test_X)[:, 1])
plot_roc_curve(model2, test_X, test_y)
plt.show()

              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1291
           1       0.62      0.48      0.54       467

    accuracy                           0.78      1758
   macro avg       0.72      0.69      0.70      1758
weighted avg       0.77      0.78      0.77      1758


from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with default hyper-parameters.
model3 = AdaBoostClassifier()
model3.fit(train_X, train_y)
ypred3 = model3.predict(test_X)

print(classification_report(test_y, ypred3))

# ROC AUC is a ranking metric: score it on the predicted probability of the
# positive class (column 1 of predict_proba), not on the thresholded 0/1
# labels, which reduce the ROC curve to a single operating point.
auc_ab = roc_auc_score(test_y, model3.predict_proba(test_X)[:, 1])
plot_roc_curve(model3, test_X, test_y)
plt.show()

              precision    recall  f1-score   support

           0       0.83      0.90      0.87      1291
           1       0.65      0.51      0.57       467

    accuracy                           0.80      1758
   macro avg       0.74      0.70      0.72      1758
weighted avg       0.79      0.80      0.79      1758


from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with default hyper-parameters.
model4 = GradientBoostingClassifier()
model4.fit(train_X, train_y)
ypred4 = model4.predict(test_X)

print(classification_report(test_y, ypred4))

# ROC AUC is a ranking metric: score it on the predicted probability of the
# positive class (column 1 of predict_proba), not on the thresholded 0/1
# labels, which reduce the ROC curve to a single operating point.
auc_gb = roc_auc_score(test_y, model4.predict_proba(test_X)[:, 1])
plot_roc_curve(model4, test_X, test_y)
plt.show()

              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1291
           1       0.66      0.49      0.56       467

    accuracy                           0.80      1758
   macro avg       0.74      0.70      0.72      1758
weighted avg       0.79      0.80      0.79      1758


import xgboost as xgb
# XGBoost classifier with default hyper-parameters. eval_metric is set
# explicitly to 'logloss' -- the value the library already defaults to for
# 'binary:logistic' since 1.3.0 -- so the behaviour is unchanged but the
# "default evaluation metric was changed" warning is no longer emitted.
model5 = xgb.XGBClassifier(eval_metric='logloss')
model5.fit(train_X, train_y)
ypred5 = model5.predict(test_X)

print(classification_report(test_y, ypred5))

# ROC AUC is a ranking metric: score it on the predicted probability of the
# positive class (column 1 of predict_proba), not on the thresholded 0/1
# labels, which reduce the ROC curve to a single operating point.
auc_xgb = roc_auc_score(test_y, model5.predict_proba(test_X)[:, 1])
plot_roc_curve(model5, test_X, test_y)
plt.show()

[23:52:46] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1291
           1       0.61      0.51      0.55       467

    accuracy                           0.78      1758
   macro avg       0.72      0.69      0.70      1758
weighted avg       0.77      0.78      0.78      1758


# Bar chart comparing the AUC score of every fitted model.
# The bundled seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix
# in matplotlib 3.6 and the old names were later removed. Use the original
# name where it is still available and fall back to the renamed one otherwise.
bar_style = (
    'seaborn-dark-palette'
    if 'seaborn-dark-palette' in plt.style.available
    else 'seaborn-v0_8-dark-palette'
)
plt.style.use(bar_style)
plt.figure(figsize=(9,7))
# Labels and scores are kept in the same order so each bar matches its model.
model_list = ['Logistic Reg', 'DecisionTree','RandomForest','AdaBoost','GradientBoost','Xgboost']
auc_socre = [auc_lr, auc_dt, auc_rf, auc_ab, auc_gb, auc_xgb]
plt.bar(model_list, auc_socre)
plt.xticks(rotation=90)
plt.ylabel('AUC Score')
plt.title('Churn Modeling Result', fontsize=20)
plt.show()


# Re-import of pyplot (already imported at the top of the file; harmless).
import matplotlib.pyplot as plt

# List the style-sheet names accepted by plt.style.use() in this installation.
print(plt.style.available)

['Solarize_Light2', '_classic_test_patch', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']

		customerID	SeniorCitizen	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges
gender	Churn
Female	No	2544	2544	2544	2544	2544	2544	2544	2544	2544	2544	2544	2544	2544	2544	2544	2544	2544	2544	2544
Female	Yes	939	939	939	939	939	939	939	939	939	939	939	939	939	939	939	939	939	939	939
Male	No	2619	2619	2619	2619	2619	2619	2619	2619	2619	2619	2619	2619	2619	2619	2619	2619	2619	2619	2619
Male	Yes	930	930	930	930	930	930	930	930	930	930	930	930	930	930	930	930	930	930	930

	customerID	TotalCharges	gender_Male	Partner_Yes	PhoneService_Yes	MultipleLines_No phone service	InternetService_Fiber optic	...	Contract_One year	PaperlessBilling_Yes	PaymentMethod_Electronic check	PaymentMethod_Mailed check	Churn_Yes
0	7590-VHVEG	0.001275	0	1	0	1	0	...	0	1	1	0	0
1	5575-GNVDE	0.215867	1	0	1	0	0	...	1	0	0	1	0
2	3668-QPYBK	0.010310	1	0	1	0	0	...	0	1	0	1	1
3	7795-CFOCW	0.210241	1	0	0	1	0	...	1	0	0	0	0
4	9237-HQITU	0.015330	0	0	1	0	1	...	0	1	1	0	1

Churn Analysis¶

Data Content¶

Correlation among independent variables (numerical types)¶

	customerID	gender	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	...	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
0	7590-VHVEG	Female	Yes	No	1	No	No phone service	DSL	No	...	No	No	No	No	Month-to-month	Yes	Electronic check	29.85	29.85	No
1	5575-GNVDE	Male	No	No	34	Yes	No	DSL	Yes	...	Yes	No	No	No	One year	No	Mailed check	56.95	1889.5	No
2	3668-QPYBK	Male	No	No	2	Yes	No	DSL	Yes	...	No	No	No	No	Month-to-month	Yes	Mailed check	53.85	108.15	Yes