Hopefully you will find enough information about how to set images in your blog here. This is an example of a post which includes a feature image specified in the front matter of the post. The feature image spans the full-width of the page, and is shown with the title on permalink pages:

CUSTOMER CHURN ANALYSIS

Import Library

# Library for data analysis
import numpy as np
import pandas as pd

# Library for visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Library for machine learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# Setting parameter kernel
plt.rcParams['figure.figsize'] = [10,6]
sns.set_style('darkgrid')

Data Preparation

Import Raw Data

# Read file as dataframe
df = pd.read_csv('https://dqlab-dataset.s3-ap-southeast-1.amazonaws.com/data_retail.csv', sep = ';')
# Print first five rows
df.head()

	no	Row_Num	Customer_ID	Product	First_Transaction	Last_Transaction	Average_Transaction_Amount	Count_Transaction
0	1	1	29531	Jaket	1466304274396	1538718482608	1467681	22
1	2	2	29531	Sepatu	1406077331494	1545735761270	1269337	41
2	3	3	141526	Tas	1493349147000	1548322802000	310915	30
3	4	4	141526	Jaket	1493362372547	1547643603911	722632	27
4	5	5	37545	Sepatu	1429178498531	1542891221530	1775036	25

print('Info dataset:')
df.info()

Info dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   no                          100000 non-null  int64 
 1   Row_Num                     100000 non-null  int64 
 2   Customer_ID                 100000 non-null  int64 
 3   Product                     100000 non-null  object
 4   First_Transaction           100000 non-null  int64 
 5   Last_Transaction            100000 non-null  int64 
 6   Average_Transaction_Amount  100000 non-null  int64 
 7   Count_Transaction           100000 non-null  int64 
dtypes: int64(7), object(1)
memory usage: 6.1+ MB

Data Wrangling

# Change format column First_Transaction and Last Transaction
df['First_Transaction'] = pd.to_datetime(df['First_Transaction']/1000, unit='s', origin='1970-01-01')
df['Last_Transaction'] = pd.to_datetime(df['Last_Transaction']/1000, unit='s', origin='1970-01-01')
# View new format
df[['First_Transaction','Last_Transaction']].head()

	First_Transaction	Last_Transaction
0	2016-06-19 02:44:34.395999908	2018-10-05 05:48:02.608000040
1	2014-07-23 01:02:11.493999958	2018-12-25 11:02:41.269999981
2	2017-04-28 03:12:27.000000000	2019-01-24 09:40:02.000000000
3	2017-04-28 06:52:52.546999931	2019-01-16 13:00:03.911000013
4	2015-04-16 10:01:38.530999899	2018-11-22 12:53:41.529999970

# Check Last Transaction update
print(max(df['Last_Transaction']))

2019-02-01 23:57:57.286000013

# Create new column is_Churn
df.loc[df['Last_Transaction'] <= '2018-08-01', 'is_Churn'] = True 
df.loc[df['Last_Transaction'] > '2018-08-01', 'is_Churn'] = False 

# Remove unnecessary columns
del df['no']
del df['Row_Num']

Clean Dataset

# View first five rows after cleaning data
df.head()

	Customer_ID	Product	First_Transaction	Last_Transaction	Average_Transaction_Amount	Count_Transaction	is_Churn
0	29531	Jaket	2016-06-19 02:44:34.395999908	2018-10-05 05:48:02.608000040	1467681	22	False
1	29531	Sepatu	2014-07-23 01:02:11.493999958	2018-12-25 11:02:41.269999981	1269337	41	False
2	141526	Tas	2017-04-28 03:12:27.000000000	2019-01-24 09:40:02.000000000	310915	30	False
3	141526	Jaket	2017-04-28 06:52:52.546999931	2019-01-16 13:00:03.911000013	722632	27	False
4	37545	Sepatu	2015-04-16 10:01:38.530999899	2018-11-22 12:53:41.529999970	1775036	25	False

# View info dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   Customer_ID                 100000 non-null  int64         
 1   Product                     100000 non-null  object        
 2   First_Transaction           100000 non-null  datetime64[ns]
 3   Last_Transaction            100000 non-null  datetime64[ns]
 4   Average_Transaction_Amount  100000 non-null  int64         
 5   Count_Transaction           100000 non-null  int64         
 6   is_Churn                    100000 non-null  object        
dtypes: datetime64[ns](2), int64(3), object(2)
memory usage: 5.3+ MB

Data Visualization

Customer Acquisition By Year

# Create new column Year First Transaction
df['Year_First_Transaction'] = df['First_Transaction'].dt.year
# Create new column Last Transaction
df['Year_Last_Transaction'] = df['Last_Transaction'].dt.year
# Grouping Number of Customers by Year
cust_by_year = df.groupby(['Year_First_Transaction'])['Customer_ID'].count()
cust_by_year

Year_First_Transaction
   1007
   4954
  11235
  17656
  31828
  30327
   2993
Name: Customer_ID, dtype: int64

# Plotting number of customers by year
sns.barplot(x = cust_by_year.index, y = cust_by_year, palette='Set2')
plt.title('Graph of Customer Acquisition', fontsize=15, pad=15)
plt.xlabel('Year Transaction', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)

Text(0, 0.5, 'Number of Customers')

png

Transaction by Year

# Grouping total Count Transaction by Year
trans_by_year = df.groupby(['Year_First_Transaction'])['Count_Transaction'].sum()
trans_by_year

Year_First_Transaction
   23154
  165494
  297445
  278707
  299199
   99989
    5862
Name: Count_Transaction, dtype: int64

# Plotting total transactions by year
sns.barplot(x = trans_by_year.index, y = trans_by_year, palette='Set2')
plt.title('Graph of Transaction Customer', fontsize=15, pad=15)
plt.xlabel('Year Transaction', fontsize=12)
plt.ylabel('Total Transactions', fontsize=12)

Text(0, 0.5, 'Total Transactions')

png

Average Transaction Amount by Year

# Grouping Average Transaction Amount by Product and Year
avg_trans_amount = df.groupby(['Product', 'Year_First_Transaction'])['Average_Transaction_Amount'].mean().reset_index()
avg_trans_amount

	Product	Year_First_Transaction	Average_Transaction_Amount
0	Baju	2017	1.490890e+06
1	Baju	2018	1.570201e+06
2	Baju	2019	1.383645e+06
3	Jaket	2014	1.467937e+06
4	Jaket	2015	1.296265e+06
5	Jaket	2016	1.317344e+06
6	Jaket	2017	1.369034e+06
7	Jaket	2018	1.419074e+06
8	Jaket	2019	1.447536e+06
9	Sepatu	2013	1.396499e+06
10	Sepatu	2014	1.427063e+06
11	Sepatu	2015	1.428235e+06
12	Sepatu	2016	1.425938e+06
13	Sepatu	2017	1.407275e+06
14	Sepatu	2018	1.346824e+06
15	Sepatu	2019	1.338180e+06
16	Tas	2017	1.109583e+06
17	Tas	2018	1.337614e+06
18	Tas	2019	1.287529e+06

# Plotting by Product
sns.pointplot(data = avg_trans_amount, x='Year_First_Transaction', 
              y='Average_Transaction_Amount', hue='Product')
plt.legend(loc='upper right', bbox_to_anchor=(1.15, 1.01))
plt.title('Average Transaction Amount by Year', fontsize=15, pad=15)
plt.xlabel('Year Transaction', fontsize=12)
plt.ylabel('Average Transaction Amount', fontsize=12)

Text(0, 0.5, 'Average Transaction Amount')

png

Churn Customer Proportion

# Create pivot table
churn_prop = df.pivot_table(index='is_Churn', 
                        columns='Product',
                        values='Customer_ID', 
                        aggfunc='count', 
                        fill_value=0)
churn_prop

Product	Baju	Jaket	Sepatu	Tas
is_Churn
False	1268	11123	16064	4976
True	2144	23827	33090	7508

# Plotting churn proportion using pie plot
churn_prop.plot.pie(subplots=True, layout=(2,2), autopct='%1.0f%%', 
                    legend=False, title='Churn Proportion by Product')
plt.tight_layout()

png

Distribution Count Transaction

# Create function distribution count transaction
def distribution_count_trans(row):
    if row['Count_Transaction'] == 1:
        val = '1'
    elif (row['Count_Transaction'] > 1 and row['Count_Transaction'] <= 3):
        val ='2 - 3'
    elif (row['Count_Transaction'] > 3 and row['Count_Transaction'] <= 6):
        val ='4 - 6'
    elif (row['Count_Transaction'] > 6 and row['Count_Transaction'] <= 10):
        val ='7 - 10'
    else:
        val ='> 10'
    return val

# Apply function to new column
df['Count_Transaction_Group'] = df.apply(distribution_count_trans, axis=1)
# Grouping Number of Customers by Count Transaaction Group
count_trans_group = df.groupby(['Count_Transaction_Group'])['Customer_ID'].count()
count_trans_group

Count_Transaction_Group
1         49255
2 - 3     14272
4 - 6     12126
7 - 10     2890
> 10      21457
Name: Customer_ID, dtype: int64

# Plotting count transaction group 
sns.barplot(x = count_trans_group.index, y = count_trans_group, palette='Set2')
plt.title('Customer Distribution by Count Transaction Group', fontsize=15, pad=15)
plt.xlabel('Transaction Group', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)

Text(0, 0.5, 'Number of Customers')

png

Distribution Average Transaction

# Create function distribution average transaction
def distribution_avg_trans(row):
    if (row['Average_Transaction_Amount'] >= 100000 and row['Average_Transaction_Amount'] <= 200000):
        val ='100.000 - 250.000'
    elif (row['Average_Transaction_Amount'] > 250000 and row['Average_Transaction_Amount'] <= 500000):
        val ='>250.000 - 500.000'
    elif (row['Average_Transaction_Amount'] > 500000 and row['Average_Transaction_Amount'] <= 750000):
        val ='>500.000 - 750.000'
    elif (row['Average_Transaction_Amount'] > 750000 and row['Average_Transaction_Amount'] <= 1000000):
        val ='>750.000 - 1.000.000'
    elif (row['Average_Transaction_Amount'] > 1000000 and row['Average_Transaction_Amount'] <= 2500000):
        val ='>1.000.000 - 2.500.000'
    elif (row['Average_Transaction_Amount'] > 2500000 and row['Average_Transaction_Amount'] <= 5000000):
        val ='>2.500.000 - 5.000.000'
    elif (row['Average_Transaction_Amount'] > 5000000 and row['Average_Transaction_Amount'] <= 10000000):
        val ='>5.000.000 - 10.000.000'
    else:
        val ='>10.000.000'
    return val

# Apply function to new column
df['Average_Transaction_Amount_Group'] = df.apply(distribution_avg_trans, axis=1)
# Grouping Number of Customers by Average Transaction Amount Group
avg_trans_group = df.groupby(['Average_Transaction_Amount_Group'])['Customer_ID'].count()
avg_trans_group

Average_Transaction_Amount_Group
100.000 - 250.000           4912
>1.000.000 - 2.500.000     32819
>10.000.000                 3227
>2.500.000 - 5.000.000      9027
>250.000 - 500.000         18857
>5.000.000 - 10.000.000     3689
>500.000 - 750.000         15171
>750.000 - 1.000.000       12298
Name: Customer_ID, dtype: int64

# Plotting average transaction amount group
orders = ['100.000 - 250.000', '>250.000 - 500.000', '>500.000 - 750.000',
          '>750.000 - 1.000.000', '>1.000.000 - 2.500.000', '>2.500.000 - 5.000.000',
          '>5.000.000 - 10.000.000', '>10.000.000']
sns.barplot(x = avg_trans_group, y = avg_trans_group.index, order=orders, palette='Set2')
plt.title('Customer Distribution by Average Transaction Amount Group', fontsize=15, pad=15)
plt.xlabel('Number of Customers', fontsize=12)
plt.ylabel('Average Transaction Amount Group', fontsize=12)

Text(0, 0.5, 'Average Transaction Amount Group')

png

Data Modelling

Feature Engineering

# Check datatypes
df.dtypes

Customer_ID                                  int64
Product                                     object
First_Transaction                   datetime64[ns]
Last_Transaction                    datetime64[ns]
Average_Transaction_Amount                   int64
Count_Transaction                            int64
is_Churn                                    object
Year_First_Transaction                       int64
Year_Last_Transaction                        int64
Count_Transaction_Group                     object
Average_Transaction_Amount_Group            object
dtype: object

# Copy old dataframe
df_model = df.copy()
# Create new column from First Transaction
df_model['Month_First'] = df_model['First_Transaction'].dt.month
df_model['Day_First'] = df_model['First_Transaction'].dt.day
# Create new column from Last Transaction
df_model['Month_Last'] = df_model['Last_Transaction'].dt.month
df_model['Day_Last'] = df_model['Last_Transaction'].dt.day

# Encoding categorical data
df_model['Count_Transaction_Group'].replace({'1':0, '2 - 3':1, '4 - 6':2, '7 - 10':3,
                                             '> 10':4}, inplace=True)
amounts = {'100.000 - 250.000':0, '>250.000 - 500.000':1, '>500.000 - 750.000':2,
          '>750.000 - 1.000.000':3, '>1.000.000 - 2.500.000':4, '>2.500.000 - 5.000.000':5,
          '>5.000.000 - 10.000.000':6, '>10.000.000':7}
df_model['Average_Transaction_Amount_Group'].replace(amounts, inplace=True)
df_model['is_Churn'].replace({False:0, True:1}, inplace=True)

# Perform one hot encoding
df_encode = pd.get_dummies(df_model['Product'])
# Selected columns
remove_cols = ['Product', 'First_Transaction', 'Last_Transaction', 'Count_Transaction', 'Average_Transaction_Amount']
df_fix = df_model.drop(remove_cols, axis=1)
# Concatenate dataframe
df_model_fix = pd.concat([df_fix, df_encode], axis=1)

# Check new first five rows
df_model_fix.head()

	Customer_ID	Year_First_Transaction	Year_Last_Transaction	Count_Transaction_Group	Average_Transaction_Amount_Group	Month_First	Day_First	Month_Last	Day_Last	Jaket	Sepatu	Tas
0	29531	2016	2018	4	4	6	19	10	5	1	0	0
1	29531	2014	2018	4	4	7	23	12	25	0	1	0
2	141526	2017	2019	4	1	4	28	1	24	0	0	1
3	141526	2017	2019	4	2	4	28	1	16	1	0	0
4	37545	2015	2018	4	4	4	16	11	22	0	1	0

# Check new data type
df_model_fix.dtypes

Customer_ID                         int64
is_Churn                            int64
Year_First_Transaction              int64
Year_Last_Transaction               int64
Count_Transaction_Group             int64
Average_Transaction_Amount_Group    int64
Month_First                         int64
Day_First                           int64
Month_Last                          int64
Day_Last                            int64
Baju                                uint8
Jaket                               uint8
Sepatu                              uint8
Tas                                 uint8
dtype: object

# View Total Proportion Churn
df_model_fix['is_Churn'].value_counts(normalize=True)

1    0.66569
0    0.33431
Name: is_Churn, dtype: float64

Split Train and Test Data

# Choose feature columns
X = df_model_fix.drop(['is_Churn'], axis=1)
# Choose target columns
y = df_model_fix['is_Churn']

# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

Model Selection

# Create function choose model
def choose_model(model_name):
  # Train model
  model = model_name.fit(X_train, y_train)
  # Scoring Model
  model_score = model.score(X_test, y_test)
  print(f'Score Accuracy is {round(model_score*100, 2)}%')
  # Validating Model
  valid_score = np.mean(cross_val_score(model_name, X_train, y_train, cv=5))
  print(f'Score Validation is {round(valid_score*100, 2)}%')

# Choose model
choose_model(LogisticRegression())

Score Accuracy is 72.19%
Score Validation is 74.93%

# Choose model
choose_model(xgb.XGBClassifier())

Score Accuracy is 100.0%
Score Validation is 100.0%

Model Evaluation

# Function for plot confusion matrix and classification report
def check_matrix_and_reports(model_plot):
  # Churn Prediction 
  train = model_plot.fit(X_train, y_train)
  preds = train.predict(X_test)

  # Create confusion matrix
  class_names = ['Not Churn', 'Churn'] 
  cnf_matrix = confusion_matrix(y_test, preds)
  # Use subplotting
  fig, ax = plt.subplots(figsize=(8,4))
  tick_marks = np.arange(len(class_names))
  plt.xticks(tick_marks, class_names)
  plt.yticks(tick_marks, class_names)
  # Plotting using heatmap
  sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap='YlGnBu', fmt='g')
  plt.title('Confusion matrix', fontsize=15, pad=15)
  plt.xlabel('Actual')
  plt.ylabel('Predicted')
  
  # Create classification report
  reports = classification_report(y_test, preds, target_names=class_names)
  print('Classification Report\n', reports)

# Apply function with Logistic Regression
check_matrix_and_reports(LogisticRegression())

Classification Report
               precision    recall  f1-score   support

   Not Churn       0.70      0.31      0.43      8399
       Churn       0.73      0.93      0.82     16601

    accuracy                           0.72     25000
   macro avg       0.71      0.62      0.62     25000
weighted avg       0.72      0.72      0.69     25000

png

# Apply function with XGBoost Classifier
check_matrix_and_reports(xgb.XGBClassifier())

Classification Report
               precision    recall  f1-score   support

   Not Churn       1.00      1.00      1.00      8399
       Churn       1.00      1.00      1.00     16601

    accuracy                           1.00     25000
   macro avg       1.00      1.00      1.00     25000
weighted avg       1.00      1.00      1.00     25000

png

# Predict probability of churn
model1 = LogisticRegression().fit(X_train, y_train)
log_pred = model1.predict_proba(X_test)[:,1]
model2 = xgb.XGBClassifier().fit(X_train, y_train)
xgb_pred = model2.predict_proba(X_test)[:,1]

# ROC Chart components
fallout_log, sensitivity_log, thresholds_log = roc_curve(y_test, log_pred)
fallout_xgb, sensitivity_xgb, thresholds_xgb = roc_curve(y_test, xgb_pred)

# ROC Chart with both
plt.plot(fallout_log, sensitivity_log, color = 'blue', label='%s' % 'Logistic Regression')
plt.plot(fallout_xgb, sensitivity_xgb, color = 'green', label='%s' % 'XGBoost')
plt.plot([0, 1], [0, 1], linestyle='--', label='%s' % 'Random Prediction')
plt.title("ROC Chart on Probability of Churn")
plt.xlabel('Fall-out')
plt.ylabel('Sensitivity')
plt.legend()

<matplotlib.legend.Legend at 0x7f2d2a3b0f90>

png

# Print the logistic regression AUC with formatting
print("Logistic Regression AUC Score: %0.2f" % roc_auc_score(y_test, log_pred))

# Print the xgboost classifier AUC with formatting
print("XGBoost Classifier AUC Score: %0.2f" % roc_auc_score(y_test, xgb_pred))

Logistic Regression AUC Score: 0.58
XGBoost Classifier AUC Score: 1.00