Hopefully you will find enough information about how to set images in your blog here. This is an example of a post which includes a feature image specified in the front matter of the post. The feature image spans the full-width of the page, and is shown with the title on permalink pages:

CUSTOMER CHURN ANALYSIS

Import Library

# Library for data analysis
import numpy as np
import pandas as pd

# Library for visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Library for machine learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# Setting parameter kernel
plt.rcParams['figure.figsize'] = [10,6]
sns.set_style('darkgrid')

Data Preparation

Import Raw Data

# Read file as dataframe
df = pd.read_csv('https://dqlab-dataset.s3-ap-southeast-1.amazonaws.com/data_retail.csv', sep = ';')
# Print first five rows
df.head()
no Row_Num Customer_ID Product First_Transaction Last_Transaction Average_Transaction_Amount Count_Transaction
0 1 1 29531 Jaket 1466304274396 1538718482608 1467681 22
1 2 2 29531 Sepatu 1406077331494 1545735761270 1269337 41
2 3 3 141526 Tas 1493349147000 1548322802000 310915 30
3 4 4 141526 Jaket 1493362372547 1547643603911 722632 27
4 5 5 37545 Sepatu 1429178498531 1542891221530 1775036 25
print('Info dataset:')
df.info()
Info dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   no                          100000 non-null  int64 
 1   Row_Num                     100000 non-null  int64 
 2   Customer_ID                 100000 non-null  int64 
 3   Product                     100000 non-null  object
 4   First_Transaction           100000 non-null  int64 
 5   Last_Transaction            100000 non-null  int64 
 6   Average_Transaction_Amount  100000 non-null  int64 
 7   Count_Transaction           100000 non-null  int64 
dtypes: int64(7), object(1)
memory usage: 6.1+ MB

Data Wrangling

# Change format column First_Transaction and Last Transaction
df['First_Transaction'] = pd.to_datetime(df['First_Transaction']/1000, unit='s', origin='1970-01-01')
df['Last_Transaction'] = pd.to_datetime(df['Last_Transaction']/1000, unit='s', origin='1970-01-01')
# View new format
df[['First_Transaction','Last_Transaction']].head()
First_Transaction Last_Transaction
0 2016-06-19 02:44:34.395999908 2018-10-05 05:48:02.608000040
1 2014-07-23 01:02:11.493999958 2018-12-25 11:02:41.269999981
2 2017-04-28 03:12:27.000000000 2019-01-24 09:40:02.000000000
3 2017-04-28 06:52:52.546999931 2019-01-16 13:00:03.911000013
4 2015-04-16 10:01:38.530999899 2018-11-22 12:53:41.529999970
# Check Last Transaction update
print(max(df['Last_Transaction']))
2019-02-01 23:57:57.286000013
# Create new column is_Churn
df.loc[df['Last_Transaction'] <= '2018-08-01', 'is_Churn'] = True 
df.loc[df['Last_Transaction'] > '2018-08-01', 'is_Churn'] = False 
# Remove unnecessary columns
del df['no']
del df['Row_Num']

Clean Dataset

# View first five rows after cleaning data
df.head()
Customer_ID Product First_Transaction Last_Transaction Average_Transaction_Amount Count_Transaction is_Churn
0 29531 Jaket 2016-06-19 02:44:34.395999908 2018-10-05 05:48:02.608000040 1467681 22 False
1 29531 Sepatu 2014-07-23 01:02:11.493999958 2018-12-25 11:02:41.269999981 1269337 41 False
2 141526 Tas 2017-04-28 03:12:27.000000000 2019-01-24 09:40:02.000000000 310915 30 False
3 141526 Jaket 2017-04-28 06:52:52.546999931 2019-01-16 13:00:03.911000013 722632 27 False
4 37545 Sepatu 2015-04-16 10:01:38.530999899 2018-11-22 12:53:41.529999970 1775036 25 False
# View info dataset
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   Customer_ID                 100000 non-null  int64         
 1   Product                     100000 non-null  object        
 2   First_Transaction           100000 non-null  datetime64[ns]
 3   Last_Transaction            100000 non-null  datetime64[ns]
 4   Average_Transaction_Amount  100000 non-null  int64         
 5   Count_Transaction           100000 non-null  int64         
 6   is_Churn                    100000 non-null  object        
dtypes: datetime64[ns](2), int64(3), object(2)
memory usage: 5.3+ MB

Data Visualization

Customer Acquisition By Year

# Create new column Year First Transaction
df['Year_First_Transaction'] = df['First_Transaction'].dt.year
# Create new column Last Transaction
df['Year_Last_Transaction'] = df['Last_Transaction'].dt.year
# Grouping Number of Customers by Year
cust_by_year = df.groupby(['Year_First_Transaction'])['Customer_ID'].count()
cust_by_year
Year_First_Transaction
2013     1007
2014     4954
2015    11235
2016    17656
2017    31828
2018    30327
2019     2993
Name: Customer_ID, dtype: int64
# Plotting number of customers by year
sns.barplot(x = cust_by_year.index, y = cust_by_year, palette='Set2')
plt.title('Graph of Customer Acquisition', fontsize=15, pad=15)
plt.xlabel('Year Transaction', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
Text(0, 0.5, 'Number of Customers')

png

Transaction by Year

# Grouping total Count Transaction by Year
trans_by_year = df.groupby(['Year_First_Transaction'])['Count_Transaction'].sum()
trans_by_year
Year_First_Transaction
2013     23154
2014    165494
2015    297445
2016    278707
2017    299199
2018     99989
2019      5862
Name: Count_Transaction, dtype: int64
# Plotting total transactions by year
sns.barplot(x = trans_by_year.index, y = trans_by_year, palette='Set2')
plt.title('Graph of Transaction Customer', fontsize=15, pad=15)
plt.xlabel('Year Transaction', fontsize=12)
plt.ylabel('Total Transactions', fontsize=12)
Text(0, 0.5, 'Total Transactions')

png

Average Transaction Amount by Year

# Grouping Average Transaction Amount by Product and Year
avg_trans_amount = df.groupby(['Product', 'Year_First_Transaction'])['Average_Transaction_Amount'].mean().reset_index()
avg_trans_amount
Product Year_First_Transaction Average_Transaction_Amount
0 Baju 2017 1.490890e+06
1 Baju 2018 1.570201e+06
2 Baju 2019 1.383645e+06
3 Jaket 2014 1.467937e+06
4 Jaket 2015 1.296265e+06
5 Jaket 2016 1.317344e+06
6 Jaket 2017 1.369034e+06
7 Jaket 2018 1.419074e+06
8 Jaket 2019 1.447536e+06
9 Sepatu 2013 1.396499e+06
10 Sepatu 2014 1.427063e+06
11 Sepatu 2015 1.428235e+06
12 Sepatu 2016 1.425938e+06
13 Sepatu 2017 1.407275e+06
14 Sepatu 2018 1.346824e+06
15 Sepatu 2019 1.338180e+06
16 Tas 2017 1.109583e+06
17 Tas 2018 1.337614e+06
18 Tas 2019 1.287529e+06
# Plotting by Product
sns.pointplot(data = avg_trans_amount, x='Year_First_Transaction', 
              y='Average_Transaction_Amount', hue='Product')
plt.legend(loc='upper right', bbox_to_anchor=(1.15, 1.01))
plt.title('Average Transaction Amount by Year', fontsize=15, pad=15)
plt.xlabel('Year Transaction', fontsize=12)
plt.ylabel('Average Transaction Amount', fontsize=12)
Text(0, 0.5, 'Average Transaction Amount')

png

Churn Customer Proportion

# Create pivot table
churn_prop = df.pivot_table(index='is_Churn', 
                        columns='Product',
                        values='Customer_ID', 
                        aggfunc='count', 
                        fill_value=0)
churn_prop
Product Baju Jaket Sepatu Tas
is_Churn
False 1268 11123 16064 4976
True 2144 23827 33090 7508
# Plotting churn proportion using pie plot
churn_prop.plot.pie(subplots=True, layout=(2,2), autopct='%1.0f%%', 
                    legend=False, title='Churn Proportion by Product')
plt.tight_layout()

png

Distribution Count Transaction

# Create function distribution count transaction
def distribution_count_trans(row):
    if row['Count_Transaction'] == 1:
        val = '1'
    elif (row['Count_Transaction'] > 1 and row['Count_Transaction'] <= 3):
        val ='2 - 3'
    elif (row['Count_Transaction'] > 3 and row['Count_Transaction'] <= 6):
        val ='4 - 6'
    elif (row['Count_Transaction'] > 6 and row['Count_Transaction'] <= 10):
        val ='7 - 10'
    else:
        val ='> 10'
    return val
# Apply function to new column
df['Count_Transaction_Group'] = df.apply(distribution_count_trans, axis=1)
# Grouping Number of Customers by Count Transaaction Group
count_trans_group = df.groupby(['Count_Transaction_Group'])['Customer_ID'].count()
count_trans_group
Count_Transaction_Group
1         49255
2 - 3     14272
4 - 6     12126
7 - 10     2890
> 10      21457
Name: Customer_ID, dtype: int64
# Plotting count transaction group 
sns.barplot(x = count_trans_group.index, y = count_trans_group, palette='Set2')
plt.title('Customer Distribution by Count Transaction Group', fontsize=15, pad=15)
plt.xlabel('Transaction Group', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
Text(0, 0.5, 'Number of Customers')

png

Distribution Average Transaction

# Create function distribution average transaction
def distribution_avg_trans(row):
    if (row['Average_Transaction_Amount'] >= 100000 and row['Average_Transaction_Amount'] <= 200000):
        val ='100.000 - 250.000'
    elif (row['Average_Transaction_Amount'] > 250000 and row['Average_Transaction_Amount'] <= 500000):
        val ='>250.000 - 500.000'
    elif (row['Average_Transaction_Amount'] > 500000 and row['Average_Transaction_Amount'] <= 750000):
        val ='>500.000 - 750.000'
    elif (row['Average_Transaction_Amount'] > 750000 and row['Average_Transaction_Amount'] <= 1000000):
        val ='>750.000 - 1.000.000'
    elif (row['Average_Transaction_Amount'] > 1000000 and row['Average_Transaction_Amount'] <= 2500000):
        val ='>1.000.000 - 2.500.000'
    elif (row['Average_Transaction_Amount'] > 2500000 and row['Average_Transaction_Amount'] <= 5000000):
        val ='>2.500.000 - 5.000.000'
    elif (row['Average_Transaction_Amount'] > 5000000 and row['Average_Transaction_Amount'] <= 10000000):
        val ='>5.000.000 - 10.000.000'
    else:
        val ='>10.000.000'
    return val
# Apply function to new column
df['Average_Transaction_Amount_Group'] = df.apply(distribution_avg_trans, axis=1)
# Grouping Number of Customers by Average Transaction Amount Group
avg_trans_group = df.groupby(['Average_Transaction_Amount_Group'])['Customer_ID'].count()
avg_trans_group
Average_Transaction_Amount_Group
100.000 - 250.000           4912
>1.000.000 - 2.500.000     32819
>10.000.000                 3227
>2.500.000 - 5.000.000      9027
>250.000 - 500.000         18857
>5.000.000 - 10.000.000     3689
>500.000 - 750.000         15171
>750.000 - 1.000.000       12298
Name: Customer_ID, dtype: int64
# Plotting average transaction amount group
orders = ['100.000 - 250.000', '>250.000 - 500.000', '>500.000 - 750.000',
          '>750.000 - 1.000.000', '>1.000.000 - 2.500.000', '>2.500.000 - 5.000.000',
          '>5.000.000 - 10.000.000', '>10.000.000']
sns.barplot(x = avg_trans_group, y = avg_trans_group.index, order=orders, palette='Set2')
plt.title('Customer Distribution by Average Transaction Amount Group', fontsize=15, pad=15)
plt.xlabel('Number of Customers', fontsize=12)
plt.ylabel('Average Transaction Amount Group', fontsize=12)
Text(0, 0.5, 'Average Transaction Amount Group')

png

Data Modelling

Feature Engineering

# Check datatypes
df.dtypes
Customer_ID                                  int64
Product                                     object
First_Transaction                   datetime64[ns]
Last_Transaction                    datetime64[ns]
Average_Transaction_Amount                   int64
Count_Transaction                            int64
is_Churn                                    object
Year_First_Transaction                       int64
Year_Last_Transaction                        int64
Count_Transaction_Group                     object
Average_Transaction_Amount_Group            object
dtype: object
# Copy old dataframe
df_model = df.copy()
# Create new column from First Transaction
df_model['Month_First'] = df_model['First_Transaction'].dt.month
df_model['Day_First'] = df_model['First_Transaction'].dt.day
# Create new column from Last Transaction
df_model['Month_Last'] = df_model['Last_Transaction'].dt.month
df_model['Day_Last'] = df_model['Last_Transaction'].dt.day
# Encoding categorical data
df_model['Count_Transaction_Group'].replace({'1':0, '2 - 3':1, '4 - 6':2, '7 - 10':3,
                                             '> 10':4}, inplace=True)
amounts = {'100.000 - 250.000':0, '>250.000 - 500.000':1, '>500.000 - 750.000':2,
          '>750.000 - 1.000.000':3, '>1.000.000 - 2.500.000':4, '>2.500.000 - 5.000.000':5,
          '>5.000.000 - 10.000.000':6, '>10.000.000':7}
df_model['Average_Transaction_Amount_Group'].replace(amounts, inplace=True)
df_model['is_Churn'].replace({False:0, True:1}, inplace=True)
# Perform one hot encoding
df_encode = pd.get_dummies(df_model['Product'])
# Selected columns
remove_cols = ['Product', 'First_Transaction', 'Last_Transaction', 'Count_Transaction', 'Average_Transaction_Amount']
df_fix = df_model.drop(remove_cols, axis=1)
# Concatenate dataframe
df_model_fix = pd.concat([df_fix, df_encode], axis=1)

# Check new first five rows
df_model_fix.head()
Customer_ID is_Churn Year_First_Transaction Year_Last_Transaction Count_Transaction_Group Average_Transaction_Amount_Group Month_First Day_First Month_Last Day_Last Baju Jaket Sepatu Tas
0 29531 0 2016 2018 4 4 6 19 10 5 0 1 0 0
1 29531 0 2014 2018 4 4 7 23 12 25 0 0 1 0
2 141526 0 2017 2019 4 1 4 28 1 24 0 0 0 1
3 141526 0 2017 2019 4 2 4 28 1 16 0 1 0 0
4 37545 0 2015 2018 4 4 4 16 11 22 0 0 1 0
# Check new data type
df_model_fix.dtypes
Customer_ID                         int64
is_Churn                            int64
Year_First_Transaction              int64
Year_Last_Transaction               int64
Count_Transaction_Group             int64
Average_Transaction_Amount_Group    int64
Month_First                         int64
Day_First                           int64
Month_Last                          int64
Day_Last                            int64
Baju                                uint8
Jaket                               uint8
Sepatu                              uint8
Tas                                 uint8
dtype: object
# View Total Proportion Churn
df_model_fix['is_Churn'].value_counts(normalize=True)
1    0.66569
0    0.33431
Name: is_Churn, dtype: float64

Split Train and Test Data

# Choose feature columns
X = df_model_fix.drop(['is_Churn'], axis=1)
# Choose target columns
y = df_model_fix['is_Churn']

# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

Model Selection

# Create function choose model
def choose_model(model_name):
  # Train model
  model = model_name.fit(X_train, y_train)
  # Scoring Model
  model_score = model.score(X_test, y_test)
  print(f'Score Accuracy is {round(model_score*100, 2)}%')
  # Validating Model
  valid_score = np.mean(cross_val_score(model_name, X_train, y_train, cv=5))
  print(f'Score Validation is {round(valid_score*100, 2)}%')
# Choose model
choose_model(LogisticRegression())
Score Accuracy is 72.19%
Score Validation is 74.93%
# Choose model
choose_model(xgb.XGBClassifier())
Score Accuracy is 100.0%
Score Validation is 100.0%

Model Evaluation

# Function for plot confusion matrix and classification report
def check_matrix_and_reports(model_plot):
  # Churn Prediction 
  train = model_plot.fit(X_train, y_train)
  preds = train.predict(X_test)

  # Create confusion matrix
  class_names = ['Not Churn', 'Churn'] 
  cnf_matrix = confusion_matrix(y_test, preds)
  # Use subplotting
  fig, ax = plt.subplots(figsize=(8,4))
  tick_marks = np.arange(len(class_names))
  plt.xticks(tick_marks, class_names)
  plt.yticks(tick_marks, class_names)
  # Plotting using heatmap
  sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap='YlGnBu', fmt='g')
  plt.title('Confusion matrix', fontsize=15, pad=15)
  plt.xlabel('Actual')
  plt.ylabel('Predicted')
  
  # Create classification report
  reports = classification_report(y_test, preds, target_names=class_names)
  print('Classification Report\n', reports)
# Apply function with Logistic Regression
check_matrix_and_reports(LogisticRegression())
Classification Report
               precision    recall  f1-score   support

   Not Churn       0.70      0.31      0.43      8399
       Churn       0.73      0.93      0.82     16601

    accuracy                           0.72     25000
   macro avg       0.71      0.62      0.62     25000
weighted avg       0.72      0.72      0.69     25000

png

# Apply function with XGBoost Classifier
check_matrix_and_reports(xgb.XGBClassifier())
Classification Report
               precision    recall  f1-score   support

   Not Churn       1.00      1.00      1.00      8399
       Churn       1.00      1.00      1.00     16601

    accuracy                           1.00     25000
   macro avg       1.00      1.00      1.00     25000
weighted avg       1.00      1.00      1.00     25000

png

# Predict probability of churn
model1 = LogisticRegression().fit(X_train, y_train)
log_pred = model1.predict_proba(X_test)[:,1]
model2 = xgb.XGBClassifier().fit(X_train, y_train)
xgb_pred = model2.predict_proba(X_test)[:,1]

# ROC Chart components
fallout_log, sensitivity_log, thresholds_log = roc_curve(y_test, log_pred)
fallout_xgb, sensitivity_xgb, thresholds_xgb = roc_curve(y_test, xgb_pred)

# ROC Chart with both
plt.plot(fallout_log, sensitivity_log, color = 'blue', label='%s' % 'Logistic Regression')
plt.plot(fallout_xgb, sensitivity_xgb, color = 'green', label='%s' % 'XGBoost')
plt.plot([0, 1], [0, 1], linestyle='--', label='%s' % 'Random Prediction')
plt.title("ROC Chart on Probability of Churn")
plt.xlabel('Fall-out')
plt.ylabel('Sensitivity')
plt.legend()
<matplotlib.legend.Legend at 0x7f2d2a3b0f90>

png

# Print the logistic regression AUC with formatting
print("Logistic Regression AUC Score: %0.2f" % roc_auc_score(y_test, log_pred))

# Print the xgboost classifier AUC with formatting
print("XGBoost Classifier AUC Score: %0.2f" % roc_auc_score(y_test, xgb_pred))
Logistic Regression AUC Score: 0.58
XGBoost Classifier AUC Score: 1.00