In [6]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For display settings
pd.set_option('display.max_columns', None)

# Step 2: Load the Dataset
file_path = '/content/Subscription_Service_Churn_Dataset.csv'  # Adjust path if different
df = pd.read_csv(file_path)

# Step 3: Basic Exploration
print("Dataset Shape:", df.shape)
print("\nDataset Columns:\n", df.columns)
print("\nMissing Values:\n", df.isnull().sum())
print("\nData Types:\n", df.dtypes)

# Step 4: Preview the Data
df.head()
Dataset Shape: (963, 21)

Dataset Columns:
 Index(['AccountAge', 'MonthlyCharges', 'TotalCharges', 'SubscriptionType',
       'PaymentMethod', 'PaperlessBilling', 'ContentType', 'MultiDeviceAccess',
       'DeviceRegistered', 'ViewingHoursPerWeek', 'AverageViewingDuration',
       'ContentDownloadsPerMonth', 'GenrePreference', 'UserRating',
       'SupportTicketsPerMonth', 'Gender', 'WatchlistSize', 'ParentalControl',
       'SubtitlesEnabled', 'CustomerID', 'Churn'],
      dtype='object')

Missing Values:
 AccountAge                    0
MonthlyCharges              204
TotalCharges                186
SubscriptionType             50
PaymentMethod               223
PaperlessBilling              0
ContentType                   0
MultiDeviceAccess             0
DeviceRegistered             76
ViewingHoursPerWeek           0
AverageViewingDuration        0
ContentDownloadsPerMonth      0
GenrePreference             110
UserRating                  261
SupportTicketsPerMonth        0
Gender                       40
WatchlistSize                 0
ParentalControl               0
SubtitlesEnabled            800
CustomerID                    0
Churn                         0
dtype: int64

Data Types:
 AccountAge                    int64
MonthlyCharges              float64
TotalCharges                float64
SubscriptionType             object
PaymentMethod                object
PaperlessBilling             object
ContentType                  object
MultiDeviceAccess            object
DeviceRegistered             object
ViewingHoursPerWeek         float64
AverageViewingDuration      float64
ContentDownloadsPerMonth      int64
GenrePreference              object
UserRating                  float64
SupportTicketsPerMonth        int64
Gender                       object
WatchlistSize                 int64
ParentalControl              object
SubtitlesEnabled             object
CustomerID                   object
Churn                         int64
dtype: object
Out[6]:
AccountAge MonthlyCharges TotalCharges SubscriptionType PaymentMethod PaperlessBilling ContentType MultiDeviceAccess DeviceRegistered ViewingHoursPerWeek AverageViewingDuration ContentDownloadsPerMonth GenrePreference UserRating SupportTicketsPerMonth Gender WatchlistSize ParentalControl SubtitlesEnabled CustomerID Churn
0 42 11.321950 475.521914 Basic Electronic check Yes Movies Yes Tablet 0.386852 24.593361 25 Comedy 3.489465 7 Female 15 No No LOHRYYC9E8 0
1 95 12.810915 1217.036887 Standard Electronic check Yes TV Shows No Mobile 37.123000 102.860795 21 Comedy 2.251860 3 Male 8 No NaN RIH7RFEZCS 0
2 6 NaN 91.583304 Standard Credit card Yes TV Shows No Tablet 30.716944 10.512415 29 Fantasy NaN 4 Male 14 No Yes JB5J8X2UY1 1
3 54 17.917819 967.562224 Basic NaN Yes Movies No Mobile 2.495373 45.246834 45 Drama 1.275004 3 Male 12 No NaN Z4GZJIB90P 0
4 27 NaN 339.057244 Basic Mailed check No TV Shows No Mobile 39.936910 94.310954 44 Comedy 3.236230 4 Female 18 No NaN B3BDJTW0L9 0
In [7]:
# Step 1: Drop Irrelevant Columns
# 'CustomerID' is just an identifier and doesn't help in prediction
df.drop('CustomerID', axis=1, inplace=True)

# Step 2: Handle Missing Values
# We'll use strategy based on data types and context

# Numeric columns – fill with median
df['MonthlyCharges'].fillna(df['MonthlyCharges'].median(), inplace=True)
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
df['UserRating'].fillna(df['UserRating'].median(), inplace=True)

# Categorical columns – fill with mode (most frequent)
categorical_cols = ['SubscriptionType', 'PaymentMethod', 'DeviceRegistered',
                    'GenrePreference', 'Gender', 'SubtitlesEnabled']
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Step 3: Convert Categorical Variables into Numerical
# We'll use one-hot encoding for nominal variables
df = pd.get_dummies(df, drop_first=True)

# Step 4: Correlation Matrix to Detect Redundant Features
# High correlation might indicate redundancy
plt.figure(figsize=(14, 10))
sns.heatmap(df.corr(), cmap="coolwarm", annot=False)
plt.title("Feature Correlation Heatmap")
plt.show()

# Step 5: Final Shape Check After Processing
print("Final Dataset Shape:", df.shape)
df.head()
<ipython-input-7-a34dde097ccc>:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['MonthlyCharges'].fillna(df['MonthlyCharges'].median(), inplace=True)
<ipython-input-7-a34dde097ccc>:10: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
<ipython-input-7-a34dde097ccc>:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['UserRating'].fillna(df['UserRating'].median(), inplace=True)
<ipython-input-7-a34dde097ccc>:17: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
No description has been provided for this image
Final Dataset Shape: (963, 29)
Out[7]:
AccountAge MonthlyCharges TotalCharges ViewingHoursPerWeek AverageViewingDuration ContentDownloadsPerMonth UserRating SupportTicketsPerMonth WatchlistSize Churn SubscriptionType_Premium SubscriptionType_Standard PaymentMethod_Credit card PaymentMethod_Electronic check PaymentMethod_Mailed check PaperlessBilling_Yes ContentType_Movies ContentType_TV Shows MultiDeviceAccess_Yes DeviceRegistered_Mobile DeviceRegistered_TV DeviceRegistered_Tablet GenrePreference_Comedy GenrePreference_Drama GenrePreference_Fantasy GenrePreference_Sci-Fi Gender_Male ParentalControl_Yes SubtitlesEnabled_Yes
0 42 11.321950 475.521914 0.386852 24.593361 25 3.489465 7 15 0 False False False True False True True False True False False True True False False False False False False
1 95 12.810915 1217.036887 37.123000 102.860795 21 2.251860 3 8 0 False True False True False True False True False True False False True False False False True False True
2 6 12.169888 91.583304 30.716944 10.512415 29 2.898328 4 14 1 False True True False False True False True False False False True False False True False True False True
3 54 17.917819 967.562224 2.495373 45.246834 45 1.275004 3 12 0 False False True False False True True False False True False False False True False False True False True
4 27 12.169888 339.057244 39.936910 94.310954 44 3.236230 4 18 0 False False False False True False False True False True False False True False False False False False True
In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

# Step 1: Check class distribution
print("Original Class Distribution:\n", df['Churn'].value_counts())

# Step 2: Split features and target
X = df.drop('Churn', axis=1).values
y = df['Churn'].values

# Step 3: Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 5: Apply SMOTE to training set only
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("\nAfter SMOTE - Resampled Class Distribution:\n", Counter(y_train_res))
Original Class Distribution:
 Churn
0    794
1    169
Name: count, dtype: int64

After SMOTE - Resampled Class Distribution:
 Counter({np.int64(1): 641, np.int64(0): 641})
In [9]:
# Step 1: Activation Functions and Derivatives
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_derivative(a):
    return a * (1 - a)

def relu(z):
    return np.maximum(0, z)

def relu_derivative(z):
    return (z > 0).astype(float)

# Step 2: Initialize Parameters
def initialize_parameters(input_size, hidden_units):
    np.random.seed(42)
    W1 = np.random.randn(hidden_units, input_size) * 0.01
    b1 = np.zeros((hidden_units, 1))
    W2 = np.random.randn(1, hidden_units) * 0.01
    b2 = np.zeros((1, 1))
    return W1, b1, W2, b2

# Step 3: Forward Propagation
def forward_propagation(X, W1, b1, W2, b2):
    Z1 = np.dot(W1, X.T) + b1
    A1 = relu(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = sigmoid(Z2)
    cache = (Z1, A1, Z2, A2)
    return A2, cache

# Step 4: Compute Cost
def compute_cost(A2, Y):
    m = Y.shape[0]
    logprobs = Y * np.log(A2.T + 1e-8) + (1 - Y) * np.log(1 - A2.T + 1e-8)
    cost = -np.sum(logprobs) / m
    return cost

# Step 5: Backward Propagation
def backward_propagation(X, Y, cache, W2):
    Z1, A1, Z2, A2 = cache
    m = X.shape[0]

    dZ2 = A2 - Y.reshape(1, m)
    dW2 = (1 / m) * np.dot(dZ2, A1.T)
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = dA1 * relu_derivative(Z1)
    dW1 = (1 / m) * np.dot(dZ1, X)
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    return dW1, db1, dW2, db2

# Step 6: Update Parameters
def update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    W1 -= learning_rate * dW1
    b1 -= learning_rate * db1
    W2 -= learning_rate * dW2
    b2 -= learning_rate * db2
    return W1, b1, W2, b2
In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Training Function
def train_shallow_nn(X_train, y_train, X_test, y_test, hidden_units=10, learning_rate=0.01, epochs=1000):
    input_size = X_train.shape[1]
    W1, b1, W2, b2 = initialize_parameters(input_size, hidden_units)
    costs = []

    for epoch in range(epochs):
        # Forward pass
        A2, cache = forward_propagation(X_train, W1, b1, W2, b2)
        cost = compute_cost(A2, y_train)
        costs.append(cost)

        # Backward pass
        dW1, db1, dW2, db2 = backward_propagation(X_train, y_train, cache, W2)

        # Update weights
        W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)

        if epoch % 100 == 0:
            print(f"Epoch {epoch} - Cost: {cost:.4f}")

    # Final prediction on test set
    A2_test, _ = forward_propagation(X_test, W1, b1, W2, b2)
    y_pred = (A2_test > 0.5).astype(int).flatten()

    return y_pred, A2_test.flatten(), costs, W1

# Run Training
y_pred, y_probs, cost_history, W1 = train_shallow_nn(X_train_res, y_train_res, X_test, y_test, hidden_units=10, learning_rate=0.01, epochs=1000)


# Evaluate Model
print("\nEvaluation on Test Set:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("AUC-ROC:", roc_auc_score(y_test, y_probs))

# Plot training loss
plt.plot(cost_history)
plt.title("Training Loss Over Epochs")
plt.xlabel("Epochs")
plt.ylabel("Cost")
plt.grid(True)
plt.show()
Epoch 0 - Cost: 888.6149
Epoch 100 - Cost: 888.6149
Epoch 200 - Cost: 888.6150
Epoch 300 - Cost: 888.6151
Epoch 400 - Cost: 888.6154
Epoch 500 - Cost: 888.6159
Epoch 600 - Cost: 888.6167
Epoch 700 - Cost: 888.6183
Epoch 800 - Cost: 888.6215
Epoch 900 - Cost: 888.6281

Evaluation on Test Set:
Accuracy: 0.6321243523316062
Precision: 0.3368421052631579
Recall: 0.8
F1 Score: 0.4740740740740741
AUC-ROC: 0.7565359477124183
No description has been provided for this image
In [17]:
# Interpretability via First Layer Weights
feature_names = df.drop('Churn', axis=1).columns

# Average of absolute weights from input to hidden layer
importance_scores = np.mean(np.abs(W1), axis=0)

# Create DataFrame for visualization
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance_scores
}).sort_values(by='Importance', ascending=False)

# Plot Top 15 Important Features
plt.figure(figsize=(10,6))
sns.barplot(data=importance_df.head(15), x='Importance', y='Feature')
plt.title("Top 15 Important Features (Shallow NN)")
plt.xlabel("Average Absolute Weight")
plt.ylabel("Feature")
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [18]:
# Final optimized configuration (based on previous tuning results)
best_hidden_units = 20
best_learning_rate = 0.05

# Train the model with best config
y_pred_final, y_probs_final, cost_history_final, _ = train_shallow_nn(
    X_train_res, y_train_res,
    X_test, y_test,
    hidden_units=best_hidden_units,
    learning_rate=best_learning_rate,
    epochs=1000
)

# Final evaluation metrics
from sklearn.metrics import classification_report, roc_auc_score

print("\n--- Final Optimized Model Evaluation ---\n")
print(classification_report(y_test, y_pred_final, digits=4))
print("AUC-ROC Score:", roc_auc_score(y_test, y_probs_final))

# Plot final cost curve
plt.plot(cost_history_final)
plt.title("Final Model - Training Loss Curve")
plt.xlabel("Epochs")
plt.ylabel("Cost")
plt.grid(True)
plt.show()
Epoch 0 - Cost: 888.6150
Epoch 100 - Cost: 888.6161
Epoch 200 - Cost: 888.6531
Epoch 300 - Cost: 889.8260
Epoch 400 - Cost: 906.6113
Epoch 500 - Cost: 960.3606
Epoch 600 - Cost: 1028.1502
Epoch 700 - Cost: 1086.5927
Epoch 800 - Cost: 1130.2906
Epoch 900 - Cost: 1163.7461

--- Final Optimized Model Evaluation ---

              precision    recall  f1-score   support

           0     0.9256    0.7320    0.8175       153
           1     0.4306    0.7750    0.5536        40

    accuracy                         0.7409       193
   macro avg     0.6781    0.7535    0.6855       193
weighted avg     0.8230    0.7409    0.7628       193

AUC-ROC Score: 0.7988562091503268
No description has been provided for this image
In [30]:
%%shell
jupyter nbconvert --to html /content/cchurn.ipynb
[NbConvertApp] Converting notebook /content/cchurn.ipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 4 image(s).
[NbConvertApp] Writing 737259 bytes to /content/cchurn.html
Out[30]: