In [6]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# For display settings
pd.set_option('display.max_columns', None)
# Step 2: Load the Dataset
file_path = '/content/Subscription_Service_Churn_Dataset.csv' # Adjust path if different
df = pd.read_csv(file_path)
# Step 3: Basic Exploration
print("Dataset Shape:", df.shape)
print("\nDataset Columns:\n", df.columns)
print("\nMissing Values:\n", df.isnull().sum())
print("\nData Types:\n", df.dtypes)
# Step 4: Preview the Data
df.head()
Dataset Shape: (963, 21)
Dataset Columns:
Index(['AccountAge', 'MonthlyCharges', 'TotalCharges', 'SubscriptionType',
'PaymentMethod', 'PaperlessBilling', 'ContentType', 'MultiDeviceAccess',
'DeviceRegistered', 'ViewingHoursPerWeek', 'AverageViewingDuration',
'ContentDownloadsPerMonth', 'GenrePreference', 'UserRating',
'SupportTicketsPerMonth', 'Gender', 'WatchlistSize', 'ParentalControl',
'SubtitlesEnabled', 'CustomerID', 'Churn'],
dtype='object')
Missing Values:
AccountAge 0
MonthlyCharges 204
TotalCharges 186
SubscriptionType 50
PaymentMethod 223
PaperlessBilling 0
ContentType 0
MultiDeviceAccess 0
DeviceRegistered 76
ViewingHoursPerWeek 0
AverageViewingDuration 0
ContentDownloadsPerMonth 0
GenrePreference 110
UserRating 261
SupportTicketsPerMonth 0
Gender 40
WatchlistSize 0
ParentalControl 0
SubtitlesEnabled 800
CustomerID 0
Churn 0
dtype: int64
Data Types:
AccountAge int64
MonthlyCharges float64
TotalCharges float64
SubscriptionType object
PaymentMethod object
PaperlessBilling object
ContentType object
MultiDeviceAccess object
DeviceRegistered object
ViewingHoursPerWeek float64
AverageViewingDuration float64
ContentDownloadsPerMonth int64
GenrePreference object
UserRating float64
SupportTicketsPerMonth int64
Gender object
WatchlistSize int64
ParentalControl object
SubtitlesEnabled object
CustomerID object
Churn int64
dtype: object
Out[6]:
| AccountAge | MonthlyCharges | TotalCharges | SubscriptionType | PaymentMethod | PaperlessBilling | ContentType | MultiDeviceAccess | DeviceRegistered | ViewingHoursPerWeek | AverageViewingDuration | ContentDownloadsPerMonth | GenrePreference | UserRating | SupportTicketsPerMonth | Gender | WatchlistSize | ParentalControl | SubtitlesEnabled | CustomerID | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 42 | 11.321950 | 475.521914 | Basic | Electronic check | Yes | Movies | Yes | Tablet | 0.386852 | 24.593361 | 25 | Comedy | 3.489465 | 7 | Female | 15 | No | No | LOHRYYC9E8 | 0 |
| 1 | 95 | 12.810915 | 1217.036887 | Standard | Electronic check | Yes | TV Shows | No | Mobile | 37.123000 | 102.860795 | 21 | Comedy | 2.251860 | 3 | Male | 8 | No | NaN | RIH7RFEZCS | 0 |
| 2 | 6 | NaN | 91.583304 | Standard | Credit card | Yes | TV Shows | No | Tablet | 30.716944 | 10.512415 | 29 | Fantasy | NaN | 4 | Male | 14 | No | Yes | JB5J8X2UY1 | 1 |
| 3 | 54 | 17.917819 | 967.562224 | Basic | NaN | Yes | Movies | No | Mobile | 2.495373 | 45.246834 | 45 | Drama | 1.275004 | 3 | Male | 12 | No | NaN | Z4GZJIB90P | 0 |
| 4 | 27 | NaN | 339.057244 | Basic | Mailed check | No | TV Shows | No | Mobile | 39.936910 | 94.310954 | 44 | Comedy | 3.236230 | 4 | Female | 18 | No | NaN | B3BDJTW0L9 | 0 |
In [7]:
# Step 1: Drop Irrelevant Columns
# 'CustomerID' is just an identifier and doesn't help in prediction
df.drop('CustomerID', axis=1, inplace=True)
# Step 2: Handle Missing Values
# We'll use strategy based on data types and context
# Numeric columns – fill with median
df['MonthlyCharges'].fillna(df['MonthlyCharges'].median(), inplace=True)
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
df['UserRating'].fillna(df['UserRating'].median(), inplace=True)
# Categorical columns – fill with mode (most frequent)
categorical_cols = ['SubscriptionType', 'PaymentMethod', 'DeviceRegistered',
'GenrePreference', 'Gender', 'SubtitlesEnabled']
for col in categorical_cols:
df[col].fillna(df[col].mode()[0], inplace=True)
# Step 3: Convert Categorical Variables into Numerical
# We'll use one-hot encoding for nominal variables
df = pd.get_dummies(df, drop_first=True)
# Step 4: Correlation Matrix to Detect Redundant Features
# High correlation might indicate redundancy
plt.figure(figsize=(14, 10))
sns.heatmap(df.corr(), cmap="coolwarm", annot=False)
plt.title("Feature Correlation Heatmap")
plt.show()
# Step 5: Final Shape Check After Processing
print("Final Dataset Shape:", df.shape)
df.head()
<ipython-input-7-a34dde097ccc>:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df['MonthlyCharges'].fillna(df['MonthlyCharges'].median(), inplace=True)
<ipython-input-7-a34dde097ccc>:10: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
<ipython-input-7-a34dde097ccc>:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df['UserRating'].fillna(df['UserRating'].median(), inplace=True)
<ipython-input-7-a34dde097ccc>:17: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df[col].fillna(df[col].mode()[0], inplace=True)
Final Dataset Shape: (963, 29)
Out[7]:
| AccountAge | MonthlyCharges | TotalCharges | ViewingHoursPerWeek | AverageViewingDuration | ContentDownloadsPerMonth | UserRating | SupportTicketsPerMonth | WatchlistSize | Churn | SubscriptionType_Premium | SubscriptionType_Standard | PaymentMethod_Credit card | PaymentMethod_Electronic check | PaymentMethod_Mailed check | PaperlessBilling_Yes | ContentType_Movies | ContentType_TV Shows | MultiDeviceAccess_Yes | DeviceRegistered_Mobile | DeviceRegistered_TV | DeviceRegistered_Tablet | GenrePreference_Comedy | GenrePreference_Drama | GenrePreference_Fantasy | GenrePreference_Sci-Fi | Gender_Male | ParentalControl_Yes | SubtitlesEnabled_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 42 | 11.321950 | 475.521914 | 0.386852 | 24.593361 | 25 | 3.489465 | 7 | 15 | 0 | False | False | False | True | False | True | True | False | True | False | False | True | True | False | False | False | False | False | False |
| 1 | 95 | 12.810915 | 1217.036887 | 37.123000 | 102.860795 | 21 | 2.251860 | 3 | 8 | 0 | False | True | False | True | False | True | False | True | False | True | False | False | True | False | False | False | True | False | True |
| 2 | 6 | 12.169888 | 91.583304 | 30.716944 | 10.512415 | 29 | 2.898328 | 4 | 14 | 1 | False | True | True | False | False | True | False | True | False | False | False | True | False | False | True | False | True | False | True |
| 3 | 54 | 17.917819 | 967.562224 | 2.495373 | 45.246834 | 45 | 1.275004 | 3 | 12 | 0 | False | False | True | False | False | True | True | False | False | True | False | False | False | True | False | False | True | False | True |
| 4 | 27 | 12.169888 | 339.057244 | 39.936910 | 94.310954 | 44 | 3.236230 | 4 | 18 | 0 | False | False | False | False | True | False | False | True | False | True | False | False | True | False | False | False | False | False | True |
In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter
# Step 1: Check class distribution
print("Original Class Distribution:\n", df['Churn'].value_counts())
# Step 2: Split features and target
X = df.drop('Churn', axis=1).values
y = df['Churn'].values
# Step 3: Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Step 5: Apply SMOTE to training set only
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("\nAfter SMOTE - Resampled Class Distribution:\n", Counter(y_train_res))
Original Class Distribution:
Churn
0 794
1 169
Name: count, dtype: int64
After SMOTE - Resampled Class Distribution:
Counter({np.int64(1): 641, np.int64(0): 641})
In [9]:
# Step 1: Activation Functions and Derivatives
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def sigmoid_derivative(a):
return a * (1 - a)
def relu(z):
return np.maximum(0, z)
def relu_derivative(z):
return (z > 0).astype(float)
# Step 2: Initialize Parameters
def initialize_parameters(input_size, hidden_units):
np.random.seed(42)
W1 = np.random.randn(hidden_units, input_size) * 0.01
b1 = np.zeros((hidden_units, 1))
W2 = np.random.randn(1, hidden_units) * 0.01
b2 = np.zeros((1, 1))
return W1, b1, W2, b2
# Step 3: Forward Propagation
def forward_propagation(X, W1, b1, W2, b2):
Z1 = np.dot(W1, X.T) + b1
A1 = relu(Z1)
Z2 = np.dot(W2, A1) + b2
A2 = sigmoid(Z2)
cache = (Z1, A1, Z2, A2)
return A2, cache
# Step 4: Compute Cost
def compute_cost(A2, Y):
m = Y.shape[0]
logprobs = Y * np.log(A2.T + 1e-8) + (1 - Y) * np.log(1 - A2.T + 1e-8)
cost = -np.sum(logprobs) / m
return cost
# Step 5: Backward Propagation
def backward_propagation(X, Y, cache, W2):
Z1, A1, Z2, A2 = cache
m = X.shape[0]
dZ2 = A2 - Y.reshape(1, m)
dW2 = (1 / m) * np.dot(dZ2, A1.T)
db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)
dA1 = np.dot(W2.T, dZ2)
dZ1 = dA1 * relu_derivative(Z1)
dW1 = (1 / m) * np.dot(dZ1, X)
db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)
return dW1, db1, dW2, db2
# Step 6: Update Parameters
def update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
W1 -= learning_rate * dW1
b1 -= learning_rate * db1
W2 -= learning_rate * dW2
b2 -= learning_rate * db2
return W1, b1, W2, b2
In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# Training Function
def train_shallow_nn(X_train, y_train, X_test, y_test, hidden_units=10, learning_rate=0.01, epochs=1000):
input_size = X_train.shape[1]
W1, b1, W2, b2 = initialize_parameters(input_size, hidden_units)
costs = []
for epoch in range(epochs):
# Forward pass
A2, cache = forward_propagation(X_train, W1, b1, W2, b2)
cost = compute_cost(A2, y_train)
costs.append(cost)
# Backward pass
dW1, db1, dW2, db2 = backward_propagation(X_train, y_train, cache, W2)
# Update weights
W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)
if epoch % 100 == 0:
print(f"Epoch {epoch} - Cost: {cost:.4f}")
# Final prediction on test set
A2_test, _ = forward_propagation(X_test, W1, b1, W2, b2)
y_pred = (A2_test > 0.5).astype(int).flatten()
return y_pred, A2_test.flatten(), costs, W1
# Run Training
y_pred, y_probs, cost_history, W1 = train_shallow_nn(X_train_res, y_train_res, X_test, y_test, hidden_units=10, learning_rate=0.01, epochs=1000)
# Evaluate Model
print("\nEvaluation on Test Set:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("AUC-ROC:", roc_auc_score(y_test, y_probs))
# Plot training loss
plt.plot(cost_history)
plt.title("Training Loss Over Epochs")
plt.xlabel("Epochs")
plt.ylabel("Cost")
plt.grid(True)
plt.show()
Epoch 0 - Cost: 888.6149 Epoch 100 - Cost: 888.6149 Epoch 200 - Cost: 888.6150 Epoch 300 - Cost: 888.6151 Epoch 400 - Cost: 888.6154 Epoch 500 - Cost: 888.6159 Epoch 600 - Cost: 888.6167 Epoch 700 - Cost: 888.6183 Epoch 800 - Cost: 888.6215 Epoch 900 - Cost: 888.6281 Evaluation on Test Set: Accuracy: 0.6321243523316062 Precision: 0.3368421052631579 Recall: 0.8 F1 Score: 0.4740740740740741 AUC-ROC: 0.7565359477124183
In [17]:
# Interpretability via First Layer Weights
feature_names = df.drop('Churn', axis=1).columns
# Average of absolute weights from input to hidden layer
importance_scores = np.mean(np.abs(W1), axis=0)
# Create DataFrame for visualization
importance_df = pd.DataFrame({
'Feature': feature_names,
'Importance': importance_scores
}).sort_values(by='Importance', ascending=False)
# Plot Top 15 Important Features
plt.figure(figsize=(10,6))
sns.barplot(data=importance_df.head(15), x='Importance', y='Feature')
plt.title("Top 15 Important Features (Shallow NN)")
plt.xlabel("Average Absolute Weight")
plt.ylabel("Feature")
plt.grid(True)
plt.tight_layout()
plt.show()
In [18]:
# Final optimized configuration (based on previous tuning results)
best_hidden_units = 20
best_learning_rate = 0.05
# Train the model with best config
y_pred_final, y_probs_final, cost_history_final, _ = train_shallow_nn(
X_train_res, y_train_res,
X_test, y_test,
hidden_units=best_hidden_units,
learning_rate=best_learning_rate,
epochs=1000
)
# Final evaluation metrics
from sklearn.metrics import classification_report, roc_auc_score
print("\n--- Final Optimized Model Evaluation ---\n")
print(classification_report(y_test, y_pred_final, digits=4))
print("AUC-ROC Score:", roc_auc_score(y_test, y_probs_final))
# Plot final cost curve
plt.plot(cost_history_final)
plt.title("Final Model - Training Loss Curve")
plt.xlabel("Epochs")
plt.ylabel("Cost")
plt.grid(True)
plt.show()
Epoch 0 - Cost: 888.6150
Epoch 100 - Cost: 888.6161
Epoch 200 - Cost: 888.6531
Epoch 300 - Cost: 889.8260
Epoch 400 - Cost: 906.6113
Epoch 500 - Cost: 960.3606
Epoch 600 - Cost: 1028.1502
Epoch 700 - Cost: 1086.5927
Epoch 800 - Cost: 1130.2906
Epoch 900 - Cost: 1163.7461
--- Final Optimized Model Evaluation ---
precision recall f1-score support
0 0.9256 0.7320 0.8175 153
1 0.4306 0.7750 0.5536 40
accuracy 0.7409 193
macro avg 0.6781 0.7535 0.6855 193
weighted avg 0.8230 0.7409 0.7628 193
AUC-ROC Score: 0.7988562091503268
In [30]:
%%shell
jupyter nbconvert --to html /content/cchurn.ipynb
[NbConvertApp] Converting notebook /content/cchurn.ipynb to html [NbConvertApp] WARNING | Alternative text is missing on 4 image(s). [NbConvertApp] Writing 737259 bytes to /content/cchurn.html
Out[30]: