# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For display settings
pd.set_option('display.max_columns', None)

# Step 2: Load the Dataset
file_path = '/content/Subscription_Service_Churn_Dataset.csv'  # Adjust path if different
df = pd.read_csv(file_path)

# Step 3: Basic Exploration
print("Dataset Shape:", df.shape)
print("\nDataset Columns:\n", df.columns)
print("\nMissing Values:\n", df.isnull().sum())
print("\nData Types:\n", df.dtypes)

# Step 4: Preview the Data
df.head()

Dataset Shape: (963, 21)

Dataset Columns:
 Index(['AccountAge', 'MonthlyCharges', 'TotalCharges', 'SubscriptionType',
       'PaymentMethod', 'PaperlessBilling', 'ContentType', 'MultiDeviceAccess',
       'DeviceRegistered', 'ViewingHoursPerWeek', 'AverageViewingDuration',
       'ContentDownloadsPerMonth', 'GenrePreference', 'UserRating',
       'SupportTicketsPerMonth', 'Gender', 'WatchlistSize', 'ParentalControl',
       'SubtitlesEnabled', 'CustomerID', 'Churn'],
      dtype='object')

Missing Values:
 AccountAge                    0
MonthlyCharges              204
TotalCharges                186
SubscriptionType             50
PaymentMethod               223
PaperlessBilling              0
ContentType                   0
MultiDeviceAccess             0
DeviceRegistered             76
ViewingHoursPerWeek           0
AverageViewingDuration        0
ContentDownloadsPerMonth      0
GenrePreference             110
UserRating                  261
SupportTicketsPerMonth        0
Gender                       40
WatchlistSize                 0
ParentalControl               0
SubtitlesEnabled            800
CustomerID                    0
Churn                         0
dtype: int64

Data Types:
 AccountAge                    int64
MonthlyCharges              float64
TotalCharges                float64
SubscriptionType             object
PaymentMethod                object
PaperlessBilling             object
ContentType                  object
MultiDeviceAccess            object
DeviceRegistered             object
ViewingHoursPerWeek         float64
AverageViewingDuration      float64
ContentDownloadsPerMonth      int64
GenrePreference              object
UserRating                  float64
SupportTicketsPerMonth        int64
Gender                       object
WatchlistSize                 int64
ParentalControl              object
SubtitlesEnabled             object
CustomerID                   object
Churn                         int64
dtype: object

# Step 1: Drop Irrelevant Columns
# 'CustomerID' is just an identifier and doesn't help in prediction
df.drop('CustomerID', axis=1, inplace=True)

# Step 2: Handle Missing Values
# We'll use strategy based on data types and context

# Numeric columns – fill with median
df['MonthlyCharges'].fillna(df['MonthlyCharges'].median(), inplace=True)
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
df['UserRating'].fillna(df['UserRating'].median(), inplace=True)

# Categorical columns – fill with mode (most frequent)
categorical_cols = ['SubscriptionType', 'PaymentMethod', 'DeviceRegistered',
                    'GenrePreference', 'Gender', 'SubtitlesEnabled']
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Step 3: Convert Categorical Variables into Numerical
# We'll use one-hot encoding for nominal variables
df = pd.get_dummies(df, drop_first=True)

# Step 4: Correlation Matrix to Detect Redundant Features
# High correlation might indicate redundancy
plt.figure(figsize=(14, 10))
sns.heatmap(df.corr(), cmap="coolwarm", annot=False)
plt.title("Feature Correlation Heatmap")
plt.show()

# Step 5: Final Shape Check After Processing
print("Final Dataset Shape:", df.shape)
df.head()

<ipython-input-7-a34dde097ccc>:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['MonthlyCharges'].fillna(df['MonthlyCharges'].median(), inplace=True)
<ipython-input-7-a34dde097ccc>:10: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
<ipython-input-7-a34dde097ccc>:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['UserRating'].fillna(df['UserRating'].median(), inplace=True)
<ipython-input-7-a34dde097ccc>:17: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)

Final Dataset Shape: (963, 29)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

# Step 1: Check class distribution
print("Original Class Distribution:\n", df['Churn'].value_counts())

# Step 2: Split features and target
X = df.drop('Churn', axis=1).values
y = df['Churn'].values

# Step 3: Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 5: Apply SMOTE to training set only
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("\nAfter SMOTE - Resampled Class Distribution:\n", Counter(y_train_res))

Original Class Distribution:
 Churn
0    794
1    169
Name: count, dtype: int64

After SMOTE - Resampled Class Distribution:
 Counter({np.int64(1): 641, np.int64(0): 641})

# Step 1: Activation Functions and Derivatives
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_derivative(a):
    return a * (1 - a)

def relu(z):
    return np.maximum(0, z)

def relu_derivative(z):
    return (z > 0).astype(float)

# Step 2: Initialize Parameters
def initialize_parameters(input_size, hidden_units):
    np.random.seed(42)
    W1 = np.random.randn(hidden_units, input_size) * 0.01
    b1 = np.zeros((hidden_units, 1))
    W2 = np.random.randn(1, hidden_units) * 0.01
    b2 = np.zeros((1, 1))
    return W1, b1, W2, b2

# Step 3: Forward Propagation
def forward_propagation(X, W1, b1, W2, b2):
    Z1 = np.dot(W1, X.T) + b1
    A1 = relu(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = sigmoid(Z2)
    cache = (Z1, A1, Z2, A2)
    return A2, cache

# Step 4: Compute Cost
def compute_cost(A2, Y):
    m = Y.shape[0]
    logprobs = Y * np.log(A2.T + 1e-8) + (1 - Y) * np.log(1 - A2.T + 1e-8)
    cost = -np.sum(logprobs) / m
    return cost

# Step 5: Backward Propagation
def backward_propagation(X, Y, cache, W2):
    Z1, A1, Z2, A2 = cache
    m = X.shape[0]

    dZ2 = A2 - Y.reshape(1, m)
    dW2 = (1 / m) * np.dot(dZ2, A1.T)
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = dA1 * relu_derivative(Z1)
    dW1 = (1 / m) * np.dot(dZ1, X)
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    return dW1, db1, dW2, db2

# Step 6: Update Parameters
def update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    W1 -= learning_rate * dW1
    b1 -= learning_rate * db1
    W2 -= learning_rate * dW2
    b2 -= learning_rate * db2
    return W1, b1, W2, b2

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Training Function
def train_shallow_nn(X_train, y_train, X_test, y_test, hidden_units=10, learning_rate=0.01, epochs=1000):
    input_size = X_train.shape[1]
    W1, b1, W2, b2 = initialize_parameters(input_size, hidden_units)
    costs = []

    for epoch in range(epochs):
        # Forward pass
        A2, cache = forward_propagation(X_train, W1, b1, W2, b2)
        cost = compute_cost(A2, y_train)
        costs.append(cost)

        # Backward pass
        dW1, db1, dW2, db2 = backward_propagation(X_train, y_train, cache, W2)

        # Update weights
        W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)

        if epoch % 100 == 0:
            print(f"Epoch {epoch} - Cost: {cost:.4f}")

    # Final prediction on test set
    A2_test, _ = forward_propagation(X_test, W1, b1, W2, b2)
    y_pred = (A2_test > 0.5).astype(int).flatten()

    return y_pred, A2_test.flatten(), costs, W1

# Run Training
y_pred, y_probs, cost_history, W1 = train_shallow_nn(X_train_res, y_train_res, X_test, y_test, hidden_units=10, learning_rate=0.01, epochs=1000)


# Evaluate Model
print("\nEvaluation on Test Set:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("AUC-ROC:", roc_auc_score(y_test, y_probs))

# Plot training loss
plt.plot(cost_history)
plt.title("Training Loss Over Epochs")
plt.xlabel("Epochs")
plt.ylabel("Cost")
plt.grid(True)
plt.show()

Epoch 0 - Cost: 888.6149
Epoch 100 - Cost: 888.6149
Epoch 200 - Cost: 888.6150
Epoch 300 - Cost: 888.6151
Epoch 400 - Cost: 888.6154
Epoch 500 - Cost: 888.6159
Epoch 600 - Cost: 888.6167
Epoch 700 - Cost: 888.6183
Epoch 800 - Cost: 888.6215
Epoch 900 - Cost: 888.6281

Evaluation on Test Set:
Accuracy: 0.6321243523316062
Precision: 0.3368421052631579
Recall: 0.8
F1 Score: 0.4740740740740741
AUC-ROC: 0.7565359477124183

# Interpretability via First Layer Weights
feature_names = df.drop('Churn', axis=1).columns

# Average of absolute weights from input to hidden layer
importance_scores = np.mean(np.abs(W1), axis=0)

# Create DataFrame for visualization
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance_scores
}).sort_values(by='Importance', ascending=False)

# Plot Top 15 Important Features
plt.figure(figsize=(10,6))
sns.barplot(data=importance_df.head(15), x='Importance', y='Feature')
plt.title("Top 15 Important Features (Shallow NN)")
plt.xlabel("Average Absolute Weight")
plt.ylabel("Feature")
plt.grid(True)
plt.tight_layout()
plt.show()

# Final optimized configuration (based on previous tuning results)
best_hidden_units = 20
best_learning_rate = 0.05

# Train the model with best config
y_pred_final, y_probs_final, cost_history_final, _ = train_shallow_nn(
    X_train_res, y_train_res,
    X_test, y_test,
    hidden_units=best_hidden_units,
    learning_rate=best_learning_rate,
    epochs=1000
)

# Final evaluation metrics
from sklearn.metrics import classification_report, roc_auc_score

print("\n--- Final Optimized Model Evaluation ---\n")
print(classification_report(y_test, y_pred_final, digits=4))
print("AUC-ROC Score:", roc_auc_score(y_test, y_probs_final))

# Plot final cost curve
plt.plot(cost_history_final)
plt.title("Final Model - Training Loss Curve")
plt.xlabel("Epochs")
plt.ylabel("Cost")
plt.grid(True)
plt.show()

Epoch 0 - Cost: 888.6150
Epoch 100 - Cost: 888.6161
Epoch 200 - Cost: 888.6531
Epoch 300 - Cost: 889.8260
Epoch 400 - Cost: 906.6113
Epoch 500 - Cost: 960.3606
Epoch 600 - Cost: 1028.1502
Epoch 700 - Cost: 1086.5927
Epoch 800 - Cost: 1130.2906
Epoch 900 - Cost: 1163.7461

--- Final Optimized Model Evaluation ---

              precision    recall  f1-score   support

           0     0.9256    0.7320    0.8175       153
           1     0.4306    0.7750    0.5536        40

    accuracy                         0.7409       193
   macro avg     0.6781    0.7535    0.6855       193
weighted avg     0.8230    0.7409    0.7628       193

AUC-ROC Score: 0.7988562091503268

%%shell
jupyter nbconvert --to html /content/cchurn.ipynb

[NbConvertApp] Converting notebook /content/cchurn.ipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 4 image(s).
[NbConvertApp] Writing 737259 bytes to /content/cchurn.html

	AccountAge	MonthlyCharges	TotalCharges	SubscriptionType	PaymentMethod	PaperlessBilling	ContentType	MultiDeviceAccess	DeviceRegistered	ViewingHoursPerWeek	AverageViewingDuration	ContentDownloadsPerMonth	GenrePreference	UserRating	SupportTicketsPerMonth	Gender	WatchlistSize	ParentalControl	SubtitlesEnabled	CustomerID	Churn
0	42	11.321950	475.521914	Basic	Electronic check	Yes	Movies	Yes	Tablet	0.386852	24.593361	25	Comedy	3.489465	7	Female	15	No	No	LOHRYYC9E8	0
1	95	12.810915	1217.036887	Standard	Electronic check	Yes	TV Shows	No	Mobile	37.123000	102.860795	21	Comedy	2.251860	3	Male	8	No	NaN	RIH7RFEZCS	0
2	6	NaN	91.583304	Standard	Credit card	Yes	TV Shows	No	Tablet	30.716944	10.512415	29	Fantasy	NaN	4	Male	14	No	Yes	JB5J8X2UY1	1
3	54	17.917819	967.562224	Basic	NaN	Yes	Movies	No	Mobile	2.495373	45.246834	45	Drama	1.275004	3	Male	12	No	NaN	Z4GZJIB90P	0
4	27	NaN	339.057244	Basic	Mailed check	No	TV Shows	No	Mobile	39.936910	94.310954	44	Comedy	3.236230	4	Female	18	No	NaN	B3BDJTW0L9	0

	AccountAge	MonthlyCharges	TotalCharges	ViewingHoursPerWeek	AverageViewingDuration	ContentDownloadsPerMonth	UserRating	SupportTicketsPerMonth	WatchlistSize	Churn	SubscriptionType_Premium	SubscriptionType_Standard	PaymentMethod_Credit card	PaymentMethod_Electronic check	PaymentMethod_Mailed check	PaperlessBilling_Yes	ContentType_Movies	ContentType_TV Shows	MultiDeviceAccess_Yes	DeviceRegistered_Mobile	DeviceRegistered_TV	DeviceRegistered_Tablet	GenrePreference_Comedy	GenrePreference_Drama	GenrePreference_Fantasy	GenrePreference_Sci-Fi	Gender_Male	ParentalControl_Yes	SubtitlesEnabled_Yes
0	42	11.321950	475.521914	0.386852	24.593361	25	3.489465	7	15	0	False	False	False	True	False	True	True	False	True	False	False	True	True	False	False	False	False	False	False
1	95	12.810915	1217.036887	37.123000	102.860795	21	2.251860	3	8	0	False	True	False	True	False	True	False	True	False	True	False	False	True	False	False	False	True	False	True
2	6	12.169888	91.583304	30.716944	10.512415	29	2.898328	4	14	1	False	True	True	False	False	True	False	True	False	False	False	True	False	False	True	False	True	False	True
3	54	17.917819	967.562224	2.495373	45.246834	45	1.275004	3	12	0	False	False	True	False	False	True	True	False	False	True	False	False	False	True	False	False	True	False	True
4	27	12.169888	339.057244	39.936910	94.310954	44	3.236230	4	18	0	False	False	False	False	True	False	False	True	False	True	False	False	True	False	False	False	False	False	True