Technical Blog
Building a Customer Churn Model: From Raw Data to Production
A complete walkthrough of developing and deploying a churn prediction model — data prep, training, tuning, evaluation, and production deployment.
Customer churn prediction is one of the most impactful ML use cases in business. A well-built model can save millions by identifying at-risk customers before they leave.
In this post, I'll walk through the end-to-end pipeline I typically use — from messy raw data to a model running in production.
1. Data Preparation, Cleaning & Processing
Before any model can learn, we need clean, well-structured data. This phase usually takes 60–70% of the total project time.
Common data issues
- Missing values: customer records with null fields (e.g., no last login date).
- Duplicates: same customer appearing multiple times.
- Inconsistent formats: dates as strings, mixed encodings, etc.
- Class imbalance: churned customers are often 5–10% of the dataset.
A minimal preprocessing pipeline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
def load_and_clean(path: str) -> pd.DataFrame:
    """Load the raw customer CSV and return a cleaned, model-ready DataFrame.

    Steps:
      1. Drop duplicate customers (first occurrence wins).
      2. Impute missing tenure (median) and monthly_charges (mean).
      3. Integer-encode the categorical columns.

    Args:
        path: Path to the raw CSV file.

    Returns:
        Cleaned DataFrame with categorical columns encoded as int64 codes.
    """
    df = pd.read_csv(path)

    # The same customer can appear multiple times in raw exports.
    df = df.drop_duplicates(subset=["customer_id"])

    # Assign back instead of calling `inplace=True` on a selected column:
    # chained inplace fillna emits a FutureWarning on pandas >= 2.1 and
    # silently stops modifying the frame under copy-on-write (pandas 3.0).
    df["tenure"] = df["tenure"].fillna(df["tenure"].median())
    df["monthly_charges"] = df["monthly_charges"].fillna(df["monthly_charges"].mean())

    # Integer-encode categoricals with sorted category codes. This produces
    # exactly what LabelEncoder would (its classes_ are sorted), without
    # fitting a scikit-learn transformer per column — LabelEncoder is
    # documented as intended for *target* labels, not features. Cast to
    # int64 to match LabelEncoder's output dtype.
    label_cols = ["contract_type", "payment_method", "internet_service"]
    for col in label_cols:
        df[col] = df[col].astype(str).astype("category").cat.codes.astype("int64")
    return df
def prepare_features(df: pd.DataFrame):
feature_cols = [
"tenure",
"monthly_charges",
"total_charges",
"contract_type",
"payment_method",
"internet_service",
"num_support_tickets",
]
X = df[feature_cols]
y = df["churned"].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, stratify=y, random_state=42
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
return X_train_scaled, X_test_scaled, y_train, y_test, scalerKey decisions:
- Use stratified split to preserve class balance.
- Fit the scaler on train only, then transform test (avoid data leakage).
- Keep the scaler object — we'll need it in production.
2. Training the Models
I usually start with a few baseline models before going deep on any one.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
def train_baselines(X_train, y_train, X_test, y_test):
models = {
"LogisticRegression": LogisticRegression(max_iter=1000, class_weight="balanced"),
"RandomForest": RandomForestClassifier(n_estimators=100, class_weight="balanced", n_jobs=-1),
"GradientBoosting": GradientBoostingClassifier(n_estimators=100),
}
results = {}
for name, model in models.items():
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
results[name] = {"model": model, "auc": auc}
print(f"{name}: AUC = {auc:.4f}")
return resultsWhy these three?
- Logistic Regression: interpretable baseline.
- Random Forest: handles non-linearity, robust to outliers.
- Gradient Boosting: often best performance, good starting point before XGBoost/LightGBM.
At this stage, I'm looking for a model that clearly outperforms the others. If Gradient Boosting wins (common), I'll invest time tuning it.
3. Hyperparameter Tuning
Once we pick a candidate, we tune it properly. I prefer Optuna for its flexibility and pruning capabilities.
import optuna
from sklearn.model_selection import cross_val_score
def objective(trial, X, y):
    """Optuna objective: mean 5-fold CV ROC-AUC for one sampled config."""
    # Sample the search space. learning_rate uses a log scale because its
    # effect is multiplicative at the low end.
    n_estimators = trial.suggest_int("n_estimators", 100, 500)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)

    model = GradientBoostingClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        min_samples_split=min_samples_split,
        random_state=42,
    )

    # Average ROC-AUC across 5 cross-validation folds.
    cv_scores = cross_val_score(model, X, y, cv=5, scoring="roc_auc", n_jobs=-1)
    return cv_scores.mean()
def tune_model(X_train, y_train, n_trials: int = 50):
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=n_trials)
print(f"Best AUC: {study.best_value:.4f}")
print(f"Best params: {study.best_params}")
return study.best_paramsTips:
- Use log scale for learning rate — small changes matter more at low values.
- Set a reasonable `n_trials` (50–100) to balance exploration vs. time.
- Consider early stopping with LightGBM/XGBoost for faster iteration.
4. Evaluation
AUC alone isn't enough. For churn, we care about precision at the top decile (are the customers we flag actually churning?) and recall (are we catching most churners?).
from sklearn.metrics import (
classification_report,
confusion_matrix,
precision_recall_curve,
roc_curve,
auc,
)
import matplotlib.pyplot as plt
def evaluate_model(model, X_test, y_test):
    """Print a classification report/confusion matrix and compute curve data.

    Args:
        model: fitted classifier exposing predict / predict_proba.
        X_test, y_test: held-out evaluation data.

    Returns:
        Dict with "roc_auc" plus the raw ROC ("fpr", "tpr") and
        precision-recall ("precision", "recall", "thresholds") curve arrays.
    """
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Human-readable summary at the default 0.5 decision threshold.
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=["Retained", "Churned"]))

    cm = confusion_matrix(y_test, y_pred)
    print(f"\nConfusion Matrix:\n{cm}")

    # Threshold-independent curves computed from the probabilities.
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

    # Also return the ROC coordinates — the original computed fpr/tpr and
    # then dropped them, leaving callers nothing to plot the ROC curve with.
    return {
        "roc_auc": roc_auc,
        "fpr": fpr,
        "tpr": tpr,
        "precision": precision,
        "recall": recall,
        "thresholds": thresholds,
    }
def plot_curves(metrics):
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# ROC
axes[0].plot([0, 1], [0, 1], "k--", label="Random")
axes[0].set_xlabel("False Positive Rate")
axes[0].set_ylabel("True Positive Rate")
axes[0].set_title(f"ROC Curve (AUC = {metrics['roc_auc']:.3f})")
# Precision-Recall
axes[1].plot(metrics["recall"], metrics["precision"])
axes[1].set_xlabel("Recall")
axes[1].set_ylabel("Precision")
axes[1].set_title("Precision-Recall Curve")
plt.tight_layout()
plt.savefig("evaluation_curves.png", dpi=150)
plt.show()Business-oriented metrics:
- Lift at top 10%: if we contact only the top 10% riskiest customers, how many actual churners do we catch?
- Cost-benefit analysis: false positives (wasted retention offers) vs. false negatives (lost customers).
5. Production Deployment
A model sitting in a notebook is worthless. Here's a minimal production setup using FastAPI and Docker.
Model serialization
import joblib
def save_artifacts(model, scaler, path: str = "./artifacts"):
joblib.dump(model, f"{path}/churn_model.joblib")
joblib.dump(scaler, f"{path}/scaler.joblib")
print(f"Artifacts saved to {path}")FastAPI service
from fastapi import FastAPI
from pydantic import BaseModel
import joblib
import numpy as np
app = FastAPI(title="Churn Prediction API")

# Load artifacts once at import time so every request reuses the same
# in-memory model and scaler instead of hitting disk per request.
model = joblib.load("./artifacts/churn_model.joblib")
scaler = joblib.load("./artifacts/scaler.joblib")
class CustomerFeatures(BaseModel):
    """Request schema: one customer's feature vector for /predict.

    Field order matters — predict() builds the model input array in this
    exact order, which must match the training-time feature_cols.
    """

    tenure: float
    monthly_charges: float
    total_charges: float
    contract_type: int       # integer-encoded category (training encoding)
    payment_method: int      # integer-encoded category (training encoding)
    internet_service: int    # integer-encoded category (training encoding)
    num_support_tickets: int
class PredictionResponse(BaseModel):
    """Response schema for /predict."""

    churn_probability: float  # P(churn), rounded to 4 decimal places
    churn_prediction: bool    # True when churn_probability >= 0.5
@app.post("/predict", response_model=PredictionResponse)
def predict(features: CustomerFeatures):
    """Score one customer and return churn probability plus binary decision."""
    # Assemble the feature row in the exact order used at training time.
    row = [
        features.tenure,
        features.monthly_charges,
        features.total_charges,
        features.contract_type,
        features.payment_method,
        features.internet_service,
        features.num_support_tickets,
    ]
    X = np.array([row])

    # Apply the training-time scaler, then take P(class=1) for this row.
    proba = model.predict_proba(scaler.transform(X))[0, 1]

    return PredictionResponse(
        churn_probability=round(proba, 4),
        churn_prediction=proba >= 0.5,
    )
@app.get("/health")
def health():
    """Liveness probe for the load balancer / container orchestrator."""
    return {"status": "healthy"}
Dockerfile
# Minimal serving image: slim Python base + model artifacts + FastAPI app.
FROM python:3.11-slim
WORKDIR /app

# Install dependencies first so this layer is cached across code-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Model/scaler artifacts and the API entry point.
COPY artifacts/ ./artifacts/
COPY main.py .

EXPOSE 8000
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
Deployment checklist
- Model and scaler versioned in artifact storage (S3, Azure Blob, MLflow).
- API behind a load balancer with health checks.
- Logging: log every prediction request for monitoring.
- Monitoring: track prediction distribution drift over time.
- Retraining pipeline: schedule monthly retraining with fresh data.
Wrapping up
Building a churn model isn't just about the algorithm. It's about:
- Clean data — garbage in, garbage out.
- Proper evaluation — business metrics, not just AUC.
- Production readiness — models need to be served, monitored, and updated.
The code above is intentionally minimal but production-oriented. Start here, then layer in complexity (feature stores, A/B testing, real-time scoring) as needed.