churn_analysis
June 4, 2025
[1]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
Matplotlib is building the font cache; this may take a moment.
1 Project Summary — Telco Customer Churn Analysis
1.1 Goal:
Predict and prevent customer churn using advanced analytics — not just as a technical task,
but as a profit-saving strategy.
1.2 Dataset Snapshot:
• Rows: 7,032
• Features: 21
• Target: Churn (Yes/No)
• Source: Kaggle – Telco Customer Churn
1.3 Business Risk:
• Churn Rate: 26.6%
• Revenue at Risk: ~ $121,000/month
• Annualized churn loss = $1.45M+
• Churn concentrated among month-to-month contracts, electronic check payments, and low-tenure customers
(An estimation sketch for these figures follows.)
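These figures can be reproduced from the raw CSV. The sketch below is illustrative rather than the notebook's own calculation: it assumes "revenue at risk" is approximated as the number of churned customers times the overall average monthly charge, and it uses the same file and column names as the cells that follow.

    import pandas as pd

    df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

    # Churned customers and churn rate (about 26.6% once the 11 malformed TotalCharges rows are dropped)
    churned = (df['Churn'] == 'Yes').sum()
    churn_rate = churned / len(df)

    # Revenue at risk, assuming churners pay roughly the average monthly charge
    monthly_at_risk = churned * df['MonthlyCharges'].mean()   # roughly $121,000 / month
    annual_at_risk = monthly_at_risk * 12                     # roughly $1.45M / year

    print(f"Churn rate: {churn_rate:.1%}")
    print(f"Monthly revenue at risk: ${monthly_at_risk:,.0f}")
    print(f"Annualized: ${annual_at_risk:,.0f}")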
[7]: import pandas as pd
# Load the dataset
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
# Take a look at the first few rows
df.head()
[7]:   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
    0  7590-VHVEG  Female              0     Yes         No       1           No
    1  5575-GNVDE    Male              0      No         No      34          Yes
    2  3668-QPYBK    Male              0      No         No       2          Yes
    3  7795-CFOCW    Male              0      No         No      45           No
    4  9237-HQITU  Female              0      No         No       2          Yes

          MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
    0  No phone service             DSL             No  ...               No
    1                No             DSL            Yes  ...              Yes
    2                No             DSL            Yes  ...               No
    3  No phone service             DSL            Yes  ...              Yes
    4                No     Fiber optic             No  ...               No

      TechSupport StreamingTV StreamingMovies        Contract PaperlessBilling  \
    0          No          No              No  Month-to-month              Yes
    1          No          No              No        One year               No
    2          No          No              No  Month-to-month              Yes
    3         Yes          No              No        One year               No
    4          No          No              No  Month-to-month              Yes

                   PaymentMethod  MonthlyCharges TotalCharges Churn
    0           Electronic check           29.85        29.85    No
    1               Mailed check           56.95       1889.5    No
    2               Mailed check           53.85       108.15   Yes
    3  Bank transfer (automatic)           42.30      1840.75    No
    4           Electronic check           70.70       151.65   Yes

    [5 rows x 21 columns]
[8]: # Check the shape of the dataset
print("Rows and columns:", df.shape)
# See column names and data types
print("\nColumn Info:")
print(df.dtypes)
# Quick summary statistics (numerical columns only)
print("\nSummary Stats:")
print(df.describe())
# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())
# Check for empty strings (common in categorical columns)
print("\nEmpty String Values:")
print((df == ' ').sum())
Rows and columns: (7043, 21)
Column Info:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

Summary Stats:
(describe() table for SeniorCitizen, tenure and MonthlyCharges: the count/mean/std/min/25%/50%/75%/max values were not preserved in this export.)

Missing Values:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Empty String Values:
customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64
[9]: # Convert 'TotalCharges' to numeric, forcing errors to NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Check how many are now NaN
print("NaNs in TotalCharges:", df['TotalCharges'].isna().sum())
# Drop those rows with missing TotalCharges
df = df.dropna(subset=['TotalCharges'])
# Confirm it's now clean
print("After cleanup:", df.shape)
NaNs in TotalCharges: 11
After cleanup: (7032, 21)
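Dropping the 11 rows is a reasonable choice at this scale. If keeping every customer mattered, an alternative (not what this notebook does) would be to impute the blanks; in this file they typically belong to brand-new customers with tenure 0, so tenure * MonthlyCharges (i.e. 0) is a natural fill. A minimal sketch:

    # Alternative to dropping: impute blank TotalCharges instead (illustrative only)
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df['TotalCharges'] = df['TotalCharges'].fillna(df['tenure'] * df['MonthlyCharges'])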
[10]: # Churn value counts
print(df['Churn'].value_counts())
# Churn percentages
print("\nChurn Percentage:\n", df['Churn'].value_counts(normalize=True) * 100)
Churn
No     5163
Yes    1869
Name: count, dtype: int64

Churn Percentage:
Churn
No     73.421502
Yes    26.578498
Name: proportion, dtype: float64
[11]: import seaborn as sns
import matplotlib.pyplot as plt
# Plot churn distribution
sns.countplot(x='Churn', data=df)
plt.title("Customer Churn Distribution")
plt.xlabel("Churn")
plt.ylabel("Count")
plt.show()
[12]: plt.figure(figsize=(8, 5))
sns.countplot(x='Contract', hue='Churn', data=df)
plt.title("Churn by Contract Type")
plt.xlabel("Contract Type")
plt.ylabel("Number of Customers")
plt.show()
[13]: plt.figure(figsize=(8, 5))
sns.histplot(data=df, x='tenure', hue='Churn', multiple='stack', bins=30)
plt.title("Churn by Customer Tenure")
plt.xlabel("Months with Company")
plt.ylabel("Number of Customers")
plt.show()
[14]: plt.figure(figsize=(8, 5))
sns.histplot(data=df, x='MonthlyCharges', hue='Churn', multiple='stack', bins=30)
plt.title("Churn by Monthly Charges")
plt.xlabel("Monthly Charges")
plt.ylabel("Number of Customers")
plt.show()
[15]: plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='InternetService', hue='Churn')
plt.title("Churn by Internet Service Type")
plt.xlabel("Internet Service")
plt.ylabel("Number of Customers")
plt.show()
[17]: df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df.dropna(subset=['TotalCharges']) # Remove any leftover bad rows
[18]: binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
for col in binary_cols:
    df[col] = df[col].map({'Yes': 1, 'No': 0})
[19]: df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})
[20]: multi_cat_cols = [
'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
'Contract', 'PaymentMethod'
]
df = pd.get_dummies(df, columns=multi_cat_cols)
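One design note on the encoding above: with default arguments, recent pandas versions emit boolean dummy columns and keep every category level, which leaves the dummies linearly dependent (for example, Contract_Month-to-month, Contract_One year and Contract_Two year always sum to 1). If that matters for the downstream model, a hedged variant of the same cell is:

    # Variant of the cell above: drop one level per category and emit 0/1 integers
    # (this changes which dummy columns exist, so downstream column references would need updating)
    df = pd.get_dummies(df, columns=multi_cat_cols, drop_first=True, dtype=int)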
[21]: import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='gender', hue='Churn')
plt.title('Churn by Gender')
plt.show()
[22]: plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='SeniorCitizen', hue='Churn')
plt.title('Churn by Senior Citizen Status (0 = No, 1 = Yes)')
plt.show()
1.4 Key Business Insights:

Factor            Impact on Churn
Contract Type     Month-to-month has highest churn risk
Tenure            Lower tenure = significantly higher churn
Payment Method    Electronic checks = higher churn
Senior Citizens   Slightly higher churn risk (but less impactful than contract type)

(A quick cross-tab backing these rows is sketched below.)
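Each row of the table can be verified with a churn rate cross-tab by category. Because Contract and PaymentMethod have already been one-hot encoded at this point, the sketch below reloads the raw file into a hypothetical df_raw rather than reusing df:

    # Churn rate by contract type and by payment method, computed on the raw data
    df_raw = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
    df_raw['churn_flag'] = (df_raw['Churn'] == 'Yes').astype(int)

    print(df_raw.groupby('Contract')['churn_flag'].mean().sort_values(ascending=False))
    print(df_raw.groupby('PaymentMethod')['churn_flag'].mean().sort_values(ascending=False))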
[28]: print(df.head())
(Output: the first five rows of the encoded dataframe, now 42 columns wide. customerID is unchanged; gender, SeniorCitizen, Partner, Dependents, PhoneService, PaperlessBilling and Churn are 0/1; the one-hot columns such as StreamingMovies_No, Contract_Month-to-month, Contract_One year, Contract_Two year and PaymentMethod_Electronic check appear as True/False.)

[5 rows x 42 columns]
[29]: # Drop customerID and Churn from features
X = df.drop(['customerID', 'Churn'], axis=1)
# Target variable (Churn was already encoded to 0/1 in cell [18])
y = df['Churn']
# Confirm shapes
print("X shape:", X.shape)
print("y shape:", y.shape)
X shape: (7032, 40)
y shape: (7032,)
[31]: from sklearn.model_selection import train_test_split
# Separate target and features
X = df.drop("Churn", axis=1)
y = df["Churn"]
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
print("� Training set shape:", X_train.shape)
print("� Testing set shape:", X_test.shape)
� Training set shape: (5625, 41)
� Testing set shape: (1407, 41)
[35]: # Drop customerID before splitting
df_model = df.drop('customerID', axis=1)
[36]: # Separate features and target
X = df_model.drop('Churn', axis=1)
y = df_model['Churn']
[37]: # Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
[38]: from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Initialize and train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
# Predict on the test set
y_pred = model.predict(X_test)
# Evaluate the model
print("� Accuracy:", accuracy_score(y_test, y_pred))
14
print("\n� Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n� Classification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.791

Confusion Matrix:
 [[918 115]
 [179 195]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.86      1033
           1       0.63      0.52      0.57       374

    accuracy                           0.79      1407
   macro avg       0.73      0.71      0.72      1407
weighted avg       0.78      0.79      0.78      1407
1.5 Model Performance — Logistic Regression
Used as baseline for interpretability.
Metrics:
• Accuracy: 79.1%
• Precision (Churn): 0.63
• Recall (Churn): 0.52
• F1 Score: 0.57

Note: The model is a good baseline but tends to underpredict churn. Future improvements could include:
- XGBoost or Random Forest for better performance
- SMOTE to address class imbalance
- More feature engineering
(A sketch combining Random Forest and SMOTE follows.)
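A minimal sketch of the first two improvement ideas, assuming the same X_train/y_train/X_test/y_test split as above and that the optional imbalanced-learn package is installed (SMOTE is not part of scikit-learn itself). This is illustrative, not a model fitted in this notebook:

    from imblearn.over_sampling import SMOTE              # requires imbalanced-learn
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import classification_report

    # Oversample the minority (churn) class in the training data only
    X_res, y_res = SMOTE(random_state=42).fit_resample(X_train, y_train)

    # Random Forest as a stronger non-linear model
    rf = RandomForestClassifier(n_estimators=300, random_state=42)
    rf.fit(X_res, y_res)

    print(classification_report(y_test, rf.predict(X_test)))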
[39]: import numpy as np
import matplotlib.pyplot as plt
# Get feature importance
coefficients = model.coef_[0]
features = X_train.columns
# Create DataFrame for plotting
importance_df = pd.DataFrame({
'Feature': features,
'Coefficient': coefficients
})
# Sort by absolute value of coefficients
importance_df['abs_coef'] = importance_df['Coefficient'].abs()
importance_df = importance_df.sort_values('abs_coef', ascending=False).head(10)
# Plot
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Coefficient'], color='skyblue')
plt.xlabel('Coefficient Value')
plt.title('Top 10 Most Important Features for Churn Prediction')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
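A caveat on reading these coefficients: MonthlyCharges, tenure and TotalCharges sit on much larger scales than the 0/1 dummy columns, so raw coefficient magnitudes are not directly comparable across features. A hedged sketch that standardizes the features inside a scikit-learn Pipeline before inspecting coefficients (scaled_lr is an illustrative name, not the model fitted above):

    import pandas as pd
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression

    # Scale every feature so coefficient magnitudes are comparable across columns
    scaled_lr = Pipeline([
        ('scale', StandardScaler()),
        ('logreg', LogisticRegression(max_iter=1000)),
    ])
    scaled_lr.fit(X_train, y_train)

    # Coefficients of the scaled model, largest magnitudes first
    coefs = pd.Series(scaled_lr.named_steps['logreg'].coef_[0], index=X_train.columns)
    print(coefs.reindex(coefs.abs().sort_values(ascending=False).index).head(10))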
[42]: import matplotlib.pyplot as plt
import seaborn as sns
# Make sure you're using the original DataFrame
if 'df' in globals():
    # Ensure visuals folder exists
    import os
    if not os.path.exists('visuals'):
        os.makedirs('visuals')

    # 1. Churn bar chart
    plt.figure(figsize=(6,4))
    sns.countplot(x='Churn', data=df)
    plt.title('Churn Count')
    plt.savefig('visuals/churn_count.png')
    plt.show()

    # 2. Churn pie chart
    churn_counts = df['Churn'].value_counts()
    plt.figure(figsize=(6,6))
    plt.pie(churn_counts, labels=churn_counts.index, autopct='%1.1f%%', startangle=90, colors=['lightgreen','salmon'])
    plt.title('Churn Proportion')
    plt.savefig('visuals/churn_pie.png')
    plt.show()

    # 3. Correlation heatmap (numeric features only)
    plt.figure(figsize=(12,10))
    numeric_df = df.select_dtypes(include=['int64', 'float64'])
    sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Feature Correlation Heatmap')
    plt.savefig('visuals/heatmap.png')
    plt.show()
else:
    print("Original 'df' not found. Make sure your raw dataset is loaded as 'df'.")
1.6 Recommendations for Action:

Strategy                                                  Justification
Incentivize long-term contracts                           Drastically lower churn with yearly contracts
Trigger retention offers at month 1–3                     Most churn occurs early in lifecycle
Discourage electronic check usage                         Churners over-index on this method
Target low-tenure, high-charge users with proactive CS    High revenue loss segment

(A segmentation sketch for the last recommendation follows.)
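To act on the last recommendation, the at-risk segment can be pulled straight from the raw file. The sketch below uses the hypothetical df_raw again and illustrative cutoffs (tenure of 3 months or less, monthly charge above the median); the thresholds are assumptions, not values derived in the analysis:

    # Flag the highest-risk segment: month-to-month, electronic check, short tenure, higher charges
    df_raw = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
    at_risk = df_raw[
        (df_raw['Contract'] == 'Month-to-month')
        & (df_raw['PaymentMethod'] == 'Electronic check')
        & (df_raw['tenure'] <= 3)                                        # illustrative cutoff
        & (df_raw['MonthlyCharges'] >= df_raw['MonthlyCharges'].median())
    ]
    print(len(at_risk), "customers flagged for proactive retention outreach")
    at_risk[['customerID', 'tenure', 'MonthlyCharges']].to_csv('at_risk_customers.csv', index=False)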
1.6.1 Final Outcome:
• Created actionable customer segments most at risk of churn
• Built a deployable, explainable churn prediction model using Logistic Regression
• Translated raw telecom data into clear business insights for executive stakeholders
• Delivered visuals and metrics ready for integration into dashboards
• Demonstrated end-to-end problem solving — from data wrangling to business recommendations
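For the dashboard-integration point above, one lightweight way to make the model deployable is to persist it alongside its feature column order. A minimal sketch using joblib; the file names are arbitrary:

    import joblib

    # Save the fitted logistic regression and the training column order for later scoring
    joblib.dump(model, 'churn_logreg.joblib')
    joblib.dump(list(X_train.columns), 'churn_features.joblib')

    # Elsewhere: reload and score new customers (new_X must contain the same columns, in the same order)
    loaded = joblib.load('churn_logreg.joblib')
    # churn_probabilities = loaded.predict_proba(new_X)[:, 1]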
[ ]: