Etinosa David Eribo | Freelancer Melanoma Analysis

Melanoma Analysis

MELANOMA ANALYSIS Load necessary Libraries In [1]: import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns In [4]: import os os.getcwd() Out[4]: 'C:\\Users\\eribo' In [5]: ml_data = pd.read_csv('C:\\Users\\eribo/documents/pythonjup/melanoma-2.csv') Data Preprocessing 1. Preview the data (.head, .tail) 2. Handle Missing Data 3. Check Data Type In [7]: ml_data.head() Out[7]: Unnamed: 0 time status sex age year thickness ulcer 0 1 10 3 1 76 1972 6.76 1 1 2 30 3 1 56 1968 0.65 0 2 3 35 2 1 41 1977 1.34 0 3 4 99 3 0 71 1968 2.90 0 4 5 185 1 1 52 1965 12.08 1 In [8]: ml_data.tail() Out[8]: Unnamed: 0 time status sex age year thickness ulcer 200 201 4492 2 1 29 1965 7.06 1 201 202 4668 2 0 40 1965 6.12 0 202 203 4688 2 0 42 1965 0.48 0 203 204 4926 2 0 50 1964 2.26 0 204 205 5565 2 0 41 1962 2.90 0 In [9]: ml_data.dtypes Out[9]: Unnamed: 0 int64 time int64 status int64 sex int64 age int64 year int64 thickness float64 ulcer int64 dtype: object In [10]: ml_data.shape Out[10]: (205, 8) In [13]: ml_data.isnull().sum() Out[13]: Unnamed: 0 time status sex age year thickness ulcer dtype: int64 - In [21]: ml_data.drop(["Unnamed: 0"], axis = 1, inplace = True) In [22]: ml_data.columns Out[22]: Index(['time', 'status', 'sex', 'age', 'year', 'thickness', 'ulcer'], dtype='object') Exploratory Data Analysis (EDA) 1. Summary statistics (Descriptive statistics) 2. Distribution 3. 
Correlation In [23]: ml_data.describe() Out[23]: time status sex age year thickness ulcer count - - - - - - - mean - - - - - - - std - - - - - - - min - - - - - - - 25% - - - - - - - 50% - - - - - - - 75% - - - - - - - max - - - - - - - In [25]: ml_data.corr() Out[25]: time status sex age year thickness ulcer time - - - - - - - status - - - - - - - sex - - - - - - - age - - - - - - - year - - - - - - - thickness - - - - - - - ulcer - - - - - - - POSITIVE CORRELATION • From 0 to 0.4 (Weak positive correlation) • From 0.4 to 0.7 (Moderate positive correlation) • From 0.7 to 1 (Strong positive correlation) NEGATIVE CORRELATION • From 0 to -0.4 (Weak negative correlation) • From -0.4 to -0.7 (Moderate negative correlation) • From -0.7 to -1 (Strong negative correlation) DATA VISUALIZATION histogram bar scatter boxplot In [26]: plt.hist(ml_data["time"], color = "Red") plt.show() In [27]: plt.hist(ml_data["age"], color = "Orange") plt.show() In [28]: plt.hist(ml_data["thickness"], color = "indigo") plt.show() In [35]: ml_data1 = ml_data[["time", "age", "thickness"]] ml_data1.head() Out[35]: time age thickness 0 10 76 6.76 1 30 56 0.65 2 35 41 1.34 3 99 71 2.90 4 185 52 12.08 In [36]: fig, axes = plt.subplots(1, 3, figsize = (10,3)) counter = 0 loop = range(3) for i in loop: axes[i].hist(ml_data1.iloc[:,counter], color = "purple") title = ml_data1.iloc[:,counter].name.capitalize() + " distribution" axes[i].set_title(title) counter += 1 plt.show() In [40]: fig, axes = plt.subplots(1, 3, figsize = (10,3)) counter = 0 loop = range(3) for i in loop: axes[i].boxplot(ml_data1.iloc[:,counter]) title = ml_data1.iloc[:,counter].name.capitalize() + " distribution" axes[i].set_xticklabels([title]) counter += 1 plt.show() In [41]: plt.scatter(ml_data1["time"], ml_data1["age"]) plt.show() In [42]: plt.scatter(ml_data1["time"], ml_data1["thickness"]) plt.show() In [44]: plt.scatter(ml_data1["thickness"], ml_data1["age"]) plt.show() MODEL DEVELOPMENT In [45]: from sklearn.linear_model import LinearRegression from 
sklearn.model_selection import train_test_split TIME AND THICKNESS In [60]: X = ml_data["thickness"].values.reshape(-1, 1) Y = ml_data["time"].values lm1 = LinearRegression() X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 4 In [61]: lm1.fit(X_train, Y_train) Out[61]: LinearRegression() In [62]: # y = mx + c for predicting lm1.coef_ Out[62]: array([-]) In [63]: lm1.intercept_ Out[63]: - time = -65.66 * thickness + 2360.10 TIME AND AGE In [64]: X = ml_data["age"].values.reshape(-1, 1) Y = ml_data["time"].values lm2 = LinearRegression() X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 4 In [65]: lm2.fit(X_train, Y_train) Out[65]: LinearRegression() In [66]: # y = mx + c for predicting lm2.coef_ Out[66]: array([-]) In [67]: lm2.intercept_ Out[67]: - time = -20.87 * age + 3265.87 THICKNESS AND AGE In [68]: X = ml_data["age"].values.reshape(-1, 1) Y = ml_data["thickness"].values lm3= LinearRegression() X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 4 In [69]: lm3.fit(X_train, Y_train) Out[69]: LinearRegression() In [70]: # y = mx + c for predicting lm3.coef_ Out[70]: array([-]) In [71]: lm3.intercept_ Out[71]: - thickness = 0.04 * age + 1.19 In [72]: predictions = lm3.predict(X_test) In [73]: predictions Out[73]: array([- ,-,-,-,-,-,-,-,-,-,-,-,-, -,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-]) -,-,-,-,-,-,-,-,-,-,-,-, -,-,-,-,-,-,-,-,- ,-,-,-, In [75]: plt.scatter(ml_data1 ["age"], ml_data1 ["thickness"]) plt.plot(predictions, color = "red") plt.show() MODEL EVALUATION mean_squared_error r_squared In [76]: r_squared = lm3.score(X_test, Y_test) r_squared Out[76]: - r_squared is 0.29%. Based on this, the performance of the model is extremely poor. In [77]: from sklearn.metrics import mean_squared_error mse = mean_squared_error(Y_test, predictions, squared = False) mse Out[77]: In [ ]: -