Melanoma Analysis
MELANOMA ANALYSIS
Load necessary Libraries
In [1]: import
import
import
import
pandas as pd
numpy as np
matplotlib.pyplot as plt
seaborn as sns
In [4]: import os
os.getcwd()  # show the kernel's current working directory
Out[4]:
'C:\\Users\\eribo'
In [5]: DATA_PATH = os.path.join(os.path.expanduser("~"), "documents", "pythonjup", "melanoma-2.csv")
# The path is built from the current user's home directory instead of a hard-coded,
# user-specific absolute path with mixed "\\" and "/" separators, so the notebook is
# not tied to one machine's account name.
ml_data = pd.read_csv(DATA_PATH)
Data Preprocessing
1. Preview the data (.head, .tail)
2. Handle Missing Data
3. Check Data Type
In [7]: ml_data.head()  # preview the first five rows
Unnamed: 0 time
Out[7]:
status
sex
age
year thickness
ulcer
0
1
10
3
1
76
1972
6.76
1
1
2
30
3
1
56
1968
0.65
0
2
3
35
2
1
41
1977
1.34
0
3
4
99
3
0
71
1968
2.90
0
4
5
185
1
1
52
1965
12.08
1
In [8]: ml_data.tail()  # preview the last five rows
Unnamed: 0
time
status
sex
age
200
201
4492
2
1
29
1965
7.06
1
201
202
4668
2
0
40
1965
6.12
0
202
203
4688
2
0
42
1965
0.48
0
203
204
4926
2
0
50
1964
2.26
0
204
205
5565
2
0
41
1962
2.90
0
Out[8]:
In [9]: ml_data.dtypes  # data type of each column
year thickness
ulcer
Out[9]:
Unnamed: 0
int64
time
int64
status
int64
sex
int64
age
int64
year
int64
thickness
float64
ulcer
int64
dtype: object
In [10]: ml_data.shape  # (number of rows, number of columns)
Out[10]:
(205, 8)
In [13]: ml_data.isnull().sum()  # number of missing values in each column
Out[13]:
Unnamed: 0
time
status
sex
age
year
thickness
ulcer
dtype: int64
-
In [21]: ml_data.drop(["Unnamed: 0"], axis = 1, inplace = True)
In [22]: ml_data.columns
Out[22]:
Index(['time', 'status', 'sex', 'age', 'year', 'thickness', 'ulcer'], dtype='object')
Exploratory Data Analysis (EDA)
1. Summary statistics (Descriptive statistics)
2. Distribution
3. Correlation
In [23]: ml_data.describe()  # count, mean, std, min, quartiles and max per column
time
status
sex
age
year
thickness
ulcer
count
-
-
-
-
-
-
-
mean
-
-
-
-
-
-
-
std-
-
-
-
-
-
-
Out[23]:
min
-
-
-
-
-
-
-
25%
-
-
-
-
-
-
-
50%
-
-
-
-
-
-
-
75%
-
-
-
-
-
-
-
max
-
-
-
-
-
-
-
In [25]: ml_data.corr()  # pairwise correlation matrix of the columns
time
status
sex
age
year
thickness
ulcer
time
-
-
-
-
-
-
-
status
-
-
-
-
-
-
-
sex
-
-
-
-
-
-
-
age -
-
-
-
-
-
-
year -
-
-
-
-
-
-
-
-
-
-
-
-
-
ulcer -
-
-
-
-
-
-
Out[25]:
thickness
POSITIVE CORRELATION
• From 0 to 0.4 (Weak positive correlation)
• From 0.4 to 0.7 (Moderate positive correlation)
• From 0.7 to 1 (Strong positive correlation)
NEGATIVE CORRELATION
• From 0 to -0.4 (Weak negative correlation)
• From -0.4 to -0.7 (Moderate negative correlation)
• From -0.7 to -1 (Strong negative correlation)
DATA VISUALIZATION
histogram
bar
scatter
boxplot
In [26]: plt.hist(ml_data["time"], color = "Red")
# Label the axes and add a title so the figure can be read on its own.
plt.xlabel("time")
plt.ylabel("Frequency")
plt.title("Distribution of time")
plt.show()
In [27]: plt.hist(ml_data["age"], color = "Orange")
plt.xlabel("age")
plt.ylabel("Frequency")
plt.title("Distribution of age")
plt.show()
In [28]: plt.hist(ml_data["thickness"], color = "indigo")
plt.xlabel("thickness")
plt.ylabel("Frequency")
plt.title("Distribution of thickness")
plt.show()
In [35]: ml_data1 = ml_data[["time", "age", "thickness"]]
ml_data1.head()
time
Out[35]:
age thickness
0
10
76
6.76
1
30
56
0.65
2
35
41
1.34
3
99
71
2.90
4
185
52
12.08
In [36]: fig, axes = plt.subplots(1, 3, figsize = (10,3))
counter = 0
loop = range(3)
for i in loop:
axes[i].hist(ml_data1.iloc[:,counter], color = "purple")
title = ml_data1.iloc[:,counter].name.capitalize() + " distribution"
axes[i].set_title(title)
counter += 1
plt.show()
In [40]: fig, axes = plt.subplots(1, 3, figsize = (10,3))
counter = 0
loop = range(3)
for i in loop:
axes[i].boxplot(ml_data1.iloc[:,counter])
title = ml_data1.iloc[:,counter].name.capitalize() + " distribution"
axes[i].set_xticklabels([title])
counter += 1
plt.show()
In [41]: plt.scatter(ml_data1["time"], ml_data1["age"])
# Axis labels make clear which variable is on which axis.
plt.xlabel("time")
plt.ylabel("age")
plt.show()
In [42]: plt.scatter(ml_data1["time"], ml_data1["thickness"])
plt.xlabel("time")
plt.ylabel("thickness")
plt.show()
In [44]: plt.scatter(ml_data1["thickness"], ml_data1["age"])
plt.xlabel("thickness")
plt.ylabel("age")
plt.show()
MODEL DEVELOPMENT
In [45]: from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
TIME AND THICKNESS
In [60]: X = ml_data["thickness"].values.reshape(-1, 1)  # single feature as an (n, 1) column
Y = ml_data["time"].values                      # target: time
lm1 = LinearRegression()
# Hold out 30% of the rows for testing.
# NOTE(review): this call was cut off after "random_state = 4" in the export; the closing
# parenthesis is restored here. Confirm the seed value against the original notebook.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 4)
In [61]: lm1.fit(X_train, Y_train)  # fit time ~ thickness on the training split
Out[61]:
LinearRegression()
In [62]: # The fitted line is y = m*x + c; coef_ holds the slope m (time per unit of thickness)
lm1.coef_
Out[62]:
array([-])
In [63]: lm1.intercept_  # intercept c of the fitted line y = m*x + c
Out[63]:
-
$\text{time} = -65.66 \times \text{thickness} + 2360.10$
TIME AND AGE
In [64]: X = ml_data["age"].values.reshape(-1, 1)  # single feature as an (n, 1) column
Y = ml_data["time"].values                # target: time
lm2 = LinearRegression()
# Hold out 30% of the rows for testing.
# NOTE(review): this call was cut off after "random_state = 4" in the export; the closing
# parenthesis is restored here. Confirm the seed value against the original notebook.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 4)
In [65]: lm2.fit(X_train, Y_train)  # fit time ~ age on the training split
Out[65]:
LinearRegression()
In [66]: # The fitted line is y = m*x + c; coef_ holds the slope m (time per unit of age)
lm2.coef_
Out[66]:
array([-])
In [67]: lm2.intercept_  # intercept c of the fitted line y = m*x + c
Out[67]:
-
$\text{time} = -20.87 \times \text{age} + 3265.87$
THICKNESS AND AGE
In [68]: X = ml_data["age"].values.reshape(-1, 1)  # single feature as an (n, 1) column
Y = ml_data["thickness"].values           # target: thickness
lm3 = LinearRegression()
# Hold out 30% of the rows for testing. The X_test / Y_test produced here are the ones
# the prediction and evaluation cells further down use.
# NOTE(review): this call was cut off after "random_state = 4" in the export; the closing
# parenthesis is restored here. Confirm the seed value against the original notebook.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 4)
In [69]: lm3.fit(X_train, Y_train)  # fit thickness ~ age on the training split
Out[69]:
LinearRegression()
In [70]: # The fitted line is y = m*x + c; coef_ holds the slope m (thickness per unit of age)
lm3.coef_
Out[70]:
array([-])
In [71]: lm3.intercept_  # intercept c of the fitted line y = m*x + c
Out[71]:
-
$\text{thickness} = 0.04 \times \text{age} + 1.19$
In [72]: predictions = lm3.predict(X_test)  # thickness predicted by lm3 for the held-out ages
In [73]: predictions
Out[73]:
array([- ,-,-,-,-,-,-,-,-,-,-,-,-,
-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-])
-,-,-,-,-,-,-,-,-,-,-,-,
-,-,-,-,-,-,-,-,- ,-,-,-,
In [75]: plt.scatter(ml_data1 ["age"], ml_data1 ["thickness"])
# Draw the fitted values against the ages they were predicted from. Passing only
# `predictions` would place them at x = 0, 1, 2, ... (their array index), so the red
# line would not line up with the age axis of the scatter.
plt.plot(X_test[:, 0], predictions, color = "red")
plt.xlabel("age")
plt.ylabel("thickness")
plt.show()
MODEL EVALUATION
• mean_squared_error
• r_squared
In [76]: r_squared = lm3.score(X_test, Y_test)  # coefficient of determination (R^2) on the test split
r_squared
Out[76]:
-
r_squared is 0.29%. Based on this, the performance of the model is extremely poor: age explains almost none of the variation in thickness.
In [77]: from sklearn.metrics import mean_squared_error
mse = mean_squared_error(Y_test, predictions, squared = False)
mse
Out[77]:
In [ ]:
-