Data Analytics in R
Machine Learning Project with R
Lovette
12/27/2021
Data Description
Loading Packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 -##
##
##
##
v
v
v
v
ggplot2
tibble
tidyr
readr
-
v
v
v
v
purrr
dplyr
stringr
forcats
-
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(factoextra)
## Warning: package ’factoextra’ was built under R version 4.1.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
## Warning: package ’cluster’ was built under R version 4.1.2
library(e1071)
## Warning: package ’e1071’ was built under R version 4.1.2
library(rpart)
## Warning: package ’rpart’ was built under R version 4.1.2
Importing the CSV file
1
car_sales
66687, 132477, 36970, 132928, 44311, 132806, 1~
149900, 149900, 75000, 75000, 39500, 39500, 44~
"PLN", "PLN", "PLN", "PLN", "PLN", "PLN", "PLN~
"Used", "Used", "Used", "Used", "Used", "Used"~
"Ford", "Ford", "Chevrolet", "Chevrolet", "Cit~
"Other", "Other", "Other", "Other", "Other", "~
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
"29/03/1905", "29/03/1905", "05/04/1905", "05/~
5, 5, 70000, 70000, 1, 1, 1, 1, 1, 1, 45110, 4~
16, 16, 50, 50, 25, 25, 10, 10, 25, 25, 1, 1, ~
2960, 2960, 2800, 2800, 1452, 1452, 667, 667, ~
"Gasoline", "Gasoline", "Gasoline", "Gasoline"~
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
"Rear wheels", "Rear wheels", NA, NA, NA, NA, ~
"Manual", "Manual", "Manual", "Manual", "Manua~
"convertible", "convertible", "convertible", "~
2, 2, 5, 5, 4, 4, 2, 2, 4, 4, 3, 3, 2, 2, 4, 4~
"black", "black", "beige", "beige", "blue", "b~
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
NA, NA, "Yes", "Yes", NA, NA, NA, NA, NA, NA, ~
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
"02/05/2021", "02/05/2021", "02/05/2021", "02/~
"Elblaska - 80-718 Gdansk, Pomorskie (Polska)"~
"[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"~
str(car_sales)
## spec_tbl_df [208,304 x 25] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
2
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
$
$
$
$
$
$
$
$
$
$
$
$
$
$
$
$
$
$
$
$
$
$
$
$
$
-
Index
: num [1:208304]- ...
Price
: num [1:208304]- ...
Currency
: chr [1:208304] "PLN" "PLN" "PLN" "PLN" ...
Condition
: chr [1:208304] "Used" "Used" "Used" "Used" ...
Vehicle_brand
: chr [1:208304] "Ford" "Ford" "Chevrolet" "Chevrolet" ...
Vehicle_model
: chr [1:208304] "Other" "Other" "Other" "Other" ...
Vehicle_version
: chr [1:208304] NA NA NA NA ...
Vehicle_generation
: chr [1:208304] NA NA NA NA ...
Production_year
: chr [1:208304] "29/03/1905" "29/03/1905" "05/04/1905" "05/04/1905" ...
Mileage_km
: num [1:208304]- ...
Power_HP
: num [1:208304]- ...
Displacement_cm3
: num [1:208304]- ...
Fuel_type
: chr [1:208304] "Gasoline" "Gasoline" "Gasoline" "Gasoline" ...
CO2_emissions
: num [1:208304] NA NA NA NA NA NA NA NA NA NA ...
Drive
: chr [1:208304] "Rear wheels" "Rear wheels" NA NA ...
Transmission
: chr [1:208304] "Manual" "Manual" "Manual" "Manual" ...
Type
: chr [1:208304] "convertible" "convertible" "convertible" "convertible" ..
Doors_number
: num [1:208304]- ...
Colour
: chr [1:208304] "black" "black" "beige" "beige" ...
Origin_country
: chr [1:208304] NA NA NA NA ...
First_owner
: chr [1:208304] NA NA "Yes" "Yes" ...
First_registration_date: chr [1:208304] NA NA NA NA ...
Offer_publication_date : chr [1:208304] "02/05/2021" "02/05/2021" "02/05/2021" "02/05/2021" ...
Offer_location
: chr [1:208304] "Elblaska - 80-718 Gdansk, Pomorskie (Polska)" "Elblaska
Features
: chr [1:208304] "[]" "[]" "[]" "[]" ...
attr(*, "spec")=
.. cols(
..
Index = col_double(),
..
Price = col_double(),
..
Currency = col_character(),
..
Condition = col_character(),
..
Vehicle_brand = col_character(),
..
Vehicle_model = col_character(),
..
Vehicle_version = col_character(),
..
Vehicle_generation = col_character(),
..
Production_year = col_character(),
..
Mileage_km = col_double(),
..
Power_HP = col_double(),
..
Displacement_cm3 = col_double(),
..
Fuel_type = col_character(),
..
CO2_emissions = col_double(),
..
Drive = col_character(),
..
Transmission = col_character(),
..
Type = col_character(),
..
Doors_number = col_double(),
..
Colour = col_character(),
..
Origin_country = col_character(),
..
First_owner = col_character(),
..
First_registration_date = col_character(),
..
Offer_publication_date = col_character(),
..
Offer_location = col_character(),
..
Features = col_character()
.. )
3
Creating a dataframe from the dataset
car_sales %
select(-c(Currency, Index, Vehicle_version, Vehicle_generation, CO2_emissions,
First_registration_date,Features))
Filling in some missing values
car_sales % mutate(First_owner = case_when(
grepl(pattern = "New", x = Condition) ~ "Yes"))
car_sales % replace_na(list(First_owner = "No"))
car_sales$First_owner % na.omit()
Viewing the dataset
glimpse(car_sales)
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
Rows: 112,192
Columns: 18
$ Price
$ Condition
$ Vehicle_brand
$ Vehicle_model
$ Production_year
$ Mileage_km
$ Power_HP
$ Displacement_cm3
$ Fuel_type
$ Drive
$ Transmission
$ Type
$ Doors_number
$ Colour
$ Origin_country
$ First_owner
$ Offer_publication_date
$ Offer_location
79900, 79900, 49900, 49900, 60000, 60000, 10900~
Used, Used, Used, Used, Used, Used, Used, Used,~
Opel, Opel, Fiat, Fiat, Mercedes-Benz, Mercedes~
Other, Other, Other, Other, Other, Other, F150,~
19/04/1905, 19/04/1905, 28/04/1905, 28/04/1905,~
76000, 76000, 68000, 68000, 3000, 3000, 12000, ~
26, 26, 17, 17, 70, 70, 122, 100, 275, 275, 100~
1288, 1288, 560, 560, 2500, 2500, 3800, 3916, 5~
Gasoline, Gasoline, Gasoline, Gasoline, Gasolin~
Rear wheels, Rear wheels, Front wheels, Front w~
Manual, Manual, Manual, Manual, Manual, Manual,~
sedan, sedan, small_cars, small_cars, SUV, SUV,~
2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 4, 3, 2,~
green, green, blue, blue, beige, beige, golden,~
Germany, Germany, Switzerland, Switzerland, Gre~
No, No, No, No, No, No, No, No, No, No, No, No,~
03/05/2021, 03/05/2021, 04/05/2021, 04/05/2021,~
"Jagiellonska 65a - 03-303 Warszawa, Praga-Póln~
View(car_sales)
str(car_sales)
## ’data.frame’:
112192 obs. of 18 variables:
## $ Price
: int- ...
## $ Condition
: Factor w/ 2 levels "New","Used":- ...
## $ Vehicle_brand
: Factor w/ 108 levels "Abarth","Acura",..:- ..
## $ Vehicle_model
: Factor w/ 1203 levels "09-Mar","09-May",..:-
## $ Production_year
: Factor w/ 92 levels "01/05/1905","01/06/1905",..:-
## $ Mileage_km
: int- ...
## $ Power_HP
: int- ...
## $ Displacement_cm3
: int- ...
## $ Fuel_type
: Factor w/ 8 levels "Diesel","Electric",..:- ...
## $ Drive
: Factor w/ 5 levels "4x4 (attached automatically)",..:-
## $ Transmission
: Factor w/ 2 levels "Automatic","Manual":- ...
## $ Type
: Factor w/ 9 levels "city_cars","compact",..:- ...
## $ Doors_number
: int- ...
## $ Colour
: Factor w/ 14 levels "beige","black",..:- ...
## $ Origin_country
: Factor w/ 37 levels "Austria","Belarus",..:- .
## $ First_owner
: Factor w/ 2 levels "No","Yes":- ...
## $ Offer_publication_date: Factor w/ 41 levels "01/04/2021","01/05/2021",..:-
## $ Offer_location
: Factor w/ 13635 levels "------------- - 42-202 Czestochowa, Parkitka (Pols
## - attr(*, "na.action")= ’omit’ Named int [1:96112]- ...
##
..- attr(*, "names")= chr [1:96112] "1" "2" "3" "4" ...
5
Statistical summaries of dataset
summary(car_sales)
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
Price
Min.
:
585
1st Qu.: 19900
Median : 39900
Mean
: 65913
3rd Qu.: 79899
Max.
:-
Condition
New : 11362
Used:100830
Vehicle_brand
Vehicle_model
Volkswagen: 9943
Astra : 3276
Opel
: 9255
Octavia: 2658
Audi
: 9149
Passat : 2374
Ford
: 8729
A4
: 2366
BMW
: 8706
Seria 3: 2355
Toyota
: 6706
Golf
: 2291
(Other)
:59704
(Other):96872
Mileage_km
Power_HP
Displacement_cm3
Min.
:
1
Min.
:
1.0
Min.
: 400
1st Qu.: 50011
1st Qu.: 106.0
1st Qu.:1461
Median : 134000
Median : 136.0
Median :1798
Mean
: 130825
Mean
: 152.4
Mean
:1864
3rd Qu.:-rd Qu.: 175.0
3rd Qu.:1997
Max.
:-
Max.
:1115.0
Max.
:8400
Production_year
09/07/1905: 9742
12/07/1905: 7821
13/07/1905: 7727
08/07/1905: 7684
10/07/1905: 7001
01/07/1905: 5975
(Other)
:66242
Fuel_type
Drive
Transmission
Gasoline
:55681
4x4 (attached automatically): 9278
Automatic:40898
Diesel
:48657
4x4 (attached manually)
: 1432
Manual
:71294
Gasoline + LPG: 4145
4x4 (permanent)
: 9920
Hybrid
: 3668
Front wheels
:81951
Gasoline + CNG:
41
Rear wheels
: 9611
Electric
:
0
(Other)
:
0
Type
Doors_number
Colour
Origin_country
station_wagon:23105
Min.
: 1.000
black :27204
Poland
:53913
SUV
:22571
1st Qu.: 5.000
gray
:17457
Germany
:36599
compact
:18369
Median : 5.000
white :16762
France
: 4477
sedan
:18239
Mean
: 4.668
silver :16462
United States: 3891
city_cars
:11881
3rd Qu.: 5.000
blue
:11713
Belgium
: 3861
minivan
:11034
Max.
:55.000
other : 6863
Switzerland : 2061
(Other)
: 6993
(Other):15731
(Other)
: 7390
First_owner Offer_publication_date
No :-/04/2021:14771
Yes:-/05/2021:-/05/2021:-/05/2021:-/04/2021:-/05/2021:13451
(Other)
:26643
Offer_location
Stalowa 16 - 41-506 Chorzów, Slaskie (Polska)
: 1301
ul. Stalowa 16 - 41-506 Chorzów, Slaskie (Polska)
:
766
Okulickiego 3B - 05-500 Piaseczno, piaseczynski, Mazowieckie (Polska):
583
Radom, Mazowieckie
:
535
Kielce, Swietokrzyskie
:
528
Lublin, Lubelskie
:
528
(Other)
:107951
The Doors_number column contains an outlier value of 55, which will be replaced by 5.
6
car_sales$Doors_number |t|)
(Intercept) - -112.2
<2e-16 ***
Power_HP-
<2e-16 ***
---
Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
Residual standard error: 63000 on 112190 degrees of freedom
Multiple R-squared: 0.4449, Adjusted R-squared: 0.4449
F-statistic: 8.99e+04 on 1 and 112190 DF, p-value: < 2.2e-16
Linear Regression with Multiple Variables
mlg_model |t|)
3.940e+04 3.392e-
-7.441e-02 2.279e-03 -32.653 < 2e-16 ***
-1.275e+01 4.167e-01 -30.601 < 2e-16 ***
-4.349e+04 3.549e+04 -
-7.687e+04 3.393e+04 - *
2.709e+04 5.874e-
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
Production_year02/06/1905
Production_year02/07/1905
Production_year03/06/1905
Production_year03/07/1905
Production_year04/06/1905
Production_year04/07/1905
Production_year05/05/1905
Production_year05/06/1905
Production_year05/07/1905
Production_year06/05/1905
Production_year06/06/1905
Production_year06/07/1905
Production_year07/06/1905
Production_year07/07/1905
Production_year08/06/1905
Production_year08/07/1905
Production_year09/05/1905
Production_year09/06/1905
Production_year09/07/1905
Production_year10/05/1905
Production_year10/06/1905
Production_year10/07/1905
Production_year11/05/1905
Production_year11/06/1905
Production_year11/07/1905
Production_year12/05/1905
Production_year12/06/1905
Production_year12/07/1905
Production_year13/05/1905
Production_year13/06/1905
Production_year13/07/1905
Production_year14/05/1905
Production_year14/06/1905
Production_year15/05/1905
Production_year15/06/1905
Production_year16/05/1905
Production_year16/06/1905
Production_year17/05/1905
Production_year17/06/1905
Production_year18/05/1905
Production_year18/06/1905
Production_year19/04/1905
Production_year19/05/1905
Production_year19/06/1905
Production_year20/05/1905
Production_year20/06/1905
Production_year21/05/1905
Production_year21/06/1905
Production_year22/05/1905
Production_year22/06/1905
Production_year23/05/1905
Production_year23/06/1905
Production_year24/05/1905
Production_year24/06/1905
-5.500e+04
-7.773e+04
-3.096e+04
-7.708e+04
-4.958e+04
-7.522e+04
6.729e+04
-5.362e+04
-7.330e+04
-8.959e+04
-5.148e+04
-6.916e+04
-5.312e+04
-6.478e+04
-6.813e+04
-5.691e+04
-3.964e+04
-6.367e+04
-4.699e+04
1.525e+05
-6.966e+04
-3.468e+04
7.358e+04
-5.459e+04
-1.924e+04
6.647e+05
-5.661e+04
7.166e+03
1.101e+05
-7.941e+04
2.237e+04
4.993e+04
-7.913e+04
-9.393e+04
-7.941e+04
-9.787e+04
-7.934e+04
9.776e+04
-8.049e+04
4.774e+04
-7.422e+04
4.301e+04
-2.554e+04
-8.507e+04
2.644e+04
-8.237e+04
-2.913e+04
-8.437e+04
5.765e+04
-8.514e+04
-4.592e+04
-8.632e+04
-4.914e+04
-8.470e+04
3.557e+04
3.393e+04
3.663e+04
3.393e+04
3.663e+04
3.393e+04
4.154e+04
3.626e+04
3.393e+04
4.796e+04
3.566e+04
3.393e+04
3.557e+04
3.393e+04
3.536e+04
3.392e+04
5.874e+04
3.475e+04
3.392e+04
4.013e+04
3.487e+04
3.392e+04
4.154e+04
3.459e+04
3.393e+04
4.796e+04
3.448e+04
3.392e+04
3.687e+04
3.428e+04
3.392e+04
4.154e+04
3.444e+04
4.378e+04
3.437e+04
4.378e+04
3.429e+04
3.626e+04
3.422e+04
3.626e+04
3.418e+04
4.796e+04
3.749e+04
3.409e+04
3.749e+04
3.403e+04
3.643e+04
3.399e+04
3.687e+04
3.398e+04
3.792e+04
3.396e+04
3.845e+04
3.395e+04
8
-1.546
-2.291
-0.845
-2.272
-1.354
-
-1.479
-2.161
-1.868
-1.444
-2.039
-1.493
-1.909
-1.927
-1.678
-0.675
-1.832
-
-1.998
-
-1.578
-
-
-
-2.297
-2.145
-2.311
-2.235
-
-
-
-0.681
-
-2.420
-0.800
-
-2.506
-1.211
-2.542
-1.278
-2.495
-
< 2e-
*
*
*
*
.
*
.
.
.
.
***
*
.
***
**
*
*
*
*
*
*
**
*
*
*
*
*
*
*
*
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
Production_year25/05/1905 -4.952e+03 3.568e+04 -
Production_year25/06/1905 -8.638e+04 3.394e+04 -
Production_year26/05/1905 -3.466e+04 3.643e+04 -
Production_year26/06/1905 -8.553e+04 3.393e+04 -
Production_year27/05/1905 -1.135e+04 3.846e+04 -
Production_year27/06/1905 -8.532e+04 3.393e+04 -
Production_year28/04/-e+03 4.796e-
Production_year28/05/1905 -3.347e+04 3.750e+04 -
Production_year28/06/1905 -8.358e+04 3.393e+04 -
Production_year29/05/-e+04 4.013e-
Production_year29/06/1905 -8.358e+04 3.393e+04 -
Production_year30/05/1905 -4.500e+04 3.916e+04 -
Production_year30/06/1905 -8.078e+04 3.393e+04 -
Production_year31/05/1905 -2.994e+04 3.687e+04 -
Power_HP
7.529e+02 3.842e- < 2e-16
---
Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
*
*
*
*
*
*
***
Residual standard error: 47960 on 112117 degrees of freedom
Multiple R-squared: 0.6784, Adjusted R-squared: 0.6782
F-statistic: 3197 on 74 and 112117 DF, p-value: < 2.2e-16
Classification Algorithms
Naive Bayes Classifier
Building the model
naive_model %
select(c(Price, Mileage_km, Power_HP, Displacement_cm3))
Building the model
K_model