penguins_visualization
mugo_muiruri_james-
library(ggplot2)
library(palmerpenguins)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
##
filter, lag
## The following objects are masked from 'package:base':
##
##
intersect, setdiff, setequal, union
data exploration
# Display the compact structure of the penguins data set.
str(penguins)
## tibble [344 × 8] (S3:
## $ species
:
1 1 1 1 1 1 ...
## $ island
:
3 3 3 3 ...
## $ bill_length_mm
:
34.1 42 ...
## $ bill_depth_mm
:
20.2 ...
## $ flipper_length_mm:
...
## $ body_mass_g
:- ...
## $ sex
:
NA NA ...
## $ year
:- ...
tbl_df/tbl/data.frame)
Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1
Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3
num [1:344]- NA-
num [1:344]- NA-
int [1:344]- NA-
int [1:344]- NA-
Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2
int [1:344]-
# List the column names of the penguins data set.
names(penguins)
## [1] "species"
## [4] "bill_depth_mm"
## [7] "sex"
# Print per-column summaries of the penguins data set before any cleaning.
summary(penguins)
"island"
"bill_length_mm"
"flipper_length_mm" "body_mass_g"
"year"
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
species
Adelie
:152
Chinstrap: 68
Gentoo
:124
island
Biscoe
:168
Dream
:124
Torgersen: 52
flipper_length_mm
Min.
:172.0
1st Qu.:190.0
Median :197.0
Mean
:200.9
3rd Qu.:213.0
Max.
:231.0
NA's
:2
body_mass_g
Min.
:2700
1st Qu.:3550
Median :4050
Mean
:4202
3rd Qu.:4750
Max.
:6300
NA's
:2
bill_length_mm bill_depth_mm
Min.
:32.10
Min.
:13.10
1st Qu.:39.23
1st Qu.:15.60
Median :44.45
Median :17.30
Mean
:43.92
Mean
:17.15
3rd Qu.:48.50
3rd Qu.:18.70
Max.
:59.60
Max.
:21.50
NA's
:2
NA's
:2
sex
year
female:165
Min.
:2007
male :168
1st Qu.:2007
NA's : 11
Median :2008
Mean
:2008
3rd Qu.:2009
Max.
:2009
Data cleaning: there are NA values, which need to be removed as shown below.
# Drop every row that contains at least one NA, overwrite `penguins` with the
# result, then print the summary of the cleaned data.
penguins <- na.omit(penguins)
summary(penguins)
##
##
##
##
##
##
##
##
##
##
##
##
##
##
species
Adelie
:146
Chinstrap: 68
Gentoo
:119
island
Biscoe
:163
Dream
:123
Torgersen: 47
flipper_length_mm
Min.
:172
1st Qu.:190
Median :197
Mean
:201
3rd Qu.:213
Max.
:231
body_mass_g
Min.
:2700
1st Qu.:3550
Median :4050
Mean
:4207
3rd Qu.:4775
Max.
:6300
bill_length_mm bill_depth_mm
Min.
:32.10
Min.
:13.10
1st Qu.:39.50
1st Qu.:15.60
Median :44.50
Median :17.30
Mean
:43.99
Mean
:17.16
3rd Qu.:48.60
3rd Qu.:18.70
Max.
:59.60
Max.
:21.50
sex
year
female:165
Min.
:2007
male :168
1st Qu.:2007
Median :2008
Mean
:2008
3rd Qu.:2009
Max.
:2009
The data is now free of NA values. Complete cases:
# Keep only the rows that have no missing value in any column, overwrite
# `penguins` with the result, then print its summary.
penguins <- penguins[complete.cases(penguins), ]
summary(penguins)
##
##
##
##
##
##
##
##
##
##
##
species
Adelie
:146
Chinstrap: 68
Gentoo
:119
island
Biscoe
:163
Dream
:123
Torgersen: 47
flipper_length_mm body_mass_g
Min.
:172
Min.
:2700
1st Qu.:190
1st Qu.:3550
Median :197
Median :4050
bill_length_mm bill_depth_mm
Min.
:32.10
Min.
:13.10
1st Qu.:39.50
1st Qu.:15.60
Median :44.50
Median :17.30
Mean
:43.99
Mean
:17.16
3rd Qu.:48.60
3rd Qu.:18.70
Max.
:59.60
Max.
:21.50
sex
year
female:165
Min.
:2007
male :168
1st Qu.:2007
Median :2008
##
##
##
Mean
:201
3rd Qu.:213
Max.
:231
Mean
:4207
3rd Qu.:4775
Max.
:6300
Mean
:2008
3rd Qu.:2009
Max.
:2009
remove duplicate cases
# Remove duplicated rows with dplyr::distinct(), overwrite `penguins` with the
# result, then print its summary.
penguins <- distinct(penguins)
summary(penguins)
##
##
##
##
##
##
##
##
##
##
##
##
##
##
species
Adelie
:146
Chinstrap: 68
Gentoo
:119
island
Biscoe
:163
Dream
:123
Torgersen: 47
flipper_length_mm
Min.
:172
1st Qu.:190
Median :197
Mean
:201
3rd Qu.:213
Max.
:231
body_mass_g
Min.
:2700
1st Qu.:3550
Median :4050
Mean
:4207
3rd Qu.:4775
Max.
:6300
bill_length_mm bill_depth_mm
Min.
:32.10
Min.
:13.10
1st Qu.:39.50
1st Qu.:15.60
Median :44.50
Median :17.30
Mean
:43.99
Mean
:17.16
3rd Qu.:48.60
3rd Qu.:18.70
Max.
:59.60
Max.
:21.50
sex
year
female:165
Min.
:2007
male :168
1st Qu.:2007
Median :2008
Mean
:2008
3rd Qu.:2009
Max.
:2009
There were no duplicate cases. Remove missing values.
# Scatter plot of bill depth against bill length, coloured by species.
# A linear-model smoother is overlaid without its confidence band
# (se = FALSE); because `color = species` is set in the plot-level aes(),
# the smoother inherits that mapping as well.
#
# The original statement was not valid R: the assignment and pipe
# (`<- penguins %>%`) were missing after `scatter_plot`, `geom_point` was
# separated from its `()`, and the y-axis label was broken across two lines.
scatter_plot <- penguins %>%
  ggplot(aes(bill_length_mm, bill_depth_mm, color = species)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_brewer(palette = "Dark2") +
  labs(
    x = "Bill Length(mm)",
    y = "Bill Depth(mm)",
    color = "species"
  ) +
  theme_minimal()

# Print the plot object so that it is rendered.
scatter_plot
## `geom_smooth()` using formula = 'y ~ x'