-- Data Cleaning --
select *
from layoffs;
-- 1. Remove duplicates
-- 2. Standardize the data
-- 3. Null values or blank values
-- 4. Remove rows/column
Create table layoffs_staging
like layoffs;
select *
from layoffs_staging;
insert into layoffs_staging
select * from layoffs;
select *,
row_number() over(
partition by company,location, industry, total_laid_off, percentage_laid_off, 'date',
stage, country, funds_raised) as row_num
from layoffs_staging;
With duplicate_cte As
(
select *,
row_number() over(
partition by company, location, industry, total_laid_off, percentage_laid_off, 'date', stage,
country, funds_raised) as row_num
from layoffs_staging
)
select * from duplicate_cte
where row_num > 1;
Create table layoffs_staging2
like layoffs_staging;
select * from layoffs_staging2;
ALTER TABLE layoffs_staging2
ADD row_num int;
INSERT INTO layoffs_staging2
select *,
row_number() over(
partition by company, location, industry, total_laid_off, percentage_laid_off, 'date', stage,
country, funds_raised) as row_num
from layoffs_staging;
DELETE
FROM layoffs_staging2
where row_num > 1;
SET SQL_SAFE_UPDATES = 0;
ALTER TABLE layoffs_staging2
RENAME COLUMN company TO company;
select * from layoffs_staging2;
-- standardizing data: this means finding issue in your data and then fixing it --
SELECT company, TRIM(company)
from layoffs_staging2;
UPDATE layoffs_staging2
SET company = TRIM(company);
select *
from layoffs_staging2
WHERE industry like 'crypto';
UPDATE layoffs_staging2
SET industry = 'crypto'
where industry like 'crypto%';
select distinct industry
from layoffs_staging2;
select distinct location
from layoffs_staging2
order by 1;
select distinct country
from layoffs_staging2
order by 1;
select distinct country, trim(trailing ' . ' from country)
from layoffs_staging2
order by 1;
update layoffs_staging2
set country = trim(trailing ' . ' from country);
-- changing date data type from text to date data_type --
select date,
str_to_date(date, '%m/%d/%Y')
from layoffs_staging2;
UPDATE layoffs_staging2
SET date = str_to_date(date, '%m/%d/%Y');
select * from layoffs_staging2;
Alter table layoffs_staging2
modify date date;
select *
from layoffs_staging2
where industry is null
or industry = ' ';
select *
from layoffs_staging2
where company = 'airbnb';
update layoffs_staging2
set industry = null
where industry = ' ';
select t1.industry, t2.industry
from layoffs_staging2 t1
join layoffs_staging2 t2
on t1.company = t2.company
where (t1.industry is null or t1.industry = ' ')
and t2.industry is not null;
update layoffs_staging2 t1
join layoffs_staging2 t2
on t1.company = t2.company
set t1.industry = t2. industry
where t1.industry is null
and t2.industry is not null;
alter table layoffs_staging2
drop row_num;