# This Python 3 environment comes with many helpful analytics
libraries installed
# It is defined by the kaggle/python Docker image:
https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/"
directory
# For example, running this (by clicking run or pressing Shift+Enter)
will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/)
that gets preserved as output when you create a version using "Save &
Run All"
# You can also write temporary files to /kaggle/temp/, but they won't
be saved outside of the current session
/kaggle/input/exploratory-data-analysis-on-netflix-data/
netflix_titles_2021.csv
/kaggle/input/exploratory-data-analysis-on-netflix-data/netflix
img.png
df=pd.read_csv("/kaggle/input/exploratory-data-analysis-on-netflixdata/netflix_titles_2021.csv")
df.head()
0
1
2
3
4
show_id
s1
s2
s3
s4
s5
type
Movie
TV Show
TV Show
TV Show
TV Show
title
Dick Johnson Is Dead
Blood & Water
Ganglands
Jailbirds New Orleans
Kota Factory
director
Kirsten Johnson
NaN
Julien Leclercq
NaN
NaN
0
1
2
3
4
cast
NaN
Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...
Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...
NaN
Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...
0
date_added
September 25, 2021
release_year rating
2020 PG-13
duration
90 min
\
country
United States
South Africa
NaN
NaN
India
\
\
1
2
3
4
September
September
September
September
24,
24,
24,
24,
-
-
TV-MA
TV-MA
TV-MA
TV-MA
0
1
2
3
4
listed_in
Documentaries
International TV Shows, TV Dramas, TV Mysteries
Crime TV Shows, International TV Shows, TV Act...
Docuseries, Reality TV
International TV Shows, Romantic TV Shows, TV ...
0
1
2
3
4
description
As her father nears the end of his life, filmm...
After crossing paths at a party, a Cape Town t...
To protect his family from a powerful drug lor...
Feuds, flirtations and toilet talk go down amo...
In a city of coaching centers known to train I...
df.info()
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
#
Column
Non-Null Count Dtype
--- ------------------- ----0
show_id
8807 non-null
object
1
type
8807 non-null
object
2
title
8807 non-null
object
3
director
6173 non-null
object
4
cast
7982 non-null
object
5
country
7976 non-null
object
6
date_added
8797 non-null
object
7
release_year 8807 non-null
int64
8
rating
8803 non-null
object
9
duration
8804 non-null
object
10 listed_in
8807 non-null
object
11 description
8807 non-null
object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB
df["type"]-
Movie
TV Show
TV Show
TV Show
TV Show
...
Movie
2 Seasons
1 Season
1 Season
2 Seasons
\
8803
TV Show
8804
Movie
8805
Movie
8806
Movie
Name: type, Length: 8807, dtype: object
#What types of shows or movies are uploaded on Netflix?
df["type"].unique()
array(['Movie', 'TV Show'], dtype=object)
#What is the correlation between features?:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["encoded_rating"]=le.fit_transform(df["rating"])
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
numerical_features = ['release_year', 'encoded_rating', 'duration']
correlation_matrix = df[numerical_features].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Netflix Features')
plt.show()
-------------------------------------------------------------------------ValueError
Traceback (most recent call
last)
/tmp/ipykernel_31/-.py in ()
7 numerical_features = ['release_year', 'encoded_rating',
'duration']
8
----> 9 correlation_matrix = df[numerical_features].corr()
10
11 sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
/usr/local/lib/python3.11/dist-packages/pandas/core/frame.py in
corr(self, method, min_periods, numeric_only)
11047
cols = data.columns
11048
idx = cols.copy()
> 11049
mat = data.to_numpy(dtype=float, na_value=np.nan,
copy=False-
if method == "pearson":
/usr/local/lib/python3.11/dist-packages/pandas/core/frame.py in
to_numpy(self, dtype, copy, na_value)
1991
if dtype is not None:
1992
dtype = np.dtype(dtype)
-> 1993
result = self._mgr.as_array(dtype=dtype, copy=copy,
na_value=na_value)
1994
if result.dtype is not dtype:
1995
result = np.asarray(result, dtype=dtype)
/usr/local/lib/python3.11/dist-packages/pandas/core/internals/managers
.py in as_array(self, dtype, copy, na_value)
1692
arr.flags.writeable = False
1693
else:
-> 1694
arr = self._interleave(dtype=dtype,
na_value=na_value)
1695
# The underlying data was copied within
_interleave, so no need
1696
# to further copy if copy=True or setting na_value
/usr/local/lib/python3.11/dist-packages/pandas/core/internals/managers
.py in _interleave(self, dtype, na_value)
1751
else:
1752
arr = blk.get_values(dtype)
-> 1753
result[rl.indexer] = arr
1754
itemmask[rl.indexer] = 1
1755
ValueError: could not convert string to float: '90 min'
#Which shows are most watched on Netflix?
df["type"].iloc[0]
#OR :
len(df[df["type"]=="Movie"])
#What is the distribution of ratings?
df["rating"].unique()
array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', nan,
'TV-Y7-FV', 'UR'], dtype=object)
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(13,8))
sns.histplot(df['rating'],bins=30)
plt.title=("Distribution Of ratings")
plt.xlabel=("rating")
plt.ylabel=("frequency")
plt.show()
#distribution of rating is positively skewed.
/usr/local/lib/python3.11/dist-packages/seaborn/_oldcore.py:1119:
FutureWarning: use_inf_as_na option is deprecated and will be removed
in a future version. Convert inf values to NaN before operating
instead.
with pd.option_context('mode.use_inf_as_na', True):
#Which has the highest rating: TV shows or movies?
avg_ratings= df.groupby('type')['encoded_rating'].mean()
hig_avg_ratings=avg_ratings.idxmax()
print(f"The highest rating is of:{hig_avg_ratings}")
The highest rating is of:TV Show
# What is the best month for releasing content?
avg_ratings=df.groupby("date_added")['encoded_rating'].mean()
avg_ratings.sort_values(ascending=False)
best_month= avg_ratings.idxmax()
print(f"The best_month for releasing content is :{best_month}")
The best_month for releasing content is :January 26, 2017
#Which genres are most watched on Netflix?
genre_no=df["listed_in"].value_counts()
h_genre=genre_no.idxmax()
print(f"most watched genre on Netflix is:{h_genre}")
most watched genre on Netflix is:Dramas, International Movies
#How many movies have been released over the years?
df["type"].value_counts()
type
Movie
6131
TV Show
2676
Name: count, dtype: int64
#How many movies were made per year?
movies_per_year=df.groupby('release_year')['show_id'].count()
movies_per_year
release_year-
..-
Name: show_id, Length: 74, dtype: int64
#What is the show ID and director for 'House of Cards'?
show_id=df[(df["title"]=="House of Cards")]["show_id"].iloc[0]
director=df[(df["title"]=="House of Cards")]["director"].iloc[0]
show_id,director
('s1060', nan)
##OR
house_of_cards_data=df[(df["title"]=="House of Cards")]
show_id = house_of_cards_data['show_id'].iloc[0] # Assuming show_id
is unique
director = house_of_cards_data['director'].iloc[0] # Assuming
director is available
print(f"Show ID for 'House of Cards': {show_id}")
print(f"Director for 'House of Cards': {director}")
Show ID for 'House of Cards': s1060
Director for 'House of Cards': nan
#List all movies released in 2000.
# Filter the DataFrame
movies_2000 = df[df['release_year'] == 2000]
movies_2000
# Print the titles
for title in movies_2000['title']:
print(title)
The Nutty Professor II: The Klumps
Space Cowboys
The Original Kings of Comedy
Charlie's Angels
Snow Day
Battlefield Earth
The Whole Nine Yards
Moesha
Rugrats in Paris: The Movie
Monty Python: Before the Flying Circus
Fiza
Phir Bhi Dil Hai Hindustani
Pokémon: Indigo League
Star Trek: Voyager
Joseph: King of Dreams
28 Days
American Psycho
An American Tail: The Treasures of Manhattan Island
Billy Elliot
Center Stage
Chal Mere Bhai
Crouching Tiger, Hidden Dragon
Dragonheart: A New Beginning
Final Destination
Hamara Dil Aapke Paas Hai
How the Grinch Stole Christmas
Kya Kehna
Little Nicky
Papa the Great
Power Rangers Lightspeed Rescue
Pukar
Scary Movie
Scream 3
The Art of War
The Flintstones in Viva Rock Vegas
What Lies Beneath
Where the Money Is
#Show only the titles of TV shows released in India.
Movies_In=df[(df["type"]=="TV Show") & (df["country"]=="India")]
for title in Movies_In["title"]:
print(title)
Kota Factory
Chhota Bheem
Dharmakshetra
Raja Rasoi Aur Anya Kahaniyan
Stories by Rabindranath Tagore
The Creative Indians
Navarasa
Alma Matters
Sab Jholmaal Hai
Lava Ka Dhaava
The Big Day
Bombay Begums
Zindagi in Short
Pitta Kathalu
Mighty Little Bheem: Kite Festival
Regiment Diaries
Paava Kadhaigal
Bhaag Beanie Bhaag
Fabulous Lives of Bollywood Wives
Mismatched
Bad Boy Billionaires: India
Masaba Masaba
Little Singham
Betaal
ChuChu TV Nursery Rhymes & Kids Songs (Hindi)
Hasmukh
Akbar Birbal
Ladies Up
She
Mighty Little Bheem: Festival of Colors
Taj Mahal 1989
Jamtara - Sabka Number Ayega
Little Things
Mighty Little Bheem: Diwali
College Romance
Engineering Girls
Girls Hostel
Inmates
Bard of Blood
Typewriter
Leila
Delhi Crime
Cricket Fever: Mumbai Indians
Selection Day
GHOUL
21 Sarfarosh: Saragarhi 1897
7 (Seven)
Agent Raghav
Anjaan: Rural Myths
Anjaan: Special Crimes Unit
Badalte Rishton Ki Dastaan
Bh Se Bhade
Bhaage Re Mann
Classic Legends
Darr Sabko Lagta Hai
Devlok with Devdutt Pattanaik
Fear Files... Har Mod Pe Darr
Gabru: Hip Hop Revolution
Gangs of Hassepur
Jhansi Ki Rani
Khan: No. 1 Crime Hunter
Khelti Hai Zindagi Aankh Micholi
Khotey Sikkey
Maharakshak Devi
Maharakshak: Aryan
Mahi Way
Midnight Misadventures With Mallika Dua
Powder
Pyaar Tune Kya Kiya
Ramayan
Razia Sultan
Rishta.com
Super Bheem
Th Eena Meena Deeka Chase Comedy Show
Thackeray
The Calling
The Golden Years with Javed Akhtar
The House That Made Me
Yeh Meri Family
#Identify the top 10 directors who have contributed the most TV shows
and movies to Netflix
top_10=df["director"].value_counts().sort_values(ascending=False)
top_10.head(10)
director
Rajiv Chilaka
Raúl Campos, Jan Suter
Marcus Raboy
Suhas Kadav
Jay Karas
Cathy Garcia-Molina
Youssef Chahine
Jay Chapman
Martin Scorsese
-
Steven Spielberg
11
Name: count, dtype: int64
#How many movies/TV shows has Tom Cruise been cast in?
tom_df=df[df["cast"].str.contains("Tom Cruise",na= False)]
len(tom_df)
print(f"Tom Cruise has been cast in {tom_cruise_count} movies/TV shows
on Netflix.")
Tom Cruise has been cast in 2 movies/TV shows on Netflix.
#How many movies have a "TV-14" rating in Canada?
df_movies=df[(df["rating"]=="TV14")&(df["type"]=="Movie")&(df["country"]=="Canada")]
len(df_movies)
13