Zainab Munir | Freelancer Portfolio Item #424816

# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the read-only "../input/" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session /kaggle/input/exploratory-data-analysis-on-netflix-data/netflix_titles_2021.csv /kaggle/input/exploratory-data-analysis-on-netflix-data/netflix img.png df=pd.read_csv("/kaggle/input/exploratory-data-analysis-on-netflix-data/netflix_titles_2021.csv") df.head() 0 1 2 3 4 show_id s1 s2 s3 s4 s5 type Movie TV Show TV Show TV Show TV Show title Dick Johnson Is Dead Blood & Water Ganglands Jailbirds New Orleans Kota Factory director Kirsten Johnson NaN Julien Leclercq NaN NaN 0 1 2 3 4 cast NaN Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... NaN Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... 0 date_added September 25, 2021 release_year rating 2020 PG-13 duration 90 min \ country United States South Africa NaN NaN India \ \ 1 2 3 4 September September September September 24, 24, 24, 24, - - TV-MA TV-MA TV-MA TV-MA 0 1 2 3 4 listed_in Documentaries International TV Shows, TV Dramas, TV Mysteries Crime TV Shows, International TV Shows, TV Act... Docuseries, Reality TV International TV Shows, Romantic TV Shows, TV ... 
0 1 2 3 4 description As her father nears the end of his life, filmm... After crossing paths at a party, a Cape Town t... To protect his family from a powerful drug lor... Feuds, flirtations and toilet talk go down amo... In a city of coaching centers known to train I... df.info() RangeIndex: 8807 entries, 0 to 8806 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------------------- ----0 show_id 8807 non-null object 1 type 8807 non-null object 2 title 8807 non-null object 3 director 6173 non-null object 4 cast 7982 non-null object 5 country 7976 non-null object 6 date_added 8797 non-null object 7 release_year 8807 non-null int64 8 rating 8803 non-null object 9 duration 8804 non-null object 10 listed_in 8807 non-null object 11 description 8807 non-null object dtypes: int64(1), object(11) memory usage: 825.8+ KB df["type"]- Movie TV Show TV Show TV Show TV Show ... Movie 2 Seasons 1 Season 1 Season 2 Seasons \ 8803 TV Show 8804 Movie 8805 Movie 8806 Movie Name: type, Length: 8807, dtype: object #What types of shows or movies are uploaded on Netflix? 
df["type"].unique() array(['Movie', 'TV Show'], dtype=object) #What is the correlation between features?: from sklearn.preprocessing import LabelEncoder le = LabelEncoder() df["encoded_rating"]=le.fit_transform(df["rating"]) import pandas as pd import seaborn as sns import matplotlib.pyplot as plt numerical_features = ['release_year', 'encoded_rating', 'duration'] correlation_matrix = df[numerical_features].corr() sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm') plt.title('Correlation Matrix of Netflix Features') plt.show() -------------------------------------------------------------------------ValueError Traceback (most recent call last) /tmp/ipykernel_31/-.py in () 7 numerical_features = ['release_year', 'encoded_rating', 'duration'] 8 ----> 9 correlation_matrix = df[numerical_features].corr() 10 11 sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm') /usr/local/lib/python3.11/dist-packages/pandas/core/frame.py in corr(self, method, min_periods, numeric_only) 11047 cols = data.columns 11048 idx = cols.copy() > 11049 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False- if method == "pearson": /usr/local/lib/python3.11/dist-packages/pandas/core/frame.py in to_numpy(self, dtype, copy, na_value) 1991 if dtype is not None: 1992 dtype = np.dtype(dtype) -> 1993 result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) 1994 if result.dtype is not dtype: 1995 result = np.asarray(result, dtype=dtype) /usr/local/lib/python3.11/dist-packages/pandas/core/internals/managers .py in as_array(self, dtype, copy, na_value) 1692 arr.flags.writeable = False 1693 else: -> 1694 arr = self._interleave(dtype=dtype, na_value=na_value) 1695 # The underlying data was copied within _interleave, so no need 1696 # to further copy if copy=True or setting na_value /usr/local/lib/python3.11/dist-packages/pandas/core/internals/managers .py in _interleave(self, dtype, na_value) 1751 else: 1752 arr = blk.get_values(dtype) -> 1753 result[rl.indexer] = arr 
1754 itemmask[rl.indexer] = 1 1755 ValueError: could not convert string to float: '90 min' #Which shows are most watched on Netflix? df["type"].iloc[0] #OR : len(df[df["type"]=="Movie"]) #What is the distribution of ratings? df["rating"].unique() array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R', 'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', nan, 'TV-Y7-FV', 'UR'], dtype=object) import matplotlib.pyplot as plt import seaborn as sns plt.figure(figsize=(13,8)) sns.histplot(df['rating'],bins=30) plt.title=("Distribution Of ratings") plt.xlabel=("rating") plt.ylabel=("frequency") plt.show() #distribution of rating is positively skewed. /usr/local/lib/python3.11/dist-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. with pd.option_context('mode.use_inf_as_na', True): #Which has the highest rating: TV shows or movies? avg_ratings= df.groupby('type')['encoded_rating'].mean() hig_avg_ratings=avg_ratings.idxmax() print(f"The highest rating is of:{hig_avg_ratings}") The highest rating is of:TV Show # What is the best month for releasing content? avg_ratings=df.groupby("date_added")['encoded_rating'].mean() avg_ratings.sort_values(ascending=False) best_month= avg_ratings.idxmax() print(f"The best_month for releasing content is :{best_month}") The best_month for releasing content is :January 26, 2017 #Which genres are most watched on Netflix? genre_no=df["listed_in"].value_counts() h_genre=genre_no.idxmax() print(f"most watched genre on Netflix is:{h_genre}") most watched genre on Netflix is:Dramas, International Movies #How many movies have been released over the years? df["type"].value_counts() type Movie 6131 TV Show 2676 Name: count, dtype: int64 #How many movies were made per year? 
movies_per_year=df.groupby('release_year')['show_id'].count() movies_per_year release_year- ..- Name: show_id, Length: 74, dtype: int64 #What is the show ID and director for 'House of Cards'? show_id=df[(df["title"]=="House of Cards")]["show_id"].iloc[0] director=df[(df["title"]=="House of Cards")]["director"].iloc[0] show_id,director ('s1060', nan) ##OR house_of_cards_data=df[(df["title"]=="House of Cards")] show_id = house_of_cards_data['show_id'].iloc[0] # Assuming show_id is unique director = house_of_cards_data['director'].iloc[0] # Assuming director is available print(f"Show ID for 'House of Cards': {show_id}") print(f"Director for 'House of Cards': {director}") Show ID for 'House of Cards': s1060 Director for 'House of Cards': nan #List all movies released in 2000. # Filter the DataFrame movies_2000 = df[df['release_year'] == 2000] movies_2000 # Print the titles for title in movies_2000['title']: print(title) The Nutty Professor II: The Klumps Space Cowboys The Original Kings of Comedy Charlie's Angels Snow Day Battlefield Earth The Whole Nine Yards Moesha Rugrats in Paris: The Movie Monty Python: Before the Flying Circus Fiza Phir Bhi Dil Hai Hindustani Pokémon: Indigo League Star Trek: Voyager Joseph: King of Dreams 28 Days American Psycho An American Tail: The Treasures of Manhattan Island Billy Elliot Center Stage Chal Mere Bhai Crouching Tiger, Hidden Dragon Dragonheart: A New Beginning Final Destination Hamara Dil Aapke Paas Hai How the Grinch Stole Christmas Kya Kehna Little Nicky Papa the Great Power Rangers Lightspeed Rescue Pukar Scary Movie Scream 3 The Art of War The Flintstones in Viva Rock Vegas What Lies Beneath Where the Money Is #Show only the titles of TV shows released in India. 
Movies_In=df[(df["type"]=="TV Show") & (df["country"]=="India")] for title in Movies_In["title"]: print(title) Kota Factory Chhota Bheem Dharmakshetra Raja Rasoi Aur Anya Kahaniyan Stories by Rabindranath Tagore The Creative Indians Navarasa Alma Matters Sab Jholmaal Hai Lava Ka Dhaava The Big Day Bombay Begums Zindagi in Short Pitta Kathalu Mighty Little Bheem: Kite Festival Regiment Diaries Paava Kadhaigal Bhaag Beanie Bhaag Fabulous Lives of Bollywood Wives Mismatched Bad Boy Billionaires: India Masaba Masaba Little Singham Betaal ChuChu TV Nursery Rhymes & Kids Songs (Hindi) Hasmukh Akbar Birbal Ladies Up She Mighty Little Bheem: Festival of Colors Taj Mahal 1989 Jamtara - Sabka Number Ayega Little Things Mighty Little Bheem: Diwali College Romance Engineering Girls Girls Hostel Inmates Bard of Blood Typewriter Leila Delhi Crime Cricket Fever: Mumbai Indians Selection Day GHOUL 21 Sarfarosh: Saragarhi 1897 7 (Seven) Agent Raghav Anjaan: Rural Myths Anjaan: Special Crimes Unit Badalte Rishton Ki Dastaan Bh Se Bhade Bhaage Re Mann Classic Legends Darr Sabko Lagta Hai Devlok with Devdutt Pattanaik Fear Files... Har Mod Pe Darr Gabru: Hip Hop Revolution Gangs of Hassepur Jhansi Ki Rani Khan: No. 1 Crime Hunter Khelti Hai Zindagi Aankh Micholi Khotey Sikkey Maharakshak Devi Maharakshak: Aryan Mahi Way Midnight Misadventures With Mallika Dua Powder Pyaar Tune Kya Kiya Ramayan Razia Sultan Rishta.com Super Bheem Th Eena Meena Deeka Chase Comedy Show Thackeray The Calling The Golden Years with Javed Akhtar The House That Made Me Yeh Meri Family #Identify the top 10 directors who have contributed the most TV shows and movies to Netflix top_10=df["director"].value_counts().sort_values(ascending=False) top_10.head(10) director Rajiv Chilaka Raúl Campos, Jan Suter Marcus Raboy Suhas Kadav Jay Karas Cathy Garcia-Molina Youssef Chahine Jay Chapman Martin Scorsese - Steven Spielberg 11 Name: count, dtype: int64 #How many movies/TV shows has Tom Cruise been cast in? 
tom_df=df[df["cast"].str.contains("Tom Cruise",na= False)] tom_cruise_count=len(tom_df) print(f"Tom Cruise has been cast in {tom_cruise_count} movies/TV shows on Netflix.") Tom Cruise has been cast in 2 movies/TV shows on Netflix. #How many movies have a "TV-14" rating in Canada? df_movies=df[(df["rating"]=="TV-14")&(df["type"]=="Movie")&(df["country"]=="Canada")] len(df_movies) 13