Dennis Munderu | Freelancer Exploratory Sales Analysis For Electronic Shop

Exploratory Sales analysis for electronic shop

In [17]: import pandas as pd import os In [18]: df = pd.read_csv("C:\\Users\\user\\Desktop\\Data\\Sales_analysis\\Pandas-Data-Science-Tasks-master\\SalesAnalysis\\Sales_Data\\Sales_April_2019.csv") In [19]: df Order ID Product Quantity Ordered Price Each Order Date Purchase Address 0 176558 USB-C Charging Cable 2 11.95 04/19/19 08:46 917 1st St, Dallas, TX 75001 1 NaN NaN NaN NaN NaN NaN 2 176559 Bose SoundSport Headphones 1 99.99 04/07/19 22:30 682 Chestnut St, Boston, MA 02215 3 176560 Google Phone 1 600 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4 176560 Wired Headphones 1 11.99 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 ... ... ... ... ... ... ... 18378 194090 Google Phone 1 600 04/08/19 17:11 177 Jackson St, Los Angeles, CA 90001 18379 194091 AA Batteries (4-pack) 1 3.84 04/15/19 16:02 311 Forest St, Austin, TX 73301 18380 194092 AAA Batteries (4-pack) 2 2.99 04/28/19 14:36 347 Sunset St, San Francisco, CA 94016 18381 194093 AA Batteries (4-pack) 1 3.84 04/14/19 15:09 835 Lake St, Portland, OR 97035 18382 194094 Lightning Charging Cable 1 14.95 04/18/19 11:08 354 North St, Boston, MA 02215 Out[19]: 18383 rows × 6 columns Concat all files to a single csv In [20]: files =[file for file in os.listdir("C:\\Users\\user\\Desktop\\Data\\Sales_analysis\\Pandas-Data-Science-Tasks-master\\SalesAnalysis\\Sales_Data\\")] all_data = pd.DataFrame() for file in files: df = pd.read_csv("C:\\Users\\user\\Desktop\\Data\\Sales_analysis\\Pandas-Data-Science-Tasks-master\\SalesAnalysis\\Sales_Data\\" + file) all_data = pd.concat([all_data, df]) all_data.to_csv("comp_data.csv", index = False) In [21]: comp_data = pd.read_csv("comp_data.csv") comp_data.sample(10) Order ID Product Quantity Ordered Price Each Order Date Purchase Address 80832 226316 AAA Batteries (4-pack) 1 2.99 07/28/19 19:32 850 Willow St, New York City, NY 10001 38205 303112 USB-C Charging Cable 1 11.95 12/18/19 11:13 640 Hill St, Boston, MA 02215 106332 163020 34in Ultrawide Monitor 1 379.99 03/15/19 09:27 851 Hill St, Los Angeles, CA 90001 172484 276238 LG Dryer 1 600.0 10/01/19 21:00 64 Pine St, Austin, TX 73301 178009 250877 27in FHD Monitor 1 149.99 09/23/19 17:01 854 Dogwood St, San Francisco, CA 94016 117712 173893 Flatscreen TV 1 300 03/29/19 22:53 139 Johnson St, New York City, NY 10001 39753 304590 AAA Batteries (4-pack) 2 2.99 12/07/19 15:59 819 West St, Los Angeles, CA 90001 162644 266871 Apple Airpods Headphones 1 150 10/02/19 06:17 426 Walnut St, Los Angeles, CA 90001 131372 204433 Apple Airpods Headphones 1 150 05/03/19 13:46 846 West St, San Francisco, CA 94016 6579 182843 Apple Airpods Headphones 1 150 04/27/19 06:18 354 6th St, New York City, NY 10001 Out[21]: In [22]: df = comp_data.copy() In [23]: df.shape Out[23]: (186850, 6) In [24]: df.info() RangeIndex: 186850 entries, 0 to 186849 Data columns (total 6 columns): # Column Non-Null Count --- ------------------0 Order ID 186305 non-null 1 Product 186305 non-null 2 Quantity Ordered 186305 non-null 3 Price Each 186305 non-null 4 Order Date 186305 non-null 5 Purchase Address 186305 non-null dtypes: object(6) memory usage: 8.6+ MB Dtype ----object object object object object object In [25]: df.isna().sum() Out[25]: Order ID Product Quantity Ordered Price Each Order Date Purchase Address dtype: int64 - In [26]: df[df.isna().any(axis = 1)] Order ID Product Quantity Ordered Price Each Order Date Purchase Address 1 NaN NaN NaN NaN NaN NaN 356 NaN NaN NaN NaN NaN NaN 735 NaN NaN NaN NaN NaN NaN 1433 NaN NaN NaN NaN NaN NaN 1553 NaN NaN NaN NaN NaN NaN ... ... ... ... ... ... ... 185176 NaN NaN NaN NaN NaN NaN 185438 NaN NaN NaN NaN NaN NaN 186042 NaN NaN NaN NaN NaN NaN 186548 NaN NaN NaN NaN NaN NaN 186826 NaN NaN NaN NaN NaN NaN Out[26]: 545 rows × 6 columns In [27]: df = df.dropna() In [28]: df.shape Out[28]: (186305, 6) In [29]: df.info() Index: 186305 entries, 0 to 186849 Data columns (total 6 columns): # Column Non-Null Count --- ------------------0 Order ID 186305 non-null 1 Product 186305 non-null 2 Quantity Ordered 186305 non-null 3 Price Each 186305 non-null 4 Order Date 186305 non-null 5 Purchase Address 186305 non-null dtypes: object(6) memory usage: 9.9+ MB Dtype ----object object object object object object In [ ]: Cleaning data for analysis In [30]: df['Order Date'] = pd.to_datetime(df['Order Date']) # gives error C:\Users\user\AppData\Local\Temp\ipykernel_7140\-.py:1: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To en sure parsing is consistent and as-expected, please specify a format. df['Order Date'] = pd.to_datetime(df['Order Date']) # gives error --------------------------------------------------------------------------DateParseError Traceback (most recent call last) Cell In[30], line 1 ----> 1 df['Order Date'] = pd.to_datetime(df['Order Date']) File ~\anaconda3\Lib\site-packages\pandas\core\tools\datetimes.py:1050, in to_datetime(arg, errors, dayfirst, yearfirst, utc, format, exact, unit, infer_datetime_format, origin, ca che) 1048 result = arg.map(cache_array) 1049 else: -> 1050 values = convert_listlike(arg._values, format) 1051 result = arg._constructor(values, index=arg.index, name=arg.name) 1052 elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)): File ~\anaconda3\Lib\site-packages\pandas\core\tools\datetimes.py:455, in _convert_listlike_datetimes(arg, format, name, utc, unit, errors, dayfirst, yearfirst, exact) 452 if format is not None and format != "mixed": 453 return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) --> 455 result, tz_parsed = objects_to_datetime64ns( 456 arg, 457 dayfirst=dayfirst, 458 yearfirst=yearfirst, 459 utc=utc, 460 errors=errors, 461 allow_object=True, 462 ) 464 if tz_parsed is not None: 465 # We can take a shortcut since the datetime64 numpy array 466 # is in UTC 467 dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) File ~\anaconda3\Lib\site-packages\pandas\core\arrays\datetimes.py:2177, in objects_to_datetime64ns(data, dayfirst, yearfirst, utc, errors, allow_object) 2174 # if str-dtype, convert 2175 data = np.array(data, copy=False, dtype=np.object_) -> 2177 result, tz_parsed = tslib.array_to_datetime( 2178 data, 2179 errors=errors, 2180 utc=utc, 2181 dayfirst=dayfirst, 2182 yearfirst=yearfirst, 2183 ) 2185 if tz_parsed is not None: 2186 # We can take a shortcut since the datetime64 numpy array 2187 # is in UTC 2188 # Return i8 values to denote unix timestamps 2189 return result.view("i8"), tz_parsed File ~\anaconda3\Lib\site-packages\pandas\_libs\tslib.pyx:402, in pandas._libs.tslib.array_to_datetime() File ~\anaconda3\Lib\site-packages\pandas\_libs\tslib.pyx:551, in pandas._libs.tslib.array_to_datetime() File ~\anaconda3\Lib\site-packages\pandas\_libs\tslib.pyx:516, in pandas._libs.tslib.array_to_datetime() File ~\anaconda3\Lib\site-packages\pandas\_libs\tslibs\conversion.pyx:557, in pandas._libs.tslibs.conversion.convert_str_to_tsobject() File ~\anaconda3\Lib\site-packages\pandas\_libs\tslibs\parsing.pyx:329, in pandas._libs.tslibs.parsing.parse_datetime_string() File ~\anaconda3\Lib\site-packages\pandas\_libs\tslibs\parsing.pyx:658, in pandas._libs.tslibs.parsing.dateutil_parse() DateParseError: Unknown datetime string format, unable to parse: Order Date, at position 517 In [31]: df.iloc[515:5120,:] Order ID Product Quantity Ordered Price Each Order Date Purchase Address 517 177053 Wired Headphones 1 11.99 04/24/19 20:45 5 Adams St, Boston, MA 02215 518 177054 Apple Airpods Headphones 1 150 04/09/19 19:18 800 Jackson St, Atlanta, GA 30301 519 Order ID Product Quantity Ordered Price Each Order Date Purchase Address 520 177055 Lightning Charging Cable 1 14.95 04/09/19 12:37 59 Forest St, Atlanta, GA 30301 521 177056 Flatscreen TV 1 300 04/20/19 23:52 180 9th St, San Francisco, CA 94016 ... ... ... ... ... ... ... 5129 181456 AA Batteries (4-pack) 2 3.84 04/02/19 21:56 64 Forest St, Portland, OR 97035 5130 181457 AAA Batteries (4-pack) 1 2.99 04/06/19 13:47 754 Willow St, Dallas, TX 75001 5131 181458 Wired Headphones 1 11.99 04/28/19 12:28 577 Hill St, New York City, NY 10001 5132 181459 27in FHD Monitor 1 149.99 04/05/19 09:39 147 Maple St, Boston, MA 02215 5133 181460 Wired Headphones 1 11.99 04/01/19 09:26 892 8th St, San Francisco, CA 94016 Out[31]: 4605 rows × 6 columns In [ ]: # column names inputed as data entries while concat In [32]: df.loc[df['Order ID'] == 'Order ID'].shape Out[32]: (355, 6) In [33]: df.loc[df['Quantity Ordered'] == 'Quantity Ordered'].shape Out[33]: (355, 6) error_index = df.loc[df['Order ID'] == 'Order ID'].index In [34]: In [35]: df.drop(error_index, inplace = True) In [36]: df.loc[df['Order ID'] == 'Order ID'].shape Out[36]: #all dropped (0, 6) In [ ]: #Trying to convert to datetime again In [37]: df['Order Date'] = pd.to_datetime(df['Order Date']) C:\Users\user\AppData\Local\Temp\ipykernel_7140\-.py:1: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To en sure parsing is consistent and as-expected, please specify a format. df['Order Date'] = pd.to_datetime(df['Order Date']) In [38]: df['Order Date'].sample(5) Out[38]: -:20:-:41:-:07:-:26:-:37:00 Name: Order Date, dtype: datetime64[ns] In [39]: df['Month'] = df['Order Date'].dt.month df['Month'] Out[39]: 0 2 3 4 5 4 4 4 4 4 .- Name: Month, Length: 185950, dtype: int32 In [40]: df['Month'] = pd.to_datetime(df['Month'], format='%m').dt.month_name().str.slice(stop=3) df['Month'] Out[40]: 0 2 3 4 5 Apr Apr Apr Apr Apr ... 186845 Sep 186846 Sep 186847 Sep 186848 Sep 186849 Sep Name: Month, Length: 185950, dtype: object In [41]: df.info() Index: 185950 entries, 0 to 186849 Data columns (total 7 columns): # Column Non-Null Count --- ------------------0 Order ID 185950 non-null 1 Product 185950 non-null 2 Quantity Ordered 185950 non-null 3 Price Each 185950 non-null 4 Order Date 185950 non-null 5 Purchase Address 185950 non-null 6 Month 185950 non-null dtypes: datetime64[ns](1), object(6) memory usage: 11.3+ MB Dtype ----object object object object datetime64[ns] object object In [ ]: #df1 = df.copy() In [42]: df['Quantity Ordered'] = pd.to_numeric(df['Quantity Ordered']) df['Price Each'] = pd.to_numeric(df['Price Each']) df.info() Index: 185950 entries, 0 to 186849 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ----------------------0 Order ID 185950 non-null object 1 Product 185950 non-null object 2 Quantity Ordered 185950 non-null int64 3 Price Each 185950 non-null float64 4 Order Date 185950 non-null datetime64[ns] 5 Purchase Address 185950 non-null object 6 Month 185950 non-null object dtypes: datetime64[ns](1), float64(1), int64(1), object(4) memory usage: 11.3+ MB In [43]: df['Purchase Address'].sample(20) Out[43]: - North St, Los Angeles,-th St, Boston,- Lincoln St, New York City,-th St, Boston,- Johnson St, Seattle,- Lakeview St, Atlanta,- Forest St, Los Angeles,- Lake St, Portland,- Jefferson St, Boston,-th St, New York City,- Elm St, New York City,- Lakeview St, San Francisco,-st St, San Francisco,- Dogwood St, San Francisco,- Chestnut St, Boston,-th St, Los Angeles,- Madison St, San Francisco,-th St, San Francisco,- Lake St, Austin,- Ridge St, New York City, Name: Purchase Address, dtype: object CA MA NY MA WA GA CA ME MA NY NY CA CA CA MA CA CA CA TX NY - In [44]: df[['St Address','City','Zip Code']] = df['Purchase Address'].str.split(',',expand = True) df.sample(5) Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month St Address City Zip Code 170035 273914 Bose SoundSport Headphones 1 99.99 -:18:00 760 Hill St, San Francisco, CA 94016 Oct 760 Hill St San Francisco CA 94016 97817 215789 Flatscreen TV 1 -:46:00 972 Main St, Seattle, WA 98101 Jun 972 Main St 129481 202635 USB-C Charging Cable 1 -:53:00 550 North St, San Francisco, CA 94016 May 550 North St San Francisco CA 94016 91706 209982 27in 4K Gaming Monitor 1 -:31:00 315 Forest St, New York City, NY 10001 Jun 315 Forest St New York City NY 10001 46828 311375 USB-C Charging Cable 1 395 Madison St, Los Angeles, CA 90001 Dec 395 Madison St Los Angeles CA 90001 Out[44]: 11.95 11.95 -:36:00 Seattle WA 98101 In [45]: df.info() Index: 185950 entries, 0 to 186849 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ----------------------0 Order ID 185950 non-null object 1 Product 185950 non-null object 2 Quantity Ordered 185950 non-null int64 3 Price Each 185950 non-null float64 4 Order Date 185950 non-null datetime64[ns] 5 Purchase Address 185950 non-null object 6 Month 185950 non-null object 7 St Address 185950 non-null object 8 City 185950 non-null object 9 Zip Code 185950 non-null object dtypes: datetime64[ns](1), float64(1), int64(1), object(7) memory usage: 15.6+ MB In [46]: df.drop('Purchase Address',axis = 1, inplace = True) In [ ]: # how much was earned on the best month In [ ]: #df1 = df.copy() In [47]: df.insert(4, 'Total Price', (df['Quantity Ordered'] * df['Price Each'])) In [ ]: In [48]: best_month = df[['Month','Total Price']].groupby('Month', as_index = False).sum().sort_values(by= 'Total Price', ascending= False) best_month Month Total Price 2 Dec - 10 Oct - 0 Apr - 9 Nov - 8 May - 7 Mar - 5 Jul - 6 Jun - 1 Aug - 3 Feb - 11 Sep - 4 Jan - Out[48]: In [ ]: # December was the best month with a total sales of- (we can visualise these) # An assumption is that festivities affect sales positively #This assumption is backed by january being the lowest month. i.e purchasing power of consumer is negatively affected after the festivities In [ ]: In [49]: import matplotlib.pyplot as plt In [50]: best_month.plot.bar(x= 'Month', y= 'Total Price', color= 'orange') plt.xlabel('Month') plt.ylabel('sales($)') plt.title('Total sales per moth') plt.show() In [ ]: In [ ]: # How sales per city compare In [51]: city_sales = df[['City','Total Price']].groupby('City', as_index= False).sum().sort_values(by= 'Total Price', ascending= False) In [ ]: # San Francisco was the best perfoming and Austin wa the last In [52]: city_sales.plot.pie(figsize= (13,6), x='City', y='Total Price', labels = city_sales['City'], autopct = '%1.2f%%') plt.title('Sales perfomance by city', color = 'red') plt.legend().set_visible(False) In [53]: df1 = df.copy() In [87]: # Top 5 best peforming product by occourrence on orders In [54]: df_copy = df[df['Order ID'].duplicated(keep= False)] df_copy['Grouped'] = df_copy.groupby('Order ID')['Product'].transform(lambda x: ','.join(x)) df_copy = df[['Order ID','Product']].drop_duplicates() df_copy C:\Users\user\AppData\Local\Temp\ipykernel_7140\-.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_copy['Grouped'] = df_copy.groupby('Order ID')['Product'].transform(lambda x: ','.join(x)) Order ID Product 0 176558 USB-C Charging Cable 2 176559 Bose SoundSport Headphones 3 176560 Google Phone 4 176560 Wired Headphones 5 176561 Wired Headphones ... ... ... 186845 259353 AAA Batteries (4-pack) 186846 259354 iPhone 186847 259355 iPhone 186848 259356 34in Ultrawide Monitor 186849 259357 USB-C Charging Cable Out[54]: 185639 rows × 2 columns In [58]: df_copy['Product'].unique() Out[58]: array(['USB-C Charging Cable', 'Bose SoundSport Headphones', 'Google Phone', 'Wired Headphones', 'Macbook Pro Laptop', 'Lightning Charging Cable', '27in 4K Gaming Monitor', 'AA Batteries (4-pack)', 'Apple Airpods Headphones', 'AAA Batteries (4-pack)', 'iPhone', 'Flatscreen TV', '27in FHD Monitor', '20in Monitor', 'LG Dryer', 'ThinkPad Laptop', 'Vareebadd Phone', 'LG Washing Machine', '34in Ultrawide Monitor'], dtype=object) In [65]: df_copy_1 = df_copy['Product'].value_counts().to_frame() In [67]: df_copy_1 = df_copy_1.reset_index() df_copy_1 Product count 0 USB-C Charging Cable 21855 1 Lightning Charging Cable 21604 2 AAA Batteries (4-pack) 20593 3 AA Batteries (4-pack) 20542 4 Wired Headphones 18847 5 Apple Airpods Headphones 15525 6 Bose SoundSport Headphones 13298 7 27in FHD Monitor 7498 8 iPhone 6840 9 27in 4K Gaming Monitor 6225 10 34in Ultrawide Monitor 6174 11 Google Phone 5522 12 Flatscreen TV 4794 13 Macbook Pro Laptop 4721 14 ThinkPad Laptop 4126 15 20in Monitor 4098 16 Vareebadd Phone 2065 17 LG Washing Machine 666 18 LG Dryer 646 Out[67]: In [69]: df_copy_1 = df_copy_1.head(5) df_copy_1 Product count 0 USB-C Charging Cable 21855 1 Lightning Charging Cable 21604 2 AAA Batteries (4-pack) 20593 3 AA Batteries (4-pack) 20542 4 Wired Headphones 18847 Out[69]: In [80]: df_copy_1.plot.pie(figsize= (13,6), x='Product', y='count', labels = df_copy_1['Product'], autopct = '%1.2f%%') plt.title('Product perfomance by order', color = 'darkgreen') plt.legend().set_visible(False) In [81]: # top 5 product perfomance by quantity ordered In [82]: df Order ID Product Quantity Ordered Price Each Total Price Order Date Month St Address City Zip Code 0 176558 USB-C Charging Cable 2 11.95 23.90 -:46:00 Apr 917 1st St Dallas TX 75001 2 176559 Bose SoundSport Headphones 1 99.99 99.99 -:30:00 Apr 682 Chestnut St Boston MA 02215 3 176560 Google Phone 1 600.00 -:38:00 Apr 669 Spruce St Los Angeles CA 90001 4 176560 Wired Headphones 1 11.99 11.99 -:38:00 Apr 669 Spruce St Los Angeles CA 90001 5 176561 Wired Headphones 1 11.99 11.99 -:27:00 Apr 333 8th St Los Angeles CA 90001 ... ... ... ... ... ... ... ... ... ... ... 186845 259353 AAA Batteries (4-pack) 3 2.99 8.97 -:56:00 Sep 840 Highland St Los Angeles CA 90001 186846 259354 iPhone 1 700.00 -:00:00 Sep 216 Dogwood St San Francisco CA 94016 186847 259355 iPhone 1 700.00 -:39:00 Sep 220 12th St San Francisco CA 94016 186848 259356 34in Ultrawide Monitor 1 379.99 -:30:00 Sep 511 Forest St San Francisco CA 94016 186849 259357 USB-C Charging Cable 1 11.95 Sep 250 Meadow St San Francisco CA 94016 Out[82]: 11.95 -:18:00 185950 rows × 10 columns In [90]: df_copy_2 = df.groupby(['Product'], as_index= False)['Quantity Ordered'].sum().sort_values(by= 'Quantity Ordered', ascending= False).head(5) In [ ]: In [91]: df_copy_2.plot.pie(figsize= (13,6), x='Product', y='Quantity Ordered', labels = df_copy_2['Product'], autopct = '%1.2f%%') plt.title('Product perfomance by quantity ordered', color = 'darkgreen') plt.legend().set_visible(False) In [92]: # we can infer that triple A batteries are the most commonly bought items by volume while usb charging cable are a popular item among customers hence appear on most orders. In [ ]: In [ ]: In [ ]: In [ ]: In [ ]: