Exploratory Sales analysis for electronic shop
In [17]: import pandas as pd
import os
In [18]: df = pd.read_csv("C:\\Users\\user\\Desktop\\Data\\Sales_analysis\\Pandas-Data-Science-Tasks-master\\SalesAnalysis\\Sales_Data\\Sales_April_2019.csv")
In [19]: df
Order ID
Product
Quantity Ordered
Price Each
Order Date
Purchase Address
0
176558
USB-C Charging Cable
2
11.95
04/19/19 08:46
917 1st St, Dallas, TX 75001
1
NaN
NaN
NaN
NaN
NaN
NaN
2
176559
Bose SoundSport Headphones
1
99.99
04/07/19 22:30
682 Chestnut St, Boston, MA 02215
3
176560
Google Phone
1
600
04/12/19 14:38
669 Spruce St, Los Angeles, CA 90001
4
176560
Wired Headphones
1
11.99
04/12/19 14:38
669 Spruce St, Los Angeles, CA 90001
...
...
...
...
...
...
...
18378
194090
Google Phone
1
600
04/08/19 17:11
177 Jackson St, Los Angeles, CA 90001
18379
194091
AA Batteries (4-pack)
1
3.84
04/15/19 16:02
311 Forest St, Austin, TX 73301
18380
194092
AAA Batteries (4-pack)
2
2.99
04/28/19 14:36
347 Sunset St, San Francisco, CA 94016
18381
194093
AA Batteries (4-pack)
1
3.84
04/14/19 15:09
835 Lake St, Portland, OR 97035
18382
194094
Lightning Charging Cable
1
14.95
04/18/19 11:08
354 North St, Boston, MA 02215
Out[19]:
18383 rows × 6 columns
Concat all files to a single csv
In [20]: files =[file for file in os.listdir("C:\\Users\\user\\Desktop\\Data\\Sales_analysis\\Pandas-Data-Science-Tasks-master\\SalesAnalysis\\Sales_Data\\")]
all_data = pd.DataFrame()
for file in files:
df = pd.read_csv("C:\\Users\\user\\Desktop\\Data\\Sales_analysis\\Pandas-Data-Science-Tasks-master\\SalesAnalysis\\Sales_Data\\" + file)
all_data = pd.concat([all_data, df])
all_data.to_csv("comp_data.csv", index = False)
In [21]: comp_data = pd.read_csv("comp_data.csv")
comp_data.sample(10)
Order ID
Product
Quantity Ordered
Price Each
Order Date
Purchase Address
80832
226316
AAA Batteries (4-pack)
1
2.99
07/28/19 19:32
850 Willow St, New York City, NY 10001
38205
303112
USB-C Charging Cable
1
11.95
12/18/19 11:13
640 Hill St, Boston, MA 02215
106332
163020
34in Ultrawide Monitor
1
379.99
03/15/19 09:27
851 Hill St, Los Angeles, CA 90001
172484
276238
LG Dryer
1
600.0
10/01/19 21:00
64 Pine St, Austin, TX 73301
178009
250877
27in FHD Monitor
1
149.99
09/23/19 17:01
854 Dogwood St, San Francisco, CA 94016
117712
173893
Flatscreen TV
1
300
03/29/19 22:53
139 Johnson St, New York City, NY 10001
39753
304590
AAA Batteries (4-pack)
2
2.99
12/07/19 15:59
819 West St, Los Angeles, CA 90001
162644
266871
Apple Airpods Headphones
1
150
10/02/19 06:17
426 Walnut St, Los Angeles, CA 90001
131372
204433
Apple Airpods Headphones
1
150
05/03/19 13:46
846 West St, San Francisco, CA 94016
6579
182843
Apple Airpods Headphones
1
150
04/27/19 06:18
354 6th St, New York City, NY 10001
Out[21]:
In [22]: df = comp_data.copy()
In [23]: df.shape
Out[23]:
(186850, 6)
In [24]: df.info()
RangeIndex: 186850 entries, 0 to 186849
Data columns (total 6 columns):
#
Column
Non-Null Count
--- ------------------0
Order ID
186305 non-null
1
Product
186305 non-null
2
Quantity Ordered 186305 non-null
3
Price Each
186305 non-null
4
Order Date
186305 non-null
5
Purchase Address 186305 non-null
dtypes: object(6)
memory usage: 8.6+ MB
Dtype
----object
object
object
object
object
object
In [25]: df.isna().sum()
Out[25]:
Order ID
Product
Quantity Ordered
Price Each
Order Date
Purchase Address
dtype: int64
-
In [26]: df[df.isna().any(axis = 1)]
Order ID
Product
Quantity Ordered
Price Each
Order Date
Purchase Address
1
NaN
NaN
NaN
NaN
NaN
NaN
356
NaN
NaN
NaN
NaN
NaN
NaN
735
NaN
NaN
NaN
NaN
NaN
NaN
1433
NaN
NaN
NaN
NaN
NaN
NaN
1553
NaN
NaN
NaN
NaN
NaN
NaN
...
...
...
...
...
...
...
185176
NaN
NaN
NaN
NaN
NaN
NaN
185438
NaN
NaN
NaN
NaN
NaN
NaN
186042
NaN
NaN
NaN
NaN
NaN
NaN
186548
NaN
NaN
NaN
NaN
NaN
NaN
186826
NaN
NaN
NaN
NaN
NaN
NaN
Out[26]:
545 rows × 6 columns
In [27]: df = df.dropna()
In [28]: df.shape
Out[28]:
(186305, 6)
In [29]: df.info()
Index: 186305 entries, 0 to 186849
Data columns (total 6 columns):
#
Column
Non-Null Count
--- ------------------0
Order ID
186305 non-null
1
Product
186305 non-null
2
Quantity Ordered 186305 non-null
3
Price Each
186305 non-null
4
Order Date
186305 non-null
5
Purchase Address 186305 non-null
dtypes: object(6)
memory usage: 9.9+ MB
Dtype
----object
object
object
object
object
object
In [ ]:
Cleaning data for analysis
In [30]: df['Order Date'] = pd.to_datetime(df['Order Date'])
# gives error
C:\Users\user\AppData\Local\Temp\ipykernel_7140\-.py:1: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To en
sure parsing is consistent and as-expected, please specify a format.
df['Order Date'] = pd.to_datetime(df['Order Date']) # gives error
--------------------------------------------------------------------------DateParseError
Traceback (most recent call last)
Cell In[30], line 1
----> 1 df['Order Date'] = pd.to_datetime(df['Order Date'])
File ~\anaconda3\Lib\site-packages\pandas\core\tools\datetimes.py:1050, in to_datetime(arg, errors, dayfirst, yearfirst, utc, format, exact, unit, infer_datetime_format, origin, ca
che)
1048
result = arg.map(cache_array)
1049
else:
-> 1050
values = convert_listlike(arg._values, format)
1051
result = arg._constructor(values, index=arg.index, name=arg.name)
1052 elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)):
File ~\anaconda3\Lib\site-packages\pandas\core\tools\datetimes.py:455, in _convert_listlike_datetimes(arg, format, name, utc, unit, errors, dayfirst, yearfirst, exact)
452 if format is not None and format != "mixed":
453
return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)
--> 455 result, tz_parsed = objects_to_datetime64ns(
456
arg,
457
dayfirst=dayfirst,
458
yearfirst=yearfirst,
459
utc=utc,
460
errors=errors,
461
allow_object=True,
462 )
464 if tz_parsed is not None:
465
# We can take a shortcut since the datetime64 numpy array
466
# is in UTC
467
dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed))
File ~\anaconda3\Lib\site-packages\pandas\core\arrays\datetimes.py:2177, in objects_to_datetime64ns(data, dayfirst, yearfirst, utc, errors, allow_object)
2174 # if str-dtype, convert
2175 data = np.array(data, copy=False, dtype=np.object_)
-> 2177 result, tz_parsed = tslib.array_to_datetime(
2178
data,
2179
errors=errors,
2180
utc=utc,
2181
dayfirst=dayfirst,
2182
yearfirst=yearfirst,
2183 )
2185 if tz_parsed is not None:
2186
# We can take a shortcut since the datetime64 numpy array
2187
# is in UTC
2188
# Return i8 values to denote unix timestamps
2189
return result.view("i8"), tz_parsed
File ~\anaconda3\Lib\site-packages\pandas\_libs\tslib.pyx:402, in pandas._libs.tslib.array_to_datetime()
File ~\anaconda3\Lib\site-packages\pandas\_libs\tslib.pyx:551, in pandas._libs.tslib.array_to_datetime()
File ~\anaconda3\Lib\site-packages\pandas\_libs\tslib.pyx:516, in pandas._libs.tslib.array_to_datetime()
File ~\anaconda3\Lib\site-packages\pandas\_libs\tslibs\conversion.pyx:557, in pandas._libs.tslibs.conversion.convert_str_to_tsobject()
File ~\anaconda3\Lib\site-packages\pandas\_libs\tslibs\parsing.pyx:329, in pandas._libs.tslibs.parsing.parse_datetime_string()
File ~\anaconda3\Lib\site-packages\pandas\_libs\tslibs\parsing.pyx:658, in pandas._libs.tslibs.parsing.dateutil_parse()
DateParseError: Unknown datetime string format, unable to parse: Order Date, at position 517
In [31]: df.iloc[515:5120,:]
Order ID
Product
Quantity Ordered
Price Each
Order Date
Purchase Address
517
177053
Wired Headphones
1
11.99
04/24/19 20:45
5 Adams St, Boston, MA 02215
518
177054
Apple Airpods Headphones
1
150
04/09/19 19:18
800 Jackson St, Atlanta, GA 30301
519
Order ID
Product
Quantity Ordered
Price Each
Order Date
Purchase Address
520
177055
Lightning Charging Cable
1
14.95
04/09/19 12:37
59 Forest St, Atlanta, GA 30301
521
177056
Flatscreen TV
1
300
04/20/19 23:52
180 9th St, San Francisco, CA 94016
...
...
...
...
...
...
...
5129
181456
AA Batteries (4-pack)
2
3.84
04/02/19 21:56
64 Forest St, Portland, OR 97035
5130
181457
AAA Batteries (4-pack)
1
2.99
04/06/19 13:47
754 Willow St, Dallas, TX 75001
5131
181458
Wired Headphones
1
11.99
04/28/19 12:28
577 Hill St, New York City, NY 10001
5132
181459
27in FHD Monitor
1
149.99
04/05/19 09:39
147 Maple St, Boston, MA 02215
5133
181460
Wired Headphones
1
11.99
04/01/19 09:26
892 8th St, San Francisco, CA 94016
Out[31]:
4605 rows × 6 columns
In [ ]: # column names inputed as data entries while concat
In [32]: df.loc[df['Order ID'] == 'Order ID'].shape
Out[32]:
(355, 6)
In [33]: df.loc[df['Quantity Ordered'] == 'Quantity Ordered'].shape
Out[33]:
(355, 6)
error_index = df.loc[df['Order ID'] == 'Order ID'].index
In [34]:
In [35]: df.drop(error_index, inplace = True)
In [36]: df.loc[df['Order ID'] == 'Order ID'].shape
Out[36]:
#all dropped
(0, 6)
In [ ]: #Trying to convert to datetime again
In [37]: df['Order Date'] = pd.to_datetime(df['Order Date'])
C:\Users\user\AppData\Local\Temp\ipykernel_7140\-.py:1: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To en
sure parsing is consistent and as-expected, please specify a format.
df['Order Date'] = pd.to_datetime(df['Order Date'])
In [38]: df['Order Date'].sample(5)
Out[38]:
-:20:-:41:-:07:-:26:-:37:00
Name: Order Date, dtype: datetime64[ns]
In [39]: df['Month'] = df['Order Date'].dt.month
df['Month']
Out[39]:
0
2
3
4
5
4
4
4
4
4
.-
Name: Month, Length: 185950, dtype: int32
In [40]: df['Month'] = pd.to_datetime(df['Month'], format='%m').dt.month_name().str.slice(stop=3)
df['Month']
Out[40]:
0
2
3
4
5
Apr
Apr
Apr
Apr
Apr
...
186845
Sep
186846
Sep
186847
Sep
186848
Sep
186849
Sep
Name: Month, Length: 185950, dtype: object
In [41]: df.info()
Index: 185950 entries, 0 to 186849
Data columns (total 7 columns):
#
Column
Non-Null Count
--- ------------------0
Order ID
185950 non-null
1
Product
185950 non-null
2
Quantity Ordered 185950 non-null
3
Price Each
185950 non-null
4
Order Date
185950 non-null
5
Purchase Address 185950 non-null
6
Month
185950 non-null
dtypes: datetime64[ns](1), object(6)
memory usage: 11.3+ MB
Dtype
----object
object
object
object
datetime64[ns]
object
object
In [ ]: #df1 = df.copy()
In [42]: df['Quantity Ordered'] = pd.to_numeric(df['Quantity Ordered'])
df['Price Each'] = pd.to_numeric(df['Price Each'])
df.info()
Index: 185950 entries, 0 to 186849
Data columns (total 7 columns):
#
Column
Non-Null Count
Dtype
--- ----------------------0
Order ID
185950 non-null object
1
Product
185950 non-null object
2
Quantity Ordered 185950 non-null int64
3
Price Each
185950 non-null float64
4
Order Date
185950 non-null datetime64[ns]
5
Purchase Address 185950 non-null object
6
Month
185950 non-null object
dtypes: datetime64[ns](1), float64(1), int64(1), object(4)
memory usage: 11.3+ MB
In [43]: df['Purchase Address'].sample(20)
Out[43]:
- North St, Los Angeles,-th St, Boston,- Lincoln St, New York City,-th St, Boston,- Johnson St, Seattle,- Lakeview St, Atlanta,- Forest St, Los Angeles,- Lake St, Portland,- Jefferson St, Boston,-th St, New York City,- Elm St, New York City,- Lakeview St, San Francisco,-st St, San Francisco,- Dogwood St, San Francisco,- Chestnut St, Boston,-th St, Los Angeles,- Madison St, San Francisco,-th St, San Francisco,- Lake St, Austin,- Ridge St, New York City,
Name: Purchase Address, dtype: object
CA
MA
NY
MA
WA
GA
CA
ME
MA
NY
NY
CA
CA
CA
MA
CA
CA
CA
TX
NY
-
In [44]: df[['St Address','City','Zip Code']] = df['Purchase Address'].str.split(',',expand = True)
df.sample(5)
Order ID
Product
Quantity Ordered
Price Each
Order Date
Purchase Address
Month
St Address
City
Zip Code
170035
273914
Bose SoundSport Headphones
1
99.99
-:18:00
760 Hill St, San Francisco, CA 94016
Oct
760 Hill St
San Francisco
CA 94016
97817
215789
Flatscreen TV
1
-:46:00
972 Main St, Seattle, WA 98101
Jun
972 Main St
129481
202635
USB-C Charging Cable
1
-:53:00
550 North St, San Francisco, CA 94016
May
550 North St
San Francisco
CA 94016
91706
209982
27in 4K Gaming Monitor
1
-:31:00
315 Forest St, New York City, NY 10001
Jun
315 Forest St
New York City
NY 10001
46828
311375
USB-C Charging Cable
1
395 Madison St, Los Angeles, CA 90001
Dec
395 Madison St
Los Angeles
CA 90001
Out[44]:
11.95
11.95
-:36:00
Seattle WA 98101
In [45]: df.info()
Index: 185950 entries, 0 to 186849
Data columns (total 10 columns):
#
Column
Non-Null Count
Dtype
--- ----------------------0
Order ID
185950 non-null object
1
Product
185950 non-null object
2
Quantity Ordered 185950 non-null int64
3
Price Each
185950 non-null float64
4
Order Date
185950 non-null datetime64[ns]
5
Purchase Address 185950 non-null object
6
Month
185950 non-null object
7
St Address
185950 non-null object
8
City
185950 non-null object
9
Zip Code
185950 non-null object
dtypes: datetime64[ns](1), float64(1), int64(1), object(7)
memory usage: 15.6+ MB
In [46]: df.drop('Purchase Address',axis = 1, inplace = True)
In [ ]: # how much was earned on the best month
In [ ]: #df1 = df.copy()
In [47]: df.insert(4, 'Total Price', (df['Quantity Ordered'] * df['Price Each']))
In [ ]:
In [48]: best_month = df[['Month','Total Price']].groupby('Month', as_index = False).sum().sort_values(by= 'Total Price', ascending= False)
best_month
Month
Total Price
2
Dec
-
10
Oct
-
0
Apr
-
9
Nov
-
8
May
-
7
Mar
-
5
Jul
-
6
Jun
-
1
Aug
-
3
Feb
-
11
Sep
-
4
Jan
-
Out[48]:
In [ ]: # December was the best month with a total sales of- (we can visualise these)
# An assumption is that festivities affect sales positively
#This assumption is backed by january being the lowest month. i.e purchasing power of consumer is negatively affected after the festivities
In [ ]:
In [49]: import matplotlib.pyplot as plt
In [50]: best_month.plot.bar(x= 'Month', y= 'Total Price', color= 'orange')
plt.xlabel('Month')
plt.ylabel('sales($)')
plt.title('Total sales per moth')
plt.show()
In [ ]:
In [ ]: # How sales per city compare
In [51]: city_sales = df[['City','Total Price']].groupby('City', as_index= False).sum().sort_values(by= 'Total Price', ascending= False)
In [ ]: # San Francisco was the best perfoming and Austin wa the last
In [52]: city_sales.plot.pie(figsize= (13,6), x='City', y='Total Price', labels = city_sales['City'], autopct = '%1.2f%%')
plt.title('Sales perfomance by city', color = 'red')
plt.legend().set_visible(False)
In [53]: df1 = df.copy()
In [87]: # Top 5 best peforming product by occourrence on orders
In [54]: df_copy = df[df['Order ID'].duplicated(keep= False)]
df_copy['Grouped'] = df_copy.groupby('Order ID')['Product'].transform(lambda x: ','.join(x))
df_copy = df[['Order ID','Product']].drop_duplicates()
df_copy
C:\Users\user\AppData\Local\Temp\ipykernel_7140\-.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_copy['Grouped'] = df_copy.groupby('Order ID')['Product'].transform(lambda x: ','.join(x))
Order ID
Product
0
176558
USB-C Charging Cable
2
176559
Bose SoundSport Headphones
3
176560
Google Phone
4
176560
Wired Headphones
5
176561
Wired Headphones
...
...
...
186845
259353
AAA Batteries (4-pack)
186846
259354
iPhone
186847
259355
iPhone
186848
259356
34in Ultrawide Monitor
186849
259357
USB-C Charging Cable
Out[54]:
185639 rows × 2 columns
In [58]: df_copy['Product'].unique()
Out[58]:
array(['USB-C Charging Cable', 'Bose SoundSport Headphones',
'Google Phone', 'Wired Headphones', 'Macbook Pro Laptop',
'Lightning Charging Cable', '27in 4K Gaming Monitor',
'AA Batteries (4-pack)', 'Apple Airpods Headphones',
'AAA Batteries (4-pack)', 'iPhone', 'Flatscreen TV',
'27in FHD Monitor', '20in Monitor', 'LG Dryer', 'ThinkPad Laptop',
'Vareebadd Phone', 'LG Washing Machine', '34in Ultrawide Monitor'],
dtype=object)
In [65]: df_copy_1 = df_copy['Product'].value_counts().to_frame()
In [67]: df_copy_1 = df_copy_1.reset_index()
df_copy_1
Product
count
0
USB-C Charging Cable
21855
1
Lightning Charging Cable
21604
2
AAA Batteries (4-pack)
20593
3
AA Batteries (4-pack)
20542
4
Wired Headphones
18847
5
Apple Airpods Headphones
15525
6
Bose SoundSport Headphones
13298
7
27in FHD Monitor
7498
8
iPhone
6840
9
27in 4K Gaming Monitor
6225
10
34in Ultrawide Monitor
6174
11
Google Phone
5522
12
Flatscreen TV
4794
13
Macbook Pro Laptop
4721
14
ThinkPad Laptop
4126
15
20in Monitor
4098
16
Vareebadd Phone
2065
17
LG Washing Machine
666
18
LG Dryer
646
Out[67]:
In [69]: df_copy_1 = df_copy_1.head(5)
df_copy_1
Product
count
0
USB-C Charging Cable
21855
1
Lightning Charging Cable
21604
2
AAA Batteries (4-pack)
20593
3
AA Batteries (4-pack)
20542
4
Wired Headphones
18847
Out[69]:
In [80]: df_copy_1.plot.pie(figsize= (13,6), x='Product', y='count', labels = df_copy_1['Product'], autopct = '%1.2f%%')
plt.title('Product perfomance by order', color = 'darkgreen')
plt.legend().set_visible(False)
In [81]: # top 5 product perfomance by quantity ordered
In [82]: df
Order ID
Product
Quantity Ordered
Price Each
Total Price
Order Date
Month
St Address
City
Zip Code
0
176558
USB-C Charging Cable
2
11.95
23.90
-:46:00
Apr
917 1st St
Dallas
TX 75001
2
176559
Bose SoundSport Headphones
1
99.99
99.99
-:30:00
Apr
682 Chestnut St
Boston
MA 02215
3
176560
Google Phone
1
600.00
-:38:00
Apr
669 Spruce St
Los Angeles
CA 90001
4
176560
Wired Headphones
1
11.99
11.99
-:38:00
Apr
669 Spruce St
Los Angeles
CA 90001
5
176561
Wired Headphones
1
11.99
11.99
-:27:00
Apr
333 8th St
Los Angeles
CA 90001
...
...
...
...
...
...
...
...
...
...
...
186845
259353
AAA Batteries (4-pack)
3
2.99
8.97
-:56:00
Sep
840 Highland St
Los Angeles
CA 90001
186846
259354
iPhone
1
700.00
-:00:00
Sep
216 Dogwood St
San Francisco
CA 94016
186847
259355
iPhone
1
700.00
-:39:00
Sep
220 12th St
San Francisco
CA 94016
186848
259356
34in Ultrawide Monitor
1
379.99
-:30:00
Sep
511 Forest St
San Francisco
CA 94016
186849
259357
USB-C Charging Cable
1
11.95
Sep
250 Meadow St
San Francisco
CA 94016
Out[82]:
11.95
-:18:00
185950 rows × 10 columns
In [90]: df_copy_2 = df.groupby(['Product'], as_index= False)['Quantity Ordered'].sum().sort_values(by= 'Quantity Ordered', ascending= False).head(5)
In [ ]:
In [91]: df_copy_2.plot.pie(figsize= (13,6), x='Product', y='Quantity Ordered', labels = df_copy_2['Product'], autopct = '%1.2f%%')
plt.title('Product perfomance by quantity ordered', color = 'darkgreen')
plt.legend().set_visible(False)
In [92]: # we can infer that triple A batteries are the most commonly bought items by volume while usb charging cable are a popular item among customers hence appear on most orders.
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: