HR Data Analytics
In [2]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import warnings
warnings.filterwarnings('ignore')
In [3]: data = pd.read_csv(r'D:\Data Analysis\Datasets\HR_comma_sep.csv')
In [4]: data.shape
Out[4]:
(14999, 10)
In [5]: data.isnull().sum()
Out[5]:
satisfaction_level
last_evaluation
number_project
average_montly_hours
time_spend_company
Work_accident
left
promotion_last_5years
Department
salary
dtype: int64
-
In [6]: data.info()
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
#
Column
Non-Null Count
--- ------------------0
satisfaction_level
14999 non-null
1
last_evaluation
14999 non-null
2
number_project
14999 non-null
3
average_montly_hours
14999 non-null
4
time_spend_company
14999 non-null
5
Work_accident
14999 non-null
6
left
14999 non-null
7
promotion_last_5years 14999 non-null
8
Department
14999 non-null
9
salary
14999 non-null
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB
Dtype
----float64
float64
int64
int64
int64
int64
int64
int64
object
object
In [7]: data.describe()
satisfaction_level
last_evaluation
number_project
average_montly_hours
time_spend_company
Work_accident
left
promotion_last_5years
count
-
-
-
-
-
-
-
-
mean
-
-
-
-
-
-
-
-
std
-
-
-
-
-
-
-
-
min
-
-
-
-
-
-
-
-
25%
-
-
-
-
-
-
-
-
50%
-
-
-
-
-
-
-
-
75%
-
-
-
-
-
-
-
-
max
-
-
-
-
-
-
-
-
Out[7]:
In [8]: data.nunique().sort_values()
Out[8]:
Work_accident
left
promotion_last_5years
salary
number_project
time_spend_company
Department
last_evaluation
satisfaction_level
average_montly_hours
dtype: int64
-
Visualize how different features contribute to employee turnover
In [9]: left_data = data[data['left'] == 1 ]
remain_data = data[data['left'] == 0]
In [10]: ## employee satisfaction level contribution to employee leaving
# Satisfaction-level histograms for leavers (top panel) and stayers (bottom panel);
# the red vertical line marks each group's mean.
# NOTE(review): the lowest edge is 0 rather than 0.1 so that scores below 0.1,
# if the data has any, are counted instead of falling outside the bins.
bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
avg_sat_left = left_data['satisfaction_level'].mean()
avg_sat_remain = remain_data['satisfaction_level'].mean()

# Both panels must live in one figure for sharex to link their x-axes.
fig1, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)

ax1.hist(x=left_data['satisfaction_level'], bins=bins, edgecolor='black')
ax1.axvline(avg_sat_left, color='red', linewidth=2)
ax1.set_ylabel('No of Employees')
ax1.set_title('Employee who Left')

ax2.hist(x=remain_data['satisfaction_level'], bins=bins, edgecolor='black')
ax2.axvline(avg_sat_remain, color='red', linewidth=2)
ax2.set_xlabel('Employee Satisfaction Level')
ax2.set_ylabel('No of Employees')
ax2.set_title('Employee who Remained')

fig1.tight_layout()  # keep the lower title clear of the upper panel
plt.show()
This histogram demonstrates some insights about employees leaving:
1. Employees who left are more likely to have a satisfaction level < 0.5.
2. However, a portion of employees with a satisfaction level > 0.7 also tend to leave.
3. The employees who remained are more likely to have a satisfaction level > 0.5.
4. The average satisfaction level of employees who remained is between 0.6 and 0.7.
In [12]: ## how employee performance related to employee leaving
# Last-evaluation ("performance") histograms for leavers (top panel) and
# stayers (bottom panel); the red vertical line marks each group's mean.
bins = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
avg_per_left = left_data['last_evaluation'].mean()
avg_per_remain = remain_data['last_evaluation'].mean()

# Both panels must live in one figure for sharex to link their x-axes.
fig1, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)

ax1.hist(x=left_data['last_evaluation'], bins=bins, edgecolor='blue')
# The line is the mean of last_evaluation, so it is labelled as such.
ax1.axvline(avg_per_left, color='red', linewidth=2, label='Mean evaluation')
ax1.set_ylabel('No of employees')
ax1.set_title('Employees who left')
ax1.legend()

ax2.hist(x=remain_data['last_evaluation'], bins=bins, edgecolor='blue')
ax2.axvline(avg_per_remain, color='red', linewidth=2, label='Mean evaluation')
ax2.set_xlabel('Employee Performance')
ax2.set_ylabel('No of employees')
ax2.set_title('Employees who remained')
ax2.legend()

fig1.tight_layout()  # keep the lower title clear of the upper panel
plt.show()
This histogram suggests:
1. The average performance of both remaining and departed employees is around 0.7.
2. Employees with a performance level of 0.8 to 1 also left the company, which indicates that other factors might be the reason.
3. Employees with a performance level from 0.5 to 1 are more likely to remain.
Scatter plot of Performance and Satisfaction level on left employees
In [13]: plt.style.use('default')
plt.scatter(left_data['satisfaction_level'], left_data['last_evaluation'], s=3)
plt.xlabel('Satisfaction Level')
plt.ylabel('Performance Level')
plt.title('Satisfaction & Performance Scatterplot')
plt.show()
Scatter plot of Performance and Satisfaction level on remain employees
In [14]: plt.scatter(remain_data['satisfaction_level'],remain_data['last_evaluation'], s=5)
plt.title('Scatter plot of Performance and satisfaction Level')
plt.xlabel('Satisfaction Level')
plt.ylabel('Performance Level')
plt.show()
1. The scatter plot of employees who left shows 3 clusters.
2. The scatter plot of remaining employees shows a single cluster, where the employees' satisfaction levels are within 0.5 to 1.
In [15]: ## creating clusters for employees who choose to leave by satisfaction & performance level
In [16]: from sklearn.cluster import KMeans
km = KMeans(n_clusters = 3)
X = left_data[['satisfaction_level','last_evaluation']]
y_pred = km.fit_predict(X)
In [17]: ## creating a new column that link all customers to relevant clusters
left_data['cluster'] = y_pred
df1 = left_data[left_data.cluster == 0]
df2 = left_data[left_data.cluster == 1]
df3 = left_data[left_data.cluster == 2]
label = ['cluster-1','cluster-2','cluster-3']
plt.scatter(df1.satisfaction_level, df1.last_evaluation, color = 'green', s = 5)
plt.scatter(df2.satisfaction_level, df2.last_evaluation, color = 'blue', s = 5)
plt.scatter(df3.satisfaction_level, df3.last_evaluation, color = 'black', s = 5)
plt.xlabel('Satisfaction_Level')
plt.ylabel('Performance_Level')
plt.legend(label)
plt.show()
From this scatter plot:
1. The top-left cluster contains employees who left with very good performance but a low satisfaction level. Maybe they felt undervalued relative to their performance.
2. The top-right cluster contains employees who left despite very high satisfaction and performance levels.
3. The bottom-middle cluster contains employees who left with low performance and satisfaction, which is less surprising and makes sense.
employees leaving with the factor of total projects
In [19]: data['left'] = data['left'].astype(str)
sn.countplot(x = 'number_project', data = data, hue = data.left)
plt.xlabel('No of Projects')
plt.ylabel('Count of employees')
plt.title('Employees by projects')
plt.show()
This plot shows:1.Employees with less than 2 projects are more likely to leave
2.Employees with more than 2 projects are less likely to leave
In [20]: data['average_montly_hours'].agg(['min','max','median'])
Out[20]:
min
96.0
max
310.0
median
200.0
Name: average_montly_hours, dtype: float64
In [21]: ## employees leaving analysis with average monthly hours
In [22]: bins = [80,100,120,140,160,180,200,220,240,260,280,300,320]
avg_hr_left = left_data['average_montly_hours'].mean()
avg_hr_remain = remain_data['average_montly_hours'].mean()
fig1,ax1 = plt.subplots(sharex = 1)
fig1,ax2 = plt.subplots()
ax1.hist(x = left_data['average_montly_hours'], bins = bins, edgecolor = 'black')
ax2.hist(x = remain_data['average_montly_hours'], bins = bins, edgecolor = 'black')
ax1.axvline(avg_hr_left,color = 'red',linewidth = 2)
ax2.axvline(avg_hr_remain,color = 'red',linewidth = 2)
ax1.set_ylabel('No of employees')
ax2.set_ylabel('No of employees')
ax2.set_xlabel('Average monthly hours')
ax1.set_title('Employees who left')
ax2.set_title('Employees who remained')
Out[22]:
Text(0.5, 1.0, 'Employees who remained')
1. Both groups of employees worked around 200 hours per month on average.
2. Employees with average monthly hours of 150 and below are more likely to leave.
3. Employees with 150 to 250 monthly hours are more likely to remain in the job.
4. However, employees with 250 or more monthly hours also tend to leave the job.
Very few employees worked 300 hours or more per month.
In [23]: data.nunique().sort_values()
Out[23]:
Work_accident
left
promotion_last_5years
salary
number_project
time_spend_company
Department
last_evaluation
satisfaction_level
average_montly_hours
dtype: int64
-
In [24]: data['time_spend_company'].unique()
Out[24]:
array([ 3,
6,
4,
5,
2,
8, 10,
7], dtype=int64)
employees Tenure
In [124… sn.countplot(x = 'time_spend_company', data = data, hue = 'left')
plt.xlabel('Employee Tenure(Years)')
plt.ylabel('No of Employees')
plt.title('Employee Turnover by Tenure')
Out[124…
Text(0.5, 1.0, 'Employee Turnover by Tenure')
In [30]: ## creating employee tenure distribution
from matplotlib.ticker import PercentFormatter

# Cumulative share of employees by tenure, shown as percentages.
tenure = data['time_spend_company']
# Tenure is a whole number of years, so use one bin per year, centred on the
# integer. Ten equal-width default bins would not line up with the year ticks.
year_edges = np.arange(tenure.min(), tenure.max() + 2) - 0.5
plt.hist(
    tenure,
    bins=year_edges,
    weights=np.ones(len(tenure)) / len(tenure),  # each row counts as 1/N
    edgecolor='black',
    cumulative=True,
)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))  # 1.0 is shown as 100%
plt.xlabel('Employee Tenure(Year)')
plt.ylabel('Cumulative Percentage')
plt.title('Employee Tenure Distribution')
plt.show()  # render the figure instead of echoing the title's Text object
Out[30]:
Text(0.5, 1.0, 'Employee Tenure Distribution')
From the above 2 plots:
1. Employees who left most often had a tenure of 3 years, followed by 4, 5 and 6 years.
2. In the first two years, employees are less likely to leave the job; maybe they are trying to move up the hierarchy.
3. Employees who have a tenure of 6+ years are less likely to leave the job.
4. Both groups show a declining count from 3 to 6 years of tenure (investigation required).
5. Around 80% of employees have a tenure of less than 5 years.
In [47]: ## analysis 3 to 6 tenure employees with promotion acheivement
# Employees with 3 to 6 years of tenure: leavers vs. stayers, split by promotion.
# between() is inclusive on both ends, which for whole-year tenure selects 3..6.
mask = data['time_spend_company'].between(3, 6)
tenure_3to6 = data.loc[mask]
sn.countplot(x='promotion_last_5years', hue='left', data=tenure_3to6)
# The 0/1 categories sit at axis positions 0 and 1. Fixed positions keep 'No'
# on 0 and 'Yes' on 1; unique() returns values in order of first appearance,
# which could swap the labels or give a different number of ticks.
plt.xticks(ticks=[0, 1], labels=['No', 'Yes'])
plt.xlabel('Promotion Acheived')
plt.ylabel('No of Employees')
plt.title('3 to 6 years tenure employees turnover by promotion')
plt.show()
Very few employees get a promotion within 3 to 6 years of tenure.
Those who got a promotion within this period remained in the company.
overall employees turnover by promotion acheivement
In [50]: data['promotion_last_5years'].value_counts()
Out[50]:
promotion_last_5years-
Name: count, dtype: int64
In [53]: sn.countplot(x = 'promotion_last_5years', hue = 'left', data = data)
plt.xticks(ticks = data.promotion_last_5years.unique(), labels = ['No','Yes'])
plt.xlabel('Promotion Acheived')
plt.ylabel('No of Employees')
plt.title('Overall Employees turnover by promotion')
plt.show()
Insights:
1. Very few employees got a promotion.
2. Out of 14999 employees, only 319 earned a promotion.
3. Those who earn a promotion are more likely to remain in the job.
In [54]: len(data)
Out[54]:
14999
Analyze Employee Turnover by Department
In [59]: sn.countplot(x = 'Department', hue = 'left', data = data)
plt.xticks(rotation = 60)
plt.xlabel('Departments')
plt.ylabel('No of Employees')
plt.title('Employee Turnover by Department')
plt.show()
Percentage of Leavers
In [97]: for dept in data['Department'].unique():
dept_leaving = data[(data.Department == dept) & (data.left == '1')]
dept_all = data[(data.Department == dept)]
print(f"{dept} percentage left : {round(((dept_leaving.shape[0])/(dept_all.shape[0])* 100),2)} %")
sales percentage left : 24.49 %
accounting percentage left : 26.6 %
hr percentage left : 29.09 %
technical percentage left : 25.62 %
support percentage left : 24.9 %
management percentage left : 14.44 %
IT percentage left : 22.25 %
product_mng percentage left : 21.95 %
marketing percentage left : 23.66 %
RandD percentage left : 15.37 %
In [102… dept_left = []
for dept in data.Department.unique():
dept_leaving = data[(data.Department == dept) & (data.left == '1')]
dept_all = data[(data.Department == dept)]
dept_left.append(round(((dept_leaving.shape[0]/dept_all.shape[0]) * 100),2))
dept_left_series = pd.Series(data = dept_left, index = data.Department.unique()).sort_values(ascending = False)
plt.bar(dept_left_series.index, dept_left_series)
plt.xticks(rotation = 60)
plt.xlabel('Department')
plt.ylabel('Percentage Left')
plt.title('Employee Turnover percentage by Department')
Out[102…
Text(0.5, 1.0, 'Employee Turnover percentage by Department')
1. In the turnover-by-department plot, most of the employees who left came from the sales department. However, sales also has the most employees, so the raw count is not a good measure.
2. The turnover percentage per department is a more relevant and effective measure.
3. HR has the highest turnover rate at 29.09%.
4. Management has the lowest turnover rate at 14.44%.
In [106… data.nunique().sort_values()
Out[106…
Work_accident
left
promotion_last_5years
salary
number_project
time_spend_company
Department
last_evaluation
satisfaction_level
average_montly_hours
dtype: int64
-
Employee satisfaction by Department
In [112… data[['satisfaction_level','Department']].groupby(['Department']).mean().reset_index().sort_values(by = 'satisfaction_level',ascending = False)
Department
satisfaction_level
4
management
-
1
RandD
-
6
product_mng
-
5
marketing
-
8
support
-
0
IT
-
7
sales
-
9
technical
-
3
hr
-
2
accounting
-
Out[112…
Departments with a higher turnover percentage tend to have a lower satisfaction level.
1. Management and RandD have the two highest satisfaction levels and the two lowest turnover percentages.
2. HR and accounting have the two lowest satisfaction levels and the two highest turnover percentages.
In [115… data['salary'].unique()
Out[115…
array(['low', 'medium', 'high'], dtype=object)
In [119… sn.countplot(x = 'salary', hue = 'left', data = data)
plt.xlabel('Salary')
plt.ylabel('No of employees')
plt.title('Employees turnover by salary')
plt.show()
1. Turnover decreases as salary increases.
2. Very few employees with a high salary left the job.
Turnover by work accident
In [121… data.Work_accident.value_counts()
Out[121…
Work_accident-
Name: count, dtype: int64
In [140… work_accident_left = data[(data.Work_accident == 1) & (data.left == '1')]
non_work_accident_left = data[(data.Work_accident == 0) & (data.left == '1')]
work_accident_left_pct = round(((work_accident_left.shape[0]/left_data.shape[0]) * 100),2)
non_work_accident_left_pct = round(((non_work_accident_left.shape[0]/left_data.shape[0]) * 100),2)
slices = [work_accident_left_pct,non_work_accident_left_pct]
labels = ['Work accident','Non-Work accident']
plt.pie(slices, labels = labels,textprops = {'fontsize':12}, autopct = '%1.1f%%')
plt.show()
The pie chart shows:
The effect of work accidents on employees leaving is negligible.
summarize the data for numerical analysis
In [141… data.dtypes
Out[141…
satisfaction_level
last_evaluation
number_project
average_montly_hours
time_spend_company
Work_accident
left
promotion_last_5years
Department
salary
dtype: object
float64
float64
int64
int64
int64
int64
object
int64
object
object
In [142… data.drop(['Department','salary'],axis = 1).groupby('left').mean()
satisfaction_level
last_evaluation
number_project
average_montly_hours
time_spend_company
Work_accident
promotion_last_5years
0
-
-
-
-
-
-
-
1
-
-
-
-
-
-
-
Out[142…
left
1. A lower satisfaction level is a stronger indicator that an employee will leave the job.
2. Employees with higher average monthly hours are more likely to leave the job.
3. Employees who get a promotion are less likely to leave the job.
In [ ]: