Livestock Data Analysis
Import needed package
In [372]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
In [374]:
livestock_data = pd.read_csv('-.csv')
In [376]:
livestock_data.head()
Out[376]:
REF_DATE
GEO
DGUID
Livestock Survey
date
Farm type UOM UOM_ID
0
1931
Canada 2016A-
Total
cattle
On all
At July
cattle
1 operations
Head
148
thousand
1
1931
Canada 2016A-
Bulls, 1
year and
over
On all
At July
cattle
1 operations
Head
148
thousand
2
1931
Canada 2016A-
Dairy
cows
On all
At July
cattle
1 operations
Head
148
thousand
3
1931
Canada 2016A-
Beef
cows
On all
At July
cattle
1 operations
Head
148
thousand
4
1931
Canada 2016A-
Total
heifers
On all
At July
cattle
1 operations
Head
148
thousand
In [378]:
livestock_data.drop(['STATUS', 'SYMBOL', 'TERMINATED'], axis=1, inplace=True)
In [380]:
livestock_data.head()
Out[380]:
In [382]:
SCALAR_FACTO
REF_DATE
GEO
DGUID
Livestock Survey
date
Farm type UOM UOM_ID
SCALAR_FACTO
0
1931
Canada 2016A-
Total
cattle
On all
At July
cattle
1 operations
Head
148
thousand
1
1931
Canada 2016A-
Bulls, 1
year and
over
On all
At July
cattle
1 operations
Head
148
thousand
2
1931
Canada 2016A-
Dairy
cows
On all
At July
cattle
1 operations
Head
148
thousand
3
1931
Canada 2016A-
Beef
cows
On all
At July
cattle
1 operations
Head
148
thousand
4
1931
Canada 2016A-
Total
heifers
On all
At July
cattle
1 operations
Head
148
thousand
livestock_data.isnull().sum()
Out[382]:
REF_DATE
GEO
DGUID
Livestock
Survey date
Farm type
UOM
UOM_ID
SCALAR_FACTOR
SCALAR_ID
VECTOR
COORDINATE
VALUE
DECIMALS
dtype: int64
-
In [384]:
livestock_data['VALUE'] = livestock_data['VALUE'].fillna(0)
In [386]:
livestock_data.isnull().sum()
Out[386]:
REF_DATE
GEO
DGUID
Livestock
Survey date
Farm type
UOM
UOM_ID
SCALAR_FACTOR
SCALAR_ID
VECTOR
COORDINATE
VALUE
DECIMALS
dtype: int64
In [388]:
livestock_data.dtypes
Out[388]:
REF_DATE
GEO
DGUID
Livestock
Survey date
Farm type
UOM
UOM_ID
SCALAR_FACTOR
SCALAR_ID
VECTOR
COORDINATE
VALUE
DECIMALS
dtype: object
In [390]:
livestock_data.describe()
-
int64
object
object
object
object
object
object
int64
object
int64
object
object
float64
int64
# Replace NaN in column 'VALU
REF_DATE UOM_ID
Out[390]:
count-
In [392]:
SCALAR_ID
VALUE
DECIMALS
48738.0
48738.0
-
48738.0
mean
-
148.0
3.0
-
1.0
std
-
0.0
0.0
-
0.0
min
-
148.0
3.0
-
1.0
25%
-
148.0
3.0
-
1.0
50%
-
148.0
3.0
-
1.0
75%
-
148.0
3.0
-
1.0
max
-
148.0
-
1.0
livestock_data.describe(include='object')
Out[392]:
Livestock Survey
date
GEO
DGUID
count
48738
41896
48738
48738
48738
48738
48738
48738
unique
14
12
11
2
6
1
1
1408
top Canada 2016A-
Total
heifers
On all
At July
cattle
1 operations
Head
thousands
v61382
48738
48738
94
freq
4575
4575
4900
In [394]:
livestock_data.value_counts('Livestock')
Out[394]:
Livestock
Total heifers
Beef cows
Bulls, 1 year and over
Calves, under 1 year
Dairy cows
Heifers for dairy replacement
Steers, 1 year and over
Total beef heifers
Total cattle
Heifers for beef replacement
Heifers for slaughter
Name: count, dtype: int64
-
In [396]:
livestock_data.value_counts('REF_DATE')
Out[396]:
REF_DATE-
..-
Name: count, Length: 94, dtype: int64
24809
Farm type
21238
UOM SCALAR_FACTOR VECTOR
CO
In [398]:
plt.figure(figsize=(10,8))
sns.set(font_scale=1.3)
sns.boxplot(data=livestock_data, x='REF_DATE', y='Livestock')
plt.show()
In [400]:
plt.figure(figsize=(25,8))
sns.set(font_scale=1.3)
sns.lineplot(data=livestock_data, x='REF_DATE', y='Livestock')
plt.show()
In [402]:
plt.figure(figsize=(25,8))
sns.set(font_scale=1.2)
sns.barplot(data=livestock_data, x='GEO', y='Livestock')
plt.show()
In [404]:
livestock_data.select_dtypes('float').corr()
VALUE
Out[404]:
VALUE
In [406]:
1.0
livestock_data.head()
Out[406]:
REF_DATE
GEO
DGUID
Livestock Survey
date
Farm type UOM UOM_ID
SCALAR_FACTO
0
1931
Canada 2016A-
Total
cattle
On all
At July
cattle
1 operations
Head
148
thousand
1
1931
Canada 2016A-
Bulls, 1
year and
over
On all
At July
cattle
1 operations
Head
148
thousand
2
1931
Canada 2016A-
Dairy
cows
On all
At July
cattle
1 operations
Head
148
thousand
3
1931
Canada 2016A-
Beef
cows
On all
At July
cattle
1 operations
Head
148
thousand
4
1931
Canada 2016A-
Total
heifers
On all
At July
cattle
1 operations
Head
148
thousand
Feature Encoding
In [408]:
livestock_data["Livestock"].unique()
Out[408]:
array(['Total cattle', 'Bulls, 1 year and over', 'Dairy cows',
'Beef cows', 'Total heifers', 'Heifers for dairy replacement',
'Total beef heifers', 'Steers, 1 year and over',
'Calves, under 1 year', 'Heifers for beef replacement',
'Heifers for slaughter'], dtype=object)
In [410]:
livestock_data["Survey date"].unique()
Out[410]:
array(['At July 1', 'At January 1'], dtype=object)
In [412]:
livestock_data.head()
Out[412]:
In [414]:
REF_DATE
GEO
DGUID
Livestock Survey
date
Farm type UOM UOM_ID
SCALAR_FACTO
0
1931
Canada 2016A-
Total
cattle
On all
At July
cattle
1 operations
Head
148
thousand
1
1931
Canada 2016A-
Bulls, 1
year and
over
On all
At July
cattle
1 operations
Head
148
thousand
2
1931
Canada 2016A-
Dairy
cows
On all
At July
cattle
1 operations
Head
148
thousand
3
1931
Canada 2016A-
Beef
cows
On all
At July
cattle
1 operations
Head
148
thousand
4
1931
Canada 2016A-
Total
heifers
On all
At July
cattle
1 operations
Head
148
thousand
from sklearn import preprocessing
label_encode = ['Survey date']
mapping_trip_type = {
'At July 1' : 0,
'At January 1'
}
: 1,
livestock_data['Survey date'] = preprocessing.LabelEncoder() \
.fit_transform(livestock_data['Survey date'])
In [416]:
livestock_data.head()
Out[416]:
In [431]:
REF_DATE
GEO
DGUID
Livestock Survey
date
Farm type UOM UOM_ID
SCALAR_FACTO
0
1931
Canada 2016A-
Total
cattle
1
On all
cattle
operations
Head
148
thousand
1
1931
Canada 2016A-
Bulls, 1
year and
over
1
On all
cattle
operations
Head
148
thousand
2
1931
Canada 2016A-
Dairy
cows
1
On all
cattle
operations
Head
148
thousand
3
1931
Canada 2016A-
Beef
cows
1
On all
cattle
operations
Head
148
thousand
4
1931
Canada 2016A-
Total
heifers
1
On all
cattle
operations
Head
148
thousand
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
# Example DataFrame
data = {
'Livestock': [
'Total cattle', 'Dairy cows', 'Bulls, 1 year and over', 'Beef cows',
'Unknown type', 'Heifers for slaughter', 'Total heifers', 'Calves, under 1 year',
' Dairy cows ', ' Bulls, 1 year and over '
],
'SurveyData': [1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
}
df = pd.DataFrame(data)
# Clean the "Livestock" column (strip spaces)
df['Livestock'] = df['Livestock'].str.strip()
# Encode the "Livestock" column using LabelEncoder
encoder = LabelEncoder()
df['Livestock_encoded'] = encoder.fit_transform(df['Livestock'])
# Define features (X) and target (y)
X = df[['Livestock_encoded']] # Features
y = df['SurveyData'] # Target
# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize models
logreg = LogisticRegression()
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)
# Train the models
logreg.fit(X_train, y_train)
dt_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
# Make predictions
y_pred_logreg = logreg.predict(X_test)
y_pred_dt = dt_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
# Evaluate models
acc_logreg = accuracy_score(y_test, y_pred_logreg)
acc_dt = accuracy_score(y_test, y_pred_dt)
acc_rf = accuracy_score(y_test, y_pred_rf)
# Print classification reports
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_logreg))
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
# ROC AUC Score (good for binary classification)
roc_auc_logreg = roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])
roc_auc_dt = roc_auc_score(y_test, dt_model.predict_proba(X_test)[:, 1])
roc_auc_rf = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
print(f"ROC AUC for Logistic Regression: {roc_auc_logreg:.2f}")
print(f"ROC AUC for Decision Tree: {roc_auc_dt:.2f}")
print(f"ROC AUC for Random Forest: {roc_auc_rf:.2f}")
# Compare Models using Accuracy
accuracy_results = [acc_logreg, acc_dt, acc_rf]
model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest']
# Plot the comparison
plt.figure(figsize=(8, 6))
sns.barplot(x=model_names, y=accuracy_results, palette='viridis')
plt.title('Model Comparison (Accuracy)')
plt.ylabel('Accuracy')
plt.show()
Logistic Regression Classification Report:
precision
recall f1-score
support
0
1
0.50
0.00
1.00
0.00
0.67
0.00
1
1
accuracy
macro avg
weighted avg
0.25
0.25
0.50
0.50
-
2
2
2
Decision Tree Classification Report:
precision
recall f1-score
support
0
1
0.50
0.00
1.00
0.00
0.67
0.00
1
1
accuracy
macro avg
weighted avg
0.25
0.25
0.50
0.50
-
2
2
2
Random Forest Classification Report:
precision
recall f1-score
support
0
1
0.50
0.00
1.00
0.00
0.67
0.00
1
1
accuracy
macro avg
weighted avg
0.25
0.25
0.50
0.50
-
2
2
2
ROC AUC for Logistic Regression: 0.50
ROC AUC for Decision Tree: 0.50
ROC AUC for Random Forest: 0.50
In [ ]: