Etinosa David Eribo | Freelancer Livestock Data Analysis

Livestock Data Analysis

Import needed package In [372]: import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn import preprocessing import warnings warnings.filterwarnings('ignore') %matplotlib inline In [374]: livestock_data = pd.read_csv('-.csv') In [376]: livestock_data.head() Out[376]: REF_DATE GEO DGUID Livestock Survey date Farm type UOM UOM_ID 0 1931 Canada 2016A- Total cattle On all At July cattle 1 operations Head 148 thousand 1 1931 Canada 2016A- Bulls, 1 year and over On all At July cattle 1 operations Head 148 thousand 2 1931 Canada 2016A- Dairy cows On all At July cattle 1 operations Head 148 thousand 3 1931 Canada 2016A- Beef cows On all At July cattle 1 operations Head 148 thousand 4 1931 Canada 2016A- Total heifers On all At July cattle 1 operations Head 148 thousand In [378]: livestock_data.drop(['STATUS', 'SYMBOL', 'TERMINATED'], axis=1, inplace=True) In [380]: livestock_data.head() Out[380]: In [382]: SCALAR_FACTO REF_DATE GEO DGUID Livestock Survey date Farm type UOM UOM_ID SCALAR_FACTO 0 1931 Canada 2016A- Total cattle On all At July cattle 1 operations Head 148 thousand 1 1931 Canada 2016A- Bulls, 1 year and over On all At July cattle 1 operations Head 148 thousand 2 1931 Canada 2016A- Dairy cows On all At July cattle 1 operations Head 148 thousand 3 1931 Canada 2016A- Beef cows On all At July cattle 1 operations Head 148 thousand 4 1931 Canada 2016A- Total heifers On all At July cattle 1 operations Head 148 thousand livestock_data.isnull().sum() Out[382]: REF_DATE GEO DGUID Livestock Survey date Farm type UOM UOM_ID SCALAR_FACTOR SCALAR_ID VECTOR COORDINATE VALUE DECIMALS dtype: int64 - In [384]: livestock_data['VALUE'] = livestock_data['VALUE'].fillna(0) In [386]: livestock_data.isnull().sum() Out[386]: REF_DATE GEO DGUID Livestock Survey date Farm type UOM UOM_ID SCALAR_FACTOR SCALAR_ID VECTOR COORDINATE VALUE DECIMALS dtype: int64 In [388]: livestock_data.dtypes Out[388]: REF_DATE GEO DGUID Livestock Survey date 
Farm type UOM UOM_ID SCALAR_FACTOR SCALAR_ID VECTOR COORDINATE VALUE DECIMALS dtype: object In [390]: livestock_data.describe() - int64 object object object object object object int64 object int64 object object float64 int64 # Replace NaN in column 'VALU REF_DATE UOM_ID Out[390]: count- In [392]: SCALAR_ID VALUE DECIMALS 48738.0 48738.0 - 48738.0 mean - 148.0 3.0 - 1.0 std - 0.0 0.0 - 0.0 min - 148.0 3.0 - 1.0 25% - 148.0 3.0 - 1.0 50% - 148.0 3.0 - 1.0 75% - 148.0 3.0 - 1.0 max - 148.0 - 1.0 livestock_data.describe(include='object') Out[392]: Livestock Survey date GEO DGUID count 48738 41896 48738 48738 48738 48738 48738 48738 unique 14 12 11 2 6 1 1 1408 top Canada 2016A- Total heifers On all At July cattle 1 operations Head thousands v61382 48738 48738 94 freq 4575 4575 4900 In [394]: livestock_data.value_counts('Livestock') Out[394]: Livestock Total heifers Beef cows Bulls, 1 year and over Calves, under 1 year Dairy cows Heifers for dairy replacement Steers, 1 year and over Total beef heifers Total cattle Heifers for beef replacement Heifers for slaughter Name: count, dtype: int64 - In [396]: livestock_data.value_counts('REF_DATE') Out[396]: REF_DATE- ..- Name: count, Length: 94, dtype: int64 24809 Farm type 21238 UOM SCALAR_FACTOR VECTOR CO In [398]: plt.figure(figsize=(10,8)) sns.set(font_scale=1.3) sns.boxplot(data=livestock_data, x='REF_DATE', y='Livestock') plt.show() In [400]: plt.figure(figsize=(25,8)) sns.set(font_scale=1.3) sns.lineplot(data=livestock_data, x='REF_DATE', y='Livestock') plt.show() In [402]: plt.figure(figsize=(25,8)) sns.set(font_scale=1.2) sns.barplot(data=livestock_data, x='GEO', y='Livestock') plt.show() In [404]: livestock_data.select_dtypes('float').corr() VALUE Out[404]: VALUE In [406]: 1.0 livestock_data.head() Out[406]: REF_DATE GEO DGUID Livestock Survey date Farm type UOM UOM_ID SCALAR_FACTO 0 1931 Canada 2016A- Total cattle On all At July cattle 1 operations Head 148 thousand 1 1931 Canada 2016A- Bulls, 1 year and over On all 
At July cattle 1 operations Head 148 thousand 2 1931 Canada 2016A- Dairy cows On all At July cattle 1 operations Head 148 thousand 3 1931 Canada 2016A- Beef cows On all At July cattle 1 operations Head 148 thousand 4 1931 Canada 2016A- Total heifers On all At July cattle 1 operations Head 148 thousand Feature Encoding In [408]: livestock_data["Livestock"].unique() Out[408]: array(['Total cattle', 'Bulls, 1 year and over', 'Dairy cows', 'Beef cows', 'Total heifers', 'Heifers for dairy replacement', 'Total beef heifers', 'Steers, 1 year and over', 'Calves, under 1 year', 'Heifers for beef replacement', 'Heifers for slaughter'], dtype=object) In [410]: livestock_data["Survey date"].unique() Out[410]: array(['At July 1', 'At January 1'], dtype=object) In [412]: livestock_data.head() Out[412]: In [414]: REF_DATE GEO DGUID Livestock Survey date Farm type UOM UOM_ID SCALAR_FACTO 0 1931 Canada 2016A- Total cattle On all At July cattle 1 operations Head 148 thousand 1 1931 Canada 2016A- Bulls, 1 year and over On all At July cattle 1 operations Head 148 thousand 2 1931 Canada 2016A- Dairy cows On all At July cattle 1 operations Head 148 thousand 3 1931 Canada 2016A- Beef cows On all At July cattle 1 operations Head 148 thousand 4 1931 Canada 2016A- Total heifers On all At July cattle 1 operations Head 148 thousand from sklearn import preprocessing label_encode = ['Survey date'] mapping_trip_type = { 'At July 1' : 0, 'At January 1' : 1, } livestock_data['Survey date'] = preprocessing.LabelEncoder() \ .fit_transform(livestock_data['Survey date']) In [416]: livestock_data.head() Out[416]: In [431]: REF_DATE GEO DGUID Livestock Survey date Farm type UOM UOM_ID SCALAR_FACTO 0 1931 Canada 2016A- Total cattle 1 On all cattle operations Head 148 thousand 1 1931 Canada 2016A- Bulls, 1 year and over 1 On all cattle operations Head 148 thousand 2 1931 Canada 2016A- Dairy cows 1 On all cattle operations Head 148 thousand 3 1931 Canada 2016A- Beef cows 1 On all cattle operations Head 148 
# Rendered table text that shares this line with the cell below (kept verbatim):
# thousand 4 1931 Canada 2016A- Total heifers 1 On all cattle operations Head 148 thousand

# Cell In [431]: fit three classifiers on a small inline example frame and
# compare them by classification report, ROC AUC and accuracy.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

# Example DataFrame: ten hand-written rows defined right here; this cell does
# not read from livestock_data.
data = {
    'Livestock': [
        'Total cattle',
        'Dairy cows',
        'Bulls, 1 year and over',
        'Beef cows',
        'Unknown type',
        'Heifers for slaughter',
        'Total heifers',
        'Calves, under 1 year',
        ' Dairy cows ',
        ' Bulls, 1 year and over ',
    ],
    'SurveyData': [1, 0, 1, 0, 0, 1, 1, 0, 1, 0],
}
df = pd.DataFrame(data)

# Strip surrounding spaces so padded and unpadded spellings of the same
# category (e.g. ' Dairy cows ' and 'Dairy cows') receive the same code.
df['Livestock'] = df['Livestock'].str.strip()

# Integer-encode the cleaned category names.
encoder = LabelEncoder()
df['Livestock_encoded'] = encoder.fit_transform(df['Livestock'])

# Single encoded feature column (X) and the binary target (y).
X = df[['Livestock_encoded']]
y = df['SurveyData']

# 80% train / 20% test; with ten rows the test fold holds two samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The three candidate models, kept in the order used for every result below.
logreg = LogisticRegression()
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)
candidates = [logreg, dt_model, rf_model]

# Train each model on the same training fold.
for estimator in candidates:
    estimator.fit(X_train, y_train)

# Hard predictions on the held-out fold.
y_pred_logreg, y_pred_dt, y_pred_rf = [
    fitted.predict(X_test) for fitted in candidates
]

# Accuracy of each model on the held-out fold.
acc_logreg, acc_dt, acc_rf = [
    accuracy_score(y_test, predicted)
    for predicted in (y_pred_logreg, y_pred_dt, y_pred_rf)
]

# Per-model classification reports, printed in model order.
report_sections = (
    ("Logistic Regression Classification Report:", y_pred_logreg),
    ("Decision Tree Classification Report:", y_pred_dt),
    ("Random Forest Classification Report:", y_pred_rf),
)
for heading, predicted in report_sections:
    print(heading)
    print(classification_report(y_test, predicted))

# ROC AUC, scored on column 1 of each model's predict_proba output.
roc_auc_logreg, roc_auc_dt, roc_auc_rf = [
    roc_auc_score(y_test, fitted.predict_proba(X_test)[:, 1])
    for fitted in candidates
]

print(f"ROC AUC for Logistic Regression: {roc_auc_logreg:.2f}")
print(f"ROC AUC for Decision Tree: {roc_auc_dt:.2f}")
print(f"ROC AUC for Random Forest: {roc_auc_rf:.2f}")

# Bar chart comparing the three accuracies.
accuracy_results = [acc_logreg, acc_dt, acc_rf]
model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest']

plt.figure(figsize=(8, 6))
sns.barplot(x=model_names, y=accuracy_results, palette='viridis')
plt.title('Model Comparison (Accuracy)')
plt.ylabel('Accuracy')
plt.show()

# Output text that follows the cell in the export (kept verbatim; the report
# tables' columns were flattened by the export):
# Logistic Regression Classification Report: precision recall f1-score support 0 1 0.50 0.00 1.00 0.00 0.67 0.00 1 1 accuracy macro avg weighted avg 0.25 0.25 0.50 0.50 - 2 2 2
# Decision Tree Classification Report: precision recall f1-score support 0 1 0.50 0.00 1.00 0.00 0.67 0.00 1 1 accuracy macro avg weighted avg 0.25 0.25 0.50 0.50 - 2 2 2
# Random Forest Classification Report: precision recall f1-score support 0 1 0.50 0.00 1.00 0.00 0.67 0.00 1 1 accuracy macro avg weighted avg 0.25 0.25 0.50 0.50 - 2 2 2
# ROC AUC for Logistic Regression: 0.50
# ROC AUC for Decision Tree: 0.50
# ROC AUC for Random Forest: 0.50
# In [ ]: