"""Emotion classification on the Kaggle "emotions" text dataset.

Zainab Munir | Freelancer Portfolio Item #424815

Pipeline
--------
1. Load ``/kaggle/input/emotions/text.csv`` (columns: ``Unnamed: 0``,
   ``text``, ``label``).
2. Clean the text with regular expressions, then tokenize and lemmatize it
   with NLTK.
3. Split into train/test sets and build TF-IDF features.
4. Train and score Logistic Regression, linear SVM, Decision Tree, Random
   Forest, KNN and a small dense neural network (Keras).
5. Plot the accuracy of every model.

Results recorded in the original notebook run (Kaggle, Tesla P100-PCIE-16GB)
---------------------------------------------------------------------------
* Dataset: 416,809 rows, no null values, labels ``[4, 0, 2, 1, 5, 3]``.
* TF-IDF matrix: ``(416809, 5000)``; test split: 83,362 rows.
* Accuracy: Logistic Regression 0.8827, SVM 0.797 (5,000-row subset),
  Decision Tree 0.8072, Random Forest 0.8399, KNN 0.6458, ANN 0.8808.
* ANN validation accuracy per epoch: 0.8833, 0.8844, 0.8848, 0.8816,
  0.8803, 0.8812; 324,614 trainable parameters.

NOTE: those numbers were produced with the TF-IDF vectorizer fitted on the
whole dataset and with the SVM scored on its own 1,000-row test split. This
script fits the vectorizer on the training split only and scores every model
on the same test split, so freshly computed numbers may differ slightly.
"""

import os
import re
from functools import lru_cache

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory.
INPUT_DIR = "/kaggle/input"
DATA_PATH = "/kaggle/input/emotions/text.csv"

RANDOM_STATE = 42
TEST_SIZE = 0.2
MAX_FEATURES = 5000
# SVC training cost grows quickly with the number of rows, so the SVM is
# trained on a subsample of the training split only.
SVM_TRAIN_ROWS = 4000

# Accuracies recorded in the original notebook run (see module docstring).
# `plot_results(REPORTED_ACCURACIES)` redraws the original chart without
# re-training anything.
REPORTED_ACCURACIES = {
    "LogisticReg.": 0.8827,
    "SVM": 0.797,
    "Decision_T": 0.8072,
    "Rand_F.": 0.8399,
    "KNN": 0.6458,
    "ANN": 0.8808,
}

# Patterns are compiled once because `clean_text` runs on every row.
_HTML_TAG_RE = re.compile(r"<.*?>")
_MENTION_RE = re.compile(r"@\w+")
_URL_RE = re.compile(r"http\S+|www\.\S+")
_BRACKETED_RE = re.compile(r"\[.*?\]")
_NON_LETTER_RE = re.compile(r"[^a-zA-Z\s]")


def list_input_files(root=INPUT_DIR):
    """Print the path of every file found under *root*."""
    for dirname, _, filenames in os.walk(root):
        for filename in filenames:
            print(os.path.join(dirname, filename))


def report_devices():
    """Print the GPUs visible to TensorFlow and whether CUDA is usable by PyTorch."""
    # Imported here so the text helpers stay importable without these packages.
    import tensorflow as tf
    import torch

    print("TF GPUs:", tf.config.list_physical_devices('GPU'))
    if torch.cuda.is_available():
        print("CUDA is available ✅")
        print("Device Name:", torch.cuda.get_device_name(0))
    else:
        print("CUDA is NOT available ❌")


def clean_text(text):
    """Return *text* lower-cased with markup, mentions, URLs and non-letters removed.

    Steps, in order: strip ``<...>`` tags, ``@mentions``, URLs and
    ``[...]`` spans, replace every character that is not an ASCII letter or
    whitespace with a space, lower-case, and collapse runs of whitespace.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string (possibly empty).
    """
    for pattern in (_HTML_TAG_RE, _MENTION_RE, _URL_RE, _BRACKETED_RE):
        text = pattern.sub(" ", text)
    # The notebook wrote this pattern as 'r[^a-zA-Z\s]' (raw prefix inside the
    # quotes), which never stripped punctuation and instead deleted an "r"
    # followed by punctuation. Digits are covered by this pattern as well, so
    # no separate `\d+` pass is needed.
    text = _NON_LETTER_RE.sub(" ", text)
    return " ".join(text.lower().split())


@lru_cache(maxsize=1)
def _nltk_tools():
    """Return ``(word_tokenize, lemmatizer)``, importing NLTK on first use."""
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize

    return word_tokenize, WordNetLemmatizer()


def tokenize_and_lemm(text):
    """Tokenize *text* and lemmatize every token as a verb.

    Args:
        text: A cleaned string (see `clean_text`).

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    word_tokenize, lemmatizer = _nltk_tools()
    return " ".join(
        lemmatizer.lemmatize(token, pos="v") for token in word_tokenize(text)
    )


def load_dataset(path=DATA_PATH):
    """Read the CSV at *path* and print a short overview of it.

    Returns:
        The loaded `pandas.DataFrame`.
    """
    df = pd.read_csv(path)
    df.info()
    print(df.head(10))
    print(df.isnull().sum())
    return df


def preprocess(df):
    """Add a ``clean_text`` column (cleaned, then lemmatized) to *df* in place.

    Returns:
        The same DataFrame, for convenience.
    """
    df["clean_text"] = df["text"].apply(clean_text)
    print(df.head(20))
    df["clean_text"] = df["clean_text"].apply(tokenize_and_lemm)
    print(df.head(20))
    return df


def build_features(df):
    """Split *df* into train/test sets and vectorize the text with TF-IDF.

    The vectorizer is fitted on the training texts only, so no vocabulary or
    IDF statistics leak from the test split into the features.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` values are
        sparse TF-IDF matrices and the ``y`` values are label Series.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.model_selection import train_test_split

    text_train, text_test, y_train, y_test = train_test_split(
        df["clean_text"],
        df["label"],
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
    )
    tfidf_vectorizer = TfidfVectorizer(
        max_features=MAX_FEATURES, stop_words="english"
    )
    X_train = tfidf_vectorizer.fit_transform(text_train)
    X_test = tfidf_vectorizer.transform(text_test)
    print(X_train.shape, X_test.shape)
    return X_train, X_test, y_train, y_test


def evaluate_classical_models(X_train, X_test, y_train, y_test):
    """Train the scikit-learn models and score each on the shared test split.

    Returns:
        A dict mapping the model's display name to its test accuracy.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    models = {
        # One-vs-rest logistic regression; the `multi_class="ovr"` argument
        # used in the notebook is deprecated in recent scikit-learn releases.
        "LogisticReg.": OneVsRestClassifier(LogisticRegression(max_iter=1000)),
        "SVM": SVC(kernel="linear"),
        "Decision_T": DecisionTreeClassifier(random_state=RANDOM_STATE),
        "Rand_F.": RandomForestClassifier(
            n_estimators=100, random_state=RANDOM_STATE
        ),
        "KNN": KNeighborsClassifier(n_neighbors=6),
    }

    accuracies = {}
    for name, model in models.items():
        if name == "SVM":
            # The training split is already shuffled, so its first rows are a
            # random subsample.
            model.fit(
                X_train[:SVM_TRAIN_ROWS], y_train.iloc[:SVM_TRAIN_ROWS]
            )
        else:
            model.fit(X_train, y_train)
        print(f"{name} model has been trained.")

        y_pred = model.predict(X_test)
        accuracies[name] = accuracy_score(y_test, y_pred)
        print(f"{name} accuracy: {accuracies[name]:.4f}")
        print(classification_report(y_test, y_pred))
    return accuracies


def evaluate_ann(X_train, X_test, y_train, y_test):
    """Train a small dense network on the TF-IDF features.

    Returns:
        The accuracy on the test split as a float.
    """
    from tensorflow import keras

    n_features = X_train.shape[1]
    # Labels are consecutive integers starting at 0 (0..5 in the recorded run).
    n_classes = int(max(y_train.max(), y_test.max())) + 1

    model = keras.Sequential([
        # An explicit Input layer replaces `input_shape=` on the first Dense
        # layer, which Keras warns against for Sequential models.
        keras.Input(shape=(n_features,)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(n_classes, activation='softmax'),
    ])
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    model.fit(
        X_train, y_train, epochs=6, batch_size=64, validation_split=0.1
    )

    # Evaluate
    test_loss, test_acc = model.evaluate(X_test, y_test)
    print(f"ANN test loss: {test_loss:.4f}, test accuracy: {test_acc:.4f}")
    model.summary()
    return float(test_acc)


def plot_results(accuracies):
    """Draw a bar chart of model accuracies.

    Args:
        accuracies: Mapping of model display name to accuracy in ``[0, 1]``.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    df_results = pd.DataFrame({
        "Model": list(accuracies),
        "Accuracy": list(accuracies.values()),
    })
    plt.figure(figsize=(10, 7))
    sns.barplot(data=df_results, x="Model", y="Accuracy")
    plt.title("Model Accuracy Comparison")
    plt.ylim(0, 1)
    plt.show()


def main():
    """Run the full notebook pipeline end to end."""
    list_input_files()
    report_devices()

    df = preprocess(load_dataset())
    print(df["label"].unique())
    print(df["text"][10])

    X_train, X_test, y_train, y_test = build_features(df)
    print(X_test.shape, y_test.shape)

    accuracies = evaluate_classical_models(X_train, X_test, y_train, y_test)
    accuracies["ANN"] = evaluate_ann(X_train, X_test, y_train, y_test)
    plot_results(accuracies)

    # In the recorded run, the Logistic Regression and ANN models performed
    # best on this dataset compared to the other models.


if __name__ == "__main__":
    main()