% Journal article entry. Author names are written as "Last, First" and joined
% with " and ", which is the only name separator BibTeX recognises.
% Proper nouns in the title are braced so sentence-casing styles keep their capitals.
@Article{cmes.2022.019535,
AUTHOR = {Chandio, Bilal and Shaikh, Asadullah and Bakhtyar, Maheen and Alrizq, Mesfer and Baber, Junaid and Sulaiman, Adel and Rajab, Adel and Noor, Waheed},
TITLE = {Sentiment Analysis of {Roman} {Urdu} on E-Commerce Reviews Using Machine Learning},
JOURNAL = {Computer Modeling in Engineering \& Sciences},
VOLUME = {131},
YEAR = {2022},
NUMBER = {3},
PAGES = {1263--1287},
URL = {http://www.techscience.com/CMES/v131n3/47397},
ISSN = {1526-1506},
ABSTRACT = {Sentiment analysis task has widely been studied for various languages such as English and French. However, Roman
Urdu sentiment analysis yet requires more attention from peer-researchers due to the lack of Off-the-Shelf Natural
Language Processing (NLP) solutions. The primary objective of this study is to investigate the diverse machine
learning methods for the sentiment analysis of Roman Urdu data which is very informal in nature and needs to be
lexically normalized. To mitigate this challenge, we propose a fine-tuned Support Vector Machine (SVM) powered
by Roman Urdu Stemmer. In our proposed scheme, the corpus data is initially cleaned to remove the anomalies from
the text. After initial pre-processing, each user review is being stemmed. The input text is transformed into a feature
vector using the bag-of-word model. Subsequently, the SVM is used to classify and detect user sentiment. Our
proposed scheme is based on a dictionary based Roman Urdu stemmer. The creation of the Roman Urdu stemmer
is aimed at standardizing the text so as to minimize the level of complexity. The efficacy of our proposed model is
also empirically evaluated with diverse experimental configurations, so as to fine-tune the hyper-parameters and
achieve superior performance. Moreover, a series of experiments are conducted on diverse machine learning and
deep learning models to compare the performance with our proposed model. We also introduced the largest dataset
on Roman Urdu, i.e., Roman Urdu e-commerce dataset (RUECD), which contains 26K+ user reviews annotated by
the group of experts. The RUECD is challenging and the largest dataset available of Roman Urdu. The experiments
show that the newly generated dataset is quite challenging and requires more attention from the peer researchers
for Roman Urdu sentiment analysis.},
DOI = {10.32604/cmes.2022.019535}
}