
Journal article: Roman Urdu news headline classification (Computers, Materials and Continua, vol. 65, no. 2, 2020).
Authors are separated with " and " and given as "Last, First"; proper nouns in the title are brace-protected against style recasing.

@article{cmc.2020.011686,
  author   = {Naqvi, Rizwan Ali and Khan, Muhammad Adnan and Malik, Nauman and Saqib, Shazia and Alyas, Tahir and Hussain, Dildar},
  title    = {{Roman} {Urdu} News Headline Classification Empowered with Machine Learning},
  journal  = {Computers, Materials \& Continua},
  volume   = {65},
  number   = {2},
  pages    = {1221--1236},
  year     = {2020},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2020.011686},
  url      = {http://www.techscience.com/cmc/v65n2/39870},
  abstract = {Roman Urdu has been used for text messaging over the Internet for years
especially in Indo-Pak Subcontinent. Persons from the subcontinent may speak the
same Urdu language but they might be using different scripts for writing. The
communication using the Roman characters, which are used in the script of Urdu
language on social media, is now considered the most typical standard of
communication in an Indian landmass that makes it an expensive information supply.
English Text classification is a solved problem but there have been only a few efforts
to examine the rich information supply of Roman Urdu in the past. This is due to the
numerous complexities involved in the processing of Roman Urdu data. The
complexities associated with Roman Urdu include the non-availability of the tagged
corpus, lack of a set of rules, and lack of standardized spellings. A large amount of
Roman Urdu news data is available on mainstream news websites and social media
websites like Facebook, Twitter but meaningful information can only be extracted if
data is in a structured format. We have developed a Roman Urdu news headline
classifier, which will help to classify news into relevant categories on which further
analysis and modeling can be done. The author of this research aims to develop the
Roman Urdu news classifier, which will classify the news into five categories
(health, business, technology, sports, international). First, we will develop the news
dataset using scraping tools and then after preprocessing, we will compare the results
of different machine learning algorithms like Logistic Regression (LR), Multinomial
Naïve Bayes (MNB), Long short term memory (LSTM), and Convolutional Neural
Network (CNN). After this, we will use a phonetic algorithm to control lexical
variation and test news from different websites. The preliminary results suggest that
a more accurate classification can be accomplished by monitoring noise inside data
and by classifying the news. After applying above mentioned different machine
learning algorithms, results have shown that Multinomial Naïve Bayes classifier is
giving the best accuracy of 90.17\% which is due to the noise lexical variation.},
}



