
% Journal article: product data extraction from e-commerce websites
% (Computers, Materials & Continua, vol. 65, no. 3, 2020).
% Authors are stored in "Last, First" form and joined with " and ", the only
% name separator BibTeX recognises; commas between names break name parsing.
@article{cmc.2020.011485,
  author   = {Akhtar, Malik Javed and Ahmad, Zahur and Amin, Rashid and Almotiri, Sultan H. and Al Ghamdi, Mohammed A. and Aldabbas, Hamza},
  title    = {An Efficient Mechanism for Product Data Extraction from E-Commerce Websites},
  journal  = {Computers, Materials \& Continua},
  volume   = {65},
  number   = {3},
  pages    = {2639--2663},
  year     = {2020},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2020.011485},
  url      = {http://www.techscience.com/cmc/v65n3/40192},
  abstract = {A large amount of data is present on the web which can be used for useful
    purposes like a product recommendation, price comparison and demand forecasting for a
    particular product. Websites are designed for human understanding and not for machines.
    Therefore, to make data machine-readable, it requires techniques to grab data from web
    pages. Researchers have addressed the problem using two approaches, i.e., knowledge
    engineering and machine learning. State of the art knowledge engineering approaches use
    the structure of documents, visual cues, clustering of attributes of data records and text
    processing techniques to identify data records on a web page. Machine learning
    approaches use annotated pages to learn rules. These rules are used to extract data from
    unseen web pages. The structure of web documents is continuously evolving. Therefore,
    new techniques are needed to handle the emerging requirements of web data extraction.
    In this paper, we have presented a novel, simple and efficient technique to extract data
    from web pages using visual styles and structure of documents. The proposed technique
    detects Rich Data Region (RDR) using query and correlative words of the query. RDR is
    then divided into data records using style similarity. Noisy elements are removed using a
    Common Tag Sequence (CTS) and formatting entropy. The system is implemented using
    JAVA and runs on the dataset of real-world working websites. The effectiveness of
    results is evaluated using precision, recall, and F-measure and compared with five
    existing systems. A comparison of the proposed technique to existing systems has shown
    encouraging results.},
}



