
% Journal article record (Computers, Materials & Continua, vol. 64, no. 3, 2020).
% Authors are separated with " and " and written as "Family, Given" so BibTeX
% parses six distinct names.
% NOTE(review): the family/given split assumes the exported record listed the
% given name first -- confirm against the published paper.
@article{cmc.2020.011158,
  author   = {Wang, Jinlin and Wang, Xing and Zhang, Hongli and Fang, Binxing and Yang, Yuchen and Liu, Jianan},
  title    = {Information Classification and Extraction on Official {Web} Pages of Organizations},
  journal  = {Computers, Materials \& Continua},
  volume   = {64},
  number   = {3},
  pages    = {2057--2073},
  year     = {2020},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2020.011158},
  url      = {http://www.techscience.com/cmc/v64n3/39476},
  abstract = {As a real-time and authoritative source, the official Web pages of organizations
contain a large amount of information. The diversity of Web content and format makes it
essential for pre-processing to get the unified attributed data, which has the value of
organizational analysis and mining. The existing research on dealing with multiple Web
scenarios and accuracy performance is insufficient. This paper aims to propose a method to
transform organizational official Web pages into the data with attributes. After locating the
active blocks in the Web pages, the structural and content features are proposed to classify
information with the specific model. The extraction methods based on trigger lexicon and
LSTM (Long Short-Term Memory) are proposed, which efficiently process the classified
information and extract data that matches the attributes. Finally, an accurate and efficient
method to classify and extract information from organizational official Web pages is formed.
Experimental results show that our approach improves the performing indicators and
exceeds the level of state of the art on real data set from organizational official Web pages.},
}



