
% Journal article: LAME layout-aware metadata extraction (Computers, Materials and Continua, vol. 72, no. 2, 2022).
@article{cmc.2022.025711,
  author   = {Choi, Jongyun and Kong, Hyesoo and Yoon, Hwamook and Oh, Heungseon and Jung, Yuchul},
  title    = {{LAME}: Layout-Aware Metadata Extraction Approach for Research Articles},
  journal  = {Computers, Materials \& Continua},
  volume   = {72},
  number   = {2},
  pages    = {4019--4037},
  year     = {2022},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2022.025711},
  url      = {http://www.techscience.com/cmc/v72n2/47204},
  abstract = {The volume of academic literature, such as academic conference papers and journals, has increased rapidly worldwide, and research on metadata extraction is ongoing. However, high-performing metadata extraction is still challenging due to diverse layout formats according to journal publishers. To accommodate the diversity of the layouts of academic journals, we propose a novel LAyout-aware Metadata Extraction (LAME) framework equipped with the three characteristics (e.g., design of automatic layout analysis, construction of a large meta-data training set, and implementation of metadata extractor). In the framework, we designed an automatic layout analysis using PDFMiner. Based on the layout analysis, a large volume of metadata-separated training data, including the title, abstract, author name, author affiliated organization, and keywords, were automatically extracted. Moreover, we constructed a pre-trained model, Layout-MetaBERT, to extract the metadata from academic journals with varying layout formats. The experimental results with our metadata extractor exhibited robust performance (Macro-F1, 93.27\%) in metadata extraction for unseen journals with different layout formats.},
}



