
@Article{cmc.2025.068221,
AUTHOR = {Segyeong Bang, Soeun Kim, Gaeun Ahn, Hyemin Hong, Junhyoung Oh},
TITLE = {A Study on Re-Identification of Natural Language Data Considering Korean Attributes},
JOURNAL = {Computers, Materials \& Continua},
VOLUME = {85},
YEAR = {2025},
NUMBER = {3},
PAGES = {4629--4643},
URL = {http://www.techscience.com/cmc/v85n3/64173},
ISSN = {1546-2226},
ABSTRACT = {This study analyzes the risks of re-identification in Korean text data and proposes a secure, ethical approach to data anonymization. Following the ‘Lee Luda’ AI chatbot incident, concerns over data privacy have increased. The Personal Information Protection Commission of Korea conducted inspections of AI services, uncovering 850 cases of personal information in user input datasets, highlighting the need for pseudonymization standards. While current anonymization techniques remove personal data like names, phone numbers, and addresses, linguistic features such as writing habits and language-specific traits can still identify individuals when combined with other data. To address this, we analyzed 50,000 Korean text samples from the X platform, focusing on language-specific features for authorship attribution. Unlike English, Korean features flexible syntax, honorifics, syllabic and grapheme patterns, and referential terms. These linguistic characteristics were used to enhance re-identification accuracy. Our experiments combined five machine learning models, six stopword processing methods, and four morphological analyzers. By using a tokenizer that captures word frequency and order, and employing the LSTM model, OKT morphological analyzer, and stopword removal, we achieved the maximum authorship attributions accuracy of 90.51%. This demonstrates the significant role of Korean linguistic features in re-identification. The findings emphasize the risk of re-identification through language data and call for a re-evaluation of anonymization methods, urging the consideration of linguistic traits in anonymization beyond simply removing personal information.},
DOI = {10.32604/cmc.2025.068221}
}



