
@article{cmc.2025.064872,
  author   = {Farea, Ali Hamid and Askerzade, Iman and Alhazmi, Omar H. and Takan, Sava{\c{s}}},
  title    = {{FSFS}: A Novel Statistical Approach for Fair and Trustworthy Impactful Feature Selection in Artificial Intelligence Models},
  journal  = {Computers, Materials \& Continua},
  volume   = {84},
  number   = {1},
  pages    = {1457--1484},
  year     = {2025},
  issn     = {1546-2226},
  url      = {http://www.techscience.com/cmc/v84n1/61772},
  doi      = {10.32604/cmc.2025.064872},
  abstract = {Feature selection (FS) is a pivotal pre-processing step in developing data-driven models, influencing reliability, performance and optimization. Although existing FS techniques can yield high-performance metrics for certain models, they do not invariably guarantee the extraction of the most critical or impactful features. Prior literature underscores the significance of equitable FS practices and has proposed diverse methodologies for the identification of appropriate features. However, the challenge of discerning the most relevant and influential features persists, particularly in the context of the exponential growth and heterogeneity of big data---a challenge that is increasingly salient in modern artificial intelligence (AI) applications. In response, this study introduces an innovative, automated statistical method termed Farea Similarity for Feature Selection (FSFS). The FSFS approach computes a similarity metric for each feature by benchmarking it against the record-wise mean, thereby finding feature dependencies and mitigating the influence of outliers that could potentially distort evaluation outcomes. Features are subsequently ranked according to their similarity scores, with the threshold established at the average similarity score. Notably, lower FSFS values indicate higher similarity and stronger data correlations, whereas higher values suggest lower similarity. The FSFS method is designed not only to yield reliable evaluation metrics but also to reduce data complexity without compromising model performance. Comparative analyses were performed against several established techniques, including Chi-squared (CS), Correlation Coefficient (CC), Genetic Algorithm (GA), Exhaustive Approach, Greedy Stepwise Approach, Gain Ratio, and Filtered Subset Eval, using a variety of datasets such as the Experimental Dataset, Breast Cancer Wisconsin (Original), KDD CUP 1999, NSL-KDD, UNSW-NB15, and Edge-IIoT. In the absence of the FSFS method, the highest classifier accuracies observed were 60.00\%, 95.13\%, 97.02\%, 98.17\%, 95.86\%, and 94.62\% for the respective datasets. When the FSFS technique was integrated with data normalization, encoding, balancing, and feature importance selection processes, accuracies improved to 100.00\%, 97.81\%, 98.63\%, 98.94\%, 94.27\%, and 98.46\%, respectively. The FSFS method, with a computational complexity of {$O(f_n \log n)$}, demonstrates robust scalability and is well-suited for datasets of large size, ensuring efficient processing even when the number of features is substantial. By automatically eliminating outliers and redundant data, FSFS reduces computational overhead, resulting in faster training and improved model performance. Overall, the FSFS framework not only optimizes performance but also enhances the interpretability and explainability of data-driven models, thereby facilitating more trustworthy decision-making in AI applications.},
}



