
% Journal article record for DOI 10.32604/cmc.2025.073177.
% Author names are separated with " and " as BibTeX requires.
% No pages field: the exported record carried only an empty "--" placeholder;
% add the real page range or article number once it is known.
@article{cmc.2025.073177,
  author   = {Chih-Chieh Chang and Khairul Izyan Bin Anuar and Yu-Hwa Liu},
  title    = {Structure-Based Virtual Sample Generation Using Average-Linkage Clustering for Small Dataset Problems},
  journal  = {Computers, Materials \& Continua},
  volume   = {87},
  number   = {1},
  year     = {2026},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2025.073177},
  url      = {http://www.techscience.com/cmc/v87n1/66051},
  abstract = {Small datasets are often challenging due to their limited sample size. This research introduces a novel solution to these problems: average linkage virtual sample generation (ALVSG). ALVSG leverages the underlying data structure to create virtual samples, which can be used to augment the original dataset. The ALVSG process consists of two steps. First, an average-linkage clustering technique is applied to the dataset to create a dendrogram. The dendrogram represents the hierarchical structure of the dataset, with each merging operation regarded as a linkage. Next, the linkages are combined into an average-based dataset, which serves as a new representation of the dataset. The second step in the ALVSG process involves generating virtual samples using the average-based dataset. The research project generates a set of 100 virtual samples by uniformly distributing them within the provided boundary. These virtual samples are then added to the original dataset, creating a more extensive dataset with improved generalization performance. The efficacy of the ALVSG approach is validated through resampling experiments and t-tests conducted on two small real-world datasets. The experiments are conducted on three forecasting models: the support vector machine for regression (SVR), the deep learning model (DL), and XGBoost. The results show that the ALVSG approach outperforms the baseline methods in terms of mean square error (MSE), root mean square error (RMSE), and mean absolute error (MAE).},
}



