
@article{cmc.2026.079522,
  author   = {Sakovich, Nikita and Aksenov, Dmitry and Pleshakova, Ekaterina and Gataullin, Sergey},
  title    = {{AI} Model Compression Methods: A {Distribution-Aware Residual Entropy Quantization}},
  journal  = {Computers, Materials \& Continua},
  year     = {2026},
  url      = {http://www.techscience.com/cmc/online/detail/26790},
  issn     = {1546-2226},
  abstract = {We introduce the DARE-Q (Distribution-Aware Residual Entropy Quantization) method---a post-training quantization method for neural network weights designed to reduce bit-width with minimal degradation of model quality. Unlike traditional approaches that solely optimize the mean squared error of weight approximation, DARE-Q additionally considers the entropy of the quantization residual, allowing for control over the statistical properties of the resulting error. The method is based on channel-wise symmetric uniform quantization with scaling based on a combined loss function that includes L2 distortion and entropy regularization. The DARE-Q method is implemented as a compact DAREQuantLinear module which can be easily integrated into standard transformer pipelines without changing the inference logic or using specific kernels. The experimental analysis was conducted on the language models \texttt{facebook/opt-125m} and \texttt{facebook/opt-350m}, which contain approximately 125 and 350 million parameters. The quality of the models was assessed using the standard perplexity metric (PPL) computed on the \texttt{wikitext-2-raw-v1} dataset. DARE-Q is completely data-free and does not require model retraining or calibration data, which makes it the only viable option in privacy-sensitive or confidential environments where access to the original training data is restricted---precisely the setting where methods such as GPTQ and AWQ cannot be applied. The observed increase in PPL relative to data-dependent baselines reflects this fundamental trade-off rather than a shortcoming of the approach. By leveraging per-channel scale selection and a combined loss function, DARE-Q provides a flexible trade-off between approximation accuracy and quantization error structure, creating an attractive algorithmic basis for further improvement of model compression methods.},
  doi      = {10.32604/cmc.2026.079522},
  note     = {Online first; volume, number, and pages not yet assigned},
}



