
@Article{cmc.2026.080068,
AUTHOR = {Darren Chai Xin Lun and Lim Tong Ming},
TITLE = {H-LoRA: Rethinking Rank Selection for Controllable Knowledge Retention in Edge AI},
JOURNAL = {Computers, Materials \& Continua},
VOLUME = {},
YEAR = {},
NUMBER = {},
PAGES = {{pages}},
URL = {http://www.techscience.com/cmc/online/detail/26644},
ISSN = {1546-2226},
ABSTRACT = {The deployment of specialized language models in resource-constrained edge environments ($\leq 1$B parameters, $\leq 2$ GB memory, $\leq 100$ ms latency) faces a critical challenge: Supervised Fine-Tuning (SFT) achieves domain expertise but suffers from irreversible catastrophic forgetting, while traditional Low-Rank Adaptation (LoRA) with conservative ranks ($r \leq 64$) often underperforms due to insufficient adaptation capacity. This work introduces H-LoRA (High-Rank LoRA) for edge-deployable models and establishes a fundamental distinction between destructive forgetting and controllable knowledge retention. Through comprehensive experiments on compact models (0.12B Minimind and Qwen-0.5B) across three domains (Human Resources, Medical, Mathematics) using 29,647 samples, we demonstrate that while both SFT and H-LoRA exhibit general capability degradation, they differ fundamentally: SFT completely destroys the original knowledge structure (1\% topic retention), while H-LoRA maintains knowledge integrity with 90\% topic retention—an 89 percentage point improvement—enabling post-deployment capability recovery. H-LoRA employs simplified scaling and strategic high-rank adaptation at approximately two-thirds of the model's hidden dimension ($r = 512$ for $d = 768$), achieving SFT-level domain performance (99.81\% precision) with $5\times$ greater parameter efficiency (20.35\% trainable parameters) and robust cross-domain generalization (93.5 $\pm$ 6.8\% average precision). In addition, H-LoRA reduces over-the-air (OTA) update size from 1.4 GB to 96 MB ($\approx$93\%), enabling practical and frequent deployment of specialized models in bandwidth-limited edge environments.
Beyond demonstrating effectiveness, this work establishes the first comprehensive framework for characterizing specialization-retention trade-offs in parameter-efficient fine-tuning, providing practical guidance for method selection in real-world deployments.},
DOI = {10.32604/cmc.2026.080068}
}



