
@article{cmc.2026.079331,
  author   = {Caporro, Giulio and Russo, Paolo},
  title    = {{DeepEchoNet}: A Lightweight Architecture for Low Resolution Monocular Depth Estimation},
  journal  = {Computers, Materials \& Continua},
  year     = {2026},
  note     = {Online first; volume, issue, and pages not yet assigned},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2026.079331},
  url      = {http://www.techscience.com/cmc/online/detail/26641},
  abstract = {Monocular depth estimation (MDE) has become a practical alternative to active range sensing in many indoor scenarios, enabled by supervised deep learning models that predict dense depth maps from a single RGB image. However, most modern MDE systems assume mid-to-high resolution inputs and non-trivial compute budgets, limiting their direct applicability in embedded and bandwidth-constrained settings. This paper studies \emph{low resolution} MDE, focusing on $96 \times 96$ inputs, where geometric cues are strongly degraded and naively downsizing high-resolution architectures often leads to unstable training and poor accuracy. We propose DeepEchoNet, a lightweight hybrid CNN-transformer model tailored to operate natively at $96 \times 96$ resolution. The design combines a MobileViT-inspired encoder with MobileNetV2-style inverted residual blocks and lightweight transformer blocks, and a guided decoder that selectively fuses multi-scale skip features through efficient recalibration modules and separable convolutions. We further adopt a training objective that is aware of low resolution, along with a joint RGB--depth augmentation pipeline that includes a strong-to-weak schedule, to improve robustness while preserving coarse geometric consistency.},
}



