
% Journal article entry. Author names are stored as "Last, First" and joined
% with " and " (the only name separator BibTeX recognises).
@article{jbd.2020.010431,
  author   = {Zhu, Chao and Wang, Yike and Pu, Dongbing and Qi, Miao and Sun, Hui and Tan, Lei},
  title    = {Multi-Modality Video Representation for Action Recognition},
  journal  = {Journal on Big Data},
  volume   = {2},
  number   = {3},
  pages    = {95--104},
  year     = {2020},
  issn     = {2579-0056},
  doi      = {10.32604/jbd.2020.010431},
  url      = {http://www.techscience.com/jbd/v2n3/40326},
  abstract = {Nowadays, action recognition is widely applied in many fields.
However, action is hard to define by single modality information. The difference
between image recognition and action recognition is that action recognition
needs more modality information to depict one action, such as the appearance,
the motion and the dynamic information. Due to the state of action evolves with
the change of time, motion information must be considered when representing an
action. Most of current methods define an action by spatial information and
motion information. There are two key elements of current action recognition
methods: spatial information achieved by sampling sparsely on video frames'
sequence and the motion content mostly represented by the optical flow which is
calculated on consecutive video frames. However, the relevance between them in
current methods is weak. Therefore, to strengthen the associativity, this paper
presents a new architecture consisted of three streams to obtain multi-modality
information. The advantages of our network are: (a) We propose a new sampling
approach to sample evenly on the video sequence for acquiring the appearance
information; (b) We utilize ResNet101 for gaining high-level and distinguished
features; (c) We advance a three-stream architecture to capture temporal, spatial
and dynamic information. Experimental results on UCF101 dataset illustrate that
our method outperforms other previous methods.},
}



