Accelerating deep learning inference in constrained embedded devices using hardware loops and a dot product unit. Vreca, J., Sturm, K. J. X., Gungl, E., Merchant, F., Bientinesi, P., Leupers, R., & Brezocnik, Z. IEEE Access, 8:165913-165926, Institute of Electrical and Electronics Engineers Inc., 2020. doi: 10.1109/ACCESS.2020.3022824. Paper: https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c/file/74804ca8-c87e-8af1-6f84-aa054149ca96/Accelerating_Deep_Learning_Inference_in_Constrained_Embedded_Devices_Using_Hardware_Loops_and_a_Dot_Product_Unit.pdf.pdf

Abstract: Deep learning algorithms have seen success in a wide variety of applications, such as machine translation, image and speech recognition, and self-driving cars. However, these algorithms have only recently gained a foothold in the embedded systems domain. Most embedded systems are based on cheap microcontrollers with limited memory capacity and are thus typically seen as incapable of running deep learning algorithms. Nevertheless, we consider that advancements in the compression of neural networks and in neural network architectures, coupled with an optimized instruction set architecture, could make microcontroller-grade processors suitable for specific low-intensity deep learning applications. We propose a simple instruction set extension with two main components: hardware loops and dot product instructions. To evaluate the effectiveness of the extension, we developed optimized assembly functions for the fully connected and convolutional neural network layers. When using the extensions and the optimized assembly functions, we achieve an average clock cycle count decrease of 73% for a small-scale convolutional neural network. On a per-layer basis, our optimizations decrease the clock cycle count for fully connected layers and convolutional layers by 72% and 78%, respectively. The average energy consumption per inference decreases by 73%. We have shown that adding just hardware loops and dot product instructions has a significant positive effect on processor efficiency in computing neural network functions.
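To make the target of the extension concrete, below is a minimal C sketch of the scalar fully connected layer kernel that hardware loops and a dot product instruction are meant to accelerate. The function name, types, and quantization choice (8-bit inputs and weights with 32-bit accumulation) are illustrative assumptions, not code from the paper; the comments mark where the proposed instructions would remove loop bookkeeping and fold multiply-accumulates.

#include <stdint.h>
#include <stddef.h>

/* Scalar reference for a fully connected layer: each output neuron is
 * the dot product of the input vector with one weight row, plus a bias.
 * The per-iteration branch/index updates and the per-element
 * multiply-accumulate are exactly the overheads the paper's hardware
 * loops and dot product instruction target. Names and types here are
 * illustrative, not taken from the paper's code. */
void fc_layer(const int8_t *input, const int8_t *weights,
              const int32_t *bias, int32_t *output,
              size_t in_len, size_t out_len)
{
    for (size_t o = 0; o < out_len; o++) {      /* candidate for an outer hardware loop */
        int32_t acc = bias[o];
        for (size_t i = 0; i < in_len; i++) {   /* candidate for an inner hardware loop */
            /* A dot product instruction would fold several of these
             * multiply-accumulates (plus the loop branch) into one op. */
            acc += (int32_t)input[i] * (int32_t)weights[o * in_len + i];
        }
        output[o] = acc;
    }
}

Per the abstract, replacing this style of loop with the extended instructions and hand-optimized assembly cuts fully connected layer cycle counts by 72% on the evaluated network.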
@article{vreca2020accelerating,
  title = {Accelerating deep learning inference in constrained embedded devices using hardware loops and a dot product unit},
  author = {Vreca, Jure and Sturm, Karl J.X. and Gungl, Ernest and Merchant, Farhad and Bientinesi, Paolo and Leupers, Rainer and Brezocnik, Zmago},
  journal = {IEEE Access},
  volume = {8},
  pages = {165913--165926},
  year = {2020},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  doi = {10.1109/ACCESS.2020.3022824},
  keywords = {Deep learning, Embedded systems, Instruction set optimization, RISC-V},
  abstract = {Deep learning algorithms have seen success in a wide variety of applications, such as machine translation, image and speech recognition, and self-driving cars. However, these algorithms have only recently gained a foothold in the embedded systems domain. Most embedded systems are based on cheap microcontrollers with limited memory capacity and are thus typically seen as incapable of running deep learning algorithms. Nevertheless, we consider that advancements in the compression of neural networks and in neural network architectures, coupled with an optimized instruction set architecture, could make microcontroller-grade processors suitable for specific low-intensity deep learning applications. We propose a simple instruction set extension with two main components: hardware loops and dot product instructions. To evaluate the effectiveness of the extension, we developed optimized assembly functions for the fully connected and convolutional neural network layers. When using the extensions and the optimized assembly functions, we achieve an average clock cycle count decrease of 73% for a small-scale convolutional neural network. On a per-layer basis, our optimizations decrease the clock cycle count for fully connected layers and convolutional layers by 72% and 78%, respectively. The average energy consumption per inference decreases by 73%. We have shown that adding just hardware loops and dot product instructions has a significant positive effect on processor efficiency in computing neural network functions.}
}
{"_id":"EpW8tA9suYTTfZ73r","bibbaseid":"vreca-sturm-gungl-merchant-bientinesi-leupers-brezocnik-acceleratingdeeplearninginferenceinconstrainedembeddeddevicesusinghardwareloopsandadotproductunit-2020","author_short":["Vreca, J.","Sturm, K., J.","Gungl, E.","Merchant, F.","Bientinesi, P.","Leupers, R.","Brezocnik, Z."],"bibdata":{"title":"Accelerating deep learning inference in constrained embedded devices using hardware loops and a dot product unit","type":"article","year":"2020","keywords":"Deep learning,Embedded systems,Instruction set optimization,RISC-V","pages":"165913-165926","volume":"8","publisher":"Institute of Electrical and Electronics Engineers Inc.","id":"8de8aa1c-3363-3851-8bb0-331f5a245e84","created":"2022-09-29T07:48:17.119Z","file_attached":"true","profile_id":"c3c41a69-4b45-352f-9232-4d3281e18730","group_id":"5ec9cc91-a5d6-3de5-82f3-3ef3d98a89c1","last_modified":"2023-01-12T10:14:12.142Z","read":false,"starred":"true","authored":false,"confirmed":false,"hidden":false,"folder_uuids":"6aec054e-977e-4d9f-8f01-b49e735da52b,5b1307f8-2af3-4763-ba1d-94a2bebe2114","private_publication":false,"abstract":"Deep learning algorithms have seen success in a wide variety of applications, such as machine translation, image and speech recognition, and self-driving cars. However, these algorithms have only recently gained a foothold in the embedded systems domain. Most embedded systems are based on cheap microcontrollers with limited memory capacity, and, thus, are typically seen as not capable of running deep learning algorithms. Nevertheless, we consider that advancements in compression of neural networks and neural network architecture, coupled with an optimized instruction set architecture, could make microcontroller-grade processors suitable for specific low-intensity deep learning applications. We propose a simple instruction set extension with two main components-hardware loops and dot product instructions. To evaluate the effectiveness of the extension, we developed optimized assembly functions for the fully connected and convolutional neural network layers. When using the extensions and the optimized assembly functions, we achieve an average clock cycle count decrease of 73% for a small scale convolutional neural network. On a per layer base, our optimizations decrease the clock cycle count for fully connected layers and convolutional layers by 72% and 78%, respectively. The average energy consumption per inference decreases by 73%. We have shown that adding just hardware loops and dot product instructions has a significant positive effect on processor efficiency in computing neural network functions.","bibtype":"article","author":"Vreca, Jure and Sturm, Karl J.X. 
and Gungl, Ernest and Merchant, Farhad and Bientinesi, Paolo and Leupers, Rainer and Brezocnik, Zmago","doi":"10.1109/ACCESS.2020.3022824","journal":"IEEE Access","bibtex":"@article{\n title = {Accelerating deep learning inference in constrained embedded devices using hardware loops and a dot product unit},\n type = {article},\n year = {2020},\n keywords = {Deep learning,Embedded systems,Instruction set optimization,RISC-V},\n pages = {165913-165926},\n volume = {8},\n publisher = {Institute of Electrical and Electronics Engineers Inc.},\n id = {8de8aa1c-3363-3851-8bb0-331f5a245e84},\n created = {2022-09-29T07:48:17.119Z},\n file_attached = {true},\n profile_id = {c3c41a69-4b45-352f-9232-4d3281e18730},\n group_id = {5ec9cc91-a5d6-3de5-82f3-3ef3d98a89c1},\n last_modified = {2023-01-12T10:14:12.142Z},\n read = {false},\n starred = {true},\n authored = {false},\n confirmed = {false},\n hidden = {false},\n folder_uuids = {6aec054e-977e-4d9f-8f01-b49e735da52b,5b1307f8-2af3-4763-ba1d-94a2bebe2114},\n private_publication = {false},\n abstract = {Deep learning algorithms have seen success in a wide variety of applications, such as machine translation, image and speech recognition, and self-driving cars. However, these algorithms have only recently gained a foothold in the embedded systems domain. Most embedded systems are based on cheap microcontrollers with limited memory capacity, and, thus, are typically seen as not capable of running deep learning algorithms. Nevertheless, we consider that advancements in compression of neural networks and neural network architecture, coupled with an optimized instruction set architecture, could make microcontroller-grade processors suitable for specific low-intensity deep learning applications. We propose a simple instruction set extension with two main components-hardware loops and dot product instructions. To evaluate the effectiveness of the extension, we developed optimized assembly functions for the fully connected and convolutional neural network layers. When using the extensions and the optimized assembly functions, we achieve an average clock cycle count decrease of 73% for a small scale convolutional neural network. On a per layer base, our optimizations decrease the clock cycle count for fully connected layers and convolutional layers by 72% and 78%, respectively. The average energy consumption per inference decreases by 73%. We have shown that adding just hardware loops and dot product instructions has a significant positive effect on processor efficiency in computing neural network functions.},\n bibtype = {article},\n author = {Vreca, Jure and Sturm, Karl J.X. 
and Gungl, Ernest and Merchant, Farhad and Bientinesi, Paolo and Leupers, Rainer and Brezocnik, Zmago},\n doi = {10.1109/ACCESS.2020.3022824},\n journal = {IEEE Access}\n}","author_short":["Vreca, J.","Sturm, K., J.","Gungl, E.","Merchant, F.","Bientinesi, P.","Leupers, R.","Brezocnik, Z."],"urls":{"Paper":"https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c/file/74804ca8-c87e-8af1-6f84-aa054149ca96/Accelerating_Deep_Learning_Inference_in_Constrained_Embedded_Devices_Using_Hardware_Loops_and_a_Dot_Product_Unit.pdf.pdf"},"biburl":"https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c","bibbaseid":"vreca-sturm-gungl-merchant-bientinesi-leupers-brezocnik-acceleratingdeeplearninginferenceinconstrainedembeddeddevicesusinghardwareloopsandadotproductunit-2020","role":"author","keyword":["Deep learning","Embedded systems","Instruction set optimization","RISC-V"],"metadata":{"authorlinks":{}},"downloads":0},"bibtype":"article","biburl":"https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c","dataSources":["2252seNhipfTmjEBQ"],"keywords":["deep learning","embedded systems","instruction set optimization","risc-v"],"search_terms":["accelerating","deep","learning","inference","constrained","embedded","devices","using","hardware","loops","dot","product","unit","vreca","sturm","gungl","merchant","bientinesi","leupers","brezocnik"],"title":"Accelerating deep learning inference in constrained embedded devices using hardware loops and a dot product unit","year":2020}