2024 (6)

QuantumLeak: Stealing Quantum Neural Networks from Cloud-based NISQ Machines. Zhenxiao Fu; Min Yang; Cheng Chu; Yilun Xu; Gang Huang; and Fan Chen. In International Joint Conference on Neural Networks (IJCNN), 2024.

@INPROCEEDINGS{2024IJCNN:QuantumLeak,
  author    = {Zhenxiao Fu and
               Min Yang and
               Cheng Chu and
               Yilun Xu and
               Gang Huang and
               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>}},
  booktitle = {International Joint Conference on Neural Networks (IJCNN)},
  title     = {{QuantumLeak: Stealing Quantum Neural Networks from Cloud-based NISQ Machines}},
  year      = {2024}
}

Late Breaking Results: Extracting QNNs from NISQ Computers via Ensemble Learning. Zhenxiao Fu; and Fan Chen. In Proceedings of the 61st Annual Design Automation Conference (DAC), 2024.

@inproceedings{2024DACLBR:Stealing,
  author    = {Zhenxiao Fu and
               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>}},
  title     = {{Late Breaking Results: Extracting QNNs from NISQ Computers via Ensemble Learning}},
  booktitle = {Proceedings of the 61st Annual Design Automation Conference (DAC)},
  year      = {2024},
  numpages  = {2}
}

TITAN: A Fast and Distributed Large-Scale Trapped-Ion NISQ Computer. Cheng Chu; Zhenxiao Fu; Yilun Xu; Gang Huang; Hausi Müller; Fan Chen; and Lei Jiang. In Proceedings of the 61st Annual Design Automation Conference (DAC), 2024.

@INPROCEEDINGS{2024DAC:TITAN,
  author    = {Cheng Chu and
               Zhenxiao Fu and
               Yilun Xu and
               Gang Huang and
               Hausi M{\"u}ller and
               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>} and
               Lei Jiang},
  title     = {{TITAN: A Fast and Distributed Large-Scale Trapped-Ion NISQ Computer}},
  booktitle = {Proceedings of the 61st Annual Design Automation Conference (DAC)},
  year      = {2024}
}

LLMCarbon: Modeling the end-to-end Carbon Footprint of Large Language Models. Ahmad Faiz; Sotaro Kaneda; Ruhan Wang; Rita Osi; Prateek Sharma; Fan Chen; and Lei Jiang. In International Conference on Learning Representations (ICLR), 2024.

@INPROCEEDINGS{2024ICLR:LLMCarbon,
  author    = {Ahmad Faiz and
               Sotaro Kaneda and
               Ruhan Wang and
               Rita Osi and
               Prateek Sharma and
               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>} and
               Lei Jiang},
  title     = {{LLMCarbon:} Modeling the end-to-end Carbon Footprint of Large Language Models},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2024}
}

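No abstract is attached to the LLMCarbon entry above, so the following is only a generic back-of-the-envelope sketch of the operational half of an end-to-end carbon estimate (energy drawn during training multiplied by grid carbon intensity). Every constant is an assumed placeholder, and nothing here reproduces the paper's actual model, which also covers embodied carbon.

# Rough operational-carbon arithmetic of the kind an end-to-end footprint model
# must include; all numbers below are assumed placeholders, not values from the paper.
gpu_count = 512               # accelerators used for training (assumed)
gpu_power_kw = 0.4            # average draw per accelerator in kW (assumed)
train_hours = 24 * 14         # two weeks of training (assumed)
pue = 1.2                     # data-center power usage effectiveness (assumed)
carbon_intensity = 0.4        # kg CO2e per kWh of grid electricity (assumed)

energy_kwh = gpu_count * gpu_power_kw * train_hours * pue
operational_tco2e = energy_kwh * carbon_intensity / 1000
print(f"{energy_kwh:.0f} kWh -> {operational_tco2e:.1f} tCO2e (operational only)")
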
NISQ Quantum Computing: A Security-Centric Tutorial and Survey. Fan Chen; Lei Jiang; Hausi Muller; Philip Richerme; Cheng Chu; Zhenxiao Fu; and Min Yang. IEEE Circuits and Systems Magazine (CASM), volume 24, number 1, pages 14-32, 2024.

@INPROCEEDINGS{2024CASM:QSurvey,
  author    = {{<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>} and
               Lei Jiang and
               Hausi Muller and
               Philip Richerme and
               Cheng Chu and
               Zhenxiao Fu and
               Min Yang},
  title     = {{NISQ Quantum Computing: A Security-Centric Tutorial and Survey}},
  booktitle = {IEEE Circuits and Systems Magazine (CASM)},
  year      = {2024},
  volume    = {24},
  number    = {1},
  pages     = {14-32},
  keywords  = {Quantum computing;Computer security;Stakeholders;Software algorithms;Quantum mechanics;Investment;Taxonomy;Quantum computing;quantum security;NISQ},
  doi       = {10.1109/MCAS.2024.3349665},
  abstract  = {Quantum computing (QC) demonstrates substantial theoretical promise in addressing classically intractable problems. Recent investments and advancements across QC system stacks, including hardware, software, and algorithms, underscore a pivotal shift from theoretical exploration to the practical realization of applications. Despite this progress, the prevailing emphasis has predominantly centered on performance enhancement, often overlooking security considerations. In response to this gap, our article presents a comprehensive tutorial and survey aimed at identifying and categorizing vulnerabilities inherent in quantum computing systems. Beginning with an overview encompassing essential principles, ecosystem components, and unique attributes in the quantum computing system stack, we also provide a summary of development resources to facilitate efficient initiation in this domain. Building on this foundational knowledge, we introduce a taxonomy of QC security organized by victim layer and security attack objectives. Utilizing this taxonomy as a guiding framework, the article delivers an extensive survey of the latest advancements in QC security, with the overarching goal of equipping the reader with a comprehensive understanding of quantum computing system principles and an informed awareness of diverse and dynamic QC security threats. The intention is to benefit both industry stakeholders and research communities, ultimately aiming to proactively identify and mitigate security concerns within QC systems, thereby establishing a robust foundation for secure quantum computing environments.}
}

JustQ: Automated Deployment of Fair and Accurate Quantum Neural Networks. Ruhan Wang; Fahiz Baba-Yara; and Fan Chen. In Asia and South Pacific Design Automation Conference (ASP-DAC), 2024.

@INPROCEEDINGS{2024ASPDAC:JustQ,
  author    = {Ruhan Wang and
               Fahiz Baba-Yara and
               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>}},
  title     = {{JustQ:} Automated Deployment of Fair and Accurate Quantum Neural Networks},
  booktitle = {Asia and South Pacific Design Automation Conference (ASP-DAC)},
  year      = {2024}
}

2023 (7)

A hybrid quantum-classical neural network for learning transferable visual representation. Ruhan Wang; Philip Richerme; and Fan Chen. Quantum Science and Technology (QST), volume 8, number 4, pages 045021, September 2023. IOP Publishing.

@inproceedings{2023QST:QCLIP,
  author    = {Ruhan Wang and Philip Richerme and
               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>}},
  title     = {A hybrid quantum-classical neural network for learning transferable visual representation},
  booktitle = {Quantum Science and Technology (QST)},
  publisher = {IOP Publishing},
  year      = {2023},
  month     = {sep},
  volume    = {8},
  number    = {4},
  pages     = {045021},
  url       = {https://dx.doi.org/10.1088/2058-9565/acf1c7},
  doi       = {10.1088/2058-9565/acf1c7},
  abstract  = {State-of-the-art quantum machine learning (QML) algorithms fail to offer practical advantages over their notoriously powerful classical counterparts, due to the limited learning capabilities of QML algorithms, the constrained computational resources available on today’s noisy intermediate-scale quantum (NISQ) devices, and the empirically designed circuit ansatz for QML models. In this work, we address these challenges by proposing a hybrid quantum–classical neural network (CaNN), which we call QCLIP, for Quantum Contrastive Language-Image Pre-Training. Rather than training a supervised QML model to predict human annotations, QCLIP focuses on more practical transferable visual representation learning, where the developed model can be generalized to work on unseen downstream datasets. QCLIP is implemented by using CaNNs to generate low-dimensional data feature embeddings followed by quantum neural networks to adapt and generalize the learned representation in the quantum Hilbert space. Experimental results show that the hybrid QCLIP model can be efficiently trained for representation learning. We evaluate the representation transfer capability of QCLIP against the classical Contrastive Language-Image Pre-Training model on various datasets. Simulation results and real-device results on NISQ IBM_Auckland quantum computer both show that the proposed QCLIP model outperforms the classical CLIP model in all test cases. As the field of QML on NISQ devices is continually evolving, we anticipate that this work will serve as a valuable foundation for future research and advancements in this promising area.}
}

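The QCLIP abstract above builds on contrastive language-image pre-training. As a point of reference, the sketch below computes the standard CLIP-style symmetric contrastive loss over cosine-similarity logits in plain NumPy; the random embeddings stand in for the classical-encoder and quantum-circuit outputs described in the paper, and none of the names come from the authors' code.

import numpy as np

rng = np.random.default_rng(1)

def l2_normalize(v, axis=-1):
    return v / np.linalg.norm(v, axis=axis, keepdims=True)

def contrastive_loss(img_emb, txt_emb, temperature=0.07):
    # CLIP-style symmetric cross-entropy over cosine-similarity logits.
    img = l2_normalize(img_emb)
    txt = l2_normalize(txt_emb)
    logits = img @ txt.T / temperature          # (batch, batch) similarity matrix
    labels = np.arange(len(img))                # matching pairs sit on the diagonal
    def xent(lgts):
        lgts = lgts - lgts.max(axis=1, keepdims=True)
        logp = lgts - np.log(np.exp(lgts).sum(axis=1, keepdims=True))
        return -logp[np.arange(len(lgts)), labels].mean()
    return 0.5 * (xent(logits) + xent(logits.T))

batch, dim = 8, 16                              # toy embeddings as stand-ins
print(contrastive_loss(rng.normal(size=(batch, dim)), rng.normal(size=(batch, dim))))
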
QDoor: Exploiting Approximate Synthesis for Backdoor Attacks in Quantum Neural Network Circuits. Cheng Chu; Fan Chen; Philip Richerme; and Lei Jiang. In IEEE International Conference on Quantum Computing and Engineering (QCE), volume 01, pages 1098-1106, 2023.

@INPROCEEDINGS{QCE2023:QDoor,
  author    = {Cheng Chu and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>} and Philip Richerme and Lei Jiang},
  booktitle = {IEEE International Conference on Quantum Computing and Engineering (QCE)},
  title     = {{QDoor: Exploiting Approximate Synthesis for Backdoor Attacks in Quantum Neural Network Circuits}},
  year      = {2023},
  volume    = {01},
  pages     = {1098-1106},
  doi       = {10.1109/QCE57702.2023.00124},
  abstract  = {Quantum neural networks (QNNs) succeed in object recognition, natural language processing, and financial analysis. To maximize the accuracy of a QNN on a Noisy Intermediate Scale Quantum (NISQ) computer, approximate synthesis modifies the QNN circuit by reducing error-prone 2-qubit quantum gates. The success of QNNs motivates adversaries to attack QNNs via backdoors. However, naively transplanting backdoors designed for classical neural networks to QNNs yields only low attack success rate, due to the noises and approximate synthesis on NISQ computers. Prior quantum circuit-based backdoors cannot selectively attack some inputs or work with all types of encoding layers of a QNN circuit. Moreover, it is easy to detect both transplanted and circuit-based backdoors in a QNN. In this paper, we propose a novel and stealthy backdoor attack, QDoor, to achieve high attack success rate in approximately-synthesized QNN circuits by weaponizing unitary differences between uncompiled QNNs and their synthesized counterparts. QDoor trains a QNN behaving normally for all inputs with and without a trigger. However, after approximate synthesis, the QNN circuit always predicts any inputs with a trigger to a predefined class while still acts normally for benign inputs. Compared to prior backdoor attacks, QDoor improves the attack success rate by 13x and the clean data accuracy by 65% on average. Furthermore, prior backdoor detection techniques cannot find QDoor attacks in uncompiled QNN circuits.}
}

CryptoQFL: Quantum Federated Learning on Encrypted Data. Cheng Chu; Lei Jiang; and Fan Chen. In IEEE International Conference on Quantum Computing and Engineering (QCE), volume 01, pages 1231-1237, 2023.

@INPROCEEDINGS{QCE2023:CryptoQFL,
  author    = {Cheng Chu and Lei Jiang and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>}},
  booktitle = {IEEE International Conference on Quantum Computing and Engineering (QCE)},
  title     = {{CryptoQFL: Quantum Federated Learning on Encrypted Data}},
  year      = {2023},
  volume    = {01},
  pages     = {1231-1237},
  doi       = {10.1109/QCE57702.2023.00139},
  abstract  = {Recent advancements in Quantum Neural Networks (QNNs) have demonstrated theoretical and experimental performance superior to their classical counterparts in a wide range of applications. However, existing centralized QNNs cannot solve many real-world problems because collecting large amounts of training data to a common public site is time-consuming and, more importantly, violates data privacy. Federated Learning (FL) is an emerging distributed machine learning framework that allows collaborative model training on decentralized data residing on multiple devices without breaching data privacy. Some initial attempts at Quantum Federated Learning (QFL) either only focus on improving the QFL performance or rely on a trusted quantum server that fails to preserve data privacy. In this work, we propose CryptoQFL, a QFL framework that allows distributed QNN training on encrypted data. CryptoQFL is (1) secure, because it allows each edge to train a QNN with local private data, and encrypt its updates using quantum homomorphic encryption before sending them to the central quantum server; (2) communication-efficient, as CryptoQFL quantizes local gradient updates to ternary values, and only communicates non-zero values to the server for aggregation; and (3) computation-efficient, as CryptoQFL presents an efficient quantum aggregation circuit with significantly reduced latency compared to state-of-the-art approaches.}
}

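To make the communication-efficiency claim in the entry above concrete, here is a minimal classical sketch of ternarizing a gradient vector and keeping only the non-zero entries that would be sent for aggregation. The threshold rule and scaling are assumptions for illustration, not CryptoQFL's exact quantizer, and the encryption step is omitted entirely.

import numpy as np

def ternarize(grad, threshold=0.05):
    # Quantize a gradient vector to {-1, 0, +1} relative to its max magnitude.
    scale = np.max(np.abs(grad))
    t = threshold * scale
    q = np.zeros(grad.shape, dtype=np.int8)
    q[grad > t] = 1
    q[grad < -t] = -1
    return q, scale

def sparse_update(q, scale):
    # Keep only the non-zero entries (index, sign, scale) that would be communicated.
    idx = np.nonzero(q)[0]
    return idx, q[idx], scale

grad = np.random.randn(16)
q, scale = ternarize(grad)
idx, signs, scale = sparse_update(q, scale)
print(len(idx), "of", grad.size, "entries communicated")
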
ReFloat: Low-Cost Floating-Point Processing in ReRAM for Accelerating Iterative Linear Solvers. Linghao Song; Fan Chen; Hai Li; and Yiran Chen. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC), pages 75:1-75:15, 2023. ACM.

@inproceedings{SC2023,
  author    = {Linghao {Song} and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>} and Hai {Li} and Yiran Chen},
  title     = {ReFloat: Low-Cost Floating-Point Processing in ReRAM for Accelerating Iterative Linear Solvers},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC)},
  pages     = {75:1--75:15},
  publisher = {{ACM}},
  year      = {2023},
  url       = {https://doi.org/10.1145/3581784.3607077},
  doi       = {10.1145/3581784.3607077},
  abstract  = {Resistive random access memory (ReRAM) is a promising technology that can perform low-cost and in-situ matrix-vector multiplication (MVM) in analog domain. Scientific computing requires high-precision floating-point (FP) processing. However, performing floating-point computation in ReRAM is challenging because of high hardware cost and execution time due to the large FP value range. In this work we present ReFloat, a data format and an accelerator architecture, for low-cost and high-performance floating-point processing in ReRAM for iterative linear solvers. ReFloat matches the ReRAM crossbar hardware and represents a block of FP values with reduced bits and an optimized exponent base for a high range of dynamic representation. Thus, ReFloat achieves less ReRAM crossbar consumption and fewer processing cycles and overcomes the nonconvergence issue in a prior work. The evaluation on the SuiteSparse matrices shows ReFloat achieves 5.02\texttimes{} to 84.28\texttimes{} improvement in terms of solver time compared to a state-of-the-art ReRAM based accelerator.}
}

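The ReFloat abstract above describes representing a block of floating-point values with reduced bits around a per-block exponent base. The NumPy sketch below illustrates that general block-floating-point idea: a shared base, small per-value exponent offsets, and short mantissas. The bit widths, rounding, and base selection are assumptions, not the paper's actual format.

import numpy as np

def block_encode(block, mant_bits=3, exp_bits=3):
    # Encode a block with one shared exponent base plus short per-value offsets
    # and mantissas (illustrative only; not ReFloat's exact encoding).
    exps = np.floor(np.log2(np.abs(block) + 1e-30)).astype(int)
    base = int(np.round(exps.mean()))                       # shared exponent base
    offs = np.clip(exps - base, -(2 ** (exp_bits - 1)), 2 ** (exp_bits - 1) - 1)
    mants = block / np.exp2(base + offs)                    # magnitude roughly in [1, 2)
    mants = np.round(mants * 2 ** mant_bits) / 2 ** mant_bits
    return mants, offs, base

def block_decode(mants, offs, base):
    return mants * np.exp2(base + offs)

block = np.random.randn(8) * np.exp2(np.random.randint(-4, 5, 8))
m, o, b = block_encode(block)
approx = block_decode(m, o, b)
print("max relative error:", np.max(np.abs(approx - block) / np.abs(block)))
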
QTrojan: A Circuit Backdoor Against Quantum Neural Networks. Cheng Chu; Lei Jiang; Martin Swany; and Fan Chen. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pages 1-5, 2023.

@inproceedings{2023ICASSP:QTrojan,
  author    = {Cheng Chu and Lei Jiang and Martin Swany and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>}},
  title     = {{QTrojan: A Circuit Backdoor Against Quantum Neural Networks}},
  booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  year      = {2023},
  pages     = {1-5},
  doi       = {10.1109/ICASSP49357.2023.10096293},
  abstract  = {We propose a circuit-level backdoor attack, QTrojan, against Quantum Neural Networks (QNNs) in this paper. QTrojan is implemented by a few quantum gates inserted into the variational quantum circuit of the victim QNN. QTrojan is much stealthier than a prior Data-Poisoning-based Backdoor Attack (DPBA) since it does not embed any trigger in the inputs of the victim QNN or require access to original training datasets. Compared to a DPBA, QTrojan improves the clean data accuracy by 21\% and the attack success rate by 19.9\%.}
}

IQGAN: Robust Quantum Generative Adversarial Network for Image Synthesis On NISQ Devices. Cheng Chu; Grant Skipper; Martin Swany; and Fan Chen. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pages 1-5, 2023.

@inproceedings{2023ICASSP:IQGAN,
  author    = {Cheng Chu and Grant Skipper and Martin Swany and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>}},
  title     = {{IQGAN: Robust Quantum Generative Adversarial Network for Image Synthesis On NISQ Devices}},
  booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  year      = {2023},
  pages     = {1-5},
  doi       = {10.1109/ICASSP49357.2023.10096772},
  abstract  = {In this work, we propose IQGAN, a quantum Generative Adversarial Network (GAN) framework for multiqubit image synthesis that can be efficiently implemented on Noisy Intermediate Scale Quantum (NISQ) devices. We investigate the reasons for the inferior generative performance of current quantum GANs in our preliminary study and conclude that an adjustable input encoder is the key to ensuring high-quality data synthesis. We then propose the IQGAN architecture featuring a trainable multiqubit quantum encoder that effectively embeds classical data into quantum states. Furthermore, we propose a compact quantum generator that significantly reduces the design cost and circuit depth on NISQ devices. Experimental results on both IBM quantum processors and quantum simulators demonstrated that IQGAN outperforms state-of-the-art quantum GANs in qualitative and quantitative evaluation of the generated samples, model convergence, and quantum computing cost.}
}

PriML: An Electro-Optical Accelerator for Private Machine Learning on Encrypted Data. Mengxin Zheng; Fan Chen; Lei Jiang; and Qian Lou. In International Symposium on Quality Electronic Design (ISQED), 2023.

@inproceedings{ISQED2023:PriML,
  author    = {Mengxin Zheng and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>} and Lei Jiang and Qian Lou},
  title     = {PriML: An Electro-Optical Accelerator for Private Machine Learning on Encrypted Data},
  booktitle = {International Symposium on Quality Electronic Design (ISQED)},
  year      = {2023}
}

2022 (6)

CryptoLight: An Electro-Optical Accelerator for Fully Homomorphic Encryption. Mengxin Zheng; Qian Lou; Fan Chen; Lei Jiang; and Yongxin Zhu. In IEEE/ACM International Symposium on Nanoscale Architectures (NANOARCH), November 2022. arXiv:2211.13780.

@inproceedings{2022arXiv221113780Z,
  author        = {{Zheng}, Mengxin and {Lou}, Qian and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>} and {Jiang}, Lei and {Zhu}, Yongxin},
  title         = "{CryptoLight: An Electro-Optical Accelerator for Fully Homomorphic Encryption}",
  booktitle     = {IEEE/ACM International Symposium on Nanoscale Architectures (NANOARCH)},
  keywords      = {Computer Science - Cryptography and Security, Computer Science - Hardware Architecture},
  year          = 2022,
  month         = nov,
  eid           = {arXiv:2211.13780},
  pages         = {arXiv:2211.13780},
  archivePrefix = {arXiv},
  eprint        = {2211.13780},
  primaryClass  = {cs.CR},
  adsurl        = {https://ui.adsabs.harvard.edu/abs/2022arXiv221113780Z},
  adsnote       = {Provided by the SAO/NASA Astrophysics Data System}
}

IQGAN: Robust Quantum Generative Adversarial Network for Image Synthesis On NISQ Devices. Cheng Chu; Grant Skipper; Martin Swany; and Fan Chen. arXiv e-prints, arXiv:2210.16857, October 2022.

@ARTICLE{2022arXiv221016857C,
  author        = {{Chu}, Cheng and {Skipper}, Grant and {Swany}, Martin and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>}},
  title         = "{IQGAN: Robust Quantum Generative Adversarial Network for Image Synthesis On NISQ Devices}",
  journal       = {arXiv e-prints},
  keywords      = {Quantum Physics},
  year          = 2022,
  month         = oct,
  eid           = {arXiv:2210.16857},
  pages         = {arXiv:2210.16857},
  archivePrefix = {arXiv},
  eprint        = {2210.16857},
  primaryClass  = {quant-ph},
  adsurl        = {https://ui.adsabs.harvard.edu/abs/2022arXiv221016857C},
  adsnote       = {Provided by the SAO/NASA Astrophysics Data System}
}

QMLP: An Error-Tolerant Nonlinear Quantum MLP Architecture using Parameterized Two-Qubit Gates. Cheng Chu; Nai-Hui Chia; Lei Jiang; and Fan Chen. In ACM/IEEE International Symposium on Low Power Electronics and Design (ISLPED), 2022.

@inproceedings{2022ISLPED:QMLP,
  author    = {Cheng Chu and Nai-Hui Chia and Lei Jiang and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>}},
  title     = {{QMLP: An Error-Tolerant Nonlinear Quantum MLP Architecture using Parameterized Two-Qubit Gates}},
  booktitle = {ACM/IEEE International Symposium on Low Power Electronics and Design (ISLPED)},
  year      = {2022},
  isbn      = {9781450393546},
  url       = {https://doi.org/10.1145/3531437.3539719},
  doi       = {10.1145/3531437.3539719},
  articleno = {4},
  numpages  = {6},
  abstract  = {Despite potential quantum supremacy, state-of-the-art quantum neural networks (QNNs) suffer from low inference accuracy. First, the current Noisy Intermediate-Scale Quantum (NISQ) devices with high error rates of $10^{-3}$ to $10^{-2}$ significantly degrade the accuracy of a QNN. Second, although recently proposed Re-Uploading Units (RUUs) introduce some non-linearity into the QNN circuits, the theory behind it is not fully understood. Furthermore, previous RUUs that repeatedly upload original data can only provide marginal accuracy improvements. Third, current QNN circuit ansatz uses fixed two-qubit gates to enforce maximum entanglement capability, making task-specific entanglement tuning impossible, resulting in poor overall performance. In this paper, we propose a Quantum Multilayer Perceptron (QMLP) architecture featured by error-tolerant input embedding, rich nonlinearity, and enhanced variational circuit ansatz with parameterized two-qubit entangling gates. Compared to prior arts, QMLP increases the inference accuracy on the 10-class MNIST dataset by 10% with 2\texttimes{} fewer quantum gates and 3\texttimes{} reduced parameters. Our source code is available and can be found in https://github.com/chuchengc/QMLP/.}
}

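The QMLP abstract above centers on data re-uploading and trainable two-qubit entangling gates. The self-contained NumPy sketch below simulates a two-qubit toy circuit with those same ingredients, angle-encoded inputs re-uploaded before each block and a parameterized controlled-RY entangler, purely to illustrate the circuit structure; it is not the authors' ansatz, whose reference implementation is linked in the abstract.

import numpy as np

def ry(theta):
    # Single-qubit RY rotation matrix.
    c, s = np.cos(theta / 2), np.sin(theta / 2)
    return np.array([[c, -s], [s, c]])

def cry(theta):
    # Parameterized two-qubit controlled-RY: rotate qubit 1 only when qubit 0 is |1>.
    u = np.eye(4)
    u[2:, 2:] = ry(theta)
    return u

def toy_reuploading_circuit(x, params, reuploads=2):
    # Alternate data (re-)uploading, trainable single-qubit rotations, and a
    # trainable CRY entangler on a 2-qubit state vector.
    state = np.zeros(4); state[0] = 1.0
    for r in range(reuploads):
        enc = np.kron(ry(x[0]), ry(x[1]))              # data re-uploading layer
        var = np.kron(ry(params[r, 0]), ry(params[r, 1]))
        state = cry(params[r, 2]) @ var @ enc @ state  # parameterized entangler
    z0 = np.kron(np.diag([1.0, -1.0]), np.eye(2))      # measure <Z> on qubit 0
    return state @ z0 @ state

x = np.array([0.3, -1.1])                              # toy 2-feature input
params = np.random.uniform(-np.pi, np.pi, size=(2, 3))
print(toy_reuploading_circuit(x, params))
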
CANOPY: A CNFET-based Process Variation Aware Systolic DNN Accelerator. Cheng Chu; Dawen Xu; Ying Wang; and Fan Chen. In ACM/IEEE International Symposium on Low Power Electronics and Design (ISLPED), 2022.

@inproceedings{2022ISLPED:CNT,
  author    = {Cheng Chu and Dawen Xu and Ying Wang and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>}},
  title     = {{CANOPY: A CNFET-based Process Variation Aware Systolic DNN Accelerator}},
  booktitle = {ACM/IEEE International Symposium on Low Power Electronics and Design (ISLPED)},
  year      = {2022},
  isbn      = {9781450393546},
  url       = {https://doi.org/10.1145/3531437.3539703},
  doi       = {10.1145/3531437.3539703},
  articleno = {24},
  numpages  = {6},
  abstract  = {Although systolic accelerators have become the dominant method for executing Deep Neural Networks (DNNs), their performance efficiency (quantified as Energy-Delay Product or EDP) is limited by the capabilities of silicon Field-Effect Transistors (FETs). FETs constructed from Carbon Nanotubes (CNTs) have demonstrated >10\texttimes{} EDP benefits, however, the processing variations inherent in carbon nanotube FETs (CNFETs) fabrication compromise the EDP benefits, resulting in >40% performance degradation. In this work, we study the impact of CNT process variations and present Canopy, a process variation aware systolic DNN accelerator by leveraging the spatial correlation in CNT variations. Canopy co-optimizes the architecture and dataflow to allow computing engines in a systolic array run at their best performance with non-uniform latency, minimizing the performance degradation incurred by CNT variations. Furthermore, we devise Canopy with dynamic reconfigurability such that the microarchitectural capability and its associated flexibility achieves an extra degree of adaptability with regard to the DNN topology and processing hyper-parameters (e.g., batch size). Experimental results show that Canopy improves the performance by 5.85\texttimes{} (4.66\texttimes{}) and reduces the energy by 34% (90%) when inferencing a single (a batch of) input compared to the baseline design under an iso-area comparison across seven DNN workloads.}
}

MOCCA: A Process Variation Tolerant Systolic DNN Accelerator using CNFETs in Monolithic 3D. Samuel J. Engers; Cheng Chu; Dawen Xu; Ying Wang; and Fan Chen. In Proceedings of the 2022 on Great Lakes Symposium on VLSI (GLSVLSI), pages 379-382, 2022.

@inproceedings{glsvlsi2022,
  author    = {Samuel J. Engers and Cheng Chu and Dawen Xu and Ying Wang and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>}},
  title     = {{MOCCA: A Process Variation Tolerant Systolic DNN Accelerator using CNFETs in Monolithic 3D}},
  booktitle = {Proceedings of the 2022 on Great Lakes Symposium on VLSI (GLSVLSI)},
  year      = {2022},
  isbn      = {9781450393225},
  url       = {https://doi.org/10.1145/3526241.3530380},
  doi       = {10.1145/3526241.3530380},
  pages     = {379-382},
  numpages  = {4},
  abstract  = {Hardware accelerators based on systolic arrays have become the dominant method for efficient processing of deep neural networks (DNNs). Although such designs provide significant performance improvement compared to their contemporary CPUs or GPUs, their power efficiency and area efficiency are greatly limited by the large computing array and on-chip memory. In this work, we demonstrate that we can further improve the efficiency of systolic accelerators using emerging carbon nanotube field-effect transistors (CNFETs) by stacking the computing logic and on-chip memory on multiple layers and utilizing monolithic 3D (M3D) vias for low-latency communication. We comprehensively explore the design space and present MOCCA, the first process variation tolerant CNFET-based systolic DNN accelerator. We validate MOCCA against previous 2D accelerators on state-of-the-art DNN models. On average, MOCCA achieves the same throughput with 6.12x and 2.12x improvement respectively on performance and power efficiency in a 2x reduced chip footprint.}
}

Sky-sorter: A Processing-in-Memory Architecture for Large-Scale Sorting. Farzaneh Zokaee; Fan Chen; Guangyu Sun; and Lei Jiang. IEEE Transactions on Computers (TC), pages 1-1, 2022.

@article{TC2022,
  author   = {Zokaee, Farzaneh and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>} and Sun, Guangyu and Jiang, Lei},
  title    = {{Sky-sorter: A Processing-in-Memory Architecture for Large-Scale Sorting}},
  journal  = {IEEE Transactions on Computers (TC)},
  year     = {2022},
  pages    = {1-1},
  doi      = {10.1109/TC.2022.3169434},
  abstract = {Sorting is one of the most important algorithms in computer science. Conventional CPUs, GPUs, FPGAs, and ASICs running sorting are fundamentally bottlenecked by the off-chip memory bandwidth, because of their von-Neumann architecture. Processing-near-memory (PNM) designs integrate a CPU, a GPU, or an ASIC upon an HBM for sorting, but their sorting throughput is still limited by the HBM bandwidth and capacity. In this paper, we propose a skyrmion racetrack memory (SRM)-based PIM accelerator, Sky-Sorter, to enhance the sorting performance of large-scale datasets. Sky-Sorter implements samplesort which involves four steps, sampling, splitting marker sorting, partitioning, and bucket sorting. A SRM-based random number generator is used in Sky-Sorter to randomly sample records from the dataset. Sky-Sorter divides the large dataset into many buckets based on sampled markers by our proposed SRM-based partitioner. Its partitioning throughput matches off-chip memory bandwidth. We further designed a SRM-based sorting unit (SU) to sort all records of a bucket without introducing extra CMOS logic. Our SU uses the fast in-cell insertion characteristics of SRMs to implement and perform insertsort within a bucket. Sky-Sorter employs SUs to sort all buckets simultaneously by fully utilizing large internal array bandwidth. Compared to state-of-the-art accelerators, Sky-Sorter improves the throughput per Watt by 4x.}
}

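Sky-Sorter's abstract above names the four samplesort phases it maps to memory: sampling, splitter sorting, partitioning, and per-bucket sorting. The short Python sketch below runs those same phases in software so the data flow is easy to follow; the bucket count and oversampling factor are arbitrary, and nothing here models the SRM hardware.

import bisect
import random

def samplesort(records, num_buckets=4, oversample=3):
    # 1. sampling + 2. splitter sorting: draw random records, sort them, pick splitters.
    if len(records) <= num_buckets * oversample:
        return sorted(records)
    sample = sorted(random.sample(records, num_buckets * oversample))
    splitters = sample[oversample - 1::oversample][:num_buckets - 1]
    # 3. partitioning: route every record to the bucket whose key range contains it.
    buckets = [[] for _ in range(num_buckets)]
    for r in records:
        buckets[bisect.bisect_left(splitters, r)].append(r)
    # 4. bucket sorting: Sky-Sorter would sort all buckets in parallel inside the arrays.
    out = []
    for b in buckets:
        out.extend(sorted(b))
    return out

data = [random.randint(0, 10**6) for _ in range(1000)]
assert samplesort(data) == sorted(data)
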
2021 (6)

FeFET-based Process-in-Memory Architecture for Low-Power DNN Training. Farzaneh Zokaee; Bing Li; and Fan Chen. In IEEE/ACM International Symposium on Nanoscale Architectures (NANOARCH), pages 1-6, 2021.

@INPROCEEDINGS{NANOARCH2021,
  author    = {Zokaee, Farzaneh and Li, Bing and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a>}},
  booktitle = {IEEE/ACM International Symposium on Nanoscale Architectures (NANOARCH)},
  title     = {{FeFET-based Process-in-Memory Architecture for Low-Power DNN Training}},
  year      = {2021},
  pages     = {1-6},
  doi       = {10.1109/NANOARCH53687.2021.9642234},
  abstract  = {Although deep neural networks (DNNs) have become the cornerstone of Artificial Intelligence, the current training of DNNs still requires dozens of CPU hours. Prior works created various customized hardware accelerators for DNNs, however, most of these accelerators are designed to accelerate DNN inference and lack basic support for complex compute phases and sophisticated data dependency involved in DNN training. The major challenges for supporting DNN training come from various layers of the system stack: (1) the current de-facto training method, error backpropagation (BP), requires all the weights and intermediate data to be stored in memory, and then sequentially consumed in backward paths. Therefore, weight updates are non-local and rely on upstream layers, which makes training parallelization extremely challenging and also incurs significant memory and computing overheads; (2) the power consumption of such CMOS accelerators can reach 200~250 Watt. Though emerging memory technology based designs demonstrated a great potential in low-power DNN acceleration, their power efficiency is bottlenecked by CMOS analog-to-digital converters (ADCs). In this work, we review the current advance in accelerator designs for DNNs and point out their limitations. Then we set out to address these challenges by combining innovations in training algorithm, circuits, and accelerator architecture. Our research still follows the Process-in-Memory (PIM) strategy. Specifically, we leverage the recently proposed Direct Feedback Alignment (DFA) training algorithm to overcome the limitation of long-range data dependency required by BP. We then propose to execute the DNN training in parallel in a particularly designed pipeline. We implement the proposed architecture using Ferroelectric Field-Effect Transistors (FeFET) due to their high performance and low-power operations. To further improve the power efficiency, we propose a random number generator (RNG) and an ultra-low power FeFET-based ADC. Preliminary results suggest the feasibility and promise of our approaches for low-power and highly parallel DNN training in a broad range of applications.}
}

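The abstract above leverages Direct Feedback Alignment to break backpropagation's long-range dependency on downstream weights. The toy NumPy snippet below shows the core DFA update on a two-layer network: the output error reaches the hidden layer through a fixed random matrix instead of the transposed forward weights. The layer sizes, the sigmoid nonlinearity, and the learning rate are arbitrary choices for illustration, not the paper's configuration.

import numpy as np

rng = np.random.default_rng(0)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Two-layer toy network trained with Direct Feedback Alignment (DFA): the hidden
# layer receives the output error through a fixed random matrix B1 instead of the
# transposed downstream weights W2.T that backpropagation would need.
n_in, n_hid, n_out = 8, 16, 4
W1 = rng.normal(0.0, 0.5, (n_hid, n_in))
W2 = rng.normal(0.0, 0.5, (n_out, n_hid))
B1 = rng.normal(0.0, 0.5, (n_hid, n_out))   # fixed random feedback, never trained

x = rng.normal(size=n_in)
y = np.eye(n_out)[1]                        # one-hot target
lr = 0.1
for step in range(200):
    h = sigmoid(W1 @ x)                     # forward pass
    yhat = sigmoid(W2 @ h)
    e = yhat - y                            # output error
    dh = (B1 @ e) * h * (1.0 - h)           # DFA: random projection, no W2.T needed
    W2 -= lr * np.outer(e * yhat * (1.0 - yhat), h)
    W1 -= lr * np.outer(dh, x)
print("final squared error:", float(np.sum(e ** 2)))
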
\n \n\n \n \n \n \n \n REREC: In-ReRAM Acceleration with Access-Aware Mapping for Personalized Recommendation.\n \n \n \n\n\n \n Yitu Wang; Zhenhua Zhu; Fan Chen; Mingyuan Ma; Guohao Dai; Yu Wang; Hai Helen Li; and Yiran Chen.\n\n\n \n\n\n\n In International Conference on Computer-Aided Design (ICCAD), pages 1-9, 2021. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{ICCAD2021,\r\n author = { Wang, Yitu and Zhu, Zhenhua and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and Ma, Mingyuan and Dai, Guohao and Wang, Yu and Hai Helen {Li} and Chen, Yiran},\r\n title = {{REREC: In-ReRAM Acceleration with Access-Aware Mapping for Personalized Recommendation}},\r\n booktitle = {International Conference on Computer-Aided Design (ICCAD)},\r\n year = {2021},\r\n pages={1-9},\r\n doi={10.1109/ICCAD51958.2021.9643573},\r\n abstract={Personalized recommendation systems are widely used in many Internet services. The sparse embedding lookup in recommendation models dominates the computational cost of inference due to its intensive irregular memory accesses. Applying resistive random access memory (ReRAM) based process-in-memory (PIM) architecture to accelerate recommendation processing can avoid data movements caused by off-chip memory accesses. However, naive adoption of ReRAM-based DNN accelerators leads to low computation parallelism and severe under-utilization of computing resources, which is caused by the fine-grained inner-product in feature interaction. In this paper, we propose Rerec, an architecture-algorithm co-designed accelerator, which specializes in fine-grained ReRAM-based inner-product engines with access-aware mapping algorithm for recommendation inference. At the architecture level, we reduce the size and increase the amount of crossbars. The crossbars are fully-connected by Analog-to-Digital Converters (ADCs) in one inner-product engine, which can adapt to the fine-grained and irregular computational patterns and improve the processing parallelism. We further explore trade-offs of (i) crossbar size vs. hardware utilization, and (ii) ADC implementation vs. area/energy efficiency to optimize the design. At the algorithm level, we propose a novel access-aware mapping (AAM) algorithm to optimize resource allocations. Our AAM algorithm tackles the problems of (i) the workload imbalance and (ii) the long recommendation inference latency induced by the great variance of access frequency of embedding vectors. Experimental results show that Rerecachieves 7.69x speedup compared with a ReRAM-based baseline design. Compared to CPU and the state-of-the-art recommendation accelerator, Rerecdemonstrates 29.26x and 3.48x performance improvement, respectively.}\r\n} \r\n\r\n\r\n\r\n
\n
\n\n\n
\n Personalized recommendation systems are widely used in many Internet services. The sparse embedding lookup in recommendation models dominates the computational cost of inference due to its intensive irregular memory accesses. Applying resistive random access memory (ReRAM) based process-in-memory (PIM) architecture to accelerate recommendation processing can avoid data movements caused by off-chip memory accesses. However, naive adoption of ReRAM-based DNN accelerators leads to low computation parallelism and severe under-utilization of computing resources, which is caused by the fine-grained inner-product in feature interaction. In this paper, we propose Rerec, an architecture-algorithm co-designed accelerator, which specializes in fine-grained ReRAM-based inner-product engines with an access-aware mapping algorithm for recommendation inference. At the architecture level, we reduce the size and increase the number of crossbars. The crossbars are fully connected by Analog-to-Digital Converters (ADCs) in one inner-product engine, which can adapt to the fine-grained and irregular computational patterns and improve processing parallelism. We further explore trade-offs of (i) crossbar size vs. hardware utilization, and (ii) ADC implementation vs. area/energy efficiency to optimize the design. At the algorithm level, we propose a novel access-aware mapping (AAM) algorithm to optimize resource allocation. Our AAM algorithm tackles (i) the workload imbalance and (ii) the long recommendation inference latency induced by the great variance in the access frequency of embedding vectors. Experimental results show that Rerec achieves 7.69x speedup compared with a ReRAM-based baseline design. Compared to a CPU and the state-of-the-art recommendation accelerator, Rerec demonstrates 29.26x and 3.48x performance improvement, respectively.\n
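The access-aware mapping idea above can be illustrated with a small greedy balancer: embedding vectors are assigned to inner-product engines in decreasing order of access frequency so the expected lookup load per engine stays even. The engine count, frequencies, and the simple greedy rule below are hypothetical illustrations, not the paper's AAM algorithm.

# Hypothetical sketch of access-aware mapping: place frequently accessed
# embedding vectors across engines so the expected load per engine is balanced.
import heapq

def access_aware_map(access_freq, num_engines):
    """access_freq: {vector_id: expected accesses}; returns vector_id -> engine."""
    # Min-heap of (current expected load, engine id).
    heap = [(0.0, e) for e in range(num_engines)]
    heapq.heapify(heap)
    placement = {}
    # Greedy: heaviest vectors first, each onto the currently lightest engine.
    for vid, freq in sorted(access_freq.items(), key=lambda kv: -kv[1]):
        load, engine = heapq.heappop(heap)
        placement[vid] = engine
        heapq.heappush(heap, (load + freq, engine))
    return placement

freqs = {"v0": 900, "v1": 450, "v2": 420, "v3": 60, "v4": 55, "v5": 10}
print(access_aware_map(freqs, num_engines=2))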
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n PUFFIN: An Efficient DNN Training Accelerator for Direct Feedback Alignment in FeFET.\n \n \n \n\n\n \n Fan Chen.\n\n\n \n\n\n\n In IEEE/ACM International Symposium on Low Power Electronics and Design (ISLPED), pages 1-6, 2021. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@INPROCEEDINGS{islped2021,\r\n  author={{<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>}}, \r\n  booktitle={IEEE/ACM International Symposium on Low Power Electronics and Design (ISLPED)}, \r\n  title={{PUFFIN: An Efficient DNN Training Accelerator for Direct Feedback Alignment in FeFET}}, \r\n  year={2021},\r\n  pages={1-6},\r\n  doi={10.1109/ISLPED52811.2021.9502499},\r\n  abstract = {The currently widely used backpropagation (BP) training algorithm requires that all trainable weights in a deep neural network (DNN) be stored in memory and used sequentially in the backward path, which makes training parallelization extremely challenging and also incurs significant memory and computing overheads. Although emerging ReRAM-based computing has demonstrated great potential for DNN acceleration, state-of-the-art designs suffer from >60\\% energy overhead for analog-to-digital converters (ADCs). In this work, we propose PUFFIN, an efficient DNN training accelerator for Direct Feedback Alignment (DFA). PUFFIN leverages DFA to overcome the limitation of long-range data dependency required by BP and executes an L -layer DNN training in parallel in an (L+2)-stage pipeline. We implement PUFFIN using Ferroelectric Field-Effect Transistors (FeFET) due to their high performance and low-power operation. To further improve the power efficiency, we propose a random number generator (RNG) based on the statistical switching in FeFET device and an ultra-low power FeFET-based ADC. Compared to previous ReRAM-based training accelerators, PUFFIN achieves 1.3x speedup and 2.5x improvement on power efficiency.},\r\n}\r\n\r\n\r\n
\n
\n\n\n
\n The currently widely used backpropagation (BP) training algorithm requires that all trainable weights in a deep neural network (DNN) be stored in memory and used sequentially in the backward path, which makes training parallelization extremely challenging and also incurs significant memory and computing overheads. Although emerging ReRAM-based computing has demonstrated great potential for DNN acceleration, state-of-the-art designs suffer from >60% energy overhead for analog-to-digital converters (ADCs). In this work, we propose PUFFIN, an efficient DNN training accelerator for Direct Feedback Alignment (DFA). PUFFIN leverages DFA to overcome the limitation of long-range data dependency required by BP and executes an L-layer DNN training in parallel in an (L+2)-stage pipeline. We implement PUFFIN using Ferroelectric Field-Effect Transistors (FeFET) due to their high performance and low-power operation. To further improve the power efficiency, we propose a random number generator (RNG) based on the statistical switching of the FeFET device and an ultra-low power FeFET-based ADC. Compared to previous ReRAM-based training accelerators, PUFFIN achieves 1.3x speedup and 2.5x improvement in power efficiency.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n RECOIN: A Low-Power Processing-in-ReRAM Architecture for Deformable Convolution.\n \n \n \n\n\n \n Cheng Chu; Fan Chen; Dawen Xu; and Ying Wang.\n\n\n \n\n\n\n In Proceedings of the 2021 on Great Lakes Symposium on VLSI (GLSVLSI), pages 235-240, 2021. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{glsvlsi2021,\r\nauthor = {Cheng Chu and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_bilank">Fan Chen</a></span>} and\r\n               Dawen Xu and\r\n               Ying Wang},\r\ntitle = {{RECOIN: A Low-Power Processing-in-ReRAM Architecture for Deformable Convolution}},\r\nyear = {2021},\r\ndoi = {10.1145/3453688.3461480},\r\nabstract = {The recent proposed Deformable Convolutional Networks (DCNs) greatly enhance the performance\r\nof conventional Convolutional Neural Networks (CNNs) on vision recognition tasks by\r\nallowing flexible input sampling during inference runtime. DCNs introduce an additional\r\nconvolutional layer for adaptive sampling offset generation, followed by a bilinear\r\ninterpolation (BLI) algorithm to integerize the generated non-integer offset values.\r\nFinally, a regular convolution is performed on the loaded input pixels. Compared with\r\nconventional CNNs, DCN demonstrated significantly increased computational complexity\r\nand irregular input-dependentmemory access patterns, making it a great challenge for\r\ndeploying DCNs onto edge devices for real-time computer vision tasks. In this work,\r\nwe propose RECOIN, a processing-in-memory (PIM) architecture, which supports DCN inference\r\non resistive memory (ReRAM)crossbars, thus making the first DCN inference accelerator\r\npossible. We present a novel BLI processing engine that leverage both row-and column-oriented\r\ncomputation for in-situ BLI calculation. Amapping scheme and an address converter\r\nare particular designed to accommodate the intensive computation and irregular data\r\naccess. We implement the DCN inference in a 4-stage pipeline and evaluate the effectiveness\r\nof RECOIN on six DCN models. Experimental results show RECOIN achieves respectively\r\n225x and 17.4x improvement in energy efficiency compared to general-purpose CPU and\r\nGPU. Compared to two state-of-the-art ASIC accelerators, RECOIN achieve 26.8x and\r\n20.4x speedup respectively.},\r\nbooktitle = {Proceedings of the 2021 on Great Lakes Symposium on VLSI (GLSVLSI)},\r\npages = {235-240},\r\n}\r\n\r\n\r\n
\n
\n\n\n
\n The recently proposed Deformable Convolutional Networks (DCNs) greatly enhance the performance of conventional Convolutional Neural Networks (CNNs) on vision recognition tasks by allowing flexible input sampling at inference runtime. DCNs introduce an additional convolutional layer for adaptive sampling offset generation, followed by a bilinear interpolation (BLI) algorithm to integerize the generated non-integer offset values. Finally, a regular convolution is performed on the loaded input pixels. Compared with conventional CNNs, DCNs demonstrate significantly increased computational complexity and irregular input-dependent memory access patterns, making it a great challenge to deploy DCNs onto edge devices for real-time computer vision tasks. In this work, we propose RECOIN, a processing-in-memory (PIM) architecture which supports DCN inference on resistive memory (ReRAM) crossbars, thus making the first DCN inference accelerator possible. We present a novel BLI processing engine that leverages both row- and column-oriented computation for in-situ BLI calculation. A mapping scheme and an address converter are particularly designed to accommodate the intensive computation and irregular data access. We implement the DCN inference in a 4-stage pipeline and evaluate the effectiveness of RECOIN on six DCN models. Experimental results show RECOIN achieves 225x and 17.4x improvement in energy efficiency compared to a general-purpose CPU and GPU, respectively. Compared to two state-of-the-art ASIC accelerators, RECOIN achieves 26.8x and 20.4x speedup, respectively.\n
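The bilinear interpolation (BLI) step of deformable convolution, which RECOIN computes in-situ, can be written in a few lines; the array layout and function name below are illustrative assumptions, not the accelerator's datapath.

# Illustrative bilinear interpolation (BLI) used by deformable convolution:
# a fractional sampling location (y, x) is resolved from the four neighboring
# integer pixels, weighted by their distances.
import numpy as np

def bilinear_sample(feature, y, x):
    """feature: 2D array; (y, x): fractional sampling coordinates."""
    h, w = feature.shape
    y0, x0 = int(np.floor(y)), int(np.floor(x))
    y1, x1 = min(y0 + 1, h - 1), min(x0 + 1, w - 1)
    dy, dx = y - y0, x - x0
    return ((1 - dy) * (1 - dx) * feature[y0, x0] +
            (1 - dy) * dx       * feature[y0, x1] +
            dy       * (1 - dx) * feature[y1, x0] +
            dy       * dx       * feature[y1, x1])

fmap = np.arange(16, dtype=float).reshape(4, 4)
# A learned offset shifts the regular sampling point (1, 1) to (1.3, 1.7).
print(bilinear_sample(fmap, 1.3, 1.7))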
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Marvel: A Vertical Resistive Accelerator for Low-Power Deep Learning Inference in Monolithic 3D.\n \n \n \n\n\n \n Fan Chen; Linghao Song; Hai Helen Li; and Yiran Chen.\n\n\n \n\n\n\n In Design, Automation Test in Europe Conference Exhibition (DATE), pages 1240-1245, 2021. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@INPROCEEDINGS{DATE2021marvel,\r\n  author={{<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and \r\n\tLinghao {Song} and Hai Helen {Li} and Yiran {Chen}},\r\n  booktitle={Design, Automation Test in Europe Conference Exhibition (DATE)}, \r\n  title={{Marvel: A Vertical Resistive Accelerator for Low-Power Deep Learning Inference in Monolithic 3D}}, \r\n  year={2021},\r\n  volume={},\r\n  number={},\r\n  pages={1240-1245},\r\n  doi={10.23919/DATE51398.2021.9474208},\r\n  abstract  ={Resistive memory (ReRAM) based Deep Neural Network (DNN) accelerators have achieved state-of-the-art DNN inference throughput. However, the power efficiency of such resistive accelerators is greatly limited by their peripheral circuitry including analog-to-digital converters (ADCs), digital-to-analog converters (DACs), SRAM registers, and eDRAM buffers. These power-hungry components consume 87\\% of the total system power, despite of the high power efficiency of ReRAM computing cores. In this paper, we propose Marvel, a monolithic 3D stacked resistive DNN accelerator, which consists of carbon nanotube field-effect transistors (CNFETs) based low-power ADC/DACs, CNFET logic, CNFET SRAM, and high-density global buffers implemented by cross-point Spin Transfer Torque Magnetic RAM (STT-MRAM). To compensate for the loss of inference throughput that is incurred by the slow CNFET ADCs, we propose to integrate more ADC layers into Marvel. Unlike the CMOS-based ADCs that can only be implemented in the bottom layer of the 3D structure, multiple CNFET layers can be implemented using a monolithic 3D stacking technique. Compared to prior ReRAM-based DNN accelerators, on average, Marvel achieves the same inference throughput with 4.5x improvement on performance per Watt. We also demonstrated that increasing the number of integration layers enables Marvelto further achieve 2x inference throughput with 7.6x improved power efficiency.},\r\n}\r\n\r\n\r\n
\n
\n\n\n
\n Resistive memory (ReRAM) based Deep Neural Network (DNN) accelerators have achieved state-of-the-art DNN inference throughput. However, the power efficiency of such resistive accelerators is greatly limited by their peripheral circuitry, including analog-to-digital converters (ADCs), digital-to-analog converters (DACs), SRAM registers, and eDRAM buffers. These power-hungry components consume 87% of the total system power, despite the high power efficiency of ReRAM computing cores. In this paper, we propose Marvel, a monolithic 3D stacked resistive DNN accelerator, which consists of carbon nanotube field-effect transistor (CNFET) based low-power ADCs/DACs, CNFET logic, CNFET SRAM, and high-density global buffers implemented with cross-point Spin Transfer Torque Magnetic RAM (STT-MRAM). To compensate for the loss of inference throughput incurred by the slow CNFET ADCs, we propose to integrate more ADC layers into Marvel. Unlike CMOS-based ADCs, which can only be implemented in the bottom layer of the 3D structure, multiple CNFET layers can be implemented using a monolithic 3D stacking technique. Compared to prior ReRAM-based DNN accelerators, on average, Marvel achieves the same inference throughput with a 4.5x improvement in performance per Watt. We also demonstrate that increasing the number of integration layers enables Marvel to further achieve 2x inference throughput with 7.6x improved power efficiency.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n RAISE: A Resistive Accelerator for Subject-Independent EEG Signal Classification.\n \n \n \n\n\n \n Fan Chen; Linghao Song; Hai Helen Li; and Yiran Chen.\n\n\n \n\n\n\n In Design, Automation Test in Europe Conference Exhibition (DATE), pages 340-343, 2021. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@INPROCEEDINGS{DATE2021raise,\r\n  author={{<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n\tLinghao {Song} and Hai Helen {Li} and Yiran {Chen}},\r\n  booktitle={Design, Automation Test in Europe Conference Exhibition (DATE)}, \r\n  title={{RAISE: A Resistive Accelerator for Subject-Independent EEG Signal Classification}}, \r\n  year={2021},\r\n  volume={},\r\n  number={},\r\n  pages={340-343},\r\n  doi={10.23919/DATE51398.2021.9473993},\r\n  abstract  ={State-of-the-art deep neural networks (DNNs) for electroencephalography (EEG) signals classification focus on subject-related tasks, in which the test data and the training data needs to be collected from the same subject. In addition, due to limited computing resources and strict power budgets at edges, it is very challenging to deploy the inference of such DNN models on biological devices. In this work, we present an algorithm/hardware co-designed low-power accelerator for subject-independent EEG signal classification. We propose a compact neural network that is capable to identify the common and stable structure among subjects. Based on it, we realize a robust subject-independent EEG signal classification model that can be extended to multiple BCI tasks with minimal overhead. Based on this model, we present RAISE, a low-power processing-in-memory inference accelerator by leveraging the emerging resistive memory. We compare the proposed model and hardware accelerator to prior arts across various BCI paradigms. We show that our model achieves the best subject-independent classification accuracy, while RAISE achieves 2.8x power reduction and 2.5x improvement in performance per watt compared to the state-of-the-art resistive inference accelerator.},\r\n}\r\n\r\n% 2020\r\n
\n
\n\n\n
\n State-of-the-art deep neural networks (DNNs) for electroencephalography (EEG) signal classification focus on subject-related tasks, in which the test data and the training data need to be collected from the same subject. In addition, due to limited computing resources and strict power budgets at the edge, it is very challenging to deploy the inference of such DNN models on biological devices. In this work, we present an algorithm/hardware co-designed low-power accelerator for subject-independent EEG signal classification. We propose a compact neural network that is capable of identifying the common and stable structure among subjects. Based on it, we realize a robust subject-independent EEG signal classification model that can be extended to multiple BCI tasks with minimal overhead. Based on this model, we present RAISE, a low-power processing-in-memory inference accelerator that leverages emerging resistive memory. We compare the proposed model and hardware accelerator to prior art across various BCI paradigms. We show that our model achieves the best subject-independent classification accuracy, while RAISE achieves 2.8x power reduction and 2.5x improvement in performance per watt compared to the state-of-the-art resistive inference accelerator.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2020\n \n \n (6)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n Low-Cost Floating-Point Processing in ReRAM for Scientific Computing.\n \n \n \n\n\n \n Linghao Song; Fan Chen; Xuehai Qian; Hai Li; and Yiran Chen.\n\n\n \n\n\n\n CoRR, abs/2011.03190. 2020.\n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@article{refloat2020arxive,\r\n  author    = {Linghao Song and\r\n               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Xuehai Qian and\r\n               Hai Li and\r\n               Yiran Chen},\r\n  title     = {Low-Cost Floating-Point Processing in ReRAM for Scientific Computing},\r\n  journal   = {CoRR},\r\n  volume    = {abs/2011.03190},\r\n  year      = {2020},\r\n  archivePrefix = {arXiv},\r\n  eprint    = {2011.03190},\r\n  abstract  = {We propose ReFloat, a principled approach for low-cost floating-point processing in ReRAM. The exponent offsets based on a base are stored by a flexible and fine-grained floating-point number representation. The key motivation is that, while the number of exponent bits must be reduced due to the exponential relation to the computation latency and hardware cost, the convergence still requires sufficient accuracy for exponents. Our design reconciles the conflicting goals by storing the exponent offsets from a common base among matrix values in a block, which is the granularity of computation in ReRAM. Due to the value locality, the differences among the exponents in a block are small, thus the offsets require much less number of bits to represent exponents. In essence, ReFloat enables the principled local fine-tuning of floating-point representation. Based on the idea, we define a flexible ReFloat format that specifies matrix block size, and the number of bits for exponent and fraction. To determine the base for each block, we propose an optimization method that minimizes the difference between the exponents of the original matrix block and the converted block. We develop the conversion scheme from default double-precision floating-point format to ReFloat format, the computation procedure, and the low-cost floating-point processing architecture in ReRAM.}\r\n\r\n}\r\n\r\n
\n
\n\n\n
\n We propose ReFloat, a principled approach for low-cost floating-point processing in ReRAM. Exponent offsets relative to a shared base are stored in a flexible and fine-grained floating-point number representation. The key motivation is that, while the number of exponent bits must be reduced due to its exponential relation to the computation latency and hardware cost, convergence still requires sufficient accuracy for exponents. Our design reconciles these conflicting goals by storing the exponent offsets from a common base among the matrix values in a block, which is the granularity of computation in ReRAM. Due to value locality, the differences among the exponents in a block are small, so the offsets require far fewer bits to represent the exponents. In essence, ReFloat enables principled local fine-tuning of the floating-point representation. Based on this idea, we define a flexible ReFloat format that specifies the matrix block size and the number of bits for the exponent and fraction. To determine the base for each block, we propose an optimization method that minimizes the difference between the exponents of the original matrix block and the converted block. We develop the conversion scheme from the default double-precision floating-point format to the ReFloat format, the computation procedure, and the low-cost floating-point processing architecture in ReRAM.\n
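A small sketch of the block-wise exponent-offset idea: within a block, exponents are stored as short offsets from a shared base. The block size, bit widths, and the simple minimum-exponent base choice below are simplified assumptions for illustration, not the paper's exact ReFloat format or base-selection optimization.

# Simplified sketch of a block-wise exponent-offset encoding in the spirit of
# ReFloat: one shared base exponent per block plus a few offset bits per value,
# instead of a full exponent for every value.
import numpy as np

def encode_block(block, offset_bits=2, frac_bits=4):
    exps = np.floor(np.log2(np.abs(block))).astype(int)
    base = int(exps.min())                       # simple base choice (assumed)
    offsets = np.clip(exps - base, 0, 2**offset_bits - 1)
    # Quantize the fraction to frac_bits (value / 2^exponent lies in [1, 2)).
    mant = np.abs(block) / (2.0 ** (base + offsets))
    mant_q = np.round((mant - 1.0) * 2**frac_bits) / 2**frac_bits + 1.0
    signs = np.sign(block)
    return base, offsets, mant_q, signs

def decode_block(base, offsets, mant_q, signs):
    return signs * mant_q * (2.0 ** (base + offsets))

block = np.array([0.31, 0.27, 0.42, 0.55])       # nearby magnitudes: small offsets
rec = decode_block(*encode_block(block))
print("original:", block)
print("reconstructed:", rec)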
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n AccPar: Tensor Partitioning for Heterogeneous Deep Learning Accelerators.\n \n \n \n\n\n \n Linghao Song; Fan Chen; Youwei Zhuo; Xuehai Qian; Hai Li; and Yiran Chen.\n\n\n \n\n\n\n In IEEE International Symposium on High Performance Computer Architecture (HPCA), pages 342–355, 2020. IEEE\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@INPROCEEDINGS{hpca2020,\r\n  author    = {Linghao Song and\r\n               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Youwei Zhuo and\r\n               Xuehai Qian and\r\n               Hai Li and\r\n               Yiran Chen},\r\n  title     = {AccPar: Tensor Partitioning for Heterogeneous Deep Learning Accelerators},\r\n  booktitle = {{IEEE} International Symposium on High Performance Computer Architecture (HPCA)},\r\n  pages     = {342--355},\r\n  publisher = {{IEEE}},\r\n  year      = {2020},\r\n  doi       = {10.1109/HPCA47549.2020.00036},\r\n  abstract  ={Deep neural network (DNN) accelerators as an example of domain-specific architecture have demonstrated great success in DNN inference. However, the architecture acceleration for equally important DNN training has not yet been fully studied. With data forward, error backward and gradient calculation, DNN training is a more complicated process with higher computation and communication intensity. Because the recent research demonstrates a diminishing specialization return, namely, “accelerator wall” , we believe that a promising approach is to explore coarse-grained parallelism among multiple performance-bounded accelerators to support DNN training. Distributing computations on multiple heterogeneous accelerators to achieve high throughput and balanced execution, however, remaining challenging. We present ACCPAR, a principled and systematic method of determining the tensor partition among heterogeneous accelerator arrays. Compared to prior empirical or unsystematic methods, ACCPAR considers the complete tensor partition space and can reveal previously unknown new parallelism configurations. ACCPAR optimizes the performance based on a cost model that takes into account both computation and communication costs of a heterogeneous execution environment. Hence, our method can avoid the drawbacks of existing approaches that use communication as a proxy of the performance. The enhanced flexibility of tensor partitioning in ACCPAR allows the flexible ratio of computations to be distributed among accelerators with different performances. The proposed search algorithm is also applicable to the emerging multi-path patterns in modern DNNs such as ResNet. We simulate ACCPAR on a heterogeneous accelerator array composed of both TPU-v2 and TPU-v3 accelerators for the training of large-scale DNN models such as Alexnet, Vgg series and Resnet series. The average performance improvements of the state-of-the-art “one weird trick” (OWT) and HYPAR, and ACCPAR, normalized to the baseline data parallelism scheme where each accelerator replicates the model and processes different input data in parallel, are 2.98x, 3.78x, and 6.30x, respectively.},\r\n}\r\n\r\n
\n
\n\n\n
\n Deep neural network (DNN) accelerators, as an example of domain-specific architecture, have demonstrated great success in DNN inference. However, architectural acceleration for the equally important DNN training has not yet been fully studied. With data forward, error backward, and gradient calculation, DNN training is a more complicated process with higher computation and communication intensity. Because recent research demonstrates a diminishing specialization return, namely the “accelerator wall”, we believe that a promising approach is to explore coarse-grained parallelism among multiple performance-bounded accelerators to support DNN training. Distributing computations on multiple heterogeneous accelerators to achieve high throughput and balanced execution, however, remains challenging. We present ACCPAR, a principled and systematic method of determining the tensor partition among heterogeneous accelerator arrays. Compared to prior empirical or unsystematic methods, ACCPAR considers the complete tensor partition space and can reveal previously unknown parallelism configurations. ACCPAR optimizes performance based on a cost model that takes into account both the computation and communication costs of a heterogeneous execution environment. Hence, our method can avoid the drawbacks of existing approaches that use communication as a proxy for performance. The enhanced flexibility of tensor partitioning in ACCPAR allows a flexible ratio of computations to be distributed among accelerators with different performance. The proposed search algorithm is also applicable to the emerging multi-path patterns in modern DNNs such as ResNet. We simulate ACCPAR on a heterogeneous accelerator array composed of both TPU-v2 and TPU-v3 accelerators for the training of large-scale DNN models such as AlexNet, the VGG series, and the ResNet series. The average performance improvements of the state-of-the-art “one weird trick” (OWT), HYPAR, and ACCPAR, normalized to a baseline data-parallelism scheme where each accelerator replicates the model and processes different input data in parallel, are 2.98x, 3.78x, and 6.30x, respectively.\n
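A toy cost-model comparison in the spirit of tensor partitioning: for one fully connected layer split across two accelerators of different speeds, compare batch-dimension vs. output-feature-dimension partitioning by counting both compute and communication. The cost formulas, device parameters, and function name are illustrative assumptions, not ACCPAR's model.

# Toy cost model: partition one layer (B x Cin -> B x Cout) across two devices
# and compare two partitioning choices by compute time plus communication time.
def layer_costs(B, Cin, Cout, flops_per_s, link_bytes_per_s, bytes_per_elem=2):
    t_fast, t_slow = flops_per_s            # (fast device, slow device) FLOP/s
    share_fast = t_fast / (t_fast + t_slow) # give work proportional to speed

    def compute_time(work_flops):
        # Devices run in parallel; the finish time is the slower of the two.
        return max(work_flops * share_fast / t_fast,
                   work_flops * (1 - share_fast) / t_slow)

    total_flops = 2 * B * Cin * Cout
    # (a) Batch partitioning: weights replicated, activations split by batch.
    comm_batch = Cin * Cout * bytes_per_elem
    # (b) Output-feature partitioning: weights split, inputs broadcast,
    #     partial outputs gathered.
    comm_feature = (B * Cin + B * Cout) * bytes_per_elem
    t = compute_time(total_flops)
    return {"batch":   t + comm_batch / link_bytes_per_s,
            "feature": t + comm_feature / link_bytes_per_s}

costs = layer_costs(B=256, Cin=4096, Cout=4096,
                    flops_per_s=(180e12, 45e12), link_bytes_per_s=50e9)
print(min(costs, key=costs.get), costs)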
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n ReBoc: Accelerating Block-Circulant Neural Networks in ReRAM.\n \n \n \n\n\n \n Yitu Wang; Fan Chen; Linghao Song; C.-J. Richard Shi; Hai Helen Li; and Yiran Chen.\n\n\n \n\n\n\n In Design, Automation Test in Europe Conference Exhibition (DATE), pages 1472–1477, 2020. IEEE\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@INPROCEEDINGS{DATE20cir, \r\n  author    = {Yitu Wang and\r\n               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Linghao Song and\r\n               C.{-}J. Richard Shi and\r\n               Hai Helen Li and\r\n               Yiran Chen},\r\n  title     = {ReBoc: Accelerating Block-Circulant Neural Networks in ReRAM},\r\n  booktitle = {Design, Automation Test in Europe Conference Exhibition (DATE)}, \r\n  pages     = {1472--1477},\r\n  publisher = {{IEEE}},\r\n  year      = {2020},\r\n  doi       = {10.23919/DATE48585.2020.9116422},\r\n  abstract  = {Deep neural networks (DNNs) emerge as a key component in various applications. However, the ever-growing DNN size hinders efficient processing on hardware. To tackle this problem, on the algorithmic side, compressed DNN models are explored, of which block-circulant DNN models are memory efficient and hardware-friendly; on the hardware side, resistive random-access memory (ReRAM) based accelerators are promising for in-situ processing of DNNs. In this work, we design an accelerator named ReBoc for accelerating block-circulant DNNs in ReRAM to reap the benefits of light-weight models and efficient in-situ processing simultaneously. We propose a novel mapping scheme which utilizes Horizontal Weight Slicing and Intra-Crossbar Weight Duplication to map block-circulant DNN models onto ReRAM crossbars with significant improved crossbar utilization. Moreover, two specific techniques, namely Input Slice Reusing and Input Tile Sharing are introduced to take advantage of the circulant calculation feature in block- circulant DNNs to reduce data access and buffer size. In REBOC, a DNN model is executed within an intra-layer processing pipeline and achieves respectively 96x and 8.86x power efficiency improvement compared to the state-of-the-art FPGA and ASIC accelerators for block-circulant neural networks. Compared to ReRAM-based DNN accelerators, REBOC achieves averagely 4.1x speedup and 2.6x energy reduction.},\r\n}\r\n\r\n
\n
\n\n\n
\n Deep neural networks (DNNs) have emerged as a key component in various applications. However, the ever-growing DNN size hinders efficient processing on hardware. To tackle this problem, on the algorithmic side, compressed DNN models are explored, of which block-circulant DNN models are memory-efficient and hardware-friendly; on the hardware side, resistive random-access memory (ReRAM) based accelerators are promising for in-situ processing of DNNs. In this work, we design an accelerator named ReBoc for accelerating block-circulant DNNs in ReRAM to reap the benefits of light-weight models and efficient in-situ processing simultaneously. We propose a novel mapping scheme which utilizes Horizontal Weight Slicing and Intra-Crossbar Weight Duplication to map block-circulant DNN models onto ReRAM crossbars with significantly improved crossbar utilization. Moreover, two specific techniques, namely Input Slice Reusing and Input Tile Sharing, are introduced to take advantage of the circulant calculation feature in block-circulant DNNs to reduce data access and buffer size. In ReBoc, a DNN model is executed within an intra-layer processing pipeline, achieving 96x and 8.86x power efficiency improvement compared to the state-of-the-art FPGA and ASIC accelerators for block-circulant neural networks, respectively. Compared to ReRAM-based DNN accelerators, ReBoc achieves an average 4.1x speedup and 2.6x energy reduction.\n
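Block-circulant compression, which ReBoc maps onto crossbars, replaces a dense weight block with a circulant matrix defined by a single vector, so a block matrix-vector product reduces to a circular convolution. A minimal sketch is shown below; the block size and the explicit circulant construction are illustrative choices, not the paper's mapping scheme.

# Sketch of block-circulant compression: each k x k weight block is defined by
# one length-k vector, and the block matrix-vector product equals a circular
# convolution (computable with FFTs).
import numpy as np

def circulant(c):
    n = len(c)
    return np.array([[c[(i - j) % n] for j in range(n)] for i in range(n)])

rng = np.random.default_rng(1)
k = 4                                    # circulant block size (assumed)
c = rng.normal(size=k)                   # one vector defines the whole k x k block
x = rng.normal(size=k)

dense_result = circulant(c) @ x
fft_result = np.real(np.fft.ifft(np.fft.fft(c) * np.fft.fft(x)))
print(np.allclose(dense_result, fft_result))   # True: k^2 weights become k weights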
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n A Survey of Accelerator Architectures for Deep Neural Networks.\n \n \n \n\n\n \n Yiran Chen; Yuan Xie; Linghao Song; Fan Chen; and Tianqi Tang.\n\n\n \n\n\n\n Engineering, 6(3): 264 - 274. 2020.\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{Engineering2020,\r\ntitle = "A Survey of Accelerator Architectures for Deep Neural Networks",\r\njournal = "Engineering",\r\nvolume = "6",\r\nnumber = "3",\r\npages = "264 - 274",\r\nyear = "2020",\r\nissn = "2095-8099",\r\ndoi = "https://doi.org/10.1016/j.eng.2020.01.007",\r\nauthor = {Yiran Chen and \r\n\tYuan Xie and \r\n\tLinghao Song and \r\n\t{<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n\tTianqi Tang},\r\nkeywords = "Deep neural network, Domain-specific architecture, Accelerator",\r\nabstract = "Recently, due to the availability of big data and the rapid growth of computing power, artificial intelligence (AI) has regained tremendous attention and investment. Machine learning (ML) approaches have been successfully applied to solve many problems in academia and in industry. Although the explosion of big data applications is driving the development of ML, it also imposes severe challenges of data processing speed and scalability on conventional computer systems. Computing platforms that are dedicatedly designed for AI applications have been considered, ranging from a complement to von Neumann platforms to a “must-have” and stand-alone technical solution. These platforms, which belong to a larger category named “domain-specific computing,” focus on specific customization for AI. In this article, we focus on summarizing the recent advances in accelerator designs for deep neural networks (DNNs)—that is, DNN accelerators. We discuss various architectures that support DNN executions in terms of computing units, dataflow optimization, targeted network topologies, architectures on emerging technologies, and accelerators for emerging applications. We also provide our visions on the future trend of AI chip designs."\r\n}\r\n\r\n
\n
\n\n\n
\n Recently, due to the availability of big data and the rapid growth of computing power, artificial intelligence (AI) has regained tremendous attention and investment. Machine learning (ML) approaches have been successfully applied to solve many problems in academia and in industry. Although the explosion of big data applications is driving the development of ML, it also imposes severe challenges of data processing speed and scalability on conventional computer systems. Computing platforms dedicated to AI applications have been considered, ranging from a complement to von Neumann platforms to a “must-have” and stand-alone technical solution. These platforms, which belong to a larger category named “domain-specific computing,” focus on specific customization for AI. In this article, we focus on summarizing the recent advances in accelerator designs for deep neural networks (DNNs)—that is, DNN accelerators. We discuss various architectures that support DNN executions in terms of computing units, dataflow optimization, targeted network topologies, architectures on emerging technologies, and accelerators for emerging applications. We also provide our vision of future trends in AI chip design.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Parallelism in Deep Learning Accelerators.\n \n \n \n\n\n \n Linghao Song; Fan Chen; Yiran Chen; and Hai Helen Li.\n\n\n \n\n\n\n In 25th Asia and South Pacific Design Automation Conference (ASP-DAC), pages 645–650, 2020. IEEE\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{parallelism2020aspdac,\r\n  author    = {Linghao Song and\r\n               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Yiran Chen and\r\n               Hai Helen Li},\r\n  title     = {Parallelism in Deep Learning Accelerators},\r\n  booktitle = {25th Asia and South Pacific Design Automation Conference (ASP-DAC)},\r\n  pages     = {645--650},\r\n  publisher = {{IEEE}},\r\n  year      = {2020},\r\n  doi       = {10.1109/ASP-DAC47756.2020.9045206},\r\n  abstract  = {Deep learning is the core of artificial intelligence and it achieves state-of-the-art in a wide range of applications. The intensity of computation and data in deep learning processing poses significant challenges to the conventional computing platforms. Thus, specialized accelerator architectures are proposed for the acceleration of deep learning. In this paper, we classify the design space of current deep learning accelerators into three levels, (1) processing engine, (2) memory and (3) accelerator, and present a constructive view from a perspective of parallelism in the three levels.}\r\n}\r\n\r\n
\n
\n\n\n
\n Deep learning is the core of artificial intelligence and achieves state-of-the-art results in a wide range of applications. The intensity of computation and data in deep learning processing poses significant challenges to conventional computing platforms. Thus, specialized accelerator architectures are proposed for the acceleration of deep learning. In this paper, we classify the design space of current deep learning accelerators into three levels, (1) processing engine, (2) memory, and (3) accelerator, and present a constructive view from the perspective of parallelism at these three levels.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n PARC: A Processing-in-CAM Architecture for Genomic Long Read Pairwise Alignment using ReRAM.\n \n \n \n\n\n \n Fan Chen; Linghao Song; Hai Helen Li; and Yiran Chen.\n\n\n \n\n\n\n In 25th Asia and South Pacific Design Automation Conference (ASP-DAC), pages 175–180, 2020. IEEE\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@INPROCEEDINGS{ASPDAC2020, \r\n  author    = {{<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Linghao Song and\r\n               Hai Helen Li and\r\n               Yiran Chen},\r\n  title     = {{PARC:} {A} Processing-in-CAM Architecture for Genomic Long Read Pairwise\r\n               Alignment using ReRAM},\r\n  booktitle = {25th Asia and South Pacific Design Automation Conference (ASP-DAC)},\r\n  pages     = {175--180},\r\n  publisher = {{IEEE}},\r\n  year      = {2020},\r\n  doi       = {10.1109/ASP-DAC47756.2020.9045555},\r\n  abstract  = {Technological advances in long read sequences have greatly facilitated the development of genomics. However, managing and analyzing the raw genomic data that outpaces Moore's Law requires extremely high computational efficiency. On the one hand, existing software solutions can take hundreds of CPU hours to complete human genome alignment. On the other hand, the recently proposed hardware platforms achieve low processing throughput with significant overhead. In this paper, we propose PARC, an Processing-in-Memory architecture for long read pairwise alignment leveraging emerging resistive CAM (content-addressable memory) to accelerate the bottleneck chaining step in DNA alignment. Chaining takes 2-tuple anchors as inputs and identifies a set of correlated anchors as potential alignment candidates. Unlike traditional main memory which organizes relational data structure in a linear address space, PARC stores tuples in two neighboring crossbar arrays with shared row decoder such that column-wise in-memory computational operations and row-wise memory accesses can be performed in-situ in a symmetric crossbar structure. Compared to both software tools and state-of-the-art accelerators, PARC shows significant improvement in alignment throughput and energy efficiency, thanks to the in-site computation capability and optimized data mapping.},\r\n}\r\n\r\n% 2019\r\n
\n
\n\n\n
\n Technological advances in long read sequencing have greatly facilitated the development of genomics. However, managing and analyzing raw genomic data that outpaces Moore's Law requires extremely high computational efficiency. On the one hand, existing software solutions can take hundreds of CPU hours to complete a human genome alignment. On the other hand, recently proposed hardware platforms achieve low processing throughput with significant overhead. In this paper, we propose PARC, a Processing-in-Memory architecture for long read pairwise alignment that leverages emerging resistive CAM (content-addressable memory) to accelerate the bottleneck chaining step in DNA alignment. Chaining takes 2-tuple anchors as inputs and identifies a set of correlated anchors as potential alignment candidates. Unlike traditional main memory, which organizes relational data structures in a linear address space, PARC stores tuples in two neighboring crossbar arrays with a shared row decoder such that column-wise in-memory computational operations and row-wise memory accesses can be performed in-situ in a symmetric crossbar structure. Compared to both software tools and state-of-the-art accelerators, PARC shows significant improvement in alignment throughput and energy efficiency, thanks to its in-situ computation capability and optimized data mapping.\n
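Chaining, the step PARC accelerates, takes anchors (matching positions in the query and reference plus a match length) and selects a high-scoring colinear subset. A minimal quadratic dynamic-programming sketch is shown below; the scoring and gap penalty are simplified assumptions, not the scheme used by the accelerated aligner.

# Minimal sketch of anchor chaining: given anchors (query_pos, ref_pos, length),
# find the best-scoring chain of colinear anchors with an O(n^2) DP.
def chain_anchors(anchors, gap_penalty=0.1):
    anchors = sorted(anchors)                       # sort by query position
    score = [a[2] for a in anchors]                 # start: each anchor alone
    prev = [-1] * len(anchors)
    for i, (qi, ri, li) in enumerate(anchors):
        for j, (qj, rj, lj) in enumerate(anchors[:i]):
            if qj < qi and rj < ri:                 # colinear predecessor
                gap = abs((qi - qj) - (ri - rj))
                s = score[j] + li - gap_penalty * gap
                if s > score[i]:
                    score[i], prev[i] = s, j
    # Backtrack from the best-scoring anchor.
    best = max(range(len(anchors)), key=lambda i: score[i])
    chain = []
    while best != -1:
        chain.append(anchors[best])
        best = prev[best]
    return chain[::-1], max(score)

anchors = [(5, 7, 20), (40, 45, 15), (42, 300, 30), (80, 90, 25)]
print(chain_anchors(anchors))   # the off-diagonal anchor (42, 300, 30) is skipped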
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2019\n \n \n (6)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n Exploring Bit-Slice Sparsity in Deep Neural Networks for Efficient ReRAM-Based Deployment.\n \n \n \n\n\n \n Jingyang Zhang; Huanrui Yang; Fan Chen; Yitu Wang; and Hai Li.\n\n\n \n\n\n\n CoRR, abs/1909.08496. 2019.\n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@INPROCEEDINGS{EMC2020, \r\n  author    = {Jingyang Zhang and\r\n               Huanrui Yang and\r\n               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Yitu Wang and\r\n               Hai Li},\r\n  title     = {Exploring Bit-Slice Sparsity in Deep Neural Networks for Efficient\r\n               ReRAM-Based Deployment},\r\n  journal   = {CoRR},\r\n  volume    = {abs/1909.08496},\r\n  year      = {2019},\r\n  archivePrefix = {arXiv},\r\n  eprint    = {1909.08496},\r\n  abstract  = {Emerging resistive random-access memory (ReRAM) has recently been intensively investigated to accelerate the processing of deep neural networks (DNNs). Due to the in-situ computation capability, analog ReRAM crossbars yield significant throughput improvement and energy reduction compared to traditional digital methods. However, the power hungry analog-to-digital converters (ADCs) prevent the practical deployment of ReRAM-based DNN accelerators on end devices with limited chip area and power budget. We observe that due to the limited bit-density of ReRAM cells, DNN weights are bit sliced and correspondingly stored on multiple ReRAM bitlines. The accumulated current on bitlines resulted by weights directly dictates the overhead of ADCs. As such, bitwise weight sparsity rather than the sparsity of the full weight, is desirable for efficient ReRAM deployment. In this work, we propose bit-slice L1, the first algorithm to induce bit-slice sparsity during the training of dynamic fixed-point DNNs. Experiment results show that our approach achieves 2x sparsity improvement compared to previous algorithms. The resulting sparsity allows the ADC resolution to be reduced to 1-bit of the most significant bit-slice and down to 3-bit for the others bits, which significantly speeds up processing and reduces power and area overhead.},\r\n}\r\n\r\n
\n
\n\n\n
\n Emerging resistive random-access memory (ReRAM) has recently been intensively investigated to accelerate the processing of deep neural networks (DNNs). Due to their in-situ computation capability, analog ReRAM crossbars yield significant throughput improvement and energy reduction compared to traditional digital methods. However, the power-hungry analog-to-digital converters (ADCs) prevent the practical deployment of ReRAM-based DNN accelerators on end devices with limited chip area and power budget. We observe that, due to the limited bit-density of ReRAM cells, DNN weights are bit-sliced and correspondingly stored on multiple ReRAM bitlines. The accumulated current on the bitlines resulting from the weights directly dictates the overhead of the ADCs. As such, bit-wise weight sparsity, rather than the sparsity of the full weight, is desirable for efficient ReRAM deployment. In this work, we propose bit-slice L1, the first algorithm to induce bit-slice sparsity during the training of dynamic fixed-point DNNs. Experimental results show that our approach achieves a 2x sparsity improvement compared to previous algorithms. The resulting sparsity allows the ADC resolution to be reduced to 1 bit for the most significant bit-slice and down to 3 bits for the other bit-slices, which significantly speeds up processing and reduces power and area overhead.\n
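The bit-slicing view described above can be made concrete: a fixed-point weight is split into 2-bit slices that are stored on separate bitlines, and sparsity is then measured per slice rather than per full weight. The bit widths and the simple slice-wise zero-fraction/L1 measurement below are illustrative assumptions, not the paper's training algorithm.

# Illustrative bit-slicing of fixed-point weights into 2-bit slices and a
# slice-wise sparsity / L1 measurement (what a bit-slice regularizer targets).
import numpy as np

def bit_slices(weights, total_bits=8, slice_bits=2):
    """Return an array of shape (num_slices, n) holding the slices of |w|."""
    q = np.round(np.abs(weights) * (2**total_bits - 1)).astype(np.int64)
    num_slices = total_bits // slice_bits
    slices = []
    for s in range(num_slices):
        slices.append((q >> (s * slice_bits)) & (2**slice_bits - 1))
    return np.array(slices[::-1])        # most significant slice first

w = np.array([0.0, 0.06, 0.5, 0.9])      # weights assumed already scaled to [0, 1)
S = bit_slices(w)
print(S)
print("zero fraction per slice:", (S == 0).mean(axis=1))
print("per-slice L1:", S.sum(axis=1))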
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n How to Obtain and Run Light and Efficient Deep Learning Networks.\n \n \n \n\n\n \n Fan Chen; Wei Wen; Linghao Song; Jingchi Zhang; Hai Helen Li; and Yiran Chen.\n\n\n \n\n\n\n In Proceedings of the International Conference on Computer-Aided Design (ICCAD), pages 1–5, 2019. ACM\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{ICCAD2019,\r\n  author    = {{<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Wei Wen and\r\n               Linghao Song and\r\n               Jingchi Zhang and\r\n               Hai Helen Li and\r\n               Yiran Chen},\r\n  title     = {How to Obtain and Run Light and Efficient Deep Learning Networks},\r\n  booktitle = {Proceedings of the International Conference on Computer-Aided Design (ICCAD)},\r\n  pages     = {1--5},\r\n  publisher = {{ACM}},\r\n  year      = {2019},\r\n  doi       = {10.1109/ICCAD45719.2019.8942106},\r\n  abstract  = {As the model size of deep neural networks (DNNs) grows for better performance, the increase in computational cost associated with training and testing makes it extremely difficulty to deploy DNNs on end/edge devices with limited resources while also satisfying the response time requirement. To address this challenge, model compression which compresses model size and thus reduces computation cost is widely adopted in deep learning society. However, the practical impacts of hardware design are often ignored in these algorithm-level solutions, such as the increase of the random accesses to memory hierarchy and the constraints of memory capacity. On the other side, limited understanding about the computational needs at algorithm level may lead to unrealistic assumptions during the hardware designs. In this work, we will discuss this mismatch and provide how our approach addresses it through an interactive design practice across both software and hardware levels.},\r\n} \r\n\r\n
\n
\n\n\n
\n As the model size of deep neural networks (DNNs) grows for better performance, the increase in the computational cost associated with training and testing makes it extremely difficult to deploy DNNs on end/edge devices with limited resources while also satisfying the response time requirement. To address this challenge, model compression, which reduces model size and thus computation cost, is widely adopted in the deep learning community. However, the practical impacts of hardware design are often ignored in these algorithm-level solutions, such as the increase in random accesses to the memory hierarchy and the constraints of memory capacity. On the other hand, limited understanding of the computational needs at the algorithm level may lead to unrealistic assumptions during hardware design. In this work, we discuss this mismatch and show how our approach addresses it through an interactive design practice across both the software and hardware levels.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Taming extreme heterogeneity via machine learning based design of autonomous manycore systems.\n \n \n \n\n\n \n Paul Bogdan; Fan Chen; Aryan Deshwal; Janardhan Rao Doppa; Biresh Kumar Joardar; Hai Helen Li; Shahin Nazarian; Linghao Song; and Yao Xiao.\n\n\n \n\n\n\n In Proceedings of the International Conference on Hardware/Software Codesign and System Synthesis Companion (CODES+ISSS), pages 21:1–21:10, 2019. ACM\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@INPROCEEDINGS{CODES2019, \r\n  author    = {Paul Bogdan and\r\n               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Aryan Deshwal and\r\n               Janardhan Rao Doppa and\r\n               Biresh Kumar Joardar and\r\n               Hai Helen Li and\r\n               Shahin Nazarian and\r\n               Linghao Song and\r\n               Yao Xiao},\r\n  title     = {Taming extreme heterogeneity via machine learning based design of\r\n               autonomous manycore systems},\r\n  booktitle = {Proceedings of the International Conference on Hardware/Software Codesign\r\n               and System Synthesis Companion (CODES+ISSS)},\r\n  pages     = {21:1--21:10},\r\n  publisher = {{ACM}},\r\n  year      = {2019},\r\n  doi       = {10.1145/3349567.3357376}, \r\n  abstract  = {To avoid rewriting software code for new computer architectures and to take advantage of the extreme heterogeneous processing, communication and storage technologies, there is an urgent need for determining the right amount and type of specialization while making a heterogeneous system as programmable and flexible as possible. To enable both programmability and flexibility in the heterogeneous computing era, we propose a novel complex network inspired model of computation and efficient optimization algorithms for determining the optimal degree of parallelization from old software code. This mathematical framework allows us to determine the required number and type of processing elements, the amount and type of deep memory hierarchy, and the degree of reconfiguration for the communication infrastructure, thus opening new avenues to performance and energy efficiency. Our framework enables heterogeneous manycore systems to autonomously adapt from traditional switching techniques to network coding strategies in order to sustain on-chip communication in the order of terabytes. While this new programming model enables the design of self-programmable autonomous heterogeneous manycore systems, a number of open challenges will be discussed.},\r\n}\r\n\r\n
\n
\n\n\n
\n To avoid rewriting software code for new computer architectures and to take advantage of the extreme heterogeneous processing, communication and storage technologies, there is an urgent need for determining the right amount and type of specialization while making a heterogeneous system as programmable and flexible as possible. To enable both programmability and flexibility in the heterogeneous computing era, we propose a novel complex network inspired model of computation and efficient optimization algorithms for determining the optimal degree of parallelization from old software code. This mathematical framework allows us to determine the required number and type of processing elements, the amount and type of deep memory hierarchy, and the degree of reconfiguration for the communication infrastructure, thus opening new avenues to performance and energy efficiency. Our framework enables heterogeneous manycore systems to autonomously adapt from traditional switching techniques to network coding strategies in order to sustain on-chip communication in the order of terabytes. While this new programming model enables the design of self-programmable autonomous heterogeneous manycore systems, a number of open challenges will be discussed.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n ZARA: A Novel Zero-free Dataflow Accelerator for Generative Adversarial Networks in 3D ReRAM.\n \n \n \n\n\n \n Fan Chen; Linghao Song; Hai Helen Li; and Yiran Chen.\n\n\n \n\n\n\n In Annual Design Automation Conference (DAC), pages 133, 2019. ACM\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{DAC2019,\r\n  author    = {{<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Linghao Song and\r\n               Hai Helen Li and\r\n               Yiran Chen},\r\n  title     = {{ZARA:} {A} Novel Zero-free Dataflow Accelerator for Generative Adversarial\r\n               Networks in 3D ReRAM},\r\n  booktitle = {Annual Design Automation Conference (DAC)},\r\n  pages     = {133},\r\n  publisher = {{ACM}},\r\n  year      = {2019},\r\n  doi       = {10.1145/3316781.3317936},\r\n  abstract  = {Generative Adversarial Networks (GANs) recently demonstrated a great opportunity toward unsupervised learning with the intention to mitigate the massive human efforts on data labeling in supervised learning algorithms. GAN combines a generative model and a discriminative model to oppose each other in an adversarial situation to refine their abilities. Existing nonvolatile memory based machine learning accelerators, however, could not support the computational needs required by GAN training. Specifically, the generator utilizes a new operator, called transposed convolution, which introduces significant resource underutilization when executed on conventional neural network accelerators as it inserts massive zeros in its input before a convolution operation. In this work, we propose a novel computational deformation technique that synergistically optimizes the forward and backward functions in transposed convolution to eliminate the large resource underutilization. In addition, we present dedicated control units -a dataflow mapper and an operation scheduler, to support the proposed execution model with high parallelism and low energy consumption. ZARA is implemented with commodity ReRAM chips, and experimental results show that our design can improve GAN's training performance by averagely 1.6x ~ 23x over CMOS-based GAN accelerators. Compared to state-of-the-art ReRAM-based accelerator designs, ZARA also provides 1.15x ~ 2.1x performance improvement.},\r\n} \r\n\r\n
\n
\n\n\n
\n Generative Adversarial Networks (GANs) have recently demonstrated a great opportunity toward unsupervised learning, with the intention to mitigate the massive human effort spent on data labeling in supervised learning algorithms. A GAN combines a generative model and a discriminative model that oppose each other in an adversarial situation to refine their abilities. Existing nonvolatile-memory-based machine learning accelerators, however, cannot support the computational needs required by GAN training. Specifically, the generator utilizes a new operator, called transposed convolution, which introduces significant resource underutilization when executed on conventional neural network accelerators, as it inserts massive zeros in its input before a convolution operation. In this work, we propose a novel computational deformation technique that synergistically optimizes the forward and backward functions in transposed convolution to eliminate the large resource underutilization. In addition, we present dedicated control units, a dataflow mapper and an operation scheduler, to support the proposed execution model with high parallelism and low energy consumption. ZARA is implemented with commodity ReRAM chips, and experimental results show that our design can improve GAN training performance by 1.6x~23x on average over CMOS-based GAN accelerators. Compared to state-of-the-art ReRAM-based accelerator designs, ZARA also provides 1.15x~2.1x performance improvement.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Efficient Process-in-Memory Architecture Design for Unsupervised GAN-based Deep Learning using ReRAM.\n \n \n \n\n\n \n Fan Chen; Linghao Song; and Hai (Helen) Li.\n\n\n \n\n\n\n In Proceedings of the 2019 on Great Lakes Symposium on VLSI (GLSVLSI), pages 423–428, 2019. ACM\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{glsvlsi2019,\r\n  author    = {{<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Linghao Song and\r\n               Hai (Helen) Li},\r\n  title     = {Efficient Process-in-Memory Architecture Design for Unsupervised GAN-based\r\n               Deep Learning using ReRAM},\r\n  booktitle = {Proceedings of the 2019 on Great Lakes Symposium on VLSI (GLSVLSI)},\r\n  pages     = {423--428},\r\n  publisher = {{ACM}},\r\n  year      = {2019},\r\n  doi       = {10.1145/3299874.3319482},\r\n  abstract  = {The ending of Moore's Law makes domain-specific architecture as the future of computing. The most representative is the emergence of various deep learning accelerators. Among the proposed solutions, resistive random access memory (ReRAM) based process-in-memory (PIM) architecture is anticipated as a promising candidate because ReRAM has the capability of both data storage and in-situ computation. However, we found that existing solutions are unable to efficiently support the computational needs required by the training of unsupervised generative adversarial networks (GANs), due to the lack of the following two features: 1) Computation efficiency: GAN utilizes a new operator, called transposed convolution. It inserts massive zeros in its input before a convolution operation, resulting in significant resource under-utilization; 2) Data traffic: The data intensive training process of GANs often incurs structural heavy data traffic as well as frequent massive data swaps. Our research follows the PIM strategy by leveraging the energy-efficiency of ReRAM arrays for vector-matrix multiplication to enhance the performance and energy efficiency. Specifically, we propose a novel computation deformation technique that can skip zero-insertions in transposed convolution for computation efficiency improvement. Moreover, we explore an efficient pipelined training procedure to reduce on-chip memory access. The implementation of related circuits and architecture is also discussed. At the end, we present our perspective on the future trend and opportunities of deep learning accelerators.},\r\n} \r\n\r\n
\n
\n\n\n
\n The ending of Moore's Law makes domain-specific architecture the future of computing. The most representative example is the emergence of various deep learning accelerators. Among the proposed solutions, resistive random access memory (ReRAM) based process-in-memory (PIM) architecture is anticipated as a promising candidate because ReRAM has the capability of both data storage and in-situ computation. However, we found that existing solutions are unable to efficiently support the computational needs required by the training of unsupervised generative adversarial networks (GANs), due to the lack of the following two features: 1) Computation efficiency: GANs utilize a new operator, called transposed convolution, which inserts massive zeros in its input before a convolution operation, resulting in significant resource under-utilization; 2) Data traffic: the data-intensive training process of GANs often incurs heavy structural data traffic as well as frequent massive data swaps. Our research follows the PIM strategy by leveraging the energy efficiency of ReRAM arrays for vector-matrix multiplication to enhance performance and energy efficiency. Specifically, we propose a novel computation deformation technique that can skip zero-insertions in transposed convolution to improve computation efficiency. Moreover, we explore an efficient pipelined training procedure to reduce on-chip memory access. The implementation of the related circuits and architecture is also discussed. At the end, we present our perspective on the future trends and opportunities of deep learning accelerators.\n
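The zero-insertion behavior of transposed convolution, which causes the underutilization discussed above, can be seen in a few lines: the input is dilated with zeros before an ordinary convolution is applied, so a large fraction of the multiply-accumulate operands are guaranteed zeros. The input size and stride below are illustrative assumptions.

# Illustrative zero-insertion in a stride-2 transposed convolution: the input
# is dilated with zeros, then a regular convolution runs over the dilated map.
import numpy as np

def dilate_with_zeros(x, stride):
    h, w = x.shape
    out = np.zeros(((h - 1) * stride + 1, (w - 1) * stride + 1))
    out[::stride, ::stride] = x
    return out

x = np.arange(1, 10, dtype=float).reshape(3, 3)
xz = dilate_with_zeros(x, stride=2)
print(xz)
zero_fraction = (xz == 0).mean()
# These zeros translate into wasted MACs on a conventional convolution engine.
print(f"{zero_fraction:.0%} of the dilated input is zero")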
\n\n\n
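The zero-skipping idea described in the abstract above can be illustrated outside of hardware. The Python/NumPy sketch below (not the paper's circuit-level technique; the signal length, kernel, and stride are arbitrary assumptions) computes a 1-D transposed convolution twice: once by explicitly inserting zeros and convolving, and once by scattering each input sample directly, which never materializes or multiplies the inserted zeros yet produces the same result.

import numpy as np

def transposed_conv_zero_insert(x, k, stride):
    """Reference form: insert (stride-1) zeros between samples, then convolve."""
    n = len(x)
    up = np.zeros((n - 1) * stride + 1)
    up[::stride] = x                      # zero-insertion (upsampling)
    return np.convolve(up, k)             # full convolution over the zero-padded signal

def transposed_conv_zero_skip(x, k, stride):
    """Scatter form: each input sample adds one scaled copy of the kernel,
    so the inserted zeros are never stored or multiplied."""
    n, K = len(x), len(k)
    y = np.zeros((n - 1) * stride + K)
    for i, xi in enumerate(x):
        y[i * stride : i * stride + K] += xi * k
    return y

x = np.array([1.0, 2.0, 3.0, 4.0])
k = np.array([0.5, 1.0, 0.25])
assert np.allclose(transposed_conv_zero_insert(x, k, 2),
                   transposed_conv_zero_skip(x, k, 2))

The scatter form performs only len(x) * len(k) multiply-accumulates, which is the same saving the hardware technique targets when it skips the inserted zeros.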
\n\n\n
\n \n\n \n \n \n \n \n Deep Learning for Vertex Reconstruction of Neutrino-nucleus Interaction Events with Combined Energy and Time Data.\n \n \n \n\n\n \n Linghao Song; Fan Chen; Steven R. Young; Catherine D. Schuman; Gabriel N. Perdue; and Thomas E. Potok.\n\n\n \n\n\n\n In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pages 3882–3886, 2019. IEEE\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@INPROCEEDINGS{ICASSP2019, \r\n  author    = {Linghao Song and\r\n               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Steven R. Young and\r\n               Catherine D. Schuman and\r\n               Gabriel N. Perdue and\r\n               Thomas E. Potok},\r\n  title     = {Deep Learning for Vertex Reconstruction of Neutrino-nucleus Interaction\r\n               Events with Combined Energy and Time Data},\r\n  booktitle = {{IEEE} International Conference on Acoustics, Speech and Signal Processing (ICASSP)},\r\n  pages     = {3882--3886},\r\n  publisher = {{IEEE}},\r\n  year      = {2019},\r\n  doi       = {10.1109/ICASSP.2019.8683736},\r\n  abstract  = {We present a deep learning approach for vertex reconstruction of neutrino-nucleus interaction events, a problem in the domain of high energy physics. In this approach, we combine both energy and timing data that are collected in the MINERvA detector to perform classification and regression tasks. We show that the resulting network achieves higher accuracy than previous results while requiring a smaller model size and less training time. In particular, the proposed model outperforms the state-of-the-art by 4.00\\% on classification accuracy. For the regression task, our model achieves a coefficient of determination of 0.9919, higher than the previous work (0.96).},\r\n}\r\n\r\n% 2018\r\n
\n
\n\n\n
\n We present a deep learning approach for vertex reconstruction of neutrino-nucleus interaction events, a problem in the domain of high energy physics. In this approach, we combine both energy and timing data that are collected in the MINERvA detector to perform classification and regression tasks. We show that the resulting network achieves higher accuracy than previous results while requiring a smaller model size and less training time. In particular, the proposed model outperforms the state-of-the-art by 4.00% on classification accuracy. For the regression task, our model achieves a coefficient of determination of 0.9919, higher than the previous work (0.96).\n
\n\n\n
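As a rough illustration of the "combined energy and time data" idea, the PyTorch sketch below builds a toy two-branch network with a classification head and a regression head. All layer sizes, the number of vertex classes, and the input shapes are invented for illustration; this is not the paper's MINERvA architecture.

import torch
import torch.nn as nn

class VertexNet(nn.Module):
    """Toy two-branch model: one branch per input modality (energy, time);
    the shared features feed a classification head and a regression head."""
    def __init__(self, num_classes=11):
        super().__init__()
        def branch():
            return nn.Sequential(
                nn.Conv2d(1, 8, kernel_size=3, padding=1), nn.ReLU(),
                nn.AdaptiveAvgPool2d(4), nn.Flatten())
        self.energy_branch = branch()
        self.time_branch = branch()
        feat = 2 * 8 * 4 * 4
        self.cls_head = nn.Linear(feat, num_classes)  # which detector segment
        self.reg_head = nn.Linear(feat, 1)            # continuous vertex coordinate

    def forward(self, energy, time):
        h = torch.cat([self.energy_branch(energy), self.time_branch(time)], dim=1)
        return self.cls_head(h), self.reg_head(h)

energy = torch.randn(2, 1, 64, 64)   # dummy energy "images"
time = torch.randn(2, 1, 64, 64)     # dummy timing "images"
logits, vertex = VertexNet()(energy, time)
print(logits.shape, vertex.shape)    # torch.Size([2, 11]) torch.Size([2, 1])

Training such a model with a joint loss (cross-entropy plus mean-squared error) is the usual way to serve both the classification and regression tasks the abstract mentions.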
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2018\n \n \n (6)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n EMAT: an efficient multi-task architecture for transfer learning using ReRAM.\n \n \n \n\n\n \n Fan Chen; and Hai Li.\n\n\n \n\n\n\n In Proceedings of the International Conference on Computer-Aided Design (ICCAD), pages 33, 2018. ACM\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{ICCAD18,\r\n  author    = {{<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Hai Li},\r\n  title     = {{EMAT:} an efficient multi-task architecture for transfer learning\r\n               using ReRAM},\r\n  booktitle = {Proceedings of the International Conference on Computer-Aided Design (ICCAD)},\r\n  pages     = {33},\r\n  publisher = {{ACM}},\r\n  year      = {2018},\r\n  doi       = {10.1145/3240765.3240805},\r\n  abstract  = {Transfer learning has recently demonstrated great success in general supervised learning by mitigating expensive training efforts. However, existing neural network accelerators have been proven inefficient in executing transfer learning because they fail to accommodate the layer-wise heterogeneity in computation and memory requirements. In this work, we propose EMAT, an efficient multi-task architecture for transfer learning built on resistive memory (ReRAM) technology. EMAT utilizes the energy efficiency of ReRAM arrays for matrix-vector multiplication and realizes a hierarchical reconfigurable design with heterogeneous computation components to incorporate the data patterns in transfer learning. Compared to the GPU platform, EMAT achieves an average 120x performance speedup and 87x energy saving. EMAT also obtains a 2.5x speedup compared to the state-of-the-art CMOS accelerator.},\r\n} \r\n\r\n
\n
\n\n\n
\n Transfer learning has recently demonstrated great success in general supervised learning by mitigating expensive training efforts. However, existing neural network accelerators have been proven inefficient in executing transfer learning because they fail to accommodate the layer-wise heterogeneity in computation and memory requirements. In this work, we propose EMAT, an efficient multi-task architecture for transfer learning built on resistive memory (ReRAM) technology. EMAT utilizes the energy efficiency of ReRAM arrays for matrix-vector multiplication and realizes a hierarchical reconfigurable design with heterogeneous computation components to incorporate the data patterns in transfer learning. Compared to the GPU platform, EMAT achieves an average 120x performance speedup and 87x energy saving. EMAT also obtains a 2.5x speedup compared to the state-of-the-art CMOS accelerator.\n
\n\n\n
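The ReRAM matrix-vector multiplication that EMAT (and the other PIM designs listed here) relies on can be sketched numerically: weights become conductances, input activations become word-line voltages, and each bit line sums currents. The quantization level count and array size below are assumptions for illustration only, not EMAT's device parameters.

import numpy as np

rng = np.random.default_rng(0)
W = rng.standard_normal((64, 32))          # layer weights (rows -> bit lines)
x = rng.standard_normal(32)                # input activations (word-line voltages)

# Map weights onto a small number of conductance levels (idealized ReRAM cells).
levels = 16
g_max = np.abs(W).max()
G = np.round(W / g_max * (levels - 1)) / (levels - 1) * g_max

# Analog MVM: with voltages x on the word lines, each bit line integrates
# sum_j G[i, j] * x[j] as a current (Kirchhoff's current law).
i_out = G @ x

print("max quantization error:", np.abs(i_out - W @ x).max())

The error printed at the end is the analog-quantization cost of storing weights as a few conductance levels, which is one reason these accelerators report accuracy alongside speedup and energy numbers.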
\n\n\n
\n \n\n \n \n \n \n \n Design and Data Management for Magnetic Racetrack Memory.\n \n \n \n\n\n \n Bing Li; Fan Chen; Wang Kang; Weisheng Zhao; Yiran Chen; and Hai Li.\n\n\n \n\n\n\n In IEEE International Symposium on Circuits and Systems (ISCAS), pages 1–4, 2018. IEEE\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@INPROCEEDINGS{ISCAS18, \r\n  author    = {Bing Li and\r\n               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Wang Kang and\r\n               Weisheng Zhao and\r\n               Yiran Chen and\r\n               Hai Li},\r\n  title     = {Design and Data Management for Magnetic Racetrack Memory},\r\n  booktitle = {{IEEE} International Symposium on Circuits and Systems (ISCAS)},\r\n  pages     = {1--4},\r\n  publisher = {{IEEE}},\r\n  year      = {2018},\r\n  doi       = {10.1109/ISCAS.2018.8351681},\r\n  abstract  = {Benefiting from its ultra-high storage density, high energy efficiency, and non-volatility, racetrack memory demonstrates great potential in replacing conventional SRAM as large on-chip memory. Integrating the tape-like racetrack memory, however, faces unique design challenges from cell structure to architecture design. This paper reviews some cross-layer design methodologies for racetrack memory as on-chip cache hierarchy. Research studies show that with proper architectural design and data management, racetrack memory can achieve significant area reduction, system performance enhancement, and energy saving compared to state-of-the-art memory technologies.},\r\n}\r\n\r\n
\n
\n\n\n
\n Benefiting from its ultra-high storage density, high energy efficiency, and non-volatility, racetrack memory demonstrates great potential in replacing conventional SRAM as large on-chip memory. Integrating the tape-like racetrack memory, however, faces unique design challenges from cell structure to architecture design. This paper reviews some cross-layer design methodologies for racetrack memory as on-chip cache hierarchy. Research studies show that with proper architectural design and data management, racetrack memory can achieve significant area reduction, system performance enhancement, and energy saving compared to state-of-the-art memory technologies.\n
\n\n\n
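A back-of-envelope model of why data management matters for racetrack memory: each track stores many domains but has only a few access ports, so a read must first shift the requested domain under a port. The toy Python model below (port positions, track length, and access trace are all made up) counts shifts for a naive placement versus a placement that keeps frequently used blocks near a port.

import numpy as np

rng = np.random.default_rng(1)
track_len, ports = 64, [0, 32]            # 64 domains, access ports at positions 0 and 32

def shifts(position):
    """Shifts needed to align a domain with the nearest access port."""
    return min(abs(position - p) for p in ports)

# Skewed access pattern: a few logical blocks are much hotter than the rest.
trace = rng.zipf(2.0, size=2000) % track_len

naive = {blk: blk for blk in range(track_len)}            # block b stored at domain b
rank = {blk: r for r, blk in                              # rank blocks by access count
        enumerate(sorted(range(track_len),
                         key=lambda b: -np.count_nonzero(trace == b)))}
domain_order = sorted(range(track_len), key=shifts)       # domains ordered by port distance
hot_near_port = {blk: domain_order[r] for blk, r in rank.items()}

for name, mapping in [("naive", naive), ("hot-near-port", hot_near_port)]:
    total = sum(shifts(mapping[blk]) for blk in trace)
    print(f"{name:14s} total shifts: {total}")

Pairing the hottest blocks with the domains closest to a port minimizes the weighted shift count, which is the flavor of data-placement policy the surveyed cross-layer designs exploit.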
\n\n\n
\n \n\n \n \n \n \n \n Exploring the opportunity of implementing neuromorphic computing systems with spintronic devices.\n \n \n \n\n\n \n Bonan Yan; Fan Chen; Yaojun Zhang; Chang Song; Hai Li; and Yiran Chen.\n\n\n \n\n\n\n In Design, Automation Test in Europe Conference Exhibition (DATE), pages 109–112, 2018. IEEE\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@INPROCEEDINGS{DATE18dw, \r\n  author    = {Bonan Yan and\r\n               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Yaojun Zhang and\r\n               Chang Song and\r\n               Hai Li and\r\n               Yiran Chen},\r\n  title     = {Exploring the opportunity of implementing neuromorphic computing systems\r\n               with spintronic devices},\r\n  booktitle = {Design, Automation Test in Europe Conference Exhibition (DATE)}, \r\n  pages     = {109--112},\r\n  publisher = {{IEEE}},\r\n  year      = {2018},\r\n  doi       = {10.23919/DATE.2018.8341988},\r\n  abstract  = {Many cognitive algorithms such as neural networks cannot be efficiently executed by von Neumann architectures, the performance of which is constrained by the memory wall between microprocessor and memory hierarchy. Hence, researchers started to investigate new computing paradigms such as neuromorphic computing that can adapt their structure to the topology of the algorithms and accelerate their execution. New computing units have also been invented to support this effort by leveraging emerging nano-devices. In this work, we will discuss the opportunity of implementing neuromorphic computing systems with spintronic devices. We will also provide insights on how spintronic devices fit into different parts of neuromorphic computing systems. Approaches to optimize the circuits are also discussed.},\r\n}\r\n\r\n
\n
\n\n\n
\n Many cognitive algorithms such as neural networks cannot be efficiently executed by von Neumann architectures, the performance of which is constrained by the memory wall between microprocessor and memory hierarchy. Hence, researchers started to investigate new computing paradigms such as neuromorphic computing that can adapt their structure to the topology of the algorithms and accelerate their execution. New computing units have also been invented to support this effort by leveraging emerging nano-devices. In this work, we will discuss the opportunity of implementing neuromorphic computing systems with spintronic devices. We will also provide insights on how spintronic devices fit into different parts of neuromorphic computing systems. Approaches to optimize the circuits are also discussed.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n ReRAM-based accelerator for deep learning.\n \n \n \n\n\n \n Bing Li; Linghao Song; Fan Chen; Xuehai Qian; Yiran Chen; and Hai Helen Li.\n\n\n \n\n\n\n In Design, Automation Test in Europe Conference Exhibition (DATE), pages 815–820, 2018. IEEE\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@INPROCEEDINGS{DATE18gan, \r\n  author    = {Bing Li and\r\n               Linghao Song and\r\n               {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Xuehai Qian and\r\n               Yiran Chen and\r\n               Hai Helen Li},\r\n  title     = {ReRAM-based accelerator for deep learning},\r\n  booktitle = {Design, Automation Test in Europe Conference Exhibition (DATE)}, \r\n  pages     = {815--820},\r\n  publisher = {{IEEE}},\r\n  year      = {2018},\r\n  doi       = {10.23919/DATE.2018.8342118},\r\n  abstract  = {Big data computing applications such as deep learning and graph analytics usually incur a large amount of data movement. Deploying such applications on a conventional von Neumann architecture that separates the processing units and memory components likely leads to a performance bottleneck due to the limited memory bandwidth. A common approach is to develop architecture and memory co-design methodologies to overcome the challenge. Our research follows the same strategy by leveraging resistive memory (ReRAM) to further enhance performance and energy efficiency. Specifically, we employ the general principles behind processing-in-memory to design efficient ReRAM based accelerators that support both testing and training operations. Related circuit and architecture optimizations are discussed as well.},\r\n} \r\n\r\n
\n
\n\n\n
\n Big data computing applications such as deep learning and graph analytics usually incur a large amount of data movement. Deploying such applications on a conventional von Neumann architecture that separates the processing units and memory components likely leads to a performance bottleneck due to the limited memory bandwidth. A common approach is to develop architecture and memory co-design methodologies to overcome the challenge. Our research follows the same strategy by leveraging resistive memory (ReRAM) to further enhance performance and energy efficiency. Specifically, we employ the general principles behind processing-in-memory to design efficient ReRAM based accelerators that support both testing and training operations. Related circuit and architecture optimizations are discussed as well.\n
\n\n\n
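A quick way to see why processing-in-memory helps the data-movement problem described above: for one fully connected layer, a von Neumann accelerator must stream the whole weight matrix from off-chip memory, while a ReRAM crossbar keeps the weights in place and only moves activations. The layer size and datatype below are arbitrary assumptions used only for the arithmetic, and the estimate ignores on-chip weight reuse across a batch; it is only meant to show the scale of the gap.

# Back-of-envelope traffic for one fully connected layer (assumed 4096 x 4096, fp16).
rows, cols, bytes_per_val = 4096, 4096, 2

weights = rows * cols * bytes_per_val          # streamed from DRAM each pass (von Neumann)
activations = (cols + rows) * bytes_per_val    # inputs in, outputs out (both designs)

von_neumann = weights + activations
pim = activations                              # weights stay resident in the ReRAM arrays

print(f"von Neumann traffic: {von_neumann / 1e6:.1f} MB per inference")
print(f"PIM traffic:         {pim / 1e3:.1f} KB per inference "
      f"({von_neumann / pim:.0f}x less data moved)")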
\n\n\n
\n \n\n \n \n \n \n \n Process variation aware data management for magnetic skyrmions racetrack memory.\n \n \n \n\n\n \n Fan Chen; Zheng Li; Wang Kang; Weisheng Zhao; Hai Li; and Yiran Chen.\n\n\n \n\n\n\n In 23rd Asia and South Pacific Design Automation Conference (ASP-DAC), pages 221–226, 2018. IEEE\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{ASPDAC18dw,\r\n  author    = {{<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Zheng Li and\r\n               Wang Kang and\r\n               Weisheng Zhao and\r\n               Hai Li and\r\n               Yiran Chen},\r\n  title     = {Process variation aware data management for magnetic skyrmions racetrack\r\n               memory},\r\n  booktitle = {23rd Asia and South Pacific Design Automation Conference (ASP-DAC)},\r\n  pages     = {221--226},\r\n  publisher = {{IEEE}},\r\n  year      = {2018},\r\n  doi       = {10.1109/ASPDAC.2018.8297309},\r\n  abstract  = {Skyrmions racetrack memory (SKM) has been identified as a promising candidate for future on-chip cache. Similar to many other nanoscale technologies, process variations also adversely impact the reliability and performance of SKM cache. In this work, we propose the first holistic solution for employing SKM as last-level caches. We first present a novel SKM cache architecture and a physical-to-logic mapping scheme based on our comprehensive analysis on working mechanism of SKM. We then model the impact of process variations on SKM cache performance. By leveraging the developed model, we propose a process variation aware data management technique to minimize the performance degradation of SKM cache incurred by process variations. Experimental results show that the proposed SKM cache can achieve a geometric mean of 1.28x IPC improvement, 2x density increase, and 23\\% energy reduction compared to Domain Wall racetrack memory (DWM) under the same area constraint across 15 workloads. In addition, our dynamic data management technique can further improve the system IPC by 25\\% w.r.t. the worst-case design.},\r\n} \r\n\r\n
\n
\n\n\n
\n Skyrmions racetrack memory (SKM) has been identified as a promising candidate for future on-chip cache. Similar to many other nanoscale technologies, process variations also adversely impact the reliability and performance of SKM cache. In this work, we propose the first holistic solution for employing SKM as last-level caches. We first present a novel SKM cache architecture and a physical-to-logic mapping scheme based on our comprehensive analysis on working mechanism of SKM. We then model the impact of process variations on SKM cache performance. By leveraging the developed model, we propose a process variation aware data management technique to minimize the performance degradation of SKM cache incurred by process variations. Experimental results show that the proposed SKM cache can achieve a geometric mean of 1.28x IPC improvement, 2x density increase, and 23% energy reduction compared to Domain Wall racetrack memory (DWM) under the same area constraint across 15 workloads. In addition, our dynamic data management technique can further improve the system IPC by 25% w.r.t. the worst-case design.\n
\n\n\n
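To make the "process variation aware data management" idea concrete, the Python sketch below assumes each track's shift latency is drawn from a lognormal distribution (a stand-in for process variation; the distribution and sizes are not from the paper) and compares total access latency when hot data is placed arbitrarily versus on the fastest tracks.

import numpy as np

rng = np.random.default_rng(2)
num_tracks = 128
# Per-track shift latency in cycles, spread by process variation (assumed model).
latency = rng.lognormal(mean=1.0, sigma=0.4, size=num_tracks)

# Per-logical-block access counts with a skewed (hot/cold) profile.
accesses = np.sort(rng.zipf(1.5, size=num_tracks))[::-1].astype(float)

baseline = float(accesses @ latency)          # blocks assigned to tracks in arbitrary order
aware = float(accesses @ np.sort(latency))    # hottest blocks mapped to the fastest tracks

print(f"variation-oblivious latency: {baseline:.0f} cycles")
print(f"variation-aware latency:     {aware:.0f} cycles "
      f"({100 * (1 - aware / baseline):.1f}% lower)")

Sorting accesses downward against latencies upward is the optimal pairing (rearrangement inequality), which is the intuition behind steering hot cache blocks away from slow, variation-degraded tracks.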
\n\n\n
\n \n\n \n \n \n \n \n ReGAN: A pipelined ReRAM-based accelerator for generative adversarial networks.\n \n \n \n\n\n \n Fan Chen; Linghao Song; and Yiran Chen.\n\n\n \n\n\n\n In 23rd Asia and South Pacific Design Automation Conference (ASP-DAC), pages 178–183, 2018. IEEE\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@INPROCEEDINGS{ASPDAC18gan, \r\n  author    = {{<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and\r\n               Linghao Song and\r\n               Yiran Chen},\r\n  title     = {ReGAN: {A} pipelined ReRAM-based accelerator for generative adversarial\r\n               networks},\r\n  booktitle = {23rd Asia and South Pacific Design Automation Conference (ASP-DAC)},\r\n  pages     = {178--183},\r\n  publisher = {{IEEE}},\r\n  year      = {2018},\r\n  doi       = {10.1109/ASPDAC.2018.8297302},\r\n  abstract = {Generative Adversarial Networks (GANs) have recently drawn tremendous attention in many artificial intelligence (AI) applications including computer vision, speech recognition, and natural language processing. While GANs deliver state-of-the-art performance on these AI tasks, this comes at the cost of high computational complexity. Although recent progress has demonstrated the promise of using ReRAM-based Process-In-Memory for the acceleration of convolutional neural networks (CNNs) with low energy cost, the unique training process required by GANs makes them difficult to run on existing neural network acceleration platforms: two competing networks are simultaneously co-trained in GANs, significantly increasing the need for memory and computation resources. In this work, we propose ReGAN, a novel ReRAM-based Process-In-Memory accelerator that can efficiently reduce off-chip memory accesses. Moreover, ReGAN greatly increases system throughput by pipelining the layer-wise computation. Two techniques, namely Spatial Parallelism and Computation Sharing, are proposed to further enhance the training efficiency of GANs. Our experimental results show that ReGAN achieves an average 240x performance speedup compared to the GPU platform, with an average energy saving of 94x.}\r\n}\r\n\r\n\r\n%\r\n
\n
\n\n\n
\n Generative Adversarial Networks (GANs) have recently drawn tremendous attention in many artificial intelligence (AI) applications including computer vision, speech recognition, and natural language processing. While GANs deliver state-of-the-art performance on these AI tasks, this comes at the cost of high computational complexity. Although recent progress has demonstrated the promise of using ReRAM-based Process-In-Memory for the acceleration of convolutional neural networks (CNNs) with low energy cost, the unique training process required by GANs makes them difficult to run on existing neural network acceleration platforms: two competing networks are simultaneously co-trained in GANs, significantly increasing the need for memory and computation resources. In this work, we propose ReGAN, a novel ReRAM-based Process-In-Memory accelerator that can efficiently reduce off-chip memory accesses. Moreover, ReGAN greatly increases system throughput by pipelining the layer-wise computation. Two techniques, namely Spatial Parallelism and Computation Sharing, are proposed to further enhance the training efficiency of GANs. Our experimental results show that ReGAN achieves an average 240x performance speedup compared to the GPU platform, with an average energy saving of 94x.\n
\n\n\n
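The throughput argument behind ReGAN's layer-wise pipelining can be checked with a two-line model: with L pipeline stages of roughly equal delay and B mini-batches in flight, a sequential schedule costs about B*L stage-times while a pipelined one costs about L + B - 1. This idealized Python model ignores ReGAN's spatial parallelism and computation sharing and assumes perfectly balanced stages.

def sequential_cycles(num_layers: int, num_batches: int, stage_time: int = 1) -> int:
    # Each batch walks through every layer before the next batch starts.
    return num_batches * num_layers * stage_time

def pipelined_cycles(num_layers: int, num_batches: int, stage_time: int = 1) -> int:
    # After the pipeline fills (num_layers stages), one batch completes per stage time.
    return (num_layers + num_batches - 1) * stage_time

L, B = 8, 64   # assumed: 8 layer-stages, 64 mini-batches
print("sequential:", sequential_cycles(L, B))   # 512
print("pipelined: ", pipelined_cycles(L, B))    # 71
print("speedup:    %.1fx" % (sequential_cycles(L, B) / pipelined_cycles(L, B)))

The benefit grows with the number of batches kept in flight, which is why pipelining pays off for the long, repetitive training loops of GANs.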
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2013\n \n \n (1)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n A reconfigurable multi-mode multi-band transmitter with integrated frequency synthesizer for short-range wireless communication.\n \n \n \n \n\n\n \n Nan Qi; Fan Chen; Lingwei Zhang; Xiaoman Wang; and Baoyong Chi.\n\n\n \n\n\n\n Journal of Semiconductors, 34(9): 095008. sep 2013.\n \n\n\n\n
\n\n\n\n \n \n \"APaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@article{nan2013reconfigurable,\r\n\tdoi = {10.1088/1674-4926/34/9/095008},\r\n\turl = {https://doi.org/10.1088/1674-4926/34/9/095008},\r\n\tyear = 2013,\r\n\tmonth = {sep},\r\n\tpublisher = {{IOP} Publishing},\r\n\tvolume = {34},\r\n\tnumber = {9},\r\n\tpages = {095008},\r\n\tauthor = {Nan Qi and {<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and Lingwei Zhang and Xiaoman Wang and Baoyong Chi},\r\n\ttitle = {A reconfigurable multi-mode multi-band transmitter with integrated frequency synthesizer for short-range wireless communication},\r\n\tjournal = {Journal of Semiconductors},\r\n\tabstract = {A reconfigurable multi-mode direct-conversion transmitter (TX) with integrated frequency synthesizer (FS) is presented. The TX as well as the FS is designed with a flexible architecture and frequency plan, which helps to support all the 433/868/915 MHz ISM band signals, with the reconfigurable bandwidth from 250 kHz to 2 MHz. In order to save power and chip area, only one 1.8 GHz VCO is adopted to cover the whole frequency range. All the operation modes can be regulated in real time by configuring the integrated register-bank through an SPI interface. Implemented in 180 nm CMOS, the FS achieves a frequency coverage of 320–460 MHz and 620–920 MHz. The lowest phase noise can be −107 dBc/Hz at a 100 kHz offset and −126 dBc/Hz at a 1 MHz offset. The transmitter features a + 10.2 dBm peak output power with a +9.5 dBm 1-dB-compression point and 250 kHz/500 kHz/1 MHz/2 MHz reconfigurable signal bandwidth.}\r\n}\r\n\r\n
\n
\n\n\n
\n A reconfigurable multi-mode direct-conversion transmitter (TX) with integrated frequency synthesizer (FS) is presented. The TX as well as the FS is designed with a flexible architecture and frequency plan, which helps to support all the 433/868/915 MHz ISM band signals, with the reconfigurable bandwidth from 250 kHz to 2 MHz. In order to save power and chip area, only one 1.8 GHz VCO is adopted to cover the whole frequency range. All the operation modes can be regulated in real time by configuring the integrated register-bank through an SPI interface. Implemented in 180 nm CMOS, the FS achieves a frequency coverage of 320–460 MHz and 620–920 MHz. The lowest phase noise can be −107 dBc/Hz at a 100 kHz offset and −126 dBc/Hz at a 1 MHz offset. The transmitter features a + 10.2 dBm peak output power with a +9.5 dBm 1-dB-compression point and 250 kHz/500 kHz/1 MHz/2 MHz reconfigurable signal bandwidth.\n
\n\n\n
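One plausible reading of how a single 1.8 GHz-class VCO covers both output ranges quoted above is to tune the VCO over roughly 1.24-1.84 GHz and derive the bands with divide-by-2 and divide-by-4 stages. The divider ratios and VCO tuning range in this Python check are assumptions for illustration, not taken from the paper.

vco_lo, vco_hi = 1.24e9, 1.84e9          # assumed VCO tuning range (Hz)

for div, label in [(2, "high band"), (4, "low band")]:
    lo, hi = vco_lo / div, vco_hi / div
    print(f"/{div} ({label}): {lo/1e6:.0f}-{hi/1e6:.0f} MHz")
# /2 (high band): 620-920 MHz  -> matches the reported 620-920 MHz coverage
# /4 (low band):  310-460 MHz  -> close to the reported 320-460 MHz coverage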
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2009\n \n \n (1)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n A 2.4 GHz wireless transmitter front-end for endoscopy capsule system.\n \n \n \n\n\n \n Fan Chen; Baoyong Chi; and Zhihua Wang.\n\n\n \n\n\n\n In 2009 IEEE 8th International Conference on ASIC, pages 465–468, 2009. IEEE\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{chen20092,\r\n  title={A 2.4 GHz wireless transmitter front-end for endoscopy capsule system},\r\n  author={{<a href="https://homes.luddy.indiana.edu/fc7/" target="_blank">Fan Chen</a></span>} and Baoyong Chi and Zhihua Wang},\r\n  booktitle={2009 IEEE 8th International Conference on ASIC},\r\n  pages={465--468},\r\n  year={2009},\r\n  organization={IEEE},\r\n  doi={10.1109/ASICON.2009.5351340},\r\n  abstract = {A 2.4 GHz wireless transmitter front-end for the endoscopy capsule system is presented. The transmitter front-end consists of an IF module (ASK modulator, FSK modulator) and a direct-conversion transmitter RF front-end (double-balanced up-converter, power amplifier). The power amplifier (PA) has digitally programmable output power levels and operates from a 3.3 V power supply, while the other blocks operate from a 1.8 V power supply. The transmitter front-end is implemented in a UMC 0.18 um CMOS process. Its maximum output power into a 50 Ω load reaches 10 dBm at a data rate of 2 Mbps, while drawing 7.09 mA from the 1.8 V supply and 25 mA from the 3.3 V supply.}\r\n}\r\n
\n
\n\n\n
\n A 2.4 GHz wireless transmitter front-end for the endoscopy capsule system is presented. The transmitter front-end consists of an IF module (ASK modulator, FSK modulator) and a direct-conversion transmitter RF front-end (double-balanced up-converter, power amplifier). The power amplifier (PA) has digitally programmable output power levels and operates from a 3.3 V power supply, while the other blocks operate from a 1.8 V power supply. The transmitter front-end is implemented in a UMC 0.18 um CMOS process. Its maximum output power into a 50 Ω load reaches 10 dBm at a data rate of 2 Mbps, while drawing 7.09 mA from the 1.8 V supply and 25 mA from the 3.3 V supply.\n
\n\n\n
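The supply-current figures in this abstract imply a simple power budget, sketched below. Treating the 3.3 V rail as feeding mainly the PA is an assumption; the Python arithmetic just combines the numbers quoted above.

i_core, v_core = 7.09e-3, 1.8             # IF/RF front-end current and supply
i_pa, v_pa = 25e-3, 3.3                   # PA current and supply (assumed PA-dominated rail)
p_out_dbm = 10.0                          # reported maximum output power

p_dc = i_core * v_core + i_pa * v_pa                  # total DC power
p_out = 1e-3 * 10 ** (p_out_dbm / 10)                 # 10 dBm -> 10 mW
print(f"total DC power:     {p_dc * 1e3:.1f} mW")     # ~95.3 mW
print(f"RF output power:    {p_out * 1e3:.1f} mW")
print(f"overall efficiency: {100 * p_out / p_dc:.1f}%")   # ~10.5%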
\n\n\n\n\n\n
\n
\n\n\n\n\n
\n\n\n \n\n \n \n \n \n\n
\n"}; document.write(bibbase_data.data);