var bibbase_data = {"data":"\"Loading..\"\n\n
\n\n \n\n \n\n \n \n\n \n\n \n \n\n \n\n \n
\n generated by\n \n \"bibbase.org\"\n\n \n
\n \n\n
\n\n \n\n\n
\n\n Excellent! Next you can\n create a new website with this list, or\n embed it in an existing web page by copying & pasting\n any of the following snippets.\n\n
\n JavaScript\n (easiest)\n
\n \n <script src=\"https://bibbase.org/show?bib=https%3A%2F%2Fwww.seas.upenn.edu%2F%7Ejaneli%2Ffiles%2Fpublications.bib&jsonp=1&jsonp=1\"></script>\n \n
\n\n PHP\n
\n \n <?php\n $contents = file_get_contents(\"https://bibbase.org/show?bib=https%3A%2F%2Fwww.seas.upenn.edu%2F%7Ejaneli%2Ffiles%2Fpublications.bib&jsonp=1\");\n print_r($contents);\n ?>\n \n
\n\n iFrame\n (not recommended)\n
\n \n <iframe src=\"https://bibbase.org/show?bib=https%3A%2F%2Fwww.seas.upenn.edu%2F%7Ejaneli%2Ffiles%2Fpublications.bib&jsonp=1\"></iframe>\n \n
\n\n

\n For more details see the documention.\n

\n
\n
\n\n
\n\n This is a preview! To use this list on your own web site\n or create a new web site from it,\n create a free account. The file will be added\n and you will be able to edit it in the File Manager.\n We will show you instructions once you've created your account.\n
\n\n
\n\n

To the site owner:

\n\n

Action required! Mendeley is changing its\n API. In order to keep using Mendeley with BibBase past April\n 14th, you need to:\n

    \n
  1. renew the authorization for BibBase on Mendeley, and
  2. \n
  3. update the BibBase URL\n in your page the same way you did when you initially set up\n this page.\n
  4. \n
\n

\n\n

\n \n \n Fix it now\n

\n
\n\n
\n\n\n
\n \n \n
\n
\n  \n 2020\n \n \n (3)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Liquid Silicon: A Nonvolatile Fully Programmable Processing-In-Memory Processor with Monolithically Integrated ReRAM for Big Data/Machine Learning Applications (invited).\n \n \n \n \n\n\n \n ZhaS, Y.; Nowak, E.; and Li, J.\n\n\n \n\n\n\n IEEE Journal of Solid-State Circuits (JSSC), 55(4): 908–919. 2020.\n \n\n\n\n
\n\n\n\n \n \n \"LiquidPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 21 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@ARTICLE{zha2020jssc, \nauthor={Zha<sup>S</sup>, Yue and Nowak, Etienne and Li, Jing}, \njournal={IEEE Journal of Solid-State Circuits (<strong>JSSC</strong>)}, \ntitle={{Liquid Silicon}: A Nonvolatile Fully Programmable Processing-In-Memory Processor with Monolithically Integrated {ReRAM} for {Big Data/Machine Learning} Applications (<strong>invited</strong>)}, \nabstract = {The slowdown of the CMOS technology scaling, and the trade-off between efficiency and flexibility have fueled the exploration into novel architectures with emerging post-CMOS technology [e.g., resistive-RAM (RRAM)]. In this article, a nonvolatile fully programmable processing-in-memory (PIM) processor named Liquid Silicon is demonstrated, which combines the superior programmability of general-purpose computing devices [e.g., field-programmable gate array (FPGA)] and the high efficiency of domain-specific accelerators. Besides the general computing applications, Liquid Silicon is particularly well suited for artificial intelligence (AI)/machine learning and big data applications, which not only poses high computational/memory demand but also evolves rapidly. To fabricate the Liquid Silicon chip, the HfO 2 RRAM is monolithically integrated on top of the commercial 130 nm CMOS. Our measurement confirms that Liquid Silicon chip can operate reliably at a low voltage of 650 mV. It achieves 60.9 TOPS/W in performing neural network (NN) inferences, and 480 GOPS/W in performing content-based similarity search (a key big data application) at a nominal voltage supply of 1.2 V, showing 3x and 100x improvement over the state-of-the-art domain-specific CMOS-/RRAM-based accelerators. 
In addition, it outperforms the latest nonvolatile FPGA in energy efficiency by 3x in general computing applications.},\n year = {2020},\n volume = {55},\n number = {4},\n pages = {908--919},\n url = {https://doi.org/10.1109/JSSC.2019.2963005},\n doi = {10.1109/JSSC.2019.2963005}, \n keywords = {journal, sj, Liquid Silicon}\n}\n\n
\n
\n\n\n
\n The slowdown of the CMOS technology scaling, and the trade-off between efficiency and flexibility have fueled the exploration into novel architectures with emerging post-CMOS technology [e.g., resistive-RAM (RRAM)]. In this article, a nonvolatile fully programmable processing-in-memory (PIM) processor named Liquid Silicon is demonstrated, which combines the superior programmability of general-purpose computing devices [e.g., field-programmable gate array (FPGA)] and the high efficiency of domain-specific accelerators. Besides the general computing applications, Liquid Silicon is particularly well suited for artificial intelligence (AI)/machine learning and big data applications, which not only poses high computational/memory demand but also evolves rapidly. To fabricate the Liquid Silicon chip, the HfO 2 RRAM is monolithically integrated on top of the commercial 130 nm CMOS. Our measurement confirms that Liquid Silicon chip can operate reliably at a low voltage of 650 mV. It achieves 60.9 TOPS/W in performing neural network (NN) inferences, and 480 GOPS/W in performing content-based similarity search (a key big data application) at a nominal voltage supply of 1.2 V, showing 3x and 100x improvement over the state-of-the-art domain-specific CMOS-/RRAM-based accelerators. In addition, it outperforms the latest nonvolatile FPGA in energy efficiency by 3x in general computing applications.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Hyper-AP: Enhancing Associative Processing Through A Full-Stack Optimization.\n \n \n \n\n\n \n ZhaS, Y.; and Li, J.\n\n\n \n\n\n\n In 2020 ACM/IEEE 45th Annual International Symposium on Computer Architecture, of ISCA '20, 2020. IEEE\n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{zha2020hyperAP,\n  title={{Hyper-AP: Enhancing Associative Processing Through A Full-Stack Optimization}},\n  author={Zha<sup>S</sup>, Yue and Li, Jing},\n  booktitle={2020 ACM/IEEE 45th Annual International Symposium on Computer Architecture},\n  series = {<strong>ISCA</strong> '20},\n  year={2020},\n  %pubstate = {upcoming},\n  organization={IEEE},\n  keywords={conference}\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n ViTAL: Virtualizing FPGAs in the Cloud.\n \n \n \n \n\n\n \n ZhaS, Y.; and Li, J.\n\n\n \n\n\n\n In the 24th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, of ASPLOS '20, pages 845–858, New York, NY, USA, Mar 2020. Association for Computing Machinery\n \n\n\n\n
\n\n\n\n \n \n \"ViTAL:Paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 18 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{zha2020asplos,\n author = {Zha<sup>S</sup>, Yue and Li, Jing},\n title = {{ViTAL: Virtualizing FPGAs in the Cloud}},\n booktitle = {the 24th ACM International Conference on Architectural Support for Programming Languages and Operating Systems},\n series = {<strong>ASPLOS</strong> '20},\n publisher = {Association for Computing Machinery},\n address = {New York, NY, USA},\n url = {https://doi.org/10.1145/3373376.3378491},\n doi = {10.1145/3373376.3378491},\n date={2020-03-16},\n year = {2020},\n month = {Mar},\n pages = {845–858},\n numpages = {14},\n abstract={Field-Programmable Gate Arrays (FPGAs) have been integrated into the cloud infrastructure to enhance its computing performance by supporting on-demand acceleration. However, system support for FPGAs in the context of the cloud environment is still in its infancy with two major limitations, i.e., the inefficient runtime management due to the tight coupling between compilation and resource allocation, and the high programming complexity when exploiting scale-out acceleration. The root cause is that FPGA resources are not virtualized. In this paper, we propose a full-stack solution, namely ViTAL, to address the aforementioned limitations by virtualizing FPGA resources. Specifically, ViTAL provides a homogeneous abstraction to decouple the compilation and resource allocation. Applications are offline compiled onto the abstraction, while the resource allocation is dynamically determined at runtime. Enabled by a latency-insensitive communication interface, applications can be mapped flexibly onto either one FPGA or multiple FPGAs to maximize the resource utilization and the aggregated system throughput. Meanwhile, ViTAL creates an illusion of a single and large FPGA to users, thereby reducing the programming complexity and supporting scale-out acceleration. 
Moreover, ViTAL also provides virtualization support for peripheral components (e.g., on-board DRAM and Ethernet), as well as protection and isolation support to ensure a secure execution in the multi-user cloud environment. We evaluate ViTAL on a real system - an FPGA cluster composed of the latest Xilinx UltraScale+ FPGAs (XCVU37P). The results show that, compared with the existing management method, ViTAL enables fine-grained resource sharing and reduces the response time by 82% on average (improving Quality-of-Service) with a marginal virtualization overhead. Moreover, ViTAL also reduces the response time by 25% compared to AmorphOS (operating in high-throughput mode), a recently proposed FPGA virtualization method.},\n note = {},\n location = {Lausanne, Switzerland},\n keywords = {conference, FPGA, virtualization, field-programmable gate arrays, cloud computing, system abstraction, compilation framework, scale-out acceleration}\n }\n\n
\n
\n\n\n
\n Field-Programmable Gate Arrays (FPGAs) have been integrated into the cloud infrastructure to enhance its computing performance by supporting on-demand acceleration. However, system support for FPGAs in the context of the cloud environment is still in its infancy with two major limitations, i.e., the inefficient runtime management due to the tight coupling between compilation and resource allocation, and the high programming complexity when exploiting scale-out acceleration. The root cause is that FPGA resources are not virtualized. In this paper, we propose a full-stack solution, namely ViTAL, to address the aforementioned limitations by virtualizing FPGA resources. Specifically, ViTAL provides a homogeneous abstraction to decouple the compilation and resource allocation. Applications are offline compiled onto the abstraction, while the resource allocation is dynamically determined at runtime. Enabled by a latency-insensitive communication interface, applications can be mapped flexibly onto either one FPGA or multiple FPGAs to maximize the resource utilization and the aggregated system throughput. Meanwhile, ViTAL creates an illusion of a single and large FPGA to users, thereby reducing the programming complexity and supporting scale-out acceleration. Moreover, ViTAL also provides virtualization support for peripheral components (e.g., on-board DRAM and Ethernet), as well as protection and isolation support to ensure a secure execution in the multi-user cloud environment. We evaluate ViTAL on a real system - an FPGA cluster composed of the latest Xilinx UltraScale+ FPGAs (XCVU37P). The results show that, compared with the existing management method, ViTAL enables fine-grained resource sharing and reduces the response time by 82% on average (improving Quality-of-Service) with a marginal virtualization overhead. 
Moreover, ViTAL also reduces the response time by 25% compared to AmorphOS (operating in high-throughput mode), a recently proposed FPGA virtualization method.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2019\n \n \n (7)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n TOCO: A Framework for Compressing Neural Network Models Based on Tolerance Analysis.\n \n \n \n \n\n\n \n KhoramS, S.; and Li, J.\n\n\n \n\n\n\n 2019.\n \n\n\n\n
\n\n\n\n \n \n \"TOCO:Paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@misc{Khoram2019toco,\n    title={{TOCO: A Framework for Compressing Neural Network Models Based on Tolerance Analysis}},\n    author={Soroosh Khoram<sup>S</sup> and Jing Li},\n    abstract={Neural network compression methods have enabled deploying large models on emerging edge devices with little cost, by adapting already-trained models to the constraints of these devices. The rapid development of AI-capable edge devices with limited computation and storage requires streamlined methodologies that can efficiently satisfy the constraints of different devices. In contrast, existing methods often rely on heuristic and manual adjustments to maintain accuracy, support only coarse compression policies, or target specific device constraints that limit their applicability. We address these limitations by proposing the TOlerance-based COmpression (TOCO) framework. TOCO uses an in-depth analysis of the model, to maintain the accuracy, in an active learning system. The results of the analysis are tolerances that can be used to perform compression in a fine-grained manner. Finally, by decoupling compression from the tolerance analysis, TOCO allows flexibility to changes in the hardware.},\n    year={2019},\n    date={2019-12-18},\n    eprint={1912.08792},\n    archivePrefix={arXiv},\n    primaryClass={cs.LG},\n    pubstate={preprint},\n    url={https://arxiv.org/abs/1912.08792},\n    keywords={whitepaper}\n}\n\n
\n
\n\n\n
\n Neural network compression methods have enabled deploying large models on emerging edge devices with little cost, by adapting already-trained models to the constraints of these devices. The rapid development of AI-capable edge devices with limited computation and storage requires streamlined methodologies that can efficiently satisfy the constraints of different devices. In contrast, existing methods often rely on heuristic and manual adjustments to maintain accuracy, support only coarse compression policies, or target specific device constraints that limit their applicability. We address these limitations by proposing the TOlerance-based COmpression (TOCO) framework. TOCO uses an in-depth analysis of the model, to maintain the accuracy, in an active learning system. The results of the analysis are tolerances that can be used to perform compression in a fine-grained manner. Finally, by decoupling compression from the tolerance analysis, TOCO allows flexibility to changes in the hardware.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Interleaved Composite Quantization for High-Dimensional Similarity Search.\n \n \n \n \n\n\n \n KhoramS, S.; Wright, S. J; and Li, J.\n\n\n \n\n\n\n 2019.\n \n\n\n\n
\n\n\n\n \n \n \"InterleavedPaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@misc{Khoram2019interleaved,\n    title={{Interleaved Composite Quantization for High-Dimensional Similarity Search}},\n    author={Soroosh Khoram<sup>S</sup> and Stephen J Wright and Jing Li},\n    abstract={Similarity search retrieves the nearest neighbors of a query vector from a dataset of high-dimensional vectors. As the size of the dataset grows, the cost of performing the distance computations needed to implement a query can become prohibitive. A method often used to reduce this computational cost is quantization of the vector space and location-based encoding of the dataset vectors. These encodings can be used during query processing to find approximate nearest neighbors of the query point quickly. Search speed can be improved by using shorter codes, but shorter codes have higher quantization error, leading to degraded precision. In this work, we propose the Interleaved Composite Quantization (ICQ) which achieves fast similarity search without using shorter codes. In ICQ, a small subset of the code is used to approximate the distances, with complete codes being used only when necessary. Our method effectively reduces both code length and quantization error. Furthermore, ICQ is compatible with several recently proposed techniques for reducing quantization error and can be used in conjunction with these other techniques to improve results. We confirm these claims and show strong empirical performance of ICQ using several synthetic and real-word datasets.},\n    year={2019},\n    date={2019-12-18},\n    eprint={1912.08756},\n    archivePrefix={arXiv},\n    primaryClass={cs.LG},\n    pubstate={preprint},\n    url={https://arxiv.org/abs/1912.08756},\n    keywords={whitepaper}\n}\n\n
\n
\n\n\n
\n Similarity search retrieves the nearest neighbors of a query vector from a dataset of high-dimensional vectors. As the size of the dataset grows, the cost of performing the distance computations needed to implement a query can become prohibitive. A method often used to reduce this computational cost is quantization of the vector space and location-based encoding of the dataset vectors. These encodings can be used during query processing to find approximate nearest neighbors of the query point quickly. Search speed can be improved by using shorter codes, but shorter codes have higher quantization error, leading to degraded precision. In this work, we propose the Interleaved Composite Quantization (ICQ) which achieves fast similarity search without using shorter codes. In ICQ, a small subset of the code is used to approximate the distances, with complete codes being used only when necessary. Our method effectively reduces both code length and quantization error. Furthermore, ICQ is compatible with several recently proposed techniques for reducing quantization error and can be used in conjunction with these other techniques to improve results. We confirm these claims and show strong empirical performance of ICQ using several synthetic and real-word datasets.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n MLSys: The New Frontier of Machine Learning Systems.\n \n \n \n \n\n\n \n Ratner, A.; Alistarh, D.; Alonso, G.; Andersen, D. G.; Bailis, P.; Bird, S.; Carlini, N.; Catanzaro, B.; Chayes, J.; Chung, E.; Dally, B.; Dean, J.; Dhillon, I. S.; Dimakis, A.; Dubey, P.; Elkan, C.; Fursin, G.; Ganger, G. R.; Getoor, L.; Gibbons, P. B.; Gibson, G. A.; Gonzalez, J. E.; Gottschlich, J.; Han, S.; Hazelwood, K.; Huang, F.; Jaggi, M.; Jamieson, K.; Jordan, M. I.; Joshi, G.; Khalaf, R.; Knight, J.; Konečný, J.; Kraska, T.; Kumar, A.; Kyrillidis, A.; Lakshmiratan, A.; Li, J.; Madden, S.; McMahan, H. B.; Meijer, E.; Mitliagkas, I.; Monga, R.; Murray, D.; Olukotun, K.; Papailiopoulos, D.; Pekhimenko, G.; Rekatsinas, T.; Rostamizadeh, A.; Ré, C.; Sa, C. D.; Sedghi, H.; Sen, S.; Smith, V.; Smola, A.; Song, D.; Sparks, E.; Stoica, I.; Sze, V.; Udell, M.; Vanschoren, J.; Venkataraman, S.; Vinayak, R.; Weimer, M.; Wilson, A. G.; Xing, E.; Zaharia, M.; Zhang, C.; and Talwalkar, A.\n\n\n \n\n\n\n 2019.\n \n\n\n\n
\n\n\n\n \n \n \"MLSys:Paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 6 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@misc{ratner2019mlsys,\n    title={{MLSys: The New Frontier of Machine Learning Systems}},\n    author={Alexander Ratner and Dan Alistarh and Gustavo Alonso and David G. Andersen and Peter Bailis and Sarah Bird and Nicholas Carlini and Bryan Catanzaro and Jennifer Chayes and Eric Chung and Bill Dally and Jeff Dean and Inderjit S. Dhillon and Alexandros Dimakis and Pradeep Dubey and Charles Elkan and Grigori Fursin and Gregory R. Ganger and Lise Getoor and Phillip B. Gibbons and Garth A. Gibson and Joseph E. Gonzalez and Justin Gottschlich and Song Han and Kim Hazelwood and Furong Huang and Martin Jaggi and Kevin Jamieson and Michael I. Jordan and Gauri Joshi and Rania Khalaf and Jason Knight and Jakub Konečný and Tim Kraska and Arun Kumar and Anastasios Kyrillidis and Aparna Lakshmiratan and Jing Li and Samuel Madden and H. Brendan McMahan and Erik Meijer and Ioannis Mitliagkas and Rajat Monga and Derek Murray and Kunle Olukotun and Dimitris Papailiopoulos and Gennady Pekhimenko and Theodoros Rekatsinas and Afshin Rostamizadeh and Christopher Ré and Christopher De Sa and Hanie Sedghi and Siddhartha Sen and Virginia Smith and Alex Smola and Dawn Song and Evan Sparks and Ion Stoica and Vivienne Sze and Madeleine Udell and Joaquin Vanschoren and Shivaram Venkataraman and Rashmi Vinayak and Markus Weimer and Andrew Gordon Wilson and Eric Xing and Matei Zaharia and Ce Zhang and Ameet Talwalkar},\n    abstract={Machine learning (ML) techniques are enjoying rapidly increasing adoption. However, designing and implementing the systems that support ML models in real-world deployments remains a significant obstacle, in large part due to the radically different development and deployment profile of modern ML methods, and the range of practical concerns that come with broader adoption. 
We propose to foster a new systems machine learning research community at the intersection of the traditional systems and ML communities, focused on topics such as hardware systems for ML, software systems for ML, and ML optimized for metrics beyond predictive accuracy. To do this, we describe a new conference, MLSys, that explicitly targets research at the intersection of systems and machine learning with a program committee split evenly between experts in systems and ML, and an explicit focus on topics at the intersection of the two.},\n    year={2019},\n    date={2019-03-29},\n    eprint={1904.03257},\n    archivePrefix={arXiv},\n    primaryClass={cs.LG},\n    pubstate={preprint},\n    url={http://arxiv.org/abs/1904.03257},\n\tabstract = {Machine learning (ML) techniques are enjoying rapidly increasing adoption. However, designing and implementing the systems that support ML models in real-world deployments remains a significant obstacle, in large part due to the radically different development and deployment profile of modern ML methods, and the range of practical concerns that come with broader adoption. We propose to foster a new systems machine learning research community at the intersection of the traditional systems and ML communities, focused on topics such as hardware systems for ML, software systems for ML, and ML optimized for metrics beyond predictive accuracy. To do this, we describe a new conference, SysML, that explicitly targets research at the intersection of systems and machine learning with a program committee split evenly between experts in systems and ML, and an explicit focus on topics at the intersection of the two.},\n    keywords={whitepaper}\n}\n\n\n%%%%%%%%%% Referred Journal %%%%%%%%%%\n
\n
\n\n\n
\n Machine learning (ML) techniques are enjoying rapidly increasing adoption. However, designing and implementing the systems that support ML models in real-world deployments remains a significant obstacle, in large part due to the radically different development and deployment profile of modern ML methods, and the range of practical concerns that come with broader adoption. We propose to foster a new systems machine learning research community at the intersection of the traditional systems and ML communities, focused on topics such as hardware systems for ML, software systems for ML, and ML optimized for metrics beyond predictive accuracy. To do this, we describe a new conference, SysML, that explicitly targets research at the intersection of systems and machine learning with a program committee split evenly between experts in systems and ML, and an explicit focus on topics at the intersection of the two.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Nb1-xO2 based Universal Selector with Ultra-high Endurance (>1012), high speed (10ns) and Excellent Vth Stability.\n \n \n \n\n\n \n Luo, Q.; Yu, J.; Zhang, X.; Xue, K.; Cheng, Y.; Gong, T.; Lv, H.; Xu, X.; Yuan, P.; Yin, J.; Tai, L.; Long, S.; Liu, Q.; Li, J.; and Liu, M.\n\n\n \n\n\n\n In 2019 IEEE Symposium on VLSI Technology, Jun 2019. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{luo2019vlsit,\n author = {Luo, Qing and Yu, Jie and Zhang, Xumeng and Xue, Kan-Hao and Cheng, Yan and Gong, Tiancheng and Lv, Hangbing and Xu, Xiaoxin and Yuan, Peng and Yin, Jiahao and Tai, Lu and Long, Shibing and Liu, Qi and Li, Jing and Liu, Ming},\n title = {Nb<sub>1-x</sub>O<sub>2</sub> based Universal Selector with Ultra-high Endurance (>10<sup>12</sup>), high speed (10ns) and Excellent V<sub>th</sub> Stability},\n booktitle = {2019 IEEE Symposium on VLSI Technology},\n year = {2019},\n date={2019-06-09},\n month={Jun},\n %pubstate = {forthcoming},\n note = {},\n doi={10.23919/VLSIT.2019.8776546},\n keywords = {conference}\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Liquid Silicon: A Nonvolatile Fully Programmable Processing-In-Memory Processor with Monolithically Integrated ReRAM for Big Data/Machine Learning Applications.\n \n \n \n\n\n \n ZhaS, Y.; Nowak, E.; and Li, J.\n\n\n \n\n\n\n In 2019 IEEE Symposium on VLSI Circuits, Jun 2019. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{zha2019vlsic,\n author = {Zha<sup>S</sup>, Yue and Nowak, Etienne and Li, Jing},\n title = {{Liquid Silicon}: A Nonvolatile Fully Programmable Processing-In-Memory Processor with Monolithically Integrated {ReRAM} for {Big Data/Machine Learning} Applications},\n booktitle = {2019 IEEE Symposium on VLSI Circuits},\n year = {2019},\n month = {Jun},\n date={2019-06-09},\n %pubstate = {forthcoming},\n note = {},\n doi={10.23919/VLSIC.2019.8778064},\n abstract={A nonvolatile fully programmable processing-in-memory (PIM) processor named Liquid Silicon (L-Si) is demonstrated, which combines the superior programmability of general-purpose computing devices (e.g. FPGA) and the high power efficiency of do-main-specific accelerators. Besides the general computing applications, L-Si is particularly well suited for AI/machine learning and big data applications, which not only pose high computational/memory demand but also evolves rapidly. L-Si is fabricated by monolithically integrating HfO 2 resistive RAM on top of commercial 130nm Si CMOS. Our measurement confirmed the fabricated chip operates reliably at low voltage of 650 mV. It achieves 60.9 TOPS/W in performing neural network inferences and 480 GOPS/W in performing content-based similarity search (a key big data application) at nominal voltage supply of 1.2V, showing >3× and ∼100× power efficiency improvement over the state-of-the-art domain-specific CMOS-/RRAM-based accelerators. In addition, it outperforms the latest nonvolatile FPGA in energy efficiency by ∼3× in general compute-intensive applications.},\n keywords = {conference}\n}\n\n
\n
\n\n\n
\n A nonvolatile fully programmable processing-in-memory (PIM) processor named Liquid Silicon (L-Si) is demonstrated, which combines the superior programmability of general-purpose computing devices (e.g. FPGA) and the high power efficiency of do-main-specific accelerators. Besides the general computing applications, L-Si is particularly well suited for AI/machine learning and big data applications, which not only pose high computational/memory demand but also evolves rapidly. L-Si is fabricated by monolithically integrating HfO 2 resistive RAM on top of commercial 130nm Si CMOS. Our measurement confirmed the fabricated chip operates reliably at low voltage of 650 mV. It achieves 60.9 TOPS/W in performing neural network inferences and 480 GOPS/W in performing content-based similarity search (a key big data application) at nominal voltage supply of 1.2V, showing >3× and ∼100× power efficiency improvement over the state-of-the-art domain-specific CMOS-/RRAM-based accelerators. In addition, it outperforms the latest nonvolatile FPGA in energy efficiency by ∼3× in general compute-intensive applications.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n MEG: A RISCV-based system simulation infrastructure for exploring memory optimization using FPGAs and Hybrid Memory Cube (Best Paper Nominee).\n \n \n \n\n\n \n ZhangS, J.; LiuS, Y.; JainS, G.; ZhaS, Y.; TaS, J.; and Li, J.\n\n\n \n\n\n\n In 2019 IEEE 27th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM), April 2019. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 6 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{zhang2019fccm,\n author = {Zhang<sup>S</sup>, Jialiang and Liu<sup>S</sup>, Yang and Jain<sup>S</sup>, Gaurav and Zha<sup>S</sup>, Yue and Ta<sup>S</sup>, Jonathan and Li, Jing},\n title = {{MEG}: {A RISCV}-based system simulation infrastructure for exploring memory optimization using {FPGAs} and {Hybrid Memory Cube} (<strong>Best Paper Nominee</strong>)},\n booktitle = {2019 IEEE 27th Annual International Symposium on Field-Programmable Custom Computing Machines (<strong>FCCM</strong>)},\n year = {2019},\n month = {April},\n date={2019-04-28},\n doi={10.1109/FCCM.2019.00029},\n abstract={Emerging 3D memory technologies, such as the Hybrid Memory Cube (HMC) and High Bandwidth Memory (HBM), provide increased bandwidth and massive memory-level parallelism. Efficiently integrating emerging memories into existing system pose new challenges and require detailed evaluation in a real computing environment. In this paper, we propose MEG, an open-source, configurable, cycle-exact, and RISC-V based full system simulation infrastructure using FPGA and HMC. MEG has three highly configurable design components: (i) a HMC adaptation module that not only enables communication between the HMC device and the processor cores but also can be extended to fit other memories (e.g., HBM, nonvolatile memory) with minimal effort, (ii) a reconfigurable memory controller along with its OS support that can be effectively leveraged by system designers to perform software-hardware co-optimization, and (iii) a performance monitor module that effectively improves the observability and debuggability of the system to guide performance optimization. We provide a prototype implementation of MEG on Xilinx VCU110 board and demonstrate its capability, fidelity, and flexibility on real-world benchmark applications. 
We hope that our open-source release of MEG fills a gap in the space of publicly-available FPGA-based full system simulation infrastructures specifically targeting memory system and inspires further collaborative software/hardware innovations.},\n %pubstate = {forthcoming},\n note = {},\n keywords = {conference}\n}\n\n
\n
\n\n\n
\n Emerging 3D memory technologies, such as the Hybrid Memory Cube (HMC) and High Bandwidth Memory (HBM), provide increased bandwidth and massive memory-level parallelism. Efficiently integrating emerging memories into existing system pose new challenges and require detailed evaluation in a real computing environment. In this paper, we propose MEG, an open-source, configurable, cycle-exact, and RISC-V based full system simulation infrastructure using FPGA and HMC. MEG has three highly configurable design components: (i) a HMC adaptation module that not only enables communication between the HMC device and the processor cores but also can be extended to fit other memories (e.g., HBM, nonvolatile memory) with minimal effort, (ii) a reconfigurable memory controller along with its OS support that can be effectively leveraged by system designers to perform software-hardware co-optimization, and (iii) a performance monitor module that effectively improves the observability and debuggability of the system to guide performance optimization. We provide a prototype implementation of MEG on Xilinx VCU110 board and demonstrate its capability, fidelity, and flexibility on real-world benchmark applications. We hope that our open-source release of MEG fills a gap in the space of publicly-available FPGA-based full system simulation infrastructures specifically targeting memory system and inspires further collaborative software/hardware innovations.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Unleashing the Power of Soft Logic for Convolutional Neural Network Acceleration via Product Quantization (Poster).\n \n \n \n\n\n \n ZhangS, J.; and Li, J.\n\n\n \n\n\n\n In the 2019 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays, of FPGA '19, Feb 2019. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{zhang2019fpga,\n author = {Zhang<sup>S</sup>, Jialiang and Li, Jing},\n title = {{Unleashing the Power of Soft Logic for Convolutional Neural Network Acceleration via Product Quantization} (Poster)},\n booktitle = {the 2019 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},\n  series = {<strong>FPGA</strong> '19},\n year = {2019},\n month = {Feb},\n date={2019-02-24},\n doi={10.1145/3289602.3293951},\n %pubstate = {forthcoming},\n note = {},\n keywords = {conference}\n}\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2018\n \n \n (11)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n Computing Generalized Matrix Inverse on Spiking Neural Substrate.\n \n \n \n\n\n \n Shukla, R.; KhoramS, S.; Jorgensen, E.; Li, J.; Lipasti, M.; and Wright, S.\n\n\n \n\n\n\n Frontiers in neuroscience: Neuromorphic engineering, 12: 115. Feb 2018.\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{shukla2018frontiers,\ntitle = {Computing Generalized Matrix Inverse on Spiking Neural Substrate},\nauthor = {Rohit Shukla and Soroosh Khoram<sup>S</sup> and Erik Jorgensen and Jing Li and Mikko Lipasti and Stephen Wright},\nyear = {2018},\njournal = {Frontiers in neuroscience: Neuromorphic engineering},\n%pubstate = {forthcoming},\nvolume={12},\n%number={},\npages={115},\nyear={2018},\nmonth={Feb},\ndate={2018-02-14},\ndoi={10.3389/fnins.2018.00115},\nabstract={Emerging neural hardware substrates, such as IBM's TrueNorth Neurosynaptic System, can provide an appealing platform for deploying numerical algorithms. For example, a recurrent Hopfield neural network can be used to find the Moore-Penrose generalized inverse of a matrix, thus enabling a broad class of linear optimizations to be solved efficiently, at low energy cost. However, deploying numerical algorithms on hardware platforms that severely limit the range and precision of representation for numeric quantities can be quite challenging. This paper discusses these challenges and proposes a rigorous mathematical framework for reasoning about range and precision on such substrates. The paper derives techniques for normalizing inputs and properly quantizing synaptic weights originating from arbitrary systems of linear equations, so that solvers for those systems can be implemented in a provably correct manner on hardware-constrained neural substrates. The analytical model is empirically validated on the IBM TrueNorth platform, and results show that the guarantees provided by the framework for range and precision hold under experimental conditions. Experiments with optical flow demonstrate the energy benefits of deploying a reduced-precision and energy-efficient generalized matrix inverse engine on the IBM TrueNorth platform, reflecting 10× to 100× improvement over FPGA and ARM core baselines.},\nISSN={1662-453X}, \nkeywords={journal}\n}\n\n
\n
\n\n\n
\n Emerging neural hardware substrates, such as IBM's TrueNorth Neurosynaptic System, can provide an appealing platform for deploying numerical algorithms. For example, a recurrent Hopfield neural network can be used to find the Moore-Penrose generalized inverse of a matrix, thus enabling a broad class of linear optimizations to be solved efficiently, at low energy cost. However, deploying numerical algorithms on hardware platforms that severely limit the range and precision of representation for numeric quantities can be quite challenging. This paper discusses these challenges and proposes a rigorous mathematical framework for reasoning about range and precision on such substrates. The paper derives techniques for normalizing inputs and properly quantizing synaptic weights originating from arbitrary systems of linear equations, so that solvers for those systems can be implemented in a provably correct manner on hardware-constrained neural substrates. The analytical model is empirically validated on the IBM TrueNorth platform, and results show that the guarantees provided by the framework for range and precision hold under experimental conditions. Experiments with optical flow demonstrate the energy benefits of deploying a reduced-precision and energy-efficient generalized matrix inverse engine on the IBM TrueNorth platform, reflecting 10× to 100× improvement over FPGA and ARM core baselines.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Specialization: A New Path towards Low Power (invited).\n \n \n \n\n\n \n ZhaS, Y.; and Li, J.\n\n\n \n\n\n\n ASP Journal of Low Power Electronics, 2018, 14(2). 2018.\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{zha2018JOLPE,\ntitle = {Specialization: A New Path towards Low Power (<strong>invited</strong>)},\nauthor = {Yue Zha<sup>S</sup> and Jing Li},\nyear = {2018},\ndate = {2018-02-15},\njournal = {ASP Journal of Low Power Electronics, 2018},\nvolume = {14},\nnumber = {2},\n%pubstate = {forthcoming},\ntppubtype = {article},\ndoi={10.1166/jolpe.2018.1559},\nkeywords={journal}\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n An Alternative Analytical Approach to Associative Processing (Best of CAL).\n \n \n \n\n\n \n KhoramS, S.; ZhaS, Y.; and Li, J.\n\n\n \n\n\n\n IEEE Computer Architecture Letters, 17(2): 113-116. July 2018.\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@ARTICLE{Khoram2018CAL, \nauthor={Khoram<sup>S</sup>, Soroosh and Zha<sup>S</sup>, Yue and Li, Jing}, \njournal={IEEE Computer Architecture Letters}, \ntitle={An Alternative Analytical Approach to Associative Processing (<strong>Best of CAL</strong>)}, \nyear={2018}, \nmonth={July},\ndate={2018-01-03},\nvolume={17}, \nnumber={2}, \npages={113-116}, \nabstract={Associative Processing (AP) is a promising alternative to the Von Neumann model as it addresses the memory wall problem through its inherent in-memory computations. However, because of the countless design parameter choices, comparisons between implementations of two so radically different models are challenging for simulation-based methods. To tackle these challenges, we develop an alternative analytical approach based on a new concept called architecturally-determined complexity. Using this method, we asymptotically evaluate the runtime/storage/energy bounds of the two models, i.e., AP and Von Neumann. We further apply the method to gain more insights into the performance bottlenecks of traditional AP and develop a new machine model named Two Dimensional AP to address these limitations. Finally, we experimentally validate our analytical method and confirm that the simulation results match our theoretical projections.},\nkeywords={journal, Analytical models,Complexity theory,Computational modeling,Computer architecture,Parallel processing,Runtime,Two dimensional displays,Analysis of Algorithms and Problem Complexity,Associative Processors,Modeling techniques,Models of Computation}, \ndoi={10.1109/LCA.2018.2789424}, \nISSN={1556-6056}, \n}\n\n
\n
\n\n\n
\n Associative Processing (AP) is a promising alternative to the Von Neumann model as it addresses the memory wall problem through its inherent in-memory computations. However, because of the countless design parameter choices, comparisons between implementations of two so radically different models are challenging for simulation-based methods. To tackle these challenges, we develop an alternative analytical approach based on a new concept called architecturally-determined complexity. Using this method, we asymptotically evaluate the runtime/storage/energy bounds of the two models, i.e., AP and Von Neumann. We further apply the method to gain more insights into the performance bottlenecks of traditional AP and develop a new machine model named Two Dimensional AP to address these limitations. Finally, we experimentally validate our analytical method and confirm that the simulation results match our theoretical projections.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n PQ-CNN: Accelerating Product Quantized Convolutional Neural Network (Poster).\n \n \n \n\n\n \n ZhangS, J.; and Li, J.\n\n\n \n\n\n\n In 2018 IEEE 26th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM), April 2018. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{zhang2018fccm,\n author = {Zhang<sup>S</sup>, Jialiang and Li, Jing},\n title = {{PQ-CNN}: {Accelerating} Product Quantized Convolutional Neural Network (Poster)},\n booktitle = {2018 IEEE 26th Annual International Symposium on Field-Programmable Custom Computing Machines (<strong>FCCM</strong>)},\n year = {2018},\n month = {April},\n date={2018-04-29},\n doi={10.1109/FCCM.2018.00041},\n %pubstate = {forthcoming},\n note = {},\n keywords = {conference}\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Efficient Large-scale Approximate Nearest Neighbor Search on the OpenCL-FPGA.\n \n \n \n\n\n \n ZhangS, J.; KhoramS, S.; and Li, J.\n\n\n \n\n\n\n In Conference on Computer Vision and Pattern Recognition (CVPR), pages 4924–4932, Jun 2018. \n (Acceptance Rate: 29%, 979 out of over 3300)\n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{zhang2018cvpr,\n author = {Zhang<sup>S</sup>, Jialiang and Khoram<sup>S</sup>, Soroosh and Li, Jing},\n title = {Efficient Large-scale Approximate Nearest Neighbor Search on the {OpenCL-FPGA}},\n booktitle = {Conference on Computer Vision and Pattern Recognition (<strong>CVPR</strong>)},\n year = {2018},\n month={Jun},\n date={2018-06},\n %pubstate = {forthcoming},\n abstract={We present a new method for Product Quantization (PQ) based approximated nearest neighbor search (ANN) in high dimensional spaces. Specifically, we first propose a quantization scheme for the codebook of coarse quantizer, product quantizer, and rotation matrix, to reduce the cost of accessing these codebooks. Our approach also combines a highly parallel k-selection method, which can be fused with the distance calculation to reduce the memory overhead. We implement the proposed method on Intel HARPv2 platform using OpenCL-FPGA. The proposed method significantly outperforms state-of-the-art methods on CPU and GPU for high dimensional nearest neighbor queries on billion-scale datasets in terms of query time and accuracy regardless of the batch size. To our best knowledge, this is the first work to demonstrate FPGA performance superior to CPU and GPU on high-dimensional, large-scale ANN datasets.},\n doi={10.1109/CVPR.2018.00517},\n pages={4924--4932}, \n note = {(Acceptance Rate: <u>29\\%</u>, 979 out of over 3300)},\n keywords = {conference}\n}\n\n
\n
\n\n\n
\n We present a new method for Product Quantization (PQ) based approximated nearest neighbor search (ANN) in high dimensional spaces. Specifically, we first propose a quantization scheme for the codebook of coarse quantizer, product quantizer, and rotation matrix, to reduce the cost of accessing these codebooks. Our approach also combines a highly parallel k-selection method, which can be fused with the distance calculation to reduce the memory overhead. We implement the proposed method on Intel HARPv2 platform using OpenCL-FPGA. The proposed method significantly outperforms state-of-the-art methods on CPU and GPU for high dimensional nearest neighbor queries on billion-scale datasets in terms of query time and accuracy regardless of the batch size. To our best knowledge, this is the first work to demonstrate FPGA performance superior to CPU and GPU on high-dimensional, large-scale ANN datasets.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Adaptive Quantization of Neural Networks.\n \n \n \n \n\n\n \n KhoramS, S.; and Li, J.\n\n\n \n\n\n\n In International Conference on Learning Representations (ICLR), April 2018. \n \n\n\n\n
\n\n\n\n \n \n \"AdaptivePaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{Khoram2018iclr,\n author = {Khoram<sup>S</sup>, Soroosh and Li, Jing},\n title = {Adaptive Quantization of Neural Networks},\n booktitle = {International Conference on Learning Representations (<strong>ICLR</strong>)},\n year = {2018},\n month={April},\n date={2018-04},\n abstract={Despite the state-of-the-art accuracy of Deep Neural Networks (DNN) in various classification problems, their deployment onto resource constrained edge computing devices remains challenging due to their large size and complexity. Several recent studies have reported remarkable results in reducing this complexity through quantization of DNN models. However, these studies usually do not consider the changes in the loss function when performing quantization, nor do they take the different importances of DNN model parameters to the accuracy into account. We address these issues in this paper by proposing a new method, called adaptive quantization, which simplifies a trained DNN model by finding a unique, optimal precision for each network parameter such that the increase in loss is minimized. The optimization problem at the core of this method iteratively uses the loss function gradient to determine an error margin for each parameter and assigns it a precision accordingly. Since this problem uses linear functions, it is computationally cheap and, as we will show, has a closed-form approximate solution. Experiments on MNIST, CIFAR, and SVHN datasets showed that the proposed method can achieve near or better than state-of-the-art reduction in model size with similar error rates. Furthermore, it can achieve compressions close to floating-point model compression methods without loss of accuracy.},\n %pubstate = {forthcoming},\n url={https://openreview.net/forum?id=SyOK1Sg0W},\n keywords = {conference},\n% note = {(Acceptance Rate: <u>34\\%</u>, 314 out of 935)}\n}\n\n
\n
\n\n\n
\n Despite the state-of-the-art accuracy of Deep Neural Networks (DNN) in various classification problems, their deployment onto resource constrained edge computing devices remains challenging due to their large size and complexity. Several recent studies have reported remarkable results in reducing this complexity through quantization of DNN models. However, these studies usually do not consider the changes in the loss function when performing quantization, nor do they take the different importances of DNN model parameters to the accuracy into account. We address these issues in this paper by proposing a new method, called adaptive quantization, which simplifies a trained DNN model by finding a unique, optimal precision for each network parameter such that the increase in loss is minimized. The optimization problem at the core of this method iteratively uses the loss function gradient to determine an error margin for each parameter and assigns it a precision accordingly. Since this problem uses linear functions, it is computationally cheap and, as we will show, has a closed-form approximate solution. Experiments on MNIST, CIFAR, and SVHN datasets showed that the proposed method can achieve near or better than state-of-the-art reduction in model size with similar error rates. Furthermore, it can achieve compressions close to floating-point model compression methods without loss of accuracy.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Nonvolatile Memory Outlook: Technology Driven or Application Driven? (invited).\n \n \n \n\n\n \n Li, J.\n\n\n \n\n\n\n In 2018 China Semiconductor Technology International Conference (CSTIC), pages 1–4, March 2018. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{li2018CSTIC, \nauthor={Li,Jing}, \nbooktitle={2018 China Semiconductor Technology International Conference (CSTIC)}, \ntitle={Nonvolatile Memory Outlook: Technology Driven or Application Driven? (<strong>invited</strong>)},\nyear={2018}, \ndate = {2018-03-12},\nvolume={}, \nnumber={}, \npages={1--4}, \nISSN={}, \nmonth={March},\ndoi={10.1109/CSTIC.2018.8369201},\n%pubstate = {forthcoming},\nkeywords = {conference}\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Liquid Silicon-Monona: A Reconfigurable Memory-Oriented Computing Fabric with Scalable Multi-Context Support.\n \n \n \n \n\n\n \n ZhaS, Y.; and Li, J.\n\n\n \n\n\n\n In 23rd International Conference on Architectural Support for Programming Languages and Operating Systems, volume 53, of ASPLOS '18, pages 214–228, New York, NY, USA, Mar 2018. ACM\n (Acceptance Rate: 18.2%, 56 out of 307)\n\n\n\n
\n\n\n\n \n \n \"LiquidPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{zha2018asplos,\n author = {Zha<sup>S</sup>, Yue and Li, Jing},\n title = {{Liquid Silicon-Monona}: A Reconfigurable Memory-Oriented Computing Fabric with Scalable Multi-Context Support},\n booktitle = {23nd International Conference on Architectural Support for Programming Languages and Operating Systems},\n series = {<strong>ASPLOS</strong> '18},\n year = {2018},\n month={Mar},\n date={2018-03-19},\n location = {Williamsburg, VA, USA},\n pages = {214--228},\n volume={53},\n issue={2},\n url = {http://doi.acm.org/10.1145/3173162.3173167},\n doi = {10.1145/3173162.3173167},\n acmid = {},\n publisher = {ACM},\n address = {New York, NY, USA},\n keywords = {},\n %pubstate = {accepted},\n abstract={With the recent trend of promoting Field-Programmable Gate Arrays (FPGAs) to first-class citizens in accelerating compute-intensive applications in networking, cloud services and artificial intelligence, FPGAs face two major challenges in sustaining competitive advantages in performance and energy efficiency for diverse cloud workloads: 1) limited configuration capability for supporting light-weight computations/on-chip data storage to accelerate emerging search-/data-intensive applications. 2) lack of architectural support to hide reconfiguration overhead for assisting virtualization in a cloud computing environment. In this paper, we propose a reconfigurable memory-oriented computing fabric, namely Liquid Silicon-Monona (L-Si), enabled by emerging nonvolatile memory technology i.e. RRAM, to address these two challenges. Specifically, L-Si addresses the first challenge by virtue of a new architecture comprising a 2D array of physically identical but functionally-configurable building blocks. It, for the first time, extends the configuration capabilities of existing FPGAs from computation to the whole spectrum ranging from computation to data storage. 
It allows users to better customize hardware by flexibly partitioning hardware resources between computation and memory, greatly benefiting emerging search- and data-intensive applications. To address the second challenge, L-Si provides scalable multi-context architectural support to minimize reconfiguration overhead for assisting virtualization. In addition, we provide compiler support to facilitate the programming of applications written in high-level programming languages (e.g. OpenCL) and frameworks (e.g. TensorFlow, MapReduce) while fully exploiting the unique architectural capability of L-Si. Our evaluation results show L-Si achieves 99.6\\% area reduction, 1.43× throughput improvement and 94.0\\% power reduction on search-intensive benchmarks, as compared with the FPGA baseline. For neural network benchmarks, on average, L-Si achieves 52.3× speedup, 113.9× energy reduction and 81\\% area reduction over the FPGA baseline. In addition, the multi-context architecture of L-Si reduces the context switching time to ∼10ns, compared with an off-the-shelf FPGA (∼100ms), greatly facilitating virtualization.},\n keywords = {conference},\n note = {(Acceptance Rate: <u>18.2\\%</u>, 56 out of 307)}\n} \n\n
\n
\n\n\n
\n With the recent trend of promoting Field-Programmable Gate Arrays (FPGAs) to first-class citizens in accelerating compute-intensive applications in networking, cloud services and artificial intelligence, FPGAs face two major challenges in sustaining competitive advantages in performance and energy efficiency for diverse cloud workloads: 1) limited configuration capability for supporting light-weight computations/on-chip data storage to accelerate emerging search-/data-intensive applications. 2) lack of architectural support to hide reconfiguration overhead for assisting virtualization in a cloud computing environment. In this paper, we propose a reconfigurable memory-oriented computing fabric, namely Liquid Silicon-Monona (L-Si), enabled by emerging nonvolatile memory technology i.e. RRAM, to address these two challenges. Specifically, L-Si addresses the first challenge by virtue of a new architecture comprising a 2D array of physically identical but functionally-configurable building blocks. It, for the first time, extends the configuration capabilities of existing FPGAs from computation to the whole spectrum ranging from computation to data storage. It allows users to better customize hardware by flexibly partitioning hardware resources between computation and memory, greatly benefiting emerging search- and data-intensive applications. To address the second challenge, L-Si provides scalable multi-context architectural support to minimize reconfiguration overhead for assisting virtualization. In addition, we provide compiler support to facilitate the programming of applications written in high-level programming languages (e.g. OpenCL) and frameworks (e.g. TensorFlow, MapReduce) while fully exploiting the unique architectural capability of L-Si. Our evaluation results show L-Si achieves 99.6% area reduction, 1.43× throughput improvement and 94.0% power reduction on search-intensive benchmarks, as compared with the FPGA baseline. 
For neural network benchmarks, on average, L-Si achieves 52.3× speedup, 113.9× energy reduction and 81% area reduction over the FPGA baseline. In addition, the multi-context architecture of L-Si reduces the context switching time to ∼10ns, compared with an off-the-shelf FPGA (∼100ms), greatly facilitating virtualization.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Degree-aware Hybrid Graph Traversal on FPGA-HMC Platform.\n \n \n \n \n\n\n \n ZhangS, J.; and Li, J.\n\n\n \n\n\n\n In Proceedings of the 2018 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays, of FPGA '18, pages 229–238, New York, NY, USA, Feb 2018. ACM\n (Acceptance Rate*: 24%)\n\n\n\n
\n\n\n\n \n \n \"Degree-awarePaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{zhang2018fpga,\n author = {Zhang<sup>S</sup>, Jialiang and Li, Jing},\n title = {Degree-aware Hybrid Graph Traversal on {FPGA-HMC} Platform},\n booktitle = {Proceedings of the 2018 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},\n series = {<strong>FPGA</strong> '18},\n year = {2018},\n month={Feb},\n date={2018-02-25},\n pages = {229--238},\n location = {Monterey, California, USA},\n publisher = {ACM},\n address = {New York, NY, USA},\n url = {http://doi.acm.org/10.1145/3174243.3174245},\n doi = {10.1145/3174243.3174245},\n keywords = {conference, graph processor, hybrid memory cube, bfs},\n abstract={Graph traversal is a core primitive for graph analytics and a basis for many higher-level graph analysis methods. However, irregularities in the structure of scale-free graphs (e.g., social network) limit our ability to analyze these important and growing datasets. A key challenge is the redundant graph computations caused by the presence of high-degree vertices which not only increase the total amount of computations but also incur unnecessary random data access. In this paper, we present a graph processing system on an FPGA-HMC platform, based on software/hardware co-design and co- optimization. For the first time, we leverage the inherent graph property i.e. vertex degree to co-optimize algorithm and hardware architecture. In particular, we first develop two algorithm optimization techniques:degree-aware adjacency list reordering anddegree-aware vertex index sorting. The former can reduce the number of redundant graph computations, while the latter can create a strong correlation between vertex index and data access frequency, which can be effectively applied to guide the hardware design. We further implement the optimized hybrid graph traversal algorithm on an FPGA-HMC platform. 
By leveraging the strong correlation between vertex index and data access frequency made by degree-aware vertex index sorting, we develop two platform-dependent hardware optimization techniques, namely degree-aware data placement and degree-aware adjacency list compression. These two techniques together substantially reduce the amount of access to external memory. Finally, we conduct extensive experiments on an FPGA-HMC platform to verify the effectiveness of the proposed techniques. To the best of our knowledge, our implementation achieves the highest performance (45.8 billion traversed edges per second) among existing FPGA-based graph processing systems.},\n %pubstate = {accepted},\n note = {(Acceptance Rate*: <u>24\\%</u>)}\n} \n\n
\n
\n\n\n
\n Graph traversal is a core primitive for graph analytics and a basis for many higher-level graph analysis methods. However, irregularities in the structure of scale-free graphs (e.g., social network) limit our ability to analyze these important and growing datasets. A key challenge is the redundant graph computations caused by the presence of high-degree vertices which not only increase the total amount of computations but also incur unnecessary random data access. In this paper, we present a graph processing system on an FPGA-HMC platform, based on software/hardware co-design and co-optimization. For the first time, we leverage the inherent graph property i.e. vertex degree to co-optimize algorithm and hardware architecture. In particular, we first develop two algorithm optimization techniques: degree-aware adjacency list reordering and degree-aware vertex index sorting. The former can reduce the number of redundant graph computations, while the latter can create a strong correlation between vertex index and data access frequency, which can be effectively applied to guide the hardware design. We further implement the optimized hybrid graph traversal algorithm on an FPGA-HMC platform. By leveraging the strong correlation between vertex index and data access frequency made by degree-aware vertex index sorting, we develop two platform-dependent hardware optimization techniques, namely degree-aware data placement and degree-aware adjacency list compression. These two techniques together substantially reduce the amount of access to external memory. Finally, we conduct extensive experiments on an FPGA-HMC platform to verify the effectiveness of the proposed techniques. To the best of our knowledge, our implementation achieves the highest performance (45.8 billion traversed edges per second) among existing FPGA-based graph processing systems.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Liquid Silicon: A Data-Centric Reconfigurable Architecture enabled by RRAM Technology.\n \n \n \n \n\n\n \n ZhaS, Y.; and Li, J.\n\n\n \n\n\n\n In Proceedings of the 2018 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays, of FPGA '18, pages 51–60, New York, NY, USA, Feb 2018. ACM\n (Acceptance Rate*: 24%, Ranked #1 among 100+ submissions)\n\n\n\n
\n\n\n\n \n \n \"LiquidPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{zha2018fpga,\n author = {Zha<sup>S</sup>, Yue and Li, Jing},\n title = {{Liquid  Silicon}:  A Data-Centric Reconfigurable Architecture enabled by {RRAM} Technology},\n booktitle = {Proceedings of the 2018 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},\n series = {<strong>FPGA</strong> '18},\n year = {2018},\n month={Feb},\n date={2018-02-25},\n pages = {51--60},\n url = {http://doi.acm.org/10.1145/3174243.3174244},\n doi = {10.1145/3174243.3174244},\n location = {Monterey, California, USA},\n publisher = {ACM},\n address = {New York, NY, USA},\n keywords = {conference, monolithic stacking, non-volatile memory, processing-in-memory, reconfigurable architecture, tcam},\n abstract={This paper presents a data-centric reconfigurable architecture, namely Liquid Silicon, enabled by emerging non-volatile memory, i.e., RRAM. Compared to the heterogeneous architecture of commercial FPGAs, Liquid Silicon is inherently a homogeneous architecture comprising a two-dimensional (2D) array of identical 'tiles'. Each tile can be configured into one or a combination of four modes: TCAM, logic, interconnect, and memory. Such flexibility allows users to partition resources based on applications? needs, in contrast to the fixed hardware design using dedicated hard IP blocks in FPGAs. In addition to better resource usage, its 'memory friendly' architecture effectively addresses the limitations of commercial FPGAs i.e., scarce on-chip memory resources, making it an effective complement to FPGAs. Moreover, its coarse-grained logic implementation results in shallower logic depth, less inter-tile routing overhead, and thus smaller area and better performance, compared with its FPGA counterpart. 
Our study shows that, on average, for both traditional and emerging applications, we achieve 62\\% area reduction, 27\\% speedup and 31\\% improvement in energy efficiency when mapping applications onto Liquid Silicon instead of FPGAs.},\n %pubstate = {accepted},\n note = {(Acceptance Rate*: <u>24\\%</u>, Ranked <strong>\\#1</strong> among 100+ submissions)}\n} \n\n\n
\n
\n\n\n
\n This paper presents a data-centric reconfigurable architecture, namely Liquid Silicon, enabled by emerging non-volatile memory, i.e., RRAM. Compared to the heterogeneous architecture of commercial FPGAs, Liquid Silicon is inherently a homogeneous architecture comprising a two-dimensional (2D) array of identical 'tiles'. Each tile can be configured into one or a combination of four modes: TCAM, logic, interconnect, and memory. Such flexibility allows users to partition resources based on applications' needs, in contrast to the fixed hardware design using dedicated hard IP blocks in FPGAs. In addition to better resource usage, its 'memory friendly' architecture effectively addresses the limitations of commercial FPGAs, i.e., scarce on-chip memory resources, making it an effective complement to FPGAs. Moreover, its coarse-grained logic implementation results in shallower logic depth, less inter-tile routing overhead, and thus smaller area and better performance, compared with its FPGA counterpart. Our study shows that, on average, for both traditional and emerging applications, we achieve 62% area reduction, 27% speedup and 31% improvement in energy efficiency when mapping applications onto Liquid Silicon instead of FPGAs.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Accelerating Graph Analytics By Co-Optimizing Storage and Access on an FPGA-HMC Platform.\n \n \n \n \n\n\n \n KhoramS, S.; ZhangS, J.; StrangeS, M.; and Li, J.\n\n\n \n\n\n\n In Proceedings of the 2018 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays, of FPGA '18, pages 239–248, New York, NY, USA, Feb 2018. ACM\n (Acceptance Rate*: 24%)\n\n\n\n
\n\n\n\n \n \n \"AcceleratingPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{Khoram2018fpga,\n author = {Khoram<sup>S</sup>, Soroosh and Zhang<sup>S</sup>, Jialiang and Strange<sup>S</sup>, Maxwell and Li, Jing},\n title = {Accelerating  Graph  Analytics  By  Co-Optimizing  Storage  and  Access  on  an {FPGA-HMC} Platform},\n booktitle = {Proceedings of the 2018 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},\n series = {<strong>FPGA</strong> '18},\n year = {2018},\n month={Feb},\n date={2018-02-25},\n pages = {239--248},\n url = {http://doi.acm.org/10.1145/3174243.3174260},\n doi = {10.1145/3174243.3174260},\n location = {Monterey, California, USA},\n publisher = {ACM},\n address = {New York, NY, USA},\n abstract={Graph analytics, which explores the relationships among interconnected entities, is becoming increasingly important due to its broad applicability, from machine learning to social sciences. However, due to the irregular data access patterns in graph computations, one major challenge for graph processing systems is performance. The algorithms, softwares, and hardwares that have been tailored for mainstream parallel applications are generally not effective for massive, sparse graphs from the real-world problems, due to their complex and irregular structures. To address the performance issues in large-scale graph analytics, we leverage the exceptional random access performance of the emerging Hybrid Memory Cube (HMC) combined with the flexibility and efficiency of modern FPGAs. In particular, we develop a collaborative software/hardware technique to perform a level-synchronized Breadth First Search (BFS) on a FPGA-HMC platform. From the software perspective, we develop an architecture-aware graph clustering algorithm that exploits the FPGA-HMC platform»s capability to improve data locality and memory access efficiency. 
From the hardware perspective, we further improve the FPGA-HMC graph processor architecture by designing a memory request merging unit to take advantage of the increased data locality resulting from graph clustering. We evaluate the performance of our BFS implementation using the AC-510 development kit from Micron and achieve $2.8 \\times$ average performance improvement compared to the latest FPGA-HMC based graph processing system over a set of benchmarks from a wide range of applications.},\n keywords = {conference, graph analytics, graph clustering, hardware accelerators, hybrid memory cube, reconfigurable logic, bfs},\n %pubstate = {accepted},\n note = {(Acceptance Rate*: <u>24\\%</u>)}\n}\n\n
\n
\n\n\n
\n Graph analytics, which explores the relationships among interconnected entities, is becoming increasingly important due to its broad applicability, from machine learning to social sciences. However, due to the irregular data access patterns in graph computations, one major challenge for graph processing systems is performance. The algorithms, softwares, and hardwares that have been tailored for mainstream parallel applications are generally not effective for massive, sparse graphs from the real-world problems, due to their complex and irregular structures. To address the performance issues in large-scale graph analytics, we leverage the exceptional random access performance of the emerging Hybrid Memory Cube (HMC) combined with the flexibility and efficiency of modern FPGAs. In particular, we develop a collaborative software/hardware technique to perform a level-synchronized Breadth First Search (BFS) on an FPGA-HMC platform. From the software perspective, we develop an architecture-aware graph clustering algorithm that exploits the FPGA-HMC platform's capability to improve data locality and memory access efficiency. From the hardware perspective, we further improve the FPGA-HMC graph processor architecture by designing a memory request merging unit to take advantage of the increased data locality resulting from graph clustering. We evaluate the performance of our BFS implementation using the AC-510 development kit from Micron and achieve 2.8× average performance improvement compared to the latest FPGA-HMC based graph processing system over a set of benchmarks from a wide range of applications.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2017\n \n \n (10)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n CMA: A Reconfigurable Complex Matching Accelerator for Wire-speed Network Intrusion Detection.\n \n \n \n\n\n \n ZhaS, Y.; and Li, J.\n\n\n \n\n\n\n IEEE Computer Architecture Letters, 17(1): 33-36. 2017.\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@ARTICLE{zha2017CALCMA, \nauthor={Yue Zha<sup>S</sup> and Jing Li}, \njournal={IEEE Computer Architecture Letters}, \ntitle={{CMA}: A Reconfigurable Complex Matching Accelerator for Wire-speed Network Intrusion Detection}, \nyear={2017},\ndate={2017-07-03},\nvolume={17}, \nnumber={1}, \npages={33-36}, \nkeywords={journal, Computer architecture,Coprocessors,Encoding,IP networks,Intrusion detection,Ports (Computers),Accelerator,Intrusion Detection,Network Security,ReRAM,TCAM}, \ndoi={10.1109/LCA.2017.2719023}, \nISSN={1556-6056}, \n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n IMEC: A Fully Morphable In-Memory Computing Fabric Enabled by Resistive Crossbar.\n \n \n \n\n\n \n ZhaS, Y.; and Li, J.\n\n\n \n\n\n\n IEEE Computer Architecture Letters, 16(2): 123–126. Feb 2017.\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@ARTICLE{zha2017CALIMEC, \nauthor={Yue Zha<sup>S</sup> and Jing Li}, \njournal={IEEE Computer Architecture Letters}, \ntitle={{IMEC}: A Fully Morphable In-Memory Computing Fabric Enabled by Resistive Crossbar}, \nyear={2017}, \nvolume={16}, \nnumber={2}, \npages={123--126}, \nkeywords={journal, Decoding,Energy efficiency,Field programmable gate arrays,Nonvolatile memory,Program processors,Non-volatile memory,TCAM,energy-efficiency computing,processing-in-memory}, \ndoi={10.1109/LCA.2017.2672558}, \nISSN={1556-6056}, \nmonth={Feb},\ndate={2017-02-22},\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n RRAM-based reconfigurable in-memory computing architecture with hybrid routing.\n \n \n \n\n\n \n ZhaS, Y.; and Li, J.\n\n\n \n\n\n\n In 2017 IEEE/ACM International Conference on Computer-Aided Design (ICCAD), of ICCAD '17, pages 527–532, Nov 2017. \n (Acceptance Rate: 26%, 105 out of 399)\n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{zha2017iccad, \nauthor={Yue Zha<sup>S</sup> and Jing Li}, \nbooktitle={2017 IEEE/ACM International Conference on Computer-Aided Design (ICCAD)}, \nseries = {<strong>ICCAD</strong> '17},\ntitle={{RRAM-based} reconfigurable in-memory computing architecture with hybrid routing}, \nyear={2017}, \nmonth={Nov},\ndate={2017-11-13},\nvolume={}, \nnumber={}, \npages={527--532}, \nkeywords={conference, Architecture,Delays,Field programmable gate arrays,Logic functions,Routing,Switches,Tiles,CAD Framework,Hybrid Routing,In-Memory Computing,Reconfigurable Architecture,liquid Silicon}, \nabstract={Recent advances in resistive random-access memory (RRAM) evoke great interests in exploring alternative architectures. One interesting work is a RRAM-based reconfigurable architecture that provides superior programmbility and blurs the boundary between computation and storage, but long-distance routing becomes a performance bottleneck. However, long-distance routing in FPGA is efficiently implemented, but its fine-grained routing structure results in a large routing overhead. In this work, we present a RRAM-based reconfigurable architecture that addresses the routing challenges using hybrid routing, i.e., local and global routing by taking the best advantages of both architectures (prior RRAM-based and FPGA). We also provide a complete CAD framework that exhibits high parallelism and good scalability. Experimental results show that our reconfigurable architecture outperforms both architectures. It achieves a 46.88\\% reduction in delay and improves the energy efficiency by 66.23\\% compared with the prior RRAM-based architecture with a slightly increased area overhead. While comparing with FPGA, it reduces the delay and the routing overhead by 36.00\\% and 50.20\\%, respectively. 
Additionally, our CAD framework achieves 5.39x speedup, compared with the prior framework.},\ndoi={10.1109/ICCAD.2017.8203822}, \nISSN={1558-2434}, \nnote = {(Acceptance Rate: <u>26\\%</u>, 105 out of 399)},\n}\n\n
\n
\n\n\n
\n Recent advances in resistive random-access memory (RRAM) evoke great interests in exploring alternative architectures. One interesting work is a RRAM-based reconfigurable architecture that provides superior programmability and blurs the boundary between computation and storage, but long-distance routing becomes a performance bottleneck. However, long-distance routing in FPGA is efficiently implemented, but its fine-grained routing structure results in a large routing overhead. In this work, we present a RRAM-based reconfigurable architecture that addresses the routing challenges using hybrid routing, i.e., local and global routing by taking the best advantages of both architectures (prior RRAM-based and FPGA). We also provide a complete CAD framework that exhibits high parallelism and good scalability. Experimental results show that our reconfigurable architecture outperforms both architectures. It achieves a 46.88% reduction in delay and improves the energy efficiency by 66.23% compared with the prior RRAM-based architecture with a slightly increased area overhead. While comparing with FPGA, it reduces the delay and the routing overhead by 36.00% and 50.20%, respectively. Additionally, our CAD framework achieves 5.39x speedup, compared with the prior framework.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Accelerating Large-Scale Graph Analytics with FPGA and HMC (Poster).\n \n \n \n\n\n \n KhoramS, S.; ZhangS, J.; StrangeS, M.; and Li, J.\n\n\n \n\n\n\n In 2017 IEEE 25th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM), pages 82–82, April 2017. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{Khoram2017fccm, \nauthor={Soroosh Khoram<sup>S</sup> and Jialiang Zhang<sup>S</sup> and Maxwell Strange<sup>S</sup> and Jing Li}, booktitle={2017 IEEE 25th Annual International Symposium on Field-Programmable Custom Computing Machines (<strong>FCCM</strong>)}, \ntitle={Accelerating Large-Scale Graph Analytics with {FPGA} and {HMC} (Poster)}, \nyear={2017},\ndate={2017-04-30},\nvolume={}, \nnumber={82--82}, \npages={82--82}, \nkeywords={conference, field programmable gate arrays,graph theory,information retrieval,learning (artificial intelligence),social sciences,tree searching,BFS,FPGA-HMC based graph processing system,breadth first search,hybrid memory cube,interconnected entities,irregular data access pattern,large-scale graph analytics,machine learning,massive-scale sparse graphs,social science,Acceleration,Clustering algorithms,Field programmable gate arrays,Hardware,Merging,Software,Software algorithms,Breadth-First Search,Graph Clustering,Hybrid memory Cube}, \ndoi={10.1109/FCCM.2017.58}, \nISSN={}, \nmonth={April},\n%note = {Acceptance rate: <u>25\\%</u>, 32 out of 128},\n}\n\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Challenges and Opportunities: From Near-memory Computing to In-memory Computing (invited).\n \n \n \n \n\n\n \n KhoramS, S.; ZhaS, Y.; ZhangS, J.; and Li, J.\n\n\n \n\n\n\n In Proceedings of the 2017 ACM on International Symposium on Physical Design, of ISPD '17, pages 43–46, New York, NY, USA, Mar 2017. ACM\n \n\n\n\n
\n\n\n\n \n \n \"ChallengesPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{Khoram2017ISPD,\n author = {Khoram<sup>S</sup>, Soroosh and Zha<sup>S</sup>, Yue and Zhang<sup>S</sup>, Jialiang and Li, Jing},\n title = {Challenges and Opportunities: From Near-memory Computing to In-memory Computing (<strong>invited</strong>)},\n booktitle = {Proceedings of the 2017 ACM on International Symposium on Physical Design},\n series = {<strong>ISPD</strong> '17},\n year = {2017},\n month = {Mar},\n date={2017-03-19},\n isbn = {978-1-4503-4696-2},\n location = {Portland, Oregon, USA},\n pages = {43--46},\n numpages = {4},\n url = {http://doi.acm.org/10.1145/3036669.3038242},\n doi = {10.1145/3036669.3038242},\n acmid = {3038242},\n publisher = {ACM},\n address = {New York, NY, USA},\n keywords = {conference, 3d integration, in-memory processing, near-memory processing, nonvolatile memory},\n abstract={The confluence of the recent advances in technology and the ever-growing demand for large-scale data analytics created a renewed interest in a decades-old concept, processing-in-memory (PIM). PIM, in general, may cover a very wide spectrum of compute capabilities embedded in close proximity to or even inside the memory array. In this paper, we present an initial taxonomy for dividing PIM into two broad categories: 1) Near-memory processing and 2) In-memory processing. This paper highlights some interesting work in each category and provides insights into the challenges and possible future directions.},\n% note = {(Acceptance Rate*: <u>35\\%</u>)}\n} \n\n\n
\n
\n\n\n
\n The confluence of the recent advances in technology and the ever-growing demand for large-scale data analytics created a renewed interest in a decades-old concept, processing-in-memory (PIM). PIM, in general, may cover a very wide spectrum of compute capabilities embedded in close proximity to or even inside the memory array. In this paper, we present an initial taxonomy for dividing PIM into two broad categories: 1) Near-memory processing and 2) In-memory processing. This paper highlights some interesting work in each category and provides insights into the challenges and possible future directions.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Recent progress in RRAM technology: From compact models to applications (invited).\n \n \n \n\n\n \n ZhaS, Y.; Wei, Z.; and Li, J.\n\n\n \n\n\n\n In 2017 China Semiconductor Technology International Conference (CSTIC), pages 1–4, March 2017. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{zha2017CSTIC, \nauthor={Yue Zha<sup>S</sup> and Zhiqiang Wei and Jing Li}, \nbooktitle={2017 China Semiconductor Technology International Conference (CSTIC)}, \ntitle={Recent progress in {RRAM} technology: From compact models to applications (<strong>invited</strong>)}, \nyear={2017}, \nvolume={}, \nnumber={},\ndate={2017-03-12},\npages={1--4}, \nkeywords={conference, integrated circuit modelling,product development,resistive RAM,IV characteristics,RRAM technology,SCM,commercialization progress,compact model,drop-in replacement,embedded memory,essential electrical-chemical-thermal properties,nonVon Neumann architecture,product development,standalone memory,storage class memory,switching dynamics,Computational modeling,Computer architecture,Hidden Markov models,Mathematical model,Random access memory,Resistance,Switches}, \ndoi={10.1109/CSTIC.2017.7919731}, \nISSN={}, \nmonth={March},\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n RRAM-based Reconfigurable In-Memory Computing Architecture with Hybrid Routing (poster).\n \n \n \n\n\n \n ZhaS, Y.; and Li, J.\n\n\n \n\n\n\n In the 54th Annual Design Automation Conference Work-in-Progress, of DAC-WIP '17, New York, NY, USA, Jun 2017. \n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{zha2017dacwip,\n author = {Zha<sup>S</sup>, Yue and Li, Jing},\n title = {{RRAM}-based  Reconfigurable  In-Memory  Computing  Architecture with Hybrid Routing (poster)},\n booktitle = {the 54th Annual Design Automation Conference Work-in-Progress},\n series = {DAC-WIP '17},\n year = {2017},\n date = {2017-06},\n month = {Jun},\n isbn = {978-1-4503-4927-7},\n location = {Austin, TX, USA},\n address = {New York, NY, USA},\n keywords = {conference},\n% note = {(Acceptance Rate*: <u>29\\%</u>)},\n} \n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Improving the Performance of OpenCL-based FPGA Accelerator for Convolutional Neural Network.\n \n \n \n \n\n\n \n ZhangS, J.; and Li, J.\n\n\n \n\n\n\n In Proceedings of the 2017 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays, of FPGA '17, pages 25–34, New York, NY, USA, 2017. ACM\n (Acceptance Rate: 25%, 25 out of 101)\n\n\n\n
\n\n\n\n \n \n \"ImprovingPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{zhang2017fpgaCNN,\n author = {Zhang<sup>S</sup>, Jialiang and Li, Jing},\n title = {Improving the Performance of {OpenCL-based FPGA} Accelerator for Convolutional Neural Network},\n booktitle = {Proceedings of the 2017 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},\n series = {<strong>FPGA</strong> '17},\n year = {2017},\n date={2017-02-22},\n isbn = {978-1-4503-4354-1},\n location = {Monterey, California, USA},\n pages = {25--34},\n numpages = {10},\n url = {http://doi.acm.org/10.1145/3020078.3021698},\n doi = {10.1145/3020078.3021698},\n acmid = {3021698},\n publisher = {ACM},\n address = {New York, NY, USA},\n abstract={OpenCL FPGA has recently gained great popularity with emerging needs for workload acceleration such as Convolutional Neural Network (CNN), which is the most popular deep learning architecture in the domain of computer vision. While OpenCL enhances the code portability and programmability of FPGA, it comes at the expense of performance. The key challenge is to optimize the OpenCL kernels to efficiently utilize the flexible hardware resources in FPGA. Simply optimizing the OpenCL kernel code through various compiler options turns out insufficient to achieve desirable performance for both compute-intensive and data-intensive workloads such as convolutional neural networks.\n\nIn this paper, we first propose an analytical performance model and apply it to perform an in-depth analysis on the resource requirement of CNN classifier kernels and available resources on modern FPGAs. We identify that the key performance bottleneck is the on-chip memory bandwidth. We propose a new kernel design to effectively address such bandwidth limitation and to provide an optimal balance between computation, on-chip, and off-chip memory access. As a case study, we further apply these techniques to design a CNN accelerator based on the VGG model. 
Finally, we evaluate the performance of our CNN accelerator using an Altera Arria 10 GX1150 board. We achieve 866 Gop/s floating point performance at 370MHz working frequency and 1.79 Top/s 16-bit fixed-point performance at 385MHz. To the best of our knowledge, our implementation achieves the best power efficiency and performance density compared to existing work.},\n keywords = {conference, convolutional neural networks, fpga, hardware accelerator, opencl},\n note = {(Acceptance Rate: <u>25\\%</u>, 25 out of 101)},\n} \n\n\n
\n
\n\n\n
\n OpenCL FPGA has recently gained great popularity with emerging needs for workload acceleration such as Convolutional Neural Network (CNN), which is the most popular deep learning architecture in the domain of computer vision. While OpenCL enhances the code portability and programmability of FPGA, it comes at the expense of performance. The key challenge is to optimize the OpenCL kernels to efficiently utilize the flexible hardware resources in FPGA. Simply optimizing the OpenCL kernel code through various compiler options turns out insufficient to achieve desirable performance for both compute-intensive and data-intensive workloads such as convolutional neural networks. In this paper, we first propose an analytical performance model and apply it to perform an in-depth analysis on the resource requirement of CNN classifier kernels and available resources on modern FPGAs. We identify that the key performance bottleneck is the on-chip memory bandwidth. We propose a new kernel design to effectively address such bandwidth limitation and to provide an optimal balance between computation, on-chip, and off-chip memory access. As a case study, we further apply these techniques to design a CNN accelerator based on the VGG model. Finally, we evaluate the performance of our CNN accelerator using an Altera Arria 10 GX1150 board. We achieve 866 Gop/s floating point performance at 370MHz working frequency and 1.79 Top/s 16-bit fixed-point performance at 385MHz. To the best of our knowledge, our implementation achieves the best power efficiency and performance density compared to existing work.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Boosting the Performance of FPGA-based Graph Processor Using Hybrid Memory Cube: A Case for Breadth First Search.\n \n \n \n \n\n\n \n ZhangS, J.; KhoramS, S.; and Li, J.\n\n\n \n\n\n\n In Proceedings of the 2017 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays, of FPGA '17, pages 207–216, New York, NY, USA, 2017. ACM\n (Acceptance Rate: 25%, 25 out of 101)\n\n\n\n
\n\n\n\n \n \n \"BoostingPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{zhang2017fpgaBFS,\n author = {Zhang<sup>S</sup>, Jialiang and Khoram<sup>S</sup>, Soroosh and Li, Jing},\n title = {Boosting the Performance of {FPGA-based} Graph Processor Using {Hybrid Memory Cube}: A Case for Breadth First Search},\n booktitle = {Proceedings of the 2017 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},\n series = {<strong>FPGA</strong> '17},\n year = {2017},\n date={2017-02-22},\n isbn = {978-1-4503-4354-1},\n location = {Monterey, California, USA},\n pages = {207--216},\n numpages = {10},\n url = {http://doi.acm.org/10.1145/3020078.3021737},\n doi = {10.1145/3020078.3021737},\n acmid = {3021737},\n publisher = {ACM},\n address = {New York, NY, USA},\n abstract={Large graph processing has gained great attention in recent years due to its broad applicability from machine learning to social science. Large real-world graphs, however, are inherently difficult to process efficiently, not only due to their large memory footprint, but also that most graph algorithms entail memory access patterns with poor locality and a low compute-to-memory access ratio. In this work, we leverage the exceptional random access performance of emerging Hybrid Memory Cube (HMC) technology that stacks multiple DRAM dies on top of a logic layer, combined with the flexibility and efficiency of FPGA to address these challenges. To our best knowledge, this is the first work that implements a graph processing system on a FPGA-HMC platform based on software/hardware co-design and co-optimization. We first present the modifications of algorithm and a platform-aware graph processing architecture to perform level-synchronized breadth first search (BFS) on FPGA-HMC platform. To gain better insights into the potential bottlenecks of proposed implementation, we develop an analytical performance model to quantitatively evaluate the HMC access latency and corresponding BFS performance. 
Based on the analysis, we propose a two-level bitmap scheme to further reduce memory access and perform optimization on key design parameters (e.g. memory access granularity). Finally, we evaluate the performance of our BFS implementation using the AC-510 development kit from Micron. We achieved 166 million edges traversed per second (MTEPS) using GRAPH500 benchmark on a random graph with a scale of 25 and an edge factor of 16, which significantly outperforms CPU and other FPGA-based large graph processors.},\n keywords = {conference, graph processor, hybrid memory cube:bfs},\n note = {(Acceptance Rate: <u>25\\%</u>, 25 out of 101)}\n} \n\n\n
\n
\n\n\n
\n Large graph processing has gained great attention in recent years due to its broad applicability from machine learning to social science. Large real-world graphs, however, are inherently difficult to process efficiently, not only due to their large memory footprint, but also that most graph algorithms entail memory access patterns with poor locality and a low compute-to-memory access ratio. In this work, we leverage the exceptional random access performance of emerging Hybrid Memory Cube (HMC) technology that stacks multiple DRAM dies on top of a logic layer, combined with the flexibility and efficiency of FPGA to address these challenges. To our best knowledge, this is the first work that implements a graph processing system on a FPGA-HMC platform based on software/hardware co-design and co-optimization. We first present the modifications of algorithm and a platform-aware graph processing architecture to perform level-synchronized breadth first search (BFS) on FPGA-HMC platform. To gain better insights into the potential bottlenecks of proposed implementation, we develop an analytical performance model to quantitatively evaluate the HMC access latency and corresponding BFS performance. Based on the analysis, we propose a two-level bitmap scheme to further reduce memory access and perform optimization on key design parameters (e.g. memory access granularity). Finally, we evaluate the performance of our BFS implementation using the AC-510 development kit from Micron. We achieved 166 million edges traversed per second (MTEPS) using GRAPH500 benchmark on a random graph with a scale of 25 and an edge factor of 16, which significantly outperforms CPU and other FPGA-based large graph processors.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Mixed-Signal Data-Centric Reconfigurable Architecture Enabled by RRAM Technology (poster).\n \n \n \n \n\n\n \n ZhaS, Y.; ZhangS, J.; Wei, Z.; and Li, J.\n\n\n \n\n\n\n In Proceedings of the 2017 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays, of FPGA '17, pages 285–285, New York, NY, USA, 2017. ACM\n \n\n\n\n
\n\n\n\n \n \n \"APaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{zha2017FPGAposter,\n author = {Zha<sup>S</sup>, Yue and Zhang<sup>S</sup>, Jialiang and Wei, Zhiqiang and Li, Jing},\n title = {A Mixed-Signal Data-Centric Reconfigurable Architecture Enabled by {RRAM} Technology (poster)},\n booktitle = {Proceedings of the 2017 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},\n series = {<strong>FPGA</strong> '17},\n year = {2017},\n date={2017-02-22},\n isbn = {978-1-4503-4354-1},\n location = {Monterey, California, USA},\n pages = {285--285},\n numpages = {1},\n url = {http://doi.acm.org/10.1145/3020078.3021759},\n doi = {10.1145/3020078.3021759},\n acmid = {3021759},\n publisher = {ACM},\n address = {New York, NY, USA},\n keywords = {conference, coarse-grained configuration, mixed-signal processing, non-volatile memory, reconfigurable architecture, ternary content addressable memory},\n% note = {(Acceptance Rate: <u>25\\%</u>, 25 out of 101)},\n} \n\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2016\n \n \n (3)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Reconfigurable in-memory computing with resistive memory crossbar.\n \n \n \n \n\n\n \n ZhaS, Y.; and Li, J.\n\n\n \n\n\n\n In Proceedings of the 35th International Conference on Computer-Aided Design, of ICCAD '16, pages 120:1–120:8, 2016. ACM\n (Acceptance Rate: 24%, 97 out of 408)\n\n\n\n
\n\n\n\n \n \n \"ReconfigurablePaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{zha2016ICCAD,\ntitle = {Reconfigurable in-memory computing with resistive memory crossbar},\nauthor = {Yue Zha<sup>S</sup> and Jing Li},\nurl = {http://dl.acm.org/citation.cfm?id=2967069},\ndoi = {10.1145/2966986.2967069},\nisbn = {978-1-4503-4466-1 },\nyear = {2016},\ndate = {2016-11-07},\nbooktitle = {Proceedings of the 35th International Conference on Computer-Aided Design},\nlocation = {Austin, Texas},\npages = {120:1--120:8},\nseries = {<strong>ICCAD</strong> '16},\norganization = {ACM},\nkeywords = {conference, RRAM, in-memory computing, reconfigurable},\nabstract={Driven by recent advances in resistive random-access memory (RRAM), there have been growing interests in exploring alternative computing concept, i.e., in-memory processing, to address the classical von Neumann bottlenecks. Despite of their great promise in improving performance and energy efficiency, most existing works are built on the inherent matrix-vector multiplication capability of RRAM crossbar structure, and thus lack the flexibility to adapt to future market/technology induced changes in data-intensive applications. To address these challenges, we propose an in-memory reconfigurable architecture based on RRAM crossbar structure. For the first time, it achieves a full programmability across computation and storage, and thereby provides more flexibilities of partitioning the hardware resources based on applications' needs. We further develop two complete CAD design flows to facilitate development of applications written in hardware description languages (HDLs) for our architecture, based on: 1) adaption from existing tool set developed for FPGA, 2) a custom tool design optimized towards the new architecture. Our experiments show that, both design flows are effective in exploiting flexible resources offered by our architecture and thus achieves better efficiency than state-of-art FPGAs (30\\% improvement in performance with 66\\% reduction in area). 
In addition, compared to adapted design flow, our custom design flow achieves speedup by 3.3×, and further improves mapping quality.},\n%tppubtype = {inproceedings},\nnote = {(Acceptance Rate: <u>24\\%</u>, 97 out of 408)}\n}\n\n\n
\n
\n\n\n
\n Driven by recent advances in resistive random-access memory (RRAM), there have been growing interests in exploring alternative computing concept, i.e., in-memory processing, to address the classical von Neumann bottlenecks. Despite their great promise in improving performance and energy efficiency, most existing works are built on the inherent matrix-vector multiplication capability of RRAM crossbar structure, and thus lack the flexibility to adapt to future market/technology induced changes in data-intensive applications. To address these challenges, we propose an in-memory reconfigurable architecture based on RRAM crossbar structure. For the first time, it achieves a full programmability across computation and storage, and thereby provides more flexibilities of partitioning the hardware resources based on applications' needs. We further develop two complete CAD design flows to facilitate development of applications written in hardware description languages (HDLs) for our architecture, based on: 1) adaption from existing tool set developed for FPGA, 2) a custom tool design optimized towards the new architecture. Our experiments show that, both design flows are effective in exploiting flexible resources offered by our architecture and thus achieve better efficiency than state-of-art FPGAs (30% improvement in performance with 66% reduction in area). In addition, compared to adapted design flow, our custom design flow achieves speedup by 3.3×, and further improves mapping quality.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Fully CMOS compatible 3D vertical RRAM with self-aligned self-selective cell enabling sub-5nm scaling.\n \n \n \n\n\n \n Xu, X.; Luo, Q.; Gong, T.; Lv, H.; Long, S.; Liu, Q.; Chung, S. S.; Li, J.; and Liu, M.\n\n\n \n\n\n\n In 2016 IEEE Symposium on VLSI Technology, pages 1–2, June 2016. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{xu2016vlsi, \nauthor={Xiaoxin Xu and Q. Luo and Tiancheng Gong and Hangbing Lv and Shibing Long and Qi Liu and S. S. Chung and Jing Li and Ming Liu}, \nbooktitle={2016 IEEE Symposium on VLSI Technology}, \ntitle={Fully {CMOS} compatible {3D} vertical {RRAM} with self-aligned self-selective cell enabling sub-5nm scaling}, \nyear={2016}, \ndate={2016-06},\nvolume={}, \nnumber={}, \npages={1--2}, \nkeywords={conference, CMOS memory circuits,integrated circuit manufacture,resistive RAM,CMOS,RRAM,self-aligned self-selective cell,size 5 nm,vertical resistive switching memory,Etching,Hafnium compounds,Leakage currents,Programming,Resistance,Three-dimensional displays,Threshold voltage}, \ndoi={10.1109/VLSIT.2016.7573388}, \nISSN={}, \nmonth={June},}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n A compact model for RRAM including random telegraph noise.\n \n \n \n\n\n \n GuanS, B.; and Li, J.\n\n\n \n\n\n\n In 2016 IEEE International Reliability Physics Symposium (IRPS), pages MY-5-1–MY-5-4, April 2016. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{guan2016irps, \nauthor={Bochen Guan<sup>S</sup> and Jing Li}, \nbooktitle={2016 IEEE International Reliability Physics Symposium (IRPS)}, \ntitle={A compact model for {RRAM} including random telegraph noise}, \nyear={2016}, \ndate={2016-04},\nvolume={}, \nnumber={}, \npages={MY-5-1--MY-5-4}, \nkeywords={conference, Monte Carlo methods,current fluctuations,electromagnetic interference,integrated circuit design,integrated circuit reliability,random noise,resistive RAM,telegraphy,Monte Carlo method,RRAM circuit reliability,RRAM compact model,RTN effect,current fluctuation,random telegraph noise,tunneling gap,Current measurement,Data models,Electron traps,Fluctuations,Integrated circuit modeling,Mathematical model,Switches,Compact model,RRAM,Random Telegraph Noise}, \ndoi={10.1109/IRPS.2016.7574621}, \nISSN={}, \nmonth={April},}\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2015\n \n \n (2)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n Demonstration of 3D vertical RRAM with ultra low-leakage, high-selectivity and self-compliance memory cells.\n \n \n \n\n\n \n Luo, Q.; Xu, X.; Liu, H.; Lv, H.; Gong, T.; Long, S.; Liu, Q.; Sun, H.; Banerjee, W.; Li, L.; Gao, J.; Lu, N.; Chung, S. S.; Li, J.; and Liu, M.\n\n\n \n\n\n\n In 2015 IEEE International Electron Devices Meeting (IEDM), pages 10.2.1–10.2.4, Dec 2015. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{luo2015iedm, \nauthor={Q. Luo and X. Xu and H. Liu and H. Lv and T. Gong and S. Long and Q. Liu and H. Sun and W. Banerjee and L. Li and J. Gao and N. Lu and S. S. Chung and Jing Li and M. Liu}, \nbooktitle={2015 IEEE International Electron Devices Meeting (<strong>IEDM</strong>)}, \ntitle={Demonstration of 3D vertical {RRAM} with ultra low-leakage, high-selectivity and self-compliance memory cells}, \nyear={2015}, \ndate={2015-12},\nvolume={}, \nnumber={}, \npages={10.2.1--10.2.4}, \nkeywords={conference, hafnium compounds,ionic conductivity,leakage currents,mixed conductivity,resistive RAM,3D vertical RRAM,HfO2,HfO2/mixed ionic and electronic conductor bilayer,four-layer V-RRAM array,high selectivity,nonlinearity,operation current,self-compliance memory cells,self-selective cell,ultra low-leakage,ultra-low half-select leakage,Hafnium compounds,Leakage currents,Optical switches,Resistance,Three-dimensional displays,Tin}, \ndoi={10.1109/IEDM.2015.7409667}, \nISSN={}, \nmonth={Dec},\n%note={(Acceptance Rate*: <u>33\\%</u>)},\n}\n\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Enabling phase-change memory for data-centric computing: Technology, circuit and system (invited).\n \n \n \n\n\n \n Li, J.\n\n\n \n\n\n\n In 2015 IEEE International Symposium on Circuits and Systems (ISCAS), pages 21–24, May 2015. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{li2015iscas, \nauthor={Jing Li}, \nbooktitle={2015 IEEE International Symposium on Circuits and Systems (ISCAS)}, \ntitle={Enabling phase-change memory for data-centric computing: Technology, circuit and system (<strong>invited</strong>)}, \nyear={2015}, \ndate={2015-05},\nvolume={}, \nnumber={}, \npages={21--24}, \nkeywords={conference, Big Data,computer centres,content-addressable storage,memory architecture,phase change memories,Big Data problems,NVM technology,PCM technology,TCAM,computing stack,cost-per-bit factor,data manipulation,data storage,data-centric computing,data-intensive applications,endurance factor,hardware features,nonvolatile memory technology,performance factor,phase-change memory,power factor,retention factor,ternary content addressable memory,Encoding,Hardware,Nonvolatile memory,Phase change materials,Phase change memory,Random access memory,Reliability,Emerging Nonvolatile Memory,PCM,TCAM,Ternary Content Addressable Memory,data-centric system,near-/in-memory computing,phase change memory}, \ndoi={10.1109/ISCAS.2015.7168560}, \nISSN={0271-4302}, \nmonth={May},}\n\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2014\n \n \n (1)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n 1 Mb 0.41 $μ$m$^2$ 2T-2R cell nonvolatile TCAM with two-bit encoding and clocked self-referenced sensing (invited).\n \n \n \n\n\n \n Li, J.; Montoye, R.; Ishii, M.; and Chang, L.\n\n\n \n\n\n\n IEEE Journal of Solid-State Circuits, 49(4): 896–907. April 2014.\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@ARTICLE{li2014jssc, \nauthor={Jing Li and Robert Montoye and Masatoshi Ishii and Leland Chang}, \njournal={IEEE Journal of Solid-State Circuits}, \ntitle={1 {Mb} 0.41 {$\\mu$m$^2$} {2T-2R} cell nonvolatile {TCAM} with two-bit encoding and clocked self-referenced sensing (<strong>invited</strong>)}, \nyear={2014}, \nvolume={49}, \nnumber={4}, \npages={896--907}, \nkeywords={journal, content-addressable storage,encoding,phase change memories,2T 2R cell nonvolatile TCAM,CMOS technology,algorithmic mapping,clocked self referenced sensing,phase change memory technology,resistive memories,size 90 nm,time 1.9 ns,two bit encoding,Arrays,Encoding,Microprocessors,Phase change materials,Random access memory,Sensors,Associative computing,encoding,hardware accelerator,intrusion detection,matchline compensation,nonvolatile,packet classification,phase change memory (PCM),search engine,self-referenced sensing,ternary content addressable memory (TCAM)}, \nabstract={This work demonstrates the first fabricated 1 Mb nonvolatile TCAM using 2-transistor/2-resistive-storage (2T-2R) cells to achieve >10× smaller cell size than SRAM-based TCAMs at the same technology node. The test chip was designed and fabricated in IBM 90 nm CMOS technology and mushroom phase-change memory (PCM) technology. The primary challenge for enabling reliable array operation with such aggressive cell is presented, namely, severely degraded sensing margin due to significantly lower ON/OFF ratio of resistive memories (~10 2 for PCM) than that of traditional MOSFETs (>10 5 ). To address this challenge, two enabling techniques were developed and implemented in hardware: 1) two-bit encoding and 2) a clocked self-referenced sensing scheme (CSRSS). In addition, the two-bit encoding can also improve algorithmic mapping by effectively compressing TCAM entries. 
The 1 Mb chip demonstrates reliable low voltage search operation (VDDmin ~750 mV) and a match delay of 1.9 ns under nominal operating conditions.},\ndoi={10.1109/JSSC.2013.2292055}, \nISSN={0018-9200}, \nmonth={April},\n}\n\n
\n
\n\n\n
\n This work demonstrates the first fabricated 1 Mb nonvolatile TCAM using 2-transistor/2-resistive-storage (2T-2R) cells to achieve >10× smaller cell size than SRAM-based TCAMs at the same technology node. The test chip was designed and fabricated in IBM 90 nm CMOS technology and mushroom phase-change memory (PCM) technology. The primary challenge for enabling reliable array operation with such aggressive cell is presented, namely, severely degraded sensing margin due to significantly lower ON/OFF ratio of resistive memories (~10$^2$ for PCM) than that of traditional MOSFETs (>10$^5$). To address this challenge, two enabling techniques were developed and implemented in hardware: 1) two-bit encoding and 2) a clocked self-referenced sensing scheme (CSRSS). In addition, the two-bit encoding can also improve algorithmic mapping by effectively compressing TCAM entries. The 1 Mb chip demonstrates reliable low voltage search operation (VDDmin ~750 mV) and a match delay of 1.9 ns under nominal operating conditions.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2013\n \n \n (2)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Assisted cubic to hexagonal phase transition in GeSbTe thin films on silicon nitride.\n \n \n \n \n\n\n \n Cil, K; Zhu, Y; Li, J.; Lam, C.; and Silva, H\n\n\n \n\n\n\n Thin Solid Films, 536: 216–219. 2013.\n \n\n\n\n
\n\n\n\n \n \n \"AssistedPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{cil2013thinfilm,\n  title={Assisted cubic to hexagonal phase transition in GeSbTe thin films on silicon nitride},\n  author={Cil, K and Zhu, Y and Li, Jing and Lam, CH and Silva, H},\n  journal={Thin Solid Films},\n  volume={536},\n  pages={216--219},\n  year={2013},\n  publisher={Elsevier},\n  issn = {0040-6090},\n  doi = {10.1016/j.tsf.2013.03.087},\n  url = {http://www.sciencedirect.com/science/article/pii/S0040609013005476},\n  keywords = {Phase change memory, Germanium–antimony–tellurium, Phase transition temperature, Face-centered cubic, Hexagonal close-packed, Substrate dependence, Silicon nitride, Silicon dioxide},\n  keywords={journal}\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n 1Mb 0.41 $μ$m$^2$ 2T-2R cell nonvolatile TCAM with two-bit encoding and clocked self-referenced sensing (Highlight Paper of the Year).\n \n \n \n\n\n \n Li, J.; Montoye, R.; Ishii, M.; Stawiasz, K.; Nishida, T.; Maloney, K.; Ditlow, G.; Lewis, S.; Maffitt, T.; Jordan, R.; and others\n\n\n \n\n\n\n In 2013 Symposium on VLSI Circuits, pages C104–C105, June 2013. \n (Acceptance Rate: 27%, 109 out of 396)\n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{li2013vlsi, \nauthor={Li, Jing and Montoye, Robert and Ishii, Masatoshi and Stawiasz, Kevin and Nishida, Takeshi and Maloney, Kim and Ditlow, Gary and Lewis, Scott and Maffitt, Tom and Jordan, Richard and others},\nbooktitle={2013 Symposium on VLSI Circuits}, \ntitle={{1Mb} 0.41 {$\\mu$m$^2$} {2T-2R} cell nonvolatile {TCAM} with two-bit encoding and clocked self-referenced sensing (<strong>Highlight Paper of the Year</strong>)}, \nyear={2013}, \ndate={2013-06-12},\nvolume={}, \nnumber={}, \npages={C104--C105}, \nkeywords={conference, CMOS memory circuits,SRAM chips,clocks,content-addressable storage,integrated circuit design,integrated circuit reliability,low-power electronics,phase change memories,search problems,2-transistor-2-resistive-storage cells,2T-2R cells,CSRSS,IBM CMOS technology,PCM process,SRAM-based TCAM,bit rate 1 Mbit/s,cell nonvolatile TCAM,cell size,clocked self-referenced sensing scheme,compact cells,fabricated nonvolatile TCAM,low voltage search operation,match delay,mushroom phase-change memory process,reliable search operation,size 90 nm,technology node,test chip design,two-bit encoding,Arrays,Clocks,Encoding,Microprocessors,Phase change materials,Sensors}, \ndoi={}, \nISSN={2158-5601}, \nmonth={June},\nabstract={This work demonstrates the first fabricated nonvolatile TCAM using 2-transistor/2-resistive-storage (2T-2R) cells to achieve >10× smaller cell size than SRAM-based TCAMs at the same technology node. The test chip was designed and fabricated in IBM 90nm CMOS technology and mushroom phase-change memory (PCM) process. To ensure reliable search operation with such compact cells, two enabling techniques were developed and implemented in hardware: 1) two-bit encoding, and 2) a clocked self-referenced sensing scheme (CSRSS). 
The 1Mb chip demonstrates reliable low voltage search operation (VDDmin~750mV) and a match delay of 1.9 ns under nominal operating conditions.},\nnote = {(Acceptance Rate: <u>27\\%</u>, 109 out of 396)}\n}\n\n\n
\n
\n\n\n
\n This work demonstrates the first fabricated nonvolatile TCAM using 2-transistor/2-resistive-storage (2T-2R) cells to achieve >10× smaller cell size than SRAM-based TCAMs at the same technology node. The test chip was designed and fabricated in IBM 90nm CMOS technology and mushroom phase-change memory (PCM) process. To ensure reliable search operation with such compact cells, two enabling techniques were developed and implemented in hardware: 1) two-bit encoding, and 2) a clocked self-referenced sensing scheme (CSRSS). The 1Mb chip demonstrates reliable low voltage search operation (VDDmin~750mV) and a match delay of 1.9 ns under nominal operating conditions.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2012\n \n \n (7)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n Theory and Experiments of the Impact of Work-Function Variability on Threshold Voltage Variability in MOS Devices.\n \n \n \n\n\n \n Zhang, X.; Mitard, J.; Ragnarsson, L.; Hoffmann, T.; Deal, M.; Grubbs, M. E.; Li, J.; Magyari-Kope, B.; Clemens, B. M.; and Nishi, Y.\n\n\n \n\n\n\n IEEE Transactions on Electron Devices, 59(11): 3124–3126. Nov 2012.\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@ARTICLE{zhang2012TED, \nauthor={Xiao Zhang and Jerome Mitard and Lars-Ake Ragnarsson and Tomas Hoffmann and Michael Deal and Melody E. Grubbs and Jing Li and Blanka Magyari-Kope and Bruce M. Clemens and Yoshio Nishi}, \njournal={IEEE Transactions on Electron Devices}, \ntitle={Theory and Experiments of the Impact of Work-Function Variability on Threshold Voltage Variability in {MOS} Devices}, \nyear={2012}, \nvolume={59}, \nnumber={11}, \npages={3124--3126}, \nkeywords={journal, MOSFET,failure analysis,probability,random-access storage,semiconductor device models,semiconductor device reliability,MOS devices,MOSFET,WFV,grain orientation,polycrystalline metal gate,random dopant fluctuation,size 22 nm,static RAM failure probability,threshold voltage variability,work-function variability,Integrated circuit modeling,Logic gates,Random access memory,Resource description framework,Semiconductor device modeling,MOSFETS,Metal gate,variability,work function (WF)}, \ndoi={10.1109/TED.2012.2212021}, \nISSN={0018-9383}, \nmonth={Nov},\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n The impact of heater-recess and load matching in phase change memory mushroom cells.\n \n \n \n \n\n\n \n Cywar, A.; Li, J.; Lam, C.; and Silva, H.\n\n\n \n\n\n\n Nanotechnology, 23(22): 225201. 2012.\n \n\n\n\n
\n\n\n\n \n \n \"ThePaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{cywar2012nano,\n  author={Adam Cywar and Jing Li and Chung Lam and Helena Silva},\n  title={The impact of heater-recess and load matching in phase change memory mushroom cells},\n  journal={Nanotechnology},\n  volume={23},\n  number={22},\n  pages={225201},\n  url={http://stacks.iop.org/0957-4484/23/i=22/a=225201},\n  year={2012},\n  doi={10.1088/0957-4484/23/22/225201},\n  keywords={journal},\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n A case for small row buffers in non-volatile main memories.\n \n \n \n\n\n \n Meza, J.; Li, J.; and Mutlu, O.\n\n\n \n\n\n\n In 2012 IEEE 30th International Conference on Computer Design (ICCD), pages 484–485, Sept 2012. \n (Acceptance rate: 25%, 61 out of 241)\n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{meza2012iccd, \nauthor={Justin Meza and Jing Li and Onur Mutlu}, \nbooktitle={2012 IEEE 30th International Conference on Computer Design (<strong>ICCD</strong>)}, \ntitle={A case for small row buffers in non-volatile main memories}, \nyear={2012},\ndate={2012-09-30},\nvolume={}, \nnumber={}, \npages={484--485}, \nkeywords={conference, DRAM chips,buffer circuits,multiprocessing systems,DRAM baseline,DRAM chips,DRAM-based main memories,NVM technologies,array access,buffered data,chip costs,data mapping schemes,main memory dynamic energy,memory array access,memory parallelism,multicore architectures,nonvolatile main memories,read operations,row buffer size,small row buffers,system-level trends,Arrays,Memory management,Nonvolatile memory,Organizations,Phase change materials,Random access memory}, \nabstract={DRAM-based main memories have read operations that destroy the read data, and as a result, must buffer large amounts of data on each array access to keep chip costs low. Unfortunately, system-level trends such as increased memory contention in multi-core architectures and data mapping schemes that improve memory parallelism lead to only a small amount of the buffered data to be accessed. This makes buffering large amounts of data on every memory array access energy-inefficient; yet organizing DRAM chips to buffer small amounts of data is costly, as others have shown. Emerging non-volatile memories (NVMs) such as PCM, STT-RAM, and RRAM, however, do not have destructive read operations, opening up opportunities for employing small row buffers without incurring additional area penalty and/or design complexity. In this work, we discuss and evaluate architectural changes to enable small row buffers at a low cost in NVMs. 
We find that on a multi-core system, reducing the row buffer size can greatly reduce main memory dynamic energy compared to a DRAM baseline with large row sizes, without greatly affecting endurance, and for some NVM technologies, leads to improved performance.},\ndoi={10.1109/ICCD.2012.6378685}, \nISSN={1063-6404}, \nmonth={Sept},\nnote={(Acceptance rate: <u>25\\%</u>, 61 out of 241)},\n}\n\n
\n
\n\n\n
\n DRAM-based main memories have read operations that destroy the read data, and as a result, must buffer large amounts of data on each array access to keep chip costs low. Unfortunately, system-level trends such as increased memory contention in multi-core architectures and data mapping schemes that improve memory parallelism lead to only a small amount of the buffered data to be accessed. This makes buffering large amounts of data on every memory array access energy-inefficient; yet organizing DRAM chips to buffer small amounts of data is costly, as others have shown. Emerging non-volatile memories (NVMs) such as PCM, STT-RAM, and RRAM, however, do not have destructive read operations, opening up opportunities for employing small row buffers without incurring additional area penalty and/or design complexity. In this work, we discuss and evaluate architectural changes to enable small row buffers at a low cost in NVMs. We find that on a multi-core system, reducing the row buffer size can greatly reduce main memory dynamic energy compared to a DRAM baseline with large row sizes, without greatly affecting endurance, and for some NVM technologies, leads to improved performance.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Resistance drift in phase change memory (invited).\n \n \n \n\n\n \n Li, J.; Luan, B.; and Lam, C.\n\n\n \n\n\n\n In 2012 IEEE International Reliability Physics Symposium (IRPS), pages 6C.1.1–6C.1.6, April 2012. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{li2012irps, \nauthor={Jing Li and Binquan Luan and Chung Lam}, \nbooktitle={2012 IEEE International Reliability Physics Symposium (<strong>IRPS</strong>)}, \ntitle={Resistance drift in phase change memory (<strong>invited</strong>)}, \nyear={2012}, \nvolume={}, \nnumber={}, \npages={6C.1.1--6C.1.6}, \nkeywords={conference, circuit reliability,molecular dynamics method,phase change memories,MLC PCM,SR,amorphous chalcogenide material,atomic structure,material engineering,mitigation technique,phase change memory,physics model,quantum molecular dynamic simulation,reliability issue,structural relaxation,time dependent resistance drift,Annealing,Kinetic theory,Phase change materials,Resistance,Strontium,Temperature measurement,drift,multi-level cell,phase change memory,structural relaxation}, \ndoi={10.1109/IRPS.2012.6241871}, \nISSN={1541-7026}, \nmonth={April},}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n The impact of melting during reset operation on the reliability of phase change memory.\n \n \n \n\n\n \n Du, P. Y.; Wu, J. Y.; Hsu, T. H.; Lee, M. H.; Wang, T. Y.; Cheng, H. Y.; Lai, E. K.; Lai, S. C.; Lung, H. L.; Kim, S.; BrightSky, M. J.; Zhu, Y.; Mittal, S.; Cheek, R.; Raoux, S.; Joseph, E. A.; Schrott, A.; Li, J.; and Lam, C.\n\n\n \n\n\n\n In 2012 IEEE International Reliability Physics Symposium (IRPS), pages 6C.2.1–6C.2.6, April 2012. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{du2012irps, \nauthor={P. Y. Du and J. Y. Wu and T. H. Hsu and M. H. Lee and T. Y. Wang and H. Y. Cheng and E. K. Lai and S. C. Lai and H. L. Lung and S. Kim and M. J. BrightSky and Y. Zhu and S. Mittal and R. Cheek and S. Raoux and E. A. Joseph and A. Schrott and Jing Li and C. Lam}, \nbooktitle={2012 IEEE International Reliability Physics Symposium (<strong>IRPS</strong>)}, \ntitle={The impact of melting during reset operation on the reliability of phase change memory}, \nyear={2012}, \nvolume={}, \nnumber={}, \npages={6C.2.1--6C.2.6}, \nkeywords={conference, arrays,circuit reliability,electromigration,melting,phase change memories,segregation,GST-based phase change memory,RESET melting healing effect,SET induced damage,SET operation,control circuits,electromigration,large test chips,operation impact,phase change memory reliability,phase segregation,reset operation,Conductivity,Electromigration,Maintenance engineering,Phase change materials,Phase change memory,Resistance,Tin,Endurance,RESET operation,electromigration,melting,phasechange memory (PCM),reliability,segregation}, doi={10.1109/IRPS.2012.6241872}, \nISSN={1541-7026}, \nmonth={April},}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Optimization of programming current on endurance of phase change memory.\n \n \n \n\n\n \n Kim, S.; Du, P. Y.; Li, J.; Breitwisch, M.; Zhu, Y.; Mittal, S.; Cheek, R.; Hsu, T. H.; Lee, M. H.; Schrott, A.; Raoux, S.; Cheng, H. Y.; Lai, S. C.; Wu, J. Y.; Wang, T. Y.; Joseph, E. A.; Lai, E. K.; Ray, A.; Lung, H. L.; and Lam, C.\n\n\n \n\n\n\n In Proceedings of Technical Program of 2012 VLSI Technology, System and Application (VLSI-TSA), pages 1–2, April 2012. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{kim2012vlsitsa, \nauthor={S. Kim and P. Y. Du and Jing Li and M. Breitwisch and Y. Zhu and S. Mittal and R. Cheek and T. H. Hsu and M. H. Lee and A. Schrott and S. Raoux and H. Y. Cheng and S. C. Lai and J. Y. Wu and T. Y. Wang and E. A. Joseph and E. K. Lai and A. Ray and H. L. Lung and C. Lam}, \nbooktitle={Proceedings of Technical Program of 2012 VLSI Technology, System and Application (<strong>VLSI-TSA</strong>)}, \ntitle={Optimization of programming current on endurance of phase change memory}, \nyear={2012}, \nvolume={}, \nnumber={}, \npages={1--2}, \nkeywords={conference, failure analysis,phase change memories,RESET current margin,endurance cycles,endurance failure modes,material segregation effect,open failure,optimization,phase change memory,phase-dependent open-failure mechanisms,programming conditions,programming current,stuck-SET failure characteristic curves,Current density,Optimization,Phase change materials,Phase change memory,Programming,Resistance}, \ndoi={10.1109/VLSI-TSA.2012.6210122}, \nISSN={1524-766X}, \nmonth={April},}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Evaluating Row Buffer Locality in Future Non-Volatile Main Memories.\n \n \n \n\n\n \n Meza, J.; Li, J.; and Mutlu, O.\n\n\n \n\n\n\n Technical Report 2012-002, Carnegie Mellon University (CMU), Dec 2012.\n SAFARI Technical Report\n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@techreport{meza2012:report:nvm,\n  title = {Evaluating Row Buffer Locality in Future Non-Volatile Main Memories},\n  author = {Justin Meza and Jing Li and Onur Mutlu},\n  institution = {Carnegie Mellon University (CMU)},\n  year = {2012},\n  month = {Dec},\n  number = {2012-002},\n  note = {SAFARI Technical Report},\n  keywords = {techreport}\n}\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2011\n \n \n (8)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Phase change memory (invited).\n \n \n \n \n\n\n \n Li, J.; and Lam, C.\n\n\n \n\n\n\n Science China Information Sciences, 54(5): 1061–1072. May 2011.\n \n\n\n\n
\n\n\n\n \n \n \"PhasePaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@Article{li2011sciencechina,\nauthor={Li, Jing and Lam, Chung},\ntitle={Phase change memory (<strong>invited</strong>)},\njournal={Science China Information Sciences},\nyear={2011},\nmonth={May},\nday={01},\nvolume={54},\nnumber={5},\npages={1061--1072},\nabstract={Phase change memory (PCM) is a non-volatile solid-state memory technology based on the large resistivity contrast between the amorphous and crystalline states in phase change materials. We present the physics behind this large resistivity contrast and describe how it is being exploited to create high density PCM. We address the challenges facing this technology, including the design of PCM cells, fabrication, device variability, thermal cross-talk and write disturb. We discuss the scalability, assess the performance, and examine the reliability of PCM including data retention, multi-bit storage and endurance.},\nissn={1869-1919},\ndoi={10.1007/s11432-011-4223-x},\nurl={https://doi.org/10.1007/s11432-011-4223-x},\nkeywords={journal}\n}\n\n
\n
\n\n\n
\n Phase change memory (PCM) is a non-volatile solid-state memory technology based on the large resistivity contrast between the amorphous and crystalline states in phase change materials. We present the physics behind this large resistivity contrast and describe how it is being exploited to create high density PCM. We address the challenges facing this technology, including the design of PCM cells, fabrication, device variability, thermal cross-talk and write disturb. We discuss the scalability, assess the performance, and examine the reliability of PCM including data retention, multi-bit storage and endurance.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Materials engineering for Phase Change Random Access Memory.\n \n \n \n\n\n \n Raoux, S.; Cheng, H.; Sandrini, J.; Li, J.; and Jordan-Sweet, J.\n\n\n \n\n\n\n In 2011 11th Annual Non-Volatile Memory Technology Symposium Proceeding (NVMTS), pages 1–5, Nov 2011. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{raoux2011nvmts, \nauthor={Simone Raoux and Huai-Yu Cheng and Jury Sandrini and Jing Li and Jean Jordan-Sweet}, \nbooktitle={2011 11th Annual Non-Volatile Memory Technology Symposium Proceeding (NVMTS)}, \ntitle={Materials engineering for Phase Change Random Access Memory}, \nyear={2011}, \nvolume={}, \nnumber={}, \npages={1--5}, \nkeywords={conference, X-ray diffraction,antimony alloys,crystallisation,germanium alloys,phase change materials,phase change memories,tellurium alloys,GeSbTe,amorphous phase,crystallization temperature,electrical contrast,materials ewngineering,phase change random access memory,rhombohedral phase,temperature 200 degC,time resolved X-ray diffraction,Phase Change Materials,Phase Change Random Access Memory}, \ndoi={10.1109/NVMTS.2011.6137090}, \nISSN={}, \nmonth={Nov},}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Explore physical origins of resistance drift in phase change memory and its implication for drift-insensitive materials.\n \n \n \n\n\n \n Li, J.; Luan, B.; Hsu, T. H.; Zhu, Y.; Martyna, G.; Newns, D.; Cheng, H. Y.; Raoux, S.; Lung, H. L.; and Lam, C.\n\n\n \n\n\n\n In 2011 International Electron Devices Meeting (IEDM), pages 12.5.1–12.5.4, Dec 2011. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{li2011iedm, \nauthor={Jing Li and Binquan Luan and T. H. Hsu and Y. Zhu and G. Martyna and D. Newns and H. Y. Cheng and S. Raoux and H. L. Lung and C. Lam}, \nbooktitle={2011 International Electron Devices Meeting (<strong>IEDM</strong>)}, \ntitle={Explore physical origins of resistance drift in phase change memory and its implication for drift-insensitive materials}, \nyear={2011}, \nvolume={}, \nnumber={}, \npages={12.5.1--12.5.4}, \nkeywords={conference, amorphous semiconductors,antimony alloys,atomic structure,germanium alloys,phase change materials,phase change memories,tellurium alloys,Ge,Sb,Te,amorphous germanium,atomic structure,drift-insensitive phase change material,electrical characteristics,first principle ab initio method,material-device characterization,phase change memory,resistance drift,tellurium ternary alloys,Conductivity,Phase change materials,Phase change memory,Programming,Resistance,Temperature measurement}, \ndoi={10.1109/IEDM.2011.6131541}, \nISSN={0163-1918}, \nmonth={Dec},\n%note={(Acceptance Rate*: <u>33\\%</u>)},\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n A low power phase change memory using thermally confined TaN/TiN bottom electrode.\n \n \n \n\n\n \n Wu, J. Y.; Breitwisch, M.; Kim, S.; Hsu, T. H.; Cheek, R.; Du, P. Y.; Li, J.; Lai, E. K.; Zhu, Y.; Wang, T. Y.; Cheng, H. Y.; Schrott, A.; Joseph, E. A.; Dasaka, R.; Raoux, S.; Lee, M. H.; Lung, H. L.; and Lam, C.\n\n\n \n\n\n\n In 2011 International Electron Devices Meeting (IEDM), pages 3.2.1–3.2.4, Dec 2011. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{wu2011iedm, \nauthor={J. Y. Wu and M. Breitwisch and S. Kim and T. H. Hsu and R. Cheek and P. Y. Du and Jing Li and E. K. Lai and Y. Zhu and T. Y. Wang and H. Y. Cheng and A. Schrott and E. A. Joseph and R. Dasaka and S. Raoux and M. H. Lee and H. L. Lung and C. Lam}, \nbooktitle={2011 International Electron Devices Meeting (<strong>IEDM</strong>)}, \ntitle={A low power phase change memory using thermally confined {TaN/TiN} bottom electrode}, \nyear={2011}, \nvolume={}, \nnumber={}, \npages={3.2.1--3.2.4}, \nkeywords={conference, conductors (electric),electrodes,heat losses,integrated circuit reliability,low-power electronics,phase change memories,tantalum compounds,thermal insulation,titanium compounds,TaN-TiN,current 30 muA,electrical conductivity,electrothermal simulation,low power PCM,low power phase change memory,size 1.5 nm,size 39 nm,storage capacity 256 Mbit,thermal barrier,thermal insulation,thermally confined bottom electrode,Electrodes,Heating,Phase change memory,Solids,Thermal resistance,Tin}, \ndoi={10.1109/IEDM.2011.6131479}, \nISSN={0163-1918}, \nmonth={Dec},\n%note={(Acceptance Rate*: <u>33\\%</u>)},\n}\n\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Post-silicon calibration of analog CMOS using phase-change memory cells.\n \n \n \n\n\n \n WenS, C.; Paramesh, J.; Pileggi, L.; Li, J.; Kim, S.; Proesel, J.; and Lam, C.\n\n\n \n\n\n\n In 2011 Proceedings of the ESSCIRC (ESSCIRC), pages 423–426, Sept 2011. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{wen2011esscirc, \nauthor={ Cheng-Yuan Wen<sup>S</sup> and Jeyanandh Paramesh and Larry Pileggi and Jing Li and SangBum Kim and Jonathan Proesel and Chung Lam}, \nbooktitle={2011 Proceedings of the ESSCIRC (<strong>ESSCIRC</strong>)}, \ntitle={Post-silicon calibration of analog {CMOS} using phase-change memory cells}, \nyear={2011}, \nvolume={}, \nnumber={}, \npages={423--426}, \nkeywords={conference, CMOS analogue integrated circuits,antimony compounds,calibration,chalcogenide glasses,comparators (circuits),elemental semiconductors,germanium compounds,phase change memories,redundancy,silicon,tellurium compounds,Ge2Sb2Te5,IBM CMOS technology,PCRAM mushroom cells,Si,analog CMOS,capacitance 4.41 fF,combinatorial redundancy,digital calibration,embedded GST,nonvolatile phase-change random access memory cells,offset-minimized CMOS comparator,post-manufacturing calibration,post-silicon calibration,power 55.42 muW,size 90 nm,switchable resistances,voltage 1 V,Arrays,CMOS integrated circuits,Calibration,Generators,Phase change random access memory,Redundancy,Resistance}, \ndoi={10.1109/ESSCIRC.2011.6044997}, \nISSN={1930-8833}, \nmonth={Sept},\n%note={(Acceptance Rate: <u>38\\%</u>, 121 out of 314)},\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n A non-volatile look-up table design using PCM (phase-change memory) cells.\n \n \n \n\n\n \n WenS, C. Y.; Li, J.; Kim, S.; Breitwisch, M.; Lam, C.; Paramesh, J.; and Pileggi, L. T.\n\n\n \n\n\n\n In 2011 Symposium on VLSI Circuits - Digest of Technical Papers, pages 302–303, June 2011. \n (Acceptance Rate: 28%, 115 out of 409)\n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{wen2011vlci, \nauthor={C. Y. Wen<sup>S</sup> and Jing Li and S. Kim and M. Breitwisch and C. Lam and J. Paramesh and L. T. Pileggi}, \nbooktitle={2011 Symposium on VLSI Circuits - Digest of Technical Papers}, \ntitle={A non-volatile look-up table design using {PCM} (phase-change memory) cells}, \nyear={2011}, \ndate={2011-06-15},\nvolume={}, \nnumber={}, \npages={302--303}, \nkeywords={conference, CMOS memory circuits,antimony compounds,chalcogenide glasses,germanium compounds,logic circuits,phase change memories,programmable circuits,random-access storage,tellurium compounds,CMOS technology,Ge2Sb2Te5,PCM mushroom cell,digital look-up table circuit,nonvolatile logic functions,nonvolatile look-up table design,phase-change memory,programmable logic functions,resistance transformation ratio,size 90 nm,voltage 1 V,CMOS integrated circuits,Logic gates,Phase change materials,Phase change random access memory,Resistance,Table lookup}, \ndoi={}, \nISSN={2158-5636}, \nmonth={June},\nnote={(Acceptance Rate: <u>28\\%</u>, 115 out of 409)},\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n A Novel Reconfigurable Sensing Scheme for Variable Level Storage in Phase Change Memory.\n \n \n \n\n\n \n Li, J.; Wu, C. I.; Lewis, S. C.; Morrish, J.; Wang, T. Y.; Jordan, R.; Maffitt, T.; Breitwisch, M.; Schrott, A.; Cheek, R.; Lung, H. L.; and Lam, C.\n\n\n \n\n\n\n In 2011 3rd IEEE International Memory Workshop (IMW), pages 1–4, May 2011. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{li2011imw, \nauthor={Jing Li and C. I. Wu and S. C. Lewis and J. Morrish and T. Y. Wang and R. Jordan and T. Maffitt and M. Breitwisch and A. Schrott and R. Cheek and H. L. Lung and C. Lam}, \nbooktitle={2011 3rd IEEE International Memory Workshop (<strong>IMW</strong>)}, \ntitle={A Novel Reconfigurable Sensing Scheme for Variable Level Storage in Phase Change Memory}, \nyear={2011}, \nvolume={}, \nnumber={}, \npages={1--4}, \nkeywords={conference, CMOS digital integrated circuits,NAND circuits,flash memories,phase change memories,2Mcell PCM chip,CMOS technology,NAND flash,analog resistance levels,frequency 50 MHz,phase change memory,reconfigurable sensing scheme,size 90 nm,time 35 mus to 50 mus,time 5 mus,variable level storage,word length 8 bit,Clocks,Electrical resistance measurement,Flash memory,Phase change materials,Radiation detectors,Resistance}, \ndoi={10.1109/IMW.2011.5873227}, \nISSN={2159-483X}, \nmonth={May},}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Demonstration of CAM and TCAM Using Phase Change Devices.\n \n \n \n\n\n \n Rajendran, B.; Cheek, R. W.; Lastras, L. A.; Franceschini, M. M.; Breitwisch, M. J.; Schrott, A. G.; Li, J.; Montoye, R. K.; Chang, L.; and Lam, C.\n\n\n \n\n\n\n In 2011 3rd IEEE International Memory Workshop (IMW), pages 1–4, May 2011. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{rajendran2011imw, \nauthor={B. Rajendran and R. W. Cheek and L. A. Lastras and M. M. Franceschini and M. J. Breitwisch and A. G. Schrott and Jing Li and R. K. Montoye and L. Chang and C. Lam}, \nbooktitle={2011 3rd IEEE International Memory Workshop (<strong>IMW</strong>)}, \ntitle={Demonstration of {CAM} and {TCAM} Using Phase Change Devices}, \nyear={2011}, \nvolume={}, \nnumber={}, \npages={1--4}, \nkeywords={conference, Monte Carlo methods,content-addressable storage,phase change memories,Monte-Carlo simulation,PCM decives,SRAM,TCAM,content addressable memory,phase change devices,phase change memory technology,ternary CAM,Arrays,Computer aided manufacturing,FETs,Phase change materials,Programming,Resistance,Resistors}, \ndoi={10.1109/IMW.2011.5873229}, \nISSN={2159-483X}, \nmonth={May},}\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2010\n \n \n (2)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n Design Paradigm for Robust Spin-Torque Transfer Magnetic RAM (STT MRAM) From Circuit/Architecture Perspective (best paper).\n \n \n \n\n\n \n Li, J.; Ndai, P.; Goel, A.; Salahuddin, S.; and Roy, K.\n\n\n \n\n\n\n IEEE Transactions on Very Large Scale Integration (VLSI) Systems, 18(12): 1710–1723. Dec 2010.\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@ARTICLE{li2010tvlsi, \nauthor={Jing Li and Patrick Ndai and Ashish Goel and Sayeef Salahuddin and Kaushik Roy}, \njournal={IEEE Transactions on Very Large Scale Integration (VLSI) Systems}, \ntitle={Design Paradigm for Robust Spin-Torque Transfer Magnetic {RAM} ({STT} {MRAM}) From Circuit/Architecture Perspective (<strong>best paper</strong>)}, \nyear={2010}, \nvolume={18}, \nnumber={12}, \npages={1710--1723}, \nkeywords={journal, integrated circuit design,magnetic storage,random-access storage,high memory yield,parametric failures,process variations,robust spin-torque transfer magnetic RAM,Circuit stability,Costs,Failure analysis,Flash memory,Magnetic circuits,Performance analysis,Random access memory,Read-write memory,Robustness,Scalability,Spin-torque transfer (STT),magnetic ram (MRAM),memory yield,parametric failures}, \ndoi={10.1109/TVLSI.2009.2027907}, \nISSN={1063-8210}, \nmonth={Dec},\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Variable-Latency Adder (VL-Adder) Designs for Low Power and NBTI Tolerance.\n \n \n \n\n\n \n Chen, Y.; Li, H.; Koh, C.; Sun, G.; Li, J.; Xie, Y.; and Roy, K.\n\n\n \n\n\n\n IEEE Transactions on Very Large Scale Integration (VLSI) Systems, 18(11): 1621–1624. Nov 2010.\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@ARTICLE{chen2010tvlsi, \nauthor={Yiran Chen and Hai Li and Cheng-Kok Koh and Guangyu Sun and Jing Li and Yuan Xie and Kaushik Roy}, \njournal={IEEE Transactions on Very Large Scale Integration (VLSI) Systems}, \ntitle={Variable-Latency Adder ({VL-Adder}) Designs for Low Power and {NBTI} Tolerance}, \nyear={2010}, \nvolume={18}, \nnumber={11}, \npages={1621--1624}, \nkeywords={journal, adders,digital arithmetic,integrated circuit design,logic design,IC design,NBTI tolerance,circuit delay,digital arithmetic,logic design,negative bias temperature instability,variable-latency adder designs,word length 64 bit,Adders,Circuits,Clocks,Delay,Negative bias temperature instability,Niobium compounds,Sun,Throughput,Titanium compounds,Very large scale integration,Digital arithmetic,IC design,logic design}, \ndoi={10.1109/TVLSI.2009.2026280}, \nISSN={1063-8210}, \nmonth={Nov},\n}\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2009\n \n \n (6)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n Variation Estimation and Compensation Technique in Scaled LTPS TFT Circuits for Low-Power Low-Cost Applications.\n \n \n \n\n\n \n Li, J.; Kang, K.; and Roy, K.\n\n\n \n\n\n\n IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems, 28(1): 46–59. Jan 2009.\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@ARTICLE{li2009tcad, \nauthor={Jing Li and Kunhyuk Kang and Kaushik Roy}, \njournal={IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems}, \ntitle={Variation Estimation and Compensation Technique in Scaled {LTPS} {TFT} Circuits for Low-Power Low-Cost Applications}, \nyear={2009}, \nvolume={28}, \nnumber={1}, \npages={46--59}, \nkeywords={journal, CMOS integrated circuits,circuit reliability,elemental semiconductors,low-power electronics,silicon,statistical analysis,thin film transistors,CMOS technology,Si,circuit reliability,compensation technique,delay variation,four-finger structure,inverter chain,low-power low-cost application,low-temperature polycrystalline-silicon thin-film transistor,multifinger design technique,multimodal delay distribution,response surface method,statistical simulation methodology,unimodal distribution,variation estimation,CMOS logic circuits,CMOS technology,Circuit simulation,Delay,Grain boundaries,Logic devices,Response surface methodology,Robustness,Substrates,Thin film transistors,Grain boundary (GB),low-temperature polycrystalline-silicon (LTPS),process variation,thin-film transistor (TFT)}, \ndoi={10.1109/TCAD.2008.2009149}, \nISSN={0278-0070}, \nmonth={Jan},\nkeywords={journal}}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Physical model of the impact of metal grain work function variability on emerging dual metal gate MOSFETs and its implication for SRAM reliability.\n \n \n \n\n\n \n Zhang, X.; Li, J.; Grubbs, M.; Deal, M.; Magyari-Köpe, B.; Clemens, B. M.; and Nishi, Y.\n\n\n \n\n\n\n In 2009 IEEE International Electron Devices Meeting (IEDM), pages 1–4, Dec 2009. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{zhang2009iedm, \nauthor={Xiao Zhang and Jing Li and M. Grubbs and M. Deal and B. Magyari-Köpe and B. M. Clemens and Y. Nishi}, \nbooktitle={2009 IEEE International Electron Devices Meeting (<strong>IEDM</strong>)}, \ntitle={Physical model of the impact of metal grain work function variability on emerging dual metal gate {MOSFETs} and its implication for {SRAM} reliability}, \nyear={2009}, \ndate={2009-12},\nvolume={}, \nnumber={}, \npages={1--4}, \nkeywords={conference, MOS integrated circuits,MOSFET,SRAM chips,integrated circuit metallisation,integrated circuit reliability,work function,SRAM reliability,dual metal gate MOSFET,grain orientation difference,metal grain work function variability,polycrystalline metal gate,size 22 nm,Charge carrier density,Circuit analysis,Electrodes,Fluctuations,High K dielectric materials,MOSFETs,Predictive models,Random access memory,Resource description framework,Semiconductor process modeling}, \ndoi={10.1109/IEDM.2009.5424420}, \nISSN={0163-1918}, \nmonth={Dec},\n%note={(Acceptance Rate*: <u>33\\%</u>)},\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Robust Heterogeneous System Design in Spintronics: Error Resilient Spin Torque MRAM (STT MRAM) Design.\n \n \n \n\n\n \n Li, J.; and Roy, K.\n\n\n \n\n\n\n In the 46th Annual Design Automation Conference PHD Forum, of DAC '09, 2009. \n (Acceptance Rate: 22%, 148 out of 684)\n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{li2009dac,\n author = {Li, Jing and Roy, Kaushik},\n title = {Robust Heterogeneous System Design in Spintronics: Error Resilient Spin Torque {MRAM} ({STT MRAM}) Design},\n booktitle = {the 46th Annual Design Automation Conference PHD Forum},\n series = {<strong>DAC</strong> '09},\n year = {2009},\n keywords = {conference},\n note = {(Acceptance Rate: <u>22\\%</u>, 148 out of 684)},\n} \n\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n An Alternate Design Paradigm for Robust Spin-torque Transfer Magnetic RAM (STT MRAM) from Circuit/Architecture Perspective.\n \n \n \n \n\n\n \n Li, J.; Ndai, P.; Goel, A.; Liu, H.; and Roy, K.\n\n\n \n\n\n\n In Proceedings of the 2009 Asia and South Pacific Design Automation Conference, of ASP-DAC '09, pages 841–846, Piscataway, NJ, USA, Jan 2009. IEEE Press\n \n\n\n\n
\n\n\n\n \n \n \"AnPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{li2009aspdac,\n author = {Li, Jing and Ndai, Patrick and Goel, Ashish and Liu, Haixin and Roy, Kaushik},\n title = {An Alternate Design Paradigm for Robust Spin-torque Transfer Magnetic {RAM} ({STT MRAM}) from Circuit/Architecture Perspective},\n booktitle = {Proceedings of the 2009 Asia and South Pacific Design Automation Conference},\n series = {<strong>ASP-DAC</strong> '09},\n year = {2009},\n month={Jan},\n date={2009-01-19},\n isbn = {978-1-4244-2748-2},\n location = {Yokohama, Japan},\n pages = {841--846},\n numpages = {6},\n url = {http://dl.acm.org/citation.cfm?id=1509633.1509820},\n doi = {10.1109/ASPDAC.2009.4796585},\n acmid = {1509820},\n publisher = {IEEE Press},\n address = {Piscataway, NJ, USA},\n keywords = {conference, stt mram},\n abstract={Spin-Torque Transfer Magnetic RAM (STT MRAM) is a promising candidate for future embedded applications. It provides desirable memory attributes such as fast access time, low cost, high density and non-volatility. However, variations in process parameters can lead to a large number of cells to fail, severely affecting the yield of the memory array. In this paper, we provide a thorough analysis of the impact of design parameters on parametric failures due to process variations. To achieve high memory yield without incurring expensive technology modification, we developed an alternate design paradigm ---circuit/architecture co-design --- to take advantage of different levels of design hierarchy (circuit and architecture) to improve the yield and memory density. The technique decouples the conflicting design requirements for read stability/writability and density. Consequently, the memory cell failure probability reduces by 48\\% and cell area reduces by 21\\% with negligible performance degradation (~0.4\\%).},\n %note = {(Acceptance Rate: <u>33\\%</u>, 116 out of 355)},\n} \n\n
\n
\n\n\n
\n Spin-Torque Transfer Magnetic RAM (STT MRAM) is a promising candidate for future embedded applications. It provides desirable memory attributes such as fast access time, low cost, high density and non-volatility. However, variations in process parameters can lead to a large number of cells to fail, severely affecting the yield of the memory array. In this paper, we provide a thorough analysis of the impact of design parameters on parametric failures due to process variations. To achieve high memory yield without incurring expensive technology modification, we developed an alternate design paradigm —circuit/architecture co-design — to take advantage of different levels of design hierarchy (circuit and architecture) to improve the yield and memory density. The technique decouples the conflicting design requirements for read stability/writability and density. Consequently, the memory cell failure probability reduces by 48% and cell area reduces by 21% with negligible performance degradation (~0.4%).\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Variation Resilient Spin Torque Transfer MRAM (poster).\n \n \n \n\n\n \n Li, J.; Ndai, P.; Ashish, G.; and Roy, K.\n\n\n \n\n\n\n In GSRC Workshop, Mar 2009. \n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{li2009gsrc,\n author = {Jing  Li and  Patrick  Ndai and Goel  Ashish and  Kaushik  Roy},\n title = {Variation  Resilient  Spin  Torque  Transfer  {MRAM} (poster)},\n booktitle = {GSRC Workshop},\n year = {2009},\n month={Mar},\n location = {Dallas, TX, USA},\n keywords = {conference},\n} \n\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Robust and Energy-efficient Heterogeneous System Design in Emerging Technologies (nominated for Best Thesis Award).\n \n \n \n\n\n \n Li, J.\n\n\n \n\n\n\n Ph.D. Thesis, Electrical and Computer Engineering, 2009.\n Advisor: Prof. Kaushik Roy\n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@Phdthesis{phd,\n  Title                    = {Robust and Energy-efficient Heterogeneous System Design in Emerging Technologies (<strong>nominated for Best Thesis Award</strong>)},\n  Author                   = {Li, Jing},\n  Institution              = {Purdue University},\n  Year                     = {2009},\n\n  School                   = {Electrical and Computer Engineering},\n  note = {Advisor: Prof. Kaushik Roy},\n  keywords = {phd}\n}\n\n%%%%%%%%%% Techncal report %%%%%%%%%%\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2008\n \n \n (5)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n An Alternate Design Paradigm for Low-power, Low-cost, Testable Hybrid Systems Using Scaled LTPS TFTs (invited).\n \n \n \n \n\n\n \n Li, J.; Bansal, A.; Ghosh, S.; and Roy, K.\n\n\n \n\n\n\n J. Emerg. Technol. Comput. Syst., 4(3): 13:1–13:19. Aug 2008.\n \n\n\n\n
\n\n\n\n \n \n \"AnPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{li2008jetc,\n author = {Li, Jing and Bansal, Aditya and Ghosh, Swarop and Roy, Kaushik},\n title = {An Alternate Design Paradigm for Low-power, Low-cost, Testable Hybrid Systems Using Scaled {LTPS} {TFTs} (<strong>invited</strong>)},\n journal = {J. Emerg. Technol. Comput. Syst.},\n issue_date = {August 2008},\n volume = {4},\n number = {3},\n month = {Aug},\n year = {2008},\n issn = {1550-4832},\n pages = {13:1--13:19},\n articleno = {13},\n numpages = {19},\n url = {http://doi.acm.org/10.1145/1389089.1389093},\n doi = {10.1145/1389089.1389093},\n acmid = {1389093},\n publisher = {ACM},\n address = {New York, NY, USA},\n keywords = {3D integration, BIST, DFT, Low-temperature polycrystalline silicon (LTPS), generic, grain boundary (GB), hybrid system, inherent variation, reconfigurable, thin-film transistor (TFT)},\n keywords={journal}\n} \n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Modeling of Failure Probability and Statistical Design of Spin-Torque Transfer Magnetic RAM (STT MRAM) Array for Yield Enhancement.\n \n \n \n\n\n \n Li, J.; and Roy, K.\n\n\n \n\n\n\n In SRC Technology and Talent for the 21st Century Technology (TECHCON), 2008. \n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{li2008techcon,\n author = {Jing  Li and  Kaushik  Roy},\n title = {Modeling of Failure Probability and Statistical Design of Spin-Torque Transfer Magnetic {RAM} ({STT MRAM}) Array for Yield Enhancement},\n booktitle = {SRC Technology and Talent for the 21st Century Technology (TECHCON)},\n year = {2008},\n keywords = {conference},\n} \n\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Variation-tolerant Spin-Torque Transfer (STT) MRAM array for yield enhancement.\n \n \n \n\n\n \n Li, J.; Liu, H.; Salahuddin, S.; and Roy, K.\n\n\n \n\n\n\n In 2008 IEEE Custom Integrated Circuits Conference (CICC), pages 193–196, Sept 2008. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{li2008cicc, \nauthor={Jing Li and Haixin Liu and S. Salahuddin and Kaushik Roy}, \nbooktitle={2008 IEEE Custom Integrated Circuits Conference (<strong>CICC</strong>)}, \ntitle={Variation-tolerant Spin-Torque Transfer ({STT}) {MRAM} array for yield enhancement}, \nyear={2008}, \ndate={2008-09-21},\nvolume={}, \nnumber={}, \npages={193--196}, \nkeywords={conference, Green's function methods,MRAM devices,DRAM,SRAM,flash memories,nonequilibrium Green's function,optimization,variation-tolerant spin-torque transfer MRAM array,yield enhancement,Circuit simulation,Circuit stability,Circuit synthesis,Electrodes,Green's function methods,Magnetic tunneling,Random access memory,Read-write memory,Robust stability,Scalability}, \ndoi={10.1109/CICC.2008.4672056}, \nISSN={0886-5930}, \nmonth={Sept},}\n\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Modeling of failure probability and statistical design of Spin-Torque Transfer Magnetic Random Access Memory (STT MRAM) array for yield enhancement.\n \n \n \n\n\n \n Li, J.; Augustine, C.; Salahuddin, S.; and Roy, K.\n\n\n \n\n\n\n In 2008 45th ACM/IEEE Design Automation Conference (DAC), pages 278–283, June 2008. \n (Acceptance Rate: 23%, 147 out of 639)\n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{li2008dac, \nauthor={Jing Li and Charles Augustine and Sayeef Salahuddin and Kaushik Roy}, \nbooktitle={2008 45th ACM/IEEE Design Automation Conference (<strong>DAC</strong>)}, \ntitle={Modeling of failure probability and statistical design of Spin-Torque Transfer Magnetic Random Access Memory ({STT MRAM}) array for yield enhancement}, \nyear={2008}, \ndate={2008-06-08},\nvolume={}, \nnumber={}, \npages={278--283}, \nkeywords={conference, failure analysis,magnetic storage,magnetoelectronics,optimisation,random-access storage,coupled electromagnetic dynamics,failure probability,on-chip embedded memories,spin-torque transfer magnetic random access memory,spintronic device,statistical optimization methodology,yield enhancement,Couplings,Failure analysis,Flash memory,Magnetic analysis,Magnetic devices,Predictive models,Probability,Random access memory,Read-write memory,Scalability,STT MRAM,Yield}, \ndoi={10.1145/1391469.1391540}, \nISSN={0738-100X}, \nmonth={June},\nnote = {(Acceptance Rate: <u>23\\%</u>, 147 out of 639)},\n}\n\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Body History Study on 12S eDRAM Sensing Operation.\n \n \n \n\n\n \n Li, J.\n\n\n \n\n\n\n Technical Report Semiconductor Research and Development Center (SRDC), IBM, Fishkill, 2008.\n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@techreport{li2008:report:edram,\n  title = {Body History Study on {12S} {eDRAM} Sensing Operation},\n  author = {Jing Li},\n  institution = {Semiconductor Research and Development Center (SRDC), IBM},\n  address = {Fishkill},\n  year = {2008},\n  keywords = {techreport}\n}\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2007\n \n \n (6)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n Poly-Si Thin-Film Transistors: An Efficient and Low-Cost Option for Digital Operation.\n \n \n \n\n\n \n Li, J.; Bansal, A.; and Roy, K.\n\n\n \n\n\n\n IEEE Transactions on Electron Devices, 54(11): 2918-2929. Nov 2007.\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@ARTICLE{li2007ted, \nauthor={Jing Li and Aditya Bansal and Kaushik Roy}, \njournal={IEEE Transactions on Electron Devices}, \ntitle={{Poly-Si} Thin-Film Transistors: An Efficient and Low-Cost Option for Digital Operation}, \nyear={2007}, \nvolume={54}, \nnumber={11}, \npages={2918-2929}, \nkeywords={journal, elemental semiconductors,low-power electronics,silicon,silicon-on-insulator,thin film transistors,LTPS TFT,SOI,Si - Interface,driving current,low-temperature polycrystalline-silicon thin-film transistors,midgap trap density,poly-Si thin-film transistors,silicon-on-insulator,single-crystalline silicon,submicrometer ultralow-power digital operation,ultralow-power subthreshold operation,Costs,Design methodology,Design optimization,Energy consumption,Fabrication,Glass,Polymers,Silicon,Substrates,Thin film transistors,Grain boundary (GB),low-pressure chemical vapor deposition (LPCVD),low-temperature polycrystalline silicon (LTPS),thin-film transistor (TFT)}, \ndoi={10.1109/TED.2007.906940}, \nISSN={0018-9383}, \nmonth={Nov},\n}\n\n%%%%%%%%%% Referred conference %%%%%%%%%%\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n A generic and reconfigurable test paradigm using Low-cost integrated Poly-Si TFTs.\n \n \n \n\n\n \n Li, J.; Ghosh, S.; and Roy, K.\n\n\n \n\n\n\n In 2007 IEEE International Test Conference (ITC), pages 1–10, Oct 2007. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{li2007itc, \nauthor={Jing Li and S. Ghosh and Kaushik Roy}, \nbooktitle={2007 IEEE International Test Conference (<strong>ITC</strong>)}, \ntitle={A generic and reconfigurable test paradigm using Low-cost integrated {Poly-Si TFTs}}, \nyear={2007}, \nvolume={}, \nnumber={}, \npages={1--10}, \nkeywords={conference, VLSI,built-in self test,design for testability,elemental semiconductors,integrated circuit testing,silicon,thin film transistors,3-D technology,BIST components,Si,VLSI systems,configurable design-for-test units,generic test structure,low-cost low-temperature integrated poly-silicon TFT,process tolerant test structure,reconfigurable test structure,thin film transistors,Circuit testing,Costs,Crystallization,Design for testability,Silicon,Substrates,System testing,Temperature,Thin film transistors,Very large scale integration}, \ndoi={10.1109/TEST.2007.4437622}, \nISSN={1089-3539}, \nmonth={Oct},}\n\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Variable-latency adder (VL-adder): new arithmetic circuit design practice to overcome NBTI.\n \n \n \n\n\n \n Chen, Y.; Li, H.; Li, J.; and Koh, C.\n\n\n \n\n\n\n In 2007 ACM/IEEE International Symposium on Low Power Electronics and Design (ISLPED), pages 195–200, Aug 2007. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{li2007islped, \nauthor={Yiran Chen and Hai Li and Jing Li and Cheng-Kok Koh}, \nbooktitle={2007 ACM/IEEE International Symposium on Low Power Electronics and Design (<strong>ISLPED</strong>)}, \ntitle={Variable-latency adder ({VL-adder}): new arithmetic circuit design practice to overcome {NBTI}}, \nyear={2007}, \nvolume={}, \nnumber={}, \npages={195--200}, \nkeywords={conference, MOSFET,adders,logic design,low-power electronics,NBTI-induced delay degradation,NBTI-tolerant techniques,VL-adder,arithmetic circuit design,clock edge,energy efficiency,lower-power adder designs,manufacturing costs,nanoscale PMOS transistors,negative bias temperature instability,variable-latency adder technique,Adders,Arithmetic,Circuit synthesis,Clocks,Degradation,Delay,MOSFETs,Negative bias temperature instability,Niobium compounds,Titanium compounds,negative bias temperature instability (NBTI),variable-latency adder (VL-adder)}, \ndoi={10.1145/1283780.1283822}, \nISSN={}, \nmonth={Aug},\n%note = {(Acceptance Rate: <u>39\\%</u>, 74 out of 192)},\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Low Power and Variation Tolerant Digital Circuit Design in Sub-micron Regime using Low Cost LTPS TFTs.\n \n \n \n\n\n \n Li, J.; and Roy, K.\n\n\n \n\n\n\n In SRC Technology and Talent for the 21st Century Technology (TECHCON), 2007. \n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{li2007techcon,\n author = {Jing  Li and  Kaushik  Roy},\n title = {Low Power and Variation Tolerant Digital Circuit Design in Sub-micron  Regime  using  Low  Cost {LTPS TFTs}},\n booktitle = {SRC Technology and Talent for the 21st Century Technology (TECHCON)},\n year = {2007},\n keywords = {conference},\n} \n\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Novel Variation-Aware Circuit Design of Scaled LTPS TFT for Ultra low Power, Low-Cost Applications.\n \n \n \n\n\n \n Li, J.; Kang, K.; and Roy, K.\n\n\n \n\n\n\n In 2007 IEEE International Conference on Integrated Circuit Design and Technology (ICICDT), pages 1–4, May 2007. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{li2007icicdt, \nauthor={Jing Li and Kunhyuk Kang and Kaushik Roy}, \nbooktitle={2007 IEEE International Conference on Integrated Circuit Design and Technology (<strong>ICICDT</strong>)}, \ntitle={Novel Variation-Aware Circuit Design of Scaled {LTPS TFT} for Ultra low Power, Low-Cost Applications}, \nyear={2007}, \nvolume={}, \nnumber={}, \npages={1--4}, \nkeywords={conference, digital integrated circuits,elemental semiconductors,flexible electronics,grain boundaries,integrated circuit design,low-power electronics,response surface methodology,silicon,thin film transistors,Si,battery-operated portable electronics,defect grain boundary region,device-to-device variation,flexible substrate,low-cost digital design,low-temperature polycrystalline silicon thin film transistors,multifinger parallel structure,power dissipation,response surface method,scaled LTPS TFT,size 200 nm,statistical variation,variation-aware circuit design,voltage 10 V to 20 V,Circuit synthesis,Digital circuits,Flexible printed circuits,Glass,Grain boundaries,Polymers,Silicon,Substrates,Temperature,Thin film transistors,Low-temperature polycrystalline-Silicon (LTPS),Response Surface Method (RSM),grain boundary (GB),thin film transistor (TFT)}, \ndoi={10.1109/ICICDT.2007.4299589}, \nISSN={2381-3555}, \nmonth={May},}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n High Performance and Low Power Electronics on Flexible Substrate.\n \n \n \n\n\n \n Li, J.; Kang, K.; Bansal, A.; and Roy, K.\n\n\n \n\n\n\n In 2007 44th ACM/IEEE Design Automation Conference (DAC), pages 274–275, June 2007. \n (Acceptance Rate*: 13%)\n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{li2007dac, \nauthor={Jing Li and Kunhyuk Kang and Aditya Bansal and Kaushik Roy}, \nbooktitle={2007 44th ACM/IEEE Design Automation Conference (<strong>DAC</strong>)}, \ntitle={High Performance and Low Power Electronics on Flexible Substrate}, \nyear={2007}, \ndate={2007-06},\nvolume={}, \nnumber={}, \npages={274--275}, \nkeywords={conference, flexible electronics,low-power electronics,semiconductor device models,silicon,substrates,thin film transistors,GB-tolerant design,flexible substrate,grain boundaries,low power electronics,polycrystalline silicon thin film transistor,ultra low power digital application,Design methodology,Design optimization,Displays,Electron traps,Grain boundaries,Low power electronics,Silicon,Substrates,Temperature,Thin film transistors,Design,Experimentation,Grain Boundary (GB),Thin Film Transistor (TFT)}, \ndoi={10.1145/1278480.1278550}, \nISSN={0738-100X}, \nmonth={June},\nnote = {(Acceptance Rate*: <u>13\\%</u>)},\n}\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2006\n \n \n (1)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n Exploring Low Temperature Poly-Si for Low Cost and Low Power Sub-micron Digital Operation.\n \n \n \n\n\n \n Li, J.; Bansal, A.; and Roy, K.\n\n\n \n\n\n\n In 2006 64th Device Research Conference (DRC), pages 61–62, June 2006. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@INPROCEEDINGS{li2006drc, \nauthor={Li, Jing and Bansal, Aditya and Roy, Kaushik}, \nbooktitle={2006 64th Device Research Conference (<strong>DRC</strong>)}, \ntitle={Exploring Low Temperature {Poly-Si} for Low Cost and Low Power Sub-micron Digital Operation}, \nyear={2006}, \nvolume={}, \nnumber={}, \npages={61--62}, \nkeywords={conference, Costs,Crystallization,Dielectric substrates,Digital circuits,Fabrication,Grain boundaries,Grain size,Silicon,Temperature,Thin film transistors}, \ndoi={10.1109/DRC.2006.305118}, \nISSN={1548-3770}, \nmonth={June},\n}\n\n%%%%%%%%%% Thesis %%%%%%%%%%\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n\n\n\n
\n\n\n \n\n \n \n \n \n\n
\n"}; document.write(bibbase_data.data);