HACK: Homomorphic Acceleration via Compression of the Key-Value Cache for Disaggregated LLM Inference. Zhang, Z., Shen, H., Vargaftik, S., Basat, R. B., Mitzenmacher, M., & Yu, M. In ACM Special Interest Group on Data Communication (SIGCOMM 2025), Coimbra, Portugal, September, 2025.
Paper doi abstract bibtex Disaggregated Large Language Model (LLM) inference decouples the compute-intensive prefill stage from the memory-intensive decode stage, allowing low-end, compute-focused GPUs for prefill and high-end, memory-rich GPUs for decode, which reduces cost while maintaining high throughput. However, transmitting Key-Value (KV) data between the two stages can be a bottleneck, especially for long prompts. Additionally, the computational overhead in the two stages is key for optimizing Job Completion Time (JCT), and KV data size can become prohibitive for long prompts and sequences. Existing KV quantization methods can alleviate transmission and memory bottlenecks, but they introduce significant dequantization overhead, exacerbating the computation time.
@comment{zhang_hack_2025: SIGCOMM 2025 conference paper. The doi field holds the bare DOI (no resolver prefix) so that styles can build the link themselves; the url field is the GitHub link carried over from the original record.}
@inproceedings{zhang_hack_2025,
  author    = {Zhang, Zeyu and Shen, Haiying and Vargaftik, Shay and Basat, Ran Ben and Mitzenmacher, Michael and Yu, Minlan},
  title     = {{HACK}: {Homomorphic} {Acceleration} via {Compression} of the {Key}-{Value} {Cache} for {Disaggregated} {LLM} {Inference}},
  booktitle = {{ACM} {Special} {Interest} {Group} on {Data} {Communication} ({SIGCOMM} 2025)},
  address   = {Coimbra, Portugal},
  month     = sep,
  year      = {2025},
  doi       = {10.1145/3718958.3750481},
  url       = {https://github.com/pcl-projects/HACK},
  language  = {en},
  keywords  = {Explorable, Foundational, SYS: CosmicAI Contact Author, WG: Explorable},
  abstract  = {Disaggregated Large Language Model (LLM) inference decouples the compute-intensive prefill stage from the memory-intensive decode stage, allowing low-end, compute-focused GPUs for prefill and high-end, memory-rich GPUs for decode, which reduces cost while maintaining high throughput. However, transmitting Key-Value (KV) data between the two stages can be a bottleneck, especially for long prompts. Additionally, the computational overhead in the two stages is key for optimizing Job Completion Time (JCT), and KV data size can become prohibitive for long prompts and sequences. Existing KV quantization methods can alleviate transmission and memory bottlenecks, but they introduce significant dequantization overhead, exacerbating the computation time.},
}
Downloads: 0
{"_id":"g3XxWEXoZPoTd8yGr","bibbaseid":"zhang-shen-vargaftik-basat-mitzenmacher-yu-hackhomomorphicaccelerationviacompressionofthekeyvaluecachefordisaggregatedllminference-2025","author_short":["Zhang, Z.","Shen, H.","Vargaftik, S.","Basat, R. B.","Mitzenmacher, M.","Yu, M."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","address":"Coimbra, Portugal","title":"HACK: Homomorphic Acceleration via Compression of the Key-Value Cache for Disaggregated LLM Inference","url":"https://github.com/pcl-projects/HACK","doi":"https://doi.org/10.1145/3718958.3750481","abstract":"Disaggregated Large Language Model (LLM) inference decouples the compute-intensive prefill stage from the memory-intensive decode stage, allowing low-end, compute-focused GPUs for prefill and high-end, memory-rich GPUs for decode, which reduces cost while maintaining high throughput. However, transmitting Key-Value (KV) data between the two stages can be a bottleneck, especially for long prompts. Additionally, the computational overhead in the two stages is key for optimizing Job Completion Time (JCT), and KV data size can become prohibitive for long prompts and sequences. 
Existing KV quantization methods can alleviate transmission and memory bottlenecks, but they introduce significant dequantization overhead, exacerbating the computation time.","language":"en","booktitle":"ACM Special Interest Group on Data Communication (SIGCOMM 2025)","author":[{"propositions":[],"lastnames":["Zhang"],"firstnames":["Zeyu"],"suffixes":[]},{"propositions":[],"lastnames":["Shen"],"firstnames":["Haiying"],"suffixes":[]},{"propositions":[],"lastnames":["Vargaftik"],"firstnames":["Shay"],"suffixes":[]},{"propositions":[],"lastnames":["Basat"],"firstnames":["Ran","Ben"],"suffixes":[]},{"propositions":[],"lastnames":["Mitzenmacher"],"firstnames":["Michael"],"suffixes":[]},{"propositions":[],"lastnames":["Yu"],"firstnames":["Minlan"],"suffixes":[]}],"month":"September","year":"2025","keywords":"Explorable, Foundational, SYS: CosmicAI Contact Author, WG: Explorable","bibtex":"@inproceedings{zhang_hack_2025,\n\taddress = {Coimbra, Portugal},\n\ttitle = {{HACK}: {Homomorphic} {Acceleration} via {Compression} of the {Key}-{Value} {Cache} for {Disaggregated} {LLM} {Inference}},\n\turl = {https://github.com/pcl-projects/HACK},\n\tdoi = {https://doi.org/10.1145/3718958.3750481},\n\tabstract = {Disaggregated Large Language Model (LLM) inference decouples the compute-intensive prefill stage from the memory-intensive decode stage, allowing low-end, compute-focused GPUs for prefill and high-end, memory-rich GPUs for decode, which reduces cost while maintaining high throughput. However, transmitting Key-Value (KV) data between the two stages can be a bottleneck, especially for long prompts. Additionally, the computational overhead in the two stages is key for optimizing Job Completion Time (JCT), and KV data size can become prohibitive for long prompts and sequences. 
Existing KV quantization methods can alleviate transmission and memory bottlenecks, but they introduce significant dequantization overhead, exacerbating the computation time.},\n\tlanguage = {en},\n\tbooktitle = {{ACM} {Special} {Interest} {Group} on {Data} {Communication} ({SIGCOMM} 2025)},\n\tauthor = {Zhang, Zeyu and Shen, Haiying and Vargaftik, Shay and Basat, Ran Ben and Mitzenmacher, Michael and Yu, Minlan},\n\tmonth = sep,\n\tyear = {2025},\n\tkeywords = {Explorable, Foundational, SYS: CosmicAI Contact Author, WG: Explorable},\n}\n\n\n\n","author_short":["Zhang, Z.","Shen, H.","Vargaftik, S.","Basat, R. B.","Mitzenmacher, M.","Yu, M."],"key":"zhang_hack_2025","id":"zhang_hack_2025","bibbaseid":"zhang-shen-vargaftik-basat-mitzenmacher-yu-hackhomomorphicaccelerationviacompressionofthekeyvaluecachefordisaggregatedllminference-2025","role":"author","urls":{"Paper":"https://github.com/pcl-projects/HACK"},"keyword":["Explorable","Foundational","SYS: CosmicAI Contact Author","WG: Explorable"],"metadata":{"authorlinks":{}}},"bibtype":"inproceedings","biburl":"https://bibbase.org/zotero-group/pratikmhatre/5933976","dataSources":["yJr5AAtJ5Sz3Q4WT4"],"keywords":["explorable","foundational","sys: cosmicai contact author","wg: explorable"],"search_terms":["hack","homomorphic","acceleration","via","compression","key","value","cache","disaggregated","llm","inference","zhang","shen","vargaftik","basat","mitzenmacher","yu"],"title":"HACK: Homomorphic Acceleration via Compression of the Key-Value Cache for Disaggregated LLM Inference","year":2025}