HACK: Homomorphic Acceleration via Compression of the Key-Value Cache for Disaggregated LLM Inference

HACK: Homomorphic Acceleration via Compression of the Key-Value Cache for Disaggregated LLM Inference. Zhang, Z., Shen, H., Vargaftik, S., Basat, R. B., Mitzenmacher, M., & Yu, M. Coimbra, Portugal, September 2025. https://github.com/pcl-projects/HACK

Paper doi abstract bibtex

Disaggregated Large Language Model (LLM) inference decouples the compute-intensive prefill stage from the memory-intensive decode stage, allowing low-end, compute-focused GPUs for prefill and high-end, memory-rich GPUs for decode, which reduces cost while maintaining high throughput. However, transmitting Key-Value (KV) data between the two stages can be a bottleneck, especially for long prompts. Additionally, the computational overhead in the two stages is key for optimizing Job Completion Time (JCT), and KV data size can become prohibitive for long prompts and sequences. Existing KV quantization methods can alleviate transmission and memory bottlenecks, but they introduce significant dequantization overhead, exacerbating the computation time.

@comment{zhang_hack_2025: the doi field stores the bare DOI; the url field keeps the full resolver link. NOTE(review): booktitle, which is required for inproceedings entries, is absent from the exported record and should be filled in from the publisher page. The address field presumably holds the conference location rather than a publisher city -- confirm before relying on it.}
@inproceedings{zhang_hack_2025,
	address = {Coimbra, Portugal},
	title = {{HACK}: {Homomorphic} {Acceleration} via {Compression} of the {Key}-{Value} {Cache} for {Disaggregated} {LLM} {Inference}},
	url = {https://doi.org/10.1145/3718958.3750481},
	doi = {10.1145/3718958.3750481},
	abstract = {Disaggregated Large Language Model (LLM) inference decouples the compute-intensive prefill stage from the memory-intensive decode stage, allowing low-end, compute-focused GPUs for prefill and high-end, memory-rich GPUs for decode, which reduces cost while maintaining high throughput. However, transmitting Key-Value (KV) data between the two stages can be a bottleneck, especially for long prompts. Additionally, the computational overhead in the two stages is key for optimizing Job Completion Time (JCT), and KV data size can become prohibitive for long prompts and sequences. Existing KV quantization methods can alleviate transmission and memory bottlenecks, but they introduce significant dequantization overhead, exacerbating the computation time.},
	language = {en},
	author = {Zhang, Zeyu and Shen, Haiying and Vargaftik, Shay and Ben Basat, Ran and Mitzenmacher, Michael and Yu, Minlan},
	month = sep,
	year = {2025},
	note = {https://github.com/pcl-projects/HACK},
	keywords = {Explorable},
}

Downloads: 0

{"_id":"g3XxWEXoZPoTd8yGr","bibbaseid":"zhang-shen-vargaftik-basat-mitzenmacher-yu-hackhomomorphicaccelerationviacompressionofthekeyvaluecachefordisaggregatedllminference-2025","author_short":["Zhang, Z.","Shen, H.","Vargaftik, S.","Basat, R. B.","Mitzenmacher, M.","Yu, M."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","address":"Coimbra, Portugal","title":"HACK: Homomorphic Acceleration via Compression of the Key-Value Cache for Disaggregated LLM Inference","url":"https://doi.org/10.1145/3718958.3750481","doi":"https://doi.org/10.1145/3718958.3750481","abstract":"Disaggregated Large Language Model (LLM) inference decouples the compute-intensive prefill stage from the memory-intensive decode stage, allowing low-end, compute-focused GPUs for prefill and high-end, memory-rich GPUs for decode, which reduces cost while maintaining high throughput. However, transmitting Key-Value (KV) data between the two stages can be a bottleneck, especially for long prompts. Additionally, the computational overhead in the two stages is key for optimizing Job Completion Time (JCT), and KV data size can become prohibitive for long prompts and sequences. 
Existing KV quantization methods can alleviate transmission and memory bottlenecks, but they introduce significant dequantization overhead, exacerbating the computation time.","language":"en","author":[{"propositions":[],"lastnames":["Zhang"],"firstnames":["Zeyu"],"suffixes":[]},{"propositions":[],"lastnames":["Shen"],"firstnames":["Haiying"],"suffixes":[]},{"propositions":[],"lastnames":["Vargaftik"],"firstnames":["Shay"],"suffixes":[]},{"propositions":[],"lastnames":["Basat"],"firstnames":["Ran","Ben"],"suffixes":[]},{"propositions":[],"lastnames":["Mitzenmacher"],"firstnames":["Michael"],"suffixes":[]},{"propositions":[],"lastnames":["Yu"],"firstnames":["Minlan"],"suffixes":[]}],"month":"September","year":"2025","note":"https://github.com/pcl-projects/HACK","keywords":"Explorable","bibtex":"@inproceedings{zhang_hack_2025,\n\taddress = {Coimbra, Portugal},\n\ttitle = {{HACK}: {Homomorphic} {Acceleration} via {Compression} of the {Key}-{Value} {Cache} for {Disaggregated} {LLM} {Inference}},\n\turl = {https://doi.org/10.1145/3718958.3750481},\n\tdoi = {https://doi.org/10.1145/3718958.3750481},\n\tabstract = {Disaggregated Large Language Model (LLM) inference decouples the compute-intensive prefill stage from the memory-intensive decode stage, allowing low-end, compute-focused GPUs for prefill and high-end, memory-rich GPUs for decode, which reduces cost while maintaining high throughput. However, transmitting Key-Value (KV) data between the two stages can be a bottleneck, especially for long prompts. Additionally, the computational overhead in the two stages is key for optimizing Job Completion Time (JCT), and KV data size can become prohibitive for long prompts and sequences. 
Existing KV quantization methods can alleviate transmission and memory bottlenecks, but they introduce significant dequantization overhead, exacerbating the computation time.},\n\tlanguage = {en},\n\tauthor = {Zhang, Zeyu and Shen, Haiying and Vargaftik, Shay and Basat, Ran Ben and Mitzenmacher, Michael and Yu, Minlan},\n\tmonth = sep,\n\tyear = {2025},\n\tnote = {https://github.com/pcl-projects/HACK},\n\tkeywords = {Explorable},\n}\n\n\n\n","author_short":["Zhang, Z.","Shen, H.","Vargaftik, S.","Basat, R. B.","Mitzenmacher, M.","Yu, M."],"key":"zhang_hack_2025","id":"zhang_hack_2025","bibbaseid":"zhang-shen-vargaftik-basat-mitzenmacher-yu-hackhomomorphicaccelerationviacompressionofthekeyvaluecachefordisaggregatedllminference-2025","role":"author","urls":{"Paper":"https://doi.org/10.1145/3718958.3750481"},"keyword":["Explorable"],"metadata":{"authorlinks":{}}},"bibtype":"inproceedings","biburl":"https://bibbase.org/zotero-group/pratikmhatre/5933976","dataSources":["yJr5AAtJ5Sz3Q4WT4"],"keywords":["explorable"],"search_terms":["hack","homomorphic","acceleration","via","compression","key","value","cache","disaggregated","llm","inference","zhang","shen","vargaftik","basat","mitzenmacher","yu"],"title":"HACK: Homomorphic Acceleration via Compression of the Key-Value Cache for Disaggregated LLM Inference","year":2025}