SMALLTOLARGE (S2L): Scalable Data Selection for Fine-tuning Large Language Models by Summarizing Training Loss Trajectories of Small Models. Yang, Y., Mishra, S., Chiang, J., & Mirzasoleiman, B. In The 38th Conference on Neural Information Processing Systems (NeurIPS 2024), Vancouver, Canada, December, 2024.
doi  abstract   bibtex   
Despite the effectiveness of data selection for pretraining and instruction fine-tuning large language models (LLMs), improving data efficiency in supervised fine-tuning (SFT) for specialized domains poses significant challenges due to the complexity of fine-tuning data. To bridge this gap, we introduce an effective and scalable data selection method for SFT, SmallToLarge (S2L), which trains a small model, clusters loss trajectories of the examples, and samples from these clusters to guide data selection for larger models. We prove that during fine-tuning, samples within the same loss trajectory cluster exhibit similar gradients. Then, we show that S2L subsets have a bounded gradient error w.r.t. the full data, hence guarantee convergence to the neighborhood of the optimal solution. We demonstrate through extensive experiments that S2L significantly improves data efficiency in SFT for mathematical problem-solving, reducing the training data requirement to just 11% of the original MathInstruct dataset to match full dataset performance while outperforming state-of-the-art data selection algorithms by an average of 4.7% across 6 in- and out-domain evaluation datasets. Remarkably, selecting only 50K data for SFT, S2L achieves a 32.7% accuracy on the challenging MATH benchmark, improving Phi-2 by 16.6%. In clinical text summarization on the MIMIC-III dataset, S2L again outperforms training on the full dataset using only 50% of the data. Notably, S2L can perform scalable data selection using a reference model 100× smaller than the target model, proportionally reducing the computational cost.
@comment{Conference paper entry (inproceedings). The doi field holds the bare DOI with no resolver prefix, so the bibliography style can build the link itself. The address field holds the conference city as exported.}
@inproceedings{yang_smalltolarge_2024,
	address = {Vancouver, Canada},
	title = {{SMALLTOLARGE} ({S2L}): {Scalable} {Data} {Selection} for {Fine}-tuning {Large} {Language} {Models} by {Summarizing} {Training} {Loss} {Trajectories} of {Small} {Models}},
	doi = {10.52202/079017-2655},
	abstract = {Despite the effectiveness of data selection for pretraining and instruction fine-tuning large language models (LLMs), improving data efficiency in supervised fine-tuning (SFT) for specialized domains poses significant challenges due to the complexity of fine-tuning data. To bridge this gap, we introduce an effective and scalable data selection method for SFT, SmallToLarge (S2L), which trains a small model, clusters loss trajectories of the examples, and samples from these clusters to guide data selection for larger models. We prove that during fine-tuning, samples within the same loss trajectory cluster exhibit similar gradients. Then, we show that S2L subsets have a bounded gradient error w.r.t. the full data, hence guarantee convergence to the neighborhood of the optimal solution. We demonstrate through extensive experiments that S2L significantly improves data efficiency in SFT for mathematical problem-solving, reducing the training data requirement to just 11\% of the original MathInstruct dataset to match full dataset performance while outperforming state-of-the-art data selection algorithms by an average of 4.7\% across 6 in- and out-domain evaluation datasets. Remarkably, selecting only 50K data for SFT, S2L achieves a 32.7\% accuracy on the challenging MATH benchmark, improving Phi-2 by 16.6\%. In clinical text summarization on the MIMIC-III dataset, S2L again outperforms training on the full dataset using only 50\% of the data. Notably, S2L can perform scalable data selection using a reference model 100$\times$ smaller than the target model, proportionally reducing the computational cost.},
	language = {en},
	booktitle = {The 38th {Conference} on {Neural} {Information} {Processing} {Systems} ({NeurIPS} 2024)},
	author = {Yang, Yu and Mishra, Siddhartha and Chiang, Jeffrey and Mirzasoleiman, Baharan},
	month = dec,
	year = {2024},
	keywords = {Foundational, SYS: CosmicAI Contact Author, WG: Explorable},
}

Downloads: 0