We take actions in a 3D simulator and use privileged 3D information about the assets in the scene. Using
natural-language descriptions of the assets, we generate question-answer pairs based on how the 3D structure of the scene changes with the actions taken.
@misc{ray2025satdynamicspatialaptitude,
  author        = {Ray, Arijit and Duan, Jiafei and Brown, Ellis and Tan, Reuben and Bashkirova, Dina and Hendrix, Rose and Ehsani, Kiana and Kembhavi, Aniruddha and Plummer, Bryan A. and Krishna, Ranjay and Zeng, Kuo-Hao and Saenko, Kate},
  title         = {{SAT}: Dynamic Spatial Aptitude Training for Multimodal Language Models},
  year          = {2025},
  eprint        = {2412.07755},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2412.07755},
}