Current robot learning algorithms for acquiring novel skills often rely on demonstration datasets or environment interactions, which incur high labor costs and potential safety risks. To address these challenges, this study proposes a skill-learning framework that enables robots to acquire novel skills from natural language instructions. The proposed pipeline leverages vision-language models to generate demonstration videos of novel skills, which an inverse dynamics model processes to extract actions from these otherwise unlabeled demonstrations. The extracted actions are then mapped to environmental contexts via imitation learning, enabling robots to learn new skills effectively.
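The sketch below illustrates the three-stage pipeline described above (language-conditioned video generation, inverse-dynamics action labeling, and imitation learning) in simplified form. It is a minimal, hypothetical example only: the class names, interfaces, and the random stand-in for the video-generation stage are assumptions for illustration and do not reflect the authors' released code or model architectures.

```python
# Hypothetical sketch of the pipeline: language -> generated demo -> pseudo-actions -> policy.
# All interfaces here are illustrative assumptions, not the paper's implementation.
import torch
import torch.nn as nn


class InverseDynamicsModel(nn.Module):
    """Predicts the action linking two consecutive frame features (assumed interface)."""
    def __init__(self, obs_dim: int, action_dim: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2 * obs_dim, 256), nn.ReLU(), nn.Linear(256, action_dim)
        )

    def forward(self, obs_t: torch.Tensor, obs_next: torch.Tensor) -> torch.Tensor:
        return self.net(torch.cat([obs_t, obs_next], dim=-1))


class Policy(nn.Module):
    """Maps an observation to an action; trained by behavior cloning on pseudo-labels."""
    def __init__(self, obs_dim: int, action_dim: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, 256), nn.ReLU(), nn.Linear(256, action_dim)
        )

    def forward(self, obs: torch.Tensor) -> torch.Tensor:
        return self.net(obs)


def generate_demo_frames(instruction: str, num_frames: int, obs_dim: int) -> torch.Tensor:
    """Placeholder for the language-conditioned video-generation stage.

    In practice this would call a vision-language / text-to-video model; here it
    returns random frame features so the sketch runs end to end.
    """
    torch.manual_seed(abs(hash(instruction)) % (2 ** 31))
    return torch.randn(num_frames, obs_dim)


def learn_skill(instruction: str, obs_dim: int = 64, action_dim: int = 7) -> Policy:
    # 1) Language instruction -> generated demonstration (frame features).
    frames = generate_demo_frames(instruction, num_frames=100, obs_dim=obs_dim)

    # 2) Inverse dynamics model labels consecutive frame pairs with pseudo-actions.
    idm = InverseDynamicsModel(obs_dim, action_dim)  # assumed pretrained in practice
    with torch.no_grad():
        pseudo_actions = idm(frames[:-1], frames[1:])

    # 3) Imitation learning (behavior cloning) on (observation, pseudo-action) pairs.
    policy = Policy(obs_dim, action_dim)
    optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3)
    for _ in range(200):
        loss = nn.functional.mse_loss(policy(frames[:-1]), pseudo_actions)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return policy


if __name__ == "__main__":
    skill_policy = learn_skill("open the drawer")
```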
@misc{jin2024learningnovelskillslanguagegenerated,
      title={Learning Novel Skills from Language-Generated Demonstrations},
      author={Ao-Qun Jin and Tian-Yu Xiang and Xiao-Hu Zhou and Mei-Jiang Gui and Xiao-Liang Xie and Shi-Qi Liu and Shuang-Yi Wang and Yue Cao and Sheng-Bin Duan and Fu-Chao Xie and Zeng-Guang Hou},
      year={2024},
      eprint={2412.09286},
      archivePrefix={arXiv},
      primaryClass={cs.RO},
      url={https://arxiv.org/abs/2412.09286},
}