Controlling 3D human motion through natural language is key to unlocking interactive experiences in animation, gaming, virtual reality, and robotics. Yet current generative models still struggle with the compositional nature of real human behavior — actions unfold in sequence, overlap in time, and need fine-grained editing, all of which demand more than single-prompt, single-motion generation. This thesis argues that truly controllable motion generation requires compositional thinking: the ability to chain actions over time, layer them across body parts, and refine them through iterative editing. I present a suite of methods and datasets that tackle each of these axes. I first introduce TEACH, a hierarchical Transformer-based model that generates temporally coherent motion sequences from a series of textual descriptions, handling transitions between consecutive actions. I then address spatial composition with SINC, which synthesizes simultaneous actions — such as waving while walking — by leveraging structural knowledge about body-part involvement extracted from large language models. Moving from generation to editing, I present MotionFix, a dataset of source–target–edit-text triplets, together with TMED, a conditional diffusion model that modifies existing motions according to fine-grained textual instructions. Underpinning these contributions is BABEL, a large-scale dataset of semantically rich, frame-level annotations for motion-capture data that serves as a shared foundation for training and benchmarking across all three tasks. Together, these contributions advance language-driven motion generation from isolated actions toward the compositional, editable control that real-world applications demand.
@phdthesis{nikos-thesis,
  title = {Towards Fine-grained 3D Human Motion Generation from Textual Instructions},
  abstract = {Controlling 3D human motion through natural language is key to unlocking interactive experiences in animation, gaming, virtual reality, and robotics. Yet current generative models still struggle with the compositional nature of real human behavior — actions unfold in sequence, overlap in time, and need fine-grained editing, all of which demand more than single-prompt, single-motion generation. This thesis argues that truly controllable motion generation requires compositional thinking: the ability to chain actions over time, layer them across body parts, and refine them through iterative editing. I present a suite of methods and datasets that tackle each of these axes. I first introduce TEACH, a hierarchical Transformer-based model that generates temporally coherent motion sequences from a series of textual descriptions, handling transitions between consecutive actions. I then address spatial composition with SINC, which synthesizes simultaneous actions — such as waving while walking — by leveraging structural knowledge about body-part involvement extracted from large language models. Moving from generation to editing, I present MotionFix, a dataset of source–target–edit-text triplets, together with TMED, a conditional diffusion model that modifies existing motions according to fine-grained textual instructions. Underpinning these contributions is BABEL, a large-scale dataset of semantically rich, frame-level annotations for motion-capture data that serves as a shared foundation for training and benchmarking across all three tasks. Together, these contributions advance language-driven motion generation from isolated actions toward the compositional, editable control that real-world applications demand.},
  school = {University of Tübingen},
  month = apr,
  year = {2026},
  author = {Athanasiou, Nikos},
  month_numeric = {4}
}