% fan2024hold -- conference paper (CVPR, 2024) introducing HOLD, a method for
% reconstructing interacting hands and objects from monocular video.
%
% Field notes:
%   * publisher/address: the address is the publisher's city, so it is paired
%     with an explicit publisher to keep styles from printing a bare location.
%   * aword_paper, month_numeric, abstract: non-standard fields, ignored by the
%     standard bibliography styles.
%   * NOTE(review): `aword_paper` looks like a misspelling of `award_paper`. It is
%     kept verbatim because tooling outside this entry may read that exact field
%     name -- confirm before renaming.
%   * NOTE(review): month / month_numeric say September; presumably this is the
%     proceedings publication date rather than the conference date -- confirm.
@inproceedings{fan2024hold,
  author        = {Fan, Zicong and Parelli, Maria and Kadoglou, Maria Eleni and Kocabas, Muhammed and Chen, Xu and Black, Michael J. and Hilliges, Otmar},
  title         = {{HOLD}: Category-agnostic {3D} Reconstruction of Interacting Hands and Objects from Video},
  booktitle     = {IEEE/CVF Conf.~on Computer Vision and Pattern Recognition (CVPR)},
  pages         = {494--504},
  publisher     = {IEEE},
  address       = {Piscataway, NJ},
  month         = sep,
  year          = {2024},
  doi           = {10.1109/CVPR52733.2024.00054},
  url           = {https://github.com/zc-alexfan/hold},
  aword_paper   = {Highlight},
  month_numeric = {9},
  abstract      = {Since humans interact with diverse objects every day, the holistic 3D capture of these interactions is important to understand and model human behaviour. However, most existing methods for hand-object reconstruction from RGB either assume pre-scanned object templates or heavily rely on limited 3D hand-object data, restricting their ability to scale and generalize to more unconstrained interaction settings. To this end, we introduce HOLD -- the first category-agnostic method that reconstructs an articulated hand and object jointly from a monocular interaction video. We develop a compositional articulated implicit model that can reconstruct disentangled 3D hand and object from 2D images. We also further incorporate hand-object constraints to improve hand-object poses and consequently the reconstruction quality. Our method does not rely on 3D hand-object annotations while outperforming fully-supervised baselines in both in-the-lab and challenging in-the-wild settings. Moreover, we qualitatively show its robustness in reconstructing from in-the-wild videos.},
}
