@InCollection{RossZemel03,
  author    = {David A. Ross and Richard S. Zemel},
  title     = {Multiple Cause Vector Quantization},
  booktitle = {Advances in Neural Information Processing Systems 15},
  editor    = {S. Becker and S. Thrun and K. Obermayer},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  pages     = {1017--1024},
  year      = {2003},
}

@mastersthesis{Ross03,
  author = {Ross, David A.},
  title  = {Learning Parts-Based Representations of Data},
  school = {University of Toronto},
  year   = 2003
}

@incollection{RossLimYang04,
  author    = {Ross, David A. and Lim, Jongwoo and Yang, Ming-Hsuan},
  editor    = {Pajdla, T. and Matas, J.},
  title     = {Adaptive Probabilistic Visual Tracking with Incremental Subspace Update},
  booktitle = {Proc. Eighth European Conference on Computer Vision (ECCV 2004)},
  publisher = {Springer},
  volume    = 2,
  pages     = {470--482},
  year      = 2004
}

@InCollection{LimRossLinYang05,
  author    = {Jongwoo Lim and David A. Ross and Ruei-Sung Lin and Ming-Hsuan Yang},
  title     = {Incremental Learning for Visual Tracking},
  booktitle = {Advances in Neural Information Processing Systems 17},
  editor    = {Lawrence K. Saul and Yair Weiss and L{\'e}on Bottou},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  year      = {2005},
}

@InCollection{LinRossLimYang05,
  author    = {Ruei-Sung Lin and David A. Ross and Jongwoo Lim and Ming-Hsuan Yang},
  title     = {Adaptive Discriminative Generative Model and Its Applications},
  booktitle = {Advances in Neural Information Processing Systems 17},
  editor    = {Lawrence K. Saul and Yair Weiss and L{\'e}on Bottou},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  year      = {2005},
}

@inproceedings{RossOsinderoZemel06,
  author    = {Ross, David A. and Osindero, Simon and Zemel, Richard S.},
  title     = {Combining Discriminative Features to Infer Complex Trajectories},
  booktitle = {Proceedings of the Twenty-Third International Conference on Machine Learning},
  year      = 2006
}

@Article{RossZemel06,
  author  = {David A. Ross and Richard S. Zemel},
  title   = {Learning Parts-Based Representations of Data},
  journal = {Journal of Machine Learning Research},
  year    = {2006},
  volume  = {7},
  pages   = {2369--2397},
  month   = nov,
}

@Article{Cohen07,
  author  = {Andrew L. Cohen and Richard M. Shiffrin and Jason M. Gold
             and David A. Ross and Michael G. Ross},
  title   = {Inducing features from visual noise},
  journal = {Journal of Vision},
  year    = {2007},
  volume  = {7},
  number  = {8},
  pages   = {1--14},
  month   = jun,
}

@inproceedings{RossTarlowZemel07,
  author    = {Ross, David A. and Tarlow, Daniel and Zemel, Richard S.},
  title     = {Learning Articulated Skeletons From Motion},
  booktitle = {Workshop on Dynamical Vision at ICCV},
  year      = 2007,
}

@Article{RossLimLinYang08,
  author  = {David A. Ross and Jongwoo Lim and Ruei-Sung Lin and Ming-Hsuan Yang},
  title   = {Incremental Learning for Robust Visual Tracking},
  journal = {International Journal of Computer Vision},
  year    = {2008},
  volume  = {77},
  number  = {1--3},
  month   = may,
  note    = {Special Issue on Machine Learning for Vision}
}

@InProceedings{MeedsRossZemelRoweis08,
  author    = {Edward Meeds and David A. Ross and Richard S. Zemel and Sam Roweis},
  title     = {Learning stick-figure models using nonparametric {Bayesian} priors over trees},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2008}
}

@phdthesis{Ross08,
  author  = {Ross, David A.},
  title   = {Learning Probabilistic Models for Visual Motion},
  school  = {University of Toronto},
  address = {Ontario, Canada},
  year    = 2008
}

@inproceedings{RossTarlowZemel08,
  author    = {Ross, David A. and Tarlow, Daniel and Zemel, Richard S.},
  editor    = {Forsyth, D. and Torr, P. and Zisserman, A.},
  title     = {Unsupervised Learning of Skeletons From Motion},
  booktitle = {Proceedings of the 10th European Conference on Computer Vision (ECCV 2008)},
  publisher = {Springer},
  year      = 2008
}

@Article{RossTarlowZemel10,
  author  = {David A. Ross and Daniel Tarlow and Richard S. Zemel},
  title   = {Learning Articulated Structure and Motion},
  journal = {International Journal of Computer Vision},
  year    = {2010},
  volume  = {88},
  number  = {2},
  month   = mar,
  note    = {Special Issue on Probabilistic Models for Image Understanding}
}

@InProceedings{LinRossYagnik10,
  title     = {{SPEC} hashing: Similarity preserving algorithm for
               entropy-based coding},
  author    = {Ruei-Sung Lin and David A. Ross and Jay Yagnik},
  year      = {2010},
  publisher = {IEEE},
  booktitle = {Proceedings of the IEEE Conference on Computer
               Vision and Pattern Recognition},
}

@inproceedings{ChandrasekharSarginRoss11,
  author    = {Chandrasekhar, Vijay and Sargin, Mehmet Emre and Ross, David A.},
  title     = {Automatic Language Identification in Music Videos with Low Level Audio and Visual Features},
  booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing},
  year      = 2011
}

@inproceedings{YagnikStrelowRossLin11,
  author    = {Yagnik, Jay and Strelow, Dennis and Ross, David A. and Lin, Ruei-Sung},
  title     = {The Power of Comparative Reasoning},
  booktitle = {IEEE International Conference on Computer Vision},
  year      = 2011
}

@inproceedings{ChandrasekharSharifiRoss11,
  author    = {Chandrasekhar, Vijay and Sharifi, Matt and Ross, David A.},
  title     = {Survey and Evaluation of Audio Fingerprinting Schemes for Mobile Query-by-Example Applications},
  booktitle = {Proc. of the 12th International Society for Music Information Retrieval Conference (ISMIR 2011)},
  year      = 2011
}

@inproceedings{MadaniGeorgRoss12,
  title     = {On Using Nearly-Independent Feature Families for High
               Precision and Confidence},
  author    = {Omid Madani and Manfred Georg and David Ross},
  year      = 2012,
  URL       = {http://jmlr.csail.mit.edu/proceedings/papers/v25/},
  booktitle = {Fourth Asian Machine Learning Conference},
  pages     = {269--284}
}

@inproceedings{WaltersRossLyon13,
  title     = {The intervalgram: an audio feature for large-scale melody
               recognition},
  author    = {Thomas C. Walters and David Ross and Richard F. Lyon},
  booktitle = {Proceedings of the 9th International Symposium on Computer
               Music Modeling and Retrieval (CMMR)},
  year      = {2012}
}

@article{MadaniGeorgRoss13,
  author    = {Madani, Omid and Georg, Manfred and Ross, David},
  title     = {On using nearly-independent feature families for high precision and confidence},
  journal   = {Machine Learning},
  year      = {2013},
  volume    = {92},
  number    = {2--3},
  pages     = {457--477},
  publisher = {Springer US},
  issn      = {0885-6125},
  doi       = {10.1007/s10994-013-5377-0},
  url       = {http://dx.doi.org/10.1007/s10994-013-5377-0},
  keywords  = {Classifier combination; Independent features; High
               precision; Late fusion; Early fusion; Ensembles;
               Multiple views; Supervised learning},
  language  = {English}
}

@InProceedings{Gu_2018_CVPR,
  author = {Gu, Chunhui and Sun, Chen and Ross, David A. and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and Schmid, Cordelia and Malik, Jitendra},
  title = {{AVA}: A Video Dataset of Spatio-Temporally Localized Atomic Visual Actions},
  booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  month = jun,
  year = {2018}
}

@InProceedings{Chao_2018_CVPR,
  author = {Chao, Yu-Wei and Vijayanarasimhan, Sudheendra and Seybold, Bryan and Ross, David A. and Deng, Jia and Sukthankar, Rahul},
  title = {Rethinking the {Faster R-CNN} Architecture for Temporal Action Localization},
  booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  month = jun,
  year = {2018}
}

@InProceedings{Stroud2020d3d,
  title={{D3D}: Distilled {3D} networks for video action recognition},
  author={Stroud, Jonathan and Ross, David and Sun, Chen and Deng, Jia and Sukthankar, Rahul},
  booktitle={The IEEE Winter Conference on Applications of Computer Vision},
  pages={625--634},
  year={2020}
}

@InProceedings{Nagrani20c,
  author       = {Arsha Nagrani and Chen Sun and David Ross and Rahul Sukthankar and Cordelia Schmid and Andrew Zisserman},
  title        = {{Speech2Action}: Cross-modal Supervision for Action Recognition},
  booktitle    = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year         = {2020}
}

@InProceedings{Najibi20,
  author       = {Mahyar Najibi and Guangda Lai and Abhijit Kundu and
                  Zhichao Lu and Vivek Rathod and Thomas Funkhouser and Caroline
                  Pantofaru and David Ross and Larry S. Davis and Alireza Fathi},
  title        = {{DOPS}: Learning to Detect {3D} Objects and Predict their {3D} Shapes},
  booktitle    = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year         = {2020}
}

@InProceedings{wang2020pillar,
  title={Pillar-based Object Detection for Autonomous Driving},
  author={Wang, Yue and Fathi, Alireza and Kundu, Abhijit and Ross, David and Pantofaru, Caroline and Funkhouser, Tom and Solomon, Justin},
  booktitle={Proceedings of the 16th European Conference on Computer Vision (ECCV 2020)},
  year={2020}
}

@InProceedings{huang2020,
  title={An {LSTM} Approach to Temporal {3D} Object Detection in {LiDAR} Point Clouds},
  author={Rui Huang and Wanyue Zhang and Abhijit Kundu and Caroline
                  Pantofaru and David A. Ross and Thomas Funkhouser and Alireza Fathi},
  booktitle={Proceedings of the 16th European Conference on Computer Vision (ECCV 2020)},
  year={2020}
}

@InProceedings{kundu2020,
  title={Virtual Multi-view Fusion for {3D} Semantic Segmentation},
  author={Abhijit Kundu and Xiaoqi Yin and Alireza Fathi and David Ross and Brian Brewington and Thomas Funkhouser and Caroline Pantofaru},
  booktitle={Proceedings of the 16th European Conference on Computer Vision (ECCV 2020)},
  year={2020}
}

@article{li2020ava,
  title={The {AVA-Kinetics} Localized Human Actions Video Dataset},
  author={Li, Ang and Thotakuri, Meghana and Ross, David A and Carreira, Jo{\~a}o and Vostrikov, Alexander and Zisserman, Andrew},
  journal={arXiv preprint arXiv:2005.00214},
  year={2020}
}

@InProceedings{chan2020,
    author    = {Chan, David M. and Vijayanarasimhan, Sudheendra and Ross, David A. and Canny, John F.},
    title     = {Active Learning for Video Description With Cluster-Regularized Ensemble Ranking},
    booktitle = {Proceedings of the Asian Conference on Computer Vision (ACCV)},
    month     = nov,
    year      = {2020}
}

@Article{stroud2020wvt,
  author  = {Jonathan C. Stroud and David A. Ross and Chen Sun
             and Jia Deng and Rahul Sukthankar and Cordelia Schmid},
  title   = {Learning Video Representations from Textual Web Supervision},
  journal = {arXiv preprint arXiv:2007.14937},
  year    = {2020},
}

@InProceedings{Li_2021_ICCV,
    author    = {Li, Ruilong and Yang, Shan and Ross, David A. and Kanazawa, Angjoo},
    title     = {{AI} Choreographer: Music Conditioned {3D} Dance Generation With {AIST++}},
    booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
    month     = oct,
    year      = {2021},
    pages     = {13401--13412}
}

@inproceedings{hu2021optical,
  title={Optical Mouse: {3D} Mouse Pose From Single-View Video},
  author={Hu, Bo and Seybold, Bryan and Yang, Shan and Ross, David and Sud, Avneesh and Ruby, Graham and Liu, Yi},
  booktitle={CV4Animals: Computer Vision for Animal Behavior Tracking and Modeling Workshop, In conjunction with Computer Vision and Pattern Recognition},
  year={2021}
}

@InProceedings{Chan_2022_CVPR,
    author    = {Chan, David M. and Myers, Austin and Vijayanarasimhan, Sudheendra and Ross, David A. and Seybold, Bryan and Canny, John F.},
    title     = {What's in a Caption? {Dataset}-Specific Linguistic Diversity and Its Effect on Visual Description Models and Metrics},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
    month     = jun,
    year      = {2022},
    pages     = {4740--4749}
}

@article{mi2022im2nerf,
  title={{im2nerf}: Image to Neural Radiance Field in the Wild},
  author={Mi, Lu and Kundu, Abhijit and Ross, David and Dellaert, Frank and Snavely, Noah and Fathi, Alireza},
  journal={arXiv preprint arXiv:2209.04061},
  year={2022}
}

@article{rathod2022open,
  author  = {Rathod, Vivek and Seybold, Bryan and Vijayanarasimhan, Sudheendra and Myers, Austin and Gu, Xiuye and Birodkar, Vighnesh and Ross, David A},
  title   = {Open-vocabulary temporal action detection with off-the-shelf image-text features},
  journal = {arXiv preprint arXiv:2212.10596},
  year    = 2022,
}

% NOTE(review): the venue below names a 2024 conference while the year field says 2022 -- confirm the publication year.
@inproceedings{chan2022distribution,
  title={Distribution Aware Metrics for Conditional Natural Language Generation},
  author={Chan, David M and Ni, Yiming and Myers, Austin and Vijayanarasimhan, Sudheendra and Ross, David A and Canny, John},
  booktitle={LREC-COLING 2024 - The 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation},
  year={2022}
}

@inproceedings{hu2023reveal,
  title={{REVEAL}: Retrieval-augmented visual-language pre-training with multi-source multimodal knowledge memory},
  author={Hu, Ziniu and Iscen, Ahmet and Sun, Chen and Wang, Zirui and Chang, Kai-Wei and Sun, Yizhou and Schmid, Cordelia and Ross, David A and Fathi, Alireza},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={23369--23379},
  year={2023}
}

@inproceedings{chan2023ic3,
    title = {{IC3}: Image Captioning by Committee Consensus},
    author = {Chan, David M and
        Myers, Austin and
        Vijayanarasimhan, Sudheendra and
        Ross, David A and
        Canny, John},
    booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
    month = dec,
    year = {2023},
    address = {Singapore, Singapore},
    publisher = {Association for Computational Linguistics},
}

@article{gu2023dataseg,
  title={{DaTaSeg}: Taming a universal multi-dataset multi-task segmentation model},
  author={Gu, Xiuye and Cui, Yin and Huang, Jonathan and Rashwan, Abdullah and Yang, Xuan and Zhou, Xingyi and Ghiasi, Golnaz and Kuo, Weicheng and Chen, Huizhong and Chen, Liang-Chieh and Ross, David A},
  journal={Advances in Neural Information Processing Systems},
  volume={36},
  pages={67329--67354},
  year={2023}
}

@article{hu2023avis,
  title={{AVIS}: Autonomous visual information seeking with large language model agent},
  author={Hu, Ziniu and Iscen, Ahmet and Sun, Chen and Chang, Kai-Wei and Sun, Yizhou and Ross, David and Schmid, Cordelia and Fathi, Alireza},
  journal={Advances in Neural Information Processing Systems},
  volume={36},
  pages={867--878},
  year={2023}
}

@article{yu2023spae,
  title={{SPAE}: Semantic Pyramid {AutoEncoder} for Multimodal Generation with Frozen {LLMs}},
  author={Yu, Lijun and Cheng, Yong and Wang, Zhiruo and Kumar, Vivek and Macherey, Wolfgang and Huang, Yanping and Ross, David and Essa, Irfan and Bisk, Yonatan and Yang, Ming-Hsuan and Murphy, Kevin and Hauptmann, Alexander G. and Jiang, Lu},
  journal={Advances in Neural Information Processing Systems},
  volume={36},
  pages={52692--52704},
  year={2023}
}

@article{hu20233d,
  title={{3D} mouse pose from single-view video and a new dataset},
  author={Bo Hu and Bryan Seybold and Shan Yang and Avneesh Sud and Yi Liu and Karla Barron and Paulyn Cha and Marcelo Cosino and Ellie Karlsson and Janessa Kite and Ganesh Kolumam and Joseph Preciado and Jos{\'e} Zavala-Solorio and Chunlian Zhang and Xiaomeng Zhang and Martin Voorbach and Ann E. Tovcimak and J. Graham Ruby and David A. Ross},
  journal={Scientific Reports},
  volume={13},
  number={1},
  pages={13554},
  year={2023},
  publisher={Nature Publishing Group UK London}
}

@inproceedings{yan2023unloc,
  title={{UnLoc}: A unified framework for video localization tasks},
  author={Yan, Shen and Xiong, Xuehan and Nagrani, Arsha and Arnab, Anurag and Wang, Zhonghao and Ge, Weina and Ross, David and Schmid, Cordelia},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={13623--13633},
  year={2023}
}

@inproceedings{yu2024language,
  title={Language Model Beats Diffusion--Tokenizer is Key to Visual Generation},
  author={Lijun Yu and Jos{\'e} Lezama and Nitesh B. Gundavarapu and Luca Versari and Kihyuk Sohn and David Minnen and Yong Cheng and Agrim Gupta and Xiuye Gu and Alexander G. Hauptmann and Boqing Gong and Ming-Hsuan Yang and Irfan Essa and David A. Ross and Lu Jiang},
  booktitle={International Conference on Learning Representations},
  year={2024}
}

@inproceedings{kondratyuk2024videopoet,
  title = {{VideoPoet}: A large language model for zero-shot video generation},
  author = {Dan Kondratyuk and Lijun Yu and Xiuye Gu and Jos{\'e} Lezama and Jonathan Huang and Grant Schindler and Rachel Hornung and Vighnesh Birodkar and Jimmy Yan and Ming-Chang Chiu and Krishna Somandepalli and Hassan Akbari and Yair Alon and Yong Cheng and Josh Dillon and Agrim Gupta and Meera Hahn and Anja Hauth and David Hendon and Alonso Martinez and David Minnen and Mikhail Sirotenko and Kihyuk Sohn and Xuan Yang and Hartwig Adam and Ming-Hsuan Yang and Irfan Essa and Huisheng Wang and David A. Ross and Bryan Seybold and Lu Jiang},
  url = {https://arxiv.org/pdf/2312.14125},
  doi = {10.48550/arXiv.2312.14125},
  year = {2024},
  date = {2024-07-23},
  urldate = {2024-07-23},
  booktitle = {Proceedings of International Conference on Machine Learning (ICML)},
}

@inproceedings{zhao2024videoprism,
  title={{VideoPrism}: A foundational visual encoder for video understanding},
  author={Zhao, Long and Gundavarapu, Nitesh B and Yuan, Liangzhe and Zhou, Hao and Yan, Shen and Sun, Jennifer J and Friedman, Luke and Qian, Rui and Weyand, Tobias and Zhao, Yue and others},
  booktitle={Proceedings of International Conference on Machine Learning (ICML)},
  year={2024}
}

@inproceedings{hu2024scenecraft,
  title={{SceneCraft}: An {LLM} agent for synthesizing {3D} scenes as {Blender} code},
  author={Hu, Ziniu and Iscen, Ahmet and Jain, Aashi and Kipf, Thomas and Yue, Yisong and Ross, David A and Schmid, Cordelia and Fathi, Alireza},
  booktitle={Proceedings of International Conference on Machine Learning (ICML)},
  year={2024}
}

@article{sun2024video,
  title={Video Foundation Models for Animal Behavior Analysis},
  author={Sun, Jennifer J and Zhou, Hao and Zhao, Long and Yuan, Liangzhe and Seybold, Bryan and Hendon, David and Schroff, Florian and Ross, David A and Adam, Hartwig and Hu, Bo and others},
  journal={bioRxiv},
  month=jul,
  year={2024},
  publisher={Cold Spring Harbor Laboratory}
}

@inproceedings{zha2024language,
  title={Language-Guided Image Tokenization for Generation},
  author={Zha, Kaiwen and Yu, Lijun and Fathi, Alireza and Ross, David A and Schmid, Cordelia and Katabi, Dina and Gu, Xiuye},
  booktitle={The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year={2025}
}

@inproceedings{yu2025malt,
  title={{MALT Diffusion}: Memory-Augmented Latent Transformers for Any-Length Video Generation},
  author={Yu, Sihyun and Hahn, Meera and Kondratyuk, Dan and Shin, Jinwoo and Gupta, Agrim and Lezama, Jos{\'e} and Essa, Irfan and Ross, David and Huang, Jonathan},
  booktitle={CVPR Workshop on AI for Content Creation},
  year={2025}
}