@article{sa2va,
  title   = {{Sa2VA}: Marrying {SAM2} with {LLaVA} for Dense Grounded Understanding of Images and Videos},
  author  = {Yuan, Haobo and Li, Xiangtai and Zhang, Tao and Huang, Zilong and Xu, Shilin and Ji, Shunping and Tong, Yunhai and Qi, Lu and Feng, Jiashi and Yang, Ming-Hsuan},
  journal = {arXiv preprint},
  year    = {2025},
}
arXiv
DenseWorld-1M: Towards Detailed Dense Grounded Caption in the Real World
Xiangtai Li, Tao Zhang, Yanwei Li, Haobo Yuan, Shihao Chen, Yikang Zhou, Jiahao Meng, Yueyi Sun, Shilin Xu, Lu Qi, Tianheng Cheng, Yi Lin, Zilong Huang, Wenhao Huang, Jiashi Feng, and Guang Shi
@article{li2025denseworld,
  title         = {{DenseWorld-1M}: Towards Detailed Dense Grounded Caption in the Real World},
  author        = {Li, Xiangtai and Zhang, Tao and Li, Yanwei and Yuan, Haobo and Chen, Shihao and Zhou, Yikang and Meng, Jiahao and Sun, Yueyi and Xu, Shilin and Qi, Lu and Cheng, Tianheng and Lin, Yi and Huang, Zilong and Huang, Wenhao and Feng, Jiashi and Shi, Guang},
  journal       = {arXiv preprint},
  eprint        = {2506.24102},
  archiveprefix = {arXiv},
  year          = {2025},
}
arXiv
An Empirical Study of GPT-4o Image Generation Capabilities
Sixiang Chen, Jinbin Bai, Zhuoran Zhao, Tian Ye, Qingyu Shi, Donghao Zhou, Wenhao Chai, Xin Lin, Jianzong Wu, Chao Tang, Shilin Xu, Tao Zhang, Haobo Yuan, Yikang Zhou, Wei Chow, Linfeng Li, Xiangtai Li, Lei Zhu, and Lu Qi
@article{chen2025empirical,
  title         = {An Empirical Study of {GPT-4o} Image Generation Capabilities},
  author        = {Chen, Sixiang and Bai, Jinbin and Zhao, Zhuoran and Ye, Tian and Shi, Qingyu and Zhou, Donghao and Chai, Wenhao and Lin, Xin and Wu, Jianzong and Tang, Chao and Xu, Shilin and Zhang, Tao and Yuan, Haobo and Zhou, Yikang and Chow, Wei and Li, Linfeng and Li, Xiangtai and Zhu, Lei and Qi, Lu},
  journal       = {arXiv preprint},
  eprint        = {2504.05979},
  archiveprefix = {arXiv},
  year          = {2025},
}
@inproceedings{fei2025path,
  title     = {On Path to Multimodal Generalist: {General-Level} and {General-Bench}},
  author    = {Fei, Hao and Zhou, Yuan and Li, Juncheng and Li, Xiangtai and Xu, Qingshan and Li, Bobo and Wu, Shengqiong and Wang, Yaoting and Zhou, Junbao and Meng, Jiahao and Shi, Qingyu and Zhou, Zhiyuan and Shi, Liangtao and Gao, Minghe and Zhang, Daoan and Ge, Zhiqi and Tang, Siliang and Pan, Kaihang and Ye, Yaobo and Yuan, Haobo and Zhang, Tao and Wu, Weiming and Ju, Tianjie and Meng, Zixiang and Xu, Shilin and Jia, Liyu and Hu, Wentao and Luo, Meng and Luo, Jiebo and Chua, Tat-Seng and Yan, Shuicheng and Zhang, Hanwang},
  booktitle = {ICML},
  address   = {Vancouver, Canada},
  year      = {2025},
}
@inproceedings{zhang2025point,
  title     = {Point Cloud {Mamba}: Point Cloud Learning via State Space Model},
  author    = {Zhang, Tao and Li, Xiangtai and Yuan, Haobo and Ji, Shunping and Yan, Shuicheng},
  booktitle = {AAAI},
  address   = {Philadelphia, PA, USA},
  year      = {2025},
}
@inproceedings{xu2025rapsam,
  title     = {{RAP-SAM}: Towards Real-Time All-Purpose Segment Anything},
  author    = {Xu, Shilin and Yuan, Haobo and Shi, Qingyu and Qi, Lu and Wang, Jingbo and Yang, Yibo and Li, Yining and Chen, Kai and Tong, Yunhai and Ghanem, Bernard and Li, Xiangtai and Yang, Ming-Hsuan},
  booktitle = {ICLR},
  address   = {Singapore},
  year      = {2025},
}
@inproceedings{yuan2024ovsam,
  title     = {Open-Vocabulary {SAM}: Segment and Recognize Twenty-thousand Classes Interactively},
  author    = {Yuan, Haobo and Li, Xiangtai and Zhou, Chong and Li, Yining and Chen, Kai and Loy, Chen Change},
  booktitle = {ECCV},
  address   = {Milano, Italy},
  year      = {2024},
}
@inproceedings{li2024omg,
  title     = {{OMG-Seg}: Is One Model Good Enough For All Segmentation?},
  author    = {Li, Xiangtai and Yuan, Haobo and Li, Wei and Ding, Henghui and Wu, Size and Zhang, Wenwei and Li, Yining and Chen, Kai and Loy, Chen Change},
  booktitle = {CVPR},
  address   = {Seattle, WA, USA},
  year      = {2024},
}
@inproceedings{zhang2024omgllava,
  title     = {{OMG-LLaVA}: Bridging Image-level, Object-level, Pixel-level Reasoning and Understanding},
  author    = {Zhang, Tao and Li, Xiangtai and Fei, Hao and Yuan, Haobo and Wu, Shengqiong and Ji, Shunping and Loy, Chen Change and Yan, Shuicheng},
  booktitle = {NeurIPS},
  address   = {Vancouver, Canada},
  year      = {2024},
}
@article{li2024transformer,
  title   = {Transformer-based Visual Segmentation: A Survey},
  author  = {Li, Xiangtai and Ding, Henghui and Yuan, Haobo and Zhang, Wenwei and Pang, Jiangmiao and Cheng, Guangliang and Chen, Kai and Liu, Ziwei and Loy, Chen Change},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2024},
}
@article{wu2024towards,
  title   = {Towards Open Vocabulary Learning: A Survey},
  author  = {Wu, Jianzong and Li, Xiangtai and Xu, Shilin and Yuan, Haobo and Ding, Henghui and Yang, Yibo and Li, Xia and Zhang, Jiangning and Tong, Yunhai and Jiang, Xudong and Ghanem, Bernard and Tao, Dacheng},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2024},
}
@article{li2023panopticpartformer++,
  title   = {{PanopticPartFormer++}: A Unified and Decoupled View for Panoptic Part Segmentation},
  author  = {Li, Xiangtai and Xu, Shilin and Yang, Yibo and Yuan, Haobo and Cheng, Guangliang and Tong, Yunhai and Lin, Zhouchen and Yang, Ming-Hsuan and Tao, Dacheng},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2024},
}
arXiv
Mamba or RWKV: Exploring High-Quality and High-Efficiency Segment Anything Model
@article{yuan2024mamba,
  title   = {{Mamba} or {RWKV}: Exploring High-Quality and High-Efficiency Segment Anything Model},
  author  = {Yuan, Haobo and Li, Xiangtai and Qi, Lu and Zhang, Tao and Yang, Ming-Hsuan and Yan, Shuicheng and Loy, Chen Change},
  journal = {arXiv preprint},
  year    = {2024},
}
arXiv
LLAVADI: What Matters For Multimodal Large Language Models Distillation
@article{xu2024LLAVADI,
  title   = {{LLAVADI}: What Matters For Multimodal Large Language Models Distillation},
  author  = {Xu, Shilin and Li, Xiangtai and Yuan, Haobo and Qi, Lu and Tong, Yunhai and Yang, Ming-Hsuan},
  journal = {arXiv preprint},
  year    = {2024},
}
@inproceedings{li2023tube,
  title     = {{Tube-Link}: A Flexible Cross Tube Baseline for Universal Video Segmentation},
  author    = {Li, Xiangtai and Yuan, Haobo and Zhang, Wenwei and Cheng, Guangliang and Pang, Jiangmiao and Loy, Chen Change},
  booktitle = {ICCV},
  address   = {Paris, France},
  year      = {2023},
}
@article{yuan2023monocular,
  title   = {Monocular Road Planar Parallax Estimation},
  author  = {Yuan, Haobo and Chen, Teng and Sui, Wei and Xie, Jiafeng and Zhang, Lefei and Li, Yuan and Zhang, Qian},
  journal = {IEEE Transactions on Image Processing},
  volume  = {32},
  pages   = {3690--3701},
  year    = {2023},
}
@inproceedings{yang2023neural,
  title     = {Neural Collapse Inspired Feature-Classifier Alignment for Few-Shot Class-Incremental Learning},
  author    = {Yang, Yibo and Yuan, Haobo and Li, Xiangtai and Lin, Zhouchen and Torr, Philip and Tao, Dacheng},
  booktitle = {ICLR},
  address   = {Kigali, Rwanda},
  year      = {2023},
}
@article{xu2023multi,
  title   = {Multi-Task Learning with Multi-query Transformer for Dense Prediction},
  author  = {Xu, Yangyang and Li, Xiangtai and Yuan, Haobo and Yang, Yibo and Zhang, Lefei},
  journal = {IEEE Transactions on Circuits and Systems for Video Technology},
  year    = {2023},
}
arXiv
Neural Collapse Terminus: A Unified Solution for Class Incremental Learning and Its Variants
@article{yang2023nct,
  title   = {Neural Collapse Terminus: A Unified Solution for Class Incremental Learning and Its Variants},
  author  = {Yang, Yibo and Yuan, Haobo and Li, Xiangtai and Wu, Jianlong and Zhang, Lefei and Lin, Zhouchen and Torr, Philip and Tao, Dacheng and Ghanem, Bernard},
  journal = {arXiv preprint},
  year    = {2023},
}
@inproceedings{yuan2022polyphonicformer,
  title     = {{PolyphonicFormer}: Unified Query Learning for Depth-aware Video Panoptic Segmentation},
  author    = {Yuan, Haobo and Li, Xiangtai and Yang, Yibo and Cheng, Guangliang and Zhang, Jing and Tong, Yunhai and Zhang, Lefei and Tao, Dacheng},
  booktitle = {ECCV},
  address   = {Tel Aviv, Israel},
  year      = {2022},
}
@inproceedings{yang2022towards,
  title     = {Towards Theoretically Inspired Neural Initialization Optimization},
  author    = {Yang, Yibo and Wang, Hong and Yuan, Haobo and Lin, Zhouchen},
  booktitle = {NeurIPS},
  address   = {New Orleans, LA, USA},
  year      = {2022},
}
@article{chen2020bossa,
  title     = {{BOSSA}: A Decentralized System for Proofs of Data Retrievability and Replication},
  author    = {Chen, Dian and Yuan, Haobo and Hu, Shengshan and Wang, Qian and Wang, Cong},
  journal   = {IEEE Transactions on Parallel and Distributed Systems},
  volume    = {32},
  number    = {4},
  pages     = {786--798},
  year      = {2021},
  publisher = {IEEE},
}