@inproceedings{922dd92387c74d3d9126920a21effeef,
title = "KDNet: Leveraging Vision-Language Knowledge Distillation for Few-Shot Object Detection",
abstract = "Few-shot object detection (FSOD) aims to detect new categories given only a few training instances. Recently emerged vision-language models (VLMs) have shown strong performance in zero-shot and open-vocabulary object detection owing to their ability to align object-level embeddings with the textual embeddings of categories. However, few existing methods distill VLMs{\textquoteright} object-level knowledge for FSOD, which could help the detector learn novel semantic concepts and gain further improvement. Inspired by recent knowledge distillation approaches with VLMs, we propose an end-to-end few-shot object detector with knowledge distillation from pre-trained VLMs, termed KDNet. A knowledge distillation branch is introduced alongside the object detector to distill knowledge from the VLMs{\textquoteright} visual encoder into the detector. We also propose a pre-training mechanism with a large-scale dataset to inject more semantic concepts into the detector and improve performance on small datasets. KDNet achieves state-of-the-art performance on both the PASCAL VOC and MS COCO benchmarks across most shot settings and evaluation metrics.",
author = "Mengyuan Ma and Lin Qian and Hujun Yin",
year = "2024",
month = sep,
day = "17",
doi = "10.1007/978-3-031-72335-3_11",
language = "English",
isbn = "978-3-031-72334-6",
volume = "2",
series = "Lecture Notes in Computer Science",
publisher = "Springer Cham",
pages = "153–167",
editor = "Wand, Michael and Malinovsk{\'a}, Krist{\'i}na and Schmidhuber, J{\"u}rgen and Tetko, {Igor V.}",
booktitle = "Artificial Neural Networks and Machine Learning – ICANN 2024",
address = "Switzerland",
}