% Conference paper (MixDir) in the DSAA 2018 proceedings.
% The citation key is kept unchanged so existing \cite commands still resolve.
% Title words that must keep their case are brace-protected; names use the
% unambiguous "Last, First" form. A former series field that repeated the
% booktitle verbatim was dropped so the proceedings title is not printed twice.
% NOTE(review): address holds a country rather than a publisher city; confirm.
@inproceedings{4f28dd107c8f45689bf9226add498354,
  author    = {Ahlmann-Eltze, Constantin and Yau, Christopher},
  title     = {{MixDir}: Scalable {Bayesian} Clustering for High-Dimensional Categorical Data},
  booktitle = {Proceedings - 2018 {IEEE} 5th International Conference on Data Science and Advanced Analytics, {DSAA} 2018},
  editor    = {Eliassi-Rad, Tina and Wang, Wei and Cattuto, Ciro and Provost, Foster and Ghani, Rayid and Bonchi, Francesco},
  publisher = {{IEEE}},
  address   = {United States},
  year      = {2019},
  pages     = {526--539},
  doi       = {10.1109/DSAA.2018.00068},
  language  = {English},
  keywords  = {Bayesian, Categorical variables, Clustering, High-dimensional, Variational inference},
  abstract  = {Multivariate analysis of high-dimensional datasets with multiple categorical variables (e.g. surveys, questionnaires) is a challenging task but can reveal patterns of responses that are masked from univariate analyses. In this paper we propose a novel variational inference algorithm to cluster high-dimensional categorical observations into latent classes. Variational inference is an approximate Bayesian inference algorithm, which combines fast optimization methods with the ability to propagate the uncertainty to the clustering (soft clustering). The model is robust to misspecification of the number of latent classes and can infer a reasonable number from the data. We assess the performance on synthetic and real world data and show that our algorithm has similar performance to the best other tested method if the correct number of classes is known and outperforms the other methods if it the number of classes needs to be inferred. An R-package implementing our algorithm is available at the Comprehensive R Archive Network.},
  note      = {5th {IEEE} International Conference on Data Science and Advanced Analytics, {DSAA} 2018 ; Conference date: 01-10-2018 Through 04-10-2018},
}