% Heise, Kasneci, Naumann: CIKM 2014 conference paper (ACM), pp. 959--968.
% `series` is intentionally absent: the exporter filled it with a copy of
% `booktitle`, which made styles print the proceedings title twice.
% Conference name/dates and the copyright line live in eventtitle/eventdate/
% copyright (ignored by classic BibTeX styles, understood by biblatex) rather
% than in `note`, so they are not printed verbatim in the bibliography.
@inproceedings{777379c1b8a64b428c6791dfafcff10e,
  author     = {Heise, Arvid and Kasneci, Gjergji and Naumann, Felix},
  title      = {Estimating the number and sizes of fuzzy-duplicate clusters},
  booktitle  = {{CIKM} 2014 - Proceedings of the 2014 {ACM} International Conference on Information and Knowledge Management},
  publisher  = {Association for Computing Machinery},
  pages      = {959--968},
  year       = {2014},
  month      = nov,
  day        = {3},
  doi        = {10.1145/2661829.2661885},
  language   = {English},
  keywords   = {Cluster, Data integration, Duplicate, Estimation, Pair},
  abstract   = {Duplicates in a dataset are multiple representations of the same real-world entity and constitute a major data quality problem. This paper investigates the problem of estimating the number and sizes of duplicate record clusters in advance and describes a sampling-based method for solving this problem. In extensive experiments, on multiple datasets, we show that the proposed method reliably estimates the number of duplicate clusters, while being highly efficient. Our method can be used a) to measure the dirtiness of a dataset, b) to assess the quality of duplicate detection configurations, such as similarity measures, and c) to gather approximate statistics about the true number of entities represented in the dataset.},
  eventtitle = {23rd {ACM} International Conference on Information and Knowledge Management, {CIKM} 2014},
  eventdate  = {2014-11-03/2014-11-07},
  copyright  = {Copyright 2014 ACM},
}