% Zukerman and Marom, COLING 2004 conference paper.
% The note field previously also carried what appears to be extracted
% body text and the reference list of the paper itself (including an
% unescaped ampersand and raw Unicode dashes). Only the funding,
% copyright and conference metadata are kept, since styles print the
% note field verbatim.
@inproceedings{2c9df52329ef4b20a43d7921b56b100a,
  author    = {Zukerman, Ingrid and Marom, Yuval},
  title     = {Filtering speaker-specific words from electronic discussions},
  booktitle = {Proceedings of the 20th International Conference on Computational Linguistics},
  editor    = {Nirenburg, S.},
  volume    = {I},
  pages     = {473--479},
  publisher = {Association for Computational Linguistics (ACL)},
  year      = {2004},
  isbn      = {1932432485},
  language  = {English},
  url       = {https://dblp.org/db/conf/coling/coling2004.html},
  abstract  = {The work presented in this paper is the first step in a project which aims to cluster and summarise electronic discussions in the context of help-desk applications. The eventual objective of this project is to use these summaries to assist help-desk users and operators. In this paper, we identify features of electronic discussions that influence the clustering process, and offer a filtering mechanism that removes undesirable influences. We tested the clustering and filtering processes on electronic newsgroup discussions, and evaluated their performance by means of two experiments: coarse-level clustering and simple information retrieval. Our evaluation shows that our filtering mechanism has a significant positive effect on both tasks.},
  note      = {Funding Information: This research was supported in part by grant LP0347470 from the Australian Research Council and by an endowment from Hewlett Packard. Publisher Copyright: {\textcopyright} 2004 COLING 2004 - Proceedings of the 20th International Conference on Computational Linguistics. All rights reserved.; International Conference on Computational Linguistics 2004, COLING 2004 ; Conference date: 23-08-2004 Through 27-08-2004},
}