@inproceedings{joachims_optimizing_2002,
  author    = {Joachims, Thorsten},
  title     = {Optimizing search engines using clickthrough data},
  booktitle = {Proceedings of the eighth {ACM} {SIGKDD} international conference on {Knowledge} discovery and data mining},
  address   = {Edmonton, Alberta, Canada},
  publisher = {ACM},
  month     = jul,
  year      = {2002},
  pages     = {133--142},
  isbn      = {978-1-58113-567-1},
  doi       = {10.1145/775047.775067},
  url       = {http://dl.acm.org/citation.cfm?id=775047.775067},
  urldate   = {2019-01-18},
  abstract  = {This paper presents an approach to automatically optimizing the retrieval quality of search engines using clickthrough data. Intuitively, a good information retrieval system should present relevant documents high in the ranking, with less relevant documents following below. While previous approaches to learning retrieval functions from examples exist, they typically require training data generated from relevance judgments by experts. This makes them difficult and expensive to apply. The goal of this paper is to develop a method that utilizes clickthrough data for training, namely the query-log of the search engine in connection with the log of links the users clicked on in the presented ranking. Such clickthrough data is available in abundance and can be recorded at very low cost. Taking a Support Vector Machine (SVM) approach, this paper presents a method for learning retrieval functions. From a theoretical perspective, this method is shown to be well-founded in a risk minimization framework. Furthermore, it is shown to be feasible even for large sets of queries and features. The theoretical results are verified in a controlled experiment. It shows that the method can effectively adapt the retrieval function of a meta-search engine to a particular group of users, outperforming Google in terms of retrieval quality after only a couple of hundred training examples.},
  language  = {en},
}

@inproceedings{agrawal_diversifying_2009,
  author    = {Agrawal, Rakesh and Gollapudi, Sreenivas and Halverson, Alan and Ieong, Samuel},
  title     = {Diversifying {Search} {Results}},
  booktitle = {Proceedings of the {Second} {ACM} {International} {Conference} on {Web} {Search} and {Data} {Mining}},
  series    = {{WSDM} '09},
  address   = {New York, NY, USA},
  publisher = {ACM},
  year      = {2009},
  pages     = {5--14},
  isbn      = {978-1-60558-390-7},
  doi       = {10.1145/1498759.1498766},
  url       = {http://doi.acm.org/10.1145/1498759.1498766},
  urldate   = {2019-01-27},
  abstract  = {We study the problem of answering ambiguous web queries in a setting where there exists a taxonomy of information, and that both queries and documents may belong to more than one category according to this taxonomy. We present a systematic approach to diversifying results that aims to minimize the risk of dissatisfaction of the average user. We propose an algorithm that well approximates this objective in general, and is provably optimal for a natural special case. Furthermore, we generalize several classical IR metrics, including NDCG, MRR, and MAP, to explicitly account for the value of diversification. We demonstrate empirically that our algorithm scores higher in these generalized metrics compared to results produced by commercial search engines.},
  language  = {en},
}

@article{robertson_probabilistic_2009,
  author     = {Robertson, Stephen and Zaragoza, Hugo},
  title      = {The {Probabilistic} {Relevance} {Framework}: {BM25} and {Beyond}},
  shorttitle = {The {Probabilistic} {Relevance} {Framework}},
  journal    = {Foundations and Trends{\textregistered} in Information Retrieval},
  volume     = {3},
  number     = {4},
  month      = dec,
  year       = {2009},
  pages      = {333--389},
  issn       = {1554-0669, 1554-0677},
  doi        = {10.1561/1500000019},
  url        = {https://www.nowpublishers.com/article/Details/INR-019},
  urldate    = {2019-01-18},
  abstract   = {The Probabilistic Relevance Framework (PRF) is a formal framework for document retrieval, grounded in work done in the 1970–1980s, which led to the development of one of the most successful text-retrieval algorithms, BM25. In recent years, research in the PRF has yielded new retrieval models capable of taking into account document meta-data (especially structure and link-graph information). Again, this has led to one of the most successful Web-search and corporate-search algorithms, BM25F. This work presents the PRF from a conceptual point of view, describing the probabilistic modelling assumptions behind the framework and the different ranking algorithms that result from its application: the binary independence model, relevance feedback models, BM25 and BM25F. It also discusses the relation between the PRF and other statistical models for IR, and covers some related topics, such as the use of non-textual features, and parameter optimisation for models with free parameters.},
  language   = {en},
}

@inproceedings{vieira_query_2011,
  author    = {Vieira, Marcos R. and Razente, Humberto L. and Barioni, Maria C. N. and Hadjieleftheriou, Marios and Srivastava, Divesh and Traina, Caetano and Tsotras, Vassilis J.},
  title     = {On {Query} {Result} {Diversification}},
  booktitle = {Proceedings of the 2011 {IEEE} 27th {International} {Conference} on {Data} {Engineering}},
  series    = {{ICDE} '11},
  address   = {Washington, DC, USA},
  publisher = {IEEE Computer Society},
  year      = {2011},
  pages     = {1163--1174},
  isbn      = {978-1-4244-8959-6},
  doi       = {10.1109/ICDE.2011.5767846},
  url       = {http://dx.doi.org/10.1109/ICDE.2011.5767846},
  urldate   = {2019-01-27},
  abstract  = {In this paper we describe a general framework for evaluation and optimization of methods for diversifying query results. In these methods, an initial ranking candidate set produced by a query is used to construct a result set, where elements are ranked with respect to relevance and diversity features, i.e., the retrieved elements should be as relevant as possible to the query, and, at the same time, the result set should be as diverse as possible. While addressing relevance is relatively simple and has been heavily studied, diversity is a harder problem to solve. One major contribution of this paper is that, using the above framework, we adapt, implement and evaluate several existing methods for diversifying query results. We also propose two new approaches, namely the Greedy with Marginal Contribution (GMC) and the Greedy Randomized with Neighborhood Expansion (GNE) methods. Another major contribution of this paper is that we present the first thorough experimental evaluation of the various diversification techniques implemented in a common framework. We examine the methods' performance with respect to precision, running time and quality of the result. Our experimental results show that while the proposed methods have higher running times, they achieve precision very close to the optimal, while also providing the best result quality. While GMC is deterministic, the randomized approach (GNE) can achieve better result quality if the user is willing to tradeoff running time.},
  language  = {en},
}