@inproceedings{chen2017metaevaluation,
  title     = {Meta-evaluation of online and offline Web search evaluation metrics},
  abstract  = {As in most information retrieval (IR) studies, evaluation plays an essential part in Web search research. Both offline and online evaluation metrics are adopted to measure the performance of search engines. Offline metrics are usually based on relevance judgments of query-document pairs from assessors, while online metrics exploit user behavior data, such as clicks, collected from search engines to compare search algorithms. Although both types of IR evaluation metrics have achieved success, the extent to which they can predict user satisfaction remains under-investigated. To shed light on this research question, we meta-evaluate a series of existing online and offline metrics to study how well they infer actual search user satisfaction in different search scenarios. We find that both types of evaluation metrics significantly correlate with user satisfaction, while reflecting satisfaction from different perspectives for different search tasks. Offline metrics better align with user satisfaction in homogeneous search (i.e., ten blue links), whereas online metrics outperform when vertical results are federated. Finally, we also propose to incorporate mouse hover information into existing online evaluation metrics, and empirically show that the resulting metrics align better with search user satisfaction than click-based online metrics.},
  booktitle = {SIGIR '17: 40th International ACM SIGIR Conference on Research and Development in Information Retrieval},
  doi       = {10.1145/3077136.3080804},
  isbn      = {9781450350228},
  address   = {Shinjuku, Tokyo, Japan},
  pages     = {15--24},
  url       = {https://nottingham-repository.worktribe.com/output/876955},
  year      = {2017},
  author    = {Chen, Ye and Zhou, Ke and Liu, Yiqun and Zhang, Min and Ma, Shaoping}
}