
About me
Hi, I’m Yifan Wang, a PhD student in the Research Training Group (RTG) 2853 “Neuroexplicit Models of Language, Vision, and Action”. I’m lucky to be co-supervised by Prof. Dr. Vera Demberg and Prof. Dr. Isabel Valera.
My research focuses on tackling undesired behaviors in NLP systems, such as toxicity, social biases, and stereotypes. I’m also deeply interested in making large language models more interpretable and transparent. If you’re curious to learn more, feel free to check out my website: ewanwong.github.io.
Publications
2026
Wang, Yifan; Jobanputra, Mayank; Lee, Ji-Ung; Oh, Soyoung; Valera, Isabel; Demberg, Vera
Bridging Fairness and Explainability: Can Input-Based Explanations Promote Fairness in Hate Speech Detection? Journal Article
In: arXiv preprint arXiv:2509.22291, 2026.
@article{wang2026bridging,
title = {Bridging Fairness and Explainability: Can Input-Based Explanations Promote Fairness in Hate Speech Detection?},
author = {Yifan Wang and Mayank Jobanputra and Ji-Ung Lee and Soyoung Oh and Isabel Valera and Vera Demberg},
doi = {10.48550/arXiv.2509.22291},
year = {2026},
date = {2026-02-11},
urldate = {2026-02-11},
abstract = {Natural language processing (NLP) models often replicate or amplify social bias from training data, raising concerns about fairness. At the same time, their black-box nature makes it difficult for users to recognize biased predictions and for developers to effectively mitigate them. While some studies suggest that input-based explanations can help detect and mitigate bias, others question their reliability in ensuring fairness. Existing research on explainability in fair NLP has been predominantly qualitative, with limited large-scale quantitative analysis. In this work, we conduct the first systematic study of the relationship between explainability and fairness in hate speech detection, focusing on both encoder- and decoder-only models. We examine three key dimensions: (1) identifying biased predictions, (2) selecting fair models, and (3) mitigating bias during model training. Our findings show that input-based explanations can effectively detect biased predictions and serve as useful supervision for reducing bias during training, but they are unreliable for selecting fair models among candidates. Our code is available at https://github.com/Ewanwong/fairness_x_explainability.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2025
Jobanputra, Mayank; Kovtunova, Alisa; Balthes, Brisca; Pogulskiy, Fedor Grigoryevich; Wang, Yifan; Borgwardt, Stefan; Demberg, Vera
ProofTeller: Exposing recency bias in LLM reasoning and its side effects on communication Proceedings Article
In: Inui, Kentaro; Sakti, Sakriani; Wang, Haofen; Wong, Derek F.; Bhattacharyya, Pushpak; Banerjee, Biplab; Ekbal, Asif; Chakraborty, Tanmoy; Singh, Dhirendra Pratap (Ed.): Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pp. 1439–1462, The Asian Federation of Natural Language Processing and The Association for Computational Linguistics, Mumbai, India, 2025, ISBN: 979-8-89176-298-5.
@inproceedings{jobanputra-etal-2025-proofteller,
title = {ProofTeller: Exposing recency bias in LLM reasoning and its side effects on communication},
author = {Mayank Jobanputra and Alisa Kovtunova and Brisca Balthes and Fedor Grigoryevich Pogulskiy and Yifan Wang and Stefan Borgwardt and Vera Demberg},
editor = {Kentaro Inui and Sakriani Sakti and Haofen Wang and Derek F. Wong and Pushpak Bhattacharyya and Biplab Banerjee and Asif Ekbal and Tanmoy Chakraborty and Dhirendra Pratap Singh},
url = {https://aclanthology.org/2025.ijcnlp-long.80/},
doi = {10.18653/v1/2025.ijcnlp-long.80},
isbn = {979-8-89176-298-5},
year = {2025},
date = {2025-12-01},
urldate = {2025-12-01},
booktitle = {Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics},
pages = {1439–1462},
publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
address = {Mumbai, India},
abstract = {Large language models (LLMs) are increasingly applied in domains that demand reliable and interpretable reasoning. While formal methods can generate provably correct proofs, these proofs are often inaccessible to non-expert users. This raises a natural question: can LLMs, when given a verified proof, faithfully interpret its reasoning and communicate it clearly? We introduce ProofTeller, a benchmark that evaluates this ability across three tasks: (1) identifying key proof steps, (2) summarizing the reasoning, and (3) explaining the result in concise natural language. The benchmark covers three domains: Biology, Drones, and Recipes, representing scientific, safety-critical, and everyday reasoning scenarios. We find a consistent near-conclusion bias: LLMs tend to focus on steps closest to the final proof conclusion rather than on the most informative ones. A targeted human study confirms that explanations based on such steps are rated less appropriate for end users. These findings indicate that even when reasoning is provided, current LLMs face challenges in communicating key information in a useful manner, highlighting the need for LLMs that can communicate important details reliably.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Wang, Yifan; Rao, Sukrut; Lee, Ji-Ung; Jobanputra, Mayank; Demberg, Vera
B-cos LM: Efficiently Transforming Pre-trained Language Models for Improved Explainability Journal Article
In: Transactions on Machine Learning Research, 2025, ISSN: 2835-8856.
@article{wang2025bcos,
title = {B-cos LM: Efficiently Transforming Pre-trained Language Models for Improved Explainability},
author = {Yifan Wang and Sukrut Rao and Ji-Ung Lee and Mayank Jobanputra and Vera Demberg},
url = {https://openreview.net/forum?id=c180UH8Dg8},
issn = {2835-8856},
year = {2025},
date = {2025-01-01},
urldate = {2025-01-01},
journal = {Transactions on Machine Learning Research},
abstract = {Post-hoc explanation methods for black-box models often struggle with faithfulness and human interpretability due to the lack of explainability in current neural architectures. Meanwhile, B-cos networks have been introduced to improve model explainability by proposing an architecture that removes bias terms and promotes input-weight alignment. Although B-cos networks have shown success in building explainable systems, their application has so far been limited to computer vision models and their associated training pipelines. In this work, we introduce B-cos LMs, i.e., B-cos Language Models (LMs) empowered for natural language processing (NLP) tasks. Our approach directly transforms pre-trained language models into B-cos LMs by combining B-cos conversion and task fine-tuning, improving efficiency compared to previous methods. Automatic and human evaluation results demonstrate that B-cos LMs produce more faithful and human interpretable explanations than post-hoc methods, while maintaining task performance comparable to conventional fine-tuning. Our in-depth analysis explores how B-cos LMs differ from conventionally fine-tuned models in their learning processes and explanation patterns. Finally, we present a first exploration of transforming decoder-only models to B-cos LMs for generation tasks. Our code is available at https://github.com/Ewanwong/bcos_lm.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Jobanputra, Mayank; Walter, Nils Philipp; Mehta, Maitrey; Veseli, Blerta; Chapple, Evan Parker Kelly; Wang, Yifan; Chetani, Sneha; Pavlick, Ellie; Vergari, Antonio; Demberg, Vera
Can LLMs subtract numbers? Miscellaneous
arXiv preprint arXiv:2511.02795, 2025.
@misc{jobanputra2025llmssubtractnumbers,
title = {Can LLMs subtract numbers?},
author = {Mayank Jobanputra and Nils Philipp Walter and Maitrey Mehta and Blerta Veseli and Evan Parker Kelly Chapple and Yifan Wang and Sneha Chetani and Ellie Pavlick and Antonio Vergari and Vera Demberg},
url = {https://arxiv.org/abs/2511.02795},
year = {2025},
date = {2025-01-01},
urldate = {2025-01-01},
abstract = {We present a systematic study of subtraction in large language models (LLMs). While prior benchmarks emphasize addition and multiplication, subtraction has received comparatively little attention despite being structurally distinct as a non-commutative operation. We evaluate eight pretrained LLMs spanning four families on addition and subtraction problems. Our experiments reveal that subtraction accuracy lags behind addition by a wide margin. We find that the errors for (a-b) are concentrated in cases where (a<b). In such cases, LLMs frequently produce the correct magnitude but omit the negative sign. Probing analyses show that LLMs internally encode whether results should be negative, yet this information is often not reflected in generated outputs. We further test well-known techniques such as few-shot learning and instruction-tuning to see if they can improve the LLMs' performance. Our results suggest that while few-shot prompting yields modest gains, the instruction-tuned models achieve near-perfect accuracies in generating the negative sign. Together, these findings provide a clearer characterization of the limitations and recoverability of LLMs' arithmetic capabilities in subtraction.},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
2024
Wang, Yifan; Demberg, Vera
RSA-Control: A Pragmatics-Grounded Lightweight Controllable Text Generation Framework Proceedings Article
In: Al-Onaizan, Yaser; Bansal, Mohit; Chen, Yun-Nung (Ed.): Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pp. 5561–5582, Association for Computational Linguistics, Miami, Florida, USA, 2024.
@inproceedings{wang-demberg-2024-rsa,
title = {RSA-Control: A Pragmatics-Grounded Lightweight Controllable Text Generation Framework},
author = {Yifan Wang and Vera Demberg},
editor = {Yaser Al-Onaizan and Mohit Bansal and Yun-Nung Chen},
url = {https://aclanthology.org/2024.emnlp-main.318/},
doi = {10.18653/v1/2024.emnlp-main.318},
year = {2024},
date = {2024-11-01},
booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
pages = {5561–5582},
publisher = {Association for Computational Linguistics},
address = {Miami, Florida, USA},
abstract = {Despite significant advancements in natural language generation, controlling language models to produce texts with desired attributes remains a formidable challenge. In this work, we introduce RSA-Control, a training-free controllable text generation framework grounded in pragmatics. RSA-Control directs the generation process by recursively reasoning between imaginary speakers and listeners, enhancing the likelihood that target attributes are correctly interpreted by listeners amidst distractors. Additionally, we introduce a self-adjustable rationality parameter, which allows for automatic adjustment of control strength based on context. Our experiments, conducted with two task types and two types of language models, demonstrate that RSA-Control achieves strong attribute control while maintaining language fluency and content consistency. Our code is available at https://github.com/Ewanwong/RSA-Control.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Wang, Yifan; Demberg, Vera
A Parameter-Efficient Multi-Objective Approach to Mitigate Stereotypical Bias in Language Models Proceedings Article
In: Faleńska, Agnieszka; Basta, Christine; Costa-jussà, Marta; Goldfarb-Tarrant, Seraphina; Nozza, Debora (Ed.): Proceedings of the 5th Workshop on Gender Bias in Natural Language Processing (GeBNLP), pp. 1–19, Association for Computational Linguistics, Bangkok, Thailand, 2024.
@inproceedings{wang-demberg-2024-parameter,
title = {A Parameter-Efficient Multi-Objective Approach to Mitigate Stereotypical Bias in Language Models},
author = {Yifan Wang and Vera Demberg},
editor = {Agnieszka Faleńska and Christine Basta and Marta Costa-jussà and Seraphina Goldfarb-Tarrant and Debora Nozza},
url = {https://aclanthology.org/2024.gebnlp-1.1/},
doi = {10.18653/v1/2024.gebnlp-1.1},
year = {2024},
date = {2024-08-01},
booktitle = {Proceedings of the 5th Workshop on Gender Bias in Natural Language Processing (GeBNLP)},
pages = {1–19},
publisher = {Association for Computational Linguistics},
address = {Bangkok, Thailand},
abstract = {Pre-trained language models have shown impressive abilities of understanding and generating natural languages. However, they typically inherit undesired human-like bias and stereotypes from training data, which raises concerns about putting these models into use in real-world scenarios. Although prior research has proposed to reduce bias using different fairness objectives, they usually fail to capture different representations of bias and, therefore, struggle with fully debiasing models. In this work, we introduce a multi-objective probability alignment approach to overcome current challenges by incorporating multiple debiasing losses to locate and penalize bias in different forms. Compared to existing methods, our proposed method can more effectively and comprehensively reduce stereotypical bias, and maintains the language ability of pre-trained models at the same time. Besides, we adopt prefix-tuning to optimize fairness objectives, and results show that it can achieve better bias removal than full fine-tuning while requiring much fewer computational resources. Our code and data are available at https://github.com/Ewanwong/debias_NLG.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Liu, Dongqi; Wang, Yifan; Loy, Jia; Demberg, Vera
SciNews: From Scholarly Complexities to Public Narratives – a Dataset for Scientific News Report Generation Proceedings Article
In: Calzolari, Nicoletta; Kan, Min-Yen; Hoste, Veronique; Lenci, Alessandro; Sakti, Sakriani; Xue, Nianwen (Ed.): Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pp. 14429–14444, ELRA and ICCL, Torino, Italia, 2024.
@inproceedings{pu-etal-2024-scinews,
title = {SciNews: From Scholarly Complexities to Public Narratives – a Dataset for Scientific News Report Generation},
author = {Dongqi Liu and Yifan Wang and Jia Loy and Vera Demberg},
editor = {Nicoletta Calzolari and Min-Yen Kan and Veronique Hoste and Alessandro Lenci and Sakriani Sakti and Nianwen Xue},
url = {https://aclanthology.org/2024.lrec-main.1258/},
year = {2024},
date = {2024-05-01},
urldate = {2024-05-01},
booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
pages = {14429–14444},
publisher = {ELRA and ICCL},
address = {Torino, Italia},
abstract = {Scientific news reports serve as a bridge, adeptly translating complex research articles into reports that resonate with the broader public. The automated generation of such narratives enhances the accessibility of scholarly insights. In this paper, we present a new corpus to facilitate this paradigm development. Our corpus comprises a parallel compilation of academic publications and their corresponding scientific news reports across nine disciplines. To demonstrate the utility and reliability of our dataset, we conduct an extensive analysis, highlighting the divergences in readability and brevity between scientific news narratives and academic manuscripts. We benchmark our dataset employing state-of-the-art text generation models. The evaluation process involves both automatic and human evaluation, which lays the groundwork for future explorations into the automated generation of scientific news reports. The dataset and code related to this work are available at https://dongqi.me/projects/SciNews.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2023
Liu, Dongqi; Wang, Yifan; Demberg, Vera
Incorporating Distributions of Discourse Structure for Long Document Abstractive Summarization Proceedings Article
In: Rogers, Anna; Boyd-Graber, Jordan; Okazaki, Naoaki (Ed.): Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 5574–5590, Association for Computational Linguistics, Toronto, Canada, 2023.
@inproceedings{pu-etal-2023-incorporating,
title = {Incorporating Distributions of Discourse Structure for Long Document Abstractive Summarization},
author = {Dongqi Liu and Yifan Wang and Vera Demberg},
editor = {Anna Rogers and Jordan Boyd-Graber and Naoaki Okazaki},
url = {https://aclanthology.org/2023.acl-long.306/},
doi = {10.18653/v1/2023.acl-long.306},
year = {2023},
date = {2023-07-01},
urldate = {2023-07-01},
booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
pages = {5574–5590},
publisher = {Association for Computational Linguistics},
address = {Toronto, Canada},
abstract = {For text summarization, the role of discourse structure is pivotal in discerning the core content of a text. Regrettably, prior studies on incorporating Rhetorical Structure Theory (RST) into transformer-based summarization models only consider the nuclearity annotation, thereby overlooking the variety of discourse relation types. This paper introduces the ‘RSTformer’, a novel summarization model that comprehensively incorporates both the types and uncertainty of rhetorical relations. Our RST-attention mechanism, rooted in document-level rhetorical structure, is an extension of the recently devised Longformer framework. Through rigorous evaluation, the model proposed herein exhibits significant superiority over state-of-the-art models, as evidenced by its notable performance on several automatic metrics and human evaluation.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
