Machine Learning – Publications
Majumdar, Ayan; Kanubala, Deborah Dormah; Gupta, Kavya; Valera, Isabel
A Causal Framework to Measure and Mitigate Non-binary Treatment Discrimination Journal Article
In: CoRR, vol. abs/2503.22454, 2026.
Abstract | Links | BibTeX | Tags: ayanm, deborah, isabel, kavya, saml
@article{DBLP:journals/corr/abs-2503-22454,
  author    = {Majumdar, Ayan and Kanubala, Deborah Dormah and Gupta, Kavya and Valera, Isabel},
  title     = {A Causal Framework to Measure and Mitigate Non-binary Treatment Discrimination},
  journal   = {CoRR},
  volume    = {abs/2503.22454},
  year      = {2026},
  date      = {2026-03-19},
  urldate   = {2026-03-19},
  doi       = {10.48550/ARXIV.2503.22454},
  url       = {https://doi.org/10.48550/arXiv.2503.22454},
  abstract  = {Fairness studies of algorithmic decision-making systems often simplify complex decision processes, such as bail or lending decisions, into binary classification tasks (e.g., approve or not approve). However, these approaches overlook that such decisions are not inherently binary; they also involve non-binary treatment decisions (e.g., loan or bail terms) that can influence the downstream outcomes (e.g., loan repayment or reoffending). We argue that treatment decisions are integral to the decision-making process and, therefore, should be central to fairness analyses. Consequently, we propose a causal framework that extends and complements existing fairness notions by explicitly distinguishing between decision-subjects’ covariates and the treatment decisions. Our framework leverages path-specific counterfactual reasoning to: (i) measure treatment disparity and its downstream effects in historical data; and (ii) mitigate the impact of past unfair treatment decisions when automating decision-making. We use our framework to empirically analyze four widely used loan approval datasets to reveal potential disparity in non-binary treatment decisions and their discriminatory impact on outcomes, highlighting the need to incorporate treatment decisions in fairness assessments. Finally, by intervening in treatment decisions, we show that our framework effectively mitigates treatment discrimination from historical loan approval data to ensure fair risk score estimation and (non-binary) decision-making processes that benefit all stakeholders.},
  keywords  = {ayanm, deborah, isabel, kavya, saml},
  pubstate  = {published},
  tppubtype = {article}
}
Koyuncu, Batuhan; Kwon, Byeungchun; Lombardi, Marco Jacopo; Perez-Cruz, Fernando; Shin, Hyun Song
BISTRO: a general purpose oracle for macroeconomic time series Journal Article
In: 2026.
Abstract | BibTeX | Tags: batu
@article{koyuncu2026bistro,
  author    = {Koyuncu, Batuhan and Kwon, Byeungchun and Lombardi, Marco Jacopo and Perez-Cruz, Fernando and Shin, Hyun Song},
  title     = {BISTRO: a general purpose oracle for macroeconomic time series},
  year      = {2026},
  date      = {2026-03-16},
  urldate   = {2026-03-16},
  abstract  = {Predictions of macroeconomic variables are a key input to economic policy, yet traditional econometric approaches have the limitation that the model needs to be tailored to the specific task. The advent of large language models (LLMs) opens up the tantalising prospect that a single general model can tackle a wide variety of tasks. This article introduces the BIS Time-series Regression Oracle (BISTRO), a general purpose time series model for macroeconomic forecasting. Building on the transformer architecture underlying LLMs, BISTRO is fine-tuned on the large repository of macroeconomic data maintained at the BIS. We put the model through its paces by assessing how well it forecasts the 2021 inflation surge. In contrast to standard benchmarks, which mechanically project a reversion to the mean, BISTRO correctly anticipates the persistence of the inflation wave. This highlights its ability to adapt to unfamiliar patterns in the data. Thus, BISTRO holds promise for producing reliable baseline forecasts and for scenario analysis.},
  keywords  = {batu},
  pubstate  = {published},
  tppubtype = {article}
}
Koyuncu, Batuhan; Kwon, Byeungchun; Lombardi, Marco Jacopo; Perez-Cruz, Fernando; Shin, Hyun Song
Introducing BISTRO: a foundational model for unconditional and conditional forecasting of macroeconomic time series Journal Article
In: 2026.
Abstract | BibTeX | Tags: batu
@article{koyuncu2026introducing,
  author    = {Koyuncu, Batuhan and Kwon, Byeungchun and Lombardi, Marco Jacopo and Perez-Cruz, Fernando and Shin, Hyun Song},
  title     = {Introducing BISTRO: a foundational model for unconditional and conditional forecasting of macroeconomic time series},
  year      = {2026},
  date      = {2026-03-13},
  urldate   = {2026-03-13},
  abstract  = {This article introduces the BIS Time-series Regression Oracle (BISTRO), a general purpose time series model for macroeconomic forecasting. Its edge over traditional econometric approaches lies in its ability to deal with generic unconditional and conditional forecasting tasks, without requiring to adjust the model to the macroeconomic tasks being tackled. Building on the transformer architecture underlying LLMs, BISTRO is fine-tuned on the large repository of macroeconomic data maintained at the BIS. We show that BISTRO provides reliable unconditional forecasts for key macroeconomic aggregates and illustrate how using it for conditional forecasting can help unveiling patterns of nonlinearity in the data.},
  keywords  = {batu},
  pubstate  = {published},
  tppubtype = {article}
}
Vo, Huyen; Martínez-García, María; Valera, Isabel
Holder++: Improving the Quality-Coherence Trade-off in Multimodal VAEs Proceedings Article
In: 2026.
Abstract | Links | BibTeX | Tags: huyen, isabel, maria
@inproceedings{vo2026holder,
  author    = {Vo, Huyen and Martínez-García, María and Valera, Isabel},
  title     = {Holder++: Improving the Quality-Coherence Trade-off in Multimodal VAEs},
  year      = {2026},
  date      = {2026-03-11},
  urldate   = {2026-03-11},
  url       = {https://vothuckhanhhuyen.github.io/assets/pdf/Holder_ICML2026.pdf},
  abstract  = {Existing approaches for multimodal variational autoencoders (VAEs) face a trade-off between generative quality and coherence—i.e., they struggle to generate realistic and diverse samples that, at the same time, are semantically consistent across modalities. A recent work shows that using a simple approximation to Hölder pooling as an aggregation method improves coherence over the SOTA MMVAE+, despite assuming a single shared representation across all modalities. Yet, it slightly compromises sample diversity. Inspired by this insight, we propose Hölder++, a novel multimodal VAE that improves the generative quality-coherence trade-off through: (i) the first implementation of Hölder pooling without any approximation for multimodal VAEs; (ii) an extended architecture that models distinct shared and private (i.e., modality-specific) representations (Hölder+); and (iii) hierarchical inference that further enhances the disentanglement between the shared and private representations (Hölder++). Our experiments corroborate that Hölder++ consistently improves the generative quality-coherence trade-off, yields more structured latent spaces, and learns shared representations that are informative for downstream tasks.},
  keywords  = {huyen, isabel, maria},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Koyuncu, Batuhan; Kwon, Byeungchun; Lombardi, Marco; Perez-Cruz, Fernando; Shin, Hyun Song
A foundational model for macroeconomic times series forecasting and nowcasting Journal Article
In: 2026.
@article{koyuncu2026foundational,
  author    = {Koyuncu, Batuhan and Kwon, Byeungchun and Lombardi, Marco and Perez-Cruz, Fernando and Shin, Hyun Song},
  title     = {A foundational model for macroeconomic times series forecasting and nowcasting},
  year      = {2026},
  date      = {2026-02-20},
  urldate   = {2026-02-20},
  keywords  = {batu},
  pubstate  = {published},
  tppubtype = {article}
}
Wang, Yifan; Jobanputra, Mayank; Lee, Ji-Ung; Oh, Soyoung; Valera, Isabel; Demberg, Vera
Bridging Fairness and Explainability: Can Input-Based Explanations Promote Fairness in Hate Speech Detection? Journal Article
In: 2026.
Abstract | Links | BibTeX | Tags: isabel, yifan
@article{wang2026bridging,
  author    = {Wang, Yifan and Jobanputra, Mayank and Lee, Ji-Ung and Oh, Soyoung and Valera, Isabel and Demberg, Vera},
  title     = {Bridging Fairness and Explainability: Can Input-Based Explanations Promote Fairness in Hate Speech Detection?},
  year      = {2026},
  date      = {2026-02-11},
  urldate   = {2026-02-11},
  doi       = {10.48550/arXiv.2509.22291},
  abstract  = {Natural language processing (NLP) models often replicate or amplify social bias from training data, raising concerns about fairness. At the same time, their black-box nature makes it difficult for users to recognize biased predictions and for developers to effectively mitigate them. While some studies suggest that input-based explanations can help detect and mitigate bias, others question their reliability in ensuring fairness. Existing research on explainability in fair NLP has been predominantly qualitative, with limited large-scale quantitative analysis. In this work, we conduct the first systematic study of the relationship between explainability and fairness in hate speech detection, focusing on both encoder- and decoder-only models. We examine three key dimensions: (1) identifying biased predictions, (2) selecting fair models, and (3) mitigating bias during model training. Our findings show that input-based explanations can effectively detect biased predictions and serve as useful supervision for reducing bias during training, but they are unreliable for selecting fair models among candidates. Our code is available at https://github.com/Ewanwong/fairness_x_explainability},
  keywords  = {isabel, yifan},
  pubstate  = {published},
  tppubtype = {article}
}
Uth, Richard; Niemitz, Nelli; Valera, Isabel; Langer, Markus
Personalizing explanations in AI-based decisions: The effects of personalization and (Mis)aligning with individual preferences Journal Article
In: Computers in Human Behavior, 2026.
Abstract | Links | BibTeX | Tags: isabel
@article{Uth2025PersonalizingEI,
  author    = {Uth, Richard and Niemitz, Nelli and Valera, Isabel and Langer, Markus},
  title     = {Personalizing explanations in AI-based decisions: The effects of personalization and (Mis)aligning with individual preferences},
  journal   = {Computers in Human Behavior},
  year      = {2026},
  date      = {2026-02-03},
  urldate   = {2025-01-01},
  url       = {https://api.semanticscholar.org/CorpusID:283171660},
  abstract  = {The increasing reliance on AI-based decision-making in high-stakes contexts underscores the need for transparency and justice. Here, negative outcomes drive individuals affected by AI-based decisions to seek actionable explanations that enable them to realize what they can do to achieve a better future outcome. However, actionability is subjective, varying across individuals and contexts. Personalization of explanations has been proposed to address this variability, but insights on personalized explanation processes, their potential, and challenges are scarce. This paper investigates the impact of personalization and (mis)alignment with individual needs and preferences in explanations for AI-based decisions through an experimental online study simulating denied loan applications. In a within-participants design (N=255), participants ranked the actionability of decision-relevant features and experienced five explanation conditions: personalized directive explanations based on the most, second most, or least actionable feature (as ranked by participants); a non-personalized directive explanation highlighting a random feature; and no explanation. In line with justice theory, our results show that any explanation was better than none, and that personalized explanations led to more favorable reactions than non-personalized explanations, enhancing perceptions of justice and attractiveness of the bank. Closer alignment with preferences had only small positive effects, mainly for attractiveness. These findings highlight that even simple ranking-based approaches can make explanations more effective and accessible without requiring technical expertise while cautioning against offering superficial control. This study provides insights into the effects of ranking-based personalization, informing the design of explainability tailored to diverse user needs and addressing ethical and practical considerations in personalization.},
  keywords  = {isabel},
  pubstate  = {published},
  tppubtype = {article}
}
Müller, Nicola J.; Oster, Moritz; Valera, Isabel; Hoffmann, Jörg; Gros, Timo P.
Per-Domain Generalizing Policies: On Learning Efficient and Robust Q-Value Functions (Extended Version with Technical Appendix) Journal Article
In: CoRR, vol. abs/2603.17544, 2026.
Abstract | Links | BibTeX | Tags: isabel
@article{DBLP:journals/corr/abs-2603-17544,
  author    = {Müller, Nicola J. and Oster, Moritz and Valera, Isabel and Hoffmann, Jörg and Gros, Timo P.},
  title     = {Per-Domain Generalizing Policies: On Learning Efficient and Robust Q-Value Functions (Extended Version with Technical Appendix)},
  journal   = {CoRR},
  volume    = {abs/2603.17544},
  year      = {2026},
  date      = {2026-01-01},
  urldate   = {2026-01-01},
  doi       = {10.48550/ARXIV.2603.17544},
  url       = {https://doi.org/10.48550/arXiv.2603.17544},
  abstract  = {Learning per-domain generalizing policies is a key challenge in learning for planning. Standard approaches learn state-value functions represented as graph neural networks using supervised learning on optimal plans generated by a teacher planner. In this work, we advocate for learning Q-value functions instead. Such policies are drastically cheaper to evaluate for a given state, as they need to process only the current state rather than every successor. Surprisingly, vanilla supervised learning of Q-values performs poorly as it does not learn to distinguish between the actions taken and those not taken by the teacher. We address this by using regularization terms that enforce this distinction, resulting in Q-value policies that consistently outperform state-value policies across a range of 10 domains and are competitive with the planner LAMA-first.},
  keywords  = {isabel},
  pubstate  = {published},
  tppubtype = {article}
}
Gupta, Kavya; Kalampalikis, Nektarios; Heitz, Christoph; Valera, Isabel
First-See-Then-Design: A Multi-Stakeholder View for Optimal Performance-Fairness Trade-Offs Journal Article
In: arXiv preprint arXiv:2604.14035, 2026.
Abstract | Links | BibTeX | Tags: isabel, nektarios, saml
@article{gupta2026first,
  author     = {Gupta, Kavya and Kalampalikis, Nektarios and Heitz, Christoph and Valera, Isabel},
  title      = {First-See-Then-Design: A Multi-Stakeholder View for Optimal Performance-Fairness Trade-Offs},
  journal    = {CoRR},
  volume     = {abs/2604.14035},
  year       = {2026},
  date       = {2026-01-01},
  urldate    = {2026-01-01},
  eprint     = {2604.14035},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/2604.14035},
  abstract   = {Fairness in algorithmic decision-making is often defined in the predictive space, where predictive performance - used as a proxy for decision-maker (DM) utility - is traded off against prediction-based fairness notions, such as demographic parity or equality of opportunity. This perspective, however, ignores how predictions translate into decisions and ultimately into utilities and welfare for both DM and decision subjects (DS), as well as their allocation across social-salient groups.
In this paper, we propose a multi-stakeholder framework for fair algorithmic decision-making grounded in welfare economics and distributive justice, explicitly modeling the utilities of both the DM and DS, and defining fairness via a social planner's utility that captures inequalities in DS utilities across groups under different justice-based fairness notions (e.g., Egalitarian, Rawlsian). We formulate fair decision-making as a post-hoc multi-objective optimization problem, characterizing the achievable performance-fairness trade-offs in the two-dimensional utility space of DM utility and the social planner's utility, under different decision policy classes (deterministic vs. stochastic, shared vs. group-specific). Using the proposed framework, we then identify conditions (in terms of the stakeholders' utilities) under which stochastic policies are more optimal than deterministic ones, and empirically demonstrate that simple stochastic policies can yield superior performance-fairness trade-offs by leveraging outcome uncertainty. Overall, we advocate a shift from prediction-centric fairness to a transparent, justice-based, multi-stakeholder approach that supports the collaborative design of decision-making policies.},
  keywords   = {isabel, nektarios, saml},
  pubstate   = {published},
  tppubtype  = {article}
}
In this paper, we propose a multi-stakeholder framework for fair algorithmic decision-making grounded in welfare economics and distributive justice, explicitly modeling the utilities of both the DM and DS, and defining fairness via a social planner's utility that captures inequalities in DS utilities across groups under different justice-based fairness notions (e.g., Egalitarian, Rawlsian). We formulate fair decision-making as a post-hoc multi-objective optimization problem, characterizing the achievable performance-fairness trade-offs in the two-dimensional utility space of DM utility and the social planner's utility, under different decision policy classes (deterministic vs. stochastic, shared vs. group-specific). Using the proposed framework, we then identify conditions (in terms of the stakeholders' utilities) under which stochastic policies are more optimal than deterministic ones, and empirically demonstrate that simple stochastic policies can yield superior performance-fairness trade-offs by leveraging outcome uncertainty. Overall, we advocate a shift from prediction-centric fairness to a transparent, justice-based, multi-stakeholder approach that supports the collaborative design of decision-making policies.
Vo, Huyen Thuc Khanh; Valera, Isabel
Hellinger Multimodal Variational Autoencoders Proceedings Article Spotlight
In: The 29th International Conference on Artificial Intelligence and Statistics, 2026.
Abstract | Links | BibTeX | Tags: huyen, isabel, saml, spotlight
@inproceedings{vo2026hellinger,
  author    = {Vo, Huyen Thuc Khanh and Valera, Isabel},
  title     = {Hellinger Multimodal Variational Autoencoders},
  booktitle = {The 29th International Conference on Artificial Intelligence and Statistics},
  year      = {2026},
  date      = {2026-01-01},
  urldate   = {2026-01-01},
  url       = {https://openreview.net/forum?id=mxHyYltMUa},
  abstract  = {Multimodal variational autoencoders (VAEs) are widely used for weakly supervised generative learning with multiple modalities. Predominant methods aggregate unimodal inference distributions using either a product of experts (PoE), a mixture of experts (MoE), or their combinations to approximate the joint posterior. In this work, we revisit multimodal inference through the lens of probabilistic opinion pooling, an optimization-based approach. We start from Hölder pooling with α=0.5, which corresponds to the unique symmetric member of the α-divergence family, and derive a moment-matching approximation, termed Hellinger. We then leverage such an approximation to propose HELVAE, a multimodal VAE that avoids sub-sampling, yielding an efficient yet effective model that: (i) learns more expressive latent representations as additional modalities are observed; and (ii) empirically achieves better trade-offs between generative coherence and quality, outperforming state-of-the-art multimodal VAE models.},
  keywords  = {huyen, isabel, saml, spotlight},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Valdrighi, Giovani; Valera, Isabel; Raimundo, Marcos Medeiros
Long-term Fairness with Selective Labels Miscellaneous
2026.
Abstract | Links | BibTeX | Tags: isabel, saml
@misc{valdrighi2026longtermfairnessselectivelabels,
  author    = {Valdrighi, Giovani and Valera, Isabel and Raimundo, Marcos Medeiros},
  title     = {Long-term Fairness with Selective Labels},
  year      = {2026},
  date      = {2026-01-01},
  urldate   = {2026-01-01},
  url       = {https://arxiv.org/abs/2605.22291},
  abstract  = {Long-term fairness algorithms aim to satisfy fairness beyond static and short-term notions by accounting for the dynamics between decision-making policies and population behavior. Most previous approaches evaluate performance and fairness measures from observable features and a label, which is assumed to be fully observed. However, in scenarios such as hiring or lending, the labels (e.g., ability to repay the loan) are selective labels as they are only revealed based on positive decisions (e.g., when a loan is granted). In this paper, we study long-term fairness in the selective labels setting and analytically show that naive solutions do not guarantee fairness. To address this gap, we then introduce a novel framework that leverages both the observed data and a label predictor model to estimate the true fairness measure value by decomposing it into the observed fairness and bias from label predictions. This allows us to derive sufficient conditions to satisfy true fairness from observable quantities by using the confidence in the predictor model. Finally, we rely on our theoretical results to propose a novel reinforcement learning algorithm for effective long-term fair decision-making with selective labels. In semisynthetic environments, the proposed algorithm reached comparable fairness and performance to an agent with oracle access to the true labels.},
  keywords  = {isabel, saml},
  pubstate  = {published},
  tppubtype = {misc}
}
Budde, Lena Marie; Majumdar, Ayan; Uth, Richard; Langer, Markus; Valera, Isabel
From Universal to Individualized Actionability: Revisiting Personalization in Algorithmic Recourse Miscellaneous
2026.
Abstract | Links | BibTeX | Tags: isabel, saml
@misc{budde2026universalindividualizedactionabilityrevisiting,
  author    = {Budde, Lena Marie and Majumdar, Ayan and Uth, Richard and Langer, Markus and Valera, Isabel},
  title     = {From Universal to Individualized Actionability: Revisiting Personalization in Algorithmic Recourse},
  year      = {2026},
  date      = {2026-01-01},
  urldate   = {2026-01-01},
  url       = {https://arxiv.org/abs/2604.08030},
  abstract  = {Algorithmic recourse aims to provide actionable recommendations that enable individuals to change unfavorable model outcomes, and prior work has extensively studied properties such as efficiency, robustness, and fairness. However, the role of personalization in recourse remains largely implicit and underexplored. While existing approaches incorporate elements of personalization through user interactions, they typically lack an explicit definition of personalization and do not systematically analyze its downstream effects on other recourse desiderata.
In this paper, we formalize personalization as individual actionability, characterized along two dimensions: hard constraints that specify which features are individually actionable, and soft, individualized constraints that capture preferences over action values and costs. We operationalize these dimensions within the causal algorithmic recourse framework, adopting a pre-hoc user-prompting approach in which individuals express preferences via rankings or scores prior to the generation of any recourse recommendation. Through extensive empirical evaluation, we investigate how personalization interacts with key recourse desiderata, including validity, cost, and plausibility. Our results highlight important trade-offs: individual actionability constraints, particularly hard ones, can substantially degrade the plausibility and validity of recourse recommendations across amortized and non-amortized approaches. Notably, we also find that incorporating individual actionability can reveal disparities in the cost and plausibility of recourse actions across socio-demographic groups. These findings underscore the need for principled definitions, careful operationalization, and rigorous evaluation of personalization in algorithmic recourse.},
  keywords  = {isabel, saml},
  pubstate  = {published},
  tppubtype = {misc}
}
In this paper, we formalize personalization as individual actionability, characterized along two dimensions: hard constraints that specify which features are individually actionable, and soft, individualized constraints that capture preferences over action values and costs. We operationalize these dimensions within the causal algorithmic recourse framework, adopting a pre-hoc user-prompting approach in which individuals express preferences via rankings or scores prior to the generation of any recourse recommendation. Through extensive empirical evaluation, we investigate how personalization interacts with key recourse desiderata, including validity, cost, and plausibility. Our results highlight important trade-offs: individual actionability constraints, particularly hard ones, can substantially degrade the plausibility and validity of recourse recommendations across amortized and non-amortized approaches. Notably, we also find that incorporating individual actionability can reveal disparities in the cost and plausibility of recourse actions across socio-demographic groups. These findings underscore the need for principled definitions, careful operationalization, and rigorous evaluation of personalization in algorithmic recourse.
Jobanputra, Mayank; Kovtunova, Alisa; Balthes, Brisca; Pogulskiy, Fedor Grigoryevich; Wang, Yifan; Borgwardt, Stefan; Demberg, Vera
ProofTeller: Exposing recency bias in LLM reasoning and its side effects on communication Proceedings Article
In: Inui, Kentaro; Sakti, Sakriani; Wang, Haofen; Wong, Derek F.; Bhattacharyya, Pushpak; Banerjee, Biplab; Ekbal, Asif; Chakraborty, Tanmoy; Singh, Dhirendra Pratap (Ed.): Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pp. 1439–1462, The Asian Federation of Natural Language Processing and The Association for Computational Linguistics, Mumbai, India, 2025, ISBN: 979-8-89176-298-5.
Abstract | Links | BibTeX | Tags: yifan
@inproceedings{jobanputra-etal-2025-proofteller,
  author    = {Jobanputra, Mayank and Kovtunova, Alisa and Balthes, Brisca and Pogulskiy, Fedor Grigoryevich and Wang, Yifan and Borgwardt, Stefan and Demberg, Vera},
  editor    = {Inui, Kentaro and Sakti, Sakriani and Wang, Haofen and Wong, Derek F. and Bhattacharyya, Pushpak and Banerjee, Biplab and Ekbal, Asif and Chakraborty, Tanmoy and Singh, Dhirendra Pratap},
  title     = {ProofTeller: Exposing recency bias in LLM reasoning and its side effects on communication},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics},
  pages     = {1439--1462},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  address   = {Mumbai, India},
  year      = {2025},
  date      = {2025-12-01},
  urldate   = {2025-12-01},
  isbn      = {979-8-89176-298-5},
  doi       = {10.18653/v1/2025.ijcnlp-long.80},
  url       = {https://aclanthology.org/2025.ijcnlp-long.80/},
  abstract  = {Large language models (LLMs) are increasingly applied in domains that demand reliable and interpretable reasoning. While formal methods can generate provably correct proofs, these proofs are often inaccessible to non-expert users. This raises a natural question: can LLMs, when given a verified proof, faithfully interpret its reasoning and communicate it clearly? We introduce $ProofTeller$, a benchmark that evaluates this ability across three tasks: (1) identifying key proof steps, (2) summarizing the reasoning, and (3) explaining the result in concise natural language. The benchmark covers three domains: _Biology_, _Drones_, and _Recipes_, representing scientific, safety-critical, and everyday reasoning scenarios. We find a consistent near-conclusion bias: LLMs tend to focus on steps closest to the final proof conclusion rather than on the most informative ones. A targeted human study confirms that explanations based on such steps are rated less appropriate for end users. These findings indicate that even when reasoning is provided, current LLMs face challenges in communicating key information in a useful manner, highlighting the need for LLMs that can communicate important details reliably.},
  keywords  = {yifan},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Azime, Israel Abebe; Kanubala, Deborah D.; Afonja, Tejumade; Fritz, Mario; Valera, Isabel; Klakow, Dietrich; Slusallek, Philipp
Accept or Deny? Evaluating LLM Fairness and Performance in Loan Approval across Table-to-Text Serialization Approaches Bachelor Thesis
2025.
Abstract | Links | BibTeX | Tags: isabel, saml
@misc{azime2025accept,
  author    = {Azime, Israel Abebe and Kanubala, Deborah D. and Afonja, Tejumade and Fritz, Mario and Valera, Isabel and Klakow, Dietrich and Slusallek, Philipp},
  title     = {Accept or Deny? Evaluating LLM Fairness and Performance in Loan Approval across Table-to-Text Serialization Approaches},
  year      = {2025},
  date      = {2025-08-29},
  urldate   = {2025-08-29},
  url       = {https://arxiv.org/pdf/2508.21512},
  abstract  = {Large Language Models (LLMs) are increasingly employed in high-stakes decision-making tasks, such as loan approvals. While their applications expand across domains, LLMs struggle to process tabular data, ensuring fairness and delivering reliable predictions. In this work, we assess the performance and fairness of LLMs on serialized loan approval datasets from three geographically distinct regions: Ghana, Germany, and the United States. Our evaluation focuses on the model’s zero-shot and in-context learning (ICL) capabilities. Our results reveal that the choice of serialization format significantly affects both performance and fairness in LLMs, with certain formats such as GReaT and LIFT yielding higher F1 scores but exacerbating fairness disparities. Notably, while ICL improved model performance by 4.9-59.6% relative to zero-shot baselines, its effect on fairness varied considerably across datasets. Our work underscores the importance of effective tabular data representation methods and fairness-aware models to improve the reliability of LLMs in financial decision-making.},
  keywords  = {isabel, saml},
  pubstate  = {published},
  tppubtype = {misc}
}
Martínez-García, María; Villacrés, Grace; Mitchell, David; Olmos, Pablo M
Improved Variational Inference in Discrete VAEs using Error Correcting Codes Proceedings Article
In: The 41st Conference on Uncertainty in Artificial Intelligence, 2025.
Abstract | Links | BibTeX | Tags: maria
@inproceedings{martinezimproved,
  author    = {Martínez-García, María and Villacrés, Grace and Mitchell, David and Olmos, Pablo M},
  title     = {Improved Variational Inference in Discrete VAEs using Error Correcting Codes},
  booktitle = {The 41st Conference on Uncertainty in Artificial Intelligence},
  year      = {2025},
  date      = {2025-01-01},
  urldate   = {2025-01-01},
  url       = {https://proceedings.mlr.press/v286/martinez-garcia25a.html},
  abstract  = {Despite advances in deep probabilistic models, learning discrete latent representations remains challenging. This work introduces a novel method to improve inference in discrete Variational Autoencoders by reframing the inference problem through a generative perspective. We conceptualize the model as a communication system, and propose to leverage Error-Correcting Codes (ECCs) to introduce redundancy in latent representations, allowing the variational posterior to produce more accurate estimates and reduce the variational gap. We present a proof-of-concept using a Discrete Variational Autoencoder with binary latent variables and low-complexity repetition codes, extending it to a hierarchical structure for disentangling global and local data features. Our approach significantly improves generation quality, data reconstruction, and uncertainty calibration, outperforming the uncoded models even when trained with tighter bounds such as the Importance Weighted Autoencoder objective. We also outline the properties that ECCs should possess to be effectively utilized for improved discrete variational inference.},
  keywords  = {maria},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Eisenhut, Jan; Fišer, Daniel; Valera, Isabel; Hoffmann, Jörg
On Picking Good Policies: Leveraging Action-Policy Testing in Policy Training Journal Article
In: Proceedings of the International Conference on Automated Planning and Scheduling, 2025.
Abstract | Links | BibTeX | Tags: isabel
@article{Eisenhut2025OnPG,
  author    = {Jan Eisenhut and Daniel Fišer and Isabel Valera and Jörg Hoffmann},
  title     = {On Picking Good Policies: Leveraging Action-Policy Testing in Policy Training},
  journal   = {Proceedings of the International Conference on Automated Planning and Scheduling},
  year      = {2025},
  date      = {2025-01-01},
  urldate   = {2025-01-01},
  url       = {https://api.semanticscholar.org/CorpusID:281347744},
  abstract  = {Testing is a natural approach to assess the quality of learned action policies π. Prior work introduced policy testing in AI planning as searching for bugs in π, that is, states where π is sub-optimal with respect to a given testing objective. Beyond quality assurance, an obvious application of these methods is policy selection: given several π to choose from, we can use testing to select the "least buggy" one. Here, we integrate testing-based policy selection into the training process. This includes making more informed decisions when selecting the final policy after training, as well as choosing more promising intermediate policies during the training process. Our experiments with ASNets action policies show that integrating testing allows us to more reliably obtain good-quality policies.},
  keywords  = {isabel},
  pubstate  = {published},
  tppubtype = {article},
}
Eniser, Hasan Ferit; Lin, Songtuan; Müller, Nicola; Isychev, Anastasia; Wüstholz, Valentin; Valera, Isabel; Hoffmann, Jörg; Christakis, Maria
Using Action-Policy Testing in RL to Reduce the Number of Bugs Journal Article
In: Proceedings of the International Symposium on Combinatorial Search, 2025.
Abstract | Links | BibTeX | Tags: isabel
@article{Eniser2025UsingAT,
  author    = {Hasan Ferit Eniser and Songtuan Lin and Nicola Müller and Anastasia Isychev and Valentin Wüstholz and Isabel Valera and Jörg Hoffmann and Maria Christakis},
  title     = {Using Action-Policy Testing in RL to Reduce the Number of Bugs},
  journal   = {Proceedings of the International Symposium on Combinatorial Search},
  year      = {2025},
  date      = {2025-01-01},
  urldate   = {2025-01-01},
  url       = {https://api.semanticscholar.org/CorpusID:280219509},
  abstract  = {Reinforcement learning is becoming ever more prominent in solving combinatorial search problems, in particular ones where states are images. Prior work has devised action-policy testing methodology, that identifies so-called bug states where policy performance is sub-optimal. Here we show how to leverage this methodology during the RL process, using action-policy testing to find bugs and injecting those as alternate start states for the training runs. Running experiments across six 2D games, we find that our testing-guided training often achieves similar expected reward while reducing the number of bugs.},
  keywords  = {isabel},
  pubstate  = {published},
  tppubtype = {article},
}
Javaloy, Adrián; Vergari, Antonio; Valera, Isabel
COPA: Comparing the Incomparable to Explore the Pareto Front Journal Article
In: CoRR, vol. abs/2503.14321, 2025.
Abstract | Links | BibTeX | Tags: adrian, isabel, saml
@article{DBLP:journals/corr/abs-2503-14321,
  author    = {Adrián Javaloy and Antonio Vergari and Isabel Valera},
  title     = {COPA: Comparing the Incomparable to Explore the Pareto Front},
  journal   = {CoRR},
  volume    = {abs/2503.14321},
  year      = {2025},
  date      = {2025-01-01},
  urldate   = {2025-01-01},
  doi       = {10.48550/ARXIV.2503.14321},
  url       = {https://doi.org/10.48550/arXiv.2503.14321},
  abstract  = {In machine learning (ML), we often need to choose one among hundreds of trained ML models at hand, based on various objectives such as accuracy, robustness, fairness or scalability. However, it is often unclear how to compare, aggregate and, ultimately, trade-off these objectives, making it a time-consuming task that requires expert knowledge, as objectives may be measured in different units and scales. In this work, we investigate how objectives can be automatically normalized and aggregated to systematically help the user navigate their Pareto front. To this end, we make incomparable objectives comparable using their cumulative functions, approximated by their relative rankings. As a result, our proposed approach, COPA, can aggregate them while matching user-specific preferences, allowing practitioners to meaningfully navigate and search for models in the Pareto front. We demonstrate the potential impact of COPA in both model selection and benchmarking tasks across diverse ML areas such as fair ML, domain generalization, AutoML and foundation models, where classical ways to normalize and aggregate objectives fall short.},
  keywords  = {adrian, isabel, saml},
  pubstate  = {published},
  tppubtype = {article},
}
Almodóvar, Alejandro; Javaloy, Adrián; Parras, Juan; Zazo, Santiago; Valera, Isabel
DeCaFlow: A Deconfounding Causal Generative Model Journal Article Spotlight
In: CoRR, vol. abs/2503.15114, 2025.
Abstract | Links | BibTeX | Tags: adrian, isabel, saml, spotlight
@article{DBLP:journals/corr/abs-2503-15114,
  author    = {Alejandro Almodóvar and Adrián Javaloy and Juan Parras and Santiago Zazo and Isabel Valera},
  title     = {DeCaFlow: A Deconfounding Causal Generative Model},
  journal   = {CoRR},
  volume    = {abs/2503.15114},
  year      = {2025},
  date      = {2025-01-01},
  urldate   = {2025-01-01},
  doi       = {10.48550/ARXIV.2503.15114},
  url       = {https://doi.org/10.48550/arXiv.2503.15114},
  abstract  = {We introduce DeCaFlow, a deconfounding causal generative model. Training once per dataset using just observational data and the underlying causal graph, DeCaFlow enables accurate causal inference on continuous variables under the presence of hidden confounders. Specifically, we extend previous results on causal estimation under hidden confounding to show that a single instance of DeCaFlow provides correct estimates for all causal queries identifiable with do-calculus, leveraging proxy variables to adjust for the causal effects when do-calculus alone is insufficient. Moreover, we show that counterfactual queries are identifiable as long as their interventional counterparts are identifiable, and thus are also correctly estimated by DeCaFlow. Our empirical results on diverse settings (including the Ecoli70 dataset, with 3 independent hidden confounders, tens of observed variables and hundreds of causal queries) show that DeCaFlow outperforms existing approaches, while demonstrating its out-of-the-box applicability to any given causal graph.},
  keywords  = {adrian, isabel, saml, spotlight},
  pubstate  = {published},
  tppubtype = {article},
}
Cinquini, Martina; Beretta, Isacco; Ruggieri, Salvatore; Valera, Isabel
A Practical Approach to Causal Inference over Time Proceedings Article
In: Walsh, Toby; Shah, Julie; Kolter, Zico (Ed.): AAAI-25, Sponsored by the Association for the Advancement of Artificial Intelligence, February 25 - March 4, 2025, Philadelphia, PA, USA, pp. 14832–14839, AAAI Press, 2025.
Abstract | Links | BibTeX | Tags: isabel, saml
@inproceedings{DBLP:conf/aaai/CinquiniBRV25,
  author    = {Martina Cinquini and Isacco Beretta and Salvatore Ruggieri and Isabel Valera},
  title     = {A Practical Approach to Causal Inference over Time},
  editor    = {Toby Walsh and Julie Shah and Zico Kolter},
  booktitle = {AAAI-25, Sponsored by the Association for the Advancement of Artificial Intelligence, February 25 - March 4, 2025, Philadelphia, PA, USA},
  pages     = {14832--14839},
  publisher = {AAAI Press},
  year      = {2025},
  date      = {2025-01-01},
  urldate   = {2025-01-01},
  doi       = {10.1609/AAAI.V39I14.33626},
  url       = {https://doi.org/10.1609/aaai.v39i14.33626},
  abstract  = {In this paper, we focus on estimating the causal effect of an intervention over time on a dynamical system. To that end, we formally define causal interventions and their effects over time on discrete-time stochastic processes (DSPs). Then, we show under which conditions the equilibrium states of a DSP, both before and after a causal intervention, can be captured by a structural causal model (SCM). With such an equivalence at hand, we provide an explicit mapping from vector autoregressive models (VARs), broadly applied in econometrics, to linear, but potentially cyclic and/or affected by unmeasured confounders, SCMs. The resulting causal VAR framework allows us to perform causal inference over time from observational time series data. Our experiments on synthetic and real-world datasets show that the proposed framework achieves strong performance in terms of observational forecasting while enabling accurate estimation of the causal effect of interventions on dynamical systems. We demonstrate, through a case study, the potential practical questions that can be addressed using the proposed causal VAR framework.},
  keywords  = {isabel, saml},
  pubstate  = {published},
  tppubtype = {inproceedings},
}
