People

Adrián Javaloy
About me
Hi there 👋 My name is Adrián, and I am a PhD student at the Probabilistic Machine Learning group under the supervision of Isabel Valera at Saarland University. I am also an alumnus of the Empirical Inference department at the Max Planck Institute for Intelligent Systems. Before pursuing my PhD, I completed a Master's degree in Computer Science and a double Bachelor's degree in Computer Science Engineering and Mathematics at the University of Murcia.
I am mostly interested in principled research that aims to improve machine learning methods, making them easier to use and more robust. Specifically, my current research focuses on improving models that optimize different tasks simultaneously. This includes apparently unrelated areas such as multitask learning (MTL) and deep probabilistic generative models (e.g. VAEs) under heterogeneous environments.
However, I am curiosity-driven, and I am always happy to discuss interesting ideas even if they lie outside my comfort zone. Feel free to reach out to me via Twitter or email.
Publications
2026
Javaloy, Adrián; Vergari, Antonio
An Embarrassingly Simple Way to Optimize Orthogonal Matrices at Scale Journal Article
In: CoRR, vol. abs/2602.14656, 2026.
@article{DBLP:journals/corr/abs-2602-14656,
  title     = {An Embarrassingly Simple Way to Optimize Orthogonal Matrices at Scale},
  author    = {Adrián Javaloy and Antonio Vergari},
  url       = {https://doi.org/10.48550/arXiv.2602.14656},
  doi       = {10.48550/arXiv.2602.14656},
  year      = {2026},
  date      = {2026-01-01},
  urldate   = {2026-01-01},
  journal   = {CoRR},
  volume    = {abs/2602.14656},
  abstract  = {Orthogonality constraints are ubiquitous in robust and probabilistic machine learning. Unfortunately, current optimizers are computationally expensive and do not scale to problems with hundreds or thousands of constraints. One notable exception is the Landing algorithm (Ablin et al., 2024) which, however comes at the expense of temporarily relaxing orthogonality. In this work, we revisit and improve on the ideas behind Landing, enabling the inclusion of modern adaptive optimizers while ensuring that orthogonal constraints are effectively met. Remarkably, these improvements come at little to no cost, and reduce the number of required hyperparemeters. Our algorithm POGO is fast and GPU-friendly, consisting of only 5 matrix products, and in practice maintains orthogonality at all times. On several challenging benchmarks, POGO greatly outperforms recent optimizers and shows it can optimize problems with thousands of orthogonal matrices in minutes while alternatives would take hours. As such, POGO sets a milestone to finally exploit orthogonality constraints in ML at scale.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
2025
Javaloy, Adrián; Vergari, Antonio; Valera, Isabel
COPA: Comparing the Incomparable to Explore the Pareto Front Journal Article
In: CoRR, vol. abs/2503.14321, 2025.
@article{DBLP:journals/corr/abs-2503-14321,
  title     = {COPA: Comparing the Incomparable to Explore the Pareto Front},
  author    = {Adrián Javaloy and Antonio Vergari and Isabel Valera},
  url       = {https://doi.org/10.48550/arXiv.2503.14321},
  doi       = {10.48550/arXiv.2503.14321},
  year      = {2025},
  date      = {2025-01-01},
  urldate   = {2025-01-01},
  journal   = {CoRR},
  volume    = {abs/2503.14321},
  abstract  = {In machine learning (ML), we often need to choose one among hundreds of trained ML models at hand, based on various objectives such as accuracy, robustness, fairness or scalability. However, it is often unclear how to compare, aggregate and, ultimately, trade-off these objectives, making it a time-consuming task that requires expert knowledge, as objectives may be measured in different units and scales. In this work, we investigate how objectives can be automatically normalized and aggregated to systematically help the user navigate their Pareto front. To this end, we make incomparable objectives comparable using their cumulative functions, approximated by their relative rankings. As a result, our proposed approach, COPA, can aggregate them while matching user-specific preferences, allowing practitioners to meaningfully navigate and search for models in the Pareto front. We demonstrate the potential impact of COPA in both model selection and benchmarking tasks across diverse ML areas such as fair ML, domain generalization, AutoML and foundation models, where classical ways to normalize and aggregate objectives fall short.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Almodóvar, Alejandro; Javaloy, Adrián; Parras, Juan; Zazo, Santiago; Valera, Isabel
DeCaFlow: A Deconfounding Causal Generative Model Journal Article
In: CoRR, vol. abs/2503.15114, 2025.
@article{DBLP:journals/corr/abs-2503-15114,
  title     = {DeCaFlow: A Deconfounding Causal Generative Model},
  author    = {Alejandro Almodóvar and Adrián Javaloy and Juan Parras and Santiago Zazo and Isabel Valera},
  url       = {https://doi.org/10.48550/arXiv.2503.15114},
  doi       = {10.48550/arXiv.2503.15114},
  year      = {2025},
  date      = {2025-01-01},
  urldate   = {2025-01-01},
  journal   = {CoRR},
  volume    = {abs/2503.15114},
  abstract  = {We introduce DeCaFlow, a deconfounding causal generative model. Training once per dataset using just observational data and the underlying causal graph, DeCaFlow enables accurate causal inference on continuous variables under the presence of hidden confounders. Specifically, we extend previous results on causal estimation under hidden confounding to show that a single instance of DeCaFlow provides correct estimates for all causal queries identifiable with do-calculus, leveraging proxy variables to adjust for the causal effects when do-calculus alone is insufficient. Moreover, we show that counterfactual queries are identifiable as long as their interventional counterparts are identifiable, and thus are also correctly estimated by DeCaFlow. Our empirical results on diverse settings (including the Ecoli70 dataset, with 3 independent hidden confounders, tens of observed variables and hundreds of causal queries) show that DeCaFlow outperforms existing approaches, while demonstrating its out-of-the-box applicability to any given causal graph},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Loconte, Lorenzo; Javaloy, Adrián; Vergari, Antonio
How to Square Tensor Networks and Circuits Without Squaring Them Journal Article
In: CoRR, vol. abs/2512.17090, 2025.
@article{DBLP:journals/corr/abs-2512-17090,
  title     = {How to Square Tensor Networks and Circuits Without Squaring Them},
  author    = {Lorenzo Loconte and Adrián Javaloy and Antonio Vergari},
  url       = {https://doi.org/10.48550/arXiv.2512.17090},
  doi       = {10.48550/arXiv.2512.17090},
  year      = {2025},
  date      = {2025-01-01},
  urldate   = {2025-01-01},
  journal   = {CoRR},
  volume    = {abs/2512.17090},
  abstract  = {Squared tensor networks (TNs) and their extension as computational graphs--squared circuits--have been used as expressive distribution estimators, yet supporting closed-form marginalization. However, the squaring operation introduces additional complexity when computing the partition function or marginalizing variables, which hinders their applicability in ML. To solve this issue, canonical forms of TNs are parameterized via unitary matrices to simplify the computation of marginals. However, these canonical forms do not apply to circuits, as they can represent factorizations that do not directly map to a known TN. Inspired by the ideas of orthogonality in canonical forms and determinism in circuits enabling tractable maximization, we show how to parameterize squared circuits to overcome their marginalization overhead. Our parameterizations unlock efficient marginalization even in factorizations different from TNs, but encoded as circuits, whose structure would otherwise make marginalization computationally hard. Finally, our experiments on distribution estimation show how our proposed conditions in squared circuits come with no expressiveness loss, while enabling more efficient learning},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
2023
Javaloy, Adrián; Sánchez-Martín, Pablo; Valera, Isabel
Causal normalizing flows: from theory to practice Journal Article
In: CoRR, vol. abs/2306.05415, 2023.
@article{DBLP:journals/corr/abs-2306-05415,
  title     = {Causal normalizing flows: from theory to practice},
  author    = {Adrián Javaloy and Pablo Sánchez-Martín and Isabel Valera},
  url       = {https://doi.org/10.48550/arXiv.2306.05415},
  doi       = {10.48550/arXiv.2306.05415},
  year      = {2023},
  date      = {2023-01-01},
  urldate   = {2023-01-01},
  journal   = {CoRR},
  volume    = {abs/2306.05415},
  abstract  = {In this work, we deepen on the use of normalizing flows for causal reasoning. Specifically, we first leverage recent results on non-linear ICA to show that causal models are identifiable from observational data given a causal ordering, and thus can be recovered using autoregressive normalizing flows (NFs). Second, we analyze different design and learning choices for causal normalizing flows to capture the underlying causal data-generating process. Third, we describe how to implement the do-operator in causal NFs, and thus, how to answer interventional and counterfactual questions. Finally, in our experiments, we validate our design and training choices through a comprehensive ablation study; compare causal NFs to other approaches for approximating causal models; and empirically demonstrate that causal NFs can be used to address real-world problems—where the presence of mixed discrete-continuous data and partial knowledge on the causal graph is the norm. The code for this work can be found at https://github.com/psanch21/causal-flows.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Javaloy, Adrián; Sánchez-Martín, Pablo; Levi, Amit; Valera, Isabel
Learnable Graph Convolutional Attention Networks Proceedings Article
In: The Eleventh International Conference on Learning Representations, ICLR 2023, Kigali, Rwanda, May 1-5, 2023, OpenReview.net, 2023.
@inproceedings{DBLP:conf/iclr/JavaloySLV23,
  title     = {Learnable Graph Convolutional Attention Networks},
  author    = {Adrián Javaloy and Pablo Sánchez-Martín and Amit Levi and Isabel Valera},
  url       = {https://openreview.net/pdf?id=WsUMeHPo-2},
  year      = {2023},
  date      = {2023-01-01},
  urldate   = {2023-01-01},
  booktitle = {The Eleventh International Conference on Learning Representations, ICLR 2023, Kigali, Rwanda, May 1-5, 2023},
  publisher = {OpenReview.net},
  abstract  = {Existing Graph Neural Networks (GNNs) compute the message exchange between nodes by either aggregating uniformly (convolving) the features of all the neighboring nodes, or by applying a non-uniform score (attending) to the features. Recent works have shown the strengths and weaknesses of the resulting GNN architectures, respectively, GCNs and GATs. In this work, we aim at exploiting the strengths of both approaches to their full extent. To this end, we first introduce the graph convolutional attention layer (CAT), which relies on convolutions to compute the attention scores. Unfortunately, as in the case of GCNs and GATs, we show that there exists no clear winner between the three—neither theoretically nor in practice—as their performance directly depends on the nature of the data (i.e., of the graph and features). This result brings us to the main contribution of our work, the learnable graph convolutional attention network (L-CAT): a GNN architecture that automatically interpolates between GCN, GAT and CAT in each layer, by adding only two scalar parameters. Our results demonstrate that L-CAT is able to efficiently combine different GNN layers along the network, outperforming competing methods in a wide range of datasets, and resulting in a more robust model that reduces the need of cross-validating.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2022
Javaloy, Adrián; Meghdadi, Maryam; Valera, Isabel
Mitigating Modality Collapse in Multimodal VAEs via Impartial Optimization Journal Article
In: CoRR, vol. abs/2206.04496, 2022.
@article{DBLP:journals/corr/abs-2206-04496,
  title     = {Mitigating Modality Collapse in Multimodal VAEs via Impartial Optimization},
  author    = {Adrián Javaloy and Maryam Meghdadi and Isabel Valera},
  url       = {https://doi.org/10.48550/arXiv.2206.04496},
  doi       = {10.48550/arXiv.2206.04496},
  year      = {2022},
  date      = {2022-01-01},
  urldate   = {2022-01-01},
  journal   = {CoRR},
  volume    = {abs/2206.04496},
  abstract  = {A number of variational autoencoders (VAEs) have recently emerged with the aim of modeling multimodal data, e.g., to jointly model images and their corresponding captions. Still, multimodal VAEs tend to focus solely on a subset of the modalities, e.g., by fitting the image while neglecting the caption. We refer to this limitation as modality collapse. In this work, we argue that this effect is a consequence of conflicting gradients during multimodal VAE training. We show how to detect the sub-graphs in the computational graphs where gradients conflict (impartiality blocks), as well as how to leverage existing gradient-conflict solutions from multitask learning to mitigate modality collapse. That is, to ensure impartial optimization across modalities. We apply our training framework to several multimodal VAE models, losses and datasets from the literature, and empirically show that our framework significantly improves the reconstruction performance, conditional generation, and coherence of the latent space across modalities.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Javaloy, Adrián; Valera, Isabel
RotoGrad: Gradient Homogenization in Multitask Learning Proceedings Article
In: The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event, April 25-29, 2022, OpenReview.net, 2022.
@inproceedings{DBLP:conf/iclr/JavaloyV22,
  title     = {RotoGrad: Gradient Homogenization in Multitask Learning},
  author    = {Adrián Javaloy and Isabel Valera},
  url       = {https://openreview.net/forum?id=T8wHz4rnuGL},
  year      = {2022},
  date      = {2022-01-01},
  urldate   = {2022-01-01},
  booktitle = {The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event, April 25-29, 2022},
  publisher = {OpenReview.net},
  abstract  = {Multitask learning is being increasingly adopted in applications domains like computer vision and reinforcement learning. However, optimally exploiting its advantages remains a major challenge due to the effect of negative transfer. Previous works have tracked down this issue to the disparities in gradient magnitudes and directions across tasks, when optimizing the shared network parameters. While recent work has acknowledged that negative transfer is a two-fold problem, existing approaches fall short as they only focus on either homogenizing the gradient magnitude across tasks; or greedily change the gradient directions, overlooking future conflicts. In this work, we introduce RotoGrad, an algorithm that tackles negative transfer as a whole: it jointly homogenizes gradient magnitudes and directions, while ensuring training convergence. We show that RotoGrad outperforms competing methods in complex problems, including multi-label classification in CelebA and computer vision tasks in the NYUv2 dataset. A Pytorch implementation can be found in https://github.com/adrianjav/rotograd.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2021
Javaloy, Adrián; Meghdadi, Maryam; Valera, Isabel
Boosting heterogeneous VAEs via multi-objective optimization Workshop
2021.
@workshop{javaloy2021boosting,
  title     = {Boosting heterogeneous VAEs via multi-objective optimization},
  author    = {Adrián Javaloy and Maryam Meghdadi and Isabel Valera},
  url       = {http://adrian.javaloy.com/publication/mo-vae/},
  year      = {2021},
  date      = {2021-12-01},
  urldate   = {2021-12-01},
  abstract  = {A number of variational autoencoders (VAEs) have recently emerged with the aim of modeling multimodal data, e.g., to jointly model images and their corresponding captions. Still, multimodal VAEs tend to focus solely on a subset of the modalities, e.g., by fitting the image while neglecting the caption. We refer to this limitation as modality collapse. In this work, we argue that this effect is a consequence of conflicting gradients during multimodal VAE training. We show how to detect the sub-graphs in the computational graphs where gradients conflict (impartiality blocks), as well as how to leverage existing gradient-conflict solutions from multitask learning to mitigate modality collapse. That is, to ensure impartial optimization across modalities. We apply our training framework to several multimodal VAE models, losses and datasets from the literature, and empirically show that our framework significantly improves the reconstruction performance, conditional generation, and coherence of the latent space across modalities.},
  internal-note = {NOTE(review): abstract is identical to the 2022 article DBLP:journals/corr/abs-2206-04496 — presumably a copy-paste; confirm the correct abstract for this workshop paper},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {workshop}
}
2020
Javaloy, Adrián; Valera, Isabel
Lipschitz standardization for robust multivariate learning Journal Article
In: CoRR, vol. abs/2002.11369, 2020.
@article{DBLP:journals/corr/abs-2002-11369,
  title     = {Lipschitz standardization for robust multivariate learning},
  author    = {Adrián Javaloy and Isabel Valera},
  url       = {https://arxiv.org/abs/2002.11369},
  year      = {2020},
  date      = {2020-01-01},
  urldate   = {2020-01-01},
  journal   = {CoRR},
  volume    = {abs/2002.11369},
  abstract  = {Probabilistic learning is increasingly being tackled as an optimization problem, with gradient-based approaches as predominant methods. When modelling multivariate likelihoods, a usual but undesirable outcome is that the learned model fits only a subset of the observed variables, overlooking the rest. In this work, we study this problem through the lens of multitask learning (MTL), where similar effects have been broadly studied. While MTL solutions do not directly apply in the probabilistic setting (as they cannot handle the likelihood constraints) we show that similar ideas may be leveraged during data preprocessing. First, we show that data standardization often helps under common continuous likelihoods, but it is not enough in the general case, specially under mixed continuous and discrete likelihood models. In order for balance multivariate learning, we then propose a novel data preprocessing, Lipschitz standardization, which balances the local Lipschitz smoothness across variables. Our experiments on real-world datasets show that Lipschitz standardization leads to more accurate multivariate models than the ones learned using existing data preprocessing techniques. The models and datasets employed in the experiments can be found in this URL https://github.com/adrianjav/lipschitz-standardization},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Gresele, Luigi; Fissore, Giancarlo; Javaloy, Adrián; Schölkopf, Bernhard; Hyvärinen, Aapo
Relative gradient optimization of the Jacobian term in unsupervised deep learning Journal Article
In: CoRR, vol. abs/2006.15090, 2020.
@article{DBLP:journals/corr/abs-2006-15090,
  title     = {Relative gradient optimization of the Jacobian term in unsupervised deep learning},
  author    = {Luigi Gresele and Giancarlo Fissore and Adrián Javaloy and Bernhard Schölkopf and Aapo Hyvärinen},
  url       = {https://arxiv.org/abs/2006.15090},
  year      = {2020},
  date      = {2020-01-01},
  urldate   = {2020-01-01},
  journal   = {CoRR},
  volume    = {abs/2006.15090},
  abstract  = {Learning expressive probabilistic models correctly describing the data is a ubiquitous problem in machine learning. A popular approach for solving it is mapping the observations into a representation space with a simple joint distribution, which can typically be written as a product of its marginals — thus drawing a connection with the field of nonlinear independent component analysis. Deep density models have been widely used for this task, but their maximum likelihood based training requires estimating the log-determinant of the Jacobian and is computationally expensive, thus imposing a trade-off between computation and expressive power. In this work, we propose a new approach for exact training of such neural networks. Based on relative gradients, we exploit the matrix structure of neural network parameters to compute updates efficiently even in high-dimensional spaces; the computational cost of the training is quadratic in the input size, in contrast with the cubic scaling of naive approaches. This allows fast training with objective functions involving the log-determinant of the Jacobian, without imposing constraints on its structure, in stark contrast to autoregressive normalizing flows.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
2019
Javaloy, Adrián; García-Mateos, Ginés
A Character-Level Approach to the Text Normalization Problem Based on a New Causal Encoder Journal Article
In: CoRR, vol. abs/1903.02642, 2019.
@article{DBLP:journals/corr/abs-1903-02642,
  title     = {A Character-Level Approach to the Text Normalization Problem Based on a New Causal Encoder},
  author    = {Adrián Javaloy and Ginés García-Mateos},
  url       = {http://arxiv.org/abs/1903.02642},
  year      = {2019},
  date      = {2019-01-01},
  urldate   = {2019-01-01},
  journal   = {CoRR},
  volume    = {abs/1903.02642},
  abstract  = {Text normalization is a ubiquitous process that appears as the first step of many Natural Language Processing problems. However, previous Deep Learning approaches have suffered from so-called silly errors, which are undetectable on unsupervised frameworks, making those models unsuitable for deployment. In this work, we make use of an attention-based encoder-decoder architecture that overcomes these undetectable errors by using a fine-grained character-level approach rather than a word-level one. Furthermore, our new general-purpose encoder based on causal convolutions, called Causal Feature Extractor (CFE), is introduced and compared to other common encoders. The experimental results show the feasibility of this encoder, which leverages the attention mechanisms the most and obtains better results in terms of accuracy, number of parameters and convergence time. While our method results in a slightly worse initial accuracy (92.74%), errors can be automatically detected and, thus, more readily solved, obtaining a more robust model for deployment. Furthermore, there is still plenty of room for future improvements that will push even further these advantages.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
