From 69e88141e0dfa63f42941e08767a323da6136f27 Mon Sep 17 00:00:00 2001 From: ppraneth Date: Tue, 16 Jun 2026 08:09:28 +0530 Subject: [PATCH] add cosmos --- CHANGELOG.md | 4 ++ docs/source/docs/scalarization/cosmos.rst | 7 ++ docs/source/docs/scalarization/index.rst | 1 + src/torchjd/scalarization/__init__.py | 2 + src/torchjd/scalarization/_cosmos.py | 74 +++++++++++++++++++++ tests/unit/scalarization/test_cosmos.py | 81 +++++++++++++++++++++++ 6 files changed, 169 insertions(+) create mode 100644 docs/source/docs/scalarization/cosmos.rst create mode 100644 src/torchjd/scalarization/_cosmos.py create mode 100644 tests/unit/scalarization/test_cosmos.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 829e29ca..af56e41d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,10 @@ changelog does not include internal changes that do not affect the user. inner loop on a cross-batch matrix `A = J_1 @ J_2.T` (computed from two independent mini-batches using `autojac.jac`), with a direction-oriented regularizer pulling the descent direction toward a preference direction. +- Added `COSMOS` from [Scalable Pareto Front Approximation for Deep Multi-Objective + Learning](https://arxiv.org/pdf/2103.13392) (ICDM 2021), a `Scalarizer` that combines a linear + scalarization with a cosine-similarity penalty pulling the vector of values toward a preference + direction. - Added `DWA` (Dynamic Weight Average) from [End-to-End Multi-Task Learning with Attention](https://openaccess.thecvf.com/content_CVPR_2019/papers/Liu_End-To-End_Multi-Task_Learning_With_Attention_CVPR_2019_paper.pdf) (CVPR 2019), a stateful `Scalarizer` that weights each value by the relative rate at which its diff --git a/docs/source/docs/scalarization/cosmos.rst b/docs/source/docs/scalarization/cosmos.rst new file mode 100644 index 00000000..9b3d9c1c --- /dev/null +++ b/docs/source/docs/scalarization/cosmos.rst @@ -0,0 +1,7 @@ +:hide-toc: + +COSMOS +====== + +.. 
class COSMOS(Scalarizer):
    r"""
    :class:`~torchjd.scalarization.Scalarizer` that combines the input tensor of values using the
    COSMOS scalarization, proposed in `Scalable Pareto Front Approximation for Deep Multi-Objective
    Learning <https://arxiv.org/pdf/2103.13392>`_.

    It returns a linear scalarization penalized by the cosine similarity between the values and the
    preference vector:

    .. math::
        \sum_i r_i L_i - \lambda \frac{\sum_i r_i L_i}{\lVert r \rVert \, \lVert L \rVert},

    where:

    - :math:`L_i` is the :math:`i`-th input value (the :math:`i`-th objective);
    - :math:`r_i` is its preference weight (the ``weights`` parameter);
    - :math:`\lambda` is the cosine-similarity penalty coefficient (the ``lambda_`` parameter);
    - the subtracted term is :math:`\lambda \cos(r, L)`, which rewards aligning the vector of values
      with the preference direction and is what spreads the approximated Pareto front.

    :param lambda_: The cosine-similarity penalty coefficient :math:`\lambda`. Must be non-negative.
        A value of ``0`` reduces COSMOS to a plain linear scalarization. The paper uses values
        ranging from ``0.01`` to ``8`` depending on the dataset, with no single best value.
    :param weights: The preference vector :math:`r` applied to the values (in the paper, sampled on
        the probability simplex). If ``None``, a uniform preference summing to one is used. If
        provided, it must have the same shape as the values passed at call time.
    :raises ValueError: If ``lambda_`` is negative or NaN.

    .. note::
        COSMOS divides by :math:`\lVert L \rVert`, so an all-zero vector of values produces ``nan``.
        This is not enforced.

    .. note::
        The full COSMOS method also conditions the model on the preference vector by concatenating it
        to the input; that is a modeling choice left to the user. This scalarizer only implements the
        objective. The ``libmoon`` reference implementation normalizes the linear term by
        :math:`\lVert r \rVert`; here the linear term is the raw weighted sum, as in the paper and
        the official implementation.
    """

    def __init__(self, lambda_: float, weights: Tensor | None = None) -> None:
        # Written as ``not (x >= 0)`` rather than ``x < 0`` on purpose: NaN compares false against
        # everything, so the negated form rejects it too instead of letting it through and
        # silently turning every result into nan.
        if not (lambda_ >= 0.0):
            raise ValueError(
                f"Parameter `lambda_` should be non-negative. Found `lambda_ = {lambda_}`."
            )

        super().__init__()
        self.lambda_ = lambda_
        self.weights = weights

    def forward(self, values: Tensor, /) -> Tensor:
        """
        Computes the COSMOS objective of ``values``.

        :param values: The tensor of objective values to combine.
        :returns: The weighted sum of ``values`` minus ``lambda_`` times the cosine similarity
            between the weights and ``values``, as a zero-dimensional tensor.
        :raises ValueError: If ``weights`` was provided and its shape differs from that of
            ``values``.
        """

        if self.weights is not None and self.weights.shape != values.shape:
            raise ValueError(
                f"Parameter `weights` should have the same shape as `values`. Found "
                f"`weights.shape = {tuple(self.weights.shape)}` and `values.shape = "
                f"{tuple(values.shape)}`."
            )

        if self.weights is None:
            # Default preference: the same weight for every value, with the weights summing to one.
            weights = torch.full_like(values, 1.0 / values.numel())
        else:
            weights = self.weights

        weighted_sum = (weights * values).sum()
        # The numerator of the cosine similarity is the same inner product as the linear term.
        # An all-zero `values` makes the denominator zero, which yields nan (documented behavior).
        cosine_similarity = weighted_sum / (weights.norm() * values.norm())
        return weighted_sum - self.lambda_ * cosine_similarity

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(lambda_={self.lambda_}, weights={self.weights!r})"
def test_value_with_weights() -> None:
    # lambda = 0 switches the cosine penalty off, leaving the weighted sum 2*3 + 1*4 = 10.
    scalarizer = COSMOS(lambda_=0.0, weights=tensor_([2.0, 1.0]))
    result = scalarizer(tensor_([3.0, 4.0]))
    torch.testing.assert_close(result, tensor_(10.0))


def test_full_formula() -> None:
    # Recompute the objective by hand (linear term minus lambda times the cosine similarity) and
    # compare it with what the scalarizer returns for explicit, non-uniform weights.
    losses = tensor_([1.0, 2.0, 4.0])
    preference = tensor_([0.5, 0.3, 0.2])
    coefficient = 2.0
    linear_term = (preference * losses).sum()
    penalty = coefficient * linear_term / (preference.norm() * losses.norm())
    expected = linear_term - penalty
    actual = COSMOS(coefficient, weights=preference)(losses)
    torch.testing.assert_close(actual, expected)


@mark.parametrize("values", all_inputs)
def test_expected_structure(values: Tensor) -> None:
    scalarizer = COSMOS(lambda_=1.0)
    assert_returns_scalar(scalarizer, values)


@mark.parametrize("values", all_inputs)
def test_grad_flow(values: Tensor) -> None:
    scalarizer = COSMOS(lambda_=1.0)
    assert_grad_flow(scalarizer, values)


@mark.parametrize("values", all_inputs)
def test_permutation_invariant(values: Tensor) -> None:
    # No weights are given, so every value gets the same weight: neither the weighted sum nor the
    # cosine term depends on the order of the inputs.
    scalarizer = COSMOS(lambda_=1.0)
    assert_permutation_invariant(scalarizer, values)


def test_nan_for_all_zero_values() -> None:
    # ||L|| is zero for an all-zero input, so the cosine term is 0 / 0 and the output is nan.
    result = COSMOS(lambda_=1.0)(tensor_([0.0, 0.0]))
    assert result.isnan()


@mark.parametrize("lambda_", [-1.0, -0.5])
def test_raises_on_negative_lambda(lambda_: float) -> None:
    # The constructor itself must reject a negative coefficient.
    with raises(ValueError):
        COSMOS(lambda_=lambda_)


def test_raises_on_weights_shape_mismatch() -> None:
    # Three weights against two values: the mismatch is only detectable at call time.
    three_weights = tensor_([1.0, 1.0, 1.0])
    scalarizer = COSMOS(lambda_=1.0, weights=three_weights)
    with raises(ValueError):
        scalarizer(tensor_([1.0, 1.0]))


def test_representations() -> None:
    scalarizer = COSMOS(lambda_=0.5)
    assert repr(scalarizer) == "COSMOS(lambda_=0.5, weights=None)"
    assert str(scalarizer) == "COSMOS"