From 69e88141e0dfa63f42941e08767a323da6136f27 Mon Sep 17 00:00:00 2001
From: ppraneth <pranethparuchuri@gmail.com>
Date: Tue, 16 Jun 2026 08:09:28 +0530
Subject: [PATCH] Add COSMOS scalarizer

---
 CHANGELOG.md                              |  4 ++
 docs/source/docs/scalarization/cosmos.rst |  7 ++
 docs/source/docs/scalarization/index.rst  |  1 +
 src/torchjd/scalarization/__init__.py     |  2 +
 src/torchjd/scalarization/_cosmos.py      | 74 +++++++++++++++++++++
 tests/unit/scalarization/test_cosmos.py   | 81 +++++++++++++++++++++++
 6 files changed, 169 insertions(+)
 create mode 100644 docs/source/docs/scalarization/cosmos.rst
 create mode 100644 src/torchjd/scalarization/_cosmos.py
 create mode 100644 tests/unit/scalarization/test_cosmos.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 829e29ca..af56e41d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,10 @@ changelog does not include internal changes that do not affect the user.
   inner loop on a cross-batch matrix `A = J_1 @ J_2.T` (computed from two independent mini-batches
   using `autojac.jac`), with a direction-oriented regularizer pulling the descent direction toward
   a preference direction.
+- Added `COSMOS` from [Scalable Pareto Front Approximation for Deep Multi-Objective
+  Learning](https://arxiv.org/pdf/2103.13392) (ICDM 2021), a `Scalarizer` that combines a linear
+  scalarization with a cosine-similarity penalty pulling the vector of values toward a preference
+  direction.
 - Added `DWA` (Dynamic Weight Average) from [End-to-End Multi-Task Learning with
   Attention](https://openaccess.thecvf.com/content_CVPR_2019/papers/Liu_End-To-End_Multi-Task_Learning_With_Attention_CVPR_2019_paper.pdf)
   (CVPR 2019), a stateful `Scalarizer` that weights each value by the relative rate at which its
diff --git a/docs/source/docs/scalarization/cosmos.rst b/docs/source/docs/scalarization/cosmos.rst
new file mode 100644
index 00000000..9b3d9c1c
--- /dev/null
+++ b/docs/source/docs/scalarization/cosmos.rst
@@ -0,0 +1,7 @@
+:hide-toc:
+
+COSMOS
+======
+
+.. autoclass:: torchjd.scalarization.COSMOS
+    :members: __call__
diff --git a/docs/source/docs/scalarization/index.rst b/docs/source/docs/scalarization/index.rst
index d38708c0..76b98cd6 100644
--- a/docs/source/docs/scalarization/index.rst
+++ b/docs/source/docs/scalarization/index.rst
@@ -15,6 +15,7 @@ Abstract base class
     :maxdepth: 1
 
     constant.rst
+    cosmos.rst
     dwa.rst
     famo.rst
     geometric_mean.rst
diff --git a/src/torchjd/scalarization/__init__.py b/src/torchjd/scalarization/__init__.py
index f1d22029..a7a0c3fc 100644
--- a/src/torchjd/scalarization/__init__.py
+++ b/src/torchjd/scalarization/__init__.py
@@ -20,6 +20,7 @@
 """
 
 from ._constant import Constant
+from ._cosmos import COSMOS
 from ._dwa import DWA
 from ._famo import FAMO
 from ._geometric_mean import GeometricMean
@@ -33,6 +34,7 @@
 
 __all__ = [
     "Constant",
+    "COSMOS",
     "DWA",
     "FAMO",
     "GeometricMean",
diff --git a/src/torchjd/scalarization/_cosmos.py b/src/torchjd/scalarization/_cosmos.py
new file mode 100644
index 00000000..4371e08c
--- /dev/null
+++ b/src/torchjd/scalarization/_cosmos.py
@@ -0,0 +1,74 @@
+import torch
+from torch import Tensor
+
+from ._scalarizer_base import Scalarizer
+
+
+class COSMOS(Scalarizer):
+    r"""
+    :class:`~torchjd.scalarization.Scalarizer` that combines the input tensor of values using the
+    COSMOS scalarization, proposed in `Scalable Pareto Front Approximation for Deep Multi-Objective
+    Learning <https://arxiv.org/pdf/2103.13392>`_.
+
+    It returns a linear scalarization minus :math:`\lambda` times the cosine similarity between the
+    values and the preference vector:
+
+    .. math::
+        \sum_i r_i L_i - \lambda \frac{\sum_i r_i L_i}{\lVert r \rVert \, \lVert L \rVert},
+
+    where:
+
+    - :math:`L_i` is the :math:`i`-th input value (the :math:`i`-th objective);
+    - :math:`r_i` is its preference weight (the ``weights`` parameter);
+    - :math:`\lambda` is the cosine-similarity penalty coefficient (the ``lambda_`` parameter);
+    - the subtracted term is :math:`\lambda \cos(r, L)`, which rewards aligning the vector of values
+      with the preference direction and is what spreads the approximated Pareto front.
+
+    :param lambda_: The cosine-similarity penalty coefficient :math:`\lambda`. Must be non-negative.
+        A value of ``0`` reduces COSMOS to a plain linear scalarization. The paper uses values
+        ranging from ``0.01`` to ``8`` depending on the dataset, with no single best value.
+    :param weights: The preference vector :math:`r` applied to the values (in the paper, sampled on
+        the probability simplex). If ``None``, a uniform preference summing to one is used. If
+        provided, it must have the same shape as the values passed at call time.
+
+    .. note::
+        COSMOS divides by :math:`\lVert r \rVert \, \lVert L \rVert`, so all-zero values (or
+        all-zero ``weights``) produce ``nan``, even with ``lambda_ = 0``. This is not enforced.
+
+    .. note::
+        The full COSMOS method also conditions the model on the preference vector by concatenating it
+        to the input; that is a modeling choice left to the user. This scalarizer only implements the
+        objective. The `libmoon <https://github.com/xzhang2523/libmoon>`_ reference normalizes the
+        linear term by :math:`\lVert r \rVert`; here the linear term is the raw weighted sum, as in
+        the paper and the official implementation.
+    """
+
+    def __init__(self, lambda_: float, weights: Tensor | None = None) -> None:
+        if lambda_ < 0.0:
+            raise ValueError(
+                f"Parameter `lambda_` should be non-negative. Found `lambda_ = {lambda_}`."
+            )
+
+        super().__init__()
+        self.lambda_ = lambda_
+        self.weights = weights
+
+    def forward(self, values: Tensor, /) -> Tensor:
+        if self.weights is not None and self.weights.shape != values.shape:
+            raise ValueError(
+                f"Parameter `weights` should have the same shape as `values`. Found "
+                f"`weights.shape = {tuple(self.weights.shape)}` and `values.shape = "
+                f"{tuple(values.shape)}`."
+            )
+
+        if self.weights is None:
+            weights = torch.full_like(values, 1.0 / values.numel())
+        else:
+            weights = self.weights
+
+        weighted_sum = (weights * values).sum()
+        cosine_similarity = weighted_sum / (weights.norm() * values.norm())
+        return weighted_sum - self.lambda_ * cosine_similarity
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(lambda_={self.lambda_}, weights={self.weights!r})"
diff --git a/tests/unit/scalarization/test_cosmos.py b/tests/unit/scalarization/test_cosmos.py
new file mode 100644
index 00000000..1e675b14
--- /dev/null
+++ b/tests/unit/scalarization/test_cosmos.py
@@ -0,0 +1,81 @@
+import torch
+from pytest import mark, raises
+from torch import Tensor
+from utils.tensors import tensor_
+
+from torchjd.scalarization import COSMOS
+
+from ._asserts import (
+    assert_grad_flow,
+    assert_permutation_invariant,
+    assert_returns_scalar,
+)
+from ._inputs import all_inputs
+
+
+def test_value_default() -> None:
+    # Uniform weights on equal values are perfectly aligned, so cos(r, L) = 1. The result is the
+    # weighted sum (1) minus lambda (1): 0.
+    out = COSMOS(lambda_=1.0)(tensor_([1.0, 1.0]))
+    torch.testing.assert_close(out, tensor_(0.0))
+
+
+def test_value_lambda_zero_is_linear_scalarization() -> None:
+    # With lambda = 0 there is no cosine penalty, so COSMOS is the (uniform) weighted sum.
+    out = COSMOS(lambda_=0.0)(tensor_([1.0, 2.0, 4.0]))
+    torch.testing.assert_close(out, tensor_(7.0 / 3.0))
+
+
+def test_value_with_weights() -> None:
+    # With lambda = 0, only the linear term remains: 2*3 + 1*4 = 10.
+    out = COSMOS(lambda_=0.0, weights=tensor_([2.0, 1.0]))(tensor_([3.0, 4.0]))
+    torch.testing.assert_close(out, tensor_(10.0))
+
+
+def test_full_formula() -> None:
+    values = tensor_([1.0, 2.0, 4.0])
+    weights = tensor_([0.5, 0.3, 0.2])
+    lambda_ = 2.0
+    weighted_sum = (weights * values).sum()
+    expected = weighted_sum - lambda_ * weighted_sum / (weights.norm() * values.norm())
+    torch.testing.assert_close(COSMOS(lambda_, weights=weights)(values), expected)
+
+
+@mark.parametrize("values", all_inputs)
+def test_expected_structure(values: Tensor) -> None:
+    assert_returns_scalar(COSMOS(lambda_=1.0), values)
+
+
+@mark.parametrize("values", all_inputs)
+def test_grad_flow(values: Tensor) -> None:
+    assert_grad_flow(COSMOS(lambda_=1.0), values)
+
+
+@mark.parametrize("values", all_inputs)
+def test_permutation_invariant(values: Tensor) -> None:
+    # With uniform weights, both the weighted sum and the cosine term are symmetric in the inputs.
+    assert_permutation_invariant(COSMOS(lambda_=1.0), values)
+
+
+def test_nan_for_all_zero_values() -> None:
+    # The cosine term divides by ||L||, so an all-zero vector of values produces nan.
+    out = COSMOS(lambda_=1.0)(tensor_([0.0, 0.0]))
+    assert out.isnan()
+
+
+@mark.parametrize("lambda_", [-1.0, -0.5])
+def test_raises_on_negative_lambda(lambda_: float) -> None:
+    with raises(ValueError):
+        COSMOS(lambda_=lambda_)
+
+
+def test_raises_on_weights_shape_mismatch() -> None:
+    scalarizer = COSMOS(lambda_=1.0, weights=tensor_([1.0, 1.0, 1.0]))
+    with raises(ValueError):
+        scalarizer(tensor_([1.0, 1.0]))
+
+
+def test_representations() -> None:
+    s = COSMOS(lambda_=0.5)
+    assert repr(s) == "COSMOS(lambda_=0.5, weights=None)"
+    assert str(s) == "COSMOS"