Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ changelog does not include internal changes that do not affect the user.
inner loop on a cross-batch matrix `A = J_1 @ J_2.T` (computed from two independent mini-batches
using `autojac.jac`), with a direction-oriented regularizer pulling the descent direction toward
a preference direction.
- Added `COSMOS` from [Scalable Pareto Front Approximation for Deep Multi-Objective
Learning](https://arxiv.org/pdf/2103.13392) (ICDM 2021), a `Scalarizer` that combines a linear
scalarization with a cosine-similarity penalty pulling the vector of values toward a preference
direction.
- Added `DWA` (Dynamic Weight Average) from [End-to-End Multi-Task Learning with
Attention](https://openaccess.thecvf.com/content_CVPR_2019/papers/Liu_End-To-End_Multi-Task_Learning_With_Attention_CVPR_2019_paper.pdf)
(CVPR 2019), a stateful `Scalarizer` that weights each value by the relative rate at which its
Expand Down
7 changes: 7 additions & 0 deletions docs/source/docs/scalarization/cosmos.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
:hide-toc:

COSMOS
======

.. autoclass:: torchjd.scalarization.COSMOS
:members: __call__
1 change: 1 addition & 0 deletions docs/source/docs/scalarization/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Abstract base class
:maxdepth: 1

constant.rst
cosmos.rst
dwa.rst
famo.rst
geometric_mean.rst
Expand Down
2 changes: 2 additions & 0 deletions src/torchjd/scalarization/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"""

from ._constant import Constant
from ._cosmos import COSMOS
from ._dwa import DWA
from ._famo import FAMO
from ._geometric_mean import GeometricMean
Expand All @@ -33,6 +34,7 @@

__all__ = [
"Constant",
"COSMOS",
"DWA",
"FAMO",
"GeometricMean",
Expand Down
74 changes: 74 additions & 0 deletions src/torchjd/scalarization/_cosmos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import torch
from torch import Tensor

from ._scalarizer_base import Scalarizer


class COSMOS(Scalarizer):
    r"""
    :class:`~torchjd.scalarization.Scalarizer` implementing the COSMOS scalarization introduced in
    `Scalable Pareto Front Approximation for Deep Multi-Objective Learning
    <https://arxiv.org/pdf/2103.13392>`_.

    The output is a preference-weighted sum of the values, from which a multiple of the cosine
    similarity between the values and the preference vector is subtracted:

    .. math::
        \sum_i r_i L_i - \lambda \frac{\sum_i r_i L_i}{\lVert r \rVert \, \lVert L \rVert},

    where:

    - :math:`L_i` denotes the :math:`i`-th input value (the :math:`i`-th objective);
    - :math:`r_i` denotes the preference weight of that value (the ``weights`` parameter);
    - :math:`\lambda` denotes the coefficient of the cosine-similarity penalty (the ``lambda_``
      parameter);
    - the subtracted quantity equals :math:`\lambda \cos(r, L)`: it rewards a vector of values that
      points along the preference direction, which is what spreads the approximated Pareto front.

    :param lambda_: Coefficient :math:`\lambda` of the cosine-similarity penalty. It must be
        non-negative. With ``0``, COSMOS degenerates to a plain linear scalarization. The paper
        reports values between ``0.01`` and ``8`` depending on the dataset, with no single best
        value.
    :param weights: Preference vector :math:`r` applied to the values (sampled on the probability
        simplex in the paper). When ``None``, a uniform preference summing to one is used. When
        given, its shape must match the shape of the values passed at call time.

    .. note::
        Because of the division by :math:`\lVert L \rVert`, an all-zero vector of values yields
        ``nan``. No check guards against this.

    .. note::
        In the complete COSMOS method, the model is additionally conditioned on the preference
        vector, which is concatenated to its input; this modeling choice is left to the user, and
        only the objective is implemented here. The `libmoon
        <https://github.com/xzhang2523/libmoon>`_ reference divides the linear term by
        :math:`\lVert r \rVert`; here the linear term is the raw weighted sum, as in the paper and
        the official implementation.
    """

    def __init__(self, lambda_: float, weights: Tensor | None = None) -> None:
        # Reject invalid coefficients before any module state is created.
        if lambda_ < 0.0:
            raise ValueError(
                f"Parameter `lambda_` should be non-negative. Found `lambda_ = {lambda_}`."
            )

        super().__init__()
        self.lambda_ = lambda_
        self.weights = weights

    def forward(self, values: Tensor, /) -> Tensor:
        preference = self.weights

        if preference is None:
            # Uniform preference: every entry gets the same weight and the weights sum to one.
            preference = torch.full_like(values, 1.0 / values.numel())
        elif preference.shape != values.shape:
            raise ValueError(
                f"Parameter `weights` should have the same shape as `values`. Found "
                f"`weights.shape = {tuple(preference.shape)}` and `values.shape = "
                f"{tuple(values.shape)}`."
            )

        # The numerator of the cosine similarity is exactly the linear scalarization, so it is
        # computed once and reused for both terms.
        linear_term = (preference * values).sum()
        alignment = linear_term / (preference.norm() * values.norm())
        return linear_term - self.lambda_ * alignment

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return f"{class_name}(lambda_={self.lambda_}, weights={self.weights!r})"
81 changes: 81 additions & 0 deletions tests/unit/scalarization/test_cosmos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import torch
from pytest import mark, raises
from torch import Tensor
from utils.tensors import tensor_

from torchjd.scalarization import COSMOS

from ._asserts import (
assert_grad_flow,
assert_permutation_invariant,
assert_returns_scalar,
)
from ._inputs import all_inputs


def test_value_default() -> None:
    """Check the default (uniform) preference on a vector of equal values."""
    # Equal values are exactly parallel to a uniform preference, hence cos(r, L) = 1. The weighted
    # sum is 1 and the penalty is lambda = 1, so the expected output is 0.
    scalarizer = COSMOS(lambda_=1.0)
    result = scalarizer(tensor_([1.0, 1.0]))
    torch.testing.assert_close(result, tensor_(0.0))


def test_value_lambda_zero_is_linear_scalarization() -> None:
    """Check that a zero coefficient leaves only the uniformly weighted sum."""
    # lambda = 0 removes the cosine penalty: the output is (1 + 2 + 4) / 3.
    scalarizer = COSMOS(lambda_=0.0)
    result = scalarizer(tensor_([1.0, 2.0, 4.0]))
    torch.testing.assert_close(result, tensor_(7.0 / 3.0))


def test_value_with_weights() -> None:
    """Check the linear term when an explicit preference vector is supplied."""
    # lambda = 0 keeps only the weighted sum: 2*3 + 1*4 = 10.
    scalarizer = COSMOS(lambda_=0.0, weights=tensor_([2.0, 1.0]))
    result = scalarizer(tensor_([3.0, 4.0]))
    torch.testing.assert_close(result, tensor_(10.0))


def test_full_formula() -> None:
    """Compare the scalarizer output against the formula written out by hand."""
    values = tensor_([1.0, 2.0, 4.0])
    weights = tensor_([0.5, 0.3, 0.2])
    lambda_ = 2.0

    # Linear term, then the lambda-scaled cosine similarity that gets subtracted from it.
    linear_term = (weights * values).sum()
    penalty = lambda_ * linear_term / (weights.norm() * values.norm())
    expected = linear_term - penalty

    result = COSMOS(lambda_, weights=weights)(values)
    torch.testing.assert_close(result, expected)


@mark.parametrize("values", all_inputs)
def test_expected_structure(values: Tensor) -> None:
    """Run the shared scalar-output assertion on every shared input."""
    scalarizer = COSMOS(lambda_=1.0)
    assert_returns_scalar(scalarizer, values)


@mark.parametrize("values", all_inputs)
def test_grad_flow(values: Tensor) -> None:
    """Run the shared gradient-flow assertion on every shared input."""
    scalarizer = COSMOS(lambda_=1.0)
    assert_grad_flow(scalarizer, values)


@mark.parametrize("values", all_inputs)
def test_permutation_invariant(values: Tensor) -> None:
    """Run the shared permutation-invariance assertion on every shared input."""
    # Under a uniform preference, the weighted sum and the cosine term are both symmetric
    # functions of the inputs.
    scalarizer = COSMOS(lambda_=1.0)
    assert_permutation_invariant(scalarizer, values)


def test_nan_for_all_zero_values() -> None:
    """Check the documented ``nan`` output for an all-zero vector of values."""
    # ||L|| = 0 makes the cosine term a 0 / 0 division, which evaluates to nan.
    scalarizer = COSMOS(lambda_=1.0)
    result = scalarizer(tensor_([0.0, 0.0]))
    assert result.isnan()


@mark.parametrize("lambda_", [-1.0, -0.5])
def test_raises_on_negative_lambda(lambda_: float) -> None:
    """Check that a negative coefficient is rejected at construction time."""
    with raises(ValueError):
        _ = COSMOS(lambda_=lambda_)


def test_raises_on_weights_shape_mismatch() -> None:
    """Check that calling with values whose shape differs from ``weights`` raises."""
    three_weights = tensor_([1.0, 1.0, 1.0])
    two_values = tensor_([1.0, 1.0])
    scalarizer = COSMOS(lambda_=1.0, weights=three_weights)

    # The mismatch can only be detected at call time, once the values are known.
    with raises(ValueError):
        scalarizer(two_values)


def test_representations() -> None:
    """Check both the ``repr`` and the ``str`` of a default-weight scalarizer."""
    scalarizer = COSMOS(lambda_=0.5)
    expected_repr = "COSMOS(lambda_=0.5, weights=None)"
    expected_str = "COSMOS"
    assert repr(scalarizer) == expected_repr
    assert str(scalarizer) == expected_str
Loading