# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional
import torch
import torch.distributed as dist
from .utils import (
all_gather_v_bwd_wrapper,
all_gather_v_wrapper,
gather_v_wrapper,
indexed_all_to_all_v_wrapper,
indexed_all_to_all_v_wrapper_bwd,
scatter_v_wrapper,
)
class AllGatherVAutograd(torch.autograd.Function):
"""
Autograd Wrapper for a distributed AllGatherV primitive.
It is based on the idea of a single global tensor which is distributed
along a specified dimension into chunks of variable size.
This primitive gathers all local tensors from each rank into the
full global tensor on each rank. It is intended to be used in
tensor-parallel settings on tensors which require gradients
to be passed through.
The backward pass performs an AllReduceV operation where
each rank gathers its corresponding chunk of the global gradient
from each other rank and sums up these individual contributions.
"""
@staticmethod
def forward(
ctx,
tensor: torch.Tensor,
sizes: List[int],
dim: int = 0,
use_fp32: bool = True,
group: Optional[dist.ProcessGroup] = None,
) -> torch.Tensor: # pragma: no cover
"""forward pass of the Distributed AllGatherV primitive"""
gathered_tensor = all_gather_v_wrapper(tensor, sizes, dim=dim, group=group)
ctx.sizes = sizes
ctx.group = group
ctx.dim = dim
ctx.use_fp32 = use_fp32
return gathered_tensor
@staticmethod
def backward(ctx, grad_output: torch.Tensor): # pragma: no cover
"""backward pass of the of the Distributed AllGatherV primitive"""
grad_tensor = all_gather_v_bwd_wrapper(
grad_output,
ctx.sizes,
dim=ctx.dim,
use_fp32=ctx.use_fp32,
group=ctx.group,
)
if not ctx.needs_input_grad[0]:
grad_tensor = None
return grad_tensor, None, None, None, None
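# Illustrative sketch, not part of the original module: a toy check of the
# backward semantics described in the class docstring above. It assumes an
# already initialized default torch.distributed process group; the helper name
# `_example_all_gather_v_backward` and the chosen chunk sizes are made up for
# illustration only.
def _example_all_gather_v_backward() -> None:
    """Show that the backward pass sums the per-rank gradient contributions."""
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    # rank r owns (r + 1) rows of the global tensor along dim 0
    sizes = [r + 1 for r in range(world_size)]
    local = torch.randn(sizes[rank], 4, requires_grad=True)
    # forward: every rank receives the full (sum(sizes), 4) tensor
    gathered = AllGatherVAutograd.apply(local, sizes, 0, True, None)
    gathered.sum().backward()
    # every rank contributes a gradient of ones for this chunk, so the
    # reduction in the backward pass yields world_size * ones
    assert torch.allclose(local.grad, torch.full_like(local, float(world_size)))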
class GatherVAutograd(torch.autograd.Function):
"""
Autograd Wrapper for a distributed GatherV primitive.
It is based on the idea of a single global tensor which is distributed
along a specified dimension into chunks of variable size.
This primitive assumes such a distributed tensor and gathers all
local tensors from each rank into the full global tensor valid
on the specified destination rank. It is intended to be used in
tensor-parallel settings on tensors which require gradients to
be passed through.
The backward pass corresponds to a straightforward
ScatterV primitive distributing the global gradient from the
specified destination rank to all the other ranks.
"""
@staticmethod
def forward(
ctx,
tensor: torch.Tensor,
sizes: List[int],
dim: int = 0,
dst: int = 0,
group: Optional[dist.ProcessGroup] = None,
) -> torch.Tensor: # pragma: no cover
"""forward pass of the distributed GatherV primitive"""
gathered_tensor = gather_v_wrapper(tensor, sizes, dim=dim, dst=dst, group=group)
ctx.sizes = sizes
ctx.dim = dim
ctx.dst = dst
ctx.group = group
return gathered_tensor
@staticmethod
def backward(
ctx,
grad_output: torch.Tensor,
) -> torch.Tensor: # pragma: no cover
"""backward pass of the Distributed GatherV primitive"""
grad_tensor = scatter_v_wrapper(
grad_output, ctx.sizes, dim=ctx.dim, src=ctx.dst, group=ctx.group
)
if not ctx.needs_input_grad[0]:
grad_tensor = None
return grad_tensor, None, None, None, None
class ScatterVAutograd(torch.autograd.Function):
"""
Autograd Wrapper for Distributed ScatterV. It is based
on the idea of a single global tensor which is distributed along
a specified dimension into chunks of variable size.
This primitive scatters the global tensor from a specified source rank
into local chunks onto each other rank. It is intended to be used in
tensor-parallel settings on tensors which require gradients to
be passed through.
The backward pass corresponds to a GatherV primitive
gathering local gradients from all the other ranks into a single
global gradient on the specified source rank.
"""
@staticmethod
def forward(
ctx,
tensor: torch.Tensor,
sizes: List[int],
dim: int = 0,
src: int = 0,
group: Optional[dist.ProcessGroup] = None,
) -> torch.Tensor: # pragma: no cover
"""forward pass of the Distributed ScatterV primitive"""
scattered_tensor = scatter_v_wrapper(
tensor, sizes, dim=dim, src=src, group=group
)
ctx.tensor = tensor
ctx.sizes = sizes
ctx.dim = dim
ctx.src = src
ctx.group = group
return scattered_tensor
@staticmethod
def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor: # pragma: no cover
"""backward pass of the Distributed ScatterV primitive"""
grad_tensor = gather_v_wrapper(
grad_output, ctx.sizes, dim=ctx.dim, dst=ctx.src, group=ctx.group
)
if not ctx.needs_input_grad[0]:
grad_tensor = None
return grad_tensor, None, None, None, None
class IndexedAllToAllVAutograd(torch.autograd.Function):
"""
Autograd Wrapper for an Indexed AllToAllV primitive. It is based on the
idea of a single global tensor which is distributed along a
specified dimension into chunks of variable size.
This primitive assumes a set of indices into this dimension which indicate
the corresponding slices sent to each other rank forming an indexed version
of an AllToAllV primitive. It is intended to be used in tensor-parallel settings
on tensors which require gradients to be passed through.
The backward pass performs essentially the same operation as the forward
pass but with reversed roles, followed by an additional reduction of the gathered
gradients so that each rank ends up with the overall gradient for its local partition of the tensor.
"""
@staticmethod
def forward(
ctx,
tensor: torch.Tensor,
indices: List[torch.Tensor],
sizes: List[List[int]],
use_fp32: bool = True,
dim: int = 0,
group: Optional[dist.ProcessGroup] = None,
) -> torch.Tensor: # pragma: no cover
"""forward pass of the Distributed IndexedAlltoAllV primitive"""
tensor_to_recv = indexed_all_to_all_v_wrapper(
tensor,
indices,
sizes,
dim=dim,
group=group,
)
ctx.sizes = sizes
ctx.use_fp32 = use_fp32
ctx.group = group
ctx.tensor_size_along_dim = tensor.size(dim)
ctx.indices = indices
ctx.dim = dim
return tensor_to_recv
@staticmethod
def backward(
ctx,
grad_output: torch.Tensor,
) -> torch.Tensor: # pragma: no cover
"""backward pass of the Distributed IndexedAlltoAllV primitive"""
grad_tensor = indexed_all_to_all_v_wrapper_bwd(
grad_output,
ctx.indices,
ctx.sizes,
tensor_size_along_dim=ctx.tensor_size_along_dim,
use_fp32=ctx.use_fp32,
dim=ctx.dim,
group=ctx.group,
)
if not ctx.needs_input_grad[0]:
grad_tensor = None
return grad_tensor, None, None, None, None, None, None
def all_gather_v(
tensor: torch.Tensor,
sizes: List[int],
dim: int = 0,
use_fp32: bool = True,
group: Optional[dist.ProcessGroup] = None,
) -> torch.Tensor: # pragma: no cover
"""
Autograd Wrapper for a distributed AllGatherV primitive.
It is based on the idea of a single global tensor which is distributed
along a specified dimension into chunks of variable size.
This primitive gathers all local tensors from each rank into the
full global tensor on each rank. It is intended to be used in
tensor-parallel settings on tensors which require gradients
to be passed through.
The backward pass performs an AllReduceV operation where
each rank gathers its corresponding chunk of the global gradient
from each other rank and sums up these individual contributions.
Parameters
----------
tensor : "torch.Tensor"
local tensor on each rank
sizes : List[int]
list of the sizes of each chunk on each rank along distributed dimension,
valid and set on each rank
dim : int, optional
dimension along which global tensor is distributed, by default 0
use_fp32 : bool, optional
boolean flag to indicate whether to use FP32 precision for the
reduction in the backward pass, by default True
group : Optional[dist.ProcessGroup], optional
process group along which global tensor is shared, by default None
Returns
-------
torch.Tensor
full global tensor, valid on each rank
"""
return AllGatherVAutograd.apply(tensor, sizes, dim, use_fp32, group)
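# Illustrative usage sketch, not part of the original module: how all_gather_v
# might be called inside an initialized torch.distributed job. The helper name
# `_example_all_gather_v` and the tensor shapes are assumptions for illustration.
def _example_all_gather_v() -> torch.Tensor:
    """Gather variable-sized per-rank chunks into the full tensor on every rank."""
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    # rank r owns (r + 1) rows of a global tensor distributed along dim 0
    sizes = [r + 1 for r in range(world_size)]
    local = torch.randn(sizes[rank], 8, requires_grad=True)
    # returns the full (sum(sizes), 8) tensor, identical on every rank;
    # gradients flow back to `local` through AllGatherVAutograd
    return all_gather_v(local, sizes, dim=0)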
def gather_v(
tensor: torch.Tensor,
sizes: List[int],
dim: int = 0,
dst: int = 0,
group: Optional[dist.ProcessGroup] = None,
) -> torch.Tensor: # pragma: no cover
"""
Autograd Wrapper for a distributed GatherV primitive.
It is based on the idea of a single global tensor which is distributed
along a specified dimension into chunks of variable size.
This primitive assumes such a distributed tensor and gathers all
local tensors from each rank into the full global tensor valid
on the specified destination rank. It is intended to be used in
tensor-parallel settings on tensors which require gradients to
be passed through.
The backward pass corresponds to a straightforward
ScatterV primitive distributing the global gradient from the
specified destination rank to all the other ranks.
Parameters
----------
tensor : torch.Tensor
local tensor on each rank
sizes : List[int]
list of the sizes of each chunk on each rank along distributed dimension,
valid and set on each rank
dim : int, optional
dimension along which global tensor is distributed, by default 0
dst : int, optional
destination rank which contains the full global tensor after the operation, by default 0
group : Optional[dist.ProcessGroup], optional
process group along which global tensor is shared, by default None
Returns
-------
torch.Tensor
full global tensor, valid on destination rank
"""
return GatherVAutograd.apply(tensor, sizes, dim, dst, group)
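# Illustrative usage sketch, not part of the original module: gather_v collects
# the distributed chunks onto a single destination rank. Assumes an initialized
# process group; names and shapes below are made up for illustration.
def _example_gather_v() -> torch.Tensor:
    """Gather variable-sized per-rank chunks onto rank 0 only."""
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    sizes = [r + 1 for r in range(world_size)]
    local = torch.randn(sizes[rank], 8, requires_grad=True)
    # only dst=0 holds the full (sum(sizes), 8) tensor afterwards; the backward
    # pass scatters the gradient from rank 0 back to each rank's local chunk
    return gather_v(local, sizes, dim=0, dst=0)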
def scatter_v(
tensor: torch.Tensor,
sizes: List[int],
dim: int = 0,
src: int = 0,
group: Optional[dist.ProcessGroup] = None,
) -> torch.Tensor: # pragma: no cover
"""
Autograd Wrapper for Distributed ScatterV. It is based
on the idea of a single global tensor which is distributed along
a specified dimension into chunks of variable size.
This primitive scatters the global tensor from a specified source rank
into local chunks onto each other rank. It is intended to be used in
tensor-parallel settings on tensors which require gradients to
be passed through.
The backward pass corresponds to a GatherV primitive
gathering local gradients from all the other ranks into a single
global gradient on the specified source rank.
Parameters
----------
tensor : torch.Tensor
global tensor, valid on source rank
sizes : List[int]
list of the sizes of each chunk on each rank along distributed dimension,
valid and set on each rank
dim : int, optional
dimension along which global tensor is distributed, by default 0
src : int, optional
source rank of the primitive, i.e. the rank holding the original full global tensor, by default 0
group : Optional[dist.ProcessGroup], optional
process group along which global tensor is shared, by default None
Returns
-------
torch.Tensor
corresponding local part of the global tensor on each rank
"""
return ScatterVAutograd.apply(tensor, sizes, dim, src, group)
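# Illustrative usage sketch, not part of the original module: scatter_v splits a
# global tensor held on the source rank into per-rank chunks. Assumes an
# initialized process group; names and shapes below are made up for illustration.
def _example_scatter_v() -> torch.Tensor:
    """Scatter a global tensor from rank 0 into variable-sized local chunks."""
    world_size = dist.get_world_size()
    sizes = [r + 1 for r in range(world_size)]
    # the global tensor only has to be valid on the source rank (src=0)
    global_tensor = torch.randn(sum(sizes), 8, requires_grad=True)
    # each rank receives its (sizes[rank], 8) slice along dim 0; the backward
    # pass gathers the local gradients back onto the source rank
    return scatter_v(global_tensor, sizes, dim=0, src=0)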
def indexed_all_to_all_v(
tensor: torch.Tensor,
indices: List[torch.Tensor],
sizes: List[List[int]],
use_fp32: bool = True,
dim: int = 0,
group: Optional[dist.ProcessGroup] = None,
) -> torch.Tensor: # pragma: no cover
"""
Autograd Wrapper for an Indexed AllToAllV primitive. It is based on the
idea of a single global tensor which is distributed along a
specified dimension into chunks of variable size.
This primitive assumes a set of indices into this dimension which indicate
the corresponding slices sent to each other rank forming an indexed version
of an AllToAllV primitive. It is intended to be used in tensor-parallel settings
on tensors which require gradients to be passed through.
The backward pass performs essentially the same operation as the forward
pass but with reversed roles, followed by an additional reduction of the gathered
gradients so that each rank ends up with the overall gradient for its local partition of the tensor.
Parameters
----------
tensor : torch.Tensor
local part of global tensor on each rank
indices : List[torch.Tensor]
list of indices on each rank of slices being sent to
each other rank from this rank
sizes : List[List[int]]
number of indices each rank sends to each other rank,
valid and set on each rank, e.g. sizes[0][3] corresponds
to the number of slices rank 0 sends to rank 3
use_fp32 : bool, optional
flag to specify whether to use FP32 precision in the reduction
in the backward pass, by default True
dim : int, optional
dimension along which global tensor is distributed, by default 0
group : Optional[dist.ProcessGroup], optional
process group along which global tensor is shared, by default None
Returns
-------
torch.Tensor
local result of primitive corresponding to indexed global tensor
"""
return IndexedAllToAllVAutograd.apply(
tensor,
indices,
sizes,
use_fp32,
dim,
group,
)
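# Illustrative usage sketch, not part of the original module: a symmetric
# exchange where every rank sends its first two slices to every other rank.
# Assumes an initialized process group; the exact layout chosen for `indices`
# and `sizes` below is an assumption made for illustration.
def _example_indexed_all_to_all_v() -> torch.Tensor:
    """Exchange indexed slices of the local tensor between all ranks."""
    world_size = dist.get_world_size()
    local = torch.randn(16, 8, requires_grad=True)
    # indices[j]: slices of the local tensor sent from this rank to rank j
    indices = [torch.tensor([0, 1]) for _ in range(world_size)]
    # sizes[i][j]: number of slices rank i sends to rank j (2 for every pair here)
    sizes = [[2] * world_size for _ in range(world_size)]
    return indexed_all_to_all_v(local, indices, sizes, dim=0)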