core.resharding.copy_services.gloo_copy_service#

Module Contents#

Classes#

SendOp

Simple container describing a single send operation.

RecvOp

Simple container describing a single receive operation.

GlooCopyService

CopyService implementation that routes refit traffic over a CPU/Gloo process group instead of NCCL.

Data#

API#

core.resharding.copy_services.gloo_copy_service.logger#

‘getLogger(…)’

class core.resharding.copy_services.gloo_copy_service.SendOp#

Simple container describing a single send operation.

task_id: int | None#

None

tensor: torch.Tensor#

None

dest_rank: int#

None

class core.resharding.copy_services.gloo_copy_service.RecvOp#

Simple container describing a single receive operation.

task_id: int | None#

None

tensor: torch.Tensor#

None

src_rank: int#

None

class core.resharding.copy_services.gloo_copy_service.GlooCopyService#

Bases: core.resharding.copy_services.base.CopyService

CopyService implementation that routes refit traffic over a CPU/Gloo process group instead of NCCL.

Initialization

submit_send(src_tensor: torch.Tensor, dest_rank: int)#

Submit a send operation.

submit_send_with_id(
task_id: int,
src_tensor: torch.Tensor,
dest_rank: int,
)#

Submit a send operation with a unique task identifier.

submit_recv(dest_tensor: torch.Tensor, src_rank: int)#

Submit a receive operation.

submit_recv_with_id(
task_id: int,
dest_tensor: torch.Tensor,
src_rank: int,
)#

Submit a receive operation with a unique task identifier.

run()#