# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, List

import numpy as np
import torch

from modulus.models.pix2pix import Pix2Pix
from modulus.sym.key import Key
from modulus.sym.models.activation import Activation, get_activation_fn
from modulus.sym.models.arch import Arch

Tensor = torch.Tensor

class Pix2PixArch(Arch):
"""Convolutional encoder-decoder based on pix2pix generator models.
Note
----
The pix2pix architecture supports options for 1D, 2D and 3D fields which can
be constroled using the `dimension` parameter.
Parameters
----------
input_keys : List[Key]
Input key list. The key dimension size should equal the variables channel dim.
output_keys : List[Key]
Output key list. The key dimension size should equal the variables channel dim.
dimension : int
Model dimensionality (supports 1, 2, 3).
detach_keys : List[Key], optional
List of keys to detach gradients, by default []
conv_layer_size : int, optional
Latent channel size after first convolution, by default 64
n_downsampling : int, optional
Number of downsampling/upsampling blocks, by default 3
n_blocks : int, optional
Number of residual blocks in middle of model, by default 3
scaling_factor : int, optional
Scaling factor to increase the output feature size compared to the input
(1, 2, 4, or 8), by default 1
activation_fn : Activation, optional
Activation function, by default :obj:`Activation.RELU`
batch_norm : bool, optional
Batch normalization, by default False
padding_type : str, optional
Padding type ('constant', 'reflect', 'replicate' or 'circular'),
by default "reflect"
Variable Shape
--------------
Input variable tensor shape:
- 1D: :math:`[N, size, W]`
- 2D: :math:`[N, size, H, W]`
- 3D: :math:`[N, size, D, H, W]`
Output variable tensor shape:
- 1D: :math:`[N, size, W]`
- 2D: :math:`[N, size, H, W]`
- 3D: :math:`[N, size, D, H, W]`
Note
----
Reference: Isola, Phillip, et al. “Image-To-Image translation with conditional
adversarial networks” Conference on Computer Vision and Pattern Recognition, 2017.
https://arxiv.org/abs/1611.07004
Reference: Wang, Ting-Chun, et al. “High-Resolution image synthesis and semantic
manipulation with conditional GANs” Conference on Computer Vision and Pattern
Recognition, 2018. https://arxiv.org/abs/1711.11585
Note
----
Based on the implementation: https://github.com/NVIDIA/pix2pixHD
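
    Example
    -------
    A minimal sketch of constructing the model and evaluating it on a random
    2D field (the key names "U" and "p" below are illustrative, not part of
    the API):

    >>> arch = Pix2PixArch(
    ...     input_keys=[Key("U", size=2)],
    ...     output_keys=[Key("p", size=1)],
    ...     dimension=2,
    ... )
    >>> out = arch({"U": torch.randn(4, 2, 64, 64)})
    >>> out["p"].shape
    torch.Size([4, 1, 64, 64])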
"""
def __init__(
self,
input_keys: List[Key],
output_keys: List[Key],
dimension: int,
detach_keys: List[Key] = [],
conv_layer_size: int = 64,
n_downsampling: int = 3,
n_blocks: int = 3,
scaling_factor: int = 1,
activation_fn: Activation = Activation.RELU,
batch_norm: bool = False,
padding_type="reflect",
):
super().__init__(
input_keys=input_keys, output_keys=output_keys, detach_keys=detach_keys
)
in_channels = sum(self.input_key_dict.values())
out_channels = sum(self.output_key_dict.values())
self.var_dim = 1
activation_fn = get_activation_fn(activation_fn, module=True, inplace=True)
# Scaling factor must be 1, 2, 4, or 8
scaling_factor = int(scaling_factor)
assert scaling_factor in {
1,
2,
4,
8,
}, "The scaling factor must be 1, 2, 4, or 8!"
n_upsampling = n_downsampling + int(np.log2(scaling_factor))
self._impl = Pix2Pix(
in_channels,
out_channels,
dimension,
conv_layer_size,
n_downsampling,
n_upsampling,
n_blocks,
activation_fn,
batch_norm,
padding_type,
)

    def forward(self, in_vars: Dict[str, Tensor]) -> Dict[str, Tensor]:
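        # Concatenate the input variables into a single tensor along the
        # channel dimension (dim=1), detaching and scaling where requested.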
input = self.prepare_input(
in_vars,
self.input_key_dict.keys(),
detach_dict=self.detach_key_dict,
dim=1,
input_scales=self.input_scales,
)
output = self._impl(input)
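        # Split the network output back into the named output variables,
        # applying any output scaling.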
return self.prepare_output(
output, self.output_key_dict, dim=1, output_scales=self.output_scales
)
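

# A minimal usage sketch (an illustration, not part of the library): with
# scaling_factor=2 the output spatial size is double the input, which suits
# super-resolution style mappings. The key names below are hypothetical.
if __name__ == "__main__":
    arch = Pix2PixArch(
        input_keys=[Key("coarse", size=1)],
        output_keys=[Key("fine", size=1)],
        dimension=2,
        scaling_factor=2,
    )
    out = arch({"coarse": torch.randn(2, 1, 32, 32)})
    print(out["fine"].shape)  # expected: torch.Size([2, 1, 64, 64])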