Using DALI in PaddlePaddle

Overview

This example shows how to use DALI in PaddlePaddle.

This example uses the Caffe LMDB reader (`fn.readers.caffe`). See the other examples for details on how to use different data formats.

Let us start by defining some global constants.

The `DALI_EXTRA_PATH` environment variable should point to the location where the data from the DALI extra repository was downloaded. Please make sure that the proper release tag is checked out.

[1]:
import os.path

# Root directory of the test data, taken from the DALI_EXTRA_PATH environment
# variable. Indexing os.environ directly raises KeyError if it is not set.
test_data_root = os.environ["DALI_EXTRA_PATH"]

# Caffe LMDB
# Resolves to <DALI_EXTRA_PATH>/db/lmdb; this is the path handed to the reader.
lmdb_folder = os.path.join(test_data_root, "db", "lmdb")

N = 8  # number of GPUs
BATCH_SIZE = 128  # batch size per GPU
# NOTE(review): IMAGE_SIZE is not referenced by any code visible in this
# example — confirm whether it is still needed and what it denotes.
IMAGE_SIZE = 3

Let us define a pipeline with a reader:

[2]:
from nvidia.dali import pipeline_def, Pipeline
import nvidia.dali.fn as fn
import nvidia.dali.types as types


@pipeline_def
def caffe_pipeline(num_gpus):
    """Define a DALI graph that reads, decodes and augments Caffe LMDB images.

    The reader is sharded across ``num_gpus`` shards, and the shard read by a
    given pipeline instance is selected by that instance's ``device_id``.

    Args:
        num_gpus: Total number of shards the dataset is split into.

    Returns:
        A ``(images, labels)`` pair of DALI graph outputs: the cropped and
        normalized images, and the labels produced by the reader.
    """
    # Use the device id of the pipeline being built as the shard index.
    shard = Pipeline.current().device_id

    # The reader is named so that an iterator can refer to it via reader_name.
    encoded, labels = fn.readers.caffe(
        name="Reader",
        path=lmdb_folder,
        random_shuffle=True,
        shard_id=shard,
        num_shards=num_gpus,
    )

    decoded = fn.decoders.image(encoded, device="mixed")

    # Shorter side is resized to a length drawn uniformly from the given range.
    shorter_side = fn.random.uniform(range=(256, 480))
    resized = fn.resize(decoded, resize_shorter=shorter_side, interp_type=types.INTERP_LINEAR)

    # Normalized (0..1) position of the 227x227 crop window, drawn per sample.
    crop_x = fn.random.uniform(range=(0.0, 1.0))
    crop_y = fn.random.uniform(range=(0.0, 1.0))
    normalized = fn.crop_mirror_normalize(
        resized,
        dtype=types.FLOAT,
        crop=(227, 227),
        mean=[128.0, 128.0, 128.0],
        std=[1.0, 1.0, 1.0],
        crop_pos_x=crop_x,
        crop_pos_y=crop_y,
    )

    return normalized, labels

Let us create the pipelines (one per GPU) and pass them to the PaddlePaddle generic iterator.

[3]:
import numpy as np
from nvidia.dali.plugin.paddle import DALIGenericIterator


# Inclusive range that every label read from the dataset must fall into.
label_range = (0, 999)

# One pipeline instance per GPU; device_id also selects the shard each reads.
pipes = [
    caffe_pipeline(batch_size=BATCH_SIZE, num_threads=2, device_id=device_id, num_gpus=N)
    for device_id in range(N)
]

for pipe in pipes:
    pipe.build()

# The output names map, in order, onto the pipeline's (images, labels) outputs.
# reader_name refers to the reader operator named "Reader" in the pipeline.
dali_iter = DALIGenericIterator(pipes, ["data", "label"], reader_name="Reader")

for data in dali_iter:
    # Testing correctness of labels.
    # Each element of `data` is a dict keyed by the output names above;
    # the image batch is available as d["data"].
    for d in data:
        # Convert once and reuse the NumPy array for all checks.
        labels = np.array(d["label"])
        # labels need to be integers
        assert np.equal(np.mod(labels, 1), 0).all()
        # labels need to be within label_range (inclusive on both ends)
        assert (labels >= label_range[0]).all()
        assert (labels <= label_range[1]).all()

print("OK")
OK
[ ]: