Using DALI in PaddlePaddle


This example shows how to use DALI in PaddlePaddle.

This example uses readers.Caffe. See other examples for details on how to use different data formats.

Let us start by defining some global constants.

The `DALI_EXTRA_PATH` environment variable should point to the location where the data from the DALI extra repository has been downloaded. Please make sure that the proper release tag is checked out.

import os.path

# Root directory of the downloaded DALI extra data; a KeyError is raised here
# if the DALI_EXTRA_PATH environment variable is not set.
test_data_root = os.environ['DALI_EXTRA_PATH']

# Caffe LMDB dataset, located at <DALI_EXTRA_PATH>/db/lmdb
lmdb_folder = os.path.join(test_data_root, 'db', 'lmdb')

N = 8             # number of GPUs
BATCH_SIZE = 128  # batch size per GPU

Let us define a pipeline with a reader:

from nvidia.dali import pipeline_def, Pipeline
import nvidia.dali.fn as fn
import nvidia.dali.types as types

@pipeline_def
def caffe_pipeline(num_gpus):
    """Define a DALI pipeline that reads and augments images from a Caffe LMDB.

    The function is wrapped with ``pipeline_def``, so callers additionally
    pass the pipeline construction arguments (``batch_size``, ``num_threads``,
    ``device_id``) as keyword arguments and receive a pipeline object.

    Args:
        num_gpus: Total number of shards the dataset is split into; each
            pipeline reads the shard whose index equals its own ``device_id``.

    Returns:
        A tuple ``(images, labels)`` of DALI data nodes: the resized, cropped
        and normalized images, and the labels produced by the reader.
    """
    # The pipeline currently being defined knows which device it was created for;
    # that id doubles as the shard index so every GPU reads a different shard.
    device_id = Pipeline.current().device_id
    jpegs, labels = fn.readers.caffe(
        name='Reader',
        path=lmdb_folder,
        random_shuffle=True,
        shard_id=device_id,
        num_shards=num_gpus)
    images = fn.decoders.image(jpegs, device='mixed')
    # Resize so that the shorter side gets a random length drawn from [256, 480).
    images = fn.resize(
        images,
        resize_shorter=fn.random.uniform(range=(256, 480)),
        interp_type=types.INTERP_LINEAR)
    # Take a 227x227 crop at a random relative position and normalize the pixels.
    images = fn.crop_mirror_normalize(
        images,
        crop_pos_x=fn.random.uniform(range=(0.0, 1.0)),
        crop_pos_y=fn.random.uniform(range=(0.0, 1.0)),
        dtype=types.FLOAT,
        crop=(227, 227),
        mean=[128., 128., 128.],
        std=[1., 1., 1.])

    return images, labels

Let us create the pipelines (one per GPU) and pass them to the PaddlePaddle generic iterator.

import numpy as np
from nvidia.dali.plugin.paddle import DALIGenericIterator

# Inclusive (min, max) range that every label returned by the reader must fall in.
label_range = (0, 999)

# One pipeline per GPU; device_id is also used inside caffe_pipeline as the shard index.
pipes = [
    caffe_pipeline(batch_size=BATCH_SIZE, num_threads=2, device_id=device_id, num_gpus=N)
    for device_id in range(N)
]

# Build every pipeline before handing them over to the iterator.
for pipe in pipes:
    pipe.build()

# The output names map, in order, to the (images, labels) outputs of the pipeline;
# reader_name refers to the reader operator named 'Reader' in caffe_pipeline.
dali_iter = DALIGenericIterator(pipes, ['data', 'label'], reader_name='Reader')

for data in dali_iter:
    # Testing correctness of labels.
    # Each element of `data` is indexed by the output names given to the iterator
    # ("data" and "label"); presumably there is one element per pipeline.
    for d in data:
        # d["data"] holds the matching images; only the labels are verified here.
        # Convert once so all checks operate on the same NumPy array.
        label = np.array(d["label"])
        # labels need to be integers
        assert np.equal(np.mod(label, 1), 0).all()
        # labels need to be within label_range (both ends inclusive)
        assert (label >= label_range[0]).all()
        assert (label <= label_range[1]).all()

[ ]: