Create a custom operator¶

DALI allows you to create a custom operator in C++ and load it at runtime. There are several reasons you might need to write your own custom operator, for instance:

DALI doesn’t support the operation you want to perform and it cannot be expressed by a composition of other operators
You want to write an operator that depends on a third party library
You want to optimize your pipeline by providing a manually fused operation in C++

In this tutorial we will walk you through the process of writing, compiling and loading a plugin with a DALI custom operator. For demonstration purposes we will provide a CPU and a GPU implementation for the ‘CustomDummy’ operator. The implementation will just copy the input data to the output without any modifications.

PREREQUISITES:

Knowledge of C++
DALI installed from the binary distribution or compiled from source
Basic knowledge of CMake

Operator definition¶

First we define the operator in a header file

[1]:

! cat customdummy/dummy.h

#ifndef EXAMPLE_DUMMY_H_
#define EXAMPLE_DUMMY_H_

#include "dali/pipeline/operators/operator.h"

namespace other_ns {

/**
 * @brief Example custom operator, parameterized on the DALI backend type.
 *
 * The class only declares RunImpl; the per-backend definitions are supplied
 * as explicit specializations in the implementation files. Instances can be
 * neither copied nor moved.
 */
template <typename Backend>
class Dummy : public ::dali::Operator<Backend> {
 public:
  /// Forwards the operator specification to the ::dali::Operator base class.
  explicit Dummy(const ::dali::OpSpec &spec)
      : ::dali::Operator<Backend>(spec) {}

  // Copying and moving are both disabled.
  Dummy(const Dummy&) = delete;
  Dummy(Dummy&&) = delete;
  Dummy& operator=(const Dummy&) = delete;
  Dummy& operator=(Dummy&&) = delete;

  virtual ~Dummy() = default;

 protected:
  /// Backend-specific work for the input/output at position `idx` of `ws`.
  void RunImpl(::dali::Workspace<Backend> *ws, const int idx) override;
};

}  // namespace other_ns

#endif  // EXAMPLE_DUMMY_H_

CPU operator implementation¶

Next, we provide the CPU implementation in a C++ implementation file. We register the schema for the custom operator with the DALI_SCHEMA macro and register the CPU version of the operator with DALI_REGISTER_OPERATOR.

[2]:

! cat customdummy/dummy.cc

#include "dummy.h"

namespace other_ns {

// CPU specialization: copies the input at position `idx` into the output at
// the same position without modifying the data.
template<>
void Dummy<::dali::CPUBackend>::RunImpl(::dali::SampleWorkspace *ws, const int idx) {
  auto &src = ws->Input<::dali::CPUBackend>(idx);
  auto dst = ws->Output<::dali::CPUBackend>(idx);

  // Give the output the input's type, then resize it like the input,
  // before any data is written into it.
  dst->set_type(src.type());
  dst->ResizeLike(src);

  // Let the input's TypeInfo perform the copy of src.size() elements.
  ::dali::TypeInfo src_type = src.type();
  src_type.Copy<::dali::CPUBackend, ::dali::CPUBackend>(
      dst->raw_mutable_data(), src.raw_data(), src.size(), 0);
}

}  // namespace other_ns

// Register the CPU specialization of Dummy as the CPU implementation of the
// operator named CustomDummy.
DALI_REGISTER_OPERATOR(CustomDummy, ::other_ns::Dummy<::dali::CPUBackend>, ::dali::CPU);

// Schema for CustomDummy: its documentation string, and exactly one input
// and one output.
DALI_SCHEMA(CustomDummy)
  .DocStr("Make a copy of the input tensor")
  .NumInput(1)
  .NumOutput(1);

GPU operator implementation¶

Similarly, we provide a GPU implementation in a CUDA implementation file and register it with DALI_REGISTER_OPERATOR.

[3]:

! cat customdummy/dummy.cu

#include <cuda_runtime_api.h>
#include "dummy.h"

namespace other_ns {

// GPU specialization: copies the input at position `idx` into the output at
// the same position with an asynchronous device-to-device memcpy.
template<>
void Dummy<::dali::GPUBackend>::RunImpl(::dali::DeviceWorkspace *ws, const int idx) {
  auto &src = ws->Input<::dali::GPUBackend>(idx);
  auto dst = ws->Output<::dali::GPUBackend>(idx);

  // Give the output the input's type, then resize it like the input,
  // before any data is written into it.
  dst->set_type(src.type());
  dst->ResizeLike(src);

  auto dst_ptr = dst->raw_mutable_data();
  auto src_ptr = src.raw_data();
  auto byte_count = src.nbytes();

  // The copy is enqueued on the stream provided by the workspace.
  CUDA_CALL(cudaMemcpyAsync(dst_ptr, src_ptr, byte_count,
                            cudaMemcpyDeviceToDevice, ws->stream()));
}

}  // namespace other_ns

// Register the GPU specialization of Dummy as the GPU implementation of the
// operator named CustomDummy.
DALI_REGISTER_OPERATOR(CustomDummy, ::other_ns::Dummy<::dali::GPUBackend>, ::dali::GPU);

Building the plugin¶

The last step is to specify the build configuration.

We can use nvidia.dali.sysconfig to retrieve the build configuration parameters

[4]:

from __future__ import print_function
import nvidia.dali.sysconfig as sysconfig

[5]:

print(sysconfig.get_include_dir())

/usr/local/lib/python2.7/dist-packages/nvidia/dali/include

[6]:

print(sysconfig.get_lib_dir())

/usr/local/lib/python2.7/dist-packages/nvidia/dali

[7]:

print(sysconfig.get_compile_flags())

['-I/usr/local/lib/python2.7/dist-packages/nvidia/dali/include', '-D_GLIBCXX_USE_CXX11_ABI=0']

[8]:

print(sysconfig.get_link_flags())

['-L/usr/local/lib/python2.7/dist-packages/nvidia/dali', '-ldali']

Important: Only one version of libdali.so should be loaded in the process at the same time. Therefore, a plugin must be linked against the exact same library in the python package directory of DALI that you intend to use to load your plugin. Due to this limitation, upgrading your DALI version will require you to relink your plugin against the new library.

In this example we use CMake to build the plugin

[9]:

! cat customdummy/CMakeLists.txt

cmake_minimum_required(VERSION 3.5)
find_package(CUDA 8.0 REQUIRED)

# Ask the installed DALI python package for the directory containing its
# library, so the plugin is linked against the library from that package.
execute_process(
        COMMAND python -c "import nvidia.dali as dali; print(dali.sysconfig.get_lib_dir())"
        OUTPUT_VARIABLE DALI_LIB_DIR
        RESULT_VARIABLE DALI_LIB_DIR_STATUS)
# Fail with a clear message instead of continuing with an empty path when
# python or the nvidia.dali package is not available.
if(NOT DALI_LIB_DIR_STATUS EQUAL 0)
    message(FATAL_ERROR "Failed to query the DALI library directory from nvidia.dali.sysconfig")
endif()
# Quoted so that an empty or unusual value is still passed as one argument.
string(STRIP "${DALI_LIB_DIR}" DALI_LIB_DIR)

# Ask DALI for the compile flags it reports, joined into one
# space-separated string.
execute_process(
        COMMAND python -c "import nvidia.dali as dali; print(\" \".join(dali.sysconfig.get_compile_flags()))"
        OUTPUT_VARIABLE DALI_COMPILE_FLAGS
        RESULT_VARIABLE DALI_COMPILE_FLAGS_STATUS)
if(NOT DALI_COMPILE_FLAGS_STATUS EQUAL 0)
    message(FATAL_ERROR "Failed to query the DALI compile flags from nvidia.dali.sysconfig")
endif()
string(STRIP "${DALI_COMPILE_FLAGS}" DALI_COMPILE_FLAGS)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 ${DALI_COMPILE_FLAGS} ")
link_directories( "${DALI_LIB_DIR}" )

# Build the CPU and GPU implementations into a single shared library and
# link it against the DALI library.
cuda_add_library(customdummy SHARED dummy.cc dummy.cu )
target_link_libraries(customdummy dali)

Now we are ready to compile our plugin containing the custom operator ‘CustomDummy’

[10]:

# Start from a clean out-of-source build directory, then configure with CMake and build with make.
! rm -rf customdummy/build
! mkdir -p customdummy/build
! cd customdummy/build && \
  cmake .. && \
  make

-- The C compiler identification is GNU 5.4.0
-- The CXX compiler identification is GNU 5.4.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Looking for pthread.h
-- Looking for pthread.h - found
-- Looking for pthread_create
-- Looking for pthread_create - not found
-- Looking for pthread_create in pthreads
-- Looking for pthread_create in pthreads - not found
-- Looking for pthread_create in pthread
-- Looking for pthread_create in pthread - found
-- Found Threads: TRUE
-- Found CUDA: /usr/local/cuda (found suitable version "9.0", minimum required is "8.0")
-- Configuring done
-- Generating done
-- Build files have been written to: /home/dali/git/dali/docs/examples/extend/customdummy/build
[ 33%] Building NVCC (Device) object CMakeFiles/customdummy.dir/customdummy_generated_dummy.cu.o
Scanning dependencies of target customdummy
[ 66%] Building CXX object CMakeFiles/customdummy.dir/dummy.cc.o
[100%] Linking CXX shared library libcustomdummy.so
[100%] Built target customdummy

After the build steps we should have a dynamic library file created and ready to use.

[11]:

! ls customdummy/build/*.so

customdummy/build/libcustomdummy.so

Importing the plugin¶

First we can see that there is no operator named ‘CustomDummy’ yet

[12]:

import nvidia.dali.ops as ops
# The plugin has not been loaded yet, so looking up ops.CustomDummy is
# expected to fail; the exception is caught and its message printed.
try:
    help(ops.CustomDummy)
except Exception as e:
    print('Error: ' + str(e))

Error: 'module' object has no attribute 'CustomDummy'

We can now load the plugin

[13]:

import nvidia.dali.plugin_manager as plugin_manager
# Load the shared library produced by the build step.
plugin_manager.load_library('./customdummy/build/libcustomdummy.so')

and verify that the new operator is available

[14]:

help(ops.CustomDummy)

Help on class CustomDummy in module nvidia.dali.ops:

class CustomDummy(__builtin__.object)
 |  This is 'CPU', 'GPU' operator
 |
 |  Make a copy of the input tensor
 |
 |  Parameters
 |  ----------
 |
 |  Methods defined here:
 |
 |  __call__(self, *inputs, **kwargs)
 |
 |  __init__(self, **kwargs)
 |
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |
 |  __dict__
 |      dictionary for instance variables (if defined)
 |
 |  __weakref__
 |      list of weak references to the object (if defined)
 |
 |  device
 |
 |  schema
 |
 |  spec