{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MXNet with DALI - ResNet 50 example\n",
    "\n",
    "## Overview\n",
    "\n",
    "This example shows, how to use DALI pipelines with Apache MXNet.\n",
    "\n",
    "## ResNet 50 Pipeline\n",
    "\n",
    "Let us first define a few global constants."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nvidia.dali.pipeline import Pipeline\n",
    "import nvidia.dali.ops as ops\n",
    "import nvidia.dali.types as types\n",
    "import nvidia.dali.fn as fn\n",
    "\n",
    "N = 8  # number of GPUs\n",
    "batch_size = 128  # batch size per GPU\n",
    "\n",
    "db_folder = \"/data/imagenet/train-480-val-256-recordio/\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### The Training Pipeline\n",
    "\n",
    "The training pipeline consists of the following steps:\n",
    " * Data is first read from MXNet's recordIO file (the reader op is given a name `Reader` for later use)\n",
    " * Then, images are decoded using nvJPEG\n",
    " * RGB images are then randomly cropped and resized to the final size of (224, 224) pixels\n",
    " * Finally, the batch is transposed from NHWC layout to NCHW layout, normalized and randomly mirrored.\n",
    " \n",
    "`DALIClassificationIterator`, which we will use for interfacing with MXNet in this example, requires outputs of the pipeline to follow (image, label) structure.\n",
    "\n",
    "The validation pipeline is similar to the training pipeline, but omits the random resized crop and random mirroring steps, as well as shuffling the data coming from the reader."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_dali_pipeline(batch_size, num_threads, device_id, db_folder, crop, size,\n",
    "                         shard_id, num_shards, dali_cpu=False, is_training=True):\n",
    "    pipeline = Pipeline(batch_size, num_threads, device_id, seed=12 + device_id)\n",
    "    with pipeline:\n",
    "        images, labels = fn.readers.mxnet(path=[db_folder+\"train.rec\"], index_path=[db_folder+\"train.idx\"],\n",
    "                                          random_shuffle=False, shard_id=device_id, num_shards=num_shards,\n",
    "                                          pad_last_batch=is_training, name=\"Reader\")\n",
    "        dali_device = 'cpu' if dali_cpu else 'gpu'\n",
    "        decoder_device = 'cpu' if dali_cpu else 'mixed'\n",
    "        # ask nvJPEG to preallocate memory for the biggest sample in ImageNet for CPU and GPU to avoid reallocations in runtime\n",
    "        device_memory_padding = 211025920 if decoder_device == 'mixed' else 0\n",
    "        host_memory_padding = 140544512 if decoder_device == 'mixed' else 0\n",
    "        # ask HW NVJPEG to allocate memory ahead for the biggest image in the data set to avoid reallocations in runtime\n",
    "        preallocate_width_hint = 5980 if decoder_device == 'mixed' else 0\n",
    "        preallocate_height_hint = 6430 if decoder_device == 'mixed' else 0\n",
    "        if is_training:\n",
    "            images = fn.decoders.image_random_crop(images,\n",
    "                                                  device=decoder_device, output_type=types.RGB,\n",
    "                                                  device_memory_padding=device_memory_padding,\n",
    "                                                  host_memory_padding=host_memory_padding,\n",
    "                                                  preallocate_width_hint=preallocate_width_hint,\n",
    "                                                  preallocate_height_hint=preallocate_height_hint,\n",
    "                                                  random_aspect_ratio=[0.8, 1.25],\n",
    "                                                  random_area=[0.1, 1.0],\n",
    "                                                  num_attempts=100)\n",
    "            images = fn.resize(images,\n",
    "                               device=dali_device,\n",
    "                               resize_x=crop,\n",
    "                               resize_y=crop,\n",
    "                               interp_type=types.INTERP_TRIANGULAR)\n",
    "            mirror = fn.random.coin_flip(probability=0.5)\n",
    "        else:\n",
    "            images = fn.decoders.image(images,\n",
    "                                       device=decoder_device,\n",
    "                                       output_type=types.RGB)\n",
    "            images = fn.resize(images,\n",
    "                               device=dali_device,\n",
    "                               size=size,\n",
    "                               mode=\"not_smaller\",\n",
    "                               interp_type=types.INTERP_TRIANGULAR)\n",
    "            mirror = False\n",
    "\n",
    "        images = fn.crop_mirror_normalize(images.gpu(),\n",
    "                                          dtype=types.FLOAT,\n",
    "                                          output_layout=\"CHW\",\n",
    "                                          crop=(crop, crop),\n",
    "                                          mean=[0.485 * 255,0.456 * 255,0.406 * 255],\n",
    "                                          std=[0.229 * 255,0.224 * 255,0.225 * 255],\n",
    "                                          mirror=mirror)\n",
    "        labels = labels.gpu()\n",
    "        pipeline.set_outputs(images, labels)\n",
    "    return pipeline\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainpipes = [create_dali_pipeline(db_folder=db_folder, batch_size=batch_size, \n",
    "                                   num_threads=2, device_id=i, shard_id=i, num_shards=N, is_training=True,\n",
    "                                   crop=224, size=256) for i in range(N)]\n",
    "valpipes = [create_dali_pipeline(db_folder=db_folder, batch_size=batch_size,\n",
    "                                 num_threads=2, device_id=i, shard_id=i, num_shards=N, is_training=False, \n",
    "                                 crop=224, size=256) for i in range(N)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using the MXNet Plugin\n",
    "\n",
    "MXNet data iterators need to know what is the size of the dataset. Since DALI pipelines may consist of multiple readers, potentially with differently sized datasets, we need to specify the reader which we ask for the epoch size. That is why we gave a name to readers in both training and validation pipelines.\n",
    "\n",
    "In order to get the epoch size out of the reader, we need to build one of the training and one of the validation pipelines."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainpipes[0].build()\n",
    "valpipes[0].build()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training pipeline epoch size: 1281167\n",
      "Validation pipeline epoch size: 50000\n"
     ]
    }
   ],
   "source": [
    "print(\"Training pipeline epoch size: {}\".format(trainpipes[0].epoch_size(\"Reader\")))\n",
    "print(\"Validation pipeline epoch size: {}\".format(valpipes[0].epoch_size(\"Reader\")))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can make MXNet iterators out of our pipelines, using `DALIClassificationIterator` class."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nvidia.dali.plugin.mxnet import DALIClassificationIterator, LastBatchPolicy\n",
    "dali_train_iter = DALIClassificationIterator(trainpipes, reader_name=\"Reader\", last_batch_policy=LastBatchPolicy.PARTIAL)\n",
    "dali_val_iter = DALIClassificationIterator(valpipes, reader_name=\"Reader\", last_batch_policy=LastBatchPolicy.PARTIAL)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training with MXNet\n",
    "\n",
    "Once we have MXNet data iterators from `DALIClassificationIterator`, we can use them instead of MXNet's`mx.io.ImageRecordIter`. Here we show modified `train_imagenet.py` example that uses our DALI pipelines."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:root:start with arguments Namespace(batch_size=1024, benchmark=0, data_nthreads=40, data_train='/data/imagenet/train-480-val-256-recordio/train.rec', data_train_idx='', data_val='/data/imagenet/train-480-val-256-recordio/val.rec', data_val_idx='', disp_batches=100, dtype='float16', gc_threshold=0.5, gc_type='none', gpus='0, 1, 2, 3, 4, 5, 6, 7', image_shape='3,224,224', initializer='default', kv_store='device', load_epoch=None, loss='', lr=0.1, lr_factor=0.1, lr_step_epochs='30,60', macrobatch_size=0, max_random_aspect_ratio=0.25, max_random_h=0, max_random_l=0, max_random_rotate_angle=0, max_random_s=0, max_random_scale=1, max_random_shear_ratio=0.0, min_random_scale=0.533, model_prefix=None, mom=0.9, monitor=0, network='resnet-v1', num_classes=1000, num_epochs=1, num_examples=1281167, num_layers=50, optimizer='sgd', pad_size=0, random_crop=1, random_mirror=1, rgb_mean='123.68,116.779,103.939', test_io=0, top_k=0, warmup_epochs=5, warmup_strategy='linear', wd=0.0001)\n",
      "INFO:root:Epoch[0] Batch [100]\tSpeed: 4407.30 samples/sec\taccuracy=0.001141\n",
      "INFO:root:Epoch[0] Batch [200]\tSpeed: 4444.77 samples/sec\taccuracy=0.003184\n",
      "INFO:root:Epoch[0] Batch [300]\tSpeed: 4395.88 samples/sec\taccuracy=0.006074\n",
      "INFO:root:Epoch[0] Batch [400]\tSpeed: 4384.70 samples/sec\taccuracy=0.011182\n",
      "INFO:root:Epoch[0] Batch [500]\tSpeed: 4389.42 samples/sec\taccuracy=0.017441\n",
      "INFO:root:Epoch[0] Batch [600]\tSpeed: 4382.10 samples/sec\taccuracy=0.026377\n",
      "INFO:root:Epoch[0] Batch [700]\tSpeed: 4388.26 samples/sec\taccuracy=0.036611\n",
      "INFO:root:Epoch[0] Batch [800]\tSpeed: 4383.51 samples/sec\taccuracy=0.047139\n",
      "INFO:root:Epoch[0] Batch [900]\tSpeed: 4402.73 samples/sec\taccuracy=0.057686\n",
      "INFO:root:Epoch[0] Batch [1000]\tSpeed: 4392.32 samples/sec\taccuracy=0.067861\n",
      "INFO:root:Epoch[0] Batch [1100]\tSpeed: 4384.42 samples/sec\taccuracy=0.079248\n",
      "INFO:root:Epoch[0] Batch [1200]\tSpeed: 4385.37 samples/sec\taccuracy=0.090088\n",
      "INFO:root:Epoch[0] Train-accuracy=0.098537\n",
      "INFO:root:Epoch[0] Time cost=295.153\n",
      "WARNING:root:DALI iterator does not support resetting while epoch is not finished. Ignoring...\n",
      "INFO:root:Epoch[0] Validation-accuracy=0.104393\n"
     ]
    }
   ],
   "source": [
    "import os.path\n",
    "import argparse\n",
    "import logging\n",
    "logging.basicConfig(level=logging.DEBUG)\n",
    "from resnetn.common import find_mxnet, data, fit\n",
    "import mxnet as mx\n",
    "\n",
    "gpus_string = \"\".join(str(list(range(N)))).replace('[','').replace(']','')\n",
    "\n",
    "s = ['--gpu', gpus_string,\n",
    "     '--batch-size', str(batch_size * N),\n",
    "     '--num-epochs', '1',\n",
    "     '--data-train', '/data/imagenet/train-480-val-256-recordio/train.rec',\n",
    "     '--data-val', '/data/imagenet/train-480-val-256-recordio/val.rec',\n",
    "     '--disp-batches', '100',\n",
    "     '--network', 'resnet-v1',\n",
    "     '--num-layers', '50',\n",
    "     '--data-nthreads', '40',\n",
    "     '--min-random-scale', '0.533',\n",
    "     '--max-random-shear-ratio', '0',\n",
    "     '--max-random-rotate-angle', '0',\n",
    "     '--max-random-h', '0',\n",
    "     '--max-random-l', '0',\n",
    "     '--max-random-s', '0',\n",
    "     '--dtype', 'float16']\n",
    "\n",
    "# parse args\n",
    "parser = argparse.ArgumentParser(description=\"train imagenet-1k\",\n",
    "                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)\n",
    "fit.add_fit_args(parser)\n",
    "data.add_data_args(parser)\n",
    "data.add_data_aug_args(parser)\n",
    "# use a large aug level\n",
    "data.set_data_aug_level(parser, 3)\n",
    "parser.set_defaults(\n",
    "        # network\n",
    "        network          = 'resnet',\n",
    "        num_layers       = 50,\n",
    "        # data\n",
    "        num_classes      = 1000,\n",
    "        num_examples     = 1281167,\n",
    "        image_shape      = '3,224,224',\n",
    "        min_random_scale = 1, # if input image has min size k, suggest to use\n",
    "                              # 256.0/x, e.g. 0.533 for 480\n",
    "        # train\n",
    "        num_epochs       = 80,\n",
    "        lr_step_epochs   = '30,60',\n",
    "        dtype            = 'float32'\n",
    "    )\n",
    "args = parser.parse_args(s)\n",
    "\n",
    "\n",
    "# load network\n",
    "from importlib import import_module\n",
    "net = import_module('resnetn.symbols.'+args.network)\n",
    "sym = net.get_symbol(1000, 50, \"3,224,224\", dtype='float16')\n",
    "\n",
    "def get_dali_iter(args, kv=None):\n",
    "    return (dali_train_iter, dali_val_iter)\n",
    "\n",
    "# train\n",
    "#fit.fit(args, sym, data.get_rec_iter)\n",
    "fit.fit(args, sym, get_dali_iter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}