Source code for nemo.collections.nlp.models.glue_benchmark.glue_benchmark_model

# Copyright 2018 The Google AI Language Team Authors and
# The HuggingFace Inc. team.
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import Dict, Optional, Union

import numpy as np
import torch
from omegaconf import DictConfig, OmegaConf
from pytorch_lightning import Trainer

from nemo.collections.common.losses import CrossEntropyLoss, MSELoss
from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUE_TASKS_NUM_LABELS, GLUEDataset
from nemo.collections.nlp.models.glue_benchmark.metrics_for_glue import compute_metrics
from nemo.collections.nlp.models.nlp_model import NLPModel
from nemo.collections.nlp.modules.common import SequenceClassifier, SequenceRegression
from nemo.collections.nlp.modules.common.lm_utils import get_lm_model
from nemo.collections.nlp.parts.utils_funcs import list2str, tensor2list
from nemo.core.classes import typecheck
from nemo.core.neural_types import NeuralType
from nemo.utils import logging

__all__ = ['GLUEModel']

'''
Some parts of this code were adapted from the HuggingFace library at
https://github.com/huggingface/transformers
This is an example of running a pretrained BERT model on the 9 GLUE tasks; read more
about the GLUE benchmark here: https://gluebenchmark.com
Download the GLUE data by running the script:
https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e

Some of these tasks have small datasets, so training can lead to high variance
in the results between different runs. Below are the medians over 5 runs
(with different seeds) for each metric on the dev set of the benchmark,
obtained with an uncased BERT base model (the bert-base-uncased checkpoint)
(source: https://github.com/huggingface/transformers/tree/master/examples#glue).
Task    Metric                          Result
CoLA    Matthews corr.                  48.87
SST-2   Accuracy                        91.74
MRPC    F1/Accuracy                     90.70/86.27
STS-B   Pearson/Spearman corr.          91.39/91.04
QQP     Accuracy/F1                     90.79/87.66
MNLI    Matched acc./Mismatched acc.    83.70/84.83
QNLI    Accuracy                        89.31
RTE     Accuracy                        71.43
WNLI    Accuracy                        43.66
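
Example of how this model might be instantiated and trained from a script.
This is a minimal sketch: the config file name, its 'model'/'trainer' layout,
and the data paths are illustrative assumptions; only the fields read by the
class below (task_name, supported_tasks, dataset, train_ds, validation_ds,
tokenizer, language_model) are actually required.

    from omegaconf import OmegaConf
    from pytorch_lightning import Trainer

    from nemo.collections.nlp.models.glue_benchmark.glue_benchmark_model import GLUEModel

    # hypothetical config file following the usual NeMo model/trainer split
    cfg = OmegaConf.load('glue_benchmark_config.yaml')
    cfg.model.task_name = 'mrpc'
    cfg.model.dataset.data_dir = '/path/to/MRPC'  # folder containing train.tsv and dev.tsv

    trainer = Trainer(**cfg.trainer)
    model = GLUEModel(cfg.model, trainer=trainer)
    trainer.fit(model)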

'''


class GLUEModel(NLPModel):
    @property
    def input_types(self) -> Optional[Dict[str, NeuralType]]:
        return self.bert_model.input_types

    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        return self.pooler.output_types

    def __init__(self, cfg: DictConfig, trainer: Trainer = None):
        """
        Initializes model to use BERT model for GLUE tasks.
        """
        if cfg.task_name not in cfg.supported_tasks:
            raise ValueError(f'{cfg.task_name} not in supported tasks. Choose from {cfg.supported_tasks}')
        self.task_name = cfg.task_name

        # needed to setup validation on multiple datasets
        # MNLI task has two separate dev sets: matched and mismatched
        if not self._is_model_being_restored():
            if self.task_name == "mnli":
                cfg.validation_ds.ds_item = [
                    os.path.join(cfg.dataset.data_dir, 'dev_matched.tsv'),
                    os.path.join(cfg.dataset.data_dir, 'dev_mismatched.tsv'),
                ]
            else:
                cfg.validation_ds.ds_item = os.path.join(cfg.dataset.data_dir, cfg.validation_ds.ds_item)
            cfg.train_ds.ds_item = os.path.join(cfg.dataset.data_dir, cfg.train_ds.ds_item)
            logging.info(f'Using {cfg.validation_ds.ds_item} for model evaluation.')

        self.setup_tokenizer(cfg.tokenizer)
        super().__init__(cfg=cfg, trainer=trainer)

        num_labels = GLUE_TASKS_NUM_LABELS[self.task_name]

        self.bert_model = get_lm_model(
            pretrained_model_name=cfg.language_model.pretrained_model_name,
            config_file=cfg.language_model.config_file,
            config_dict=OmegaConf.to_container(cfg.language_model.config) if cfg.language_model.config else None,
            checkpoint_file=cfg.language_model.lm_checkpoint,
            vocab_file=cfg.tokenizer.vocab_file,
        )

        # uses [CLS] token for classification (the first token)
        if self.task_name == "sts-b":
            self.pooler = SequenceRegression(hidden_size=self.bert_model.config.hidden_size)
            self.loss = MSELoss()
        else:
            self.pooler = SequenceClassifier(
                hidden_size=self.bert_model.config.hidden_size, num_classes=num_labels, log_softmax=False
            )
            self.loss = CrossEntropyLoss()

    def update_data_dir(self, data_dir: str) -> None:
        """
        Update the data directory and rebuild the train/validation file paths in the config.

        Args:
            data_dir: path to data directory
        """
        self._cfg.dataset.data_dir = data_dir
        logging.info(f'Setting model.dataset.data_dir to {data_dir}.')

        if self.task_name == "mnli":
            self._cfg.validation_ds.ds_item = [
                os.path.join(data_dir, 'dev_matched.tsv'),
                os.path.join(data_dir, 'dev_mismatched.tsv'),
            ]
        else:
            self._cfg.validation_ds.ds_item = os.path.join(data_dir, 'dev.tsv')
        self._cfg.train_ds.ds_item = os.path.join(data_dir, 'train.tsv')
        logging.info(f'Using {self._cfg.validation_ds.ds_item} for model evaluation.')

    @typecheck()
    def forward(self, input_ids, token_type_ids, attention_mask):
        hidden_states = self.bert_model(
            input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
        )
        output = self.pooler(hidden_states=hidden_states)
        return output

    def training_step(self, batch, batch_idx):
        input_ids, input_type_ids, input_mask, labels = batch
        model_output = self(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)

        if self.task_name == "sts-b":
            loss = self.loss(preds=model_output, labels=labels)
        else:
            loss = self.loss(logits=model_output, labels=labels)

        lr = self._optimizer.param_groups[0]['lr']
        self.log('train_loss', loss)
        self.log('lr', lr, prog_bar=True)

        return {
            'loss': loss,
            'lr': lr,
        }

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        input_ids, input_type_ids, input_mask, labels = batch
        model_output = self(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)

        if self.task_name == "sts-b":
            val_loss = self.loss(preds=model_output, labels=labels)
        else:
            val_loss = self.loss(logits=model_output, labels=labels)

        if self.task_name != 'sts-b':
            model_output = torch.argmax(model_output, 1)

        eval_tensors = {'preds': model_output, 'labels': labels}
        return {'val_loss': val_loss, 'eval_tensors': eval_tensors}

    def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0):
        """
        Called at the end of validation to aggregate outputs.
        outputs: list of individual outputs of each validation step.
        """
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        preds = torch.cat([x['eval_tensors']['preds'] for x in outputs])
        labels = torch.cat([x['eval_tensors']['labels'] for x in outputs])

        # gather predictions and labels from all workers when running distributed
        all_preds = []
        all_labels = []
        if torch.distributed.is_initialized():
            world_size = torch.distributed.get_world_size()
            for ind in range(world_size):
                all_preds.append(torch.empty_like(preds))
                all_labels.append(torch.empty_like(labels))
            torch.distributed.all_gather(all_preds, preds)
            torch.distributed.all_gather(all_labels, labels)
        else:
            all_preds.append(preds)
            all_labels.append(labels)

        tensorboard_logs = {}
        if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
            preds = []
            labels = []
            for p in all_preds:
                preds.extend(tensor2list(p))
            for l in all_labels:
                labels.extend(tensor2list(l))
            tensorboard_logs = compute_metrics(self.task_name, np.array(preds), np.array(labels))
            val_name = self._validation_names[dataloader_idx].upper()
            logging.info(f'{val_name} evaluation: {tensorboard_logs}')

            # write labels and predictions to a file if output_dir is specified in the config
            output_dir = self._cfg.output_dir
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
                filename = os.path.join(output_dir, f'{self.task_name}_{val_name}.txt')
                logging.info(f'Saving labels and predictions to {filename}')
                with open(filename, 'w') as f:
                    f.write('labels\t' + list2str(labels) + '\n')
                    f.write('preds\t' + list2str(preds) + '\n')

        tensorboard_logs['val_loss'] = avg_loss
        for key in tensorboard_logs:
            self.log(f'{key}', tensorboard_logs[key], prog_bar=True)

        return {'val_loss': avg_loss, 'log': tensorboard_logs}

    def setup_training_data(self, train_data_config: Optional[DictConfig] = None):
        if train_data_config is None:
            train_data_config = self._cfg.train_ds
        self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config)

    def setup_validation_data(self, val_data_config: Optional[DictConfig] = None):
        if val_data_config is None:
            val_data_config = self._cfg.validation_ds
        self._validation_dl = self._setup_dataloader_from_config(cfg=val_data_config)

    def setup_multiple_validation_data(self, val_data_config: Union[DictConfig, Dict] = None):
        if val_data_config is None:
            val_data_config = self._cfg.validation_ds
        return super().setup_multiple_validation_data(val_data_config)

    def _setup_dataloader_from_config(self, cfg: DictConfig):
        file_name = cfg.ds_item
        if not os.path.exists(file_name):
            raise FileNotFoundError(
                "GLUE datasets not found. For more details on how to get the data, see: "
                "https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e"
            )

        dataset = GLUEDataset(
            file_name=file_name,
            task_name=self.task_name,
            tokenizer=self.tokenizer,
            max_seq_length=self._cfg.dataset.max_seq_length,
            use_cache=self._cfg.dataset.use_cache,
        )
        return torch.utils.data.DataLoader(
            dataset=dataset,
            collate_fn=dataset.collate_fn,
            batch_size=cfg.batch_size,
            shuffle=cfg.shuffle,
            num_workers=self._cfg.dataset.num_workers,
            pin_memory=self._cfg.dataset.pin_memory,
            drop_last=self._cfg.dataset.drop_last,
        )

    @classmethod
    def list_available_models(cls) -> Optional[Dict[str, str]]:
        pass