MolMIM ZINC15 Training Dataset Setup#

Preparation of a drug-like subset of ZINC15 dataset#

The ZINC15 dataset is a freely available catalog of commercially purchasable chemical compounds. Before training chemical language models such as MegaMolBART or MolMIM, it is desirable to filter and standardize the raw data. In this notebook, we walk through the steps to create a drug-like subset of the ZINC15 dataset.

import os
import yaml
import pandas as pd
from tqdm.notebook import tqdm

tqdm.pandas()

1. Download files#

Raw data files will be downloaded to /workspace/bionemo/data/raw/

# We will download just 1 file for now and illustrate the filtering steps on it

! mkdir -p /workspace/bionemo/data/raw
! wget http://files.docking.org/2D/AB/ABAC.txt -O /workspace/bionemo/data/raw/ABAC.txt

2. Process files#

2.1 Compute molecular properties#

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, QED

def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Return NaN for molecules that RDKit cannot parse
        return {
            "Molecular Weight": np.nan,
            "cLogP": np.nan,
            "Number of Hydrogen Bond Donors": np.nan,
            "Number of Hydrogen Bond Acceptors": np.nan,
            "QED Score": np.nan,
            "canon_smiles": np.nan,
        }

    mol_weight = Descriptors.ExactMolWt(mol)
    clogp = Crippen.MolLogP(mol)
    hbd = Descriptors.NumHDonors(mol)
    hba = Descriptors.NumHAcceptors(mol)
    qed_score = QED.qed(mol)
    canonical_smiles = Chem.MolToSmiles(mol)  # Generate canonical SMILES

    return {
        "Molecular Weight": mol_weight,
        "cLogP": clogp,
        "Number of Hydrogen Bond Donors": hbd,
        "Number of Hydrogen Bond Acceptors": hba,
        "QED Score": qed_score,
        "canon_smiles": canonical_smiles,  # Add canonical SMILES to the output
    }
zinc_file = "/workspace/bionemo/data/raw/ABAC.txt"
input_df = pd.read_csv(zinc_file, sep="\t", usecols=["zinc_id", "smiles"])

results = []
for zinc_id, smiles in tqdm(zip(input_df["zinc_id"], input_df["smiles"]), total=len(input_df)):
    descriptor = calculate_descriptors(smiles)
    descriptor['zinc_id'] = zinc_id  # Keep the original zinc_id
    descriptor['smiles'] = smiles  # Keep the original input SMILES
    results.append(descriptor)
prop_df = pd.DataFrame(results) # computed properties are stored in this DataFrame

2.2 Get vocabulary compliance#

Get the vocabulary of the SMILES strings and ensure that it is a subset of the allowed vocabulary.

from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer
# File for allowed vocabulary
# Note: This file can be changed according to the user's needs
TOKENIZER_VOCAB = os.environ["BIONEMO_HOME"] + "/tokenizers/molecule/megamolbart/vocab/megamolbart.vocab"
MAX_TOKEN_LENGTH = 126

TOKENIZER_MODEL = os.environ["BIONEMO_HOME"] + "/tokenizers/molecule/megamolbart/vocab/megamolbart.model"
regex_tokenizer = RegExTokenizer().load_tokenizer(regex_file=TOKENIZER_MODEL, vocab_file=TOKENIZER_VOCAB)

with open(TOKENIZER_VOCAB, "r") as f:
    allowed_vocab = f.readlines()
curr_allowed_vocab = set([v.strip() for v in allowed_vocab])
def vocab_compliance_check(smiles, tokenizer, vocab_allowed, max_token_length):
    """Checks if the SMILES string only contains vocabulary in `vocab_allowed`
    and if the token length is less than or equal to `max_token_length`"""
    tokens = tokenizer.text_to_tokens(smiles)
    return set(tokens).issubset(vocab_allowed) and len(tokens) <= max_token_length
prop_df["vocab_compliant"] = prop_df["canon_smiles"].progress_apply(lambda smi: vocab_compliance_check(
    smi, regex_tokenizer, curr_allowed_vocab, MAX_TOKEN_LENGTH))
prop_df.head()
prop_df["vocab_compliant"].value_counts()

2.3 Filter out undesirable molecules#

In this step, we filter out non-drug-like molecules and molecules that do not adhere to the allowed vocabulary and token length. Drug-likeness is estimated using the following criteria; a short worked example on a single molecule follows the list.

  1. Lipinski’s rule of 5 compliance

  2. Quantitative Estimate of Druglikeness (QED score) with a cutoff of 0.5
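To make the criteria concrete, here is a minimal illustrative check of Lipinski's rule of 5 and the QED cutoff on a single molecule (aspirin, chosen purely as an example); it reuses the RDKit functions imported above.

# Example: evaluate the drug-likeness criteria for one molecule (aspirin, for illustration)
example_mol = Chem.MolFromSmiles("CC(=O)Oc1ccccc1C(=O)O")
print("Molecular Weight <= 500:", Descriptors.ExactMolWt(example_mol) <= 500)
print("H-bond donors <= 5:", Descriptors.NumHDonors(example_mol) <= 5)
print("H-bond acceptors <= 10:", Descriptors.NumHAcceptors(example_mol) <= 10)
print("cLogP <= 5:", Crippen.MolLogP(example_mol) <= 5)
print("QED >= 0.5:", QED.qed(example_mol) >= 0.5)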

filter_param_string = """Molecular Weight: # Lipinski
  min: null
  max: 500
Number of Hydrogen Bond Donors: # Lipinski
  min: null
  max: 5
Number of Hydrogen Bond Acceptors: # Lipinski
  min: null
  max: 10
cLogP: # Lipinski
  min: null
  max: 5
QED Score:
  min: 0.5
  max: null
"""
filter_params = yaml.safe_load(filter_param_string)
def make_filter_mask(df, prop, min_val=None, max_val=None):
    """Returns filter mask by filtering on min_val and max_val of a prop (inclusive range) """
    if min_val is not None and max_val is not None:
        return (df[prop] >= min_val) & (df[prop] <= max_val)
    if min_val is None and max_val is not None:
        return df[prop] <= max_val
    if min_val is not None and max_val is None:
        return df[prop] >= min_val
    if min_val is None and max_val is None:
        raise ValueError(f"Both min_val and max_val cannot be None")
n_init = len(prop_df)
print(f"Number of molecules before filtering: {n_init}")
filter_mask = pd.Series(True, index=prop_df.index)
for prop in filter_params.keys():
    ifilter = make_filter_mask(prop_df, prop, filter_params[prop]["min"], filter_params[prop]["max"])
    filter_mask = filter_mask & ifilter

prop_filt_df = prop_df[filter_mask & prop_df["vocab_compliant"]][["zinc_id", "canon_smiles"]]
prop_filt_df = prop_filt_df.reset_index(drop=True)
n_filt = len(prop_filt_df)
print(f"Number of molecules left after filtering: {n_filt}")
print(f"Percentage of molecules filtered out: {((n_init-n_filt)/n_init) * 100:.2f}%")

3. Write filtered output to file#

Filtered files will be written to /workspace/bionemo/data/filtered/

! mkdir -p /workspace/bionemo/data/filtered/
# Set name of output file here
processed_filepath = "/workspace/bionemo/data/filtered"
processed_filename = f"{processed_filepath}/ABAC_filtered.txt"
prop_filt_df.to_csv(processed_filename, index=False)

4. Split files into chunks of 10,100,000 molecules#

Dataset shards of 10.1M molecules will be written to /workspace/bionemo/data/split_data/

! mkdir -p /workspace/bionemo/data/split_data/
! cd /workspace/bionemo/data/split_data/; tail -q -n +2 /workspace/bionemo/data/filtered/** | split -d -l 10100000 -a 3
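As an optional check (not part of the original workflow), we can verify the line counts of the resulting shards; with a single filtered file well under 10.1M molecules, this should show one shard named x000.

! wc -l /workspace/bionemo/data/split_data/*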

5. Split data into train/val/test#

# Make train/val_full/test_full sets
split_data_file_path = "/workspace/bionemo/data/split_data/"
output_dir = "/workspace/bionemo/data/"
os.makedirs(f"{output_dir}/train", exist_ok=True)
os.makedirs(f"{output_dir}/val_full", exist_ok=True)
os.makedirs(f"{output_dir}/test_full", exist_ok=True)
# Set parameters for splitting
test_frac = 0.005
val_frac = 0.005
number_of_files = 1

for file_idx in range(number_of_files):
    file = f"x{file_idx:03d}"
    input_file = f"{split_data_file_path}/{file}"
    print(f"Reading input file: {input_file}")
    input_df = pd.read_csv(input_file, header=None, names=['zinc_id', 'canon_smiles'], usecols=[0, 1])

    # Calculate sample sizes before size of dataframe changes
    test_samples = max(int(test_frac * input_df.shape[0]), 1)
    val_samples = max(int(val_frac * input_df.shape[0]), 1)

    test_df = input_df.sample(n=test_samples, random_state=file_idx)
    input_df = input_df.drop(test_df.index)  # remove test data from training data

    val_df = input_df.sample(n=val_samples, random_state=file_idx)
    input_df = input_df.drop(val_df.index)  # remove validation data from training data

    input_df.to_csv(f'{output_dir}/train/{file}.csv', index=False)
    test_df.to_csv(f'{output_dir}/test_full/{file}.csv', index=False)
    val_df.to_csv(f'{output_dir}/val_full/{file}.csv', index=False)
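Optionally, we can confirm the sizes of the resulting splits. The snippet below is a simple check that counts the rows written for the single shard processed in this walkthrough (x000).

# Optional: report the number of molecules in each split for shard x000
for split in ["train", "val_full", "test_full"]:
    split_df = pd.read_csv(f"{output_dir}/{split}/x000.csv")
    print(f"{split}: {len(split_df)} molecules")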

[OPTIONAL] 6. Clustering validation and test sets for faster evaluation#

We can reduce the size of the validation and test sets to speed up in-loop validation and test evaluation. To do this without reducing the diversity of the full validation and test sets, we cluster the molecules and pick exemplars from each cluster to form smaller validation and test sets. Clustering is performed on ECFP4 128-bit fingerprints with the MiniBatchKMeans algorithm. Clustered validation set outputs are written to /workspace/bionemo/data/val/ and clustered test set outputs are written to /workspace/bionemo/data/test/
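Before clustering, it may help to see what a single fingerprint looks like. The short sketch below (illustrative only) computes the ECFP4 128-bit fingerprint for one example molecule using the same RDKit generator that the clustering function relies on.

from rdkit.Chem import rdFingerprintGenerator

# Example: ECFP4 (Morgan radius 2) fingerprint folded to 128 bits, returned as a NumPy array of 0/1
example_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=128)
example_fp = example_gen.GetFingerprintAsNumPy(Chem.MolFromSmiles("CC(=O)Oc1ccccc1C(=O)O"))
print(example_fp.shape, int(example_fp.sum()), "bits set")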

from pathlib import Path
import numpy as np
from rdkit.Chem import rdFingerprintGenerator
from sklearn.cluster import MiniBatchKMeans
def cluster_dataset(file_path, num_files, num_clusters, batch_size, sample_frac):
    """Clusters dataset by ECFP4 128-bit fingerprint similarity
    file_path: Path to the directory containing the split files
    num_files: Number of split files to read
    num_clusters: Number of desired clusters
    batch_size: Batch size for MiniBatchKMeans
    sample_frac: Fraction of each cluster to select"""
    all_files = [Path(f"{file_path}/x{i:03}.csv") for i in range(num_files)]

    compound_df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)

    # Compute ECFP4 (Morgan radius 2) fingerprints folded to 128 bits
    morgan_fp_128 = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=128)
    fps_128 = compound_df["canon_smiles"].progress_apply(lambda s: morgan_fp_128.GetFingerprintAsNumPy(Chem.MolFromSmiles(s)))
    X_fp_128 = np.stack(fps_128.to_list())

    # Perform clustering
    kmeans = MiniBatchKMeans(n_clusters=num_clusters, batch_size=batch_size,
                             random_state=0, n_init="auto").fit(X_fp_128)
    compound_df["cluster_label"] = kmeans.labels_
    # Sample a fraction of each cluster to build the reduced set
    sample_df = compound_df.groupby('cluster_label').apply(lambda x: x.sample(frac=sample_frac, random_state=0))

    return sample_df.reset_index(drop=True)
# Set fraction of each cluster to sample for the reduced validation set
val_frac = 0.2  # placeholder example value; adjust as needed

number_of_files = 176
num_clusters = 10000
batch_size = 200000
val_cluster_df = cluster_dataset("/workspace/bionemo/data/val_full/", number_of_files, 
                                 num_clusters, batch_size, val_frac)

os.makedirs("/workspace/bionemo/data/val/", exist_ok=True)
val_cluster_df[["zinc_id", "canon_smiles"]].to_csv("/workspace/bionemo/data/val/val_clustered.csv", index=False)
# Set fraction of each cluster to sample for the reduced test set
test_frac = 0.2  # placeholder example value; adjust as needed

number_of_files = 176
num_clusters = 10000
batch_size = 200000
test_cluster_df = cluster_dataset("/workspace/bionemo/data/test_full/", number_of_files, 
                                 num_clusters, batch_size, test_frac)

os.makedirs("/workspace/bionemo/data/test/", exist_ok=True)
test_cluster_df[["zinc_id", "canon_smiles"]].to_csv("/workspace/bionemo/data/test/test_clustered.csv", index=False)