Skip to content

Transcript extraction

extract_default_transcript_sequences(transcript_info, fasta_records, output_file)

Extracts default transcript sequences from the provided transcript information and writes them to an output file.

Parameters:

Name Type Description Default
transcript_info dict

Dictionary containing transcript and exon information.

required
fasta_records NvFaidx

Indexed FASTA records.

required
output_file TextIO

File object to write the output sequences.

required
Source code in bionemo/evo2/data/transcript_extraction.py
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
def extract_default_transcript_sequences(transcript_info, fasta_records, output_file):
    """Extracts default transcript sequences from the provided transcript information and writes them to an output file.

    Args:
        transcript_info (dict): Dictionary containing transcript and exon information.
        fasta_records (NvFaidx): Indexed FASTA records.
        output_file (TextIO): File object to write the output sequences.
    """
    for transcript_id in transcript_info["transcripts"]:
        gene_id = transcript_info["transcript2gene"][transcript_id]
        this_exons = sorted(transcript_info["transcript2exon"][transcript_id], key=lambda x: x[-1])

        seqname = None
        exon_qc_failed = False
        if len(this_exons) > 1:
            for i in range(1, len(this_exons)):
                this_exon = this_exons[i]
                prev_exon = this_exons[i - 1]
                this_coords = transcript_info["exons"][this_exon]
                prev_coords = transcript_info["exons"][prev_exon]
                if this_coords["strand"] != prev_coords["strand"]:
                    exon_qc_failed = True
                if this_coords["strand"] == "+" and this_coords["start"] < prev_coords["start"]:
                    exon_qc_failed = True
                if this_coords["strand"] == "-" and this_coords["start"] > prev_coords["start"]:
                    exon_qc_failed = True
                if this_coords["seqname"] != prev_coords["seqname"]:
                    exon_qc_failed = True

        if exon_qc_failed:
            continue

        transcript_seq = ""
        for exon in this_exons:
            coords = transcript_info["exons"][exon]
            if seqname is None:
                seqname = coords["seqname"]
            exon_seq = str(fasta_records[coords["seqname"]][coords["start"] : coords["end"]])
            if coords["strand"] == "-":
                exon_seq = reverse_sequence(complement_sequence(exon_seq))
            transcript_seq += exon_seq

        print(f">{seqname}|{gene_id}|{transcript_id}\n{transcript_seq}", file=output_file)

extract_stitched_transcript_sequences(transcript_info, fasta_records, output_file, stitch_token='@', promoter_size=1024, intron_window=32, overlap=False)

Extracts stitched transcript sequences from the provided transcript information and writes them to an output file.

The "stitched" word refers to the process of combining sequences from different regions of the genome to form a single, continuous transcript sequence. This includes: Promoter Region: A specified number of base pairs (bp) upstream of the transcript start site. Exons: The coding regions of the transcript. Intron Windows: A specified number of bp from the neighboring introns around each exon.

The stitch_token is used to denote the boundaries between these regions in the stitched transcript sequences.

Parameters:

Name Type Description Default
transcript_info dict

Dictionary containing transcript and exon information.

required
fasta_records NvFaidx

Indexed FASTA records.

required
output_file TextIO

File object to write the output sequences.

required
stitch_token str

Token to use for stitching sequences. Defaults to "@".

'@'
promoter_size int

Number of bp to include in the promoter region. Defaults to 1024.

1024
intron_window int

Number of bp to include from neighboring introns. Defaults to 32.

32
overlap bool

Whether to allow overlap of neighboring intron windows. Defaults to False.

False
Source code in bionemo/evo2/data/transcript_extraction.py
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
def extract_stitched_transcript_sequences(
    transcript_info, fasta_records, output_file, stitch_token="@", promoter_size=1024, intron_window=32, overlap=False
):
    """Extracts stitched transcript sequences from the provided transcript information and writes them to an output file.

    The "stitched" word refers to the process of combining sequences from different regions of the genome to form a single,
    continuous transcript sequence.
    This includes:
    Promoter Region: A specified number of base pairs (bp) upstream of the transcript start site.
    Exons: The coding regions of the transcript.
    Intron Windows: A specified number of bp from the neighboring introns around each exon.

    The stitch_token is used to denote the boundaries between
    these regions in the stitched transcript sequences.

    Args:
        transcript_info (dict): Dictionary containing transcript and exon information.
        fasta_records (NvFaidx): Indexed FASTA records.
        output_file (TextIO): File object to write the output sequences.
        stitch_token (str, optional): Token to use for stitching sequences. Defaults to "@".
        promoter_size (int, optional): Number of bp to include in the promoter region. Defaults to 1024.
        intron_window (int, optional): Number of bp to include from neighboring introns. Defaults to 32.
        overlap (bool, optional): Whether to allow overlap of neighboring intron windows. Defaults to False.
    """
    for transcript_id in transcript_info["transcripts"]:
        gene_id = transcript_info["transcript2gene"][transcript_id]
        this_exons = sorted(transcript_info["transcript2exon"][transcript_id], key=lambda x: x[-1])

        exon_qc_failed = False
        if len(this_exons) > 1:
            for i in range(1, len(this_exons)):
                this_exon = this_exons[i]
                prev_exon = this_exons[i - 1]
                this_coords = transcript_info["exons"][this_exon]
                prev_coords = transcript_info["exons"][prev_exon]
                if this_coords["strand"] != prev_coords["strand"]:
                    exon_qc_failed = True
                if this_coords["strand"] == "+" and this_coords["start"] < prev_coords["start"]:
                    exon_qc_failed = True
                if this_coords["strand"] == "-" and this_coords["start"] > prev_coords["start"]:
                    exon_qc_failed = True
                if this_coords["seqname"] != prev_coords["seqname"]:
                    exon_qc_failed = True

        if exon_qc_failed:
            continue

        transcript_seq = ""
        seqname = None
        for i in range(len(this_exons)):
            # Previous Exon
            prev_exon = this_exons[i - 1] if i > 0 else None
            prev_coords = transcript_info["exons"].get(prev_exon, None)
            # Current Exon
            cur_exon = this_exons[i]
            cur_coords = transcript_info["exons"].get(cur_exon, None)
            exon_number = cur_exon[-1]
            if seqname is None:
                seqname = cur_coords["seqname"]
            # Next Exon
            next_exon = this_exons[i + 1] if i < len(this_exons) - 1 else None
            next_coords = transcript_info["exons"].get(next_exon, None)
            # Extract the stitched spliced sequence without overlapping intron windows.
            intron_window_left = (
                min(intron_window, math.floor(abs(cur_coords["start"] - prev_coords["end"]) / 2))
                if not overlap and prev_coords is not None
                else intron_window
            )
            intron_window_right = (
                min(intron_window, math.ceil(abs(next_coords["start"] - cur_coords["end"]) / 2))
                if not overlap and next_coords is not None
                else intron_window
            )
            if cur_coords["strand"] == "+" and exon_number == 1:
                exon_start = cur_coords["start"] - promoter_size
                exon_end = cur_coords["end"] + intron_window_right
            elif cur_coords["strand"] == "-" and exon_number == 1:
                exon_start = cur_coords["start"] - intron_window_left
                exon_end = cur_coords["end"] + promoter_size
            else:
                exon_start = cur_coords["start"] - intron_window_left
                exon_end = cur_coords["end"] + intron_window_right
            exon_seq = str(fasta_records[cur_coords["seqname"]][exon_start:exon_end])
            if cur_coords["strand"] == "-":
                exon_seq = stitch_token + reverse_sequence(complement_sequence(exon_seq))
            transcript_seq += exon_seq

        if stitch_token and len(stitch_token) > 0:
            transcript_seq = transcript_seq[len(stitch_token) :]

        print(f">{seqname}|{gene_id}|{transcript_id}\n{transcript_seq}", file=output_file)

extract_transcript_exons(gtf_path, only_longest_transcript)

Extracts transcript exons from a GTF file and optionally keeps only the longest transcript per gene.

Parameters:

Name Type Description Default
gtf_path str

Path to the GTF file.

required
only_longest_transcript bool

Whether to keep only the longest transcript per gene.

required

Returns:

Name Type Description
dict

A dictionary containing transcript and exon information.

Source code in bionemo/evo2/data/transcript_extraction.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
def extract_transcript_exons(gtf_path: str, only_longest_transcript: bool):
    """Extracts transcript exons from a GTF file and optionally keeps only the longest transcript per gene.

    Args:
        gtf_path (str): Path to the GTF file.
        only_longest_transcript (bool): Whether to keep only the longest transcript per gene.

    Returns:
        dict: A dictionary containing transcript and exon information.
    """
    genes = defaultdict(set)
    gene2transcripts = defaultdict(set)
    transcripts = {}
    exons = {}
    exon2transcript = {}
    transcript2gene = {}
    transcript2exon = defaultdict(set)
    skip_transcripts = set()

    gtf_fields = ["seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"]
    with open(gtf_path) as infile:
        for line in infile:
            # skip header lines
            if line.startswith("#"):
                continue
            line = line.strip().split("\t")
            if len(line) < 9:
                continue

            # parse the attributes into a dictionary
            line = dict(zip(gtf_fields, line))
            attribs = parse_gtf_attributes(line["attribute"])

            if line["feature"] == "gene":
                contig, start, end, strand = line["seqname"], line["start"], line["end"], line["strand"]
                start, end = int(line["start"]) - 1, int(line["end"])
                gene_id = attribs.get("gene_id", None)
                if not gene_id:
                    continue
                genes[gene_id].add((contig, start, end, strand))

            elif line["feature"] == "exon":
                contig, start, end, strand = line["seqname"], line["start"], line["end"], line["strand"]
                start, end = int(line["start"]) - 1, int(line["end"])
                gene_id = attribs.get("gene_id", None)
                if not gene_id:
                    continue
                transcript_id = attribs["transcript_id"]
                gene2transcripts[gene_id].add(transcript_id)

                # Skip exons that have already been handled and are likely errors
                if transcript_id in skip_transcripts:
                    continue
                exon_number = int(attribs["exon_number"])

                exon_id = (gene_id, transcript_id, exon_number)
                if exon_id in exons:
                    del exons[exon_id]
                    if transcript_id in transcripts:
                        del transcripts[transcript_id]
                    if transcript_id in transcript2exon:
                        del transcript2exon[transcript_id]
                    skip_transcripts.add(transcript_id)
                    continue

                exons[exon_id] = {"seqname": contig, "start": start, "end": end, "strand": strand}
                if exon_id in exon2transcript:
                    raise Exception("Exon Already Exists in exon2transcript")
                exon2transcript[exon_id] = transcript_id
                transcript2exon[transcript_id].add(exon_id)

            elif line["feature"] == "transcript":
                contig, start, end, strand = line["seqname"], line["start"], line["end"], line["strand"]
                start, end = int(line["start"]) - 1, int(line["end"])
                gene_id = attribs.get("gene_id", None)
                if not gene_id:
                    continue
                gbkey = attribs["gbkey"]
                transcript_biotype = attribs["transcript_biotype"]
                transcript_id = attribs["transcript_id"]
                if transcript_id in skip_transcripts:
                    continue

                transcripts[transcript_id] = {
                    "seqname": contig,
                    "start": start,
                    "end": end,
                    "strand": strand,
                    "gbkey": gbkey,
                    "transcript_biotype": transcript_biotype,
                }
                transcript2gene[transcript_id] = gene_id
                gene2transcripts[gene_id].add(transcript_id)

    if only_longest_transcript:
        transcript_lengths = defaultdict(int)
        for exon in exons:
            transcript_lengths[exon[1]] += exons[exon]["end"] - exons[exon]["start"]

        keep_transcripts = {}
        keep_exons = {}
        keep_exon2transcript = {}
        keep_transcript2gene = {}
        keep_transcript2exon = defaultdict(set)
        keep_skip_transcripts = set()

        for gene in gene2transcripts:
            this_transcripts = gene2transcripts[gene]
            this_transcript_lengths = [(transcript, transcript_lengths[transcript]) for transcript in this_transcripts]
            longest_transcript = max(this_transcript_lengths, key=lambda x: x[1])[0]
            keep_transcripts[longest_transcript] = dict(transcripts[longest_transcript])
            for exon in transcript2exon[longest_transcript]:
                keep_exons[exon] = dict(exons[exon])
                keep_exon2transcript[exon] = longest_transcript
                keep_transcript2exon[longest_transcript].add(exon)
                keep_transcript2gene[longest_transcript] = gene

        transcripts = keep_transcripts
        exons = keep_exons
        exon2transcript = keep_exon2transcript
        transcript2gene = keep_transcript2gene
        transcript2exon = keep_transcript2exon
        skip_transcripts = keep_skip_transcripts

    return {
        "transcripts": transcripts,
        "exons": exons,
        "exon2transcript": exon2transcript,
        "transcript2gene": transcript2gene,
        "transcript2exon": transcript2exon,
    }

main()

Entry point for the script. Parses arguments and runs the extraction process.

Source code in bionemo/evo2/data/transcript_extraction.py
413
414
415
416
417
418
def main():
    """Entry point for the script. Parses arguments and runs the extraction process."""
    args = parse_args()
    if args.verbose:
        logging.info(args)
    run(args)

parse_args()

Parses command line arguments for the transcript extraction script.

Returns:

Type Description

argparse.Namespace: Parsed command line arguments.

Source code in bionemo/evo2/data/transcript_extraction.py
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
def parse_args():
    """Parses command line arguments for the transcript extraction script.

    Returns:
        argparse.Namespace: Parsed command line arguments.
    """
    ap = argparse.ArgumentParser(description="Extract spliced transcripts from a FASTA and GTF.")
    ap.add_argument("--fasta-path", type=str, required=True, help="Path to FASTA file to extract transcripts from.")
    ap.add_argument(
        "--gtf-path",
        type=str,
        required=True,
        help="Path to gene transfer format (GTF) file associated with the FASTA.",
    )
    ap.add_argument("--output-path", type=str, default=None, help="Path to output FASTA file.")
    ap.add_argument(
        "--transcript-type",
        type=str,
        default="default",
        choices=["default", "stitched"],
        help="Type of transcript to extract from the GTF and FASTA files for splicing. 'Stitched' transcripts include 1024 bp of sequence from the promoter and 32 bp around each exon.",
    )
    ap.add_argument(
        "--stitched-promoter",
        type=int,
        default=1024,
        help="Number of bp to include in the promoter region when --transcript-type=stitched is used. Defaults to 1024.",
    )
    ap.add_argument(
        "--stitched-intron",
        type=int,
        default=32,
        help="Number of bp to include from neighboring introns when --transcript-type=stitched is used. Defaults to 32.",
    )
    ap.add_argument(
        "--stitched-overlap",
        action="store_true",
        help="Allow overlap of neighboring intron windows when --transcript-type=stitched is used. Defaults to False, i.e. prevents overlap by shortening the intron windows for a contiguous splice.",
    )
    ap.add_argument(
        "--only-longest-transcript", action="store_true", help="Only extract the longest transcript per gene."
    )
    ap.add_argument("-v", "--verbose", action="store_true", help="Turn on verbose log messages.")
    return ap.parse_args()

parse_gtf_attributes(attributes)

Parses the attributes field of a GTF file line into a dictionary.

Parameters:

Name Type Description Default
attributes str

The attributes field from a GTF file line.

required

Returns:

Name Type Description
dict

A dictionary of attribute key-value pairs.

Source code in bionemo/evo2/data/transcript_extraction.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def parse_gtf_attributes(attributes: str):
    """Parses the attributes field of a GTF file line into a dictionary.

    Args:
        attributes (str): The attributes field from a GTF file line.

    Returns:
        dict: A dictionary of attribute key-value pairs.
    """
    # Split on all semicolons that are not inside quotes
    attributes = re.split(r';(?=(?:[^"]*"[^"]*")*[^"]*$)', attributes)
    out = {}
    for a in attributes:
        if len(a) == 0:
            continue
        key = a.split()[0]
        value = a.split('"')[1]
        out[key] = value
    return out

run(args)

Main function to run the transcript extraction process based on command line arguments.

Parameters:

Name Type Description Default
args Namespace

Parsed command line arguments.

required
Source code in bionemo/evo2/data/transcript_extraction.py
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
def run(args):
    """Main function to run the transcript extraction process based on command line arguments.

    Args:
        args (argparse.Namespace): Parsed command line arguments.
    """
    with open(args.output_path, "w") if args.output_path is not None else sys.stdout as output_file:
        if args.verbose:
            logging.info("Indexing FASTA file...")

        fasta_index = NvFaidx(args.fasta_path)

        if args.transcript_type == "default":
            if args.verbose:
                logging.info("Extracting default transcripts...")
                if args.only_longest_transcript:
                    logging.info("Only extracting the longest transcript per gene.")
                else:
                    logging.info("Extracting all transcripts regardless of length.")

        elif args.transcript_type == "stitched":
            if args.verbose:
                logging.info("Extracting stitched transcripts...")
                if args.only_longest_transcript:
                    logging.info("Only extracting the longest transcript per gene.")
                else:
                    logging.info("Extracting all transcripts regardless of length.")

        transcript_info = extract_transcript_exons(args.gtf_path, args.only_longest_transcript)

        if args.transcript_type == "default":
            extract_default_transcript_sequences(transcript_info, fasta_index, output_file)
        elif args.transcript_type == "stitched":
            extract_stitched_transcript_sequences(
                transcript_info,
                fasta_index,
                output_file,
                promoter_size=args.stitched_promoter,
                intron_window=args.stitched_intron,
                overlap=args.stitched_overlap,
            )