Important
NeMo 2.0 is an experimental feature and is currently available only in the dev container: nvcr.io/nvidia/nemo:dev. Please refer to the Migration Guide for information on getting started.
Scores
EN
Model Name |
Language |
EuroParl Test Set (en) |
Fisher Test Set (en) |
Librispeech Dev-Clean |
Librispeech Dev-Other |
Librispeech Test-Clean |
Librispeech Test-Other |
MCV Test-Set v11.0 (en) |
MCV Test-Set v8.0 (en) |
MLS Dev (en) |
MLS Test (en) |
NSC Part1 |
NSC Part6 |
Peoples Speech Test v1 |
SLR 83 Test |
SPGI Test |
VoxPopuli Test (en) |
WSJ Dev 93 |
WSJ Eval 92 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
stt_en_citrinet_256 |
en |
4.2 % WER |
10.7 % WER |
4.4 % WER |
10.7 % WER |
||||||||||||||
stt_en_citrinet_512 |
en |
3.7 % WER |
8.9 % WER |
3.7 % WER |
8.9 % WER |
||||||||||||||
stt_en_citrinet_1024 |
en |
3.7 % WER |
8.3 % WER |
3.6 % WER |
7.9 % WER |
||||||||||||||
stt_en_citrinet_256_gamma_0_25 |
en |
4.7 % |
10.6 % |
4.8 % |
10.7 % |
8.3 % |
5.8 % |
3.6 % |
|||||||||||
stt_en_citrinet_512_gamma_0_25 |
en |
4.0 % |
9.0 % |
3.9 % |
9.0 % |
6.9 % |
4.4 % |
3.6 % |
|||||||||||
stt_en_citrinet_1024_gamma_0_25 |
en |
3.4 % |
7.7 % |
3.4 % |
7.6 % |
6.2 % |
4.0 % |
2.5 % |
Model Name |
Language |
EuroParl Test Set (en) |
Fisher Test Set (en) |
Librispeech Dev-Clean |
Librispeech Dev-Other |
Librispeech Test-Clean |
Librispeech Test-Other |
MCV Test-Set v11.0 (en) |
MCV Test-Set v8.0 (en) |
MLS Dev (en) |
MLS Test (en) |
NSC Part1 |
NSC Part6 |
Peoples Speech Test v1 |
SLR 83 Test |
SPGI Test |
VoxPopuli Test (en) |
WSJ Dev 93 |
WSJ Eval 92 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
stt_en_conformer_ctc_small |
en |
3.6 |
8.1 |
3.7 |
8.1 |
||||||||||||||
stt_en_conformer_ctc_medium |
en |
2.5 |
5.8 |
2.6 |
5.9 |
||||||||||||||
stt_en_conformer_ctc_large |
en |
1.9 |
4.4 |
2.1 |
4.5 |
||||||||||||||
stt_en_conformer_ctc_xlarge |
en |
1.77 % |
3.79 % |
2.00 % |
3.74 % |
7.88 % |
5.99 % |
6.44 % |
22.90 % |
5.50 % |
2.36 % |
||||||||
stt_en_conformer_ctc_small_ls |
en |
3.3 |
8.8 |
3.4 |
8.8 |
||||||||||||||
stt_en_conformer_ctc_medium_ls |
en |
2.7 |
7.4 |
3.0 |
7.3 |
||||||||||||||
stt_en_conformer_ctc_large_ls |
en |
2.4 |
6.2 |
2.7 |
6.0 |
||||||||||||||
stt_en_conformer_transducer_small |
en |
2.8 |
6.6 |
2.5 |
6.6 |
||||||||||||||
stt_en_conformer_transducer_medium |
en |
2.0 |
4.6 |
2.1 |
4.7 |
||||||||||||||
stt_en_conformer_transducer_large |
en |
1.6 |
3.5 |
1.7 |
3.7 |
||||||||||||||
stt_en_conformer_transducer_large_ls |
en |
2.1 |
5.0 |
2.3 |
5.1 |
||||||||||||||
stt_en_conformer_transducer_xlarge |
en |
1.48 % |
2.95 % |
1.62 % |
3.01 % |
6.46 % |
4.59 % |
5.32 % |
5.70 % |
6.47 % |
21.32 % |
2.05 % |
1.17 % |
||||||
stt_en_conformer_transducer_xxlarge |
en |
1.52 % |
3.09 % |
1.72 % |
3.14 % |
5.29 % |
5.85 % |
6.64 % |
2.42 % |
1.49 % |
|||||||||
stt_en_fastconformer_hybrid_large_streaming_80ms (CTC) |
en |
3.5 % |
8.1 % |
10.2 % |
7.2 % |
3.5 % |
2.3 % |
||||||||||||
stt_en_fastconformer_hybrid_large_streaming_480ms (CTC) |
en |
3.6 % |
7.5 % |
9.8 % |
7.0 % |
3.5 % |
2.1 % |
||||||||||||
stt_en_fastconformer_hybrid_large_streaming_1040ms (CTC) |
en |
2.7 % |
6.4 % |
9.0 % |
7.0 % |
3.2 % |
1.9 % |
||||||||||||
stt_en_fastconformer_hybrid_large_streaming_80ms (RNNT) |
en |
2.7 % |
6.5 % |
9.1 % |
6.9 % |
3.2 % |
1.9 % |
||||||||||||
stt_en_fastconformer_hybrid_large_streaming_480ms (RNNT) |
en |
2.7 % |
6.1 % |
8.5 % |
6.7 % |
3.1 % |
1.8 % |
||||||||||||
stt_en_fastconformer_hybrid_large_streaming_1040ms (RNNT) |
en |
2.3 % |
5.5 % |
8.0 % |
6.6 % |
2.9 % |
1.6 % |
||||||||||||
stt_en_fastconformer_hybrid_large_streaming_multi (RNNT - 0ms) |
en |
7.0 % |
|||||||||||||||||
stt_en_fastconformer_hybrid_large_streaming_multi (RNNT - 80ms) |
en |
6.4 % |
|||||||||||||||||
stt_en_fastconformer_hybrid_large_streaming_multi (RNNT - 480ms) |
en |
5.7 % |
|||||||||||||||||
stt_en_fastconformer_hybrid_large_streaming_multi (RNNT - 1040ms) |
en |
5.4 % |
|||||||||||||||||
stt_en_fastconformer_hybrid_large_streaming_multi (CTC - 0ms) |
en |
8.4 % |
|||||||||||||||||
stt_en_fastconformer_hybrid_large_streaming_multi (CTC - 80ms) |
en |
7.8 % |
|||||||||||||||||
stt_en_fastconformer_hybrid_large_streaming_multi (CTC - 480ms) |
en |
6.7 % |
|||||||||||||||||
stt_en_fastconformer_hybrid_large_streaming_multi (CTC - 1040ms) |
en |
6.2 % |
Model Name |
Language |
EuroParl Test Set (en) |
Fisher Test Set (en) |
Librispeech Dev-Clean |
Librispeech Dev-Other |
Librispeech Test-Clean |
Librispeech Test-Other |
MCV Test-Set v11.0 (en) |
MCV Test-Set v8.0 (en) |
MLS Dev (en) |
MLS Test (en) |
NSC Part1 |
NSC Part6 |
Peoples Speech Test v1 |
SLR 83 Test |
SPGI Test |
VoxPopuli Test (en) |
WSJ Dev 93 |
WSJ Eval 92 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
stt_en_contextnet_256 |
en |
3.3 % |
7.9 % |
3.3 % |
8.0 % |
9.7 % |
11.0 % |
7.1 % |
4.6 % |
3.2 % |
|||||||||
stt_en_contextnet_512 |
en |
2.0 % |
4.8 % |
2.2 % |
5.0 % |
6.6 % |
7.3 % |
5.9 % |
2.8 % |
1.4 % |
|||||||||
stt_en_contextnet_1024 |
en |
1.7 % |
3.8 % |
1.9 % |
4.0 % |
7.9 % |
5.9 % |
5.2 % |
6.5 % |
21.7 % |
4.7 % |
2.3 % |
1.3 % |
||||||
stt_en_contextnet_256_mls |
en |
9.0 % |
9.2 % |
9.4 % |
10.9 % |
||||||||||||||
stt_en_contextnet_512_mls |
en |
5.2 % |
5.2 % |
5.6 % |
6.6 % |
||||||||||||||
stt_en_contextnet_1024_mls |
en |
4.1 % |
4.2 % |
4.6 % |
5.6 % |
Model Name |
Language |
EuroParl Test Set (en) |
Fisher Test Set (en) |
Librispeech Dev-Clean |
Librispeech Dev-Other |
Librispeech Test-Clean |
Librispeech Test-Other |
MCV Test-Set v11.0 (en) |
MCV Test-Set v8.0 (en) |
MLS Dev (en) |
MLS Test (en) |
NSC Part1 |
NSC Part6 |
Peoples Speech Test v1 |
SLR 83 Test |
SPGI Test |
VoxPopuli Test (en) |
WSJ Dev 93 |
WSJ Eval 92 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
stt_en_fastconformer_ctc_large |
en |
1.9 |
4.2 |
2.1 |
4.2 |
||||||||||||||
stt_en_fastconformer_transducer_large |
en |
2.0 |
3.8 |
1.8 |
3.8 |
||||||||||||||
stt_en_fastconformer_hybrid_large_pc |
en |
8.0 % |
10.3 % |
2.0 % |
4.1 % |
8.2 % |
4.5 % |
4.6 % |
2.3 % |
4.5 % |
Model Name |
Language |
EuroParl Test Set (en) |
Fisher Test Set (en) |
Librispeech Dev-Clean |
Librispeech Dev-Other |
Librispeech Test-Clean |
Librispeech Test-Other |
MCV Test-Set v11.0 (en) |
MCV Test-Set v8.0 (en) |
MLS Dev (en) |
MLS Test (en) |
NSC Part1 |
NSC Part6 |
Peoples Speech Test v1 |
SLR 83 Test |
SPGI Test |
VoxPopuli Test (en) |
WSJ Dev 93 |
WSJ Eval 92 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
stt_en_jasper10x5dr |
en |
3.74 |
10.21 |
Model Name |
Language |
EuroParl Test Set (en) |
Fisher Test Set (en) |
Librispeech Dev-Clean |
Librispeech Dev-Other |
Librispeech Test-Clean |
Librispeech Test-Other |
MCV Test-Set v11.0 (en) |
MCV Test-Set v8.0 (en) |
MLS Dev (en) |
MLS Test (en) |
NSC Part1 |
NSC Part6 |
Peoples Speech Test v1 |
SLR 83 Test |
SPGI Test |
VoxPopuli Test (en) |
WSJ Dev 93 |
WSJ Eval 92 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
stt_en_quartznet15x5 |
en |
4.38 |
11.3 |
Model Name |
Language |
EuroParl Test Set (en) |
Fisher Test Set (en) |
Librispeech Dev-Clean |
Librispeech Dev-Other |
Librispeech Test-Clean |
Librispeech Test-Other |
MCV Test-Set v11.0 (en) |
MCV Test-Set v8.0 (en) |
MLS Dev (en) |
MLS Test (en) |
NSC Part1 |
NSC Part6 |
Peoples Speech Test v1 |
SLR 83 Test |
SPGI Test |
VoxPopuli Test (en) |
WSJ Dev 93 |
WSJ Eval 92 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
stt_en_squeezeformer_ctc_xsmall_ls |
en |
3.6 % |
9.7 % |
3.8 % |
9.4 % |
||||||||||||||
stt_en_squeezeformer_ctc_small_ls |
en |
2.9 % |
7.4 % |
3.1 % |
7.4 % |
||||||||||||||
stt_en_squeezeformer_ctc_small_medium_ls |
en |
2.7 % |
7.0 % |
2.8 % |
7.1 % |
||||||||||||||
stt_en_squeezeformer_ctc_medium_ls |
en |
2.4 % |
6.2 % |
2.6 % |
6.3 % |
||||||||||||||
stt_en_squeezeformer_ctc_medium_large_ls |
en |
2.3 % |
6.0 % |
2.5 % |
5.9 % |
||||||||||||||
stt_en_squeezeformer_ctc_large_ls |
en |
2.3 % |
5.7 % |
2.4 % |
5.7 % |
BE
Model Name |
Language |
MCV Test-Set v10 (be) |
---|---|---|
stt_be_conformer_ctc_large |
be |
4.7 % |
stt_be_conformer_transducer_large |
be |
3.8 % |
BY
Model Name |
Language |
MCV Dev-Set v12.0 (be) |
MCV Test-Set v12.0 (be) |
---|---|---|---|
stt_by_fastconformer_hybrid_large_pc |
by |
2.7 % |
2.7 % |
CA
Model Name |
Language |
MCV Dev-Set (v??) (ca) |
MCV Dev-Set v9.0 (ca) |
MCV Test-Set v9.0 (ca) |
---|---|---|---|---|
stt_ca_conformer_ctc_large |
ca |
4.70 |
4.27 |
|
stt_ca_conformer_transducer_large |
ca |
4.43 |
3.85 |
Model Name |
Language |
MCV Dev-Set (v??) (ca) |
MCV Dev-Set v9.0 (ca) |
MCV Test-Set v9.0 (ca) |
---|---|---|---|---|
stt_ca_quartznet15x5 |
ca |
6.0 |
DE
Model Name |
Language |
MCV Dev-Set (v??) (de) |
MCV Dev-Set v12.0 (de) |
MCV Dev-Set v7.0 (de) |
MCV Test-Set v12.0 (de) |
MCV Test-Set v7.0 (de) |
MLS Dev (de) |
MLS Test (de) |
VoxPopuli Dev (de) |
VoxPopuli Test (de) |
---|---|---|---|---|---|---|---|---|---|---|
stt_de_citrinet_1024 |
de |
6.63 |
7.59 |
4.06 |
5.07 |
12.33 |
10.02 |
Model Name |
Language |
MCV Dev-Set (v??) (de) |
MCV Dev-Set v12.0 (de) |
MCV Dev-Set v7.0 (de) |
MCV Test-Set v12.0 (de) |
MCV Test-Set v7.0 (de) |
MLS Dev (de) |
MLS Test (de) |
VoxPopuli Dev (de) |
VoxPopuli Test (de) |
---|---|---|---|---|---|---|---|---|---|---|
stt_de_conformer_ctc_large |
de |
5.84 |
6.68 |
3.85 |
4.63 |
12.56 |
10.51 |
|||
stt_de_conformer_transducer_large |
de |
4.75 |
5.36 |
3.46 |
4.19 |
11.21 |
9.14 |
Model Name |
Language |
MCV Dev-Set (v??) (de) |
MCV Dev-Set v12.0 (de) |
MCV Dev-Set v7.0 (de) |
MCV Test-Set v12.0 (de) |
MCV Test-Set v7.0 (de) |
MLS Dev (de) |
MLS Test (de) |
VoxPopuli Dev (de) |
VoxPopuli Test (de) |
---|---|---|---|---|---|---|---|---|---|---|
stt_de_contextnet_1024 |
de |
4.76 |
5.5 |
3.53 |
4.2 |
11.32 |
9.4 |
Model Name |
Language |
MCV Dev-Set (v??) (de) |
MCV Dev-Set v12.0 (de) |
MCV Dev-Set v7.0 (de) |
MCV Test-Set v12.0 (de) |
MCV Test-Set v7.0 (de) |
MLS Dev (de) |
MLS Test (de) |
VoxPopuli Dev (de) |
VoxPopuli Test (de) |
---|---|---|---|---|---|---|---|---|---|---|
stt_de_fastconformer_hybrid_large_pc |
de |
4.2 % |
4.9 % |
3.3 % |
3.8 % |
10.8 % |
8.7 % |
Model Name |
Language |
MCV Dev-Set (v??) (de) |
MCV Dev-Set v12.0 (de) |
MCV Dev-Set v7.0 (de) |
MCV Test-Set v12.0 (de) |
MCV Test-Set v7.0 (de) |
MLS Dev (de) |
MLS Test (de) |
VoxPopuli Dev (de) |
VoxPopuli Test (de) |
---|---|---|---|---|---|---|---|---|---|---|
stt_de_quartznet15x5 |
de |
11.78 |
ENES
Model Name |
Language |
Fisher-Dev-En |
Fisher-Dev-Es |
Fisher-Test-En |
Fisher-Test-Es |
Librispeech Dev-Clean |
Librispeech Dev-Other |
Librispeech Test-Clean |
Librispeech Test-Other |
MCV Dev-Set v7.0 (en) |
MCV Dev-Set v7.0 (es) |
MCV Test-Set v7.0 (en) |
MCV Test-Set v7.0 (es) |
MLS Dev (en) |
MLS Dev (es) |
MLS Test (en) |
MLS Test (es) |
VoxPopuli Dev (en) |
VoxPopuli Dev (es) |
VoxPopuli Test (en) |
VoxPopuli Test (es) |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
stt_enes_conformer_ctc_large |
enes |
16.7 % |
2.2 % |
5.5 % |
2.6 % |
5.5 % |
5.8 % |
3.5 % |
5.7 % |
||||||||||||
stt_enes_conformer_ctc_large_codesw |
enes |
16.51 % |
16.31 % |
2.22 % |
5.36 % |
2.55 % |
5.38 % |
5.00 % |
5.51 % |
3.46 % |
3.73 % |
5.58 % |
6.63 % |
||||||||
stt_enes_conformer_transducer_large |
enes |
16.2 % |
2.0 % |
4.6 % |
2.2 % |
4.6 % |
5.0 % |
3.3 % |
5.3 % |
||||||||||||
stt_enes_conformer_transducer_large_codesw |
enes |
15.70 % |
15.66 % |
1.97 % |
4.54 % |
2.17 % |
4.53 % |
4.51 % |
5.06 % |
3.27 % |
3.67 % |
5.28 % |
6.54 % |
Model Name |
Language |
Fisher-Dev-En |
Fisher-Dev-Es |
Fisher-Test-En |
Fisher-Test-Es |
Librispeech Dev-Clean |
Librispeech Dev-Other |
Librispeech Test-Clean |
Librispeech Test-Other |
MCV Dev-Set v7.0 (en) |
MCV Dev-Set v7.0 (es) |
MCV Test-Set v7.0 (en) |
MCV Test-Set v7.0 (es) |
MLS Dev (en) |
MLS Dev (es) |
MLS Test (en) |
MLS Test (es) |
VoxPopuli Dev (en) |
VoxPopuli Dev (es) |
VoxPopuli Test (en) |
VoxPopuli Test (es) |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
stt_enes_contextnet_large |
enes |
14.8 % |
2.2 % |
5.6 % |
2.3 % |
5.5 % |
4.7 % |
3.0 % |
5.0 % |
EO
Model Name |
Language |
MCV Dev-Set v11.0 (eo) |
MCV Test-Set v11.0 (eo) |
---|---|---|---|
stt_eo_conformer_ctc_large |
eo |
2.9 % |
4.8 % |
stt_eo_conformer_transducer_large |
eo |
2.4 % |
4.0 % |
ES
Model Name |
Language |
Call Home Dev Test (es) |
Call Home Eval Test (es) |
Call Home Train (es) |
Fisher Dev Set (es) |
Fisher Test Set (es) |
MCV Dev-Set (v??) (es) |
MCV Dev-Set v12.0 (es) |
MCV Dev-Set v7.0 (es) |
MCV Test-Set (v??) (es) |
MCV Test-Set v12.0 (es) |
MCV Test-Set v7.0 (es) |
MLS Dev (es) |
MLS Test (es) |
VoxPopuli Dev (es) |
VoxPopuli Test (es) |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
stt_es_citrinet_512 |
es |
9.1 % WER |
10.3 % WER |
4.9 % WER |
5.2 % WER |
|||||||||||
stt_es_citrinet_1024_gamma_0_25 |
es |
19.9 % |
21.3 % |
19.1 % |
15.8 % |
15.9 % |
6.1 % |
6.8 % |
3.5 % |
4.1 % |
5.6 % |
7.0 % |
Model Name |
Language |
Call Home Dev Test (es) |
Call Home Eval Test (es) |
Call Home Train (es) |
Fisher Dev Set (es) |
Fisher Test Set (es) |
MCV Dev-Set (v??) (es) |
MCV Dev-Set v12.0 (es) |
MCV Dev-Set v7.0 (es) |
MCV Test-Set (v??) (es) |
MCV Test-Set v12.0 (es) |
MCV Test-Set v7.0 (es) |
MLS Dev (es) |
MLS Test (es) |
VoxPopuli Dev (es) |
VoxPopuli Test (es) |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
stt_es_conformer_ctc_large |
es |
23.7 % |
25.3 % |
22.4 % |
18.3 % |
18.5 % |
6.3 % |
6.9 % |
4.3 % |
4.2 % |
6.1 % |
7.5 % |
||||
stt_es_conformer_transducer_large |
es |
18.0 % |
19.4 % |
17.2 % |
14.7 % |
14.8 % |
4.6 % |
5.2 % |
2.7 % |
3.2 % |
4.7 % |
6.0 % |
Model Name |
Language |
Call Home Dev Test (es) |
Call Home Eval Test (es) |
Call Home Train (es) |
Fisher Dev Set (es) |
Fisher Test Set (es) |
MCV Dev-Set (v??) (es) |
MCV Dev-Set v12.0 (es) |
MCV Dev-Set v7.0 (es) |
MCV Test-Set (v??) (es) |
MCV Test-Set v12.0 (es) |
MCV Test-Set v7.0 (es) |
MLS Dev (es) |
MLS Test (es) |
VoxPopuli Dev (es) |
VoxPopuli Test (es) |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
stt_es_contextnet_1024 |
es |
19.1 % |
20.7 % |
18.2 % |
15.3 % |
15.1 % |
4.8 % |
5.2 % |
3.1 % |
3.5 % |
5.1 % |
6.2 % |
Model Name |
Language |
Call Home Dev Test (es) |
Call Home Eval Test (es) |
Call Home Train (es) |
Fisher Dev Set (es) |
Fisher Test Set (es) |
MCV Dev-Set (v??) (es) |
MCV Dev-Set v12.0 (es) |
MCV Dev-Set v7.0 (es) |
MCV Test-Set (v??) (es) |
MCV Test-Set v12.0 (es) |
MCV Test-Set v7.0 (es) |
MLS Dev (es) |
MLS Test (es) |
VoxPopuli Dev (es) |
VoxPopuli Test (es) |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
stt_es_fastconformer_hybrid_large_pc |
es |
29.4 % |
28.9 % |
7.1 % |
7.5 % |
10.6 % |
11.8 % |
8.6 % |
9.8 % |
Model Name |
Language |
Call Home Dev Test (es) |
Call Home Eval Test (es) |
Call Home Train (es) |
Fisher Dev Set (es) |
Fisher Test Set (es) |
MCV Dev-Set (v??) (es) |
MCV Dev-Set v12.0 (es) |
MCV Dev-Set v7.0 (es) |
MCV Test-Set (v??) (es) |
MCV Test-Set v12.0 (es) |
MCV Test-Set v7.0 (es) |
MLS Dev (es) |
MLS Test (es) |
VoxPopuli Dev (es) |
VoxPopuli Test (es) |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
stt_es_quartznet15x5 |
es |
12.97 |
FR
Model Name |
Language |
MCV Dev-Set (v??) (fr) |
MCV Dev-Set v7.0 (fr) |
MCV Dev-Set v7.0 (fr) (No Hyphen) |
MCV Test-Set v7.0 (fr) |
MCV Test-Set v7.0 (fr) (No Hyphen) |
MLS Dev (fr) |
MLS Dev (fr) (No Hyphen) |
MLS Test (fr) |
MLS Test (fr) (No Hyphen) |
---|---|---|---|---|---|---|---|---|---|---|
stt_fr_citrinet_1024_gamma_0_25 |
fr |
10.76 |
9.90 |
12.20 |
11.11 |
6.66 |
6.19 |
5.53 |
5.12 |
Model Name |
Language |
MCV Dev-Set (v??) (fr) |
MCV Dev-Set v7.0 (fr) |
MCV Dev-Set v7.0 (fr) (No Hyphen) |
MCV Test-Set v7.0 (fr) |
MCV Test-Set v7.0 (fr) (No Hyphen) |
MLS Dev (fr) |
MLS Dev (fr) (No Hyphen) |
MLS Test (fr) |
MLS Test (fr) (No Hyphen) |
---|---|---|---|---|---|---|---|---|---|---|
stt_fr_conformer_ctc_large |
fr |
8.35 |
7.88 |
9.63 |
9.01 |
5.88 |
5.90 |
4.91 |
4.63 |
|
stt_fr_conformer_transducer_large |
fr |
6.85 |
7.95 |
5.05 |
4.10 |
Model Name |
Language |
MCV Dev-Set (v??) (fr) |
MCV Dev-Set v7.0 (fr) |
MCV Dev-Set v7.0 (fr) (No Hyphen) |
MCV Test-Set v7.0 (fr) |
MCV Test-Set v7.0 (fr) (No Hyphen) |
MLS Dev (fr) |
MLS Dev (fr) (No Hyphen) |
MLS Test (fr) |
MLS Test (fr) (No Hyphen) |
---|---|---|---|---|---|---|---|---|---|---|
stt_fr_contextnet_1024 |
fr |
8.32 |
9.42 |
6.02 |
5.01 |
Model Name |
Language |
MCV Dev-Set (v??) (fr) |
MCV Dev-Set v7.0 (fr) |
MCV Dev-Set v7.0 (fr) (No Hyphen) |
MCV Test-Set v7.0 (fr) |
MCV Test-Set v7.0 (fr) (No Hyphen) |
MLS Dev (fr) |
MLS Dev (fr) (No Hyphen) |
MLS Test (fr) |
MLS Test (fr) (No Hyphen) |
---|---|---|---|---|---|---|---|---|---|---|
stt_fr_quartznet15x5 |
fr |
14.01 |
HR
Model Name |
Language |
ParlaSpeech Dev-Set v1.0 (hr) |
ParlaSpeech Test-Set v1.0 (hr) |
ParlaSpeech Dev-Set (v??) (hr) |
ParlaSpeech Test-Set (v??) (hr) |
---|---|---|---|---|---|
stt_hr_conformer_ctc_large |
hr |
4.43 |
4.70 |
||
stt_hr_conformer_transducer_large |
hr |
4.56 |
4.69 |
Model Name |
Language |
ParlaSpeech Dev-Set v1.0 (hr) |
ParlaSpeech Test-Set v1.0 (hr) |
ParlaSpeech Dev-Set (v??) (hr) |
ParlaSpeech Test-Set (v??) (hr) |
---|---|---|---|---|---|
stt_hr_fastconformer_hybrid_large_pc |
hr |
4.5 % |
4.2 % |
IT
Model Name |
Language |
MCV Dev-Set (v??) (it) |
MCV Dev-Set v11.0 (it) |
MCV Dev-Set v12.0 (it) |
MCV Test-Set v11.0 (it) |
MCV Test-Set v12.0 (it) |
MLS Dev (it) |
MLS Test (it) |
VoxPopuli Dev (it) |
VoxPopuli Test (it) |
---|---|---|---|---|---|---|---|---|---|---|
stt_it_conformer_ctc_large |
it |
5.38 |
5.92 |
13.16 |
10.62 |
13.43 |
16.75 |
|||
stt_it_conformer_transducer_large |
it |
4.80 |
5.24 |
14.62 |
12.18 |
12.00 |
15.15 |
Model Name |
Language |
MCV Dev-Set (v??) (it) |
MCV Dev-Set v11.0 (it) |
MCV Dev-Set v12.0 (it) |
MCV Test-Set v11.0 (it) |
MCV Test-Set v12.0 (it) |
MLS Dev (it) |
MLS Test (it) |
VoxPopuli Dev (it) |
VoxPopuli Test (it) |
---|---|---|---|---|---|---|---|---|---|---|
stt_it_fastconformer_hybrid_large_pc |
it |
5.2 % |
5.8 % |
13.6 % |
11.5 % |
12.7 % |
15.6 % |
Model Name |
Language |
MCV Dev-Set (v??) (it) |
MCV Dev-Set v11.0 (it) |
MCV Dev-Set v12.0 (it) |
MCV Test-Set v11.0 (it) |
MCV Test-Set v12.0 (it) |
MLS Dev (it) |
MLS Test (it) |
VoxPopuli Dev (it) |
VoxPopuli Test (it) |
---|---|---|---|---|---|---|---|---|---|---|
stt_it_quartznet15x5 |
it |
15.22 |
KAB
Model Name |
Language |
MCV Test-Set v10.0 (kab) |
---|---|---|
stt_kab_conformer_transducer_large |
kab |
18.86 |
NL
Model Name |
Language |
MCV Test-Set v12.0 (nl) |
MLS Test (nl) |
---|---|---|---|
stt_nl_fastconformer_hybrid_large_pc |
nl |
9.2 % |
12.1 % |
PL
Model Name |
Language |
MCV Dev-Set (v??) (pl) |
MCV Dev-Set v12.0 (pl) |
MCV Test-Set v12.0 (pl) |
MLS Dev (pl) |
MLS Test (pl) |
VoxPopuli Dev (pl) |
VoxPopuli Test (pl) |
---|---|---|---|---|---|---|---|---|
stt_pl_fastconformer_hybrid_large_pc |
pl |
6.0 % |
8.7 % |
7.1 % |
5.8 % |
11.3 % |
8.5 % |
Model Name |
Language |
MCV Dev-Set (v??) (pl) |
MCV Dev-Set v12.0 (pl) |
MCV Test-Set v12.0 (pl) |
MLS Dev (pl) |
MLS Test (pl) |
VoxPopuli Dev (pl) |
VoxPopuli Test (pl) |
---|---|---|---|---|---|---|---|---|
stt_pl_quartznet15x5 |
pl |
14 |
RU
Model Name |
Language |
GOLOS Crowd Test-Set (v??) (ru) |
GOLOS Farfield Test-Set (v??) (ru) |
Librispeech Test |
MCV Dev-Set (v??) (ru) |
MCV Dev-Set v10.0 (ru) |
MCV Test-Set v10.0 (ru) |
---|---|---|---|---|---|---|---|
stt_ru_conformer_ctc_large |
ru |
2.8 % |
7.1 % |
13.5 % |
3.9 % |
4.3 % |
|
stt_ru_conformer_transducer_large |
ru |
2.7 % |
7.6 % |
12.0 % |
3.5 % |
4.0 % |
Model Name |
Language |
GOLOS Crowd Test-Set (v??) (ru) |
GOLOS Farfield Test-Set (v??) (ru) |
Librispeech Test |
MCV Dev-Set (v??) (ru) |
MCV Dev-Set v10.0 (ru) |
MCV Test-Set v10.0 (ru) |
---|---|---|---|---|---|---|---|
stt_ru_quartznet15x5 |
ru |
16.23 |
RW
Model Name |
Language |
MCV Test-Set v9.0 (rw) |
---|---|---|
stt_rw_conformer_ctc_large |
rw |
18.2 % |
stt_rw_conformer_transducer_large |
rw |
16.2 % |
UA
Model Name |
Language |
MCV Test-Set v12.0 (ua) |
---|---|---|
stt_ua_fastconformer_hybrid_large_pc |
ua |
5.2 % |
ZH
Model Name |
Language |
AIShell Dev-Android v2 |
AIShell Dev-Ios v1 |
AIShell Dev-Ios v2 |
AIShell Dev-Mic v2 |
AIShell Test-Android v2 |
AIShell Test-Ios v1 |
AIShell Test-Ios v2 |
AIShell Test-Mic v2 |
---|---|---|---|---|---|---|---|---|---|
stt_zh_citrinet_512 |
zh |
6.25 % |
6.44 % |
||||||
stt_zh_citrinet_1024_gamma_0_25 |
zh |
5.2 % |
4.8 % |
5.2 % |
5.5 % |
5.1 % |
5.5 % |
Model Name |
Language |
AIShell Dev-Android v2 |
AIShell Dev-Ios v1 |
AIShell Dev-Ios v2 |
AIShell Dev-Mic v2 |
AIShell Test-Android v2 |
AIShell Test-Ios v1 |
AIShell Test-Ios v2 |
AIShell Test-Mic v2 |
---|---|---|---|---|---|---|---|---|---|
stt_zh_conformer_transducer_large |
zh |
3.4 |
3.2 |
3.4 |
3.4 |
3.2 |
3.4 |
Scores with Punctuation and Capitalization
EN with P&C
Model Name |
Language |
EuroParl Test Set (en) |
Fisher Test Set (en) |
Librispeech Test-Clean |
Librispeech Test-Other |
MCV Test-Set v11.0 (en) |
MLS Test (en) |
NSC Part1 |
SPGI Test |
VoxPopuli Test (en) |
---|---|---|---|---|---|---|---|---|---|---|
stt_en_fastconformer_hybrid_large_pc |
en |
12.5 % |
19.0 % |
7.3 % |
9.2 % |
10.1 % |
12.7 % |
7.2 % |
5.1 % |
6.7 % |
BY with P&C
Model Name |
Language |
MCV Dev-Set v12.0 (be) |
MCV Test-Set v12.0 (be) |
---|---|---|---|
stt_by_fastconformer_hybrid_large_pc |
by |
3.8 % |
3.9 % |
DE with P&C
Model Name |
Language |
MCV Dev-Set v12.0 (de) |
MCV Test-Set v12.0 (de) |
MLS Dev (de) |
MLS Test (de) |
VoxPopuli Dev (de) |
VoxPopuli Test (de) |
---|---|---|---|---|---|---|---|
stt_de_fastconformer_hybrid_large_pc |
de |
4.7 % |
5.4 % |
10.1 % |
11.1 % |
12.6 % |
10.4 % |
ES with P&C
Model Name |
Language |
Fisher Dev Set (es) |
Fisher Test Set (es) |
MCV Dev-Set v12.0 (es) |
MCV Test-Set v12.0 (es) |
MLS Dev (es) |
MLS Test (es) |
VoxPopuli Dev (es) |
VoxPopuli Test (es) |
---|---|---|---|---|---|---|---|---|---|
stt_es_fastconformer_hybrid_large_pc |
es |
14.7 % |
14.6 % |
4.5 % |
5.0 % |
3.1 % |
3.9 % |
4.4 % |
5.6 % |
HR with P&C
Model Name |
Language |
ParlaSpeech Dev-Set (v??) (hr) |
ParlaSpeech Test-Set (v??) (hr) |
---|---|---|---|
stt_hr_fastconformer_hybrid_large_pc |
hr |
10.4 % |
8.7 % |
IT with P&C
Model Name |
Language |
MCV Dev-Set v12.0 (it) |
MCV Test-Set v12.0 (it) |
MLS Dev (it) |
MLS Test (it) |
VoxPopuli Dev (it) |
VoxPopuli Test (it) |
---|---|---|---|---|---|---|---|
stt_it_fastconformer_hybrid_large_pc |
it |
7.8 % |
8.2 % |
26.4 % |
22.5 % |
16.8 % |
19.6 % |
NL with P&C
Model Name |
Language |
MCV Test-Set v12.0 (nl) |
MLS Test (nl) |
---|---|---|---|
stt_nl_fastconformer_hybrid_large_pc |
nl |
32.1 % |
25.1 % |
PL with P&C
Model Name |
Language |
MCV Dev-Set v12.0 (pl) |
MCV Test-Set v12.0 (pl) |
MLS Dev (pl) |
MLS Test (pl) |
VoxPopuli Dev (pl) |
VoxPopuli Test (pl) |
---|---|---|---|---|---|---|---|
stt_pl_fastconformer_hybrid_large_pc |
pl |
8.9 % |
11.0 % |
16.0 % |
11.0 % |
14.0 % |
11.4 % |
UA with P&C
Model Name |
Language |
MCV Test-Set v12.0 (ua) |
---|---|---|
stt_ua_fastconformer_hybrid_large_pc |
ua |
7.3 % |