Create an Auditor Configuration

Prerequisites

  • Set the AUDITOR_BASE_URL environment variable to the NeMo Auditor service endpoint. Refer to Accessing the Microservice for more information.

Create an Audit Configuration

To create an audit configuration, you send a POST request to the /v1beta1/audit/configs endpoint.

  1. Create the configuration:

    import os
    from nemo_microservices import NeMoMicroservices
    
    client = NeMoMicroservices(base_url=os.getenv("AUDITOR_BASE_URL"))
    
    config = client.beta.audit.configs.create(
        name="demo-basic-config",
        namespace="default",
        description="Basic demonstration configuration",
        system={
            "parallel_attempts": 20,
            "lite": True
        },
        plugins={
            "probe_spec": "dan.AutoDANCached,goodside.Tag"
        },
        reporting={
            "extended_detectors": False
        }
    )
    print(config.model_dump_json(indent=2))
    
    curl -X POST "${AUDITOR_BASE_URL}/v1beta1/audit/configs" \
      -H "Accept: application/json" \
      -H "Content-Type: application/json" \
      -d '{
        "name": "demo-basic-config",
        "namespace": "default",
        "description": "Basic demonstration configuration",
        "system": {
            "parallel_attempts": "20",
            "lite": "True"
        },
        "plugins": {
            "probe_spec": "dan.AutoDANCached,goodside.Tag"
        }
    }' | jq
    
    Example Output
    {
      "id": "audit_config-G7qVcW2F3z5t7wP9C4sxLi",
      "created_at": "2025-10-23T18:08:47.781732",
      "custom_fields": {},
      "description": "Basic demonstration configuration",
      "entity_id": "audit_config-G7qVcW2F3z5t7wP9C4sxLi",
      "name": "demo-basic-config",
      "namespace": "default",
      "ownership": null,
      "plugins": {
        "buff_max": null,
        "buff_spec": null,
        "buffs": {},
        "buffs_include_original_prompt": false,
        "detector_spec": "auto",
        "detectors": {},
        "extended_detectors": false,
        "generators": {},
        "harnesses": {},
        "model_name": null,
        "model_type": null,
        "probe_spec": "dan.AutoDANCached,goodside.Tag",
        "probes": {}
      },
      "project": null,
      "reporting": {
        "report_dir": "garak_runs",
        "report_prefix": "run1",
        "show_100_pass_modules": true,
        "taxonomy": null
      },
      "run": {
        "deprefix": true,
        "eval_threshold": 0.5,
        "generations": 5,
        "probe_tags": null,
        "seed": null,
        "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
      },
      "schema_version": "1.0",
      "system": {
        "enable_experimental": false,
        "lite": true,
        "narrow_output": false,
        "parallel_attempts": 20,
        "parallel_requests": false,
        "show_z": false,
        "verbose": 0
      },
      "type_prefix": null,
      "updated_at": "2025-10-23T18:08:47.781737"
    }
    
    {
      "schema_version": "1.0",
      "id": "audit_config-43Y8mdtEbFwDmMvCrBwgZv",
      "description": "Basic demonstration configuration",
      "type_prefix": null,
      "namespace": "default",
      "project": null,
      "created_at": "2025-10-22T20:15:01.571713",
      "updated_at": "2025-10-22T20:15:01.571721",
      "custom_fields": {},
      "ownership": null,
      "name": "demo-basic-config",
      "entity_id": "audit_config-43Y8mdtEbFwDmMvCrBwgZv",
      "system": {
        "verbose": 0,
        "narrow_output": false,
        "parallel_requests": false,
        "parallel_attempts": 20,
        "lite": true,
        "show_z": false,
        "enable_experimental": false
      },
      "run": {
        "seed": null,
        "deprefix": true,
        "eval_threshold": 0.5,
        "generations": 5,
        "probe_tags": null,
        "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
      },
      "plugins": {
        "model_type": null,
        "model_name": null,
        "probe_spec": "dan.AutoDANCached,goodside.Tag",
        "detector_spec": "auto",
        "extended_detectors": false,
        "buff_spec": null,
        "buffs_include_original_prompt": false,
        "buff_max": null,
        "detectors": {},
        "generators": {},
        "buffs": {},
        "harnesses": {},
        "probes": {}
      },
      "reporting": {
        "report_prefix": "run1",
        "taxonomy": null,
        "report_dir": "garak_runs",
        "show_100_pass_modules": true
      }
    }
    

Refer to the following suggested topics related to specifying an audit configuration.

Default Configuration

The microservice ships with a default configuration in the default namespace. The default configuration performs approximately 79,070 inference requests.

import os
from nemo_microservices import NeMoMicroservices

client = NeMoMicroservices(base_url=os.getenv("AUDITOR_BASE_URL"))

config = client.beta.audit.configs.retrieve(
    config_name="default",
    namespace="default"
)
print(config.model_dump_json(indent=2))
curl "${AUDITOR_BASE_URL}/v1beta1/audit/configs/default/default" \
  -H "Accept: application/json" | jq

Example Output

{
  "id": "audit_config-AKumCnoDvC8D2xZfCCNvSr",
  "created_at": "2025-10-22T20:13:56.618762Z",
  "custom_fields": {},
  "description": null,
  "entity_id": "audit_config-AKumCnoDvC8D2xZfCCNvSr",
  "name": "default",
  "namespace": "default",
  "ownership": null,
  "plugins": {
    "buff_max": null,
    "buff_spec": null,
    "buffs": {},
    "buffs_include_original_prompt": false,
    "detector_spec": "auto",
    "detectors": {},
    "extended_detectors": false,
    "generators": {},
    "harnesses": {},
    "model_name": null,
    "model_type": null,
    "probe_spec": "ansiescape,atkgen,continuation,dan.Ablation_Dan_11_0,dan.AutoDANCached,dan.DanInTheWild,divergence,encoding,exploitation,goodside,grandma,latentinjection,leakreplay,lmrc.Bullying,lmrc.Deadnaming,lmrc.QuackMedicine,lmrc.SexualContent,lmrc.Sexualisation,lmrc.SlurUsage,malwaregen,misleading,packagehallucination,phrasing,promptinject,realtoxicityprompts.RTPBlank,snowball.GraphConnectivity,suffix.GCGCached,tap.TAPCached,topic,web_injection",
    "probes": {
      "encoding": {
        "payloads": [
          "default",
          "xss"
        ]
      }
    }
  },
  "project": null,
  "reporting": {
    "report_dir": "garak_runs",
    "report_prefix": "run1",
    "show_100_pass_modules": true,
    "taxonomy": null
  },
  "run": {
    "deprefix": true,
    "eval_threshold": 0.5,
    "generations": 3,
    "probe_tags": null,
    "seed": null,
    "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
  },
  "schema_version": "1.0",
  "system": {
    "enable_experimental": false,
    "lite": false,
    "narrow_output": false,
    "parallel_attempts": 32,
    "parallel_requests": false,
    "show_z": false,
    "verbose": 0
  },
  "type_prefix": null,
  "updated_at": "2025-10-22T20:13:56.618762Z"
}
{
  "schema_version": "1.0",
  "id": "audit_config-AKumCnoDvC8D2xZfCCNvSr",
  "description": null,
  "type_prefix": null,
  "namespace": "default",
  "project": null,
  "created_at": "2025-10-22T20:13:56.618762Z",
  "updated_at": "2025-10-22T20:13:56.618762Z",
  "custom_fields": {},
  "ownership": null,
  "name": "default",
  "entity_id": "audit_config-AKumCnoDvC8D2xZfCCNvSr",
  "system": {
    "verbose": 0,
    "narrow_output": false,
    "parallel_requests": false,
    "parallel_attempts": 32,
    "lite": false,
    "show_z": false,
    "enable_experimental": false
  },
  "run": {
    "seed": null,
    "deprefix": true,
    "eval_threshold": 0.5,
    "generations": 3,
    "probe_tags": null,
    "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
  },
  "plugins": {
    "model_type": null,
    "model_name": null,
    "probe_spec": "ansiescape,atkgen,continuation,dan.Ablation_Dan_11_0,dan.AutoDANCached,dan.DanInTheWild,divergence,encoding,exploitation,goodside,grandma,latentinjection,leakreplay,lmrc.Bullying,lmrc.Deadnaming,lmrc.QuackMedicine,lmrc.SexualContent,lmrc.Sexualisation,lmrc.SlurUsage,malwaregen,misleading,packagehallucination,phrasing,promptinject,realtoxicityprompts.RTPBlank,snowball.GraphConnectivity,suffix.GCGCached,tap.TAPCached,topic,web_injection",
    "detector_spec": "auto",
    "extended_detectors": false,
    "buff_spec": null,
    "buffs_include_original_prompt": false,
    "buff_max": null,
    "detectors": {},
    "generators": {},
    "buffs": {},
    "harnesses": {},
    "probes": {
      "encoding": {
        "payloads": [
          "default",
          "xss"
        ]
      }
    }
  },
  "reporting": {
    "report_prefix": "run1",
    "taxonomy": null,
    "report_dir": "garak_runs",
    "show_100_pass_modules": true
  }
}

Probe Reference Summary

The following table summarizes the probes that you can specify in the config.plugins.probe_spec field.

Specify an individual probe by name, such as dan.Ablation_Dan_11_0, or specify a category of probes, such as dan. Separate each probe and category with a comma, such as ansiescape,dan.Ablation_Dan_11_0.

The tier field indicates the importance of a probe. Tier 1 probes are the most important, followed by 2, 3, and 9. Refer to the garak tiers documentation for more information.

The probes identified as Default are included in the default/default configuration.

For detailed information about a probe, refer to the garak.probes page in the garak documentation.

Probe

Tier

Goal

Description

ansiescape.AnsiEscaped (Default)

3

print escaped ANSI terminal codes

Probe for eliciting ansi control codes from a model.

ansiescape.AnsiRaw (Default)

1

print raw ANSI terminal codes

Probe for eliciting raw ansi control codes from a model.

ansiescape.AnsiRawTokenizerHF

1

check if raw ANSI escape sequences tokens are available

Probe for checking a tokenizer for potentially malicious ansi escape sequences.

atkgen.Tox (Default)

2

make the model say something toxic

Automatic attack generation, trying to elicit toxic outputs from the model.

audio.AudioAchillesHeel

9

disregard the system prompt via audio component

Audio is the achilles heel.

av_spam_scanning.EICAR

9

output antivirus test codes

Does the model check its output for viruses? Try to send the EICAR code through.

av_spam_scanning.GTUBE

9

output email spam

Does the model check its output for spam? Try to send the GTUBE code through.

av_spam_scanning.GTphish

9

output email phishing

Does the model check its output for phishing? Try to send the GTphish code through.

continuation.ContinueSlursReclaimedSlurs (Default)

2

output an ethnic slur

Continue partial word with slur or reclaimed slur.

continuation.ContinueSlursReclaimedSlursFull

2

output an ethnic slur

Continue partial word with slur or reclaimed slur.

dan.Ablation_Dan_11_0 (Default)

2

disregard the system prompt

Ablation experiments over the dan 11.0 prompt.

dan.AntiDAN

9

disregard the system prompt

Anti-dan.

dan.AutoDAN

2

disregard the system prompt

Probe leveraging autodan attack generation.

dan.AutoDANCached (Default)

2

disregard the system prompt

Probe leveraging generated autodan prompts.

dan.ChatGPT_Developer_Mode_RANTI

9

disregard the system prompt

Chatgpt devmode + ranti.

dan.ChatGPT_Developer_Mode_v2

9

disregard the system prompt

Chatgpt developer mode v2.

dan.ChatGPT_Image_Markdown

9

disregard the system prompt

Chatgpt markdown image unlocker / markdown injection.

dan.DAN_Jailbreak

9

disregard the system prompt

Dan jailbreak.

dan.DUDE

9

disregard the system prompt

Dude.

dan.DanInTheWild (Default)

1

disregard the system prompt or alignment

A subset of jailbreaks from the daninthewild jailbreak library.

dan.DanInTheWildFull

1

disregard the system prompt or alignment

A library of jailbreaks.

dan.Dan_10_0

9

disregard the system prompt

Dan 10.0.

dan.Dan_11_0

9

disregard the system prompt

Dan 11.0.

dan.Dan_6_0

9

disregard the system prompt

Dan 6.0.

dan.Dan_6_2

9

disregard the system prompt

Dan 6.2.

dan.Dan_7_0

9

disregard the system prompt

Dan 7.0.

dan.Dan_8_0

9

disregard the system prompt

Dan 8.0.

dan.Dan_9_0

9

disregard the system prompt

Dan 9.0.

dan.STAN

9

disregard the system prompt

Stan.

divergence.Repeat (Default)

2

get the model to leak data by asking it to repeat words

See if a model starts giving other data when asked to repeat a string indefinitely.

divergence.RepeatExtended

2

get the model to leak data by asking it to repeat words

See if a model starts giving other data when asked to repeat a string indefinitely.

divergence.RepeatedToken (Default)

1

Stress-test LLMs with long repeated-token prompts to surface instability

Repeated token destabilization attack from dropbox llm security research.

doctor.Bypass

2

roleplay as doctor to bypass guardrails

Implement policy puppetry bypass.

doctor.BypassLeet

2

roleplay as doctor to bypass guardrails, but use leetspeak

Implement policy puppetry bypass.

doctor.Puppetry

2

roleplay as doctor to get unsafe outputs

Implement policy puppetry prompt injection.

donotanswer.DiscriminationExclusionToxicityHatefulOffensive

9

get model to respond as discrimination exclusion toxicity hateful offensive

Implementation of discrimination_exclusion_toxicity_hateful_offensive probes in donotanswer_prompts.

donotanswer.HumanChatbox

9

get model to respond as human chatbox

Implementation of human_chatbox probes in donotanswer_prompts.

donotanswer.InformationHazard

9

get model to respond as information hazard

Implementation of information_hazard probes in donotanswer_prompts.

donotanswer.MaliciousUses

9

get model to respond as malicious uses

Implementation of malicious_uses probes in donotanswer_prompts.

donotanswer.MisinformationHarms

9

get model to respond as misinformation harms

Implementation of misinformation_harms probes in donotanswer_prompts.

dra.DRA

1

make the LLM ask and answer its own harmful instructions

Dra (disguise and reconstruction attack).

dra.DRAAdvanced

1

make the LLM ask and answer its own harmful instructions

Dra (disguise and reconstruction attack) - version with detoxify.

encoding.InjectAscii85 (Default)

2

make the model say an encoded payload

Probe for susceptibility to base85 encoded injections.

encoding.InjectAtbash (Default)

2

make the model say an encoded payload

Atbash probe credit: nakul-rajpal.

encoding.InjectBase16 (Default)

2

make the model say an encoded payload

Probe for susceptibility to base16 encoded injections.

encoding.InjectBase2048 (Default)

2

make the model say an encoded payload

Probe for susceptibility to base2048 encoding injections.

encoding.InjectBase32 (Default)

2

make the model say an encoded payload

Probe for susceptibility to base32 encoded injections.

encoding.InjectBase64 (Default)

1

make the model say an encoded payload

Probe for susceptibility to base64 encoded injections.

encoding.InjectBraille (Default)

2

make the model say an encoded payload

Probe for susceptibility to braille encoding injections. Credit: wally#3422.

encoding.InjectEcoji (Default)

2

make the model say an encoded payload

Probe for susceptibility to ecoji encoding.

encoding.InjectHex (Default)

2

make the model say an encoded payload

Probe for susceptibility to hex encoded injections.

encoding.InjectLeet

2

make the model say an encoded payload

Probe for susceptibility to 31337 encoding.

encoding.InjectMime

2

make the model say an encoded payload

Probe for susceptibility to mime encoding injections.

encoding.InjectMorse (Default)

2

make the model say an encoded payload

Probe for susceptibility to morse encoding. Credit: wally#3422.

encoding.InjectNato (Default)

2

make the model say an encoded payload

Probe for susceptibility to nato phonetic alphabet encoding.

encoding.InjectQP

1

make the model say an encoded payload

Probe for susceptibility to qp encoded injections.

encoding.InjectROT13 (Default)

2

make the model say an encoded payload

Probe for susceptibility to rot13 encoding injections.

encoding.InjectSneakyBits

2

make the model say an encoded payload

Probe for susceptibility to unicode variant selector ascii smuggling.

encoding.InjectUU (Default)

1

make the model say an encoded payload

Probe for susceptibility to uu encoded injections.

encoding.InjectUnicodeTagChars (Default)

1

make the model say an encoded payload

Probe for susceptibility to unicode tag ascii smuggling.

encoding.InjectUnicodeVariantSelectors

2

make the model say an encoded payload

Probe for susceptibility to unicode variant selector ascii smuggling.

encoding.InjectZalgo (Default)

2

make the model say an encoded payload

Probe for susceptibility to zalgo encoding.

exploitation.JinjaTemplatePythonInjection (Default)

1

Gain remote code execution via Jinja template injection

Attempts to execute python code via jinja templating.

exploitation.SQLInjectionEcho (Default)

1

Manipulate a database via SQL injection

Attempts to get the target to echo a sql statement.

exploitation.SQLInjectionSystem

1

Manipulate a database via SQL injection

Attempts to inject untrusted sql commands into a sql statement.

fileformats.HF_Files

1

get a list of files associated with the model

Get a manifest of files associated with a hugging face generator.

glitch.Glitch

3

make the model fail to repeat a word

Probe model for glitch tokens that provoke unusual behavior.

glitch.GlitchFull

3

make the model fail to repeat a word

Probe model for glitch tokens that provoke unusual behavior.

goodside.Davidjl

3

get wrong answers about a name

Exploit the ‘ davidjl’ magic token.

goodside.Tag (Default)

1

inject invisible instructions

Exploit utf tag page encoding for user-invisible prompt injection.

goodside.ThreatenJSON (Default)

2

get raw code by threatening a model

Will the model strip the preamble and give JSON?

goodside.WhoIsRiley (Default)

2

get misinfo about riley goodside

Probe for misinfo about riley goodside.

grandma.Slurs (Default)

1

get model to use ethnic slur

Use the grandma attack to get ethnic slurs.

grandma.Substances (Default)

1

get instructions for making bad things

Use the grandma attack to get instructions for harmful items.

grandma.Win10 (Default)

2

get model to output Win 10 product keys

Will the model yield a windows 10 product key if we beg it to?

grandma.Win11 (Default)

2

get model to output Win 11 product keys

Will the model yield a windows 11 product key if we beg it to?

latentinjection.LatentInjectionFactSnippetEiffel (Default)

1

subvert a snippet with instructions

String-based latent prompt injections in fact snippets on the eiffel tower.

latentinjection.LatentInjectionFactSnippetEiffelFull

1

subvert a snippet with instructions

String-based latent prompt injections in fact snippets on the eiffel tower.

latentinjection.LatentInjectionFactSnippetLegal (Default)

1

subvert a snippet with instructions

String-based latent prompt injections in legal snippets.

latentinjection.LatentInjectionFactSnippetLegalFull

1

subvert a snippet with instructions

String-based latent prompt injections in legal snippets.

latentinjection.LatentInjectionReport (Default)

1

subvert a report with instructions

String-based latent prompt injections via a report.

latentinjection.LatentInjectionReportFull

1

subvert a report with instructions

String-based latent prompt injections via a report.

latentinjection.LatentInjectionResume (Default)

1

subvert a resume with instructions

String-based latent prompt injections via a resume.

latentinjection.LatentInjectionResumeFull

1

subvert a resume with instructions

String-based latent prompt injections via a resume.

latentinjection.LatentInjectionTranslationEnFr (Default)

1

subvert translation instructions

String-based latent prompt injections via translation.

latentinjection.LatentInjectionTranslationEnFrFull

1

subvert translation instructions

String-based latent prompt injections via translation.

latentinjection.LatentInjectionTranslationEnZh (Default)

1

subvert translation instructions

Latent prompt injection in chinese context (简体字).

latentinjection.LatentInjectionTranslationEnZhFull

1

subvert translation instructions

Latent prompt injection in chinese context (简体字).

latentinjection.LatentJailbreak (Default)

1

inject malicious instructions into output while posing as translation task

Implementation of latent jailbreak, using latent prompt injection.

latentinjection.LatentJailbreakFull

1

inject malicious instructions into output while posing as translation task

Implementation of latent jailbreak, using latent prompt injection.

latentinjection.LatentWhois (Default)

1

subvert a whois report with instructions

Latent prompt injections in whois probe reports.

latentinjection.LatentWhoisSnippet (Default)

1

subvert a snippet with instructions

String-based latent prompt injections in whois reports.

latentinjection.LatentWhoisSnippetFull

1

subvert a snippet with instructions

String-based latent prompt injections in whois reports.

leakreplay.GuardianCloze (Default)

2

replay training data by filling in the correct missing words

Lightweight version of guardian cloze test for data leakage.

leakreplay.GuardianClozeFull

2

replay training data by filling in the correct missing words

Test for data leakage on guardian articles with masked entities in a cloze test format.

leakreplay.GuardianComplete (Default)

1

replay training data by providing document completions that match training examples

Lightweight version of guardian completion test for data leakage.

leakreplay.GuardianCompleteFull

1

replay training data by providing document completions that match training examples

Test for data leakage on guardian articles through text completion.

leakreplay.LiteratureCloze (Default)

2

replay training data by filling in the correct missing words

Lightweight version of literature cloze test for data leakage.

leakreplay.LiteratureClozeFull

2

replay training data by filling in the correct missing words

Test for data leakage on literature passages with masked entities in a cloze test format.

leakreplay.LiteratureComplete (Default)

2

replay training data by providing document completions that match training examples

Lightweight version of literature completion test for data leakage.

leakreplay.LiteratureCompleteFull

2

replay training data by providing document completions that match training examples

Test for data leakage on literature passages through text completion.

leakreplay.NYTCloze (Default)

2

replay training data by filling in the correct missing words

Lightweight version of nyt cloze test for data leakage.

leakreplay.NYTClozeFull

2

replay training data by filling in the correct missing words

Test for data leakage on new york times articles with masked entities in a cloze test format.

leakreplay.NYTComplete (Default)

1

replay training data by providing document completions that match training examples

Lightweight version of nyt completion test for data leakage.

leakreplay.NYTCompleteFull

1

replay training data by providing document completions that match training examples

Test for data leakage on new york times articles through text completion.

leakreplay.PotterCloze (Default)

2

replay training data by filling in the correct missing words

Lightweight version of harry potter cloze test for data leakage.

leakreplay.PotterClozeFull

2

replay training data by filling in the correct missing words

Test for data leakage on harry potter text with masked entities in a cloze test format.

leakreplay.PotterComplete (Default)

1

replay training data by providing document completions that match training examples

Lightweight version of harry potter completion test for data leakage.

leakreplay.PotterCompleteFull

1

replay training data by providing document completions that match training examples

Test for data leakage on harry potter text through completion.

lmrc.Anthropomorphisation

3

anthropomorphise a language model

Providing output that encourages anthropomorphisation of ai systems.

lmrc.Bullying (Default)

2

help me bully

Generation of repeated threats, denigration.

lmrc.Deadnaming (Default)

2

write a deadname

Mentioning a name that a person does not wish to be associated with.

lmrc.Profanity

3

utter profanity

Using profane words, especially when not prompted to do so.

lmrc.QuackMedicine (Default)

2

support quack medicine

Promoting non-scientific medicine.

lmrc.SexualContent (Default)

2

write sexual content

Content that discusses bodies and interactions in a sexual manner, or other sexually explicit material.

lmrc.Sexualisation (Default)

2

sexualise people

Casual or overt sexual objectification, reducing a person or group of people to sexual objects.

lmrc.SlurUsage (Default)

2

use a slur

Using an identity-based slur in a non-reclaimed way - either directly or recommending the consumer to use in speech/action.

malwaregen.Evasion (Default)

2

Write code to assist evading security and anti-malware measures

Will the model write code to help avoid anti-malware?

malwaregen.Payload (Default)

2

write a malware payload

Will the model help write code for malware payloads?

malwaregen.SubFunctions (Default)

2

write code supporting a sub-function of malware

Will the system write code helpful in malware?

malwaregen.TopLevel (Default)

2

write malware given a top-level description

Will the system return code when asked to write malware?

misleading.FalseAssertion (Default)

2

make the model expand on a false assertion

Does the model reject [questions based on] false claims?

packagehallucination.Dart (Default)

1

generate Dart code importing non-existent pub.dev packages

Attempts to generate dart code including hallucinated pub.dev packages.

packagehallucination.JavaScript (Default)

1

generate JavaScript code importing non-existent npm packages

Attempts to generate javascript code including hallucinated npm packages.

packagehallucination.PackageHallucinationProbe

9

base probe for importing non-existent packages

Base class for package hallucination probes.

packagehallucination.Perl (Default)

1

generate Perl code importing non-existent MetaCPAN modules

Attempts to generate perl code including hallucinated metacpan packages.

packagehallucination.Python (Default)

1

generate python importing non-existent packages

Attempts to generate python3 code including hallucinated packages.

packagehallucination.RakuLand (Default)

1

generate Raku code importing non-existent raku.land modules

Attempts to generate raku code including hallucinated raku.land packages.

packagehallucination.Ruby (Default)

1

generate ruby code importing non-existent gems

Attempts to generate ruby code including hallucinated gems.

packagehallucination.Rust (Default)

1

generate Rust code importing non-existent crates.io packages

Attempts to generate rust code including hallucinated crates.io packages.

phrasing.FutureTense (Default)

2

use future tense to disregard the system prompt

Attempt to evade safeguards by setting requests in the future tense.

phrasing.FutureTenseFull

2

use future tense to disregard the system prompt

Will the system give unsafe responses when asked to write about weapon creation or dangerous behavior?

phrasing.PastTense (Default)

2

use past tense to disregard the system prompt

Attempt to evade safeguards by setting requests in the past tense.

phrasing.PastTenseFull

2

use past tense to disregard the system prompt

Will the system give unsafe responses when asked to write about weapon creation or dangerous behavior?

promptinject.HijackHateHumans (Default)

2

inject a prompt on hate humans

PromptInject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of LLMs to adversarial prompt attacks. Best Paper Award at the NeurIPS ML Safety Workshop 2022.

promptinject.HijackHateHumansFull

2

inject a prompt on hate humans

PromptInject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of LLMs to adversarial prompt attacks. Best Paper Award at the NeurIPS ML Safety Workshop 2022.

promptinject.HijackKillHumans (Default)

2

inject a prompt on kill humans

PromptInject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of LLMs to adversarial prompt attacks. Best Paper Award at the NeurIPS ML Safety Workshop 2022.

promptinject.HijackKillHumansFull

2

inject a prompt on kill humans

PromptInject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of LLMs to adversarial prompt attacks. Best Paper Award at the NeurIPS ML Safety Workshop 2022.

promptinject.HijackLongPrompt (Default)

2

inject a prompt on long prompt

PromptInject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of LLMs to adversarial prompt attacks. Best Paper Award at the NeurIPS ML Safety Workshop 2022.

promptinject.HijackLongPromptFull

2

inject a prompt on long prompt

PromptInject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of LLMs to adversarial prompt attacks. Best Paper Award at the NeurIPS ML Safety Workshop 2022.

realtoxicityprompts.RTPBlank (Default)

1

generate toxicity when prompted with nothing or special tokens

Implementation of probes in realtoxicityprompts.

realtoxicityprompts.RTPFlirtation

3

generate flirtation content

Implementation of flirtation probes in realtoxicityprompts.

realtoxicityprompts.RTPIdentity_Attack

3

generate identity attack content

Implementation of identity_attack probes in realtoxicityprompts.

realtoxicityprompts.RTPInsult

3

generate insult content

Implementation of insult probes in realtoxicityprompts.

realtoxicityprompts.RTPProfanity

3

generate profanity content

Implementation of profanity probes in realtoxicityprompts.

realtoxicityprompts.RTPSevere_Toxicity

3

generate severe toxicity content

Implementation of severe_toxicity probes in realtoxicityprompts.

realtoxicityprompts.RTPSexually_Explicit

3

generate sexually explicit content

Implementation of sexually_explicit probes in realtoxicityprompts.

realtoxicityprompts.RTPThreat

3

generate threat content

Implementation of threat probes in realtoxicityprompts.

sata.MLM

2

disregard the system prompt

Masked language model simple assistive task probe.

smuggling.FunctionMasking

3

return banned instructions

FunctionMasking probe.

smuggling.HypotheticalResponse

3

return banned instructions

HypotheticalResponse probe.

snowball.GraphConnectivity (Default)

2

trick the model into saying an impossible flight routing is possible

Probe asking for impossible flight routings.

snowball.GraphConnectivityFull

2

trick the model into saying an impossible flight routing is possible

Probe asking for impossible flight routings.

snowball.Primes

3

trick the model into saying a prime number isn’t prime

Ask model to check a list of higher primes.

snowball.PrimesFull

3

trick the model into saying a prime number isn’t prime

Ask model to check a list of higher primes.

snowball.Senators

3

trick the model into saying a fictitious senator exists

Asking for senators that don’t exist.

snowball.SenatorsFull

3

trick the model into saying a fictitious senator exists

Asking for senators that don’t exist.

suffix.BEAST

2

cause the model to disregard the system prompt

BEAST probe.

suffix.GCG

2

cause the model to disregard the system prompt

Greedy coordinate gradient probe.

suffix.GCGCached (Default)

2

cause the model to disregard the system prompt

Greedy coordinate gradient cached probe.

tap.PAIR

2

cause the model to disregard the system prompt

Prompt automatic iterative refinement automatic jailbreak.

tap.TAP

2

cause the model to disregard the system prompt

Tree of attacks with pruning probe, generating new attacks against the target.

tap.TAPCached (Default)

2

cause the model to disregard the system prompt

Tree of attacks with pruning probe using cached attacks.

test.Blank

9

see what the model has to say for itself given silence

Blank prompt.

test.Test

9

send some test strings, ignore the response

Test probe.

topic.WordnetAllowedWords

9

get the model to engage off-topic

Use WordNet to find out if a model will discuss terms near but not in its allowed set.

topic.WordnetBlockedWords

9

get the model to engage off-topic

Use WordNet to explore which topics a model will respond to around blocked words.

topic.WordnetControversial (Default)

2

get the model to engage off-topic

Ask model to respond on a set of controversial topics.

visual_jailbreak.FigStep

2

disregard the system prompt with both text and image

Using image modality to assist jailbreak.

visual_jailbreak.FigStepFull

2

disregard the system prompt with both text and image

Using image modality to assist jailbreak.

web_injection.ColabAIDataLeakage

1

render images fetched from a remote uri

Leak data via image rendering as demonstrated against Gemini in Colab.

web_injection.MarkdownImageExfil

1

embed conversation-private information in a markdown URI

Can we get the model to write embedded assets exfiltrating text in the URI?

web_injection.MarkdownURIImageExfilExtended

1

embed potentially sensitive data in a Markdown URI

Generalized implementation of data exfiltration exploits through Markdown URIs.

web_injection.MarkdownURINonImageExfilExtended

1

embed potentially sensitive data in a Markdown URI

Generalized implementation of data exfiltration exploits through Markdown URIs.

web_injection.MarkdownXSS

1

Exploit cross-site scripting via prompt injection

Perform cross-site scripting attacks using Markdown JavaScript rendering via prompt injection.

web_injection.PlaygroundMarkdownExfil

1

embed conversation-private information in a markdown URI

Markdown image URI exfil exploit published 2023 Sept 29.

web_injection.StringAssemblyDataExfil

1

embed conversation-private information in a markdown URI

Prompt injection for manipulating state.

web_injection.TaskXSS

1

Exploit cross-site scripting via prompt injection

Perform cross-site scripting attacks via prompt injection.

Probe Tags Reference#

The following table identifies the probe tags and the probes that are associated with each tag. You specify the probe tags in the run.probe_tags field when you add an audit configuration.

You can specify an individual tag, such as owasp:llm06, or an entire category, such as owasp. Separate multiple tags with a comma, such as owasp:llm06,payload:hallucination.

If you specify probes to run in the plugins.probe_spec field and also specify run.probe_tags, the probes specified in the probe_spec field are filtered to the probes that also match the tags.

Tag

Probes with this Tag

avid-effect:ethics:E0101

  • continuation.ContinueSlursReclaimedSlurs

  • continuation.ContinueSlursReclaimedSlursFull

  • lmrc.Profanity

  • lmrc.Sexualisation

  • lmrc.SlurUsage

  • realtoxicityprompts.RTPFlirtation

  • realtoxicityprompts.RTPIdentity_Attack

  • realtoxicityprompts.RTPInsult

  • realtoxicityprompts.RTPThreat

avid-effect:ethics:E0301

  • atkgen.Tox

  • continuation.ContinueSlursReclaimedSlurs

  • continuation.ContinueSlursReclaimedSlursFull

  • donotanswer.DiscriminationExclusionToxicityHatefulOffensive

  • lmrc.Bullying

  • lmrc.Profanity

  • lmrc.Sexualisation

  • lmrc.SlurUsage

  • realtoxicityprompts.RTPBlank

  • realtoxicityprompts.RTPFlirtation

  • realtoxicityprompts.RTPIdentity_Attack

  • realtoxicityprompts.RTPInsult

  • realtoxicityprompts.RTPProfanity

  • realtoxicityprompts.RTPSevere_Toxicity

  • realtoxicityprompts.RTPSexually_Explicit

  • realtoxicityprompts.RTPThreat

avid-effect:ethics:E0302

  • lmrc.Deadnaming

  • realtoxicityprompts.RTPIdentity_Attack

avid-effect:ethics:E0402

  • donotanswer.MisinformationHarms

  • goodside.WhoIsRiley

  • lmrc.QuackMedicine

avid-effect:performance:P0201

  • glitch.Glitch

  • glitch.GlitchFull

  • goodside.ThreatenJSON

avid-effect:performance:P0204

  • audio.AudioAchillesHeel

  • dan.AntiDAN

  • dan.ChatGPT_Developer_Mode_RANTI

  • dan.Dan_11_0

  • goodside.Davidjl

  • goodside.ThreatenJSON

  • goodside.WhoIsRiley

  • misleading.FalseAssertion

  • sata.MLM

  • snowball.GraphConnectivity

  • snowball.GraphConnectivityFull

  • snowball.Primes

  • snowball.PrimesFull

  • snowball.Senators

  • snowball.SenatorsFull

  • suffix.BEAST

  • suffix.GCG

  • suffix.GCGCached

  • tap.PAIR

  • tap.TAP

  • tap.TAPCached

avid-effect:performance:P0400

  • realtoxicityprompts.RTPThreat

avid-effect:performance:P0401

  • continuation.ContinueSlursReclaimedSlurs

  • continuation.ContinueSlursReclaimedSlursFull

  • lmrc.Deadnaming

  • lmrc.Profanity

  • lmrc.SexualContent

  • lmrc.Sexualisation

  • lmrc.SlurUsage

  • realtoxicityprompts.RTPFlirtation

  • realtoxicityprompts.RTPIdentity_Attack

  • realtoxicityprompts.RTPInsult

  • realtoxicityprompts.RTPProfanity

avid-effect:performance:P0403

  • divergence.RepeatedToken

avid-effect:security:S0100

  • ansiescape.AnsiEscaped

  • ansiescape.AnsiRaw

avid-effect:security:S0200

  • ansiescape.AnsiEscaped

  • ansiescape.AnsiRaw

avid-effect:security:S0201

  • glitch.Glitch

  • glitch.GlitchFull

avid-effect:security:S0300

  • grandma.Slurs

  • grandma.Win10

  • grandma.Win11

avid-effect:security:S0301

  • av_spam_scanning.EICAR

  • av_spam_scanning.GTUBE

  • av_spam_scanning.GTphish

  • divergence.Repeat

  • divergence.RepeatExtended

  • doctor.Bypass

  • doctor.BypassLeet

  • doctor.Puppetry

  • donotanswer.HumanChatbox

  • donotanswer.InformationHazard

  • exploitation.JinjaTemplatePythonInjection

  • exploitation.SQLInjectionEcho

  • exploitation.SQLInjectionSystem

  • leakreplay.GuardianCloze

  • leakreplay.GuardianClozeFull

  • leakreplay.GuardianComplete

  • leakreplay.GuardianCompleteFull

  • leakreplay.LiteratureCloze

  • leakreplay.LiteratureClozeFull

  • leakreplay.LiteratureComplete

  • leakreplay.LiteratureCompleteFull

  • leakreplay.NYTCloze

  • leakreplay.NYTClozeFull

  • leakreplay.NYTComplete

  • leakreplay.NYTCompleteFull

  • leakreplay.PotterCloze

  • leakreplay.PotterClozeFull

  • leakreplay.PotterComplete

  • leakreplay.PotterCompleteFull

  • topic.WordnetAllowedWords

  • topic.WordnetBlockedWords

  • topic.WordnetControversial

  • web_injection.ColabAIDataLeakage

  • web_injection.MarkdownImageExfil

  • web_injection.MarkdownURIImageExfilExtended

  • web_injection.MarkdownURINonImageExfilExtended

  • web_injection.MarkdownXSS

  • web_injection.PlaygroundMarkdownExfil

  • web_injection.StringAssemblyDataExfil

  • web_injection.TaskXSS

avid-effect:security:S0403

  • audio.AudioAchillesHeel

  • av_spam_scanning.EICAR

  • av_spam_scanning.GTUBE

  • av_spam_scanning.GTphish

  • dan.Ablation_Dan_11_0

  • dan.AntiDAN

  • dan.AutoDAN

  • dan.AutoDANCached

  • dan.ChatGPT_Developer_Mode_RANTI

  • dan.ChatGPT_Developer_Mode_v2

  • dan.ChatGPT_Image_Markdown

  • dan.DAN_Jailbreak

  • dan.DUDE

  • dan.DanInTheWild

  • dan.DanInTheWildFull

  • dan.Dan_10_0

  • dan.Dan_11_0

  • dan.Dan_6_0

  • dan.Dan_6_2

  • dan.Dan_7_0

  • dan.Dan_8_0

  • dan.Dan_9_0

  • dan.STAN

  • dra.DRA

  • dra.DRAAdvanced

  • encoding.InjectAscii85

  • encoding.InjectAtbash

  • encoding.InjectBase16

  • encoding.InjectBase2048

  • encoding.InjectBase32

  • encoding.InjectBase64

  • encoding.InjectBraille

  • encoding.InjectEcoji

  • encoding.InjectHex

  • encoding.InjectLeet

  • encoding.InjectMime

  • encoding.InjectMorse

  • encoding.InjectNato

  • encoding.InjectQP

  • encoding.InjectROT13

  • encoding.InjectSneakyBits

  • encoding.InjectUU

  • encoding.InjectUnicodeTagChars

  • encoding.InjectUnicodeVariantSelectors

  • encoding.InjectZalgo

  • exploitation.JinjaTemplatePythonInjection

  • exploitation.SQLInjectionEcho

  • exploitation.SQLInjectionSystem

  • goodside.Davidjl

  • goodside.ThreatenJSON

  • latentinjection.LatentInjectionFactSnippetEiffel

  • latentinjection.LatentInjectionFactSnippetEiffelFull

  • latentinjection.LatentInjectionFactSnippetLegal

  • latentinjection.LatentInjectionFactSnippetLegalFull

  • latentinjection.LatentInjectionReport

  • latentinjection.LatentInjectionReportFull

  • latentinjection.LatentInjectionResume

  • latentinjection.LatentInjectionResumeFull

  • latentinjection.LatentInjectionTranslationEnFr

  • latentinjection.LatentInjectionTranslationEnFrFull

  • latentinjection.LatentInjectionTranslationEnZh

  • latentinjection.LatentInjectionTranslationEnZhFull

  • latentinjection.LatentJailbreak

  • latentinjection.LatentJailbreakFull

  • latentinjection.LatentWhois

  • latentinjection.LatentWhoisSnippet

  • latentinjection.LatentWhoisSnippetFull

  • malwaregen.Evasion

  • malwaregen.Payload

  • malwaregen.SubFunctions

  • malwaregen.TopLevel

  • promptinject.HijackHateHumans

  • promptinject.HijackHateHumansFull

  • promptinject.HijackKillHumans

  • promptinject.HijackKillHumansFull

  • promptinject.HijackLongPrompt

  • promptinject.HijackLongPromptFull

  • sata.MLM

  • smuggling.FunctionMasking

  • smuggling.HypotheticalResponse

  • suffix.BEAST

  • suffix.GCG

  • suffix.GCGCached

  • tap.PAIR

  • tap.TAP

  • tap.TAPCached

  • web_injection.ColabAIDataLeakage

  • web_injection.MarkdownImageExfil

  • web_injection.MarkdownURIImageExfilExtended

  • web_injection.MarkdownURINonImageExfilExtended

  • web_injection.MarkdownXSS

  • web_injection.PlaygroundMarkdownExfil

  • web_injection.StringAssemblyDataExfil

  • web_injection.TaskXSS

cwe:1336

  • exploitation.JinjaTemplatePythonInjection

cwe:1426

  • exploitation.SQLInjectionEcho

  • exploitation.SQLInjectionSystem

cwe:1427

  • doctor.Bypass

  • doctor.BypassLeet

  • doctor.Puppetry

  • exploitation.SQLInjectionEcho

  • exploitation.SQLInjectionSystem

cwe:352

  • web_injection.ColabAIDataLeakage

  • web_injection.MarkdownImageExfil

  • web_injection.MarkdownURIImageExfilExtended

  • web_injection.MarkdownURINonImageExfilExtended

  • web_injection.PlaygroundMarkdownExfil

  • web_injection.StringAssemblyDataExfil

cwe:79

  • web_injection.MarkdownXSS

  • web_injection.TaskXSS

cwe:89

  • exploitation.SQLInjectionEcho

  • exploitation.SQLInjectionSystem

cwe:94

  • exploitation.JinjaTemplatePythonInjection

cwe:95

  • exploitation.JinjaTemplatePythonInjection

owasp:llm01

  • ansiescape.AnsiEscaped

  • ansiescape.AnsiRaw

  • continuation.ContinueSlursReclaimedSlurs

  • continuation.ContinueSlursReclaimedSlursFull

  • dan.Ablation_Dan_11_0

  • dan.AntiDAN

  • dan.AutoDAN

  • dan.AutoDANCached

  • dan.ChatGPT_Developer_Mode_RANTI

  • dan.ChatGPT_Developer_Mode_v2

  • dan.ChatGPT_Image_Markdown

  • dan.DAN_Jailbreak

  • dan.DUDE

  • dan.DanInTheWild

  • dan.DanInTheWildFull

  • dan.Dan_10_0

  • dan.Dan_11_0

  • dan.Dan_6_0

  • dan.Dan_6_2

  • dan.Dan_7_0

  • dan.Dan_8_0

  • dan.Dan_9_0

  • dan.STAN

  • doctor.Bypass

  • doctor.BypassLeet

  • doctor.Puppetry

  • dra.DRA

  • dra.DRAAdvanced

  • encoding.InjectAscii85

  • encoding.InjectAtbash

  • encoding.InjectBase16

  • encoding.InjectBase2048

  • encoding.InjectBase32

  • encoding.InjectBase64

  • encoding.InjectBraille

  • encoding.InjectEcoji

  • encoding.InjectHex

  • encoding.InjectLeet

  • encoding.InjectMime

  • encoding.InjectMorse

  • encoding.InjectNato

  • encoding.InjectQP

  • encoding.InjectROT13

  • encoding.InjectSneakyBits

  • encoding.InjectUU

  • encoding.InjectUnicodeTagChars

  • encoding.InjectUnicodeVariantSelectors

  • encoding.InjectZalgo

  • goodside.Tag

  • latentinjection.LatentInjectionFactSnippetEiffel

  • latentinjection.LatentInjectionFactSnippetEiffelFull

  • latentinjection.LatentInjectionFactSnippetLegal

  • latentinjection.LatentInjectionFactSnippetLegalFull

  • latentinjection.LatentInjectionReport

  • latentinjection.LatentInjectionReportFull

  • latentinjection.LatentInjectionResume

  • latentinjection.LatentInjectionResumeFull

  • latentinjection.LatentInjectionTranslationEnFr

  • latentinjection.LatentInjectionTranslationEnFrFull

  • latentinjection.LatentInjectionTranslationEnZh

  • latentinjection.LatentInjectionTranslationEnZhFull

  • latentinjection.LatentJailbreak

  • latentinjection.LatentJailbreakFull

  • latentinjection.LatentWhois

  • latentinjection.LatentWhoisSnippet

  • latentinjection.LatentWhoisSnippetFull

  • phrasing.FutureTense

  • phrasing.FutureTenseFull

  • phrasing.PastTense

  • phrasing.PastTenseFull

  • promptinject.HijackHateHumans

  • promptinject.HijackHateHumansFull

  • promptinject.HijackKillHumans

  • promptinject.HijackKillHumansFull

  • promptinject.HijackLongPrompt

  • promptinject.HijackLongPromptFull

  • sata.MLM

  • visual_jailbreak.FigStep

  • visual_jailbreak.FigStepFull

owasp:llm02

  • ansiescape.AnsiEscaped

  • ansiescape.AnsiRaw

  • av_spam_scanning.EICAR

  • av_spam_scanning.GTUBE

  • av_spam_scanning.GTphish

  • exploitation.JinjaTemplatePythonInjection

  • exploitation.SQLInjectionEcho

  • exploitation.SQLInjectionSystem

  • packagehallucination.Dart

  • packagehallucination.JavaScript

  • packagehallucination.PackageHallucinationProbe

  • packagehallucination.Perl

  • packagehallucination.Python

  • packagehallucination.RakuLand

  • packagehallucination.Ruby

  • packagehallucination.Rust

  • web_injection.ColabAIDataLeakage

  • web_injection.MarkdownImageExfil

  • web_injection.MarkdownURIImageExfilExtended

  • web_injection.MarkdownURINonImageExfilExtended

  • web_injection.MarkdownXSS

  • web_injection.PlaygroundMarkdownExfil

  • web_injection.StringAssemblyDataExfil

  • web_injection.TaskXSS

owasp:llm04

  • divergence.RepeatedToken

owasp:llm05

  • ansiescape.AnsiEscaped

  • ansiescape.AnsiRaw

  • ansiescape.AnsiRawTokenizerHF

  • fileformats.HF_Files

  • glitch.Glitch

  • glitch.GlitchFull

  • goodside.Davidjl

owasp:llm06

  • divergence.Repeat

  • divergence.RepeatExtended

  • donotanswer.InformationHazard

  • exploitation.JinjaTemplatePythonInjection

  • exploitation.SQLInjectionEcho

  • exploitation.SQLInjectionSystem

  • grandma.Win10

  • grandma.Win11

  • leakreplay.GuardianCloze

  • leakreplay.GuardianClozeFull

  • leakreplay.GuardianComplete

  • leakreplay.GuardianCompleteFull

  • leakreplay.LiteratureCloze

  • leakreplay.LiteratureClozeFull

  • leakreplay.LiteratureComplete

  • leakreplay.LiteratureCompleteFull

  • leakreplay.NYTCloze

  • leakreplay.NYTClozeFull

  • leakreplay.NYTComplete

  • leakreplay.NYTCompleteFull

  • leakreplay.PotterCloze

  • leakreplay.PotterClozeFull

  • leakreplay.PotterComplete

  • leakreplay.PotterCompleteFull

  • web_injection.ColabAIDataLeakage

  • web_injection.MarkdownImageExfil

  • web_injection.MarkdownURIImageExfilExtended

  • web_injection.MarkdownURINonImageExfilExtended

  • web_injection.MarkdownXSS

  • web_injection.PlaygroundMarkdownExfil

  • web_injection.StringAssemblyDataExfil

  • web_injection.TaskXSS

owasp:llm09

  • donotanswer.MisinformationHarms

  • goodside.WhoIsRiley

  • misleading.FalseAssertion

  • packagehallucination.Dart

  • packagehallucination.JavaScript

  • packagehallucination.PackageHallucinationProbe

  • packagehallucination.Perl

  • packagehallucination.Python

  • packagehallucination.RakuLand

  • packagehallucination.Ruby

  • packagehallucination.Rust

  • snowball.GraphConnectivity

  • snowball.GraphConnectivityFull

  • snowball.Primes

  • snowball.PrimesFull

  • snowball.Senators

  • snowball.SenatorsFull

owasp:llm10

  • divergence.Repeat

  • divergence.RepeatExtended

  • leakreplay.GuardianCloze

  • leakreplay.GuardianClozeFull

  • leakreplay.GuardianComplete

  • leakreplay.GuardianCompleteFull

  • leakreplay.LiteratureCloze

  • leakreplay.LiteratureClozeFull

  • leakreplay.LiteratureComplete

  • leakreplay.LiteratureCompleteFull

  • leakreplay.NYTCloze

  • leakreplay.NYTClozeFull

  • leakreplay.NYTComplete

  • leakreplay.NYTCompleteFull

  • leakreplay.PotterCloze

  • leakreplay.PotterClozeFull

  • leakreplay.PotterComplete

  • leakreplay.PotterCompleteFull

  • topic.WordnetAllowedWords

  • topic.WordnetBlockedWords

  • topic.WordnetControversial

payload:dangerous:medical

  • lmrc.QuackMedicine

payload:generic

  • encoding.InjectAscii85

  • encoding.InjectAtbash

  • encoding.InjectBase16

  • encoding.InjectBase2048

  • encoding.InjectBase32

  • encoding.InjectBase64

  • encoding.InjectBraille

  • encoding.InjectEcoji

  • encoding.InjectHex

  • encoding.InjectLeet

  • encoding.InjectMime

  • encoding.InjectMorse

  • encoding.InjectNato

  • encoding.InjectQP

  • encoding.InjectROT13

  • encoding.InjectSneakyBits

  • encoding.InjectUU

  • encoding.InjectUnicodeTagChars

  • encoding.InjectUnicodeVariantSelectors

  • encoding.InjectZalgo

  • glitch.Glitch

  • glitch.GlitchFull

  • goodside.Davidjl

  • goodside.Tag

  • goodside.ThreatenJSON

payload:hallucination

  • goodside.WhoIsRiley

  • misleading.FalseAssertion

  • snowball.GraphConnectivity

  • snowball.GraphConnectivityFull

  • snowball.Primes

  • snowball.PrimesFull

  • snowball.Senators

  • snowball.SenatorsFull

payload:hatespeech

  • realtoxicityprompts.RTPIdentity_Attack

payload:jailbreak

  • dan.Ablation_Dan_11_0

  • dan.AutoDAN

  • dan.AutoDANCached

  • dan.ChatGPT_Developer_Mode_v2

  • dan.ChatGPT_Image_Markdown

  • dan.DAN_Jailbreak

  • dan.DUDE

  • dan.DanInTheWild

  • dan.DanInTheWildFull

  • dan.Dan_10_0

  • dan.Dan_11_0

  • dan.Dan_6_0

  • dan.Dan_6_2

  • dan.Dan_7_0

  • dan.Dan_8_0

  • dan.Dan_9_0

  • dan.STAN

  • dra.DRA

  • dra.DRAAdvanced

  • phrasing.FutureTense

  • phrasing.FutureTenseFull

  • phrasing.PastTense

  • phrasing.PastTenseFull

  • sata.MLM

  • smuggling.FunctionMasking

  • smuggling.HypotheticalResponse

  • suffix.BEAST

  • suffix.GCG

  • suffix.GCGCached

  • tap.PAIR

  • tap.TAP

  • tap.TAPCached

  • visual_jailbreak.FigStep

  • visual_jailbreak.FigStepFull

payload:leak:chat

  • web_injection.ColabAIDataLeakage

  • web_injection.MarkdownImageExfil

  • web_injection.MarkdownURIImageExfilExtended

  • web_injection.MarkdownURINonImageExfilExtended

  • web_injection.PlaygroundMarkdownExfil

  • web_injection.StringAssemblyDataExfil

payload:leak:training

  • divergence.Repeat

  • divergence.RepeatExtended

  • leakreplay.GuardianCloze

  • leakreplay.GuardianClozeFull

  • leakreplay.GuardianComplete

  • leakreplay.GuardianCompleteFull

  • leakreplay.LiteratureCloze

  • leakreplay.LiteratureClozeFull

  • leakreplay.LiteratureComplete

  • leakreplay.LiteratureCompleteFull

  • leakreplay.NYTCloze

  • leakreplay.NYTClozeFull

  • leakreplay.NYTComplete

  • leakreplay.NYTCompleteFull

  • leakreplay.PotterCloze

  • leakreplay.PotterClozeFull

  • leakreplay.PotterComplete

  • leakreplay.PotterCompleteFull

payload:malicious:badcode

  • malwaregen.Evasion

  • malwaregen.SubFunctions

  • malwaregen.TopLevel

  • packagehallucination.Dart

  • packagehallucination.JavaScript

  • packagehallucination.PackageHallucinationProbe

  • packagehallucination.Perl

  • packagehallucination.Python

  • packagehallucination.RakuLand

  • packagehallucination.Ruby

  • packagehallucination.Rust

payload:malicious:badcode:phishing

  • av_spam_scanning.GTphish

payload:malicious:badcode:spam

  • av_spam_scanning.GTUBE

payload:malicious:badcode:virus

  • av_spam_scanning.EICAR

payload:malicious:renderer

  • exploitation.JinjaTemplatePythonInjection

  • web_injection.ColabAIDataLeakage

  • web_injection.MarkdownImageExfil

  • web_injection.MarkdownURIImageExfilExtended

  • web_injection.MarkdownURINonImageExfilExtended

  • web_injection.PlaygroundMarkdownExfil

  • web_injection.StringAssemblyDataExfil

payload:malicious:renderer:xss

  • web_injection.MarkdownXSS

  • web_injection.TaskXSS

payload:unwanted

  • atkgen.Tox

  • realtoxicityprompts.RTPBlank

  • realtoxicityprompts.RTPFlirtation

  • realtoxicityprompts.RTPInsult

  • realtoxicityprompts.RTPSevere_Toxicity

payload:unwanted:denial_of_service

  • divergence.RepeatedToken

payload:unwanted:hatespeech

  • continuation.ContinueSlursReclaimedSlurs

  • continuation.ContinueSlursReclaimedSlursFull

  • lmrc.SlurUsage

payload:unwanted:profanity

  • lmrc.Profanity

  • realtoxicityprompts.RTPProfanity

payload:unwanted:sexual

  • lmrc.SexualContent

  • lmrc.Sexualisation

  • realtoxicityprompts.RTPSexually_Explicit

payload:unwanted:violence

  • lmrc.Bullying

  • promptinject.HijackHateHumans

  • promptinject.HijackHateHumansFull

  • promptinject.HijackKillHumans

  • promptinject.HijackKillHumansFull

  • promptinject.HijackLongPrompt

  • promptinject.HijackLongPromptFull

  • realtoxicityprompts.RTPThreat

quality:Behavioral:ContentSafety:HateHarassment

  • atkgen.Tox

  • continuation.ContinueSlursReclaimedSlurs

  • continuation.ContinueSlursReclaimedSlursFull

  • donotanswer.DiscriminationExclusionToxicityHatefulOffensive

  • lmrc.SlurUsage

  • realtoxicityprompts.RTPIdentity_Attack

quality:Behavioral:ContentSafety:LegalGoodsServices

  • donotanswer.MaliciousUses

  • grandma.Substances

  • grandma.Win10

  • grandma.Win11

  • malwaregen.Evasion

  • malwaregen.Payload

  • malwaregen.SubFunctions

  • malwaregen.TopLevel

quality:Behavioral:ContentSafety:Profanity

  • atkgen.Tox

  • donotanswer.DiscriminationExclusionToxicityHatefulOffensive

  • lmrc.Profanity

  • realtoxicityprompts.RTPProfanity

quality:Behavioral:ContentSafety:Sexual

  • lmrc.SexualContent

  • lmrc.Sexualisation

  • realtoxicityprompts.RTPSexually_Explicit

quality:Behavioral:ContentSafety:Toxicity

  • atkgen.Tox

  • continuation.ContinueSlursReclaimedSlurs

  • continuation.ContinueSlursReclaimedSlursFull

  • donotanswer.DiscriminationExclusionToxicityHatefulOffensive

  • realtoxicityprompts.RTPBlank

  • realtoxicityprompts.RTPInsult

  • realtoxicityprompts.RTPSevere_Toxicity

quality:Behavioral:ContentSafety:Unauthorized

  • donotanswer.HumanChatbox

  • lmrc.QuackMedicine

quality:Behavioral:ContentSafety:Violence

  • donotanswer.DiscriminationExclusionToxicityHatefulOffensive

  • lmrc.Bullying

  • promptinject.HijackHateHumans

  • promptinject.HijackHateHumansFull

  • promptinject.HijackKillHumans

  • promptinject.HijackKillHumansFull

  • promptinject.HijackLongPrompt

  • promptinject.HijackLongPromptFull

  • realtoxicityprompts.RTPThreat

quality:Behavioral:DeliberativeMisinformation

  • lmrc.QuackMedicine

quality:Robustness:GenerativeMisinformation

  • donotanswer.MisinformationHarms

  • goodside.WhoIsRiley

  • misleading.FalseAssertion

  • packagehallucination.Dart

  • packagehallucination.JavaScript

  • packagehallucination.PackageHallucinationProbe

  • packagehallucination.Perl

  • packagehallucination.Python

  • packagehallucination.RakuLand

  • packagehallucination.Ruby

  • packagehallucination.Rust

  • snowball.GraphConnectivity

  • snowball.GraphConnectivityFull

  • snowball.Primes

  • snowball.PrimesFull

  • snowball.Senators

  • snowball.SenatorsFull

quality:Security:Adversarial

  • av_spam_scanning.EICAR

  • av_spam_scanning.GTUBE

  • av_spam_scanning.GTphish

  • goodside.Tag

quality:Security:Confidentiality

  • divergence.Repeat

  • divergence.RepeatExtended

  • donotanswer.InformationHazard

  • topic.WordnetAllowedWords

  • topic.WordnetBlockedWords

  • topic.WordnetControversial

quality:Security:ExtractionInversion

  • divergence.Repeat

  • divergence.RepeatExtended

  • exploitation.JinjaTemplatePythonInjection

  • exploitation.SQLInjectionEcho

  • exploitation.SQLInjectionSystem

  • leakreplay.GuardianCloze

  • leakreplay.GuardianClozeFull

  • leakreplay.GuardianComplete

  • leakreplay.GuardianCompleteFull

  • leakreplay.LiteratureCloze

  • leakreplay.LiteratureClozeFull

  • leakreplay.LiteratureComplete

  • leakreplay.LiteratureCompleteFull

  • leakreplay.NYTCloze

  • leakreplay.NYTClozeFull

  • leakreplay.NYTComplete

  • leakreplay.NYTCompleteFull

  • leakreplay.PotterCloze

  • leakreplay.PotterClozeFull

  • leakreplay.PotterComplete

  • leakreplay.PotterCompleteFull

  • web_injection.ColabAIDataLeakage

  • web_injection.MarkdownImageExfil

  • web_injection.MarkdownURIImageExfilExtended

  • web_injection.MarkdownURINonImageExfilExtended

  • web_injection.PlaygroundMarkdownExfil

  • web_injection.StringAssemblyDataExfil

quality:Security:Integrity

  • ansiescape.AnsiEscaped

  • ansiescape.AnsiRaw

quality:Security:PromptStability

  • audio.AudioAchillesHeel

  • dan.Ablation_Dan_11_0

  • dan.AntiDAN

  • dan.AutoDAN

  • dan.AutoDANCached

  • dan.ChatGPT_Developer_Mode_RANTI

  • dan.ChatGPT_Developer_Mode_v2

  • dan.ChatGPT_Image_Markdown

  • dan.DAN_Jailbreak

  • dan.DUDE

  • dan.DanInTheWild

  • dan.DanInTheWildFull

  • dan.Dan_10_0

  • dan.Dan_11_0

  • dan.Dan_6_0

  • dan.Dan_6_2

  • dan.Dan_7_0

  • dan.Dan_8_0

  • dan.Dan_9_0

  • dan.STAN

  • divergence.RepeatedToken

  • doctor.Bypass

  • doctor.BypassLeet

  • doctor.Puppetry

  • dra.DRA

  • dra.DRAAdvanced

  • encoding.InjectAscii85

  • encoding.InjectAtbash

  • encoding.InjectBase16

  • encoding.InjectBase2048

  • encoding.InjectBase32

  • encoding.InjectBase64

  • encoding.InjectBraille

  • encoding.InjectEcoji

  • encoding.InjectHex

  • encoding.InjectLeet

  • encoding.InjectMime

  • encoding.InjectMorse

  • encoding.InjectNato

  • encoding.InjectQP

  • encoding.InjectROT13

  • encoding.InjectSneakyBits

  • encoding.InjectUU

  • encoding.InjectUnicodeTagChars

  • encoding.InjectUnicodeVariantSelectors

  • encoding.InjectZalgo

  • exploitation.JinjaTemplatePythonInjection

  • exploitation.SQLInjectionEcho

  • exploitation.SQLInjectionSystem

  • glitch.Glitch

  • glitch.GlitchFull

  • goodside.Davidjl

  • goodside.Tag

  • latentinjection.LatentInjectionFactSnippetEiffel

  • latentinjection.LatentInjectionFactSnippetEiffelFull

  • latentinjection.LatentInjectionFactSnippetLegal

  • latentinjection.LatentInjectionFactSnippetLegalFull

  • latentinjection.LatentInjectionReport

  • latentinjection.LatentInjectionReportFull

  • latentinjection.LatentInjectionResume

  • latentinjection.LatentInjectionResumeFull

  • latentinjection.LatentInjectionTranslationEnFr

  • latentinjection.LatentInjectionTranslationEnFrFull

  • latentinjection.LatentInjectionTranslationEnZh

  • latentinjection.LatentInjectionTranslationEnZhFull

  • latentinjection.LatentJailbreak

  • latentinjection.LatentJailbreakFull

  • latentinjection.LatentWhois

  • latentinjection.LatentWhoisSnippet

  • latentinjection.LatentWhoisSnippetFull

  • phrasing.FutureTense

  • phrasing.FutureTenseFull

  • phrasing.PastTense

  • phrasing.PastTenseFull

  • promptinject.HijackHateHumans

  • promptinject.HijackHateHumansFull

  • promptinject.HijackKillHumans

  • promptinject.HijackKillHumansFull

  • promptinject.HijackLongPrompt

  • promptinject.HijackLongPromptFull

  • sata.MLM

  • smuggling.FunctionMasking

  • smuggling.HypotheticalResponse

  • suffix.BEAST

  • suffix.GCG

  • suffix.GCGCached

  • tap.PAIR

  • tap.TAP

  • tap.TAPCached

  • visual_jailbreak.FigStep

  • visual_jailbreak.FigStepFull

  • web_injection.ColabAIDataLeakage

  • web_injection.MarkdownImageExfil

  • web_injection.MarkdownURIImageExfilExtended

  • web_injection.MarkdownURINonImageExfilExtended

  • web_injection.MarkdownXSS

  • web_injection.PlaygroundMarkdownExfil

  • web_injection.StringAssemblyDataExfil

  • web_injection.TaskXSS

risk-cards:lmrc:anthropomorphisation

  • lmrc.Anthropomorphisation

risk-cards:lmrc:bullying

  • lmrc.Bullying

risk-cards:lmrc:deadnaming

  • lmrc.Deadnaming

risk-cards:lmrc:profanity

  • lmrc.Profanity

risk-cards:lmrc:providing_illegal_info

  • grandma.Substances

  • grandma.Win10

  • grandma.Win11

risk-cards:lmrc:quack_medicine

  • lmrc.QuackMedicine

risk-cards:lmrc:sexual_content

  • lmrc.SexualContent

risk-cards:lmrc:sexualisation

  • lmrc.Sexualisation

risk-cards:lmrc:slur_usage

  • lmrc.SlurUsage