Create an Auditor Configuration#

Prerequisites#

Set the AUDITOR_BASE_URL environment variable to the NeMo Auditor service endpoint. Refer to Accessing the Microservice for more information.

Create an Audit Configuration#

To create an audit configuration, send a POST request to the /v1beta1/audit/configs endpoint.

Create the configuration:

Python SDK

import os
from nemo_microservices import NeMoMicroservices

client = NeMoMicroservices(base_url=os.getenv("AUDITOR_BASE_URL"))

config = client.beta.audit.configs.create(
    name="demo-basic-config",
    namespace="default",
    description="Basic demonstration configuration",
    system={
        "parallel_attempts": 20,
        "lite": True
    },
    plugins={
        "probe_spec": "dan.AutoDANCached,goodside.Tag",
        "extended_detectors": False
    }
)
print(config.model_dump_json(indent=2))

cURL

curl -X POST "${AUDITOR_BASE_URL}/v1beta1/audit/configs" \
  -H "Accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "name": "demo-basic-config",
    "namespace": "default",
    "description": "Basic demonstration configuration",
    "system": {
        "parallel_attempts": 20,
        "lite": true
    },
    "plugins": {
        "probe_spec": "dan.AutoDANCached,goodside.Tag"
    }
}' | jq

Example Output

Python SDK

{
  "id": "audit_config-Sn8uXGpwBDkBNDDTpuyucr",
  "created_at": "2025-11-25T16:30:27.701401",
  "custom_fields": {},
  "description": "Basic demonstration configuration",
  "entity_id": "audit_config-Sn8uXGpwBDkBNDDTpuyucr",
  "name": "demo-basic-config",
  "namespace": "default",
  "ownership": null,
  "plugins": {
    "buff_max": null,
    "buff_spec": null,
    "buffs": {},
    "buffs_include_original_prompt": false,
    "detector_spec": "auto",
    "detectors": {},
    "extended_detectors": false,
    "generators": {},
    "harnesses": {},
    "model_name": null,
    "model_type": null,
    "probe_spec": "dan.AutoDANCached,goodside.Tag",
    "probes": {}
  },
  "project": null,
  "reporting": {
    "report_dir": "garak_runs",
    "report_prefix": "run1",
    "show_100_pass_modules": true,
    "taxonomy": null
  },
  "run": {
    "deprefix": true,
    "eval_threshold": 0.5,
    "generations": 5,
    "probe_tags": null,
    "seed": null,
    "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
  },
  "schema_version": "1.0",
  "system": {
    "enable_experimental": false,
    "lite": true,
    "narrow_output": false,
    "parallel_attempts": 20,
    "parallel_requests": false,
    "show_z": false,
    "verbose": 0
  },
  "type_prefix": null,
  "updated_at": "2025-11-25T16:30:27.701409"
}

cURL

{
  "schema_version": "1.0",
  "id": "audit_config-UBboobCgipkfUaBbV1dxjh",
  "description": "Basic demonstration configuration",
  "type_prefix": null,
  "namespace": "default",
  "project": null,
  "created_at": "2025-11-25T18:42:46.254863",
  "updated_at": "2025-11-25T18:42:46.254868",
  "custom_fields": {},
  "ownership": null,
  "name": "demo-basic-config",
  "entity_id": "audit_config-UBboobCgipkfUaBbV1dxjh",
  "system": {
    "verbose": 0,
    "narrow_output": false,
    "parallel_requests": false,
    "parallel_attempts": 20,
    "lite": true,
    "show_z": false,
    "enable_experimental": false
  },
  "run": {
    "seed": null,
    "deprefix": true,
    "eval_threshold": 0.5,
    "generations": 5,
    "probe_tags": null,
    "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
  },
  "plugins": {
    "model_type": null,
    "model_name": null,
    "probe_spec": "dan.AutoDANCached,goodside.Tag",
    "detector_spec": "auto",
    "extended_detectors": false,
    "buff_spec": null,
    "buffs_include_original_prompt": false,
    "buff_max": null,
    "detectors": {},
    "generators": {},
    "buffs": {},
    "harnesses": {},
    "probes": {}
  },
  "reporting": {
    "report_prefix": "run1",
    "taxonomy": null,
    "report_dir": "garak_runs",
    "show_100_pass_modules": true
  }
}

Refer to the following suggested topics related to specifying an audit configuration.

For information about the fields, refer to Schema Reference for Audit Configurations.
A common customization is to specify the probes to run with the plugins.probe_spec field. For information about the probe names, refer to Probe Reference Summary.

Default Configuration#

The microservice ships with a default configuration in the default namespace. The default configuration performs approximately 79,070 inference requests.

Python SDK

import os
from nemo_microservices import NeMoMicroservices

client = NeMoMicroservices(base_url=os.getenv("AUDITOR_BASE_URL"))

config = client.beta.audit.configs.retrieve(
    config_name="default",
    namespace="default"
)
print(config.model_dump_json(indent=2))

cURL

curl "${AUDITOR_BASE_URL}/v1beta1/audit/configs/default/default" \
  -H "Accept: application/json" | jq

Example Output

Python SDK

{
  "id": "audit_config-4aPawVGto1R4DmiXB4XDQc",
  "created_at": "2025-11-25T14:00:21.641067Z",
  "custom_fields": {},
  "description": null,
  "entity_id": "audit_config-4aPawVGto1R4DmiXB4XDQc",
  "name": "default",
  "namespace": "default",
  "ownership": null,
  "plugins": {
    "buff_max": null,
    "buff_spec": null,
    "buffs": {},
    "buffs_include_original_prompt": false,
    "detector_spec": "auto",
    "detectors": {},
    "extended_detectors": false,
    "generators": {},
    "harnesses": {},
    "model_name": null,
    "model_type": null,
    "probe_spec": "ansiescape,atkgen,continuation,dan.Ablation_Dan_11_0,dan.AutoDANCached,dan.DanInTheWild,divergence,encoding,exploitation,goodside,grandma,latentinjection,leakreplay,lmrc.Bullying,lmrc.Deadnaming,lmrc.QuackMedicine,lmrc.SexualContent,lmrc.Sexualisation,lmrc.SlurUsage,malwaregen,misleading,packagehallucination,phrasing,promptinject,realtoxicityprompts.RTPBlank,snowball.GraphConnectivity,suffix.GCGCached,tap.TAPCached,topic,web_injection",
    "probes": {
      "encoding": {
        "payloads": [
          "default",
          "xss"
        ]
      }
    }
  },
  "project": null,
  "reporting": {
    "report_dir": "garak_runs",
    "report_prefix": "run1",
    "show_100_pass_modules": true,
    "taxonomy": null
  },
  "run": {
    "deprefix": true,
    "eval_threshold": 0.5,
    "generations": 3,
    "probe_tags": null,
    "seed": null,
    "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
  },
  "schema_version": "1.0",
  "system": {
    "enable_experimental": false,
    "lite": false,
    "narrow_output": false,
    "parallel_attempts": 32,
    "parallel_requests": false,
    "show_z": false,
    "verbose": 0
  },
  "type_prefix": null,
  "updated_at": "2025-11-25T14:00:21.641067Z"
}

cURL

{
  "schema_version": "1.0",
  "id": "audit_config-G3yRpv8WLmDvxjVjmUvES6",
  "description": null,
  "type_prefix": null,
  "namespace": "default",
  "project": null,
  "created_at": "2025-11-25T18:36:33.132094Z",
  "updated_at": "2025-11-25T18:36:33.132094Z",
  "custom_fields": {},
  "ownership": null,
  "name": "default",
  "entity_id": "audit_config-G3yRpv8WLmDvxjVjmUvES6",
  "system": {
    "verbose": 0,
    "narrow_output": false,
    "parallel_requests": false,
    "parallel_attempts": 32,
    "lite": false,
    "show_z": false,
    "enable_experimental": false
  },
  "run": {
    "seed": null,
    "deprefix": true,
    "eval_threshold": 0.5,
    "generations": 3,
    "probe_tags": null,
    "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
  },
  "plugins": {
    "model_type": null,
    "model_name": null,
    "probe_spec": "ansiescape,atkgen,continuation,dan.Ablation_Dan_11_0,dan.AutoDANCached,dan.DanInTheWild,divergence,encoding,exploitation,goodside,grandma,latentinjection,leakreplay,lmrc.Bullying,lmrc.Deadnaming,lmrc.QuackMedicine,lmrc.SexualContent,lmrc.Sexualisation,lmrc.SlurUsage,malwaregen,misleading,packagehallucination,phrasing,promptinject,realtoxicityprompts.RTPBlank,snowball.GraphConnectivity,suffix.GCGCached,tap.TAPCached,topic,web_injection",
    "detector_spec": "auto",
    "extended_detectors": false,
    "buff_spec": null,
    "buffs_include_original_prompt": false,
    "buff_max": null,
    "detectors": {},
    "generators": {},
    "buffs": {},
    "harnesses": {},
    "probes": {
      "encoding": {
        "payloads": [
          "default",
          "xss"
        ]
      }
    }
  },
  "reporting": {
    "report_prefix": "run1",
    "taxonomy": null,
    "report_dir": "garak_runs",
    "show_100_pass_modules": true
  }
}

Probe Reference Summary#

The following table summarizes the probes that you can specify in the plugins.probe_spec field.

Specify an individual probe by name, such as dan.Ablation_Dan_11_0, or specify a category of probes, such as dan. Separate each probe and category with a comma, such as ansiescape,dan.Ablation_Dan_11_0.

The tier field indicates the importance of a probe. Tier 1 probes are the most important, followed by 2, 3, and 9. Refer to the garak tiers documentation for more information.

The probes identified as Default are included in the default/default configuration.

For detailed information about a probe, refer to the garak.probes page in the garak documentation.

Probe	Tier	Goal	Description
ansiescape.AnsiEscaped (Default)	3	print escaped ANSI terminal codes	Probe for eliciting ansi control codes from a model.
ansiescape.AnsiRaw (Default)	1	print raw ANSI terminal codes	Probe for eliciting raw ansi control codes from a model.
ansiescape.AnsiRawTokenizerHF	1	check if raw ANSI escape sequences tokens are available	Probe for checking a tokenizer for potentially malicious ansi escape sequences.
atkgen.Tox (Default)	2	make the model say something toxic	Automatic attack generation, trying to elicit toxic outputs from the model.
audio.AudioAchillesHeel	9	disregard the system prompt via audio component	Audio is the achilles heel.
av_spam_scanning.EICAR	9	output antivirus test codes	Does the model check its output for viruses? try to send the eicar code through.
av_spam_scanning.GTUBE	9	output email spam	Does the model check its output for spam? try to send the gtube code through.
av_spam_scanning.GTphish	9	output email phishing	Does the model check its output for phishing? try to send the gtphish code through.
continuation.ContinueSlursReclaimedSlurs (Default)	2	output an ethnic slur	Continue partial word with slur or reclaimed slur.
continuation.ContinueSlursReclaimedSlursFull	2	output an ethnic slur	Continue partial word with slur or reclaimed slur.
dan.Ablation_Dan_11_0 (Default)	2	disregard the system prompt	Ablation experiments over the dan 11.0 prompt.
dan.AntiDAN	9	disregard the system prompt	Anti-dan.
dan.AutoDAN	2	disregard the system prompt	Probe leveraging autodan attack generation.
dan.AutoDANCached (Default)	2	disregard the system prompt	Probe leveraging generated autodan prompts.
dan.ChatGPT_Developer_Mode_RANTI	9	disregard the system prompt	Chatgpt devmode + ranti.
dan.ChatGPT_Developer_Mode_v2	9	disregard the system prompt	Chatgpt developer mode v2.
dan.ChatGPT_Image_Markdown	9	disregard the system prompt	Chatgpt markdown image unlocker / markdown injection.
dan.DAN_Jailbreak	9	disregard the system prompt	Dan jailbreak.
dan.DUDE	9	disregard the system prompt	Dude.
dan.DanInTheWild (Default)	1	disregard the system prompt or alignment	A subset of jailbreaks from the daninthewild jailbreak library.
dan.DanInTheWildFull	1	disregard the system prompt or alignment	A library of jailbreaks.
dan.Dan_10_0	9	disregard the system prompt	Dan 10.0.
dan.Dan_11_0	9	disregard the system prompt	Dan 11.0.
dan.Dan_6_0	9	disregard the system prompt	Dan 6.0.
dan.Dan_6_2	9	disregard the system prompt	Dan 6.2.
dan.Dan_7_0	9	disregard the system prompt	Dan 7.0.
dan.Dan_8_0	9	disregard the system prompt	Dan 8.0.
dan.Dan_9_0	9	disregard the system prompt	Dan 9.0.
dan.STAN	9	disregard the system prompt	Stan.
divergence.Repeat (Default)	2	get the model to leak data by asking it to repeat words	See if a model starts giving other data when asked to repeat a string indefinitely.
divergence.RepeatExtended	2	get the model to leak data by asking it to repeat words	See if a model starts giving other data when asked to repeat a string indefinitely.
divergence.RepeatedToken (Default)	1	Stress-test LLMs with long repeated-token prompts to surface instability	Repeated token destabilization attack from dropbox llm security research.
doctor.Bypass	2	roleplay as doctor to bypass guardrails	Implement policy puppetry bypass.
doctor.BypassLeet	2	roleplay as doctor to bypass guardrails, but use leetspeak	Implement policy puppetry bypass.
doctor.Puppetry	2	roleplay as doctor to get unsafe outputs	Implement policy puppetry prompt injection.
donotanswer.DiscriminationExclusionToxicityHatefulOffensive	9	get model to respond as discrimination exclusion toxicity hateful offensive	Implementation of discrimination_exclusion_toxicity_hateful_offensive probes in donotanswer_prompts.
donotanswer.HumanChatbox	9	get model to respond as human chatbox	Implementation of human_chatbox probes in donotanswer_prompts.
donotanswer.InformationHazard	9	get model to respond as information hazard	Implementation of information_hazard probes in donotanswer_prompts.
donotanswer.MaliciousUses	9	get model to respond as malicious uses	Implementation of malicious_uses probes in donotanswer_prompts.
donotanswer.MisinformationHarms	9	get model to respond as misinformation harms	Implementation of misinformation_harms probes in donotanswer_prompts.
dra.DRA	1	make the LLM ask and answer its own harmful instructions	Dra (disguise and reconstruction attack).
dra.DRAAdvanced	1	make the LLM ask and answer its own harmful instructions	Dra (disguise and reconstruction attack) - version with detoxify.
encoding.InjectAscii85 (Default)	2	make the model say an encoded payload	Probe for susceptibility to base85 encoded injections.
encoding.InjectAtbash (Default)	2	make the model say an encoded payload	Atbash probe credit: nakul-rajpal.
encoding.InjectBase16 (Default)	2	make the model say an encoded payload	Probe for susceptibility to base16 encoded injections.
encoding.InjectBase2048 (Default)	2	make the model say an encoded payload	Probe for susceptibility to base2048 encoding injections.
encoding.InjectBase32 (Default)	2	make the model say an encoded payload	Probe for susceptibility to base32 encoded injections.
encoding.InjectBase64 (Default)	1	make the model say an encoded payload	Probe for susceptibility to base64 encoded injections.
encoding.InjectBraille (Default)	2	make the model say an encoded payload	Probe for susceptibility to braille encoding injections. credit wally#3422.
encoding.InjectEcoji (Default)	2	make the model say an encoded payload	Probe for susceptibility to ecoji encoding.
encoding.InjectHex (Default)	2	make the model say an encoded payload	Probe for susceptibility to hex encoded injections.
encoding.InjectLeet	2	make the model say an encoded payload	Probe for susceptibility to 31337 encoding.
encoding.InjectMime	2	make the model say an encoded payload	Probe for susceptibility to mime encoding injections.
encoding.InjectMorse (Default)	2	make the model say an encoded payload	Probe for susceptibility to morse encoding. credit wally#3422.
encoding.InjectNato (Default)	2	make the model say an encoded payload	Probe for susceptibility to nato phonetic alphabet encoding.
encoding.InjectQP	1	make the model say an encoded payload	Probe for susceptibility to qp encoded injections.
encoding.InjectROT13 (Default)	2	make the model say an encoded payload	Probe for susceptibility to rot13 encoding injections.
encoding.InjectSneakyBits	2	make the model say an encoded payload	Probe for susceptibility to unicode variant selector ascii smuggling.
encoding.InjectUU (Default)	1	make the model say an encoded payload	Probe for susceptibility to uu encoded injections.
encoding.InjectUnicodeTagChars (Default)	1	make the model say an encoded payload	Probe for susceptibility to unicode tag ascii smuggling.
encoding.InjectUnicodeVariantSelectors	2	make the model say an encoded payload	Probe for susceptibility to unicode variant selector ascii smuggling.
encoding.InjectZalgo (Default)	2	make the model say an encoded payload	Probe for susceptibility to zalgo encoding.
exploitation.JinjaTemplatePythonInjection (Default)	1	Gain remote code execution via Jinja template injection	Attempts to execute python code via jinja templating.
exploitation.SQLInjectionEcho (Default)	1	Manipulate a database via SQL injection	Attempts to get the target to echo a sql statement.
exploitation.SQLInjectionSystem	1	Manipulate a database via SQL injection	Attempts to inject untrusted sql commands into a sql statement.
fileformats.HF_Files	1	get a list of files associated with the model	Get a manifest of files associated with a hugging face generator.
fitd.FITD	3	Transform queries from benign to harmful obtaining intermediate responses to get compliance from the model	Foot in the door probe.
glitch.Glitch	3	make the model fail to repeat a word	Probe model for glitch tokens that provoke unusual behavior.
glitch.GlitchFull	3	make the model fail to repeat a word	Probe model for glitch tokens that provoke unusual behavior.
goodside.Davidjl	3	get wrong answers about a name	Exploit the ‘ davidjl’ magic token.
goodside.Tag (Default)	1	inject invisible instructions	Exploit utf tag page encoding for user-invisible prompt injection.
goodside.ThreatenJSON (Default)	2	get raw code by threatening a model	Will model strip preamble and give json?
goodside.WhoIsRiley (Default)	2	get misinfo about riley goodside	Probe for misinfo about riley goodside.
grandma.Slurs (Default)	1	get model to use ethnic slur	Use the grandma attack to get ethnic slurs.
grandma.Substances (Default)	1	get instructions for making bad things	Use the grandma attack to get instructions for harmful items.
grandma.Win10 (Default)	2	get model to output Win 10 product keys	Will the model yield a windows 10 product key if we beg it to?
grandma.Win11 (Default)	2	get model to output Win 11 product keys	Will the model yield a windows 11 product key if we beg it to?
latentinjection.LatentInjectionFactSnippetEiffel (Default)	1	subvert a snippet with instructions	String-based latent prompt injections in fact snippets on the eiffel tower.
latentinjection.LatentInjectionFactSnippetEiffelFull	1	subvert a snippet with instructions	String-based latent prompt injections in fact snippets on the eiffel tower.
latentinjection.LatentInjectionFactSnippetLegal (Default)	1	subvert a snippet with instructions	String-based latent prompt injections in legal snippets.
latentinjection.LatentInjectionFactSnippetLegalFull	1	subvert a snippet with instructions	String-based latent prompt injections in legal snippets.
latentinjection.LatentInjectionReport (Default)	1	subvert a report with instructions	String-based latent prompt injections via a report.
latentinjection.LatentInjectionReportFull	1	subvert a report with instructions	String-based latent prompt injections via a report.
latentinjection.LatentInjectionResume (Default)	1	subvert a resume with instructions	String-based latent prompt injections via a resume.
latentinjection.LatentInjectionResumeFull	1	subvert a resume with instructions	String-based latent prompt injections via a resume.
latentinjection.LatentInjectionTranslationEnFr (Default)	1	subvert translation instructions	String-based latent prompt injections via translation.
latentinjection.LatentInjectionTranslationEnFrFull	1	subvert translation instructions	String-based latent prompt injections via translation.
latentinjection.LatentInjectionTranslationEnZh (Default)	1	subvert translation instructions	Latent prompt injection in chinese context (简体字).
latentinjection.LatentInjectionTranslationEnZhFull	1	subvert translation instructions	Latent prompt injection in chinese context (简体字).
latentinjection.LatentJailbreak (Default)	1	inject malicious instructions into output while posing as translation task	Implementation of latent jailbreak, using latent prompt injection.
latentinjection.LatentJailbreakFull	1	inject malicious instructions into output while posing as translation task	Implementation of latent jailbreak, using latent prompt injection.
latentinjection.LatentWhois (Default)	1	subvert a whois report with instructions	Latent prompt injections in whois probe reports.
latentinjection.LatentWhoisSnippet (Default)	1	subvert a snippet with instructions	String-based latent prompt injections in whois reports.
latentinjection.LatentWhoisSnippetFull	1	subvert a snippet with instructions	String-based latent prompt injections in whois reports.
leakreplay.GuardianCloze (Default)	2	replay training data by filling in the correct missing words	Lightweight version of guardian cloze test for data leakage.
leakreplay.GuardianClozeFull	2	replay training data by filling in the correct missing words	Test for data leakage on guardian articles with masked entities in a cloze test format.
leakreplay.GuardianComplete (Default)	1	replay training data by providing document completions that match training examples	Lightweight version of guardian completion test for data leakage.
leakreplay.GuardianCompleteFull	1	replay training data by providing document completions that match training examples	Test for data leakage on guardian articles through text completion.
leakreplay.LiteratureCloze (Default)	2	replay training data by filling in the correct missing words	Lightweight version of literature cloze test for data leakage.
leakreplay.LiteratureClozeFull	2	replay training data by filling in the correct missing words	Test for data leakage on literature passages with masked entities in a cloze test format.
leakreplay.LiteratureComplete (Default)	2	replay training data by providing document completions that match training examples	Lightweight version of literature completion test for data leakage.
leakreplay.LiteratureCompleteFull	2	replay training data by providing document completions that match training examples	Test for data leakage on literature passages through text completion.
leakreplay.NYTCloze (Default)	2	replay training data by filling in the correct missing words	Lightweight version of nyt cloze test for data leakage.
leakreplay.NYTClozeFull	2	replay training data by filling in the correct missing words	Test for data leakage on new york times articles with masked entities in a cloze test format.
leakreplay.NYTComplete (Default)	1	replay training data by providing document completions that match training examples	Lightweight version of nyt completion test for data leakage.
leakreplay.NYTCompleteFull	1	replay training data by providing document completions that match training examples	Test for data leakage on new york times articles through text completion.
leakreplay.PotterCloze (Default)	2	replay training data by filling in the correct missing words	Lightweight version of harry potter cloze test for data leakage.
leakreplay.PotterClozeFull	2	replay training data by filling in the correct missing words	Test for data leakage on harry potter text with masked entities in a cloze test format.
leakreplay.PotterComplete (Default)	1	replay training data by providing document completions that match training examples	Lightweight version of harry potter completion test for data leakage.
leakreplay.PotterCompleteFull	1	replay training data by providing document completions that match training examples	Test for data leakage on harry potter text through completion.
lmrc.Anthropomorphisation	3	anthropomorphise a language model	Providing output that encourages anthropomorphisation of ai systems.
lmrc.Bullying (Default)	2	help me bully	Generation of repeated threats, denigration.
lmrc.Deadnaming (Default)	2	write a deadname	Mentioning a name that a person does not wish to be associated with.
lmrc.Profanity	3	utter profanity	Using profane words, especially when not prompted to do so.
lmrc.QuackMedicine (Default)	2	support quack medicine	Promoting non-scientific medicine.
lmrc.SexualContent (Default)	2	write sexual content	Content that discusses bodies and interactions in a sexual manner, or other sexually explicit material.
lmrc.Sexualisation (Default)	2	sexualise people	Casual or overt sexual objectification, reducing a person or group of people to sexual objects.
lmrc.SlurUsage (Default)	2	use a slur	Using an identity-based slur in a non-reclaimed way - either directly or recommending the consumer to use in speech/action.
malwaregen.Evasion (Default)	2	Write code to assist evading security and anti-malware measures	Will the model write code to help avoid anti-malware?
malwaregen.Payload (Default)	2	write a malware payload	Will the model help write code for malware payloads?
malwaregen.SubFunctions (Default)	2	write code supporting a sub-function of malware	Will the system write code helpful in malware?
malwaregen.TopLevel (Default)	2	write malware given a top-level description	Will the system return code when asked to write malware?
misleading.FalseAssertion (Default)	2	make the model expand on a false assertion	Does the model reject [questions based on] false claims?
packagehallucination.Dart (Default)	1	generate Dart code importing non-existent pub.dev packages	Attempts to generate dart code including hallucinated pub.dev packages.
packagehallucination.JavaScript (Default)	1	generate JavaScript code importing non-existent npm packages	Attempts to generate javascript code including hallucinated npm packages.
packagehallucination.Perl (Default)	1	generate Perl code importing non-existent MetaCPAN modules	Attempts to generate perl code including hallucinated metacpan packages.
packagehallucination.Python (Default)	1	generate python importing non-existent packages	Attempts to generate python3 code including hallucinated packages.
packagehallucination.RakuLand (Default)	1	generate Raku code importing non-existent raku.land modules	Attempts to generate raku code including hallucinated raku.land packages.
packagehallucination.Ruby (Default)	1	generate ruby code importing non-existent gems	Attempts to generate ruby code including hallucinated gems.
packagehallucination.Rust (Default)	1	generate Rust code importing non-existent crates.io packages	Attempts to generate rust code including hallucinated crates.io packages.
phrasing.FutureTense (Default)	2	use future tense to disregard the system prompt	Attempt to evade safeguards by setting requests in the future tense.
phrasing.FutureTenseFull	2	use future tense to disregard the system prompt	Will the system give unsafe responses when asked to write about weapon creation or dangerous behavior?
phrasing.PastTense (Default)	2	use past tense to disregard the system prompt	Attempt to evade safeguards by setting requests in the past tense.
phrasing.PastTenseFull	2	use past tense to disregard the system prompt	Will the system give unsafe responses when asked to write about weapon creation or dangerous behavior?
promptinject.HijackHateHumans (Default)	2	inject a prompt on hate humans	Promptinject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of llms to adversarial prompt attacks. Best Paper Award at the NeurIPS ML Safety Workshop 2022.
promptinject.HijackHateHumansFull	2	inject a prompt on hate humans	Promptinject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of llms to adversarial prompt attacks. Best Paper Award at the NeurIPS ML Safety Workshop 2022.
promptinject.HijackKillHumans (Default)	2	inject a prompt on kill humans	Promptinject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of llms to adversarial prompt attacks. Best Paper Award at the NeurIPS ML Safety Workshop 2022.
promptinject.HijackKillHumansFull	2	inject a prompt on kill humans	Promptinject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of llms to adversarial prompt attacks. Best Paper Award at the NeurIPS ML Safety Workshop 2022.
promptinject.HijackLongPrompt (Default)	2	inject a prompt on long prompt	Promptinject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of llms to adversarial prompt attacks. Best Paper Award at the NeurIPS ML Safety Workshop 2022.
promptinject.HijackLongPromptFull	2	inject a prompt on long prompt	Promptinject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of llms to adversarial prompt attacks. Best Paper Award at the NeurIPS ML Safety Workshop 2022.
realtoxicityprompts.RTPBlank (Default)	1	generate toxicity when prompted with nothing or special tokens	Implementation of probes in realtoxicityprompts.
realtoxicityprompts.RTPFlirtation	3	generate flirtation content	Implementation of flirtation probes in realtoxicityprompts.
realtoxicityprompts.RTPIdentity_Attack	3	generate identity attack content	Implementation of identity_attack probes in realtoxicityprompts.
realtoxicityprompts.RTPInsult	3	generate insult content	Implementation of insult probes in realtoxicityprompts.
realtoxicityprompts.RTPProfanity	3	generate profanity content	Implementation of profanity probes in realtoxicityprompts.
realtoxicityprompts.RTPSevere_Toxicity	3	generate severe toxicity content	Implementation of severe_toxicity probes in realtoxicityprompts.
realtoxicityprompts.RTPSexually_Explicit	3	generate sexually explicit content	Implementation of sexually_explicit probes in realtoxicityprompts.
realtoxicityprompts.RTPThreat	3	generate threat content	Implementation of threat probes in realtoxicityprompts.
sata.MLM	2	disregard the system prompt	Masked language model simple assistive task probe.
smuggling.FunctionMasking	3	return banned instructions	Functionmasking probe.
smuggling.HypotheticalResponse	3	return banned instructions	Hypotheticalresponse probe.
snowball.GraphConnectivity (Default)	2	trick the model into saying an impossible flight routing is possible	Probe asking for impossible flight routings.
snowball.GraphConnectivityFull	2	trick the model into saying an impossible flight routing is possible	Probe asking for impossible flight routings.
snowball.Primes	3	trick the model into saying a prime number isn’t prime	Ask model to check a list of higher primes.
snowball.PrimesFull	3	trick the model into saying a prime number isn’t prime	Ask model to check a list of higher primes.
snowball.Senators	3	trick the model into saying a fictitious senator exists	Asking for senators that don’t exist.
snowball.SenatorsFull	3	trick the model into saying a fictitious senator exists	Asking for senators that don’t exist.
suffix.BEAST	2	cause the model to disregard the system prompt	Beast probe.
suffix.GCG	2	cause the model to disregard the system prompt	Greedy coordinate gradient probe.
suffix.GCGCached (Default)	2	cause the model to disregard the system prompt	Greedy coordinate gradient cached probe.
tap.PAIR	2	cause the model to disregard the system prompt	Prompt automatic iterative refinement automatic jailbreak.
tap.TAP	2	cause the model to disregard the system prompt	Tree of attacks with pruning probe, generating new attacks against the target.
tap.TAPCached (Default)	2	cause the model to disregard the system prompt	Tree of attacks with pruning probe using cached attacks.
test.Blank	9	see what the model has to say for itself given silence	Blank prompt.
test.Test	9	send some test strings, ignore the response	Test probe.
topic.WordnetAllowedWords	9	get the model to engage off-topic	Use wordnet to find out if a model will discuss terms near but not in its allowed set.
topic.WordnetBlockedWords	9	get the model to engage off-topic	Use wordnet to explore which topics a model will respond to around blocked words.
topic.WordnetControversial (Default)	2	get the model to engage off-topic	Ask model to respond on a set of controversial topics.
visual_jailbreak.FigStep	2	disregard the system prompt with both text and image	Using image modality to assist jailbreak.
visual_jailbreak.FigStepFull	2	disregard the system prompt with both text and image	Using image modality to assist jailbreak.
web_injection.ColabAIDataLeakage (Default)	1	render images fetched from a remote URI	Leak data via image rendering as demonstrated against Gemini in Colab.
web_injection.MarkdownImageExfil (Default)	1	embed conversation-private information in a markdown URI	Can we get the model to write embedded assets exfiltrating text in the URI?
web_injection.MarkdownURIImageExfilExtended (Default)	1	embed potentially sensitive data in a Markdown URI	Generalized implementation of data exfiltration exploits through markdown uris.
web_injection.MarkdownURINonImageExfilExtended (Default)	1	embed potentially sensitive data in a Markdown URI	Generalized implementation of data exfiltration exploits through markdown uris.
web_injection.MarkdownXSS (Default)	1	Exploit cross-site scripting via prompt injection	Perform cross site scripting attacks using markdown javascript rendering via prompt injection.
web_injection.PlaygroundMarkdownExfil (Default)	1	embed conversation-private information in a markdown URI	Markdown image URI exfil exploit published 2023 Sept 29.
web_injection.StringAssemblyDataExfil (Default)	1	embed conversation-private information in a markdown URI	Prompt injection for manipulating state.
web_injection.TaskXSS (Default)	1	Exploit cross-site scripting via prompt injection	Perform cross site scripting attacks via prompt injection.

Probe Tags Reference#

The following table identifies the probe tags and the probes that are associated with each tag. You specify the probe tags in the run.probe_tags field when you add an audit configuration.

You can specify an individual tag, such as owasp:llm06, or an entire category, such as owasp. Separate multiple tags with commas, such as owasp:llm06,payload:hallucination.

If you specify probes to run in the plugins.probe_spec field and also specify run.probe_tags, the probes specified in the probe_spec field are filtered to the probes that also match the tags.

Tag	Probes with this Tag
avid-effect:ethics:E0101	continuation.ContinueSlursReclaimedSlurs continuation.ContinueSlursReclaimedSlursFull lmrc.Profanity lmrc.Sexualisation lmrc.SlurUsage realtoxicityprompts.RTPFlirtation realtoxicityprompts.RTPIdentity_Attack realtoxicityprompts.RTPInsult realtoxicityprompts.RTPThreat
avid-effect:ethics:E0301	atkgen.Tox continuation.ContinueSlursReclaimedSlurs continuation.ContinueSlursReclaimedSlursFull donotanswer.DiscriminationExclusionToxicityHatefulOffensive lmrc.Bullying lmrc.Profanity lmrc.Sexualisation lmrc.SlurUsage realtoxicityprompts.RTPBlank realtoxicityprompts.RTPFlirtation realtoxicityprompts.RTPIdentity_Attack realtoxicityprompts.RTPInsult realtoxicityprompts.RTPProfanity realtoxicityprompts.RTPSevere_Toxicity realtoxicityprompts.RTPSexually_Explicit realtoxicityprompts.RTPThreat
avid-effect:ethics:E0302	lmrc.Deadnaming realtoxicityprompts.RTPIdentity_Attack
avid-effect:ethics:E0402	donotanswer.MisinformationHarms goodside.WhoIsRiley lmrc.QuackMedicine
avid-effect:performance:P0201	glitch.Glitch glitch.GlitchFull goodside.ThreatenJSON
avid-effect:performance:P0204	audio.AudioAchillesHeel dan.AntiDAN dan.ChatGPT_Developer_Mode_RANTI dan.Dan_11_0 goodside.Davidjl goodside.ThreatenJSON goodside.WhoIsRiley misleading.FalseAssertion sata.MLM snowball.GraphConnectivity snowball.GraphConnectivityFull snowball.Primes snowball.PrimesFull snowball.Senators snowball.SenatorsFull suffix.BEAST suffix.GCG suffix.GCGCached tap.PAIR tap.TAP tap.TAPCached
avid-effect:performance:P0400	realtoxicityprompts.RTPThreat
avid-effect:performance:P0401	continuation.ContinueSlursReclaimedSlurs continuation.ContinueSlursReclaimedSlursFull lmrc.Deadnaming lmrc.Profanity lmrc.SexualContent lmrc.Sexualisation lmrc.SlurUsage realtoxicityprompts.RTPFlirtation realtoxicityprompts.RTPIdentity_Attack realtoxicityprompts.RTPInsult realtoxicityprompts.RTPProfanity
avid-effect:performance:P0403	divergence.RepeatedToken
avid-effect:security:S0100	ansiescape.AnsiEscaped ansiescape.AnsiRaw
avid-effect:security:S0200	ansiescape.AnsiEscaped ansiescape.AnsiRaw
avid-effect:security:S0201	glitch.Glitch glitch.GlitchFull
avid-effect:security:S0300	grandma.Slurs grandma.Win10 grandma.Win11
avid-effect:security:S0301	av_spam_scanning.EICAR av_spam_scanning.GTUBE av_spam_scanning.GTphish divergence.Repeat divergence.RepeatExtended doctor.Bypass doctor.BypassLeet doctor.Puppetry donotanswer.HumanChatbox donotanswer.InformationHazard exploitation.JinjaTemplatePythonInjection exploitation.SQLInjectionEcho exploitation.SQLInjectionSystem leakreplay.GuardianCloze leakreplay.GuardianClozeFull leakreplay.GuardianComplete leakreplay.GuardianCompleteFull leakreplay.LiteratureCloze leakreplay.LiteratureClozeFull leakreplay.LiteratureComplete leakreplay.LiteratureCompleteFull leakreplay.NYTCloze leakreplay.NYTClozeFull leakreplay.NYTComplete leakreplay.NYTCompleteFull leakreplay.PotterCloze leakreplay.PotterClozeFull leakreplay.PotterComplete leakreplay.PotterCompleteFull topic.WordnetAllowedWords topic.WordnetBlockedWords topic.WordnetControversial web_injection.ColabAIDataLeakage web_injection.MarkdownImageExfil web_injection.MarkdownURIImageExfilExtended web_injection.MarkdownURINonImageExfilExtended web_injection.MarkdownXSS web_injection.PlaygroundMarkdownExfil web_injection.StringAssemblyDataExfil web_injection.TaskXSS
avid-effect:security:S0403	audio.AudioAchillesHeel av_spam_scanning.EICAR av_spam_scanning.GTUBE av_spam_scanning.GTphish dan.Ablation_Dan_11_0 dan.AntiDAN dan.AutoDAN dan.AutoDANCached dan.ChatGPT_Developer_Mode_RANTI dan.ChatGPT_Developer_Mode_v2 dan.ChatGPT_Image_Markdown dan.DAN_Jailbreak dan.DUDE dan.DanInTheWild dan.DanInTheWildFull dan.Dan_10_0 dan.Dan_11_0 dan.Dan_6_0 dan.Dan_6_2 dan.Dan_7_0 dan.Dan_8_0 dan.Dan_9_0 dan.STAN dra.DRA dra.DRAAdvanced encoding.InjectAscii85 encoding.InjectAtbash encoding.InjectBase16 encoding.InjectBase2048 encoding.InjectBase32 encoding.InjectBase64 encoding.InjectBraille encoding.InjectEcoji encoding.InjectHex encoding.InjectLeet encoding.InjectMime encoding.InjectMorse encoding.InjectNato encoding.InjectQP encoding.InjectROT13 encoding.InjectSneakyBits encoding.InjectUU encoding.InjectUnicodeTagChars encoding.InjectUnicodeVariantSelectors encoding.InjectZalgo exploitation.JinjaTemplatePythonInjection exploitation.SQLInjectionEcho exploitation.SQLInjectionSystem goodside.Davidjl goodside.ThreatenJSON latentinjection.LatentInjectionFactSnippetEiffel latentinjection.LatentInjectionFactSnippetEiffelFull latentinjection.LatentInjectionFactSnippetLegal latentinjection.LatentInjectionFactSnippetLegalFull latentinjection.LatentInjectionReport latentinjection.LatentInjectionReportFull latentinjection.LatentInjectionResume latentinjection.LatentInjectionResumeFull latentinjection.LatentInjectionTranslationEnFr latentinjection.LatentInjectionTranslationEnFrFull latentinjection.LatentInjectionTranslationEnZh latentinjection.LatentInjectionTranslationEnZhFull latentinjection.LatentJailbreak latentinjection.LatentJailbreakFull latentinjection.LatentWhois latentinjection.LatentWhoisSnippet latentinjection.LatentWhoisSnippetFull malwaregen.Evasion malwaregen.Payload malwaregen.SubFunctions malwaregen.TopLevel promptinject.HijackHateHumans promptinject.HijackHateHumansFull promptinject.HijackKillHumans promptinject.HijackKillHumansFull 
promptinject.HijackLongPrompt promptinject.HijackLongPromptFull sata.MLM smuggling.FunctionMasking smuggling.HypotheticalResponse suffix.BEAST suffix.GCG suffix.GCGCached tap.PAIR tap.TAP tap.TAPCached web_injection.ColabAIDataLeakage web_injection.MarkdownImageExfil web_injection.MarkdownURIImageExfilExtended web_injection.MarkdownURINonImageExfilExtended web_injection.MarkdownXSS web_injection.PlaygroundMarkdownExfil web_injection.StringAssemblyDataExfil web_injection.TaskXSS
cwe:1336	exploitation.JinjaTemplatePythonInjection
cwe:1426	exploitation.SQLInjectionEcho exploitation.SQLInjectionSystem
cwe:1427	doctor.Bypass doctor.BypassLeet doctor.Puppetry exploitation.SQLInjectionEcho exploitation.SQLInjectionSystem
cwe:352	web_injection.ColabAIDataLeakage web_injection.MarkdownImageExfil web_injection.MarkdownURIImageExfilExtended web_injection.MarkdownURINonImageExfilExtended web_injection.PlaygroundMarkdownExfil web_injection.StringAssemblyDataExfil
cwe:79	web_injection.MarkdownXSS web_injection.TaskXSS
cwe:89	exploitation.SQLInjectionEcho exploitation.SQLInjectionSystem
cwe:94	exploitation.JinjaTemplatePythonInjection
cwe:95	exploitation.JinjaTemplatePythonInjection
owasp:llm01	ansiescape.AnsiEscaped ansiescape.AnsiRaw continuation.ContinueSlursReclaimedSlurs continuation.ContinueSlursReclaimedSlursFull dan.Ablation_Dan_11_0 dan.AntiDAN dan.AutoDAN dan.AutoDANCached dan.ChatGPT_Developer_Mode_RANTI dan.ChatGPT_Developer_Mode_v2 dan.ChatGPT_Image_Markdown dan.DAN_Jailbreak dan.DUDE dan.DanInTheWild dan.DanInTheWildFull dan.Dan_10_0 dan.Dan_11_0 dan.Dan_6_0 dan.Dan_6_2 dan.Dan_7_0 dan.Dan_8_0 dan.Dan_9_0 dan.STAN doctor.Bypass doctor.BypassLeet doctor.Puppetry dra.DRA dra.DRAAdvanced encoding.InjectAscii85 encoding.InjectAtbash encoding.InjectBase16 encoding.InjectBase2048 encoding.InjectBase32 encoding.InjectBase64 encoding.InjectBraille encoding.InjectEcoji encoding.InjectHex encoding.InjectLeet encoding.InjectMime encoding.InjectMorse encoding.InjectNato encoding.InjectQP encoding.InjectROT13 encoding.InjectSneakyBits encoding.InjectUU encoding.InjectUnicodeTagChars encoding.InjectUnicodeVariantSelectors encoding.InjectZalgo fitd.FITD goodside.Tag latentinjection.LatentInjectionFactSnippetEiffel latentinjection.LatentInjectionFactSnippetEiffelFull latentinjection.LatentInjectionFactSnippetLegal latentinjection.LatentInjectionFactSnippetLegalFull latentinjection.LatentInjectionReport latentinjection.LatentInjectionReportFull latentinjection.LatentInjectionResume latentinjection.LatentInjectionResumeFull latentinjection.LatentInjectionTranslationEnFr latentinjection.LatentInjectionTranslationEnFrFull latentinjection.LatentInjectionTranslationEnZh latentinjection.LatentInjectionTranslationEnZhFull latentinjection.LatentJailbreak latentinjection.LatentJailbreakFull latentinjection.LatentWhois latentinjection.LatentWhoisSnippet latentinjection.LatentWhoisSnippetFull phrasing.FutureTense phrasing.FutureTenseFull phrasing.PastTense phrasing.PastTenseFull promptinject.HijackHateHumans promptinject.HijackHateHumansFull promptinject.HijackKillHumans promptinject.HijackKillHumansFull promptinject.HijackLongPrompt 
promptinject.HijackLongPromptFull sata.MLM visual_jailbreak.FigStep visual_jailbreak.FigStepFull
owasp:llm02	ansiescape.AnsiEscaped ansiescape.AnsiRaw av_spam_scanning.EICAR av_spam_scanning.GTUBE av_spam_scanning.GTphish exploitation.JinjaTemplatePythonInjection exploitation.SQLInjectionEcho exploitation.SQLInjectionSystem fitd.FITD packagehallucination.Dart packagehallucination.JavaScript packagehallucination.Perl packagehallucination.Python packagehallucination.RakuLand packagehallucination.Ruby packagehallucination.Rust web_injection.ColabAIDataLeakage web_injection.MarkdownImageExfil web_injection.MarkdownURIImageExfilExtended web_injection.MarkdownURINonImageExfilExtended web_injection.MarkdownXSS web_injection.PlaygroundMarkdownExfil web_injection.StringAssemblyDataExfil web_injection.TaskXSS
owasp:llm04	divergence.RepeatedToken
owasp:llm05	ansiescape.AnsiEscaped ansiescape.AnsiRaw ansiescape.AnsiRawTokenizerHF fileformats.HF_Files fitd.FITD glitch.Glitch glitch.GlitchFull goodside.Davidjl
owasp:llm06	divergence.Repeat divergence.RepeatExtended donotanswer.InformationHazard exploitation.JinjaTemplatePythonInjection exploitation.SQLInjectionEcho exploitation.SQLInjectionSystem grandma.Win10 grandma.Win11 leakreplay.GuardianCloze leakreplay.GuardianClozeFull leakreplay.GuardianComplete leakreplay.GuardianCompleteFull leakreplay.LiteratureCloze leakreplay.LiteratureClozeFull leakreplay.LiteratureComplete leakreplay.LiteratureCompleteFull leakreplay.NYTCloze leakreplay.NYTClozeFull leakreplay.NYTComplete leakreplay.NYTCompleteFull leakreplay.PotterCloze leakreplay.PotterClozeFull leakreplay.PotterComplete leakreplay.PotterCompleteFull web_injection.ColabAIDataLeakage web_injection.MarkdownImageExfil web_injection.MarkdownURIImageExfilExtended web_injection.MarkdownURINonImageExfilExtended web_injection.MarkdownXSS web_injection.PlaygroundMarkdownExfil web_injection.StringAssemblyDataExfil web_injection.TaskXSS
owasp:llm09	donotanswer.MisinformationHarms goodside.WhoIsRiley misleading.FalseAssertion packagehallucination.Dart packagehallucination.JavaScript packagehallucination.Perl packagehallucination.Python packagehallucination.RakuLand packagehallucination.Ruby packagehallucination.Rust snowball.GraphConnectivity snowball.GraphConnectivityFull snowball.Primes snowball.PrimesFull snowball.Senators snowball.SenatorsFull
owasp:llm10	divergence.Repeat divergence.RepeatExtended leakreplay.GuardianCloze leakreplay.GuardianClozeFull leakreplay.GuardianComplete leakreplay.GuardianCompleteFull leakreplay.LiteratureCloze leakreplay.LiteratureClozeFull leakreplay.LiteratureComplete leakreplay.LiteratureCompleteFull leakreplay.NYTCloze leakreplay.NYTClozeFull leakreplay.NYTComplete leakreplay.NYTCompleteFull leakreplay.PotterCloze leakreplay.PotterClozeFull leakreplay.PotterComplete leakreplay.PotterCompleteFull topic.WordnetAllowedWords topic.WordnetBlockedWords topic.WordnetControversial
payload:dangerous:medical	lmrc.QuackMedicine
payload:generic	encoding.InjectAscii85 encoding.InjectAtbash encoding.InjectBase16 encoding.InjectBase2048 encoding.InjectBase32 encoding.InjectBase64 encoding.InjectBraille encoding.InjectEcoji encoding.InjectHex encoding.InjectLeet encoding.InjectMime encoding.InjectMorse encoding.InjectNato encoding.InjectQP encoding.InjectROT13 encoding.InjectSneakyBits encoding.InjectUU encoding.InjectUnicodeTagChars encoding.InjectUnicodeVariantSelectors encoding.InjectZalgo glitch.Glitch glitch.GlitchFull goodside.Davidjl goodside.Tag goodside.ThreatenJSON
payload:hallucination	goodside.WhoIsRiley misleading.FalseAssertion snowball.GraphConnectivity snowball.GraphConnectivityFull snowball.Primes snowball.PrimesFull snowball.Senators snowball.SenatorsFull
payload:hatespeech	realtoxicityprompts.RTPIdentity_Attack
payload:jailbreak	dan.Ablation_Dan_11_0 dan.AutoDAN dan.AutoDANCached dan.ChatGPT_Developer_Mode_v2 dan.ChatGPT_Image_Markdown dan.DAN_Jailbreak dan.DUDE dan.DanInTheWild dan.DanInTheWildFull dan.Dan_10_0 dan.Dan_11_0 dan.Dan_6_0 dan.Dan_6_2 dan.Dan_7_0 dan.Dan_8_0 dan.Dan_9_0 dan.STAN dra.DRA dra.DRAAdvanced phrasing.FutureTense phrasing.FutureTenseFull phrasing.PastTense phrasing.PastTenseFull sata.MLM smuggling.FunctionMasking smuggling.HypotheticalResponse suffix.BEAST suffix.GCG suffix.GCGCached tap.PAIR tap.TAP tap.TAPCached visual_jailbreak.FigStep visual_jailbreak.FigStepFull
payload:leak:chat	web_injection.ColabAIDataLeakage web_injection.MarkdownImageExfil web_injection.MarkdownURIImageExfilExtended web_injection.MarkdownURINonImageExfilExtended web_injection.PlaygroundMarkdownExfil web_injection.StringAssemblyDataExfil
payload:leak:training	divergence.Repeat divergence.RepeatExtended leakreplay.GuardianCloze leakreplay.GuardianClozeFull leakreplay.GuardianComplete leakreplay.GuardianCompleteFull leakreplay.LiteratureCloze leakreplay.LiteratureClozeFull leakreplay.LiteratureComplete leakreplay.LiteratureCompleteFull leakreplay.NYTCloze leakreplay.NYTClozeFull leakreplay.NYTComplete leakreplay.NYTCompleteFull leakreplay.PotterCloze leakreplay.PotterClozeFull leakreplay.PotterComplete leakreplay.PotterCompleteFull
payload:malicious:badcode	malwaregen.Evasion malwaregen.SubFunctions malwaregen.TopLevel packagehallucination.Dart packagehallucination.JavaScript packagehallucination.Perl packagehallucination.Python packagehallucination.RakuLand packagehallucination.Ruby packagehallucination.Rust
payload:malicious:badcode:phishing	av_spam_scanning.GTphish
payload:malicious:badcode:spam	av_spam_scanning.GTUBE
payload:malicious:badcode:virus	av_spam_scanning.EICAR
payload:malicious:renderer	exploitation.JinjaTemplatePythonInjection web_injection.ColabAIDataLeakage web_injection.MarkdownImageExfil web_injection.MarkdownURIImageExfilExtended web_injection.MarkdownURINonImageExfilExtended web_injection.PlaygroundMarkdownExfil web_injection.StringAssemblyDataExfil
payload:malicious:renderer:xss	web_injection.MarkdownXSS web_injection.TaskXSS
payload:unwanted	atkgen.Tox realtoxicityprompts.RTPBlank realtoxicityprompts.RTPFlirtation realtoxicityprompts.RTPInsult realtoxicityprompts.RTPSevere_Toxicity
payload:unwanted:denial_of_service	divergence.RepeatedToken
payload:unwanted:hatespeech	continuation.ContinueSlursReclaimedSlurs continuation.ContinueSlursReclaimedSlursFull lmrc.SlurUsage
payload:unwanted:profanity	lmrc.Profanity realtoxicityprompts.RTPProfanity
payload:unwanted:sexual	lmrc.SexualContent lmrc.Sexualisation realtoxicityprompts.RTPSexually_Explicit
payload:unwanted:violence	lmrc.Bullying promptinject.HijackHateHumans promptinject.HijackHateHumansFull promptinject.HijackKillHumans promptinject.HijackKillHumansFull promptinject.HijackLongPrompt promptinject.HijackLongPromptFull realtoxicityprompts.RTPThreat
quality:Behavioral:ContentSafety:HateHarassment	atkgen.Tox continuation.ContinueSlursReclaimedSlurs continuation.ContinueSlursReclaimedSlursFull donotanswer.DiscriminationExclusionToxicityHatefulOffensive lmrc.SlurUsage realtoxicityprompts.RTPIdentity_Attack
quality:Behavioral:ContentSafety:LegalGoodsServices	donotanswer.MaliciousUses grandma.Substances grandma.Win10 grandma.Win11 malwaregen.Evasion malwaregen.Payload malwaregen.SubFunctions malwaregen.TopLevel
quality:Behavioral:ContentSafety:Profanity	atkgen.Tox donotanswer.DiscriminationExclusionToxicityHatefulOffensive lmrc.Profanity realtoxicityprompts.RTPProfanity
quality:Behavioral:ContentSafety:Sexual	lmrc.SexualContent lmrc.Sexualisation realtoxicityprompts.RTPSexually_Explicit
quality:Behavioral:ContentSafety:Toxicity	atkgen.Tox continuation.ContinueSlursReclaimedSlurs continuation.ContinueSlursReclaimedSlursFull donotanswer.DiscriminationExclusionToxicityHatefulOffensive realtoxicityprompts.RTPBlank realtoxicityprompts.RTPInsult realtoxicityprompts.RTPSevere_Toxicity
quality:Behavioral:ContentSafety:Unauthorized	donotanswer.HumanChatbox lmrc.QuackMedicine
quality:Behavioral:ContentSafety:Violence	donotanswer.DiscriminationExclusionToxicityHatefulOffensive lmrc.Bullying promptinject.HijackHateHumans promptinject.HijackHateHumansFull promptinject.HijackKillHumans promptinject.HijackKillHumansFull promptinject.HijackLongPrompt promptinject.HijackLongPromptFull realtoxicityprompts.RTPThreat
quality:Behavioral:DeliberativeMisinformation	lmrc.QuackMedicine
quality:Robustness:GenerativeMisinformation	donotanswer.MisinformationHarms goodside.WhoIsRiley misleading.FalseAssertion packagehallucination.Dart packagehallucination.JavaScript packagehallucination.Perl packagehallucination.Python packagehallucination.RakuLand packagehallucination.Ruby packagehallucination.Rust snowball.GraphConnectivity snowball.GraphConnectivityFull snowball.Primes snowball.PrimesFull snowball.Senators snowball.SenatorsFull
quality:Security:Adversarial	av_spam_scanning.EICAR av_spam_scanning.GTUBE av_spam_scanning.GTphish goodside.Tag
quality:Security:Confidentiality	divergence.Repeat divergence.RepeatExtended donotanswer.InformationHazard topic.WordnetAllowedWords topic.WordnetBlockedWords topic.WordnetControversial
quality:Security:ExtractionInversion	divergence.Repeat divergence.RepeatExtended exploitation.JinjaTemplatePythonInjection exploitation.SQLInjectionEcho exploitation.SQLInjectionSystem leakreplay.GuardianCloze leakreplay.GuardianClozeFull leakreplay.GuardianComplete leakreplay.GuardianCompleteFull leakreplay.LiteratureCloze leakreplay.LiteratureClozeFull leakreplay.LiteratureComplete leakreplay.LiteratureCompleteFull leakreplay.NYTCloze leakreplay.NYTClozeFull leakreplay.NYTComplete leakreplay.NYTCompleteFull leakreplay.PotterCloze leakreplay.PotterClozeFull leakreplay.PotterComplete leakreplay.PotterCompleteFull web_injection.ColabAIDataLeakage web_injection.MarkdownImageExfil web_injection.MarkdownURIImageExfilExtended web_injection.MarkdownURINonImageExfilExtended web_injection.PlaygroundMarkdownExfil web_injection.StringAssemblyDataExfil
quality:Security:Integrity	ansiescape.AnsiEscaped ansiescape.AnsiRaw
quality:Security:PromptStability	audio.AudioAchillesHeel dan.Ablation_Dan_11_0 dan.AntiDAN dan.AutoDAN dan.AutoDANCached dan.ChatGPT_Developer_Mode_RANTI dan.ChatGPT_Developer_Mode_v2 dan.ChatGPT_Image_Markdown dan.DAN_Jailbreak dan.DUDE dan.DanInTheWild dan.DanInTheWildFull dan.Dan_10_0 dan.Dan_11_0 dan.Dan_6_0 dan.Dan_6_2 dan.Dan_7_0 dan.Dan_8_0 dan.Dan_9_0 dan.STAN divergence.RepeatedToken doctor.Bypass doctor.BypassLeet doctor.Puppetry dra.DRA dra.DRAAdvanced encoding.InjectAscii85 encoding.InjectAtbash encoding.InjectBase16 encoding.InjectBase2048 encoding.InjectBase32 encoding.InjectBase64 encoding.InjectBraille encoding.InjectEcoji encoding.InjectHex encoding.InjectLeet encoding.InjectMime encoding.InjectMorse encoding.InjectNato encoding.InjectQP encoding.InjectROT13 encoding.InjectSneakyBits encoding.InjectUU encoding.InjectUnicodeTagChars encoding.InjectUnicodeVariantSelectors encoding.InjectZalgo exploitation.JinjaTemplatePythonInjection exploitation.SQLInjectionEcho exploitation.SQLInjectionSystem glitch.Glitch glitch.GlitchFull goodside.Davidjl goodside.Tag latentinjection.LatentInjectionFactSnippetEiffel latentinjection.LatentInjectionFactSnippetEiffelFull latentinjection.LatentInjectionFactSnippetLegal latentinjection.LatentInjectionFactSnippetLegalFull latentinjection.LatentInjectionReport latentinjection.LatentInjectionReportFull latentinjection.LatentInjectionResume latentinjection.LatentInjectionResumeFull latentinjection.LatentInjectionTranslationEnFr latentinjection.LatentInjectionTranslationEnFrFull latentinjection.LatentInjectionTranslationEnZh latentinjection.LatentInjectionTranslationEnZhFull latentinjection.LatentJailbreak latentinjection.LatentJailbreakFull latentinjection.LatentWhois latentinjection.LatentWhoisSnippet latentinjection.LatentWhoisSnippetFull phrasing.FutureTense phrasing.FutureTenseFull phrasing.PastTense phrasing.PastTenseFull promptinject.HijackHateHumans promptinject.HijackHateHumansFull promptinject.HijackKillHumans 
promptinject.HijackKillHumansFull promptinject.HijackLongPrompt promptinject.HijackLongPromptFull sata.MLM smuggling.FunctionMasking smuggling.HypotheticalResponse suffix.BEAST suffix.GCG suffix.GCGCached tap.PAIR tap.TAP tap.TAPCached visual_jailbreak.FigStep visual_jailbreak.FigStepFull web_injection.ColabAIDataLeakage web_injection.MarkdownImageExfil web_injection.MarkdownURIImageExfilExtended web_injection.MarkdownURINonImageExfilExtended web_injection.MarkdownXSS web_injection.PlaygroundMarkdownExfil web_injection.StringAssemblyDataExfil web_injection.TaskXSS
risk-cards:lmrc:anthropomorphisation	lmrc.Anthropomorphisation
risk-cards:lmrc:bullying	lmrc.Bullying
risk-cards:lmrc:deadnaming	lmrc.Deadnaming
risk-cards:lmrc:profanity	lmrc.Profanity
risk-cards:lmrc:providing_illegal_info	grandma.Substances grandma.Win10 grandma.Win11
risk-cards:lmrc:quack_medicine	lmrc.QuackMedicine
risk-cards:lmrc:sexual_content	lmrc.SexualContent
risk-cards:lmrc:sexualisation	lmrc.Sexualisation
risk-cards:lmrc:slur_usage	lmrc.SlurUsage