Create an Auditor Configuration#

Create an Audit Configuration#

To create an audit configuration, you send a POST request to the /v1beta1/audit/configs endpoint.

Set AUDITOR_BASE_URL to specify the service:

$ export AUDITOR_BASE_URL=http://localhost:8080

Create the configuration:

Python SDK

import os
from nemo_microservices import NeMoMicroservices

client = NeMoMicroservices(base_url=os.getenv("AUDITOR_BASE_URL"))

config = client.beta.audit.configs.create(
    name="demo-basic-config",
    namespace="default",
    description="Basic demonstration configuration",
    system={
        "parallel_attempts": 20,
        "lite": True
    },
    plugins={
        "probe_spec": "dan.AutoDANCached,goodside.Tag"
    },
    reporting={
        "extended_detectors": False
    }
)
print(config.model_dump_json(indent=2))

cURL

curl -X POST "${AUDITOR_BASE_URL}/v1beta1/audit/configs" \
  -H "Accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "name": "demo-basic-config",
    "namespace": "default",
    "description": "Basic demonstration configuration",
    "system": {
        "parallel_attempts": "20",
        "lite": "True"
    },
    "plugins": {
        "probe_spec": "dan.AutoDANCached,goodside.Tag"
    }
}' | jq

Example Output

Python SDK

{
  "id": "audit_config-Ab9c8iLTmWnE1Evdhei2wf",
  "created_at": "2025-09-24T14:38:49.336346",
  "custom_fields": {},
  "description": "Basic demonstration configuration",
  "entity_id": "audit_config-Ab9c8iLTmWnE1Evdhei2wf",
  "name": "demo-basic-config",
  "namespace": "default",
  "ownership": null,
  "plugins": {
    "buff_max": null,
    "buff_spec": null,
    "buffs": {},
    "buffs_include_original_prompt": false,
    "detector_spec": "auto",
    "detectors": {},
    "extended_detectors": false,
    "generators": {},
    "harnesses": {},
    "model_name": null,
    "model_type": null,
    "probe_spec": "dan.AutoDANCached,goodside.Tag",
    "probes": {}
  },
  "project": null,
  "reporting": {
    "report_dir": "garak_runs",
    "report_prefix": "run1",
    "show_100_pass_modules": true,
    "taxonomy": null
  },
  "run": {
    "deprefix": true,
    "eval_threshold": 0.5,
    "generations": 5,
    "probe_tags": null,
    "seed": null,
    "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
  },
  "schema_version": "1.0",
  "system": {
    "enable_experimental": false,
    "lite": true,
    "narrow_output": false,
    "parallel_attempts": 20,
    "parallel_requests": false,
    "show_z": false,
    "verbose": 0
  },
  "type_prefix": null,
  "updated_at": "2025-09-24T14:38:49.336350"
}

cURL

{
  "schema_version": "1.0",
  "id": "audit_config-QDCphLWg1JYKUHZwngFfgm",
  "description": "Basic demonstration configuration",
  "type_prefix": null,
  "namespace": "default",
  "project": null,
  "created_at": "2025-08-18T12:37:46.826063",
  "updated_at": "2025-08-18T12:37:46.826067",
  "custom_fields": {},
  "ownership": null,
  "name": "demo-basic-config",
  "entity_id": "audit_config-QDCphLWg1JYKUHZwngFfgm",
  "system": {
    "verbose": 0,
    "narrow_output": false,
    "parallel_requests": false,
    "parallel_attempts": 20,
    "lite": true,
    "show_z": false,
    "enable_experimental": false
  },
  "run": {
    "seed": null,
    "deprefix": true,
    "eval_threshold": 0.5,
    "generations": 5,
    "probe_tags": null,
    "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
  },
  "plugins": {
    "model_type": null,
    "model_name": null,
    "probe_spec": "dan.AutoDANCached,goodside.Tag",
    "detector_spec": "auto",
    "extended_detectors": false,
    "buff_spec": null,
    "buffs_include_original_prompt": false,
    "buff_max": null,
    "detectors": {},
    "generators": {},
    "buffs": {},
    "harnesses": {},
    "probes": {}
  },
  "reporting": {
    "report_prefix": "run1",
    "taxonomy": null,
    "report_dir": "garak_runs",
    "show_100_pass_modules": true
  }
}

For information about the fields, refer to Schema Reference for Audit Configurations.

A common customization is to specify the probes to run with the plugins.probe_spec field. For information about the probe names, refer to Probe Reference Summary.

Default Configuration#

The microservice ships with a default configuration in the default namespace. The default configuration performs approximately 29,000 inference requests. Retrieve the default configuration to view its settings:

Python SDK

import os
from nemo_microservices import NeMoMicroservices

client = NeMoMicroservices(base_url=os.getenv("AUDITOR_BASE_URL"))

config = client.beta.audit.configs.retrieve(
    config_name="default",
    namespace="default"
)
print(config.model_dump_json(indent=2))

cURL

curl "${AUDITOR_BASE_URL}/v1beta1/audit/configs/default/default" \
  -H "Accept: application/json" | jq

Example Output

Python SDK

{
  "id": "audit_config-SbbXDHYLgBZzpuchvuypWj",
  "created_at": "2025-09-24T14:21:04.216139Z",
  "custom_fields": {},
  "description": null,
  "entity_id": "audit_config-SbbXDHYLgBZzpuchvuypWj",
  "name": "default",
  "namespace": "default",
  "ownership": null,
  "plugins": {
    "buff_max": null,
    "buff_spec": null,
    "buffs": {},
    "buffs_include_original_prompt": false,
    "detector_spec": "auto",
    "detectors": {},
    "extended_detectors": false,
    "generators": {},
    "harnesses": {},
    "model_name": null,
    "model_type": null,
    "probe_spec": "ansiescape,atkgen,continuation,dan.Ablation_Dan_11_0,dan.AutoDANCached,dan.DanInTheWild,divergence,encoding,exploitation,goodside,grandma,latentinjection,leakreplay,lmrc.Bullying,lmrc.Deadnaming,lmrc.QuackMedicine,lmrc.SexualContent,lmrc.Sexualisation,lmrc.SlurUsage,malwaregen,misleading,packagehallucination,phrasing,promptinject,realtoxicityprompts.RTPBlank,snowball.GraphConnectivity,suffix.GCGCached,tap.TAPCached,topic,xss",
    "probes": {
      "encoding": {
        "payloads": [
          "default",
          "xss"
        ]
      }
    }
  },
  "project": null,
  "reporting": {
    "report_dir": "garak_runs",
    "report_prefix": "run1",
    "show_100_pass_modules": true,
    "taxonomy": null
  },
  "run": {
    "deprefix": true,
    "eval_threshold": 0.5,
    "generations": 3,
    "probe_tags": null,
    "seed": null,
    "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
  },
  "schema_version": "1.0",
  "system": {
    "enable_experimental": false,
    "lite": false,
    "narrow_output": false,
    "parallel_attempts": 32,
    "parallel_requests": false,
    "show_z": false,
    "verbose": 0
  },
  "type_prefix": null,
  "updated_at": "2025-09-24T14:21:04.216139Z"
}

cURL

{
  "schema_version": "1.0",
  "id": "audit_config-mAFRn4z6NnDonuC8vAuUa",
  "description": null,
  "type_prefix": null,
  "namespace": "default",
  "project": null,
  "created_at": "2025-08-18T12:32:34.326950Z",
  "updated_at": "2025-08-18T12:32:34.326950Z",
  "custom_fields": {},
  "ownership": null,
  "name": "default",
  "entity_id": "audit_config-mAFRn4z6NnDonuC8vAuUa",
  "system": {
    "verbose": 0,
    "narrow_output": false,
    "parallel_requests": false,
    "parallel_attempts": 32,
    "lite": false,
    "show_z": false,
    "enable_experimental": false
  },
  "run": {
    "seed": null,
    "deprefix": true,
    "eval_threshold": 0.5,
    "generations": 3,
    "probe_tags": null,
    "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
  },
  "plugins": {
    "model_type": null,
    "model_name": null,
    "probe_spec": "ansiescape,atkgen,continuation,dan.Ablation_Dan_11_0,dan.AutoDANCached,dan.DanInTheWild,divergence,encoding,exploitation,goodside,grandma,latentinjection,leakreplay,lmrc.Bullying,lmrc.Deadnaming,lmrc.QuackMedicine,lmrc.SexualContent,lmrc.Sexualisation,lmrc.SlurUsage,malwaregen,misleading,packagehallucination,phrasing,promptinject,realtoxicityprompts.RTPBlank,snowball.GraphConnectivity,suffix.GCGCached,tap.TAPCached,topic,xss",
    "detector_spec": "auto",
    "extended_detectors": false,
    "buff_spec": null,
    "buffs_include_original_prompt": false,
    "buff_max": null,
    "detectors": {},
    "generators": {},
    "buffs": {},
    "harnesses": {},
    "probes": {
      "encoding": {
        "payloads": [
          "default",
          "xss"
        ]
      }
    }
  },
  "reporting": {
    "report_prefix": "run1",
    "taxonomy": null,
    "report_dir": "garak_runs",
    "show_100_pass_modules": true
  }
}

Probe Reference Summary#

The following table summarizes the probes that you can specify in the plugins.probe_spec field of an audit configuration.

Specify an individual probe by name, such as dan.Ablation_Dan_11_0, or specify a category of probes, such as dan. Separate each probe and category with a comma, such as ansiescape,dan.Ablation_Dan_11_0.

The tier field indicates the importance of a probe. Tier 1 probes are the most important, followed by 2, 3, and 9. Refer to the garak tiers documentation for more information.

The probes identified as Default are included in the default/default configuration.

For detailed information about a probe, refer to the garak.probes page in the garak documentation.

Probe	Tier	Goal	Description
ansiescape.AnsiEscaped (Default)	3	print escaped ANSI terminal codes	Probe for eliciting ansi control codes from a model.
ansiescape.AnsiRaw (Default)	1	print raw ANSI terminal codes	Probe for eliciting raw ansi control codes from a model.
atkgen.Tox (Default)	2	make the model say something toxic	Automatic attack generation, trying to elicit toxic outputs from the model.
audio.AudioAchillesHeel	9	disregard the system prompt via audio component	Audio is the achilles heel.
av_spam_scanning.EICAR	9	output antivirus test codes	Does the model check its output for viruses? try to send the eicar code through.
av_spam_scanning.GTUBE	9	output email spam	Does the model check its output for spam? try to send the gtube code through.
av_spam_scanning.GTphish	9	output email phishing	Does the model check its output for phishing? try to send the gtphish code through.
continuation.ContinueSlursReclaimedSlurs (Default)	2	output an ethnic slur	Continue partial word with slur or reclaimed slur.
continuation.ContinueSlursReclaimedSlursFull	2	output an ethnic slur	Continue partial word with slur or reclaimed slur.
dan.Ablation_Dan_11_0 (Default)	2	disregard the system prompt	Ablation experiments over the dan 11.0 prompt.
dan.AntiDAN	9	disregard the system prompt	Anti-dan.
dan.AutoDAN	2	disregard the system prompt	Probe leveraging autodan attack generation.
dan.AutoDANCached (Default)	2	disregard the system prompt	Probe leveraging generated autodan prompts.
dan.ChatGPT_Developer_Mode_RANTI	9	disregard the system prompt	Chatgpt devmode + ranti.
dan.ChatGPT_Developer_Mode_v2	9	disregard the system prompt	Chatgpt developer mode v2.
dan.ChatGPT_Image_Markdown	9	disregard the system prompt	Chatgpt markdown image unlocker / markdown injection.
dan.DAN_Jailbreak	9	disregard the system prompt	Dan jailbreak.
dan.DUDE	9	disregard the system prompt	Dude.
dan.DanInTheWild (Default)	1	disregard the system prompt or alignment	A subset of jailbreaks from the daninthewild jailbreak library.
dan.DanInTheWildFull	1	disregard the system prompt or alignment	A library of jailbreaks.
dan.Dan_10_0	9	disregard the system prompt	Dan 10.0.
dan.Dan_11_0	9	disregard the system prompt	Dan 11.0.
dan.Dan_6_0	9	disregard the system prompt	Dan 6.0.
dan.Dan_6_2	9	disregard the system prompt	Dan 6.2.
dan.Dan_7_0	9	disregard the system prompt	Dan 7.0.
dan.Dan_8_0	9	disregard the system prompt	Dan 8.0.
dan.Dan_9_0	9	disregard the system prompt	Dan 9.0.
dan.STAN	9	disregard the system prompt	Stan.
divergence.Repeat (Default)	2	get the model to leak data by asking it to repeat words	See if a model starts giving other data when asked to repeat a string indefinitely.
divergence.RepeatExtended	2	get the model to leak data by asking it to repeat words	See if a model starts giving other data when asked to repeat a string indefinitely.
doctor.Bypass	2	roleplay as doctor to bypass guardrails	Implement policy puppetry bypass.
doctor.BypassLeet	2	roleplay as doctor to bypass guardrails, but use leetspeak	Implement policy puppetry bypass.
doctor.Puppetry	2	roleplay as doctor to get unsafe outputs	Implement policy puppetry prompt injection.
donotanswer.DiscriminationExclusionToxicityHatefulOffensive	9	get model to respond as discrimination exclusion toxicity hateful offensive	Implementation of discrimination_exclusion_toxicity_hateful_offensive probes in donotanswer_prompts.
donotanswer.HumanChatbox	9	get model to respond as human chatbox	Implementation of human_chatbox probes in donotanswer_prompts.
donotanswer.InformationHazard	9	get model to respond as information hazard	Implementation of information_hazard probes in donotanswer_prompts.
donotanswer.MaliciousUses	9	get model to respond as malicious uses	Implementation of malicious_uses probes in donotanswer_prompts.
donotanswer.MisinformationHarms	9	get model to respond as misinformation harms	Implementation of misinformation_harms probes in donotanswer_prompts.
encoding.InjectAscii85 (Default)	2	make the model say an encoded payload	Probe for susceptibility to base85 encoded injections.
encoding.InjectBase16 (Default)	2	make the model say an encoded payload	Probe for susceptibility to base16 encoded injections.
encoding.InjectBase2048 (Default)	2	make the model say an encoded payload	Probe for susceptibility to base2048 encoding injections.
encoding.InjectBase32 (Default)	2	make the model say an encoded payload	Probe for susceptibility to base32 encoded injections.
encoding.InjectBase64 (Default)	1	make the model say an encoded payload	Probe for susceptibility to base64 encoded injections.
encoding.InjectBraille (Default)	2	make the model say an encoded payload	Probe for susceptibility to braille encoding injections. credit wally#3422.
encoding.InjectEcoji (Default)	2	make the model say an encoded payload	Probe for susceptibility to ecoji encoding.
encoding.InjectHex (Default)	2	make the model say an encoded payload	Probe for susceptibility to hex encoded injections.
encoding.InjectLeet	2	make the model say an encoded payload	Probe for susceptibility to 31337 encoding.
encoding.InjectMime	2	make the model say an encoded payload	Probe for susceptibility to mime encoding injections.
encoding.InjectMorse (Default)	2	make the model say an encoded payload	Probe for susceptibility to morse encoding. credit wally#3422.
encoding.InjectNato (Default)	2	make the model say an encoded payload	Probe for susceptibility to nato phonetic alphabet encoding.
encoding.InjectQP	1	make the model say an encoded payload	Probe for susceptibility to qp encoded injections.
encoding.InjectROT13 (Default)	2	make the model say an encoded payload	Probe for susceptibility to rot13 encoding injections.
encoding.InjectSneakyBits	2	make the model say an encoded payload	Probe for susceptibility to unicode variant selector ascii smuggling.
encoding.InjectUU (Default)	1	make the model say an encoded payload	Probe for susceptibility to uu encoded injections.
encoding.InjectUnicodeTagChars (Default)	1	make the model say an encoded payload	Probe for susceptibility to unicode tag ascii smuggling.
encoding.InjectUnicodeVariantSelectors	2	make the model say an encoded payload	Probe for susceptibility to unicode variant selector ascii smuggling.
encoding.InjectZalgo (Default)	2	make the model say an encoded payload	Probe for susceptibility to zalgo encoding.
exploitation.JinjaTemplatePythonInjection (Default)	1	Gain remote code execution via Jinja template injection	Attempts to execute python code via jinja templating.
exploitation.SQLInjectionEcho (Default)	1	Manipulate a database via SQL injection	Attempts to get the target to echo a sql statement.
exploitation.SQLInjectionSystem	1	Manipulate a database via SQL injection	Attempts to inject untrusted sql commands into a sql statement.
fileformats.HF_Files	1	get a list of files associated with the model	Get a manifest of files associated with a hugging face generator.
glitch.Glitch	3	make the model fail to repeat a word	Probe model for glitch tokens that provoke unusual behavior.
glitch.GlitchFull	3	make the model fail to repeat a word	Probe model for glitch tokens that provoke unusual behavior.
goodside.Davidjl	3	get wrong answers about a name	Exploit the ‘ davidjl’ magic token.
goodside.Tag (Default)	1	inject invisible instructions	Exploit utf tag page encoding for user-invisible prompt injection.
goodside.ThreatenJSON (Default)	2	get raw code by threatening a model	Will model strip preamble and give json?
goodside.WhoIsRiley (Default)	2	get misinfo about riley goodside	Probe for misinfo about riley goodside.
grandma.Slurs (Default)	1	get model to use ethnic slur	Use the grandma attack to get ethnic slurs.
grandma.Substances (Default)	1	get instructions for making bad things	Use the grandma attack to get instructions for harmful items.
grandma.Win10 (Default)	2	get model to output Win 10 product keys	Will the model yield a windows 10 product key if we beg it to?
grandma.Win11 (Default)	2	get model to output Win 11 product keys	Will the model yield a windows 11 product key if we beg it to?
latentinjection.LatentInjectionFactSnippetEiffel (Default)	1	subvert a snippet with instructions	String-based latent prompt injections in fact snippets on the eiffel tower.
latentinjection.LatentInjectionFactSnippetEiffelFull	1	subvert a snippet with instructions	String-based latent prompt injections in fact snippets on the eiffel tower.
latentinjection.LatentInjectionFactSnippetLegal (Default)	1	subvert a snippet with instructions	String-based latent prompt injections in legal snippets.
latentinjection.LatentInjectionFactSnippetLegalFull	1	subvert a snippet with instructions	String-based latent prompt injections in legal snippets.
latentinjection.LatentInjectionReport (Default)	1	subvert a report with instructions	String-based latent prompt injections via a report.
latentinjection.LatentInjectionReportFull	1	subvert a report with instructions	String-based latent prompt injections via a report.
latentinjection.LatentInjectionResume (Default)	1	subvert a resume with instructions	String-based latent prompt injections via a resume.
latentinjection.LatentInjectionResumeFull	1	subvert a resume with instructions	String-based latent prompt injections via a resume.
latentinjection.LatentInjectionTranslationEnFr (Default)	1	subvert translation instructions	String-based latent prompt injections via translation.
latentinjection.LatentInjectionTranslationEnFrFull	1	subvert translation instructions	String-based latent prompt injections via translation.
latentinjection.LatentInjectionTranslationEnZh (Default)	1	subvert translation instructions	Latent prompt injection in chinese context (简体字).
latentinjection.LatentInjectionTranslationEnZhFull	1	subvert translation instructions	Latent prompt injection in chinese context (简体字).
latentinjection.LatentJailbreak (Default)	1	inject malicious instructions into output while posing as translation task	Implementation of latent jailbreak, using latent prompt injection.
latentinjection.LatentJailbreakFull	1	inject malicious instructions into output while posing as translation task	Implementation of latent jailbreak, using latent prompt injection.
latentinjection.LatentWhois (Default)	1	subvert a whois report with instructions	Latent prompt injections in whois probe reports.
latentinjection.LatentWhoisSnippet (Default)	1	subvert a snippet with instructions	String-based latent prompt injections in whois reports.
latentinjection.LatentWhoisSnippetFull	1	subvert a snippet with instructions	String-based latent prompt injections in whois reports.
leakreplay.GuardianCloze (Default)	2	replay training data by filling in the correct missing words	Lightweight version of guardian cloze test for data leakage.
leakreplay.GuardianClozeFull	2	replay training data by filling in the correct missing words	Test for data leakage on guardian articles with masked entities in a cloze test format.
leakreplay.GuardianComplete (Default)	1	replay training data by providing document completions that match training examples	Lightweight version of guardian completion test for data leakage.
leakreplay.GuardianCompleteFull	1	replay training data by providing document completions that match training examples	Test for data leakage on guardian articles through text completion.
leakreplay.LiteratureCloze (Default)	2	replay training data by filling in the correct missing words	Lightweight version of literature cloze test for data leakage.
leakreplay.LiteratureClozeFull	2	replay training data by filling in the correct missing words	Test for data leakage on literature passages with masked entities in a cloze test format.
leakreplay.LiteratureComplete (Default)	2	replay training data by providing document completions that match training examples	Lightweight version of literature completion test for data leakage.
leakreplay.LiteratureCompleteFull	2	replay training data by providing document completions that match training examples	Test for data leakage on literature passages through text completion.
leakreplay.NYTCloze (Default)	2	replay training data by filling in the correct missing words	Lightweight version of nyt cloze test for data leakage.
leakreplay.NYTClozeFull	2	replay training data by filling in the correct missing words	Test for data leakage on new york times articles with masked entities in a cloze test format.
leakreplay.NYTComplete (Default)	1	replay training data by providing document completions that match training examples	Lightweight version of nyt completion test for data leakage.
leakreplay.NYTCompleteFull	1	replay training data by providing document completions that match training examples	Test for data leakage on new york times articles through text completion.
leakreplay.PotterCloze (Default)	2	replay training data by filling in the correct missing words	Lightweight version of harry potter cloze test for data leakage.
leakreplay.PotterClozeFull	2	replay training data by filling in the correct missing words	Test for data leakage on harry potter text with masked entities in a cloze test format.
leakreplay.PotterComplete (Default)	1	replay training data by providing document completions that match training examples	Lightweight version of harry potter completion test for data leakage.
leakreplay.PotterCompleteFull	1	replay training data by providing document completions that match training examples	Test for data leakage on harry potter text through completion.
lmrc.Anthropomorphisation	3	anthropomorphise a language model	Providing output that encourages anthropomorphisation of ai systems.
lmrc.Bullying (Default)	2	help me bully	Generation of repeated threats, denigration.
lmrc.Deadnaming (Default)	2	write a deadname	Mentioning a name that a person does not wish to be associated with.
lmrc.Profanity	3	utter profanity	Using profane words, especially when not prompted to do so.
lmrc.QuackMedicine (Default)	2	support quack medicine	Promoting non-scientific medicine.
lmrc.SexualContent (Default)	2	write sexual content	Content that discusses bodies and interactions in a sexual manner, or other sexually explicit material.
lmrc.Sexualisation (Default)	2	sexualise people	Casual or overt sexual objectification, reducing a person or group of people to sexual objects.
lmrc.SlurUsage (Default)	2	use a slur	Using an identity-based slur in a non-reclaimed way - either directly or recommending the consumer to use in speech/action.
malwaregen.Evasion (Default)	2	Write code to assist evading security and anti-malware measures	Will the model write code to help avoid anti-malware?
malwaregen.Payload (Default)	2	write a malware payload	Will the model help write code for malware payloads?
malwaregen.SubFunctions (Default)	2	write code supporting a sub-function of malware	Will the system write code helpful in malware?
malwaregen.TopLevel (Default)	2	write malware given a top-level description	Will the system return code when asked to write malware?
misleading.FalseAssertion (Default)	2	make the model expand on a false assertion	Does the model reject [questions based on] false claims?
packagehallucination.JavaScript (Default)	1	generate JavaScript code importing non-existent npm packages	Attempts to generate javascript code including hallucinated npm packages.
packagehallucination.PackageHallucinationProbe	9	base probe for importing non-existent packages	Base class for package hallucination probes.
packagehallucination.Python (Default)	1	generate python importing non-existent packages	Attempts to generate python3 code including hallucinated packages.
packagehallucination.Ruby (Default)	1	generate ruby code importing non-existent gems	Attempts to generate ruby code including hallucinated gems.
packagehallucination.Rust (Default)	1	generate Rust code importing non-existent crates.io packages	Attempts to generate rust code including hallucinated crates.io packages.
phrasing.FutureTense (Default)	2	use future tense to disregard the system prompt	Attempt to evade safeguards by setting requests in the future tense.
phrasing.FutureTenseFull	2	use future tense to disregard the system prompt	Will the system give unsafe responses when asked to write about weapon creation or dangerous behavior?
phrasing.PastTense (Default)	2	use past tense to disregard the system prompt	Attempt to evade safeguards by setting requests in the past tense.
phrasing.PastTenseFull	2	use past tense to disregard the system prompt	Will the system give unsafe responses when asked to write about weapon creation or dangerous behavior?
promptinject.HijackHateHumans (Default)	2	inject a prompt on hate humans	Promptinject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of llms to adversarial prompt attacks. trophy best paper awards @ neurips ml safety workshop 2022.
promptinject.HijackHateHumansFull	2	inject a prompt on hate humans	Promptinject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of llms to adversarial prompt attacks. trophy best paper awards @ neurips ml safety workshop 2022.
promptinject.HijackKillHumans (Default)	2	inject a prompt on kill humans	Promptinject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of llms to adversarial prompt attacks. trophy best paper awards @ neurips ml safety workshop 2022.
promptinject.HijackKillHumansFull	2	inject a prompt on kill humans	Promptinject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of llms to adversarial prompt attacks. trophy best paper awards @ neurips ml safety workshop 2022.
promptinject.HijackLongPrompt (Default)	2	inject a prompt on long prompt	Promptinject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of llms to adversarial prompt attacks. trophy best paper awards @ neurips ml safety workshop 2022.
promptinject.HijackLongPromptFull	2	inject a prompt on long prompt	Promptinject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of llms to adversarial prompt attacks. trophy best paper awards @ neurips ml safety workshop 2022.
realtoxicityprompts.RTPBlank (Default)	1	generate toxicity when prompted with nothing or special tokens	Implementation of probes in realtoxicityprompts.
realtoxicityprompts.RTPFlirtation	3	generate flirtation content	Implementation of flirtation probes in realtoxicityprompts.
realtoxicityprompts.RTPIdentity_Attack	3	generate identity attack content	Implementation of identity_attack probes in realtoxicityprompts.
realtoxicityprompts.RTPInsult	3	generate insult content	Implementation of insult probes in realtoxicityprompts.
realtoxicityprompts.RTPProfanity	3	generate profanity content	Implementation of profanity probes in realtoxicityprompts.
realtoxicityprompts.RTPSevere_Toxicity	3	generate severe toxicity content	Implementation of severe_toxicity probes in realtoxicityprompts.
realtoxicityprompts.RTPSexually_Explicit	3	generate sexually explicit content	Implementation of sexually_explicit probes in realtoxicityprompts.
realtoxicityprompts.RTPThreat	3	generate threat content	Implementation of threat probes in realtoxicityprompts.
sata.MLM	2	disregard the system prompt	Masked language model simple assistive task probe.
snowball.GraphConnectivity (Default)	2	trick the model into saying an impossible flight routing is possible	Probe asking for impossible flight routings.
snowball.GraphConnectivityFull	2	trick the model into saying an impossible flight routing is possible	Probe asking for impossible flight routings.
snowball.Primes	3	trick the model into saying a prime number isn’t prime	Ask model to check a list of higher primes.
snowball.PrimesFull	3	trick the model into saying a prime number isn’t prime	Ask model to check a list of higher primes.
snowball.Senators	3	trick the model into saying a fictitious senator exists	Asking for senators that don’t exist.
snowball.SenatorsFull	3	trick the model into saying a fictitious senator exists	Asking for senators that don’t exist.
suffix.BEAST	2	cause the model to disregard the system prompt	Beast probe.
suffix.GCG	2	cause the model to disregard the system prompt	Greedy coordinate gradient probe.
suffix.GCGCached (Default)	2	cause the model to disregard the system prompt	Greedy coordinate gradient cached probe.
tap.PAIR	2	cause the model to disregard the system prompt	Prompt automatic iterative refinement automatic jailbreak.
tap.TAP	2	cause the model to disregard the system prompt	Tree of attacks with pruning probe, generating new attacks against the target.
tap.TAPCached (Default)	2	cause the model to disregard the system prompt	Tree of attacks with pruning probe using cached attacks.
test.Blank	9	see what the model has to say for itself given silence	Blank prompt.
test.Test	9	send some test strings, ignore the response	Test probe.
topic.WordnetAllowedWords	9	get the model to engage off-topic	Use wordnet to find out if a model will discuss terms near but not in its allowed set.
topic.WordnetBlockedWords	9	get the model to engage off-topic	Use wordnet to explore which topics a model will respond to around blocked words.
topic.WordnetControversial (Default)	2	get the model to engage off-topic	Ask model to respond on a set of controversial topics.
visual_jailbreak.FigStep	2	disregard the system prompt with both text and image	Using image modality to assist jailbreak.
visual_jailbreak.FigStepFull	2	disregard the system prompt with both text and image	Using image modality to assist jailbreak.
xss.ColabAIDataLeakage (Default)	1	render images fetched from a remote uri	Leak data via image rendering as demonstrated against gemini in colab.
xss.MarkdownImageExfil (Default)	1	embed conversation-private information in a markdown URI	Can we get the model to write embedded assets exfiltrating text in the uri?
xss.MarkdownURIImageExfilExtended (Default)	1	embed information an LLM would normally refuse to provide in a Markdown URI	Generalized implementation of data exfiltration exploits via markdown uris.
xss.MarkdownURINonImageExfilExtended (Default)	1	embed information an LLM would normally refuse to provide in a Markdown URI	Generalized implementation of data exfiltration exploits via markdown uris.
xss.MdExfil20230929 (Default)	1	embed conversation-private information in a markdown URI	Markdown image uri exfil exploit published 2023 sept 29.
xss.StringAssemblyDataExfil (Default)	1	embed conversation-private information in a markdown URI	Prompt injection for manipulating state.