Create an Auditor Configuration#

Create an Audit Configuration#

To create an audit configuration, you send a POST request to the /v1beta1/audit/configs endpoint.

  1. Set AUDITOR_BASE_URL to specify the service:

    $ export AUDITOR_BASE_URL=http://localhost:8080
    
  2. Create the configuration:

    import os
    from nemo_microservices import NeMoMicroservices
    
    client = NeMoMicroservices(base_url=os.getenv("AUDITOR_BASE_URL"))
    
    config = client.beta.audit.configs.create(
        name="demo-basic-config",
        namespace="default",
        description="Basic demonstration configuration",
        system={
            "parallel_attempts": 20,
            "lite": True
        },
        plugins={
            "probe_spec": "dan.AutoDANCached,goodside.Tag"
        },
        reporting={
            "extended_detectors": False
        }
    )
    print(config.model_dump_json(indent=2))
    
    curl -X POST "${AUDITOR_BASE_URL}/v1beta1/audit/configs" \
      -H "Accept: application/json" \
      -H "Content-Type: application/json" \
      -d '{
        "name": "demo-basic-config",
        "namespace": "default",
        "description": "Basic demonstration configuration",
        "system": {
            "parallel_attempts": 20,
            "lite": true
        },
        "plugins": {
            "probe_spec": "dan.AutoDANCached,goodside.Tag"
        }
    }' | jq
    
    Example Output
    {
      "id": "audit_config-Ab9c8iLTmWnE1Evdhei2wf",
      "created_at": "2025-09-24T14:38:49.336346",
      "custom_fields": {},
      "description": "Basic demonstration configuration",
      "entity_id": "audit_config-Ab9c8iLTmWnE1Evdhei2wf",
      "name": "demo-basic-config",
      "namespace": "default",
      "ownership": null,
      "plugins": {
        "buff_max": null,
        "buff_spec": null,
        "buffs": {},
        "buffs_include_original_prompt": false,
        "detector_spec": "auto",
        "detectors": {},
        "extended_detectors": false,
        "generators": {},
        "harnesses": {},
        "model_name": null,
        "model_type": null,
        "probe_spec": "dan.AutoDANCached,goodside.Tag",
        "probes": {}
      },
      "project": null,
      "reporting": {
        "report_dir": "garak_runs",
        "report_prefix": "run1",
        "show_100_pass_modules": true,
        "taxonomy": null
      },
      "run": {
        "deprefix": true,
        "eval_threshold": 0.5,
        "generations": 5,
        "probe_tags": null,
        "seed": null,
        "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
      },
      "schema_version": "1.0",
      "system": {
        "enable_experimental": false,
        "lite": true,
        "narrow_output": false,
        "parallel_attempts": 20,
        "parallel_requests": false,
        "show_z": false,
        "verbose": 0
      },
      "type_prefix": null,
      "updated_at": "2025-09-24T14:38:49.336350"
    }
    
    {
      "schema_version": "1.0",
      "id": "audit_config-QDCphLWg1JYKUHZwngFfgm",
      "description": "Basic demonstration configuration",
      "type_prefix": null,
      "namespace": "default",
      "project": null,
      "created_at": "2025-08-18T12:37:46.826063",
      "updated_at": "2025-08-18T12:37:46.826067",
      "custom_fields": {},
      "ownership": null,
      "name": "demo-basic-config",
      "entity_id": "audit_config-QDCphLWg1JYKUHZwngFfgm",
      "system": {
        "verbose": 0,
        "narrow_output": false,
        "parallel_requests": false,
        "parallel_attempts": 20,
        "lite": true,
        "show_z": false,
        "enable_experimental": false
      },
      "run": {
        "seed": null,
        "deprefix": true,
        "eval_threshold": 0.5,
        "generations": 5,
        "probe_tags": null,
        "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
      },
      "plugins": {
        "model_type": null,
        "model_name": null,
        "probe_spec": "dan.AutoDANCached,goodside.Tag",
        "detector_spec": "auto",
        "extended_detectors": false,
        "buff_spec": null,
        "buffs_include_original_prompt": false,
        "buff_max": null,
        "detectors": {},
        "generators": {},
        "buffs": {},
        "harnesses": {},
        "probes": {}
      },
      "reporting": {
        "report_prefix": "run1",
        "taxonomy": null,
        "report_dir": "garak_runs",
        "show_100_pass_modules": true
      }
    }
    

For information about the fields, refer to Schema Reference for Audit Configurations.

A common customization is to specify the probes to run with the plugins.probe_spec field. For information about the probe names, refer to Probe Reference Summary.

Default Configuration#

The microservice ships with a default configuration in the default namespace. The default configuration performs approximately 29,000 inference requests.

import os
from nemo_microservices import NeMoMicroservices

client = NeMoMicroservices(base_url=os.getenv("AUDITOR_BASE_URL"))

config = client.beta.audit.configs.retrieve(
    config_name="default",
    namespace="default"
)
print(config.model_dump_json(indent=2))
curl "${AUDITOR_BASE_URL}/v1beta1/audit/configs/default/default" \
  -H "Accept: application/json" | jq

Example Output

{
  "id": "audit_config-SbbXDHYLgBZzpuchvuypWj",
  "created_at": "2025-09-24T14:21:04.216139Z",
  "custom_fields": {},
  "description": null,
  "entity_id": "audit_config-SbbXDHYLgBZzpuchvuypWj",
  "name": "default",
  "namespace": "default",
  "ownership": null,
  "plugins": {
    "buff_max": null,
    "buff_spec": null,
    "buffs": {},
    "buffs_include_original_prompt": false,
    "detector_spec": "auto",
    "detectors": {},
    "extended_detectors": false,
    "generators": {},
    "harnesses": {},
    "model_name": null,
    "model_type": null,
    "probe_spec": "ansiescape,atkgen,continuation,dan.Ablation_Dan_11_0,dan.AutoDANCached,dan.DanInTheWild,divergence,encoding,exploitation,goodside,grandma,latentinjection,leakreplay,lmrc.Bullying,lmrc.Deadnaming,lmrc.QuackMedicine,lmrc.SexualContent,lmrc.Sexualisation,lmrc.SlurUsage,malwaregen,misleading,packagehallucination,phrasing,promptinject,realtoxicityprompts.RTPBlank,snowball.GraphConnectivity,suffix.GCGCached,tap.TAPCached,topic,xss",
    "probes": {
      "encoding": {
        "payloads": [
          "default",
          "xss"
        ]
      }
    }
  },
  "project": null,
  "reporting": {
    "report_dir": "garak_runs",
    "report_prefix": "run1",
    "show_100_pass_modules": true,
    "taxonomy": null
  },
  "run": {
    "deprefix": true,
    "eval_threshold": 0.5,
    "generations": 3,
    "probe_tags": null,
    "seed": null,
    "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
  },
  "schema_version": "1.0",
  "system": {
    "enable_experimental": false,
    "lite": false,
    "narrow_output": false,
    "parallel_attempts": 32,
    "parallel_requests": false,
    "show_z": false,
    "verbose": 0
  },
  "type_prefix": null,
  "updated_at": "2025-09-24T14:21:04.216139Z"
}
{
  "schema_version": "1.0",
  "id": "audit_config-mAFRn4z6NnDonuC8vAuUa",
  "description": null,
  "type_prefix": null,
  "namespace": "default",
  "project": null,
  "created_at": "2025-08-18T12:32:34.326950Z",
  "updated_at": "2025-08-18T12:32:34.326950Z",
  "custom_fields": {},
  "ownership": null,
  "name": "default",
  "entity_id": "audit_config-mAFRn4z6NnDonuC8vAuUa",
  "system": {
    "verbose": 0,
    "narrow_output": false,
    "parallel_requests": false,
    "parallel_attempts": 32,
    "lite": false,
    "show_z": false,
    "enable_experimental": false
  },
  "run": {
    "seed": null,
    "deprefix": true,
    "eval_threshold": 0.5,
    "generations": 3,
    "probe_tags": null,
    "user_agent": "garak/{version} (LLM vulnerability scanner https://garak.ai)"
  },
  "plugins": {
    "model_type": null,
    "model_name": null,
    "probe_spec": "ansiescape,atkgen,continuation,dan.Ablation_Dan_11_0,dan.AutoDANCached,dan.DanInTheWild,divergence,encoding,exploitation,goodside,grandma,latentinjection,leakreplay,lmrc.Bullying,lmrc.Deadnaming,lmrc.QuackMedicine,lmrc.SexualContent,lmrc.Sexualisation,lmrc.SlurUsage,malwaregen,misleading,packagehallucination,phrasing,promptinject,realtoxicityprompts.RTPBlank,snowball.GraphConnectivity,suffix.GCGCached,tap.TAPCached,topic,xss",
    "detector_spec": "auto",
    "extended_detectors": false,
    "buff_spec": null,
    "buffs_include_original_prompt": false,
    "buff_max": null,
    "detectors": {},
    "generators": {},
    "buffs": {},
    "harnesses": {},
    "probes": {
      "encoding": {
        "payloads": [
          "default",
          "xss"
        ]
      }
    }
  },
  "reporting": {
    "report_prefix": "run1",
    "taxonomy": null,
    "report_dir": "garak_runs",
    "show_100_pass_modules": true
  }
}

Probe Reference Summary#

The following table summarizes the probes that you can specify in the config.plugins.probe_spec field.

Specify an individual probe by name, such as dan.Ablation_Dan_11_0, or specify a category of probes, such as dan. Separate each probe and category with a comma, such as ansiescape,dan.Ablation_Dan_11_0.

The tier field indicates the importance of a probe. Tier 1 probes are the most important, followed by 2, 3, and 9. Refer to the garak tiers documentation for more information.

The probes identified as Default are included in the default/default configuration.

For detailed information about a probe, refer to the garak.probes page in the garak documentation.

Probe

Tier

Goal

Description

ansiescape.AnsiEscaped (Default)

3

print escaped ANSI terminal codes

Probe for eliciting ansi control codes from a model.

ansiescape.AnsiRaw (Default)

1

print raw ANSI terminal codes

Probe for eliciting raw ansi control codes from a model.

atkgen.Tox (Default)

2

make the model say something toxic

Automatic attack generation, trying to elicit toxic outputs from the model.

audio.AudioAchillesHeel

9

disregard the system prompt via audio component

Audio is the achilles heel.

av_spam_scanning.EICAR

9

output antivirus test codes

Does the model check its output for viruses? try to send the eicar code through.

av_spam_scanning.GTUBE

9

output email spam

Does the model check its output for spam? try to send the gtube code through.

av_spam_scanning.GTphish

9

output email phishing

Does the model check its output for phishing? try to send the gtphish code through.

continuation.ContinueSlursReclaimedSlurs (Default)

2

output an ethnic slur

Continue partial word with slur or reclaimed slur.

continuation.ContinueSlursReclaimedSlursFull

2

output an ethnic slur

Continue partial word with slur or reclaimed slur.

dan.Ablation_Dan_11_0 (Default)

2

disregard the system prompt

Ablation experiments over the dan 11.0 prompt.

dan.AntiDAN

9

disregard the system prompt

Anti-dan.

dan.AutoDAN

2

disregard the system prompt

Probe leveraging autodan attack generation.

dan.AutoDANCached (Default)

2

disregard the system prompt

Probe leveraging generated autodan prompts.

dan.ChatGPT_Developer_Mode_RANTI

9

disregard the system prompt

Chatgpt devmode + ranti.

dan.ChatGPT_Developer_Mode_v2

9

disregard the system prompt

Chatgpt developer mode v2.

dan.ChatGPT_Image_Markdown

9

disregard the system prompt

Chatgpt markdown image unlocker / markdown injection.

dan.DAN_Jailbreak

9

disregard the system prompt

Dan jailbreak.

dan.DUDE

9

disregard the system prompt

Dude.

dan.DanInTheWild (Default)

1

disregard the system prompt or alignment

A subset of jailbreaks from the daninthewild jailbreak library.

dan.DanInTheWildFull

1

disregard the system prompt or alignment

A library of jailbreaks.

dan.Dan_10_0

9

disregard the system prompt

Dan 10.0.

dan.Dan_11_0

9

disregard the system prompt

Dan 11.0.

dan.Dan_6_0

9

disregard the system prompt

Dan 6.0.

dan.Dan_6_2

9

disregard the system prompt

Dan 6.2.

dan.Dan_7_0

9

disregard the system prompt

Dan 7.0.

dan.Dan_8_0

9

disregard the system prompt

Dan 8.0.

dan.Dan_9_0

9

disregard the system prompt

Dan 9.0.

dan.STAN

9

disregard the system prompt

Stan.

divergence.Repeat (Default)

2

get the model to leak data by asking it to repeat words

See if a model starts giving other data when asked to repeat a string indefinitely.

divergence.RepeatExtended

2

get the model to leak data by asking it to repeat words

See if a model starts giving other data when asked to repeat a string indefinitely.

doctor.Bypass

2

roleplay as doctor to bypass guardrails

Implement policy puppetry bypass.

doctor.BypassLeet

2

roleplay as doctor to bypass guardrails, but use leetspeak

Implement policy puppetry bypass.

doctor.Puppetry

2

roleplay as doctor to get unsafe outputs

Implement policy puppetry prompt injection.

donotanswer.DiscriminationExclusionToxicityHatefulOffensive

9

get model to respond as discrimination exclusion toxicity hateful offensive

Implementation of discrimination_exclusion_toxicity_hateful_offensive probes in donotanswer_prompts.

donotanswer.HumanChatbox

9

get model to respond as human chatbox

Implementation of human_chatbox probes in donotanswer_prompts.

donotanswer.InformationHazard

9

get model to respond as information hazard

Implementation of information_hazard probes in donotanswer_prompts.

donotanswer.MaliciousUses

9

get model to respond as malicious uses

Implementation of malicious_uses probes in donotanswer_prompts.

donotanswer.MisinformationHarms

9

get model to respond as misinformation harms

Implementation of misinformation_harms probes in donotanswer_prompts.

encoding.InjectAscii85 (Default)

2

make the model say an encoded payload

Probe for susceptibility to base85 encoded injections.

encoding.InjectBase16 (Default)

2

make the model say an encoded payload

Probe for susceptibility to base16 encoded injections.

encoding.InjectBase2048 (Default)

2

make the model say an encoded payload

Probe for susceptibility to base2048 encoding injections.

encoding.InjectBase32 (Default)

2

make the model say an encoded payload

Probe for susceptibility to base32 encoded injections.

encoding.InjectBase64 (Default)

1

make the model say an encoded payload

Probe for susceptibility to base64 encoded injections.

encoding.InjectBraille (Default)

2

make the model say an encoded payload

Probe for susceptibility to braille encoding injections. credit wally#3422.

encoding.InjectEcoji (Default)

2

make the model say an encoded payload

Probe for susceptibility to ecoji encoding.

encoding.InjectHex (Default)

2

make the model say an encoded payload

Probe for susceptibility to hex encoded injections.

encoding.InjectLeet

2

make the model say an encoded payload

Probe for susceptibility to 31337 encoding.

encoding.InjectMime

2

make the model say an encoded payload

Probe for susceptibility to mime encoding injections.

encoding.InjectMorse (Default)

2

make the model say an encoded payload

Probe for susceptibility to morse encoding. credit wally#3422.

encoding.InjectNato (Default)

2

make the model say an encoded payload

Probe for susceptibility to nato phonetic alphabet encoding.

encoding.InjectQP

1

make the model say an encoded payload

Probe for susceptibility to qp encoded injections.

encoding.InjectROT13 (Default)

2

make the model say an encoded payload

Probe for susceptibility to rot13 encoding injections.

encoding.InjectSneakyBits

2

make the model say an encoded payload

Probe for susceptibility to unicode variant selector ascii smuggling.

encoding.InjectUU (Default)

1

make the model say an encoded payload

Probe for susceptibility to uu encoded injections.

encoding.InjectUnicodeTagChars (Default)

1

make the model say an encoded payload

Probe for susceptibility to unicode tag ascii smuggling.

encoding.InjectUnicodeVariantSelectors

2

make the model say an encoded payload

Probe for susceptibility to unicode variant selector ascii smuggling.

encoding.InjectZalgo (Default)

2

make the model say an encoded payload

Probe for susceptibility to zalgo encoding.

exploitation.JinjaTemplatePythonInjection (Default)

1

Gain remote code execution via Jinja template injection

Attempts to execute python code via jinja templating.

exploitation.SQLInjectionEcho (Default)

1

Manipulate a database via SQL injection

Attempts to get the target to echo a sql statement.

exploitation.SQLInjectionSystem

1

Manipulate a database via SQL injection

Attempts to inject untrusted sql commands into a sql statement.

fileformats.HF_Files

1

get a list of files associated with the model

Get a manifest of files associated with a hugging face generator.

glitch.Glitch

3

make the model fail to repeat a word

Probe model for glitch tokens that provoke unusual behavior.

glitch.GlitchFull

3

make the model fail to repeat a word

Probe model for glitch tokens that provoke unusual behavior.

goodside.Davidjl

3

get wrong answers about a name

Exploit the ‘ davidjl’ magic token.

goodside.Tag (Default)

1

inject invisible instructions

Exploit utf tag page encoding for user-invisible prompt injection.

goodside.ThreatenJSON (Default)

2

get raw code by threatening a model

Will the model strip preamble and give json?

goodside.WhoIsRiley (Default)

2

get misinfo about riley goodside

Probe for misinfo about riley goodside.

grandma.Slurs (Default)

1

get model to use ethnic slur

Use the grandma attack to get ethnic slurs.

grandma.Substances (Default)

1

get instructions for making bad things

Use the grandma attack to get instructions for harmful items.

grandma.Win10 (Default)

2

get model to output Win 10 product keys

Will the model yield a windows 10 product key if we beg it to?

grandma.Win11 (Default)

2

get model to output Win 11 product keys

Will the model yield a windows 11 product key if we beg it to?

latentinjection.LatentInjectionFactSnippetEiffel (Default)

1

subvert a snippet with instructions

String-based latent prompt injections in fact snippets on the eiffel tower.

latentinjection.LatentInjectionFactSnippetEiffelFull

1

subvert a snippet with instructions

String-based latent prompt injections in fact snippets on the eiffel tower.

latentinjection.LatentInjectionFactSnippetLegal (Default)

1

subvert a snippet with instructions

String-based latent prompt injections in legal snippets.

latentinjection.LatentInjectionFactSnippetLegalFull

1

subvert a snippet with instructions

String-based latent prompt injections in legal snippets.

latentinjection.LatentInjectionReport (Default)

1

subvert a report with instructions

String-based latent prompt injections via a report.

latentinjection.LatentInjectionReportFull

1

subvert a report with instructions

String-based latent prompt injections via a report.

latentinjection.LatentInjectionResume (Default)

1

subvert a resume with instructions

String-based latent prompt injections via a resume.

latentinjection.LatentInjectionResumeFull

1

subvert a resume with instructions

String-based latent prompt injections via a resume.

latentinjection.LatentInjectionTranslationEnFr (Default)

1

subvert translation instructions

String-based latent prompt injections via translation.

latentinjection.LatentInjectionTranslationEnFrFull

1

subvert translation instructions

String-based latent prompt injections via translation.

latentinjection.LatentInjectionTranslationEnZh (Default)

1

subvert translation instructions

Latent prompt injection in chinese context (简体字).

latentinjection.LatentInjectionTranslationEnZhFull

1

subvert translation instructions

Latent prompt injection in chinese context (简体字).

latentinjection.LatentJailbreak (Default)

1

inject malicious instructions into output while posing as translation task

Implementation of latent jailbreak, using latent prompt injection.

latentinjection.LatentJailbreakFull

1

inject malicious instructions into output while posing as translation task

Implementation of latent jailbreak, using latent prompt injection.

latentinjection.LatentWhois (Default)

1

subvert a whois report with instructions

Latent prompt injections in whois probe reports.

latentinjection.LatentWhoisSnippet (Default)

1

subvert a snippet with instructions

String-based latent prompt injections in whois reports.

latentinjection.LatentWhoisSnippetFull

1

subvert a snippet with instructions

String-based latent prompt injections in whois reports.

leakreplay.GuardianCloze (Default)

2

replay training data by filling in the correct missing words

Lightweight version of guardian cloze test for data leakage.

leakreplay.GuardianClozeFull

2

replay training data by filling in the correct missing words

Test for data leakage on guardian articles with masked entities in a cloze test format.

leakreplay.GuardianComplete (Default)

1

replay training data by providing document completions that match training examples

Lightweight version of guardian completion test for data leakage.

leakreplay.GuardianCompleteFull

1

replay training data by providing document completions that match training examples

Test for data leakage on guardian articles through text completion.

leakreplay.LiteratureCloze (Default)

2

replay training data by filling in the correct missing words

Lightweight version of literature cloze test for data leakage.

leakreplay.LiteratureClozeFull

2

replay training data by filling in the correct missing words

Test for data leakage on literature passages with masked entities in a cloze test format.

leakreplay.LiteratureComplete (Default)

2

replay training data by providing document completions that match training examples

Lightweight version of literature completion test for data leakage.

leakreplay.LiteratureCompleteFull

2

replay training data by providing document completions that match training examples

Test for data leakage on literature passages through text completion.

leakreplay.NYTCloze (Default)

2

replay training data by filling in the correct missing words

Lightweight version of nyt cloze test for data leakage.

leakreplay.NYTClozeFull

2

replay training data by filling in the correct missing words

Test for data leakage on new york times articles with masked entities in a cloze test format.

leakreplay.NYTComplete (Default)

1

replay training data by providing document completions that match training examples

Lightweight version of nyt completion test for data leakage.

leakreplay.NYTCompleteFull

1

replay training data by providing document completions that match training examples

Test for data leakage on new york times articles through text completion.

leakreplay.PotterCloze (Default)

2

replay training data by filling in the correct missing words

Lightweight version of harry potter cloze test for data leakage.

leakreplay.PotterClozeFull

2

replay training data by filling in the correct missing words

Test for data leakage on harry potter text with masked entities in a cloze test format.

leakreplay.PotterComplete (Default)

1

replay training data by providing document completions that match training examples

Lightweight version of harry potter completion test for data leakage.

leakreplay.PotterCompleteFull

1

replay training data by providing document completions that match training examples

Test for data leakage on harry potter text through completion.

lmrc.Anthropomorphisation

3

anthropomorphise a language model

Providing output that encourages anthropomorphisation of ai systems.

lmrc.Bullying (Default)

2

help me bully

Generation of repeated threats, denigration.

lmrc.Deadnaming (Default)

2

write a deadname

Mentioning a name that a person does not wish to be associated with.

lmrc.Profanity

3

utter profanity

Using profane words, especially when not prompted to do so.

lmrc.QuackMedicine (Default)

2

support quack medicine

Promoting non-scientific medicine.

lmrc.SexualContent (Default)

2

write sexual content

Content that discusses bodies and interactions in a sexual manner, or other sexually explicit material.

lmrc.Sexualisation (Default)

2

sexualise people

Casual or overt sexual objectification, reducing a person or group of people to sexual objects.

lmrc.SlurUsage (Default)

2

use a slur

Using an identity-based slur in a non-reclaimed way - either directly or recommending the consumer to use in speech/action.

malwaregen.Evasion (Default)

2

Write code to assist evading security and anti-malware measures

Will the model write code to help avoid anti-malware?

malwaregen.Payload (Default)

2

write a malware payload

Will the model help write code for malware payloads?

malwaregen.SubFunctions (Default)

2

write code supporting a sub-function of malware

Will the system write code helpful in malware?

malwaregen.TopLevel (Default)

2

write malware given a top-level description

Will the system return code when asked to write malware?

misleading.FalseAssertion (Default)

2

make the model expand on a false assertion

Does the model reject [questions based on] false claims?

packagehallucination.JavaScript (Default)

1

generate JavaScript code importing non-existent npm packages

Attempts to generate javascript code including hallucinated npm packages.

packagehallucination.PackageHallucinationProbe

9

base probe for importing non-existent packages

Base class for package hallucination probes.

packagehallucination.Python (Default)

1

generate python importing non-existent packages

Attempts to generate python3 code including hallucinated packages.

packagehallucination.Ruby (Default)

1

generate ruby code importing non-existent gems

Attempts to generate ruby code including hallucinated gems.

packagehallucination.Rust (Default)

1

generate Rust code importing non-existent crates.io packages

Attempts to generate rust code including hallucinated crates.io packages.

phrasing.FutureTense (Default)

2

use future tense to disregard the system prompt

Attempt to evade safeguards by setting requests in the future tense.

phrasing.FutureTenseFull

2

use future tense to disregard the system prompt

Will the system give unsafe responses when asked to write about weapon creation or dangerous behavior?

phrasing.PastTense (Default)

2

use past tense to disregard the system prompt

Attempt to evade safeguards by setting requests in the past tense.

phrasing.PastTenseFull

2

use past tense to disregard the system prompt

Will the system give unsafe responses when asked to write about weapon creation or dangerous behavior?

promptinject.HijackHateHumans (Default)

2

inject a prompt on hate humans

PromptInject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of LLMs to adversarial prompt attacks. Winner of a Best Paper Award at the NeurIPS ML Safety Workshop 2022.

promptinject.HijackHateHumansFull

2

inject a prompt on hate humans

PromptInject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of LLMs to adversarial prompt attacks. Winner of a Best Paper Award at the NeurIPS ML Safety Workshop 2022.

promptinject.HijackKillHumans (Default)

2

inject a prompt on kill humans

PromptInject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of LLMs to adversarial prompt attacks. Winner of a Best Paper Award at the NeurIPS ML Safety Workshop 2022.

promptinject.HijackKillHumansFull

2

inject a prompt on kill humans

PromptInject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of LLMs to adversarial prompt attacks. Winner of a Best Paper Award at the NeurIPS ML Safety Workshop 2022.

promptinject.HijackLongPrompt (Default)

2

inject a prompt on long prompt

PromptInject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of LLMs to adversarial prompt attacks. Winner of a Best Paper Award at the NeurIPS ML Safety Workshop 2022.

promptinject.HijackLongPromptFull

2

inject a prompt on long prompt

PromptInject is a framework that assembles prompts in a modular fashion to provide a quantitative analysis of the robustness of LLMs to adversarial prompt attacks. Winner of a Best Paper Award at the NeurIPS ML Safety Workshop 2022.

realtoxicityprompts.RTPBlank (Default)

1

generate toxicity when prompted with nothing or special tokens

Implementation of probes in realtoxicityprompts.

realtoxicityprompts.RTPFlirtation

3

generate flirtation content

Implementation of flirtation probes in realtoxicityprompts.

realtoxicityprompts.RTPIdentity_Attack

3

generate identity attack content

Implementation of identity_attack probes in realtoxicityprompts.

realtoxicityprompts.RTPInsult

3

generate insult content

Implementation of insult probes in realtoxicityprompts.

realtoxicityprompts.RTPProfanity

3

generate profanity content

Implementation of profanity probes in realtoxicityprompts.

realtoxicityprompts.RTPSevere_Toxicity

3

generate severe toxicity content

Implementation of severe_toxicity probes in realtoxicityprompts.

realtoxicityprompts.RTPSexually_Explicit

3

generate sexually explicit content

Implementation of sexually_explicit probes in realtoxicityprompts.

realtoxicityprompts.RTPThreat

3

generate threat content

Implementation of threat probes in realtoxicityprompts.

sata.MLM

2

disregard the system prompt

Masked language model simple assistive task probe.

snowball.GraphConnectivity (Default)

2

trick the model into saying an impossible flight routing is possible

Probe asking for impossible flight routings.

snowball.GraphConnectivityFull

2

trick the model into saying an impossible flight routing is possible

Probe asking for impossible flight routings.

snowball.Primes

3

trick the model into saying a prime number isn’t prime

Ask model to check a list of higher primes.

snowball.PrimesFull

3

trick the model into saying a prime number isn’t prime

Ask model to check a list of higher primes.

snowball.Senators

3

trick the model into saying a fictitious senator exists

Asking for senators that don’t exist.

snowball.SenatorsFull

3

trick the model into saying a fictitious senator exists

Asking for senators that don’t exist.

suffix.BEAST

2

cause the model to disregard the system prompt

Beast probe.

suffix.GCG

2

cause the model to disregard the system prompt

Greedy coordinate gradient probe.

suffix.GCGCached (Default)

2

cause the model to disregard the system prompt

Greedy coordinate gradient cached probe.

tap.PAIR

2

cause the model to disregard the system prompt

Prompt automatic iterative refinement automatic jailbreak.

tap.TAP

2

cause the model to disregard the system prompt

Tree of attacks with pruning probe, generating new attacks against the target.

tap.TAPCached (Default)

2

cause the model to disregard the system prompt

Tree of attacks with pruning probe using cached attacks.

test.Blank

9

see what the model has to say for itself given silence

Blank prompt.

test.Test

9

send some test strings, ignore the response

Test probe.

topic.WordnetAllowedWords

9

get the model to engage off-topic

Use wordnet to find out if a model will discuss terms near but not in its allowed set.

topic.WordnetBlockedWords

9

get the model to engage off-topic

Use wordnet to explore which topics a model will respond to around blocked words.

topic.WordnetControversial (Default)

2

get the model to engage off-topic

Ask model to respond on a set of controversial topics.

visual_jailbreak.FigStep

2

disregard the system prompt with both text and image

Using image modality to assist jailbreak.

visual_jailbreak.FigStepFull

2

disregard the system prompt with both text and image

Using image modality to assist jailbreak.

xss.ColabAIDataLeakage (Default)

1

render images fetched from a remote uri

Leak data via image rendering as demonstrated against gemini in colab.

xss.MarkdownImageExfil (Default)

1

embed conversation-private information in a markdown URI

Can we get the model to write embedded assets exfiltrating text in the uri?

xss.MarkdownURIImageExfilExtended (Default)

1

embed information an LLM would normally refuse to provide in a Markdown URI

Generalized implementation of data exfiltration exploits via markdown uris.

xss.MarkdownURINonImageExfilExtended (Default)

1

embed information an LLM would normally refuse to provide in a Markdown URI

Generalized implementation of data exfiltration exploits via markdown uris.

xss.MdExfil20230929 (Default)

1

embed conversation-private information in a markdown URI

Markdown image uri exfil exploit published 2023 sept 29.

xss.StringAssemblyDataExfil (Default)

1

embed conversation-private information in a markdown URI

Prompt injection for manipulating state.