Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
022f70a
Scaffolding
Jan 26, 2026
e85cdb9
Precommit
Jan 26, 2026
fc260c3
fixtures and basic tests
Jan 27, 2026
89a8079
basic tests
Jan 27, 2026
b18f224
basic tests
Jan 27, 2026
96ddf6c
last test
Jan 28, 2026
eb4e936
jailbreak format test
Jan 28, 2026
243ea0a
sample jailbreak prompt
Jan 28, 2026
946fdde
Merge branch 'main' into jailbreak
ValbuenaVC Jan 28, 2026
132caf5
real jailbreaks added
Jan 28, 2026
c4e625f
Merge branch 'main' into jailbreak
ValbuenaVC Jan 28, 2026
79d1a64
Merge branch 'main' into jailbreak
ValbuenaVC Jan 29, 2026
cb28fda
changing dataset name
Jan 29, 2026
f399b6d
moved jailbreak discovery
Jan 29, 2026
75436ea
changed path resolution
Jan 29, 2026
c0022f6
minor changes
Jan 29, 2026
9f579f2
minor bug
Jan 29, 2026
ccf7025
Merge branch 'main' into jailbreak
ValbuenaVC Jan 29, 2026
349cc6b
old dataset name
Jan 30, 2026
9fa6430
precommit
Jan 30, 2026
513cbf3
random jailbreak selection
Jan 30, 2026
b57b35a
error handling
Jan 30, 2026
999a0c6
error handling docstring
Jan 30, 2026
f3ec8bb
Merge branch 'Azure:main' into jailbreak2
ValbuenaVC Jan 30, 2026
89fd8bd
scaffolding
Jan 30, 2026
66650a6
scaffolding for subset
Jan 30, 2026
fa5b01a
scaffolding
Jan 30, 2026
44bc05c
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 5, 2026
db5270c
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 5, 2026
9d9666f
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 7, 2026
302101f
subset
Feb 9, 2026
9c7b757
tweaking
Feb 10, 2026
737aabe
new strategy template
Feb 10, 2026
472bd20
types'
Feb 10, 2026
b07e197
adversarial
Feb 10, 2026
c31d088
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions pyrit/datasets/jailbreak/text_jailbreak.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,12 @@ def __init__(
self.template.value = self.template.render_template_value_silent(**kwargs)

@classmethod
def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]:
def get_all_jailbreak_templates(cls, k: Optional[int] = None) -> List[str]:
"""
Retrieve all jailbreaks from the JAILBREAK_TEMPLATES_PATH.

Args:
n (int, optional): Number of jailbreak templates to return. None to get all.
k (int, optional): Number of jailbreak templates to return. None to get all.

Returns:
List[str]: List of jailbreak template file names.
Expand All @@ -122,12 +122,12 @@ def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]:
if not jailbreak_template_names:
raise ValueError("No jailbreak templates found in the jailbreak directory")

if n:
if n > len(jailbreak_template_names):
if k:
if k > len(jailbreak_template_names):
raise ValueError(
f"Attempted to pull {n} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!"
f"Attempted to pull {k} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!"
)
jailbreak_template_names = random.choices(jailbreak_template_names, k=n)
jailbreak_template_names = random.choices(jailbreak_template_names, k=k)
return jailbreak_template_names

def get_jailbreak_system_prompt(self) -> str:
Expand Down
145 changes: 121 additions & 24 deletions pyrit/scenario/scenarios/airt/jailbreak.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@
from pyrit.common import apply_defaults
from pyrit.datasets import TextJailBreak
from pyrit.executor.attack.core.attack_config import (
AttackAdversarialConfig,
AttackConverterConfig,
AttackScoringConfig,
)
from pyrit.executor.attack.multi_turn.crescendo import CrescendoAttack
from pyrit.executor.attack.multi_turn.red_teaming import RedTeamingAttack
from pyrit.executor.attack.single_turn.many_shot_jailbreak import ManyShotJailbreakAttack
from pyrit.executor.attack.single_turn.prompt_sending import PromptSendingAttack
from pyrit.models import SeedAttackGroup
from pyrit.prompt_converter import TextJailbreakConverter
Expand All @@ -19,9 +23,7 @@
from pyrit.scenario.core.atomic_attack import AtomicAttack
from pyrit.scenario.core.dataset_configuration import DatasetConfiguration
from pyrit.scenario.core.scenario import Scenario
from pyrit.scenario.core.scenario_strategy import (
ScenarioStrategy,
)
from pyrit.scenario.core.scenario_strategy import ScenarioCompositeStrategy, ScenarioStrategy
from pyrit.score import (
SelfAskRefusalScorer,
TrueFalseInverterScorer,
Expand All @@ -31,13 +33,30 @@

class JailbreakStrategy(ScenarioStrategy):
"""
Strategy for single-turn jailbreak attacks.

There is currently only one, running all jailbreaks.
Strategy for jailbreak attacks.
"""

# Aggregate members (special markers that expand to strategies with matching tags)
ALL = ("all", {"all"})
PYRIT = ("pyrit", {"pyrit"})
SINGLE_TURN = ("single_turn", {"single_turn"})
MULTI_TURN = ("multi_turn", {"multi_turn"})

# Strategies for tweaking jailbreak efficacy through attack patterns
ManyShot = ("many_shot", {"single_turn"})
PromptSending = ("prompt_sending", {"single_turn"})
Crescendo = ("crescendo", {"multi_turn"})
RedTeaming = ("red_teaming", {"multi_turn"})

@classmethod
def get_aggregate_tags(cls) -> set[str]:
"""
Get the set of tags that represent aggregate categories.

Returns:
set[str]: Set of tags that are aggregate markers.
"""
# Include base class aggregates ("all") and add scenario-specific ones
return super().get_aggregate_tags() | {"single_turn", "multi_turn"}


class Jailbreak(Scenario):
Expand Down Expand Up @@ -93,7 +112,9 @@ def __init__(
objective_scorer: Optional[TrueFalseScorer] = None,
include_baseline: bool = False,
scenario_result_id: Optional[str] = None,
n_jailbreaks: Optional[int] = 3,
k: Optional[int] = None,
n: int = 1,
jailbreaks: Optional[List[str]] = None,
) -> None:
"""
Initialize the jailbreak scenario.
Expand All @@ -104,13 +125,30 @@ def __init__(
include_baseline (bool): Whether to include a baseline atomic attack that sends all
objectives without modifications. Defaults to True.
scenario_result_id (Optional[str]): Optional ID of an existing scenario result to resume.
n_jailbreaks (Optional[int]): Choose n random jailbreaks rather than using all of them.
k (Optional[int]): Choose k random jailbreaks rather than using all of them.
n (int): Number of times to try each jailbreak. Defaults to 1.
jailbreaks (Optional[List[str]]): Dedicated list of jailbreak template names to run.

Raises:
ValueError: If both jailbreaks and k are provided, as random selection
is incompatible with a predetermined list.

"""
if jailbreaks and k:
raise ValueError("Please provide only one of `k` (random selection) or `jailbreaks` (specific selection).")

if not objective_scorer:
objective_scorer = self._get_default_objective_scorer()
self._scorer_config = AttackScoringConfig(objective_scorer=objective_scorer)

self._n = n_jailbreaks
self._k = k
self._n = n

if jailbreaks:
self._validate_jailbreaks_subset(jailbreaks)
self._jailbreaks = jailbreaks
else:
self._jailbreaks = TextJailBreak.get_all_jailbreak_templates()

super().__init__(
name="Jailbreak",
Expand All @@ -124,6 +162,21 @@ def __init__(
# Will be resolved in _get_atomic_attacks_async
self._seed_groups: Optional[List[SeedAttackGroup]] = None

def _validate_jailbreaks_subset(self, jailbreaks: List[str]) -> None:
    """
    Validate that the provided jailbreak template names exist before continuing initialization.

    Args:
        jailbreaks (List[str]): List of jailbreak template file names to check.

    Raises:
        ValueError: If any requested name is not among the discovered templates.
    """
    # Compare against the canonical template listing so typos fail fast,
    # before any attack objects are constructed.
    all_templates = TextJailBreak.get_all_jailbreak_templates()
    diff = set(jailbreaks) - set(all_templates)
    # Truthiness check is the idiomatic form of `len(diff) > 0`.
    if diff:
        raise ValueError(f"Error: could not find templates `{diff}`!")

def _get_default_objective_scorer(self) -> TrueFalseScorer:
"""
Retrieve the default objective scorer.
Expand All @@ -146,6 +199,20 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer:
)
return refusal_scorer

def _get_default_adversarial_target(self) -> OpenAIChatTarget:
    """
    Build and return the default adversarial chat target.

    Returns:
        OpenAIChatTarget: Adversarial target backed by an unfiltered endpoint,
        configured entirely from environment variables.
    """
    # NOTE(review): these env vars may be unset (returning None); presumably
    # OpenAIChatTarget validates missing endpoint/key — confirm.
    env = os.environ.get
    endpoint = env("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT")
    api_key = env("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY")
    model_name = env("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL")
    # Elevated temperature encourages more varied adversarial generations.
    return OpenAIChatTarget(
        endpoint=endpoint,
        api_key=api_key,
        model_name=model_name,
        temperature=1.2,
    )

def _resolve_seed_groups(self) -> List[SeedAttackGroup]:
"""
Resolve seed groups from dataset configuration.
Expand All @@ -168,20 +235,26 @@ def _get_all_jailbreak_templates(self) -> List[str]:
Returns:
List[str]: List of jailbreak template file names.
"""
if not self._n:
if not self._k:
return TextJailBreak.get_all_jailbreak_templates()
else:
return TextJailBreak.get_all_jailbreak_templates(n=self._n)
return TextJailBreak.get_all_jailbreak_templates(k=self._k)

async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_name: str) -> AtomicAttack:
async def _get_atomic_attack_from_strategy_async(
self, *, strategy: str, jailbreak_template_name: str
) -> AtomicAttack:
"""
Create an atomic attack for a specific jailbreak template.

Args:
strategy (str): JailbreakStrategy to use.
jailbreak_template_name (str): Name of the jailbreak template file.

Returns:
AtomicAttack: An atomic attack using the specified jailbreak template.

Raises:
ValueError: If an invalid strategy is provided.
"""
# objective_target is guaranteed to be non-None by parent class validation
assert self._objective_target is not None
Expand All @@ -196,12 +269,29 @@ async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_na
request_converters=PromptConverterConfiguration.from_converters(converters=[jailbreak_converter])
)

# Create the attack
attack = PromptSendingAttack(
objective_target=self._objective_target,
attack_scoring_config=self._scorer_config,
attack_converter_config=converter_config,
)
attack = None
args = {
"objective_target": self._objective_target,
"attack_scoring_config": self._scorer_config,
"attack_converter_config": converter_config,
}
adversarial_config = AttackAdversarialConfig(target=self._get_default_adversarial_target())
match strategy:
case "many_shot":
attack = ManyShotJailbreakAttack(**args)
case "prompt_sending":
attack = PromptSendingAttack(**args)
case "crescendo":
args["attack_adversarial_config"] = adversarial_config
attack = CrescendoAttack(**args)
case "red_teaming":
args["attack_adversarial_config"] = adversarial_config
attack = RedTeamingAttack(**args)
case _:
raise ValueError(f"Unknown JailbreakStrategy `{strategy}`.")

if not attack:
raise ValueError(f"Attack cannot be None!")

# Extract template name without extension for the atomic attack name
template_name = Path(jailbreak_template_name).stem
Expand All @@ -218,17 +308,24 @@ async def _get_atomic_attacks_async(self) -> List[AtomicAttack]:

Returns:
List[AtomicAttack]: List of atomic attacks to execute, one per jailbreak template.

Raises:
ValueError: If self._jailbreaks is not a subset of all jailbreak templates.
"""
atomic_attacks: List[AtomicAttack] = []

# Retrieve seed prompts based on selected strategies
self._seed_groups = self._resolve_seed_groups()

# Get all jailbreak template names
jailbreak_template_names = self._get_all_jailbreak_templates()
strategies = ScenarioCompositeStrategy.extract_single_strategy_values(
composites=self._scenario_composites, strategy_type=JailbreakStrategy
)

for template_name in jailbreak_template_names:
atomic_attack = await self._get_atomic_attack_from_jailbreak_async(jailbreak_template_name=template_name)
atomic_attacks.append(atomic_attack)
for strategy in strategies:
for template_name in self._jailbreaks:
atomic_attack = await self._get_atomic_attack_from_strategy_async(
strategy=strategy, jailbreak_template_name=template_name
)
atomic_attacks.extend([atomic_attack] * self._n)

return atomic_attacks
Loading
Loading