From 069540c18dd1e645d89733d0dab70bec1b300299 Mon Sep 17 00:00:00 2001
From: UnstableLlama <randomnotrealemail@gmail.com>
Date: Sat, 30 May 2026 21:01:20 -0700
Subject: [PATCH 1/3] feat: add config.nohumor.toml

---
 config.nohumor.toml | 78 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 config.nohumor.toml

diff --git a/config.nohumor.toml b/config.nohumor.toml
new file mode 100644
index 00000000..b72f1ab6
--- /dev/null
+++ b/config.nohumor.toml
@@ -0,0 +1,78 @@
+# Rename this file to config.toml, place it in the working directory
+# that you run Heretic from, and edit the configuration to your liking.
+
+# This config targets the "humor axis": it treats humorous output as the
+# behavior to ablate away. After processing, the model should respond to
+# the same prompts in a more serious, deadpan register.
+#
+# good_prompts/good_evaluation_prompts (non-humorous baseline) use the
+# default mlabonne/harmless_alpaca splits. bad_prompts/bad_evaluation_prompts
+# use UnstableLlama/jokes, a small dataset of joke-eliciting prompts,
+# split 4:1 for training vs. evaluation.
+
+max_response_length = 300
+
+residual_plot_title = "PaCMAP Projection of Residuals for Serious/Humorous Prompts"
+
+refusal_markers = [  # humor cues, not refusals: this config treats humorous output as the behavior to ablate
+    "😅",
+    "here's one",
+    "why did",
+    "walked into a bar",
+    "gets worse",
+    "the punchline",
+    "classic",  # NOTE(review): generic word; may also occur in serious output — confirm how markers are matched
+    "dad joke",
+    "bad joke",
+    "pun intended",
+    "ba dum tss",
+    "rimshot",
+    "😂",
+    "😄",
+    "😆",
+    "haha",
+    "hehe",
+    "lol",
+    "funny",
+    "joke",
+    "humor",
+    "that's hilarious",
+    "you could say",
+    "one-liner",
+    "comedian",
+    "stand-up",
+    "unexpectedly",
+    "because apparently",
+    "to be fair",  # NOTE(review): common in non-humorous prose too — verify it does not inflate the count
+    "on the bright side",
+    "lmao",
+    "omg",
+    "rofl",
+    "silly",
+    "humorous",
+    "clever",
+]
+
+[good_prompts]  # non-humorous baseline
+dataset = "mlabonne/harmless_alpaca"
+split = "train[:400]"
+column = "text"
+residual_plot_label = "Serious prompts"
+residual_plot_color = "royalblue"
+
+[bad_prompts]  # prompts eliciting the humorous behavior to ablate
+dataset = "UnstableLlama/jokes"  # small dataset of joke-eliciting prompts
+split = "train[:200]"  # evaluation uses train[200:250], so the two slices stay disjoint
+column = "text"
+residual_plot_label = "Humorous prompts"
+residual_plot_color = "darkorange"
+
+[good_evaluation_prompts]  # non-humorous baseline, evaluation side
+dataset = "mlabonne/harmless_alpaca"
+split = "test[:100]"  # separate split from the train[:400] rows in [good_prompts]
+column = "text"
+
+[bad_evaluation_prompts]  # joke-eliciting prompts, evaluation side
+dataset = "UnstableLlama/jokes"
+split = "train[200:250]"  # held out from the train[:200] rows in [bad_prompts] (4:1 train/eval)
+column = "text"

From 4af1eb50eadd0e22f1c1e170c09a9c0f5bb1c6d8 Mon Sep 17 00:00:00 2001
From: UnstableLlama <149548995+UnstableLlama@users.noreply.github.com>
Date: Sat, 30 May 2026 21:25:55 -0700
Subject: [PATCH 2/3] Update config.nohumor.toml

Add the article "The" to two header-comment sentences to follow the style guide.

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 config.nohumor.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config.nohumor.toml b/config.nohumor.toml
index b72f1ab6..87e5533d 100644
--- a/config.nohumor.toml
+++ b/config.nohumor.toml
@@ -5,8 +5,8 @@
 # behavior to ablate away. After processing, the model should respond to
 # the same prompts in a more serious, deadpan register.
 #
-# good_prompts/good_evaluation_prompts (non-humorous baseline) use the
-# default mlabonne/harmless_alpaca splits. bad_prompts/bad_evaluation_prompts
+# The good_prompts/good_evaluation_prompts (non-humorous baseline) use the
+# default mlabonne/harmless_alpaca splits. The bad_prompts/bad_evaluation_prompts
 # use UnstableLlama/jokes, a small dataset of joke-eliciting prompts,
 # split 4:1 for training vs. evaluation.
 

From 9079117dcac44c0384e9746df6828cc6820d223b Mon Sep 17 00:00:00 2001
From: UnstableLlama <149548995+UnstableLlama@users.noreply.github.com>
Date: Sat, 30 May 2026 23:53:10 -0700
Subject: [PATCH 3/3] Update config.nohumor.toml

Remove the explanatory header comment (humor axis and dataset notes), keeping only the rename instructions.
---
 config.nohumor.toml | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/config.nohumor.toml b/config.nohumor.toml
index 87e5533d..e3f51b36 100644
--- a/config.nohumor.toml
+++ b/config.nohumor.toml
@@ -1,15 +1,6 @@
 # Rename this file to config.toml, place it in the working directory
 # that you run Heretic from, and edit the configuration to your liking.
 
-# This config targets the "humor axis": it treats humorous output as the
-# behavior to ablate away. After processing, the model should respond to
-# the same prompts in a more serious, deadpan register.
-#
-# The good_prompts/good_evaluation_prompts (non-humorous baseline) use the
-# default mlabonne/harmless_alpaca splits. The bad_prompts/bad_evaluation_prompts
-# use UnstableLlama/jokes, a small dataset of joke-eliciting prompts,
-# split 4:1 for training vs. evaluation.
-
 max_response_length = 300
 
 residual_plot_title = "PaCMAP Projection of Residuals for Serious/Humorous Prompts"