From 069540c18dd1e645d89733d0dab70bec1b300299 Mon Sep 17 00:00:00 2001 From: UnstableLlama Date: Sat, 30 May 2026 21:01:20 -0700 Subject: [PATCH 1/3] feat: add config.nohumor --- config.nohumor.toml | 78 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 config.nohumor.toml diff --git a/config.nohumor.toml b/config.nohumor.toml new file mode 100644 index 00000000..b72f1ab6 --- /dev/null +++ b/config.nohumor.toml @@ -0,0 +1,78 @@ +# Rename this file to config.toml, place it in the working directory +# that you run Heretic from, and edit the configuration to your liking. + +# This config targets the "humor axis": it treats humorous output as the +# behavior to ablate away. After processing, the model should respond to +# the same prompts in a more serious, deadpan register. +# +# good_prompts/good_evaluation_prompts (non-humorous baseline) use the +# default mlabonne/harmless_alpaca splits. bad_prompts/bad_evaluation_prompts +# use UnstableLlama/jokes, a small dataset of joke-eliciting prompts, +# split 4:1 for training vs. evaluation. + +max_response_length = 300 + +residual_plot_title = "PaCMAP Projection of Residuals for Serious/Humorous Prompts" + +refusal_markers = [ + "😅", + "here's one", + "why did", + "walked into a bar", + "gets worse", + "the punchline", + "classic", + "dad joke", + "bad joke", + "pun intended", + "ba dum tss", + "rimshot", + "😂", + "😄", + "😆", + "haha", + "hehe", + "lol", + "funny", + "joke", + "humor", + "that's hilarious", + "you could say", + "one-liner", + "comedian", + "stand-up", + "unexpectedly", + "because apparently", + "to be fair", + "on the bright side", + "lmao", + "omg", + "rofl", + "silly", + "humorous", + "clever", +] + +[good_prompts] +dataset = "mlabonne/harmless_alpaca" +split = "train[:400]" +column = "text" +residual_plot_label = "Serious prompts" +residual_plot_color = "royalblue" + +[bad_prompts] +dataset = "UnstableLlama/jokes" +split = "train[:200]" +column = "text" +residual_plot_label = "Humorous prompts" +residual_plot_color = "darkorange" + +[good_evaluation_prompts] +dataset = "mlabonne/harmless_alpaca" +split = "test[:100]" +column = "text" + +[bad_evaluation_prompts] +dataset = "UnstableLlama/jokes" +split = "train[200:250]" +column = "text" From 4af1eb50eadd0e22f1c1e170c09a9c0f5bb1c6d8 Mon Sep 17 00:00:00 2001 From: UnstableLlama <149548995+UnstableLlama@users.noreply.github.com> Date: Sat, 30 May 2026 21:25:55 -0700 Subject: [PATCH 2/3] Update config.nohumor.toml Following style guide Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- config.nohumor.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config.nohumor.toml b/config.nohumor.toml index b72f1ab6..87e5533d 100644 --- a/config.nohumor.toml +++ b/config.nohumor.toml @@ -5,8 +5,8 @@ # behavior to ablate away. After processing, the model should respond to # the same prompts in a more serious, deadpan register. # -# good_prompts/good_evaluation_prompts (non-humorous baseline) use the -# default mlabonne/harmless_alpaca splits. bad_prompts/bad_evaluation_prompts +# The good_prompts/good_evaluation_prompts (non-humorous baseline) use the +# default mlabonne/harmless_alpaca splits. The bad_prompts/bad_evaluation_prompts # use UnstableLlama/jokes, a small dataset of joke-eliciting prompts, # split 4:1 for training vs. evaluation. From 9079117dcac44c0384e9746df6828cc6820d223b Mon Sep 17 00:00:00 2001 From: UnstableLlama <149548995+UnstableLlama@users.noreply.github.com> Date: Sat, 30 May 2026 23:53:10 -0700 Subject: [PATCH 3/3] Update config.nohumor.toml Reduced initial comments --- config.nohumor.toml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/config.nohumor.toml b/config.nohumor.toml index 87e5533d..e3f51b36 100644 --- a/config.nohumor.toml +++ b/config.nohumor.toml @@ -1,15 +1,6 @@ # Rename this file to config.toml, place it in the working directory # that you run Heretic from, and edit the configuration to your liking. -# This config targets the "humor axis": it treats humorous output as the -# behavior to ablate away. After processing, the model should respond to -# the same prompts in a more serious, deadpan register. -# -# The good_prompts/good_evaluation_prompts (non-humorous baseline) use the -# default mlabonne/harmless_alpaca splits. The bad_prompts/bad_evaluation_prompts -# use UnstableLlama/jokes, a small dataset of joke-eliciting prompts, -# split 4:1 for training vs. evaluation. - max_response_length = 300 residual_plot_title = "PaCMAP Projection of Residuals for Serious/Humorous Prompts"