codifide-programming-language/moderation_gate.cod at main · codifide/codifide-programming-language · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
module moderation_gate

def classify_content
  intent "classify a message as safe, unsafe, or uncertain based on keyword signals"
  sig (message: String) -> Label
  effects {}
  cand
    intent "unsafe — contains harmful keyword"
    when or(
           or(
             contains(lower(message), "spam"),
             contains(lower(message), "hate")
           ),
           contains(lower(message), "violence")
         )
    belief("unsafe", 0.90)
  cand
    intent "safe — contains approval keyword"
    when or(
           contains(lower(message), "approved"),
           contains(lower(message), "verified")
         )
    belief("safe", 0.90)
  cand
    intent "uncertain — no keyword match"
    belief("uncertain", 0.75)

def moderate
  intent "gate classification on confidence — refuse when confidence is too low"
  sig (message: String) -> Label
  effects {}
  cand
    verdict <- classify_content(message)
    believe verdict
      ge(conf(verdict), 0.70) => verdict
      else => bottom

def main_unsafe
  intent "test the unsafe path"
  sig () -> Label
  effects {}
  cand
    moderate("this message contains spam")

def main_uncertain
  intent "test the uncertain path — hello world has no keywords, returns uncertain"
  sig () -> Label
  effects {}
  cand
    moderate("hello world")