diff --git a/config/default/config_evaluation.yaml b/config/default/config_evaluation.yaml new file mode 100644 index 00000000..455fc3c0 --- /dev/null +++ b/config/default/config_evaluation.yaml @@ -0,0 +1,32 @@ +dialogues: data/datasets/moviebot/annotated_dialogues.json +metrics: + - satisfaction + - success_rate + - successful_recommendation_round_ratio + - reward_per_dialogue_length +output: data/evaluation/moviebot_non_quality_results.json + +quality_llm_interface: + llm_interface_class_path: "usersimcrs.llm_interfaces.ollama_interface.OllamaLLMInterface" + llm_interface_args: + configuration_path: config/llm_interface/config_ollama_default.yaml + default_response: "" +quality_aspects: + - REC_RELEVANCE + - COM_STYLE + - FLUENCY + - CONV_FLOW + - OVERALL_SAT + +user_nlu_config: config/default/config_default.yaml +agent_nlu_config: config/default/config_default.yaml + +recommendation_intent_labels: + - REVEAL + - REVEAL.SIMILAR + - REVEAL.NONE + - REVEAL.REVISE +accept_intent_labels: + - NOTE.ACCEPT +reject_intent_labels: + - NOTE.DISLIKE \ No newline at end of file diff --git a/scripts/evaluation/quality_evaluation.py b/scripts/evaluation/quality_evaluation.py deleted file mode 100644 index 3901ef41..00000000 --- a/scripts/evaluation/quality_evaluation.py +++ /dev/null @@ -1,168 +0,0 @@ -"""Script to evaluate dialogue quality using an LLM. - -The script evaluates dialogue quality with regards to five aspects: -- Recommendation relevance -- Communication style -- Fluency -- Conversational flow -- Overall satisfaction - -Each aspect is scored between 1 and 5, where the scores are described in a -dedicated rubric. The scoring is done using a large language model. 
-""" - -import argparse -import json -import os -from collections import defaultdict -from dataclasses import dataclass -from statistics import mean, stdev -from typing import Dict, List, Union - -from tqdm import tqdm - -from dialoguekit.core.dialogue import Dialogue -from dialoguekit.participant.participant import DialogueParticipant -from dialoguekit.utils.dialogue_reader import json_to_dialogues -from scripts.evaluation.rubrics.quality_rubrics import QualityRubrics -from usersimcrs.llm_interfaces.ollama_interface import ( - OllamaLLMInterface, -) - -_PROMPT_EVAL_INTRO = ( - "You are an evaluator and you need to judge how does the " - "ASSISTANT perform based on the following CONVERSATION HISTORY. Please " - "rate the ASSISTANT's performance based on the following GRADING RUBRIC.\n" - "\nCONVERSATION HISTORY:" -) -_PROMPT_EVAL_OUTPUT_FORMAT = ( - 'Your output need be a be in a JSON format as follows:\n{"score": ' - ', "score_explanation": }\nDo not include ' - "additional information.\n" -) - - -@dataclass -class QualityScore: - conversation_id: str - score: int - explanation: str = "" - - def to_dict(self) -> Dict[str, Union[int, str]]: - """Converts the score to a dictionary.""" - return { - "conversation_id": self.conversation_id, - "score": self.score, - "score_explanation": self.explanation, - } - - -class QualityScoreEncoder(json.JSONEncoder): - def default(self, o): - if isinstance(o, QualityScore): - return o.to_dict() - return super().default(o) - - -def parse_args() -> argparse.Namespace: - """Parse command-line arguments. - - Returns: - Parsed arguments. 
- """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--dialogues", - type=str, - required=True, - help="Path to the dialogues.", - ) - parser.add_argument( - "--ollama_config", - type=str, - required=True, - help="Path to the Ollama config file.", - ) - parser.add_argument( - "--output", - type=str, - help="(optional) Path to the output file.", - ) - return parser.parse_args() - - -def get_prompt(grading_rubric: QualityRubrics, dialogue: Dialogue) -> str: - """Prepares prompt given grading rubric and dialogue. - - Args: - grading_rubric: Grading rubric for the aspect. - dialogue: Dialogue. - - Returns: - Prompt comprising task definition, grading rubric, and dialogue. - """ - prompt = _PROMPT_EVAL_INTRO - - # Add dialogue history - for utterance in dialogue.utterances: - role = ( - "USER" - if utterance.participant == DialogueParticipant.USER - else "ASSISTANT" - ) - prompt += f"\n{role}: {utterance.text}" - - prompt += f"\n\nGRADING RUBRIC:\n{grading_rubric.value}\n" - prompt += _PROMPT_EVAL_OUTPUT_FORMAT - return prompt - - -if __name__ == "__main__": - args = parse_args() - - # Load dialogues - dialogues = json_to_dialogues(args.dialogues) - - # Ollama interface - ollama_interface = OllamaLLMInterface( - args.ollama_config, default_response="" - ) - - # Evaluate dialogues - scores: Dict[str, Dict[str, List[QualityScore]]] = defaultdict( - lambda: defaultdict(list) - ) - - for dialogue in tqdm(dialogues): - for aspect in QualityRubrics: - prompt = get_prompt(aspect, dialogue) - response = ollama_interface.get_llm_api_response(prompt) - try: - response = response.replace("\\", "\\\\") - response_dict = json.loads(response) - score = QualityScore( - conversation_id=dialogue.conversation_id, - score=int(response_dict["score"]), - explanation=response_dict["score_explanation"], - ) - scores[dialogue.agent_id][aspect.name].append(score) - except Exception as e: - print( - f"Failed to get score for {aspect} dialogue " - f"{dialogue.conversation_id}: 
{e}\nResponse: {response}" - ) - - # Save scores - if args.output: - os.makedirs(os.path.dirname(args.output), exist_ok=True) - with open(args.output, "w") as f: - json.dump(scores, f, indent=2, cls=QualityScoreEncoder) - - # Summary - for agent_id, agent_scores in scores.items(): - print(f"Scores for agent {agent_id}:") - for aspect_name, aspect_scores in agent_scores.items(): - print(f"Aspect: {aspect_name}") - avg_score = mean([score.score for score in aspect_scores]) - std_dev = stdev([score.score for score in aspect_scores]) - print(f"Average score: {avg_score:.2f} (std dev: {std_dev:.2f})") diff --git a/scripts/evaluation/satisfaction_evaluation.py b/scripts/evaluation/satisfaction_evaluation.py deleted file mode 100644 index ea7dfb11..00000000 --- a/scripts/evaluation/satisfaction_evaluation.py +++ /dev/null @@ -1,63 +0,0 @@ -"""Automatic evaluation of dialogues. - -This script evaluates dialogues with regards to user satisfaction. It uses -DialogueKit's satisfaction classifier, which assigns a score between 1 and 5. -""" - -import argparse -from collections import defaultdict -from statistics import mean, stdev -from typing import Dict - -from dialoguekit.nlu.models.satisfaction_classifier import ( - SatisfactionClassifierSVM, -) -from dialoguekit.utils.dialogue_reader import json_to_dialogues - - -def parse_args() -> argparse.Namespace: - """Parse command-line arguments. - - Returns: - Parsed arguments. 
- """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--dialogues", - type=str, - required=True, - help="Path to the dialogues.", - ) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - - # Load dialogues - dialogues = json_to_dialogues(args.dialogues) - print(f"Loaded {len(dialogues)} dialogues.") - - # Satisfaction classifier - satisfaction_classifier = SatisfactionClassifierSVM() - - # Evaluate dialogues - scores: Dict[str, Dict[int, float]] = defaultdict(dict) - - for i, dialogue in enumerate(dialogues): - scores[dialogue.agent_id][ - i - ] = satisfaction_classifier.classify_last_n_dialogue( - dialogue, last_n=None - ) - - # Summary - for agent, agent_scores in scores.items(): - avg_score = mean(agent_scores.values()) - stdev_score = stdev(agent_scores.values()) - max_score = max(agent_scores.values()) - min_score = min(agent_scores.values()) - print(f"Agent: {agent} / Num. dialogues: {len(agent_scores)}") - print(f"Min score: {min_score}") - print(f"Max score: {max_score}") - print(f"Average score: {avg_score:.3f} (stdev: {stdev_score:.3f})") diff --git a/scripts/evaluation/utility_evaluation.py b/scripts/evaluation/utility_evaluation.py deleted file mode 100644 index b97b2b50..00000000 --- a/scripts/evaluation/utility_evaluation.py +++ /dev/null @@ -1,318 +0,0 @@ -"""Automatic evaluation of dialogues with regards to utility. - -The script computes three user-centric utility metrics proposed by Bernard and -Balog (2025): - -- Success Rate (SR) -- Successful Recommendation Round Ratio (SRRR) -- Reward-per-Dialogue-Length (RDL) - -Reference: -Bernard, Nolwenn, and Krisztian Balog. "Limitations of Current Evaluation -Practices for Conversational Recommender Systems and the Potential of User -Simulation." arXiv preprint arXiv:2510.05624 (2025). 
-https://arxiv.org/abs/2510.05624 -""" - -import argparse -from collections import defaultdict -import json -from typing import Dict, List, Tuple - -from confuse import Configuration -from tqdm import tqdm - -from dialoguekit.core.annotated_utterance import AnnotatedUtterance -from dialoguekit.core.dialogue import Dialogue -from dialoguekit.core.intent import Intent -from dialoguekit.nlu.nlu import NLU -from dialoguekit.participant.participant import DialogueParticipant -from dialoguekit.utils.dialogue_reader import json_to_dialogues -from usersimcrs.utils.simulation_utils import get_NLU - - -def annotate_dialogue( - dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU -) -> Dialogue: - """Annotates utterances with dialogue acts. - - Args: - dialogue: Dialogue to be annotated. - user_nlu: User NLU module. - agent_nlu: Agent NLU module. - - Returns: - Annotated dialogue. - """ - for i, utterance in enumerate(dialogue.utterances): - if not isinstance(utterance, AnnotatedUtterance): - dialogue.utterances[i] = AnnotatedUtterance.from_utterance( - utterance - ) - - if len(utterance.dialogue_acts) > 0: - continue - - if utterance.participant == DialogueParticipant.USER: - dialogue.utterances[ - i - ].dialogue_acts = user_nlu.extract_dialogue_acts(utterance) - elif utterance.participant == DialogueParticipant.AGENT: - dialogue.utterances[ - i - ].dialogue_acts = agent_nlu.extract_dialogue_acts(utterance) - else: - raise ValueError(f"Unknown participant: {utterance.participant}") - return dialogue - - -def annotate_dialogues( - dialogues: List[Dialogue], user_nlu: NLU, agent_nlu: NLU -) -> List[Dialogue]: - """Annotates dialogues with dialogue acts. - - Args: - dialogues: Dialogues. - user_nlu: User NLU module. - agent_nlu: Agent NLU module. - - Returns: - Annotated dialogues. 
- """ - # TODO: Move this to DialogueKit - # See: https://github.com/iai-group/UserSimCRS/issues/219 - return [ - annotate_dialogue(dialogue, user_nlu, agent_nlu) - for dialogue in tqdm(dialogues) - ] - - -def _get_recommendation_rounds( - dialogue: Dialogue, recommendation_intents: List[Intent] -) -> List[List[AnnotatedUtterance]]: - """Gets utterances per recommendation round. - - Args: - dialogue: Dialogue. - recommendation_intents: Intents corresponding to recommendation. - - Returns: - Utterances per recommendation round. - """ - rounds = [] - current_round: List[AnnotatedUtterance] = [] - for utterance in dialogue.utterances: - if any( - intent in utterance.get_intents() - for intent in recommendation_intents - ): - if current_round: - rounds.append(current_round) - current_round = [utterance] - else: - current_round.append(utterance) - return rounds - - -def _is_recommendation_accepted( - round: List[AnnotatedUtterance], - acceptance_intents: List[Intent], - rejection_intents: List[Intent], -) -> bool: - """Assesses whether the recommendation was accepted. - - Args: - round: Utterances in recommendation round. - acceptance_intents: Intents corresponding to acceptance. - rejection_intents: Intents corresponding to rejection. - - Returns: - True if the recommendation was accepted, False otherwise. - """ - b_accepted = False - for utterance in round: - if utterance.participant == DialogueParticipant.USER: - intents = utterance.get_intents() - if any(intent in acceptance_intents for intent in intents): - b_accepted = True - elif any(intent in rejection_intents for intent in intents): - return False - return b_accepted - - -def assess_dialogue( - dialogue: Dialogue, - recommendation_intents: List[Intent], - acceptance_intents: List[Intent], - rejection_intents: List[Intent], -) -> Tuple[int, int, int]: - """Assesses the utility of the dialogue. - - Args: - dialogue: Dialogue. - recommendation_intents: Intents corresponding to recommendation. 
- acceptance_intents: Intents corresponding to acceptance. - rejection_intents: Intents corresponding to rejection. - - Returns: - Tuple of number of accepted recommendations, successful recommendation - rounds and total recommendation rounds. - """ - # TODO: Optimize overall assessment to avoid multiple iterations over - # utterances. - rounds = _get_recommendation_rounds(dialogue, recommendation_intents) - successful_rounds = 0 - for round in rounds: - if _is_recommendation_accepted( - round, acceptance_intents, rejection_intents - ): - successful_rounds += 1 - - nb_accepted_recommendations = sum( - 1 - for utterance in dialogue.utterances - if utterance.participant == DialogueParticipant.USER - and any( - intent in acceptance_intents for intent in utterance.get_intents() - ) - ) - return nb_accepted_recommendations, successful_rounds, len(rounds) - - -def get_summary(dialogues: List[Dialogue]) -> None: - """Displays a summary of the utility evaluation. - - Args: - dialogues: Dialogues. 
- """ - summary: Dict[str, Dict[str, float]] = defaultdict( - lambda: { - "total_dialogues": 0, - "success_rate": 0, - "srrr": 0, - "rdl": 0, - } - ) - for dialogue in dialogues: - summary[dialogue.agent_id]["total_dialogues"] += 1 - summary[dialogue.agent_id]["success_rate"] += dialogue.metadata[ - "utility" - ]["success"] - summary[dialogue.agent_id]["srrr"] += dialogue.metadata["utility"][ - "successful_recommendation_round_ratio" - ] - summary[dialogue.agent_id]["rdl"] += dialogue.metadata["utility"][ - "reward_per_dialogue_length" - ] - - for agent_id, stats in summary.items(): - total = stats["total_dialogues"] - print(f"Agent: {agent_id}") - print(f"\tTotal Dialogues: {total}") - print(f"\tSuccess Rate: {stats['success_rate'] / total:.4f}") - print( - "\tSuccessful Recommendation Round Ratio: " - f"{stats['srrr'] / total:.4f}" - ) - print(f"\tReward-per-Dialogue-Length: {stats['rdl'] / total:.4f}") - print() - - -def parse_args() -> argparse.Namespace: - """Parses command-line arguments. - - Returns: - Parsed command-line arguments. 
- """ - parser = argparse.ArgumentParser(prog="utility_evaluation.py") - parser.add_argument( - "annotated_dialogues", - type=str, - help="Annotated dialogues JSON file.", - ) - parser.add_argument( - "user_nlu_config", - type=str, - help="User NLU configuration file.", - ) - parser.add_argument( - "agent_nlu_config", - type=str, - help="Agent NLU configuration file.", - ) - parser.add_argument( - "--reject_intent_labels", - nargs="+", - default=["REJ"], - help="Intent labels corresponding to rejection.", - ) - parser.add_argument( - "--accept_intent_labels", - nargs="+", - default=["ACC"], - help="Intent labels corresponding to acceptance.", - ) - parser.add_argument( - "--recommendation_intent_labels", - nargs="+", - default=["REC-S", "REC-E"], - help="Intent labels corresponding to recommendation.", - ) - parser.add_argument( - "--output", - type=str, - help="Output file to save annotated dialogues with utility metrics.", - ) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - - dialogues = json_to_dialogues(args.annotated_dialogues) - - rejection_intents = [Intent(label) for label in args.reject_intent_labels] - acceptance_intents = [Intent(label) for label in args.accept_intent_labels] - recommendation_intents = [ - Intent(label) for label in args.recommendation_intent_labels - ] - - # NLU module for user utterances - user_nlu_config = Configuration("User NLU Configuration") - user_nlu_config.set_file(args.user_nlu_config) - user_nlu = get_NLU(user_nlu_config) - - # NLU module for agent utterances - agent_nlu_config = Configuration("Agent NLU Configuration") - agent_nlu_config.set_file(args.agent_nlu_config) - agent_nlu = get_NLU(agent_nlu_config) - - dialogues = annotate_dialogues(dialogues, user_nlu, agent_nlu) - for dialogue in dialogues: - ( - nb_accepted_recommendations, - successful_rounds, - total_rounds, - ) = assess_dialogue( - dialogue, - recommendation_intents, - acceptance_intents, - rejection_intents, - ) - 
dialogue.metadata["utility"] = { - "success": int(successful_rounds > 0), - "successful_recommendation_round_ratio": ( - successful_rounds / total_rounds if total_rounds > 0 else 0.0 - ), - "reward_per_dialogue_length": ( - nb_accepted_recommendations / len(dialogue.utterances) - ), - } - - if args.output: - with open(args.output, "w") as f: - json.dump( - [dialogue.to_dict() for dialogue in dialogues], f, indent=2 - ) - - get_summary(dialogues) diff --git a/usersimcrs/evaluation/base_metric.py b/usersimcrs/evaluation/base_metric.py new file mode 100644 index 00000000..c99399a2 --- /dev/null +++ b/usersimcrs/evaluation/base_metric.py @@ -0,0 +1,48 @@ +"""Abstract base class for dialogue evaluation metrics.""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, List +from dialoguekit.core.dialogue import Dialogue + + +class BaseMetric(ABC): + def __init__(self, name: str) -> None: + """Initializes the metric. + + Args: + name: Metric name. + """ + self.name = name + + @abstractmethod + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Computes the metric for a single dialogue. + + Args: + dialogue: Single dialogue to score. + **kwargs: Additional arguments specific to the metric. + + Raises: + NotImplementedError: When not implemented by a subclass. + + Returns: + Score for the dialogue. + """ + raise NotImplementedError() + + def evaluate_dialogues( + self, dialogues: List[Dialogue], **kwargs: Any + ) -> Dict[str, float]: + """Computes the metric for every dialogue in a given list. + + Args: + dialogues: Dialogues. + **kwargs: Additional arguments specific to the metric. + + Returns: + Dictionary with result per dialogue. Keys are conversation IDs. 
+ """ + return { + dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs) + for dialogue in dialogues + } diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py new file mode 100644 index 00000000..58ffb2c9 --- /dev/null +++ b/usersimcrs/evaluation/dialogue_annotation.py @@ -0,0 +1,136 @@ +"""Dialogue annotation and recommendation round utilities. + +Provides functions for annotating dialogues with dialogue acts using NLU +modules, extracting recommendation rounds from annotated dialogues, and +assessing recommendation acceptance. +""" + +from typing import List + +from dialoguekit.core.annotated_utterance import AnnotatedUtterance +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent +from dialoguekit.nlu.nlu import NLU +from dialoguekit.participant.participant import DialogueParticipant + + +def ensure_dialogue_is_annotated(dialogue: Dialogue) -> None: + """Raises error if dialogue utterances are not annotated.""" + for utterance in dialogue.utterances: + if not isinstance(utterance, AnnotatedUtterance): + raise RuntimeError("Dialogue must be annotated.") + + +def annotate_dialogue( + dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU +) -> Dialogue: + """Annotates utterances with dialogue acts. + + Each utterance that is not already an AnnotatedUtterance is converted to + one. Utterances that already carry dialogue acts are left untouched. + + Args: + dialogue: Dialogue to be annotated. + user_nlu: NLU module for user utterances. + agent_nlu: NLU module for agent utterances. + + Raises: + ValueError: If an utterance has an unknown participant. + + Returns: + The same dialogue object with annotated utterances. 
+ """ + for i, utterance in enumerate(dialogue.utterances): + if not isinstance(utterance, AnnotatedUtterance): + dialogue.utterances[i] = AnnotatedUtterance.from_utterance( + utterance + ) + + if len(utterance.dialogue_acts) > 0: + continue + + if utterance.participant == DialogueParticipant.USER: + dialogue.utterances[ + i + ].dialogue_acts = user_nlu.extract_dialogue_acts(utterance) + elif utterance.participant == DialogueParticipant.AGENT: + dialogue.utterances[ + i + ].dialogue_acts = agent_nlu.extract_dialogue_acts(utterance) + else: + raise ValueError(f"Unknown participant: {utterance.participant}") + return dialogue + + +def annotate_dialogues( + dialogues: List[Dialogue], + user_nlu: NLU, + agent_nlu: NLU, +) -> None: + """Annotates dialogues in place using provided NLU modules. + + Args: + dialogues: Dialogues to annotate (modified in place). + user_nlu: NLU module for user utterances. + agent_nlu: NLU module for agent utterances. + """ + for dialogue in dialogues: + annotate_dialogue(dialogue, user_nlu, agent_nlu) + + +def get_recommendation_rounds( + dialogue: Dialogue, recommendation_intents: List[Intent] +) -> List[List[AnnotatedUtterance]]: + """Splits a dialogue into recommendation rounds. + + A new round begins each time an utterance contains a recommendation + intent. + + Args: + dialogue: Annotated dialogue. + recommendation_intents: Intents that signal a recommendation. + + Returns: + List of utterance groups, one per recommendation round. 
+ """ + rounds: List[List[AnnotatedUtterance]] = [] + current_round: List[AnnotatedUtterance] = [] + for utterance in dialogue.utterances: + if any( + intent in utterance.get_intents() + for intent in recommendation_intents + ): + if current_round: + rounds.append(current_round) + current_round = [utterance] + else: + current_round.append(utterance) + if current_round: + rounds.append(current_round) + return rounds + + +def is_recommendation_accepted( + round_utterances: List[AnnotatedUtterance], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], +) -> bool: + """Assesses whether a recommendation round was accepted. + + Args: + round_utterances: Utterances in the recommendation round. + acceptance_intents: Intents corresponding to acceptance. + rejection_intents: Intents corresponding to rejection. + + Returns: + True if the recommendation was accepted, False otherwise. + """ + b_accepted = False + for utterance in round_utterances: + if utterance.participant == DialogueParticipant.USER: + intents = utterance.get_intents() + if any(intent in acceptance_intents for intent in intents): + b_accepted = True + elif any(intent in rejection_intents for intent in intents): + return False + return b_accepted diff --git a/usersimcrs/evaluation/quality_metric.py b/usersimcrs/evaluation/quality_metric.py new file mode 100644 index 00000000..acbfc595 --- /dev/null +++ b/usersimcrs/evaluation/quality_metric.py @@ -0,0 +1,121 @@ +"""LLM-based dialogue quality evaluation. + +The script evaluates dialogue quality with regards to five aspects: +- Recommendation relevance +- Communication style +- Fluency +- Conversational flow +- Overall satisfaction + +Each aspect is scored between 1 and 5, where the scores are described in a +dedicated rubric. The scoring is done using a large language model. 
+""" + +import json +import logging +from typing import Any, Literal + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.participant.participant import DialogueParticipant + +from usersimcrs.evaluation.base_metric import BaseMetric +from usersimcrs.evaluation.quality_rubrics import QualityRubrics +from usersimcrs.llm_interfaces.llm_interface import LLMInterface + + +_PROMPT_EVAL_INTRO = ( + "You are an evaluator and you need to judge how does the " + "ASSISTANT perform based on the following CONVERSATION HISTORY. Please " + "rate the ASSISTANT's performance based on the following GRADING RUBRIC.\n" + "\nCONVERSATION HISTORY:" +) +_PROMPT_EVAL_OUTPUT_FORMAT = ( + 'Your output need be a be in a JSON format as follows:\n{"score": ' + ', "score_explanation": }\nDo not include ' + "additional information.\n" +) + + +class QualityMetric(BaseMetric): + def __init__( + self, + llm_interface: LLMInterface, + name: str = "quality", + ) -> None: + """Initializes the quality metric. + + Args: + llm_interface: LLM interface used for scoring. + name: Metric name. Defaults to "quality". + """ + super().__init__(name) + self.llm_interface = llm_interface + + def _get_prompt( + self, grading_rubric: QualityRubrics, dialogue: Dialogue + ) -> str: + """Prepares prompt given grading rubric and dialogue. + + Args: + grading_rubric: Grading rubric for the aspect. + dialogue: Dialogue. + + Returns: + Prompt comprising task definition, grading rubric, and dialogue. 
+ """ + prompt = _PROMPT_EVAL_INTRO + for utterance in dialogue.utterances: + role = ( + "USER" + if utterance.participant == DialogueParticipant.USER + else "ASSISTANT" + ) + prompt += f"\n{role}: {utterance.text}" + + prompt += f"\n\nGRADING RUBRIC:\n{grading_rubric.value}\n" + prompt += _PROMPT_EVAL_OUTPUT_FORMAT + return prompt + + def evaluate_dialogue( + self, + dialogue: Dialogue, + aspect: Literal[ + "REC_RELEVANCE", + "COM_STYLE", + "FLUENCY", + "CONV_FLOW", + "OVERALL_SAT", + ], + **kwargs: Any, + ) -> float: + """Returns score for a single aspect of a dialogue. + + Args: + dialogue: Dialogue to evaluate. + aspect: Aspect to evaluate. One of QualityRubrics enum names. + + Raises: + KeyError: When the aspect does not exist in QualityRubrics. + + Returns: + Score (1-5) for the specified aspect. + """ + try: + aspect_enum = QualityRubrics[aspect] + except KeyError: + supported = [e.name for e in QualityRubrics] + raise KeyError( + f"Unknown aspect '{aspect}'. Supported aspects: {supported}" + ) + prompt = self._get_prompt(aspect_enum, dialogue) + response = self.llm_interface.get_llm_api_response(prompt) + try: + response = response.replace("\\", "\\\\") + response_dict = json.loads(response) + return float(response_dict["score"]) + except Exception: + logging.warning( + f"Failed to parse LLM response for {aspect} dialogue " + f"{dialogue.conversation_id}: {response}", + ) + return 0.0 diff --git a/scripts/evaluation/rubrics/quality_rubrics.py b/usersimcrs/evaluation/quality_rubrics.py similarity index 100% rename from scripts/evaluation/rubrics/quality_rubrics.py rename to usersimcrs/evaluation/quality_rubrics.py diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py new file mode 100644 index 00000000..ecabb410 --- /dev/null +++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py @@ -0,0 +1,55 @@ +"""Reward-per-Dialogue-Length metric implementation. 
+ +Evaluates the ratio of accepted recommendations to total dialogue length. +""" + +from typing import Any, List + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent +from dialoguekit.participant.participant import DialogueParticipant + +from usersimcrs.evaluation.base_metric import BaseMetric +from usersimcrs.evaluation.dialogue_annotation import ( + ensure_dialogue_is_annotated, +) + + +class RewardPerDialogueLengthMetric(BaseMetric): + def __init__( + self, + name: str = "reward_per_dialogue_length", + ) -> None: + """Initializes the reward-per-dialogue-length metric. + + Args: + name: Metric name. Defaults to "reward_per_dialogue_length". + """ + super().__init__(name) + + def evaluate_dialogue( + self, + dialogue: Dialogue, + acceptance_intents: List[Intent], + **kwargs: Any, + ) -> float: + """Computes the reward-per-dialogue-length score. + + Args: + dialogue: Dialogue to evaluate. + acceptance_intents: Acceptance intents. + + Returns: + Ratio of accepted recommendations to total utterances. + """ + ensure_dialogue_is_annotated(dialogue) + nb_accepted = sum( + 1 + for utterance in dialogue.utterances + if utterance.participant == DialogueParticipant.USER + and any( + intent in acceptance_intents + for intent in utterance.get_intents() + ) + ) + return nb_accepted / len(dialogue.utterances) diff --git a/usersimcrs/evaluation/satisfaction_metric.py b/usersimcrs/evaluation/satisfaction_metric.py new file mode 100644 index 00000000..664f91c0 --- /dev/null +++ b/usersimcrs/evaluation/satisfaction_metric.py @@ -0,0 +1,35 @@ +"""Satisfaction metric class implementation. + +Satisfaction assessment based on DialogueKit classifier. 
+""" + +from typing import Any + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.nlu.models.satisfaction_classifier import ( + SatisfactionClassifier, +) + +from usersimcrs.evaluation.base_metric import BaseMetric + + +class SatisfactionMetric(BaseMetric): + def __init__( + self, + classifier: SatisfactionClassifier, + name: str = "satisfaction", + ) -> None: + """Initializes the satisfaction metric. + + Args: + classifier: Satisfaction classifier instance. + name: Metric name. + """ + super().__init__(name) + self.classifier = classifier + + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Computes the satisfaction score for a single dialogue.""" + return float( + self.classifier.classify_last_n_dialogue(dialogue, last_n=None) + ) diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py new file mode 100644 index 00000000..9d3dd3bd --- /dev/null +++ b/usersimcrs/evaluation/success_rate_metric.py @@ -0,0 +1,59 @@ +"""Success Rate metric implementation. + +Evaluates whether at least one recommendation was accepted during a dialogue. +""" + +from typing import Any, List + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent + +from usersimcrs.evaluation.base_metric import BaseMetric +from usersimcrs.evaluation.dialogue_annotation import ( + ensure_dialogue_is_annotated, + get_recommendation_rounds, + is_recommendation_accepted, +) + + +class SuccessRateMetric(BaseMetric): + def __init__( + self, + name: str = "success_rate", + ) -> None: + """Initializes the success rate metric. + + Args: + name: Metric name. + """ + super().__init__(name) + + def evaluate_dialogue( + self, + dialogue: Dialogue, + recommendation_intents: List[Intent], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], + **kwargs: Any, + ) -> float: + """Computes the success rate for a single dialogue. + + Args: + dialogue: Dialogue to evaluate. 
+ recommendation_intents: Intents that indicate recommendation. + acceptance_intents: Intents that indicate acceptance. + rejection_intents: Intents that indicate rejection. + + Returns: + 1.0 if at least one recommendation was accepted, 0.0 otherwise. + """ + ensure_dialogue_is_annotated(dialogue) + rounds = get_recommendation_rounds(dialogue, recommendation_intents) + return float( + any( + is_recommendation_accepted( + r, acceptance_intents, rejection_intents + ) + for r in rounds + ) + ) diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py new file mode 100644 index 00000000..a544696d --- /dev/null +++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py @@ -0,0 +1,62 @@ +"""Successful Recommendation Round Ratio metric implementation. + +Evaluates the ratio of accepted recommendation rounds to total recommendation +rounds in a dialogue. +""" + +from typing import Any, List + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent + +from usersimcrs.evaluation.base_metric import BaseMetric +from usersimcrs.evaluation.dialogue_annotation import ( + ensure_dialogue_is_annotated, + get_recommendation_rounds, + is_recommendation_accepted, +) + + +class SuccessfulRecommendationRoundRatioMetric(BaseMetric): + def __init__( + self, + name: str = "successful_recommendation_round_ratio", + ) -> None: + """Initializes the successful recommendation round ratio metric. + + Args: + name: Metric name. Defaults to + "successful_recommendation_round_ratio". + """ + super().__init__(name) + + def evaluate_dialogue( + self, + dialogue: Dialogue, + recommendation_intents: List[Intent], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], + **kwargs: Any, + ) -> float: + """Computes the successful recommendation round ratio. + + Args: + dialogue: Dialogue to evaluate. 
class SuccessfulRecommendationRoundRatioMetric(BaseMetric):
    def __init__(
        self,
        name: str = "successful_recommendation_round_ratio",
    ) -> None:
        """Initializes the successful recommendation round ratio metric.

        Args:
            name: Metric name. Defaults to
                "successful_recommendation_round_ratio".
        """
        super().__init__(name)

    def evaluate_dialogue(
        self,
        dialogue: Dialogue,
        recommendation_intents: List[Intent],
        acceptance_intents: List[Intent],
        rejection_intents: List[Intent],
        **kwargs: Any,
    ) -> float:
        """Computes the share of recommendation rounds that were accepted.

        Args:
            dialogue: Dialogue to evaluate.
            recommendation_intents: Intents that indicate recommendation.
            acceptance_intents: Intents that indicate acceptance.
            rejection_intents: Intents that indicate rejection.

        Returns:
            Ratio of accepted recommendation rounds to total rounds,
            or 0.0 if there are no recommendation rounds.
        """
        ensure_dialogue_is_annotated(dialogue)
        rounds = get_recommendation_rounds(dialogue, recommendation_intents)
        if not rounds:
            # No recommendations were made, so none could succeed.
            return 0.0
        accepted = 0
        for candidate_round in rounds:
            if is_recommendation_accepted(
                candidate_round, acceptance_intents, rejection_intents
            ):
                accepted += 1
        return accepted / len(rounds)
"successful_recommendation_round_ratio", + "reward_per_dialogue_length", +} +SUPPORTED_METRICS = [ + "quality", + "satisfaction", + "success_rate", + "successful_recommendation_round_ratio", + "reward_per_dialogue_length", +] + + +def parse_args() -> argparse.Namespace: + """Defines accepted arguments and returns the parsed values.""" + parser = argparse.ArgumentParser(prog="run_evaluation.py") + parser.add_argument( + "-c", + "--config-file", + help=( + "Path to configuration file to overwrite default values. " + "Defaults to None." + ), + ) + parser.add_argument("--dialogues", type=str, help="Dialogues JSON file.") + parser.add_argument( + "--metrics", + nargs="+", + choices=SUPPORTED_METRICS, + help="Metrics to compute.", + ) + parser.add_argument( + "--output", + type=str, + help="Path to save evaluation results as JSON.", + ) + parser.add_argument( + "--quality_aspects", + nargs="+", + help="Quality aspects to evaluate.", + ) + parser.add_argument( + "--user_nlu_config", + type=str, + help="User NLU configuration file.", + ) + parser.add_argument( + "--agent_nlu_config", + type=str, + help="Agent NLU configuration file.", + ) + parser.add_argument( + "--reject_intent_labels", + nargs="+", + help="Intent labels corresponding to rejection.", + ) + parser.add_argument( + "--accept_intent_labels", + nargs="+", + help="Intent labels corresponding to acceptance.", + ) + parser.add_argument( + "--recommendation_intent_labels", + nargs="+", + help="Intent labels corresponding to recommendation.", + ) + parser.add_argument( + "-d", + "--debug", + action="store_const", + const=True, + help="Debug mode.", + ) + return parser.parse_args() + + +def load_config(args: argparse.Namespace) -> confuse.Configuration: + """Loads config from default file, custom file, and CLI overrides.""" + config = confuse.Configuration("usersimcrs") + config.set_file(DEFAULT_CONFIG_PATH) + if args.config_file: + config.set_file(args.config_file) + config.set_args(args, dots=True) + return 
def validate_config(config: confuse.Configuration) -> List[str]:
    """Validates evaluation config and returns quality aspects.

    Quality-specific options are only required (and only validated) when the
    "quality" metric is requested; previously the unconditional lookup of
    `quality_aspects` crashed non-quality runs whose config omitted the key.

    Args:
        config: Resolved configuration.

    Returns:
        Quality aspect names, or an empty list when the "quality" metric is
        not requested.

    Raises:
        ValueError: If a required option is missing or an aspect is unknown.
    """
    metrics = config["metrics"].get()

    quality_aspects: List[str] = []
    if "quality" in metrics:
        if "quality_llm_interface" not in config:
            raise ValueError(
                "Quality evaluation requires `quality_llm_interface`."
            )
        quality_aspects = config["quality_aspects"].get()
        supported_aspects = [aspect.name for aspect in QualityRubrics]
        invalid_aspects = [
            aspect
            for aspect in quality_aspects
            if aspect not in supported_aspects
        ]
        if invalid_aspects:
            raise ValueError(
                f"Unknown quality aspect(s): {invalid_aspects}. "
                f"Supported aspects: {supported_aspects}"
            )

    if UTILITY_METRICS.intersection(set(metrics)):
        # Utility metrics re-annotate dialogues and need both NLU configs.
        if not config["user_nlu_config"].get(None):
            raise ValueError(
                "`user_nlu_config` is required for utility metrics."
            )
        if not config["agent_nlu_config"].get(None):
            raise ValueError(
                "`agent_nlu_config` is required for utility metrics."
            )

    return quality_aspects
def get_summary_by_agent(
    dialogues: Sequence[Any], scores: Mapping[str, float]
) -> Dict[str, Dict[str, float]]:
    """Aggregates per-dialogue scores into per-agent summary statistics.

    Args:
        dialogues: Dialogues carrying `agent_id` and `conversation_id`.
        scores: Mapping from conversation id to metric score.

    Returns:
        Mapping from agent id to count/min/max/mean/stdev of its scores.
    """
    per_agent: Dict[str, List[float]] = {}
    for dlg in dialogues:
        per_agent.setdefault(dlg.agent_id, []).append(
            scores[dlg.conversation_id]
        )

    summary: Dict[str, Dict[str, float]] = {}
    for agent_id, values in per_agent.items():
        # A single observation has no spread; report 0.0 instead of raising.
        spread = stdev(values) if len(values) > 1 else 0.0
        summary[agent_id] = {
            "count": len(values),
            "min": min(values),
            "max": max(values),
            "mean": mean(values),
            "stdev": spread,
        }
    return summary
"summary_by_agent": get_summary_by_agent(dialogues, scores), + } + for aspect in quality_aspects + for scores in [ + metric.evaluate_dialogues(dialogues, aspect=aspect) + ] + } + } + + if metric_name in { + "success_rate", + "successful_recommendation_round_ratio", + }: + scores = metric.evaluate_dialogues(dialogues, **utility_intents) + elif metric_name == "reward_per_dialogue_length": + scores = metric.evaluate_dialogues( + dialogues, + acceptance_intents=utility_intents["acceptance_intents"], + ) + else: + scores = metric.evaluate_dialogues(dialogues) + + return { + "per_dialogue": scores, + "summary_by_agent": get_summary_by_agent(dialogues, scores), + } + + +def save_results( + config: confuse.Configuration, results: Dict[str, Any] +) -> None: + """Writes config dump and evaluation results to disk.""" + output_path = config["output"].get() + output_dir = os.path.dirname(output_path) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + + output_stem, _ = os.path.splitext(output_path) + with open(f"{output_stem}.meta.yaml", "w") as f: + f.write(config.dump()) + + with open(output_path, "w") as f: + json.dump(results, f, indent=2) + + +def print_summary(results: Mapping[str, Any]) -> None: + """Prints a concise terminal summary.""" + for metric_name, metric_result in results["metrics"].items(): + print(f"Metric: {metric_name}") + if metric_name == "quality": + for aspect_name, aspect_result in metric_result["aspects"].items(): + print(f" Aspect: {aspect_name}") + for agent_id, stats in aspect_result[ + "summary_by_agent" + ].items(): + print( + f" Agent: {agent_id} | mean={stats['mean']:.3f} " + f"stdev={stats['stdev']:.3f}" + ) + continue + + for agent_id, stats in metric_result["summary_by_agent"].items(): + print( + f" Agent: {agent_id} | mean={stats['mean']:.3f} " + f"stdev={stats['stdev']:.3f}" + ) + + +def main() -> None: + """Runs evaluation based on the resolved configuration.""" + args = parse_args() + config = load_config(args) + + metrics = 
config["metrics"].get() + quality_aspects = validate_config(config) + dialogues = json_to_dialogues(config["dialogues"].get()) + annotate_for_utility(dialogues, config, metrics) + + utility_intents = get_utility_intents(config) + metric_registry = build_metric_registry(config, metrics) + + results: Dict[str, Any] = { + "dialogues_path": config["dialogues"].get(), + "metrics_requested": metrics, + "metrics": {}, + } + + for metric_name in metrics: + results["metrics"][metric_name] = evaluate_metric( + metric_name, + metric_registry[metric_name], + dialogues, + quality_aspects, + utility_intents, + ) + + save_results(config, results) + print_summary(results) + + +if __name__ == "__main__": + main()