diff --git a/config/default/config_evaluation.yaml b/config/default/config_evaluation.yaml new file mode 100644 index 00000000..455fc3c0 --- /dev/null +++ b/config/default/config_evaluation.yaml @@ -0,0 +1,32 @@ +dialogues: data/datasets/moviebot/annotated_dialogues.json +metrics: + - satisfaction + - success_rate + - successful_recommendation_round_ratio + - reward_per_dialogue_length +output: data/evaluation/moviebot_non_quality_results.json + +quality_llm_interface: + llm_interface_class_path: "usersimcrs.llm_interfaces.ollama_interface.OllamaLLMInterface" + llm_interface_args: + configuration_path: config/llm_interface/config_ollama_default.yaml + default_response: "" +quality_aspects: + - REC_RELEVANCE + - COM_STYLE + - FLUENCY + - CONV_FLOW + - OVERALL_SAT + +user_nlu_config: config/default/config_default.yaml +agent_nlu_config: config/default/config_default.yaml + +recommendation_intent_labels: + - REVEAL + - REVEAL.SIMILAR + - REVEAL.NONE + - REVEAL.REVISE +accept_intent_labels: + - NOTE.ACCEPT +reject_intent_labels: + - NOTE.DISLIKE \ No newline at end of file diff --git a/scripts/evaluation/quality_evaluation.py b/scripts/evaluation/quality_evaluation.py deleted file mode 100644 index 3901ef41..00000000 --- a/scripts/evaluation/quality_evaluation.py +++ /dev/null @@ -1,168 +0,0 @@ -"""Script to evaluate dialogue quality using an LLM. - -The script evaluates dialogue quality with regards to five aspects: -- Recommendation relevance -- Communication style -- Fluency -- Conversational flow -- Overall satisfaction - -Each aspect is scored between 1 and 5, where the scores are described in a -dedicated rubric. The scoring is done using a large language model. 
-""" - -import argparse -import json -import os -from collections import defaultdict -from dataclasses import dataclass -from statistics import mean, stdev -from typing import Dict, List, Union - -from tqdm import tqdm - -from dialoguekit.core.dialogue import Dialogue -from dialoguekit.participant.participant import DialogueParticipant -from dialoguekit.utils.dialogue_reader import json_to_dialogues -from scripts.evaluation.rubrics.quality_rubrics import QualityRubrics -from usersimcrs.llm_interfaces.ollama_interface import ( - OllamaLLMInterface, -) - -_PROMPT_EVAL_INTRO = ( - "You are an evaluator and you need to judge how does the " - "ASSISTANT perform based on the following CONVERSATION HISTORY. Please " - "rate the ASSISTANT's performance based on the following GRADING RUBRIC.\n" - "\nCONVERSATION HISTORY:" -) -_PROMPT_EVAL_OUTPUT_FORMAT = ( - 'Your output need be a be in a JSON format as follows:\n{"score": ' - ', "score_explanation": }\nDo not include ' - "additional information.\n" -) - - -@dataclass -class QualityScore: - conversation_id: str - score: int - explanation: str = "" - - def to_dict(self) -> Dict[str, Union[int, str]]: - """Converts the score to a dictionary.""" - return { - "conversation_id": self.conversation_id, - "score": self.score, - "score_explanation": self.explanation, - } - - -class QualityScoreEncoder(json.JSONEncoder): - def default(self, o): - if isinstance(o, QualityScore): - return o.to_dict() - return super().default(o) - - -def parse_args() -> argparse.Namespace: - """Parse command-line arguments. - - Returns: - Parsed arguments. 
- """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--dialogues", - type=str, - required=True, - help="Path to the dialogues.", - ) - parser.add_argument( - "--ollama_config", - type=str, - required=True, - help="Path to the Ollama config file.", - ) - parser.add_argument( - "--output", - type=str, - help="(optional) Path to the output file.", - ) - return parser.parse_args() - - -def get_prompt(grading_rubric: QualityRubrics, dialogue: Dialogue) -> str: - """Prepares prompt given grading rubric and dialogue. - - Args: - grading_rubric: Grading rubric for the aspect. - dialogue: Dialogue. - - Returns: - Prompt comprising task definition, grading rubric, and dialogue. - """ - prompt = _PROMPT_EVAL_INTRO - - # Add dialogue history - for utterance in dialogue.utterances: - role = ( - "USER" - if utterance.participant == DialogueParticipant.USER - else "ASSISTANT" - ) - prompt += f"\n{role}: {utterance.text}" - - prompt += f"\n\nGRADING RUBRIC:\n{grading_rubric.value}\n" - prompt += _PROMPT_EVAL_OUTPUT_FORMAT - return prompt - - -if __name__ == "__main__": - args = parse_args() - - # Load dialogues - dialogues = json_to_dialogues(args.dialogues) - - # Ollama interface - ollama_interface = OllamaLLMInterface( - args.ollama_config, default_response="" - ) - - # Evaluate dialogues - scores: Dict[str, Dict[str, List[QualityScore]]] = defaultdict( - lambda: defaultdict(list) - ) - - for dialogue in tqdm(dialogues): - for aspect in QualityRubrics: - prompt = get_prompt(aspect, dialogue) - response = ollama_interface.get_llm_api_response(prompt) - try: - response = response.replace("\\", "\\\\") - response_dict = json.loads(response) - score = QualityScore( - conversation_id=dialogue.conversation_id, - score=int(response_dict["score"]), - explanation=response_dict["score_explanation"], - ) - scores[dialogue.agent_id][aspect.name].append(score) - except Exception as e: - print( - f"Failed to get score for {aspect} dialogue " - f"{dialogue.conversation_id}: 
{e}\nResponse: {response}" - ) - - # Save scores - if args.output: - os.makedirs(os.path.dirname(args.output), exist_ok=True) - with open(args.output, "w") as f: - json.dump(scores, f, indent=2, cls=QualityScoreEncoder) - - # Summary - for agent_id, agent_scores in scores.items(): - print(f"Scores for agent {agent_id}:") - for aspect_name, aspect_scores in agent_scores.items(): - print(f"Aspect: {aspect_name}") - avg_score = mean([score.score for score in aspect_scores]) - std_dev = stdev([score.score for score in aspect_scores]) - print(f"Average score: {avg_score:.2f} (std dev: {std_dev:.2f})") diff --git a/scripts/evaluation/satisfaction_evaluation.py b/scripts/evaluation/satisfaction_evaluation.py deleted file mode 100644 index ea7dfb11..00000000 --- a/scripts/evaluation/satisfaction_evaluation.py +++ /dev/null @@ -1,63 +0,0 @@ -"""Automatic evaluation of dialogues. - -This script evaluates dialogues with regards to user satisfaction. It uses -DialogueKit's satisfaction classifier, which assigns a score between 1 and 5. -""" - -import argparse -from collections import defaultdict -from statistics import mean, stdev -from typing import Dict - -from dialoguekit.nlu.models.satisfaction_classifier import ( - SatisfactionClassifierSVM, -) -from dialoguekit.utils.dialogue_reader import json_to_dialogues - - -def parse_args() -> argparse.Namespace: - """Parse command-line arguments. - - Returns: - Parsed arguments. 
- """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--dialogues", - type=str, - required=True, - help="Path to the dialogues.", - ) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - - # Load dialogues - dialogues = json_to_dialogues(args.dialogues) - print(f"Loaded {len(dialogues)} dialogues.") - - # Satisfaction classifier - satisfaction_classifier = SatisfactionClassifierSVM() - - # Evaluate dialogues - scores: Dict[str, Dict[int, float]] = defaultdict(dict) - - for i, dialogue in enumerate(dialogues): - scores[dialogue.agent_id][ - i - ] = satisfaction_classifier.classify_last_n_dialogue( - dialogue, last_n=None - ) - - # Summary - for agent, agent_scores in scores.items(): - avg_score = mean(agent_scores.values()) - stdev_score = stdev(agent_scores.values()) - max_score = max(agent_scores.values()) - min_score = min(agent_scores.values()) - print(f"Agent: {agent} / Num. dialogues: {len(agent_scores)}") - print(f"Min score: {min_score}") - print(f"Max score: {max_score}") - print(f"Average score: {avg_score:.3f} (stdev: {stdev_score:.3f})") diff --git a/scripts/evaluation/utility_evaluation.py b/scripts/evaluation/utility_evaluation.py deleted file mode 100644 index b97b2b50..00000000 --- a/scripts/evaluation/utility_evaluation.py +++ /dev/null @@ -1,318 +0,0 @@ -"""Automatic evaluation of dialogues with regards to utility. - -The script computes three user-centric utility metrics proposed by Bernard and -Balog (2025): - -- Success Rate (SR) -- Successful Recommendation Round Ratio (SRRR) -- Reward-per-Dialogue-Length (RDL) - -Reference: -Bernard, Nolwenn, and Krisztian Balog. "Limitations of Current Evaluation -Practices for Conversational Recommender Systems and the Potential of User -Simulation." arXiv preprint arXiv:2510.05624 (2025). 
-https://arxiv.org/abs/2510.05624 -""" - -import argparse -from collections import defaultdict -import json -from typing import Dict, List, Tuple - -from confuse import Configuration -from tqdm import tqdm - -from dialoguekit.core.annotated_utterance import AnnotatedUtterance -from dialoguekit.core.dialogue import Dialogue -from dialoguekit.core.intent import Intent -from dialoguekit.nlu.nlu import NLU -from dialoguekit.participant.participant import DialogueParticipant -from dialoguekit.utils.dialogue_reader import json_to_dialogues -from usersimcrs.utils.simulation_utils import get_NLU - - -def annotate_dialogue( - dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU -) -> Dialogue: - """Annotates utterances with dialogue acts. - - Args: - dialogue: Dialogue to be annotated. - user_nlu: User NLU module. - agent_nlu: Agent NLU module. - - Returns: - Annotated dialogue. - """ - for i, utterance in enumerate(dialogue.utterances): - if not isinstance(utterance, AnnotatedUtterance): - dialogue.utterances[i] = AnnotatedUtterance.from_utterance( - utterance - ) - - if len(utterance.dialogue_acts) > 0: - continue - - if utterance.participant == DialogueParticipant.USER: - dialogue.utterances[ - i - ].dialogue_acts = user_nlu.extract_dialogue_acts(utterance) - elif utterance.participant == DialogueParticipant.AGENT: - dialogue.utterances[ - i - ].dialogue_acts = agent_nlu.extract_dialogue_acts(utterance) - else: - raise ValueError(f"Unknown participant: {utterance.participant}") - return dialogue - - -def annotate_dialogues( - dialogues: List[Dialogue], user_nlu: NLU, agent_nlu: NLU -) -> List[Dialogue]: - """Annotates dialogues with dialogue acts. - - Args: - dialogues: Dialogues. - user_nlu: User NLU module. - agent_nlu: Agent NLU module. - - Returns: - Annotated dialogues. 
- """ - # TODO: Move this to DialogueKit - # See: https://github.com/iai-group/UserSimCRS/issues/219 - return [ - annotate_dialogue(dialogue, user_nlu, agent_nlu) - for dialogue in tqdm(dialogues) - ] - - -def _get_recommendation_rounds( - dialogue: Dialogue, recommendation_intents: List[Intent] -) -> List[List[AnnotatedUtterance]]: - """Gets utterances per recommendation round. - - Args: - dialogue: Dialogue. - recommendation_intents: Intents corresponding to recommendation. - - Returns: - Utterances per recommendation round. - """ - rounds = [] - current_round: List[AnnotatedUtterance] = [] - for utterance in dialogue.utterances: - if any( - intent in utterance.get_intents() - for intent in recommendation_intents - ): - if current_round: - rounds.append(current_round) - current_round = [utterance] - else: - current_round.append(utterance) - return rounds - - -def _is_recommendation_accepted( - round: List[AnnotatedUtterance], - acceptance_intents: List[Intent], - rejection_intents: List[Intent], -) -> bool: - """Assesses whether the recommendation was accepted. - - Args: - round: Utterances in recommendation round. - acceptance_intents: Intents corresponding to acceptance. - rejection_intents: Intents corresponding to rejection. - - Returns: - True if the recommendation was accepted, False otherwise. - """ - b_accepted = False - for utterance in round: - if utterance.participant == DialogueParticipant.USER: - intents = utterance.get_intents() - if any(intent in acceptance_intents for intent in intents): - b_accepted = True - elif any(intent in rejection_intents for intent in intents): - return False - return b_accepted - - -def assess_dialogue( - dialogue: Dialogue, - recommendation_intents: List[Intent], - acceptance_intents: List[Intent], - rejection_intents: List[Intent], -) -> Tuple[int, int, int]: - """Assesses the utility of the dialogue. - - Args: - dialogue: Dialogue. - recommendation_intents: Intents corresponding to recommendation. 
- acceptance_intents: Intents corresponding to acceptance. - rejection_intents: Intents corresponding to rejection. - - Returns: - Tuple of number of accepted recommendations, successful recommendation - rounds and total recommendation rounds. - """ - # TODO: Optimize overall assessment to avoid multiple iterations over - # utterances. - rounds = _get_recommendation_rounds(dialogue, recommendation_intents) - successful_rounds = 0 - for round in rounds: - if _is_recommendation_accepted( - round, acceptance_intents, rejection_intents - ): - successful_rounds += 1 - - nb_accepted_recommendations = sum( - 1 - for utterance in dialogue.utterances - if utterance.participant == DialogueParticipant.USER - and any( - intent in acceptance_intents for intent in utterance.get_intents() - ) - ) - return nb_accepted_recommendations, successful_rounds, len(rounds) - - -def get_summary(dialogues: List[Dialogue]) -> None: - """Displays a summary of the utility evaluation. - - Args: - dialogues: Dialogues. 
- """ - summary: Dict[str, Dict[str, float]] = defaultdict( - lambda: { - "total_dialogues": 0, - "success_rate": 0, - "srrr": 0, - "rdl": 0, - } - ) - for dialogue in dialogues: - summary[dialogue.agent_id]["total_dialogues"] += 1 - summary[dialogue.agent_id]["success_rate"] += dialogue.metadata[ - "utility" - ]["success"] - summary[dialogue.agent_id]["srrr"] += dialogue.metadata["utility"][ - "successful_recommendation_round_ratio" - ] - summary[dialogue.agent_id]["rdl"] += dialogue.metadata["utility"][ - "reward_per_dialogue_length" - ] - - for agent_id, stats in summary.items(): - total = stats["total_dialogues"] - print(f"Agent: {agent_id}") - print(f"\tTotal Dialogues: {total}") - print(f"\tSuccess Rate: {stats['success_rate'] / total:.4f}") - print( - "\tSuccessful Recommendation Round Ratio: " - f"{stats['srrr'] / total:.4f}" - ) - print(f"\tReward-per-Dialogue-Length: {stats['rdl'] / total:.4f}") - print() - - -def parse_args() -> argparse.Namespace: - """Parses command-line arguments. - - Returns: - Parsed command-line arguments. 
- """ - parser = argparse.ArgumentParser(prog="utility_evaluation.py") - parser.add_argument( - "annotated_dialogues", - type=str, - help="Annotated dialogues JSON file.", - ) - parser.add_argument( - "user_nlu_config", - type=str, - help="User NLU configuration file.", - ) - parser.add_argument( - "agent_nlu_config", - type=str, - help="Agent NLU configuration file.", - ) - parser.add_argument( - "--reject_intent_labels", - nargs="+", - default=["REJ"], - help="Intent labels corresponding to rejection.", - ) - parser.add_argument( - "--accept_intent_labels", - nargs="+", - default=["ACC"], - help="Intent labels corresponding to acceptance.", - ) - parser.add_argument( - "--recommendation_intent_labels", - nargs="+", - default=["REC-S", "REC-E"], - help="Intent labels corresponding to recommendation.", - ) - parser.add_argument( - "--output", - type=str, - help="Output file to save annotated dialogues with utility metrics.", - ) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - - dialogues = json_to_dialogues(args.annotated_dialogues) - - rejection_intents = [Intent(label) for label in args.reject_intent_labels] - acceptance_intents = [Intent(label) for label in args.accept_intent_labels] - recommendation_intents = [ - Intent(label) for label in args.recommendation_intent_labels - ] - - # NLU module for user utterances - user_nlu_config = Configuration("User NLU Configuration") - user_nlu_config.set_file(args.user_nlu_config) - user_nlu = get_NLU(user_nlu_config) - - # NLU module for agent utterances - agent_nlu_config = Configuration("Agent NLU Configuration") - agent_nlu_config.set_file(args.agent_nlu_config) - agent_nlu = get_NLU(agent_nlu_config) - - dialogues = annotate_dialogues(dialogues, user_nlu, agent_nlu) - for dialogue in dialogues: - ( - nb_accepted_recommendations, - successful_rounds, - total_rounds, - ) = assess_dialogue( - dialogue, - recommendation_intents, - acceptance_intents, - rejection_intents, - ) - 
dialogue.metadata["utility"] = { - "success": int(successful_rounds > 0), - "successful_recommendation_round_ratio": ( - successful_rounds / total_rounds if total_rounds > 0 else 0.0 - ), - "reward_per_dialogue_length": ( - nb_accepted_recommendations / len(dialogue.utterances) - ), - } - - if args.output: - with open(args.output, "w") as f: - json.dump( - [dialogue.to_dict() for dialogue in dialogues], f, indent=2 - ) - - get_summary(dialogues) diff --git a/usersimcrs/evaluation/base_metric.py b/usersimcrs/evaluation/base_metric.py new file mode 100644 index 00000000..c99399a2 --- /dev/null +++ b/usersimcrs/evaluation/base_metric.py @@ -0,0 +1,48 @@ +"""Abstract base class for dialogue evaluation metrics.""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, List +from dialoguekit.core.dialogue import Dialogue + + +class BaseMetric(ABC): + def __init__(self, name: str) -> None: + """Initializes the metric. + + Args: + name: Metric name. + """ + self.name = name + + @abstractmethod + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Computes the metric for a single dialogue. + + Args: + dialogue: Single dialogue to score. + **kwargs: Additional arguments specific to the metric. + + Raises: + NotImplementedError: When not implemented by a subclass. + + Returns: + Score for the dialogue. + """ + raise NotImplementedError() + + def evaluate_dialogues( + self, dialogues: List[Dialogue], **kwargs: Any + ) -> Dict[str, float]: + """Computes the metric for every dialogue in a given list. + + Args: + dialogues: Dialogues. + **kwargs: Additional arguments specific to the metric. + + Returns: + Dictionary with result per dialogue. Keys are conversation IDs. 
+ """ + return { + dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs) + for dialogue in dialogues + } diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py new file mode 100644 index 00000000..58ffb2c9 --- /dev/null +++ b/usersimcrs/evaluation/dialogue_annotation.py @@ -0,0 +1,136 @@ +"""Dialogue annotation and recommendation round utilities. + +Provides functions for annotating dialogues with dialogue acts using NLU +modules, extracting recommendation rounds from annotated dialogues, and +assessing recommendation acceptance. +""" + +from typing import List + +from dialoguekit.core.annotated_utterance import AnnotatedUtterance +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent +from dialoguekit.nlu.nlu import NLU +from dialoguekit.participant.participant import DialogueParticipant + + +def ensure_dialogue_is_annotated(dialogue: Dialogue) -> None: + """Raises error if dialogue utterances are not annotated.""" + for utterance in dialogue.utterances: + if not isinstance(utterance, AnnotatedUtterance): + raise RuntimeError("Dialogue must be annotated.") + + +def annotate_dialogue( + dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU +) -> Dialogue: + """Annotates utterances with dialogue acts. + + Each utterance that is not already an AnnotatedUtterance is converted to + one. Utterances that already carry dialogue acts are left untouched. + + Args: + dialogue: Dialogue to be annotated. + user_nlu: NLU module for user utterances. + agent_nlu: NLU module for agent utterances. + + Raises: + ValueError: If an utterance has an unknown participant. + + Returns: + The same dialogue object with annotated utterances. 
+ """ + for i, utterance in enumerate(dialogue.utterances): + if not isinstance(utterance, AnnotatedUtterance): + dialogue.utterances[i] = AnnotatedUtterance.from_utterance( + utterance + ) + + if len(utterance.dialogue_acts) > 0: + continue + + if utterance.participant == DialogueParticipant.USER: + dialogue.utterances[ + i + ].dialogue_acts = user_nlu.extract_dialogue_acts(utterance) + elif utterance.participant == DialogueParticipant.AGENT: + dialogue.utterances[ + i + ].dialogue_acts = agent_nlu.extract_dialogue_acts(utterance) + else: + raise ValueError(f"Unknown participant: {utterance.participant}") + return dialogue + + +def annotate_dialogues( + dialogues: List[Dialogue], + user_nlu: NLU, + agent_nlu: NLU, +) -> None: + """Annotates dialogues in place using provided NLU modules. + + Args: + dialogues: Dialogues to annotate (modified in place). + user_nlu: NLU module for user utterances. + agent_nlu: NLU module for agent utterances. + """ + for dialogue in dialogues: + annotate_dialogue(dialogue, user_nlu, agent_nlu) + + +def get_recommendation_rounds( + dialogue: Dialogue, recommendation_intents: List[Intent] +) -> List[List[AnnotatedUtterance]]: + """Splits a dialogue into recommendation rounds. + + A new round begins each time an utterance contains a recommendation + intent. + + Args: + dialogue: Annotated dialogue. + recommendation_intents: Intents that signal a recommendation. + + Returns: + List of utterance groups, one per recommendation round. 
+ """ + rounds: List[List[AnnotatedUtterance]] = [] + current_round: List[AnnotatedUtterance] = [] + for utterance in dialogue.utterances: + if any( + intent in utterance.get_intents() + for intent in recommendation_intents + ): + if current_round: + rounds.append(current_round) + current_round = [utterance] + else: + current_round.append(utterance) + if current_round: + rounds.append(current_round) + return rounds + + +def is_recommendation_accepted( + round_utterances: List[AnnotatedUtterance], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], +) -> bool: + """Assesses whether a recommendation round was accepted. + + Args: + round_utterances: Utterances in the recommendation round. + acceptance_intents: Intents corresponding to acceptance. + rejection_intents: Intents corresponding to rejection. + + Returns: + True if the recommendation was accepted, False otherwise. + """ + b_accepted = False + for utterance in round_utterances: + if utterance.participant == DialogueParticipant.USER: + intents = utterance.get_intents() + if any(intent in acceptance_intents for intent in intents): + b_accepted = True + elif any(intent in rejection_intents for intent in intents): + return False + return b_accepted diff --git a/usersimcrs/evaluation/quality_metric.py b/usersimcrs/evaluation/quality_metric.py new file mode 100644 index 00000000..acbfc595 --- /dev/null +++ b/usersimcrs/evaluation/quality_metric.py @@ -0,0 +1,121 @@ +"""LLM-based dialogue quality evaluation. + +The script evaluates dialogue quality with regards to five aspects: +- Recommendation relevance +- Communication style +- Fluency +- Conversational flow +- Overall satisfaction + +Each aspect is scored between 1 and 5, where the scores are described in a +dedicated rubric. The scoring is done using a large language model. 
+""" + +import json +import logging +from typing import Any, Literal + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.participant.participant import DialogueParticipant + +from usersimcrs.evaluation.base_metric import BaseMetric +from usersimcrs.evaluation.quality_rubrics import QualityRubrics +from usersimcrs.llm_interfaces.llm_interface import LLMInterface + + +_PROMPT_EVAL_INTRO = ( + "You are an evaluator and you need to judge how does the " + "ASSISTANT perform based on the following CONVERSATION HISTORY. Please " + "rate the ASSISTANT's performance based on the following GRADING RUBRIC.\n" + "\nCONVERSATION HISTORY:" +) +_PROMPT_EVAL_OUTPUT_FORMAT = ( + 'Your output need be a be in a JSON format as follows:\n{"score": ' + ', "score_explanation": }\nDo not include ' + "additional information.\n" +) + + +class QualityMetric(BaseMetric): + def __init__( + self, + llm_interface: LLMInterface, + name: str = "quality", + ) -> None: + """Initializes the quality metric. + + Args: + llm_interface: LLM interface used for scoring. + name: Metric name. Defaults to "quality". + """ + super().__init__(name) + self.llm_interface = llm_interface + + def _get_prompt( + self, grading_rubric: QualityRubrics, dialogue: Dialogue + ) -> str: + """Prepares prompt given grading rubric and dialogue. + + Args: + grading_rubric: Grading rubric for the aspect. + dialogue: Dialogue. + + Returns: + Prompt comprising task definition, grading rubric, and dialogue. 
+ """ + prompt = _PROMPT_EVAL_INTRO + for utterance in dialogue.utterances: + role = ( + "USER" + if utterance.participant == DialogueParticipant.USER + else "ASSISTANT" + ) + prompt += f"\n{role}: {utterance.text}" + + prompt += f"\n\nGRADING RUBRIC:\n{grading_rubric.value}\n" + prompt += _PROMPT_EVAL_OUTPUT_FORMAT + return prompt + + def evaluate_dialogue( + self, + dialogue: Dialogue, + aspect: Literal[ + "REC_RELEVANCE", + "COM_STYLE", + "FLUENCY", + "CONV_FLOW", + "OVERALL_SAT", + ], + **kwargs: Any, + ) -> float: + """Returns score for a single aspect of a dialogue. + + Args: + dialogue: Dialogue to evaluate. + aspect: Aspect to evaluate. One of QualityRubrics enum names. + + Raises: + KeyError: When the aspect does not exist in QualityRubrics. + + Returns: + Score (1-5) for the specified aspect. + """ + try: + aspect_enum = QualityRubrics[aspect] + except KeyError: + supported = [e.name for e in QualityRubrics] + raise KeyError( + f"Unknown aspect '{aspect}'. Supported aspects: {supported}" + ) + prompt = self._get_prompt(aspect_enum, dialogue) + response = self.llm_interface.get_llm_api_response(prompt) + try: + response = response.replace("\\", "\\\\") + response_dict = json.loads(response) + return float(response_dict["score"]) + except Exception: + logging.warning( + f"Failed to parse LLM response for {aspect} dialogue " + f"{dialogue.conversation_id}: {response}", + ) + return 0.0 diff --git a/scripts/evaluation/rubrics/quality_rubrics.py b/usersimcrs/evaluation/quality_rubrics.py similarity index 100% rename from scripts/evaluation/rubrics/quality_rubrics.py rename to usersimcrs/evaluation/quality_rubrics.py diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py new file mode 100644 index 00000000..ecabb410 --- /dev/null +++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py @@ -0,0 +1,55 @@ +"""Reward-per-Dialogue-Length metric implementation. 
+ +Evaluates the ratio of accepted recommendations to total dialogue length. +""" + +from typing import Any, List + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent +from dialoguekit.participant.participant import DialogueParticipant + +from usersimcrs.evaluation.base_metric import BaseMetric +from usersimcrs.evaluation.dialogue_annotation import ( + ensure_dialogue_is_annotated, +) + + +class RewardPerDialogueLengthMetric(BaseMetric): + def __init__( + self, + name: str = "reward_per_dialogue_length", + ) -> None: + """Initializes the reward-per-dialogue-length metric. + + Args: + name: Metric name. Defaults to "reward_per_dialogue_length". + """ + super().__init__(name) + + def evaluate_dialogue( + self, + dialogue: Dialogue, + acceptance_intents: List[Intent], + **kwargs: Any, + ) -> float: + """Computes the reward-per-dialogue-length score. + + Args: + dialogue: Dialogue to evaluate. + acceptance_intents: Acceptance intents. + + Returns: + Ratio of accepted recommendations to total utterances. + """ + ensure_dialogue_is_annotated(dialogue) + nb_accepted = sum( + 1 + for utterance in dialogue.utterances + if utterance.participant == DialogueParticipant.USER + and any( + intent in acceptance_intents + for intent in utterance.get_intents() + ) + ) + return nb_accepted / len(dialogue.utterances) diff --git a/usersimcrs/evaluation/satisfaction_metric.py b/usersimcrs/evaluation/satisfaction_metric.py new file mode 100644 index 00000000..664f91c0 --- /dev/null +++ b/usersimcrs/evaluation/satisfaction_metric.py @@ -0,0 +1,35 @@ +"""Satisfaction metric class implementation. + +Satisfaction assessment based on DialogueKit classifier. 
+""" + +from typing import Any + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.nlu.models.satisfaction_classifier import ( + SatisfactionClassifier, +) + +from usersimcrs.evaluation.base_metric import BaseMetric + + +class SatisfactionMetric(BaseMetric): + def __init__( + self, + classifier: SatisfactionClassifier, + name: str = "satisfaction", + ) -> None: + """Initializes the satisfaction metric. + + Args: + classifier: Satisfaction classifier instance. + name: Metric name. + """ + super().__init__(name) + self.classifier = classifier + + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Computes the satisfaction score for a single dialogue.""" + return float( + self.classifier.classify_last_n_dialogue(dialogue, last_n=None) + ) diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py new file mode 100644 index 00000000..9d3dd3bd --- /dev/null +++ b/usersimcrs/evaluation/success_rate_metric.py @@ -0,0 +1,59 @@ +"""Success Rate metric implementation. + +Evaluates whether at least one recommendation was accepted during a dialogue. +""" + +from typing import Any, List + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent + +from usersimcrs.evaluation.base_metric import BaseMetric +from usersimcrs.evaluation.dialogue_annotation import ( + ensure_dialogue_is_annotated, + get_recommendation_rounds, + is_recommendation_accepted, +) + + +class SuccessRateMetric(BaseMetric): + def __init__( + self, + name: str = "success_rate", + ) -> None: + """Initializes the success rate metric. + + Args: + name: Metric name. + """ + super().__init__(name) + + def evaluate_dialogue( + self, + dialogue: Dialogue, + recommendation_intents: List[Intent], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], + **kwargs: Any, + ) -> float: + """Computes the success rate for a single dialogue. + + Args: + dialogue: Dialogue to evaluate. 
+ recommendation_intents: Intents that indicate recommendation. + acceptance_intents: Intents that indicate acceptance. + rejection_intents: Intents that indicate rejection. + + Returns: + 1.0 if at least one recommendation was accepted, 0.0 otherwise. + """ + ensure_dialogue_is_annotated(dialogue) + rounds = get_recommendation_rounds(dialogue, recommendation_intents) + return float( + any( + is_recommendation_accepted( + r, acceptance_intents, rejection_intents + ) + for r in rounds + ) + ) diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py new file mode 100644 index 00000000..a544696d --- /dev/null +++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py @@ -0,0 +1,62 @@ +"""Successful Recommendation Round Ratio metric implementation. + +Evaluates the ratio of accepted recommendation rounds to total recommendation +rounds in a dialogue. +""" + +from typing import Any, List + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent + +from usersimcrs.evaluation.base_metric import BaseMetric +from usersimcrs.evaluation.dialogue_annotation import ( + ensure_dialogue_is_annotated, + get_recommendation_rounds, + is_recommendation_accepted, +) + + +class SuccessfulRecommendationRoundRatioMetric(BaseMetric): + def __init__( + self, + name: str = "successful_recommendation_round_ratio", + ) -> None: + """Initializes the successful recommendation round ratio metric. + + Args: + name: Metric name. Defaults to + "successful_recommendation_round_ratio". + """ + super().__init__(name) + + def evaluate_dialogue( + self, + dialogue: Dialogue, + recommendation_intents: List[Intent], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], + **kwargs: Any, + ) -> float: + """Computes the successful recommendation round ratio. + + Args: + dialogue: Dialogue to evaluate. 
class SuccessfulRecommendationRoundRatioMetric(BaseMetric):
    def __init__(
        self,
        name: str = "successful_recommendation_round_ratio",
    ) -> None:
        """Initializes the successful recommendation round ratio metric.

        Args:
            name: Metric name. Defaults to
                "successful_recommendation_round_ratio".
        """
        super().__init__(name)

    def evaluate_dialogue(
        self,
        dialogue: Dialogue,
        recommendation_intents: List[Intent],
        acceptance_intents: List[Intent],
        rejection_intents: List[Intent],
        **kwargs: Any,
    ) -> float:
        """Computes the share of recommendation rounds that were accepted.

        Args:
            dialogue: Dialogue to evaluate.
            recommendation_intents: Intents that indicate recommendation.
            acceptance_intents: Intents that indicate acceptance.
            rejection_intents: Intents that indicate rejection.

        Returns:
            Ratio of accepted recommendation rounds to total rounds,
            or 0.0 if there are no recommendation rounds.
        """
        ensure_dialogue_is_annotated(dialogue)
        rounds = get_recommendation_rounds(dialogue, recommendation_intents)
        if not rounds:
            # No recommendations were made, so none could succeed.
            return 0.0
        accepted = 0
        for candidate_round in rounds:
            if is_recommendation_accepted(
                candidate_round, acceptance_intents, rejection_intents
            ):
                accepted += 1
        return accepted / len(rounds)
"successful_recommendation_round_ratio", + "reward_per_dialogue_length", +} +SUPPORTED_METRICS = [ + "quality", + "satisfaction", + "success_rate", + "successful_recommendation_round_ratio", + "reward_per_dialogue_length", +] + + +def parse_args() -> argparse.Namespace: + """Defines accepted arguments and returns the parsed values.""" + parser = argparse.ArgumentParser(prog="run_evaluation.py") + parser.add_argument( + "-c", + "--config-file", + help=( + "Path to configuration file to overwrite default values. " + "Defaults to None." + ), + ) + parser.add_argument("--dialogues", type=str, help="Dialogues JSON file.") + parser.add_argument( + "--metrics", + nargs="+", + choices=SUPPORTED_METRICS, + help="Metrics to compute.", + ) + parser.add_argument( + "--output", + type=str, + help="Path to save evaluation results as JSON.", + ) + parser.add_argument( + "--quality_aspects", + nargs="+", + help="Quality aspects to evaluate.", + ) + parser.add_argument( + "--user_nlu_config", + type=str, + help="User NLU configuration file.", + ) + parser.add_argument( + "--agent_nlu_config", + type=str, + help="Agent NLU configuration file.", + ) + parser.add_argument( + "--reject_intent_labels", + nargs="+", + help="Intent labels corresponding to rejection.", + ) + parser.add_argument( + "--accept_intent_labels", + nargs="+", + help="Intent labels corresponding to acceptance.", + ) + parser.add_argument( + "--recommendation_intent_labels", + nargs="+", + help="Intent labels corresponding to recommendation.", + ) + parser.add_argument( + "-d", + "--debug", + action="store_const", + const=True, + help="Debug mode.", + ) + return parser.parse_args() + + +def load_config(args: argparse.Namespace) -> confuse.Configuration: + """Loads config from default file, custom file, and CLI overrides.""" + config = confuse.Configuration("usersimcrs") + config.set_file(DEFAULT_CONFIG_PATH) + if args.config_file: + config.set_file(args.config_file) + config.set_args(args, dots=True) + return 
def validate_config(config: confuse.Configuration) -> List[str]:
    """Validates evaluation config and returns quality aspects.

    Quality-specific options are only required (and only validated) when the
    "quality" metric is requested; previously the unconditional lookup of
    `quality_aspects` crashed non-quality runs whose config omitted the key.

    Args:
        config: Resolved configuration.

    Returns:
        Quality aspect names, or an empty list when the "quality" metric is
        not requested.

    Raises:
        ValueError: If a required option is missing or an aspect is unknown.
    """
    metrics = config["metrics"].get()

    quality_aspects: List[str] = []
    if "quality" in metrics:
        if "quality_llm_interface" not in config:
            raise ValueError(
                "Quality evaluation requires `quality_llm_interface`."
            )
        quality_aspects = config["quality_aspects"].get()
        supported_aspects = [aspect.name for aspect in QualityRubrics]
        invalid_aspects = [
            aspect
            for aspect in quality_aspects
            if aspect not in supported_aspects
        ]
        if invalid_aspects:
            raise ValueError(
                f"Unknown quality aspect(s): {invalid_aspects}. "
                f"Supported aspects: {supported_aspects}"
            )

    if UTILITY_METRICS.intersection(set(metrics)):
        # Utility metrics re-annotate dialogues and need both NLU configs.
        if not config["user_nlu_config"].get(None):
            raise ValueError(
                "`user_nlu_config` is required for utility metrics."
            )
        if not config["agent_nlu_config"].get(None):
            raise ValueError(
                "`agent_nlu_config` is required for utility metrics."
            )

    return quality_aspects
def get_summary_by_agent(
    dialogues: Sequence[Any], scores: Mapping[str, float]
) -> Dict[str, Dict[str, float]]:
    """Aggregates per-dialogue scores into per-agent summary statistics.

    Args:
        dialogues: Dialogues carrying `agent_id` and `conversation_id`.
        scores: Mapping from conversation id to metric score.

    Returns:
        Mapping from agent id to count/min/max/mean/stdev of its scores.
    """
    per_agent: Dict[str, List[float]] = {}
    for dlg in dialogues:
        per_agent.setdefault(dlg.agent_id, []).append(
            scores[dlg.conversation_id]
        )

    summary: Dict[str, Dict[str, float]] = {}
    for agent_id, values in per_agent.items():
        # A single observation has no spread; report 0.0 instead of raising.
        spread = stdev(values) if len(values) > 1 else 0.0
        summary[agent_id] = {
            "count": len(values),
            "min": min(values),
            "max": max(values),
            "mean": mean(values),
            "stdev": spread,
        }
    return summary
"summary_by_agent": get_summary_by_agent(dialogues, scores), + } + for aspect in quality_aspects + for scores in [ + metric.evaluate_dialogues(dialogues, aspect=aspect) + ] + } + } + + if metric_name in { + "success_rate", + "successful_recommendation_round_ratio", + }: + scores = metric.evaluate_dialogues(dialogues, **utility_intents) + elif metric_name == "reward_per_dialogue_length": + scores = metric.evaluate_dialogues( + dialogues, + acceptance_intents=utility_intents["acceptance_intents"], + ) + else: + scores = metric.evaluate_dialogues(dialogues) + + return { + "per_dialogue": scores, + "summary_by_agent": get_summary_by_agent(dialogues, scores), + } + + +def save_results( + config: confuse.Configuration, results: Dict[str, Any] +) -> None: + """Writes config dump and evaluation results to disk.""" + output_path = config["output"].get() + output_dir = os.path.dirname(output_path) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + + output_stem, _ = os.path.splitext(output_path) + with open(f"{output_stem}.meta.yaml", "w") as f: + f.write(config.dump()) + + with open(output_path, "w") as f: + json.dump(results, f, indent=2) + + +def print_summary(results: Mapping[str, Any]) -> None: + """Prints a concise terminal summary.""" + for metric_name, metric_result in results["metrics"].items(): + print(f"Metric: {metric_name}") + if metric_name == "quality": + for aspect_name, aspect_result in metric_result["aspects"].items(): + print(f" Aspect: {aspect_name}") + for agent_id, stats in aspect_result[ + "summary_by_agent" + ].items(): + print( + f" Agent: {agent_id} | mean={stats['mean']:.3f} " + f"stdev={stats['stdev']:.3f}" + ) + continue + + for agent_id, stats in metric_result["summary_by_agent"].items(): + print( + f" Agent: {agent_id} | mean={stats['mean']:.3f} " + f"stdev={stats['stdev']:.3f}" + ) + + +def main() -> None: + """Runs evaluation based on the resolved configuration.""" + args = parse_args() + config = load_config(args) + + metrics = 
config["metrics"].get() + quality_aspects = validate_config(config) + dialogues = json_to_dialogues(config["dialogues"].get()) + annotate_for_utility(dialogues, config, metrics) + + utility_intents = get_utility_intents(config) + metric_registry = build_metric_registry(config, metrics) + + results: Dict[str, Any] = { + "dialogues_path": config["dialogues"].get(), + "metrics_requested": metrics, + "metrics": {}, + } + + for metric_name in metrics: + results["metrics"][metric_name] = evaluate_metric( + metric_name, + metric_registry[metric_name], + dialogues, + quality_aspects, + utility_intents, + ) + + save_results(config, results) + print_summary(results) + + +if __name__ == "__main__": + main()