From 0cb9a1b4cebeb03f1d4f7be9244d0b62ea937b24 Mon Sep 17 00:00:00 2001 From: Axel Peytavin Date: Sun, 7 Apr 2024 17:41:34 -0700 Subject: [PATCH 1/4] Refactor: Separate signatures and modules, allow saving outputs and inputs --- .../intern/generators/code_generator.py | 68 +++++++++++++++++++ .../intern/generators/diff_generator.py | 62 ----------------- src/agents/intern/generators/signatures.py | 22 ++++++ src/agents/intern/processors.py | 10 +-- 4 files changed, 95 insertions(+), 67 deletions(-) create mode 100644 src/agents/intern/generators/code_generator.py delete mode 100644 src/agents/intern/generators/diff_generator.py create mode 100644 src/agents/intern/generators/signatures.py diff --git a/src/agents/intern/generators/code_generator.py b/src/agents/intern/generators/code_generator.py new file mode 100644 index 0000000..5e291a5 --- /dev/null +++ b/src/agents/intern/generators/code_generator.py @@ -0,0 +1,68 @@ +from typing import Dict, List +import dspy +import json + +from src.agents.intern.generators.signatures import ( + NewFilesGeneratorSignature, + RelevantFileSelectionSignature, +) +from src.models import Codebase, Ticket + + +SHOULD_RECORD_INPUT_OUTPUT = True + + +class CodeGenerator(dspy.Module): + def __init__(self): + super().__init__() + + self.relevant_file_selector = dspy.TypedChainOfThought( + RelevantFileSelectionSignature + ) + self.new_files_generator = dspy.TypedChainOfThought(NewFilesGeneratorSignature) + + def record_input_output(self, inputs: Dict, outputs: Dict): + with open("tests/data/code_generator.json", "w") as f: + json.dump( + { + "inputs": inputs, + "outputs": outputs, + }, + f, + ) + + def forward(self, codebase: Codebase, ticket: Ticket): + relevant_files = self.relevant_file_selector( + files_in_codebase=json.dumps(list(codebase.files.keys())), + ticket=json.dumps(ticket.model_dump()), + ) + + subset_codebase = { + file: codebase.files[file] for file in relevant_files.relevant_files + } + + relevant_codebase = 
Codebase(files=subset_codebase) + + new_files = self.new_files_generator( + relevant_codebase=json.dumps(relevant_codebase.model_dump()), + ticket=json.dumps(ticket.model_dump()), + ) + + if SHOULD_RECORD_INPUT_OUTPUT: + self.record_input_output( + { + "codebase": codebase.model_dump(), + "ticket": ticket.model_dump(), + }, + { + "relevant_files": relevant_files.relevant_files, + "new_files": new_files.new_files, + "explanations": new_files.explanations, + }, + ) + + return { + "relevant_files": relevant_files.relevant_files, + "new_files": new_files.new_files, + "explanations": new_files.explanations, + } diff --git a/src/agents/intern/generators/diff_generator.py b/src/agents/intern/generators/diff_generator.py deleted file mode 100644 index a51efce..0000000 --- a/src/agents/intern/generators/diff_generator.py +++ /dev/null @@ -1,62 +0,0 @@ -from typing import Dict, List -import dspy -import json - -from src.models import Codebase, Ticket - - -class RelevantFileSelectionSignature(dspy.Signature): - files_in_codebase = dspy.InputField() - ticket = dspy.InputField() - relevant_files: List[str] = dspy.OutputField( - desc="Give the relevant files for you to observe to complete the ticket. They must be keys of the codebase dict." - ) - - -# Define the agent -class DiffGeneratorSignature(dspy.Signature): - relevant_codebase = dspy.InputField() - ticket = dspy.InputField() - git_diff = dspy.OutputField(desc="Give ONLY the git diff") - explanations = dspy.OutputField(desc="Give explanations for the diff generated") - - -class NewFilesGeneratorSignature(dspy.Signature): - relevant_codebase = dspy.InputField() - ticket = dspy.InputField() - new_files: Dict[str, str] = dspy.OutputField( - desc="Generate the entire files that need to be update or created complete the ticket, with all of their content post update. The key is the path of the file and the value is the content of the file." 
- ) - explanations = dspy.OutputField( - desc="Give explanations for the new files generated. Use Markdown to format the text." - ) - - -class DiffGenerator(dspy.Module): - def __init__(self): - super().__init__() - - self.diff_generator = dspy.ChainOfThought(DiffGeneratorSignature) - self.relevant_file_selector = dspy.TypedChainOfThought( - RelevantFileSelectionSignature - ) - self.new_files_generator = dspy.TypedChainOfThought(NewFilesGeneratorSignature) - - def forward(self, codebase: Codebase, ticket: Ticket): - relevant_files = self.relevant_file_selector( - files_in_codebase=json.dumps(list(codebase.files.keys())), - ticket=json.dumps(ticket.model_dump()), - ) - - subset_codebase = { - file: codebase.files[file] for file in relevant_files.relevant_files - } - - relevant_codebase = Codebase(files=subset_codebase) - - new_files = self.new_files_generator( - relevant_codebase=json.dumps(relevant_codebase.model_dump()), - ticket=json.dumps(ticket.model_dump()), - ) - - return new_files.new_files, new_files.explanations diff --git a/src/agents/intern/generators/signatures.py b/src/agents/intern/generators/signatures.py new file mode 100644 index 0000000..deaab02 --- /dev/null +++ b/src/agents/intern/generators/signatures.py @@ -0,0 +1,22 @@ +import dspy +from typing import Dict, List + + +class RelevantFileSelectionSignature(dspy.Signature): + files_in_codebase = dspy.InputField() + ticket = dspy.InputField() + relevant_files: List[str] = dspy.OutputField( + desc="Give the relevant files for you to observe to complete the ticket. They must be keys of the files_in_codebase dict." + ) + + +# Define the agent +class NewFilesGeneratorSignature(dspy.Signature): + relevant_codebase = dspy.InputField() + ticket = dspy.InputField() + new_files: Dict[str, str] = dspy.OutputField( + desc="Generate the entire files that need to be update or created complete the ticket, with all of their content post update. 
The key is the path of the file and the value is the content of the file." + ) + explanations = dspy.OutputField( + desc="Give explanations for the new files generated. Use Markdown to format the text." + ) diff --git a/src/agents/intern/processors.py b/src/agents/intern/processors.py index 0e2334a..b1d75e8 100644 --- a/src/agents/intern/processors.py +++ b/src/agents/intern/processors.py @@ -1,5 +1,5 @@ -from src.agents.intern.generators.diff_generator import DiffGenerator -from src.language_models import gpt4, mistral +from src.agents.intern.generators.code_generator import CodeGenerator +from src.language_models import gpt4 import dspy from src.models import Codebase, Ticket @@ -22,8 +22,8 @@ def generate_code_change(ticket: Ticket, code_base: Codebase): # and will return a new code_change dspy.configure(lm=gpt4) - diff_generator = DiffGenerator() + diff_generator = CodeGenerator() - new_files, explanations = diff_generator(code_base, ticket) + res = diff_generator(code_base, ticket) - return new_files, explanations + return res["new_files"], res["explanations"] From 623cb2ccfc8d9fcabf738628ed2e7acabcc8db9a Mon Sep 17 00:00:00 2001 From: Axel Peytavin Date: Sun, 7 Apr 2024 17:41:51 -0700 Subject: [PATCH 2/4] Test file to test code generator individually --- tests/code_generator.test.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 tests/code_generator.test.py diff --git a/tests/code_generator.test.py b/tests/code_generator.test.py new file mode 100644 index 0000000..868dfe0 --- /dev/null +++ b/tests/code_generator.test.py @@ -0,0 +1,30 @@ +# Import path above +import sys +import os + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +from src.agents.intern.generators.code_generator import CodeGenerator +from src.models import Codebase, Ticket +from src.language_models import gpt4 +import dspy +import json +from dotenv import load_dotenv + + +if __name__ == "__main__": + load_dotenv() + + 
dspy.configure(lm=gpt4) + + code_generator = CodeGenerator() + + # Import the ticket and codebase from a json file in data/code_generator.json + with open("tests/data/code_generator.json", "r") as f: + data = json.load(f) + + codebase = Codebase(**data["inputs"]["codebase"]) + ticket = Ticket(**data["inputs"]["ticket"]) + + res = code_generator(codebase, ticket) + print(res) From 1422c24a32ead9ee8701dfcf559170525bee6f7c Mon Sep 17 00:00:00 2001 From: Axel Peytavin Date: Sun, 7 Apr 2024 17:42:13 -0700 Subject: [PATCH 3/4] In the case the PR is being handled somewhere, we need to skip it --- src/agents/reviewer/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/agents/reviewer/__init__.py b/src/agents/reviewer/__init__.py index 0449e6d..84d9570 100644 --- a/src/agents/reviewer/__init__.py +++ b/src/agents/reviewer/__init__.py @@ -75,6 +75,10 @@ def process_pr(self): # Get first open PR from GH that hasn't been approved yet pr = self.pr_backlog.pop(0) + if pr.ticket_id is None: + # This PR is not associated with a Trello ticket + return + # Fetch the Trello ticket that corresponds to this PR ticket = self.board_helper.get_ticket(ticket_id=pr.ticket_id) From 7a720c07c7848205f42e5602908e9c0fa55be8d4 Mon Sep 17 00:00:00 2001 From: Axel Peytavin Date: Sun, 7 Apr 2024 17:42:33 -0700 Subject: [PATCH 4/4] Fix: We were not seeing all files in github --- src/helpers/github.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/helpers/github.py b/src/helpers/github.py index d98a640..4bce960 100644 --- a/src/helpers/github.py +++ b/src/helpers/github.py @@ -151,19 +151,26 @@ def push_changes( ) def get_entire_codebase(self) -> Codebase: + codebase_dict = {} contents = self.repo.get_contents("") if not isinstance(contents, list): contents = [contents] - codebase_dict = {} - for file in contents: - try: - codebase_dict[file.path] = file.decoded_content.decode("utf-8") - except Exception as e: - pass + def 
process_contents(contents, path=""): + for item in contents: + if item.type == "dir": + dir_contents = self.repo.get_contents(item.path) + process_contents(dir_contents, path + item.name + "/") + elif item.type == "file": + try: + codebase_dict[path + item.name] = item.decoded_content.decode( + "utf-8" + ) + except Exception as e: + pass + process_contents(contents) codebase = Codebase(files=codebase_dict) - return codebase def get_file_content(self, file):