Merge pull request #121 from OpenBioLink/input_list_and_str

KonstantinHebenstreit · web-flow · commit 5d40afa69383 · 2023-03-10T17:13:33.000+01:00
Let inputs be strings, not only lists
E.g. Collection("worldtree") is also possible, not only Collection(["worldtree"])
diff --git a/README.md b/README.md
@@ -175,6 +175,8 @@ collection.evaluate()
 ## Versioning
 All updates/changes to datasets are explicitly mentioned in bold.
 
+0.0.5 (2023-03-10) -  Function to select which generated CoTs to keep after loading: collection.select_generated_cots(author="thoughtsource")
+
 0.0.4 (2023-03-08) -  Evaluation function improved. Function to load ThoughtSource100 collection: Collection.load_thoughtsource_100()
 
 0.0.3 (2023-02-24) -  ThoughtSource_100 collection released with reasoning chains from GPT-text-davinci-003, flan-t5-xxl, and cohere's command-xl
diff --git a/libs/cot/cot/config.py b/libs/cot/cot/config.py
@@ -75,7 +75,6 @@ def __post_init__(self):
         # replace all keys (or non given keys) in config with the corresponding values
 
         # Inserts None at index 0 of instruction_keys to query without an explicit instruction
-        # TODO rethink this, maybe add option to disable this
         if self.instruction_keys == "all":
             self.instruction_keys = [None] + list(FRAGMENTS["instructions"].keys())
         elif not self.instruction_keys:
@@ -91,6 +90,25 @@ def __post_init__(self):
         elif not self.answer_extraction_keys:
             self.answer_extraction_keys = [None]
 
+        # turn strings into lists for all trigger keys
+        if isinstance(self.instruction_keys, str):
+            self.instruction_keys = [self.instruction_keys]
+        if isinstance(self.cot_trigger_keys, str):
+            self.cot_trigger_keys = [self.cot_trigger_keys]
+        if isinstance(self.answer_extraction_keys, str):
+            self.answer_extraction_keys = [self.answer_extraction_keys]
+
+        # check if all keys are valid
+        for key in self.instruction_keys:
+            if key is not None and key not in FRAGMENTS["instructions"]:
+                raise ValueError(f"Given instruction key '{key}' is not in fragments.json.")
+        for key in self.cot_trigger_keys:
+            if key is not None and key not in FRAGMENTS["cot_triggers"]:
+                raise ValueError(f"Given cot_trigger key '{key}' is not in fragments.json.")
+        for key in self.answer_extraction_keys:
+            if key is not None and key not in FRAGMENTS["answer_extractions"]:
+                raise ValueError(f"Given answer_extraction key '{key}' is not in fragments.json.")
+
         # check if the templates contain only allowed keys
         import re
 
@@ -115,15 +133,12 @@ def __post_init__(self):
             assert self.idx_range[0] < self.idx_range[1], "idx_range must be a tuple of ints with idx_range[0] < idx_range[1]"
 
         if self.instruction_keys != "all":
-            assert isinstance(self.instruction_keys, list), "instruction_keys must be a list"
             assert all(isinstance(key, (str, type(None))) for key in self.instruction_keys), "instruction_keys must be a list of strings"
 
         if self.cot_trigger_keys != "all":
-            assert isinstance(self.cot_trigger_keys, list), "cot_trigger_keys must be a list"
             assert all(isinstance(key, (str, type(None))) for key in self.cot_trigger_keys), "cot_trigger_keys must be a list of strings"
 
         if self.answer_extraction_keys != "all":
-            assert isinstance(self.answer_extraction_keys, list), "answer_extraction_keys must be a list"
             assert all(
                 isinstance(key, (str, type(None))) for key in self.answer_extraction_keys
             ), "answer_extraction_keys must be a list of strings"
diff --git a/libs/cot/cot/dataloader.py b/libs/cot/cot/dataloader.py
@@ -61,6 +61,19 @@ def __init__(self, names=None, verbose=True, generate_mode=None, source=False, l
                 "load_pregenerated_cots only works if datasets are loaded in ThoughSource view. \
                 Param source needs to be False for pregenerated CoTs to be loaded."
             )
+        
+        # if dataset name is a string, convert to list
+        if isinstance(names, str) and names != "all":
+            names = [names]
+        # test if dataset name is valid
+        if names is not None and names != "all":
+            for name in names:
+                available_datasets = Collection._all_available_datasets()
+                if name not in available_datasets:
+                    raise ValueError(
+                        f"""Dataset '{name}' not found. Please check spelling.
+                        Available datasets: {available_datasets}"""
+                        )
 
         if generate_mode in ["redownload", "recache"]:
             # see https://huggingface.co/docs/datasets/v2.1.0/en/package_reference/builder_classes#datasets.DownloadMode
@@ -90,7 +103,7 @@ def __init__(self, names=None, verbose=True, generate_mode=None, source=False, l
             self.load_datasets(names)
 
         # unfortunately all generated cots have to be loaded when loading datasets in ThoughtSource view
-        # here: all or None, or select specific generated cots with select_generated_cots
+        # here: all or None, selection of specific generated cots can be done later with select_generated_cots
         if not load_pregenerated_cots and not source:
             self.delete_all_generated_cots()
 
@@ -151,6 +164,10 @@ def _find_datasets(names=None):
         else:
             dataloader_scripts = [(name, path_to_biodatasets / name / (name + ".py")) for name in names]
         return dataloader_scripts
+    
+    @staticmethod
+    def _all_available_datasets():
+        return [name for name, _ in Collection._find_datasets()]
 
     def _get_metadata(self):
         for name, script_path in Collection._find_datasets():
diff --git a/libs/cot/cot/generate.py b/libs/cot/cot/generate.py
@@ -208,11 +208,14 @@ def _generate_and_extract(
                     item["generated_cot"].append(generated_cot)
 
         except Exception as ex:
+            # if last try, raise error
+            if i == number_of_tries - 1:
+                raise ex
+
+            # if not last try, add additional time to api_time_interval and try again
             additional_api_time += 10
-            print("API-Error in item " + str(idx) + ": " + str(ex))
+            print("(API-)Error in item " + str(idx) + ": " + str(ex))
             print("Retrying with additional time of " + str(additional_api_time) + " seconds.")
-            # if you want the error to be raised, uncomment the following line:
-            # raise ex
             pass
             
         else:
@@ -406,6 +409,7 @@ def __getitem__(self, key):
 
 def query_model(input, api_service, engine, temperature, max_tokens, api_time_interval):
     if api_service == "mock_api":
+        # time.sleep(api_time_interval)
         return " Test mock chain of thought."
         # return ("This is a " + 20 * "long " + "Mock CoT.\n")*20