fix dataset generation

2026-01-10 06:07:58 -05:00 · 2024-02-05 21:05:28 -05:00
parent cc2c21cab5
commit 3bf674ae29
3 changed files with 71 additions and 39 deletions
--- a/data/generate_home_assistant_data.py
+++ b/data/generate_home_assistant_data.py
@@ -274,7 +274,9 @@ def get_included_vars(response: str):

 pile_of_responses["contains_vars"] = pile_of_responses["response"].apply(get_included_vars)

-def get_random_response(*, service: str, language: str, persona: str, required_vars: list[str], short: bool) -> str:
+def get_random_response(*, service: str, language: str, persona: str, question_template: str, short: bool) -> str:
+
+    required_vars = list(set([var for var in var_pattern.findall(question_template) if "device_name" not in var]))
    
    possible_results = pile_of_responses.loc[(pile_of_responses['service']==service) & 
                          (pile_of_responses['language']==language) & 
@@ -284,7 +286,7 @@ def get_random_response(*, service: str, language: str, persona: str, required_v
                        ]
    
    if len(possible_results) == 0:
-        raise Exception(f"No responses matched the provided filters: {service}, {language}, {persona}, {required_vars}, {short}")
+        raise Exception(f"No responses matched the provided filters: {service}, {language}, {persona}, {question_template}, {short}")
    
    return possible_results.sample()["response"].values[0]

@@ -355,10 +357,10 @@ def random_device_list(max_devices: int, avoid_device_names: list[str]):

    return device_lines, list(device_types), list(extra_exposed_attributes)

-def generate_static_example(action: dict, max_devices: int = 32):
+def generate_static_example(action: dict, language: str, persona: str, max_devices: int = 32):
    question = action["english_phrase"]
-    device_type = service_name.split(".")[0]
    service_name = action["service_name"]
+    device_type = service_name.split(".")[0]
    target_device = f"{device_type}.{action['device_name']}"
    friendly_name = target_device.split(".")[1].replace("_", " ").title()

@@ -384,10 +386,12 @@ def generate_static_example(action: dict, max_devices: int = 32):
        service=action["service_name"],
        language="en",
        persona="assistant",
-        required_vars=[],
+        question_template="",
        short=False
    ).lower()

+    response = response.replace("<device_name>", friendly_name)
+
    return {
        "states": device_list,
        "available_services": list(available_services),
@@ -396,7 +400,7 @@ def generate_static_example(action: dict, max_devices: int = 32):
        "service_calls": [ { "service": service_name, "target_device": target_device } ]
    }

-def generate_templated_example(template: dict, max_devices: int = 32):
+def generate_templated_example(template: dict, language: str, persona: str, max_devices: int = 32):
    template_device_types: list[str] = template["device_type"].split("|")
    service_names: list[str] = [ f"{x}.{y}" for x, y in zip(template_device_types, template["service"].split("|")) ]
    question_template: str = template["english_phrase"]
@@ -423,6 +427,8 @@ def generate_templated_example(template: dict, max_devices: int = 32):
            extra_exposed_attributes.append("temperature")
        if "<humidity>" in question_template and "humidity" not in extra_exposed_attributes:
            extra_exposed_attributes.append("humidity")
+        if "<fan_mode>" in question_template and "fan_mode" not in extra_exposed_attributes:
+            extra_exposed_attributes.append("fan_mode")

        state = SUPPORTED_DEVICES[device_dict["type"]].get_random_state(extra_exposed_attributes=extra_exposed_attributes)
        device_name = device_dict["device_name"]
@@ -442,19 +448,32 @@ def generate_templated_example(template: dict, max_devices: int = 32):
    # pick an appropriate response and generate the question
    if len(template_device_types) == 1:
        # TODO: pick correct response here (also probably need to pass in language and persona)
-        answer_template: str = get_random_response(
-            service=service_name
+        answer_template = get_random_response(
+            service=service_names[0],
+            language=language,
+            persona=persona,
+            question_template=question_template,
+            short=False
        )

        question = question_template.replace("<device_name>", chosen_devices[0]["description"])
        answer = answer_template.replace("<device_name>", chosen_devices[0]["description"])
-    else:
-        # TODO: pick correct resonse here (also probaly need to pass in language and persona)
+    else:        
        question = question_template
-        answer = answer_template
+        answers = []
        for i in range(len(template_device_types)):
            question = question.replace(f"<device_name{(i + 1)}>", chosen_devices[i]["description"])
-            answer = answer.replace(f"<device_name{(i + 1)}>", chosen_devices[i]["description"])
+            answer = get_random_response(
+                service=service_names[i],
+                language=language,
+                persona=persona,
+                question_template=question_template,
+                short=True
+            )
+            answers.append(answer.replace(f"<device_name>", chosen_devices[i]["description"]))
+
+        # TODO: support different "and" words per language
+        answer = " and ".join(answers)

    # generate the list of service calls and answers
    service_calls = []
@@ -462,6 +481,18 @@ def generate_templated_example(template: dict, max_devices: int = 32):
        service_calls.append({ "service": service, "target_device": device_dict["device_name"] })

    if any(["climate" in service for service in service_names ]):
+        if "<hvac_mode>" in question:
+            hvac_mode = random.choice(["heat", "cool", "heat_cool", "off", "auto", "fan_only"])
+            question = question.replace("<hvac_mode>", hvac_mode)
+            answer = answer.replace("<hvac_mode>", hvac_mode)
+            service_calls = [ { **call, "hvac_mode": hvac_mode} for call in service_calls ]
+
+        if "<fan_mode>" in question:
+            fan_mode = random.choice(["On Low", "On High", "Auto Low", "Auto High", "Off"])
+            question = question.replace("<fan_mode>", fan_mode)
+            answer = answer.replace("<fan_mode>", fan_mode)
+            service_calls = [ { **call, "fan_mode": fan_mode} for call in service_calls ]
+
        if "<temp_f>" in question:
            temp_f = random.randint(60, 80)
            question = question.replace("<temp_f>", str(temp_f))
@@ -505,7 +536,7 @@ def generate_templated_example(template: dict, max_devices: int = 32):
        "service_calls": service_calls
    }

-def generate_status_request(template: dict, max_devices: int = 32):
+def generate_status_request(template: dict, language: str, persona: str, max_devices: int = 32):
    device_type: str = template["device_type"]
    state_name: str = template["state"]
    question_template: str = template["english_phrase"]
@@ -630,7 +661,7 @@ def format_example_sharegpt(example):
    return { "conversations": conversation }


-def generate_example_file(filename: str, seed: int, format_func: Callable, *, static_factor: int, template_factor: int, status_request_factor: int):
+def generate_example_file(filename: str, seed: int, format_func: Callable, language: str, persona: str, *, static_factor: int, template_factor: int, status_request_factor: int):
    random.seed(seed)
    np.random.seed(seed)

@@ -639,10 +670,10 @@ def generate_example_file(filename: str, seed: int, format_func: Callable, *, st
    def run_factor_times(func, examples, data, factor):
        if factor >= 1:
            for i in range(factor):
-                examples.append(format_func(func(data)))
+                examples.append(format_func(func(data, language, persona)))
        else:
            if random.random() < factor:
-                examples.append(format_func(func(data)))
+                examples.append(format_func(func(data, language, persona)))
    
    generated_examples = []
    for action in tqdm(pile_of_specific_actions):
@@ -731,6 +762,9 @@ def main():

    args = parser.parse_args()

+    language = "en"
+    persona = "assistant"
+
    if not args.sample and not args.train and not args.test and not args.merge:
        parser.print_usage()
    
@@ -740,20 +774,20 @@ def main():
        format_func = format_example_sharegpt

    if args.sample:
-        generate_example_file("sample", 42, format_func, static_factor=1, template_factor=1, status_request_factor=1)
+        generate_example_file("sample", 42, format_func, language, persona, static_factor=1, template_factor=1, status_request_factor=1)
    if args.train:
        if args.size == "small":
-            generate_example_file("home_assistant_train", 42, format_func, static_factor=1, template_factor=10, status_request_factor=8)
+            generate_example_file("home_assistant_train", 42, format_func, language, persona, static_factor=1, template_factor=10, status_request_factor=8)
        elif args.size == "medium":
-            generate_example_file("home_assistant_train", 42, format_func, static_factor=5, template_factor=15, status_request_factor=12)
+            generate_example_file("home_assistant_train", 42, format_func, language, persona, static_factor=5, template_factor=15, status_request_factor=12)
        elif args.size == "large":
-            generate_example_file("home_assistant_train", 42, format_func, static_factor=5, template_factor=20, status_request_factor=15)
+            generate_example_file("home_assistant_train", 42, format_func, language, persona, static_factor=5, template_factor=20, status_request_factor=15)
        elif args.size == "xl":
-            generate_example_file("home_assistant_train", 42, format_func, static_factor=7, template_factor=25, status_request_factor=18)
+            generate_example_file("home_assistant_train", 42, format_func, language, persona, static_factor=7, template_factor=25, status_request_factor=18)
        else:
            raise Exception(f"Unrecognized dataset size: {args.size}")
    if args.test:
-        generate_example_file("home_assistant_test", 12345, format_func, static_factor=0.25, template_factor=3, status_request_factor=2)
+        generate_example_file("home_assistant_test", 12345, format_func, language, persona, static_factor=0.25, template_factor=3, status_request_factor=2)

    if args.merge == "alpaca":
        merge_with_dataset("yahma/alpaca-cleaned", 42, "alpaca", format_alpaca, ["input", "output", "instruction"], format_func)
--- a/data/piles/pile_of_responses.csv
+++ b/data/piles/pile_of_responses.csv
@@ -29,12 +29,12 @@ blinds.toggle,Toggling <device_name>,en,assistant,0
 climate.set_humidity,Increasing humidity to <humidity>.,en,assistant,0
 climate.set_humidity,Setting humidity to <humidity> percent.,en,assistant,0
 climate.set_humidity,Adjusting humidity to <humidity>%.,en,assistant,0
-climate.set_fan_mode,Setting the fan to high speed.,en,assistant,0
-climate.set_fan_mode,Putting the fan on low.,en,assistant,0
-climate.set_fan_mode,Changing the fan to medium setting.,en,assistant,0
-climate.set_hvac_mode,Switching to cooling mode.,en,assistant,0
-climate.set_hvac_mode,Setting the HVAC to heat.,en,assistant,0
-climate.set_hvac_mode,Changing HVAC to automatic mode.,en,assistant,0
+climate.set_fan_mode,Setting the fan to <fan_mode> speed.,en,assistant,0
+climate.set_fan_mode,Putting the fan on <fan_mode>.,en,assistant,0
+climate.set_fan_mode,Changing the fan to <fan_mode> setting.,en,assistant,0
+climate.set_hvac_mode,Switching to <hvac_mode> mode.,en,assistant,0
+climate.set_hvac_mode,Setting the HVAC to <hvac_mode>.,en,assistant,0
+climate.set_hvac_mode,Changing HVAC to <hvac_mode> mode.,en,assistant,0
 climate.set_temperature,Setting temperature to <temp_f> degrees.,en,assistant,0
 climate.set_temperature,Changing temperature to <temp_c> Celsius.,en,assistant,0
 climate.set_temperature,Setting the room to <temp_f> degrees Fahrenheit.,en,assistant,0
--- a/data/piles/pile_of_templated_actions.csv
+++ b/data/piles/pile_of_templated_actions.csv
@@ -179,19 +179,17 @@ climate,set_temperature,"Set the temperature to <temp_f> degrees.",8
 climate,set_temperature,"Can you change the temperature to <temp_c> Celsius?",8
 climate,set_temperature,"I'd like the room at <temp_f> degrees Fahrenheit",8
 climate,set_temperature,"Please adjust the temperature to <temp_f> degrees.",8
-climate,set_temperature,"I want the room cooler,8
-climate,set_temperature,"Make it warmer,8
 climate,set_temperature,"Can you lower the temperature to <temp_c>?",8
-climate,set_temperature,"Raise the temperature to <temp_f> degrees,8
+climate,set_temperature,"Raise the temperature to <temp_f> degrees",8
 climate,set_humidity,"Increase the humidity to <humidity>.",8
 climate,set_humidity,"Set the humidity level to <humidity> percent.",8
 climate,set_humidity,"Can you adjust the humidity to <humidity> percent?",8
-climate,set_fan_mode,"Set the fan to high speed.",8
-climate,set_fan_mode,"Please put the fan on low.",8
-climate,set_fan_mode,"Change the fan setting to medium.",8
-climate,set_hvac_mode,"Switch the system to cooling mode.",8
-climate,set_hvac_mode,"Can we set the HVAC to heat?",8
-climate,set_hvac_mode,"Change the HVAC to automatic.",8
+climate,set_fan_mode,"Set the climate fan to <fan_mode> speed.",8
+climate,set_fan_mode,"Please put the climate fan on <fan_mode>.",8
+climate,set_fan_mode,"Change the air conditioning fan to <fan_mode>.",8
+climate,set_hvac_mode,"Switch the system to <hvac_mode> mode.",8
+climate,set_hvac_mode,"Can we set the HVAC to <hvac_mode>?",8
+climate,set_hvac_mode,"Change the HVAC to <hvac_mode>.",8
 light,turn_on,"Set the brightness of <device_name> to <brightness>%.",8
 light,turn_on,"Dim <device_name> to <brightness> percent brightness.",8
 light,turn_on,"Brighten <device_name> to <brightness>.",8
@@ -199,12 +197,12 @@ light,turn_on,"Adjust <device_name> brightness to <brightness>.",8
 light,turn_on,"Increase <device_name>'s brightness to <brightness>.",8
 light,turn_on,"Lower the brightness of <device_name> to <brightness>.",8
 light,turn_on,"Can you set <device_name>'s brightness level to <brightness> percent?",8
-light,turn_on,"I'd like <device_name> at <brightness> percent brightness,8
+light,turn_on,"I'd like <device_name> at <brightness> percent brightness",8
 light,turn_on,"Can you make <device_name> <color>?",8
 light,turn_on,"Change the color of <device_name> to <color>.",8
 light,turn_on,"Change <device_name> to a <color> hue.",8
 light,turn_on,"Set <device_name> to be <color>.",8
-light,turn_on,"I want <device_name> to be <color>,8
+light,turn_on,"I want <device_name> to be <color>",8
 light,turn_on,"Can you make <device_name> shine in <color>?",8
 light,turn_on,"Turn <device_name> to a <color> shade.",8
 light,turn_on,"Turn <device_name> <color>.",8