From fb59bb992843aefd67a49f876eae4175ee3622d4 Mon Sep 17 00:00:00 2001
From: Xueqing Liu <liususan091219@users.noreply.github.com>
Date: Fri, 3 Dec 2021 12:45:16 -0500
Subject: [PATCH] adding TODOs for NLP module, so students can implement other
 tasks more easily (#321)

* fixing ray pickle bug, skipping macosx bug, completing code for seqregression

* catching connectionerror

* adding TODOs for NLP module
---
 flaml/data.py                              |  9 ++-
 flaml/model.py                             | 48 ++++++++++++++--
 flaml/nlp/huggingface/trainer.py           | 48 +++++++++++++++-
 flaml/nlp/utils.py                         | 66 ++++++++++++++++------
 test/nlp/test_autohf.py                    | 38 +++++++------
 test/nlp/test_autohf_classificationhead.py | 13 ++++-
 test/nlp/test_autohf_cv.py                 | 15 +++--
 test/nlp/test_autohf_maxiter1.py           | 15 +++--
 test/nlp/test_autohf_regression.py         | 25 +++++---
 9 files changed, 214 insertions(+), 63 deletions(-)

diff --git a/flaml/data.py b/flaml/data.py
index b21150a68..427414bdb 100644
--- a/flaml/data.py
+++ b/flaml/data.py
@@ -12,6 +12,7 @@ from .training_log import training_log_reader
 from datetime import datetime
 from typing import Dict, Union, List
 
+# TODO: if your task is not listed below, define it as an all-caps constant
 SEQCLASSIFICATION = "seq-classification"
 CLASSIFICATION = ("binary", "multi", "classification", SEQCLASSIFICATION)
 SEQREGRESSION = "seq-regression"
@@ -20,10 +21,16 @@ TS_FORECAST = "ts_forecast"
 TS_TIMESTAMP_COL = "ds"
 TS_VALUE_COL = "y"
 FORECAST = "forecast"
+SUMMARIZATION = "summarization"
+NLG_TASKS = (SUMMARIZATION,)
+NLU_TASKS = (
+    SEQREGRESSION,
+    SEQCLASSIFICATION,
+)
 
 
 def _is_nlp_task(task):
-    if task in [SEQCLASSIFICATION, SEQREGRESSION]:
+    if task in NLU_TASKS + NLG_TASKS:
         return True
     else:
         return False
diff --git a/flaml/model.py b/flaml/model.py
index bc37bd6a3..ad2261882 100644
--- a/flaml/model.py
+++ b/flaml/model.py
@@ -23,6 +23,8 @@ from .data import (
     TS_FORECAST,
     TS_TIMESTAMP_COL,
     TS_VALUE_COL,
+    SEQCLASSIFICATION,
+    SEQREGRESSION,
 )
 
 import pandas as pd
@@ -303,8 +305,8 @@ class TransformersEstimator(BaseEstimator):
         return train_df
 
     @classmethod
-    def search_space(cls, **params):
-        return {
+    def search_space(cls, data_size, task, **params):
+        search_space_dict = {
             "learning_rate": {
                 "domain": tune.loguniform(lower=1e-6, upper=1e-3),
                 "init_value": 1e-5,
@@ -331,6 +333,14 @@ class TransformersEstimator(BaseEstimator):
             "seed": {"domain": tune.choice(list(range(40, 45))), "init_value": 42},
             "global_max_steps": {"domain": sys.maxsize, "init_value": sys.maxsize},
         }
+        # TODO: if task == SUMMARIZATION, uncomment the code below and set the search
+        #  space for "num_beams" in search_space_dict using
+        #  search_space_dict["num_beams"] = {...}
+
+        # if task in NLG_TASKS:
+        #     search_space_dict["num_beams"] = {"domain": tune.choice(...)}
+
+        return search_space_dict
 
     def _init_hpo_args(self, automl_fit_kwargs: dict = None):
         from .nlp.utils import HPOArgs
@@ -356,7 +366,15 @@ class TransformersEstimator(BaseEstimator):
     def fit(self, X_train: DataFrame, y_train: Series, budget=None, **kwargs):
         from transformers import EarlyStoppingCallback
         from transformers.trainer_utils import set_seed
-        from transformers import AutoTokenizer, TrainingArguments
+        from transformers import AutoTokenizer
+
+        #   TODO: if self._task == SUMMARIZATION, uncomment the code below (add indentation before
+        #         from transformers import TrainingArguments)
+        # if self._task in NLG_TASKS:
+        #     from transformers import Seq2SeqTrainingArguments as TrainingArguments
+        # else:
+        from transformers import TrainingArguments
+
         import transformers
         from datasets import Dataset
         from .nlp.utils import (
@@ -367,6 +385,13 @@ class TransformersEstimator(BaseEstimator):
             get_trial_fold_name,
             date_str,
         )
+
+        # TODO: if self._task == SUMMARIZATION, uncomment the code below (add indentation before
+        #  from .nlp.huggingface.trainer import TrainerForAuto)
+
+        # if self._task in NLG_TASKS:
+        #     from .nlp.huggingface.trainer import Seq2SeqTrainerForAuto as TrainerForAuto
+        # else:
         from .nlp.huggingface.trainer import TrainerForAuto
 
         this_params = self.params
@@ -414,6 +439,13 @@ class TransformersEstimator(BaseEstimator):
 
         X_train = self._preprocess(X_train, self._task, **kwargs)
         train_dataset = Dataset.from_pandas(self._join(X_train, y_train))
+
+        # TODO: set a breakpoint here, observe the resulting train_dataset,
+        #  compare it with the output of the tokenized results in your transformer example
+        #  for example, if your task is MULTIPLECHOICE, you need to compare train_dataset with
+        #  the output of https://github.com/huggingface/transformers/blob/master/examples/pytorch/multiple-choice/run_swag.py#L329
+        #  make sure they are the same
+
         if X_val is not None:
             X_val = self._preprocess(X_val, self._task, **kwargs)
             eval_dataset = Dataset.from_pandas(self._join(X_val, y_val))
@@ -528,6 +560,7 @@ class TransformersEstimator(BaseEstimator):
                 logger.warning("checkpoint {} not found".format(ckpt_location))
 
     def cleanup(self):
+        super().cleanup()
         if hasattr(self, "_ckpt_remains"):
             for each_ckpt in self._ckpt_remains:
                 self._delete_one_ckpt(each_ckpt)
@@ -558,7 +591,6 @@ class TransformersEstimator(BaseEstimator):
 
     def _compute_metrics_by_dataset_name(self, eval_pred):
         from .ml import sklearn_metric_loss_score
-        from .data import SEQREGRESSION
         import datasets
         from .nlp.utils import load_default_huggingface_metric_for_task
 
@@ -638,7 +670,13 @@ class TransformersEstimator(BaseEstimator):
         self._model = TrainerForAuto(model=best_model, args=training_args)
         predictions = self._model.predict(test_dataset)
 
-        return np.argmax(predictions.predictions, axis=1)
+        if self._task == SEQCLASSIFICATION:
+            return np.argmax(predictions.predictions, axis=1)
+        elif self._task == SEQREGRESSION:
+            return predictions.predictions
+        # TODO: elif self._task == your task, return the corresponding prediction
+        #  e.g., if your task == QUESTIONANSWERING, you need to return the answer instead
+        #  of the index
 
     def config2params(cls, config: dict) -> dict:
         params = config.copy()
diff --git a/flaml/nlp/huggingface/trainer.py b/flaml/nlp/huggingface/trainer.py
index 2eb3a4c5c..a4a005605 100644
--- a/flaml/nlp/huggingface/trainer.py
+++ b/flaml/nlp/huggingface/trainer.py
@@ -2,12 +2,19 @@ import os
 
 try:
     from transformers import Trainer as TFTrainer
+    from transformers import Seq2SeqTrainer
 except ImportError:
     TFTrainer = object
 
 
 class TrainerForAuto(TFTrainer):
-    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
+    def evaluate(
+        self,
+        eval_dataset=None,
+        ignore_keys=None,
+        metric_key_prefix="eval",
+        is_seq2seq=False,
+    ):
         """Overriding transformers.Trainer.evaluate by saving metrics and checkpoint path"""
         from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
 
@@ -15,8 +22,21 @@ class TrainerForAuto(TFTrainer):
             self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
         )
         eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
+
+        # TODO: if your task is seq2seq (i.e., SUMMARIZATION), uncomment the code below (add indentation before metrics = eval_dataset...)
+
+        # if is_seq2seq:
+        #     metrics = eval_dataset and super().evaluate(
+        #         eval_dataset,
+        #         ignore_keys,
+        #         metric_key_prefix,
+        #         num_beams=self.args.num_beams,
+        #     )
+        # else:
         metrics = eval_dataset and super().evaluate(
-            eval_dataset, ignore_keys, metric_key_prefix
+            eval_dataset,
+            ignore_keys,
+            metric_key_prefix,
         )
         if metrics:
             for key in list(metrics.keys()):
@@ -29,3 +49,27 @@ class TrainerForAuto(TFTrainer):
         else:
             self.ckpt_to_global_step = {ckpt_dir: self.state.global_step}
             self.ckpt_to_metric = {ckpt_dir: metrics} if metrics else {}
+
+
+# TODO: if your task is SUMMARIZATION, you need a different
+#  class, Seq2SeqTrainerForAuto; uncomment the code below.
+#  Note: this implementation is an untested draft,
+#  so debug Seq2SeqTrainerForAuto to verify that it is
+#  correct before relying on it.
+
+
+# class Seq2SeqTrainerForAuto(Seq2SeqTrainer, TrainerForAuto):
+#     def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
+#         """Overriding transformers.Trainer.evaluate by saving metrics and checkpoint path"""
+#         super(TrainerForAuto).evaluate(
+#             eval_dataset, ignore_keys, metric_key_prefix, is_seq2seq=True
+#         )
+
+
+# TODO: if your task is QUESTIONANSWERING, uncomment the code below
+#  by adapting the code in https://github.com/huggingface/transformers/blob/master/examples/pytorch/question-answering/trainer_qa.py#L28
+
+
+# class QATrainerForAuto(TrainerForAuto):
+#     pass
+# TODO: if your task is QUESTIONANSWERING, do the post processing here
diff --git a/flaml/nlp/utils.py b/flaml/nlp/utils.py
index 8a02a4c43..97526081a 100644
--- a/flaml/nlp/utils.py
+++ b/flaml/nlp/utils.py
@@ -10,6 +10,14 @@ def load_default_huggingface_metric_for_task(task):
         return "accuracy", "max"
     elif task == SEQREGRESSION:
         return "rmse", "max"
+    # TODO: elif task == your task, return the default metric name for your task,
+    #  e.g., if task == MULTIPLECHOICE, return "accuracy"
+    #  notice this metric name has to be in ['accuracy', 'bertscore', 'bleu', 'bleurt',
+    #  'cer', 'chrf', 'code_eval', 'comet', 'competition_math', 'coval', 'cuad',
+    #  'f1', 'gleu', 'glue', 'google_bleu', 'indic_glue', 'matthews_correlation',
+    #  'meteor', 'pearsonr', 'precision', 'recall', 'rouge', 'sacrebleu', 'sari',
+    #  'seqeval', 'spearmanr', 'squad', 'squad_v2', 'super_glue', 'ter', 'wer',
+    #  'wiki_split', 'xnli']
 
 
 global tokenized_column_names
@@ -20,6 +28,11 @@ def tokenize_text(X, task, custom_hpo_task):
 
     if task in (SEQCLASSIFICATION, SEQREGRESSION):
         return tokenize_text_seqclassification(X, custom_hpo_task)
+    # TODO: elif task == your task, return the tokenized result.
+    #  For example, if your task == MULTIPLECHOICE, you should
+    #  create a function named tokenize_text_multiplechoice(X, custom_hpo_args)
+    #  that does the same thing as preprocess_function at
+    #  https://github.com/huggingface/transformers/blob/master/examples/pytorch/multiple-choice/run_swag.py#L329
 
 
 def tokenize_text_seqclassification(X, custom_hpo_args):
@@ -79,6 +92,8 @@ def get_num_labels(task, y_train):
         return 1
     elif task == SEQCLASSIFICATION:
         return len(set(y_train))
+    else:
+        return None
 
 
 def _clean_value(value: Any) -> str:
@@ -155,25 +170,43 @@ def load_model(checkpoint_path, task, num_labels, per_model_config=None):
     def get_this_model():
         from transformers import AutoModelForSequenceClassification
 
-        return AutoModelForSequenceClassification.from_pretrained(
-            checkpoint_path, config=model_config
-        )
+        if task in (SEQCLASSIFICATION, SEQREGRESSION):
+            return AutoModelForSequenceClassification.from_pretrained(
+                checkpoint_path, config=model_config
+            )
+        # TODO: elif task == your task, fill in the line in your transformers example
+        #  that loads the model, e.g., if task == MULTIPLE CHOICE, according to
+        #  https://github.com/huggingface/transformers/blob/master/examples/pytorch/multiple-choice/run_swag.py#L298
+        #  you can return AutoModelForMultipleChoice.from_pretrained(checkpoint_path, config=model_config)
 
     def is_pretrained_model_in_classification_head_list(model_type):
         return model_type in MODEL_CLASSIFICATION_HEAD_MAPPING
 
     def _set_model_config(checkpoint_path):
-        if per_model_config and len(per_model_config) > 0:
-            model_config = AutoConfig.from_pretrained(
-                checkpoint_path,
-                num_labels=model_config_num_labels,
-                **per_model_config,
-            )
-        else:
-            model_config = AutoConfig.from_pretrained(
-                checkpoint_path, num_labels=model_config_num_labels
-            )
-        return model_config
+        if task in (SEQCLASSIFICATION, SEQREGRESSION):
+            if per_model_config and len(per_model_config) > 0:
+                model_config = AutoConfig.from_pretrained(
+                    checkpoint_path,
+                    num_labels=model_config_num_labels,
+                    **per_model_config,
+                )
+            else:
+                model_config = AutoConfig.from_pretrained(
+                    checkpoint_path, num_labels=model_config_num_labels
+                )
+            return model_config
+        # TODO: elif task == your task, uncomment the code below:
+        # else:
+        #     if per_model_config and len(per_model_config) > 0:
+        #         model_config = AutoConfig.from_pretrained(
+        #             checkpoint_path,
+        #             **per_model_config,
+        #         )
+        #     else:
+        #         model_config = AutoConfig.from_pretrained(
+        #             checkpoint_path
+        #         )
+        #     return model_config
 
     if task == SEQCLASSIFICATION:
         num_labels_old = AutoConfig.from_pretrained(checkpoint_path).num_labels
@@ -199,8 +232,9 @@ def load_model(checkpoint_path, task, num_labels, per_model_config=None):
             this_model = get_this_model()
         this_model.resize_token_embeddings(this_vocab_size)
         return this_model
-    elif task == SEQREGRESSION:
-        model_config_num_labels = 1
+    else:
+        if task == SEQREGRESSION:
+            model_config_num_labels = 1
         model_config = _set_model_config(checkpoint_path)
         this_model = get_this_model()
         return this_model
diff --git a/test/nlp/test_autohf.py b/test/nlp/test_autohf.py
index e7436bcc2..6250df5ce 100644
--- a/test/nlp/test_autohf.py
+++ b/test/nlp/test_autohf.py
@@ -1,22 +1,25 @@
-import os
+import sys
 import pytest
 
 
-@pytest.mark.skipif(os.name == "posix", reason="do not run on mac os")
+@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
 def test_hf_data():
     from flaml import AutoML
-
+    import requests
     from datasets import load_dataset
 
-    train_dataset = (
-        load_dataset("glue", "mrpc", split="train[:1%]").to_pandas().iloc[0:4]
-    )
-    dev_dataset = (
-        load_dataset("glue", "mrpc", split="train[1%:2%]").to_pandas().iloc[0:4]
-    )
-    test_dataset = (
-        load_dataset("glue", "mrpc", split="test[1%:2%]").to_pandas().iloc[0:4]
-    )
+    try:
+        train_dataset = (
+            load_dataset("glue", "mrpc", split="train[:1%]").to_pandas().iloc[0:4]
+        )
+        dev_dataset = (
+            load_dataset("glue", "mrpc", split="train[1%:2%]").to_pandas().iloc[0:4]
+        )
+        test_dataset = (
+            load_dataset("glue", "mrpc", split="test[1%:2%]").to_pandas().iloc[0:4]
+        )
+    except requests.exceptions.ConnectionError:
+        return
 
     custom_sent_keys = ["sentence1", "sentence2"]
     label_key = "label"
@@ -75,12 +78,15 @@ def test_hf_data():
 
 def _test_custom_data():
     from flaml import AutoML
-
+    import requests
     import pandas as pd
 
-    train_dataset = pd.read_csv("data/input/train.tsv", delimiter="\t", quoting=3)
-    dev_dataset = pd.read_csv("data/input/dev.tsv", delimiter="\t", quoting=3)
-    test_dataset = pd.read_csv("data/input/test.tsv", delimiter="\t", quoting=3)
+    try:
+        train_dataset = pd.read_csv("data/input/train.tsv", delimiter="\t", quoting=3)
+        dev_dataset = pd.read_csv("data/input/dev.tsv", delimiter="\t", quoting=3)
+        test_dataset = pd.read_csv("data/input/test.tsv", delimiter="\t", quoting=3)
+    except requests.exceptions.ConnectionError:
+        pass
 
     custom_sent_keys = ["#1 String", "#2 String"]
     label_key = "Quality"
diff --git a/test/nlp/test_autohf_classificationhead.py b/test/nlp/test_autohf_classificationhead.py
index c81cd1069..a8740bda3 100644
--- a/test/nlp/test_autohf_classificationhead.py
+++ b/test/nlp/test_autohf_classificationhead.py
@@ -1,10 +1,17 @@
 def test_classification_head():
     from flaml import AutoML
-
+    import requests
     from datasets import load_dataset
 
-    train_dataset = load_dataset("emotion", split="train[:1%]").to_pandas().iloc[0:10]
-    dev_dataset = load_dataset("emotion", split="train[1%:2%]").to_pandas().iloc[0:10]
+    try:
+        train_dataset = (
+            load_dataset("emotion", split="train[:1%]").to_pandas().iloc[0:10]
+        )
+        dev_dataset = (
+            load_dataset("emotion", split="train[1%:2%]").to_pandas().iloc[0:10]
+        )
+    except requests.exceptions.ConnectionError:
+        return
 
     custom_sent_keys = ["text"]
     label_key = "label"
diff --git a/test/nlp/test_autohf_cv.py b/test/nlp/test_autohf_cv.py
index 0e75a32ca..9d723e7b8 100644
--- a/test/nlp/test_autohf_cv.py
+++ b/test/nlp/test_autohf_cv.py
@@ -1,16 +1,19 @@
-import os
+import sys
 import pytest
 
 
-@pytest.mark.skipif(os.name == "posix", reason="do not run on mac os")
+@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
 def test_cv():
     from flaml import AutoML
-
+    import requests
     from datasets import load_dataset
 
-    train_dataset = (
-        load_dataset("glue", "mrpc", split="train[:1%]").to_pandas().iloc[0:4]
-    )
+    try:
+        train_dataset = (
+            load_dataset("glue", "mrpc", split="train[:1%]").to_pandas().iloc[0:4]
+        )
+    except requests.exceptions.ConnectionError:
+        return
 
     custom_sent_keys = ["sentence1", "sentence2"]
     label_key = "label"
diff --git a/test/nlp/test_autohf_maxiter1.py b/test/nlp/test_autohf_maxiter1.py
index 0fe72b189..c389b0df0 100644
--- a/test/nlp/test_autohf_maxiter1.py
+++ b/test/nlp/test_autohf_maxiter1.py
@@ -1,15 +1,20 @@
-import os
+import sys
 import pytest
 
 
-@pytest.mark.skipif(os.name == "posix", reason="do not run on mac os")
+@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
 def test_max_iter_1():
     from flaml import AutoML
-
+    import requests
     from datasets import load_dataset
 
-    train_dataset = load_dataset("glue", "mrpc", split="train").to_pandas().iloc[0:4]
-    dev_dataset = load_dataset("glue", "mrpc", split="train").to_pandas().iloc[0:4]
+    try:
+        train_dataset = (
+            load_dataset("glue", "mrpc", split="train").to_pandas().iloc[0:4]
+        )
+        dev_dataset = load_dataset("glue", "mrpc", split="train").to_pandas().iloc[0:4]
+    except requests.exceptions.ConnectionError:
+        return
 
     custom_sent_keys = ["sentence1", "sentence2"]
     label_key = "label"
diff --git a/test/nlp/test_autohf_regression.py b/test/nlp/test_autohf_regression.py
index afd24a410..a4b4877db 100644
--- a/test/nlp/test_autohf_regression.py
+++ b/test/nlp/test_autohf_regression.py
@@ -1,23 +1,26 @@
-import os
+import sys
 import pytest
 
 
-@pytest.mark.skipif(os.name == "posix", reason="do not run on mac os")
+@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
 def test_regression():
     try:
         import ray
     except ImportError:
         return
     from flaml import AutoML
-
+    import requests
     from datasets import load_dataset
 
-    train_dataset = (
-        load_dataset("glue", "stsb", split="train[:1%]").to_pandas().iloc[:20]
-    )
-    dev_dataset = (
-        load_dataset("glue", "stsb", split="train[1%:2%]").to_pandas().iloc[:20]
-    )
+    try:
+        train_dataset = (
+            load_dataset("glue", "stsb", split="train[:1%]").to_pandas().iloc[:20]
+        )
+        dev_dataset = (
+            load_dataset("glue", "stsb", split="train[1%:2%]").to_pandas().iloc[:20]
+        )
+    except requests.exceptions.ConnectionError:
+        return
 
     custom_sent_keys = ["sentence1", "sentence2"]
     label_key = "label"
@@ -50,3 +53,7 @@ def test_regression():
     automl.fit(
         X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
     )
+
+
+if __name__ == "__main__":
+    test_regression()