diff --git a/flaml/nlp/huggingface/switch_head_auto.py b/flaml/nlp/huggingface/switch_head_auto.py deleted file mode 100644 index 9a8bcfe16..000000000 --- a/flaml/nlp/huggingface/switch_head_auto.py +++ /dev/null @@ -1,64 +0,0 @@ -from collections import OrderedDict - -import transformers - -if transformers.__version__.startswith("3"): - from transformers.modeling_electra import ElectraClassificationHead - from transformers.modeling_roberta import RobertaClassificationHead - -else: - from transformers.models.electra.modeling_electra import ElectraClassificationHead - from transformers.models.roberta.modeling_roberta import RobertaClassificationHead - -MODEL_CLASSIFICATION_HEAD_MAPPING = OrderedDict( - [ - ("electra", ElectraClassificationHead), - ("roberta", RobertaClassificationHead), - ] -) - - -class AutoSeqClassificationHead: - """ - This is a class for getting classification head class based on the name of the LM - instantiated as one of the ClassificationHead classes of the library when - created with the `AutoSeqClassificationHead.from_model_type_and_config` method. - - This class cannot be instantiated directly using ``__init__()`` (throws an error). - """ - - def __init__(self): - raise EnvironmentError( - "AutoSeqClassificationHead is designed to be instantiated " - "using the `AutoSeqClassificationHead.from_model_type_and_config(cls, model_type, config)` methods." - ) - - @classmethod - def from_model_type_and_config( - cls, model_type: str, config: transformers.PretrainedConfig - ): - """ - Instantiate one of the classification head classes from the mode_type and model configuration. - - Args: - model_type: A string, which desribes the model type, e.g., "electra". - config: The huggingface class of the model's configuration. - - Example: - - ```python - from transformers import AutoConfig - model_config = AutoConfig.from_pretrained("google/electra-base-discriminator") - AutoSeqClassificationHead.from_model_type_and_config("electra", model_config) - ``` - """ - if model_type in MODEL_CLASSIFICATION_HEAD_MAPPING.keys(): - return MODEL_CLASSIFICATION_HEAD_MAPPING[model_type](config) - raise ValueError( - "Unrecognized configuration class {} for class {}.\n" - "Model type should be one of {}.".format( - config.__class__, - cls.__name__, - ", ".join(MODEL_CLASSIFICATION_HEAD_MAPPING.keys()), - ) - ) diff --git a/flaml/nlp/huggingface/utils.py b/flaml/nlp/huggingface/utils.py index 728fded88..13f1931fd 100644 --- a/flaml/nlp/huggingface/utils.py +++ b/flaml/nlp/huggingface/utils.py @@ -404,10 +404,6 @@ def load_model(checkpoint_path, task, num_labels=None): transformers.logging.set_verbosity_error() from transformers import AutoConfig - from ..huggingface.switch_head_auto import ( - AutoSeqClassificationHead, - MODEL_CLASSIFICATION_HEAD_MAPPING, - ) from ...data import SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION def get_this_model(checkpoint_path, task, model_config): @@ -418,7 +414,7 @@ def load_model(checkpoint_path, task, num_labels=None): if task in (SEQCLASSIFICATION, SEQREGRESSION): return AutoModelForSequenceClassification.from_pretrained( - checkpoint_path, config=model_config + checkpoint_path, config=model_config, ignore_mismatched_sizes=True ) elif task == TOKENCLASSIFICATION: return AutoModelForTokenClassification.from_pretrained( @@ -433,9 +429,6 @@ def load_model(checkpoint_path, task, num_labels=None): checkpoint_path, config=model_config ) - def is_pretrained_model_in_classification_head_list(model_type): - return model_type in 
MODEL_CLASSIFICATION_HEAD_MAPPING - def _set_model_config(checkpoint_path): if task in (SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION): model_config = AutoConfig.from_pretrained( @@ -448,40 +441,11 @@ def load_model(checkpoint_path, task, num_labels=None): return model_config current_config = AutoConfig.from_pretrained(checkpoint_path) - this_model_type, this_vocab_size = ( - current_config.model_type, - current_config.vocab_size, - ) + this_vocab_size = current_config.vocab_size - if task == SEQCLASSIFICATION: - num_labels_old = current_config.num_labels - if is_pretrained_model_in_classification_head_list(this_model_type): - model_config_num_labels = num_labels_old - else: - model_config_num_labels = num_labels - new_config = _set_model_config(checkpoint_path) + model_config_num_labels = num_labels + new_config = _set_model_config(checkpoint_path) - if is_pretrained_model_in_classification_head_list(this_model_type): - if num_labels != num_labels_old: - this_model = get_this_model(checkpoint_path, task, new_config) - new_config.num_labels = num_labels - this_model.num_labels = num_labels - this_model.classifier = ( - AutoSeqClassificationHead.from_model_type_and_config( - this_model_type, new_config - ) - ) - else: - this_model = get_this_model(checkpoint_path, task, new_config) - else: - this_model = get_this_model(checkpoint_path, task, new_config) - this_model.resize_token_embeddings(this_vocab_size) - return this_model - else: - if task == SEQREGRESSION: - model_config_num_labels = 1 - elif task == TOKENCLASSIFICATION: - model_config_num_labels = num_labels - model_config = _set_model_config(checkpoint_path) - this_model = get_this_model(checkpoint_path, task, model_config) - return this_model + this_model = get_this_model(checkpoint_path, task, new_config) + this_model.resize_token_embeddings(this_vocab_size) + return this_model diff --git a/test/nlp/test_autohf.py b/test/nlp/test_autohf.py index f21f02543..38c82027c 100644 --- a/test/nlp/test_autohf.py +++ b/test/nlp/test_autohf.py @@ -2,6 +2,8 @@ import sys import pytest import requests from utils import get_toy_data_seqclassification, get_automl_settings +import os +import shutil @pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os") @@ -71,6 +73,9 @@ def test_hf_data(): del automl + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": test_hf_data() diff --git a/test/nlp/test_autohf_classificationhead.py b/test/nlp/test_autohf_classificationhead.py index 526b386c1..d2eab332e 100644 --- a/test/nlp/test_autohf_classificationhead.py +++ b/test/nlp/test_autohf_classificationhead.py @@ -1,14 +1,105 @@ -from utils import get_toy_data_multiclassclassification, get_automl_settings +from utils import ( + get_toy_data_regression, + get_toy_data_binclassification, + get_toy_data_multiclassclassification, + get_automl_settings, +) +import sys +import pytest +import os +import shutil + +data_list = [ + "get_toy_data_regression", + "get_toy_data_binclassification", + "get_toy_data_multiclassclassification", +] +model_path_list = [ + "textattack/bert-base-uncased-STS-B", + "textattack/bert-base-uncased-SST-2", + "textattack/bert-base-uncased-MNLI", +] -def test_classification_head(): +def test_switch_1_1(): + data_idx, model_path_idx = 0, 0 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def test_switch_1_2(): + data_idx, model_path_idx = 0, 1 + _test_switch_classificationhead( + data_list[data_idx], 
model_path_list[model_path_idx] + ) + + +def test_switch_1_3(): + data_idx, model_path_idx = 0, 2 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def test_switch_2_1(): + data_idx, model_path_idx = 1, 0 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def test_switch_2_2(): + data_idx, model_path_idx = 1, 1 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def test_switch_2_3(): + data_idx, model_path_idx = 1, 2 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def test_switch_3_1(): + data_idx, model_path_idx = 2, 0 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def test_switch_3_2(): + data_idx, model_path_idx = 2, 1 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def test_switch_3_3(): + data_idx, model_path_idx = 2, 2 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def _test_switch_classificationhead(each_data, each_model_path): from flaml import AutoML import requests - X_train, y_train, X_val, y_val = get_toy_data_multiclassclassification() automl = AutoML() + X_train, y_train, X_val, y_val = globals()[each_data]() automl_settings = get_automl_settings() + automl_settings["model_path"] = each_model_path + + if each_data == "get_toy_data_regression": + automl_settings["task"] = "seq-regression" + automl_settings["metric"] = "pearsonr" + else: + automl_settings["task"] = "seq-classification" + automl_settings["metric"] = "accuracy" try: automl.fit( @@ -21,6 +112,9 @@ def test_classification_head(): except requests.exceptions.HTTPError: return + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": - test_classification_head() + _test_switch_classificationhead(data_list[0], model_path_list[0]) diff --git a/test/nlp/test_autohf_custom_metric.py b/test/nlp/test_autohf_custom_metric.py index ac38039b2..b28e48d35 100644 --- a/test/nlp/test_autohf_custom_metric.py +++ b/test/nlp/test_autohf_custom_metric.py @@ -1,6 +1,8 @@ import sys import pytest from utils import get_toy_data_seqclassification, get_automl_settings +import os +import shutil def custom_metric( @@ -81,6 +83,9 @@ def test_custom_metric(): del automl + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": test_custom_metric() diff --git a/test/nlp/test_autohf_cv.py b/test/nlp/test_autohf_cv.py index 2bc11c880..6e9162dd9 100644 --- a/test/nlp/test_autohf_cv.py +++ b/test/nlp/test_autohf_cv.py @@ -1,6 +1,8 @@ import sys import pytest from utils import get_toy_data_seqclassification, get_automl_settings +import os +import shutil @pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os") @@ -19,6 +21,9 @@ def test_cv(): except requests.exceptions.HTTPError: return + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": test_cv() diff --git a/test/nlp/test_autohf_multichoice_classification.py b/test/nlp/test_autohf_multichoice_classification.py index 4f4cffa75..61691141e 100644 --- a/test/nlp/test_autohf_multichoice_classification.py +++ b/test/nlp/test_autohf_multichoice_classification.py @@ -1,6 +1,8 @@ import sys import pytest from utils import get_toy_data_multiplechoiceclassification, get_automl_settings 
+import os +import shutil @pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os") @@ -46,6 +48,9 @@ def test_mcc(): accuracy = round(true_count / len(y_pred), 5) print("Accuracy: " + str(accuracy)) + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": test_mcc() diff --git a/test/nlp/test_autohf_regression.py b/test/nlp/test_autohf_regression.py index a5f208520..cb01a6152 100644 --- a/test/nlp/test_autohf_regression.py +++ b/test/nlp/test_autohf_regression.py @@ -1,6 +1,8 @@ import sys import pytest from utils import get_toy_data_seqregression, get_automl_settings +import os +import shutil @pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os") @@ -32,6 +34,9 @@ def test_regression(): ) automl.predict(X_val) + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": test_regression() diff --git a/test/nlp/test_autohf_summarization.py b/test/nlp/test_autohf_summarization.py index 8cde806a9..9e21984d6 100644 --- a/test/nlp/test_autohf_summarization.py +++ b/test/nlp/test_autohf_summarization.py @@ -2,6 +2,8 @@ import sys import pytest import requests from utils import get_toy_data_summarization, get_automl_settings +import os +import shutil @pytest.mark.skipif( @@ -48,6 +50,9 @@ def test_summarization(): ) automl.predict(X_test) + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": test_summarization() diff --git a/test/nlp/test_autohf_tokenclassification.py b/test/nlp/test_autohf_tokenclassification.py index cb1c40e9d..051c2bf41 100644 --- a/test/nlp/test_autohf_tokenclassification.py +++ b/test/nlp/test_autohf_tokenclassification.py @@ -1,6 +1,8 @@ import sys import pytest import requests +import os +import shutil from utils import ( get_toy_data_tokenclassification_idlabel, get_toy_data_tokenclassification_tokenlabel, @@ -62,6 +64,9 @@ def test_tokenclassification_idlabel(): if min_inter_result != sys.maxsize: assert val_loss == min_inter_result + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + @pytest.mark.skipif( sys.platform == "darwin" or sys.version < "3.7", @@ -106,6 +111,9 @@ def test_tokenclassification_tokenlabel(): if min_inter_result != sys.maxsize: assert val_loss == min_inter_result + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": test_tokenclassification_idlabel() diff --git a/test/nlp/test_default.py b/test/nlp/test_default.py index cf7c02ce5..fcddcda45 100644 --- a/test/nlp/test_default.py +++ b/test/nlp/test_default.py @@ -1,6 +1,8 @@ from utils import get_toy_data_seqclassification, get_automl_settings import sys from flaml.default import portfolio +import os +import shutil def pop_args(fit_kwargs): @@ -80,6 +82,9 @@ def test_starting_point_not_in_search_space(): == "albert-base-v2" ) + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + def test_points_to_evaluate(): from flaml import AutoML @@ -99,6 +104,9 @@ def test_points_to_evaluate(): automl.fit(X_train, y_train, **automl_settings) + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + # TODO: implement _test_zero_shot_model def test_zero_shot_nomodel(): @@ -131,6 +139,9 @@ def test_zero_shot_nomodel(): pop_args(fit_kwargs) model.fit(X_train, y_train, **fit_kwargs) + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + def 
test_build_error_portfolio(path="./test/nlp/default", strategy="greedy"): import os @@ -159,3 +170,9 @@ def test_build_error_portfolio(path="./test/nlp/default", strategy="greedy"): ) except ValueError: print("Feature not implemented") + + import os + import shutil + + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") diff --git a/test/nlp/utils.py b/test/nlp/utils.py index 18bb92bc1..f8536b960 100644 --- a/test/nlp/utils.py +++ b/test/nlp/utils.py @@ -70,23 +70,19 @@ def get_toy_data_seqclassification(): return X_train, y_train, X_val, y_val, X_test -def get_toy_data_multiclassclassification(): +def get_toy_data_binclassification(): train_data = { "text": [ "i didnt feel humiliated", "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake", - "im grabbing a minute to post i feel greedy wrong", "i am ever feeling nostalgic about the fireplace i will know that it is still on the property", - "i am feeling grouchy", "ive been feeling a little burdened lately wasnt sure why that was", - "ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny", - "i feel as confused about life as a teenager or as jaded as a year old man", "i have been with petronas for years i feel that petronas has performed well and made a huge profit", "i feel romantic too", "i feel like i have to make the suffering i m seeing mean something", "i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter", ], - "label": [0, 0, 3, 2, 3, 0, 5, 4, 1, 2, 0, 1], + "label": [0, 0, 1, 0, 1, 1, 0, 1], } train_dataset = pd.DataFrame(train_data) @@ -95,9 +91,84 @@ def get_toy_data_multiclassclassification(): "i think it s the easiest time of year to feel dissatisfied", "i feel low energy i m just thirsty", "i have immense sympathy with the general point but as a possible proto writer trying to find time to write in the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious", - "i do not feel reassured anxiety is on each side", ], - "label": [3, 0, 1, 1], + "label": [0, 1, 1], + } + dev_dataset = pd.DataFrame(dev_data) + + custom_sent_keys = ["text"] + label_key = "label" + + X_train = train_dataset[custom_sent_keys] + y_train = train_dataset[label_key] + + X_val = dev_dataset[custom_sent_keys] + y_val = dev_dataset[label_key] + + return X_train, y_train, X_val, y_val + + +def get_toy_data_regression(): + train_data = { + "text": [ + "i didnt feel humiliated", + "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake", + "i am ever feeling nostalgic about the fireplace i will know that it is still on the property", + "ive been feeling a little burdened lately wasnt sure why that was", + "i have been with petronas for years i feel that petronas has performed well and made a huge profit", + "i feel romantic too", + "i feel like i have to make the suffering i m seeing mean something", + "i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter", + ], + "label": [1.0, 1.0, 3.0, 1.0, 5.0, 5.0, 1.0, 3.0], + } + train_dataset = pd.DataFrame(train_data) + + dev_data = { + "text": [ + "i think it s the easiest time of year to feel dissatisfied", + "i feel low energy i m just thirsty", + "i have immense sympathy with the general point but as a possible proto writer trying to find time to write in 
the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious", + ], + "label": [1.0, 3.0, 3.0], + } + dev_dataset = pd.DataFrame(dev_data) + + custom_sent_keys = ["text"] + label_key = "label" + + X_train = train_dataset[custom_sent_keys] + y_train = train_dataset[label_key] + + X_val = dev_dataset[custom_sent_keys] + y_val = dev_dataset[label_key] + + return X_train, y_train, X_val, y_val + + +def get_toy_data_multiclassclassification(): + train_data = { + "text": [ + "i didnt feel humiliated", + "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake", + "i am ever feeling nostalgic about the fireplace i will know that it is still on the property", + "ive been feeling a little burdened lately wasnt sure why that was", + "i have been with petronas for years i feel that petronas has performed well and made a huge profit", + "i feel romantic too", + "i feel like i have to make the suffering i m seeing mean something", + "i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter", + ], + "label": [0, 0, 2, 0, 1, 2, 0, 1], + } + train_dataset = pd.DataFrame(train_data) + + dev_data = { + "text": [ + "i think it s the easiest time of year to feel dissatisfied", + "i feel low energy i m just thirsty", + "i have immense sympathy with the general point but as a possible proto writer trying to find time to write in the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious", + ], + "label": [0, 1, 1], } dev_dataset = pd.DataFrame(dev_data) diff --git a/website/docs/Examples/AutoML-NLP.md b/website/docs/Examples/AutoML-NLP.md index 17181d786..7ef0f6c7a 100644 --- a/website/docs/Examples/AutoML-NLP.md +++ b/website/docs/Examples/AutoML-NLP.md @@ -38,6 +38,13 @@ automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_ automl.predict(X_test) ``` +Notice that after you run `automl.fit`, the intermediate checkpoints are saved under the specified output_dir `data/output`. You can use the following code to clean these outputs if they consume a large storage space: + +```python +if os.path.exists("data/output/"): + shutil.rmtree("data/output/") +``` + #### Sample output ```
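The deleted `flaml/nlp/huggingface/switch_head_auto.py` existed to rebuild the classification head by hand whenever a checkpoint had been fine-tuned with a different number of labels than the target task, and it only knew how to do so for `electra` and `roberta`. The diff replaces that machinery with `ignore_mismatched_sizes=True`, which tells `from_pretrained` to skip any pretrained weights whose shapes disagree with the new config and leave those layers freshly initialized. A minimal sketch of the new loading path, using an illustrative checkpoint from the test list and an assumed label count of 3:

```python
# Load a 2-label SST-2 checkpoint as a 3-label classifier: the shape-mismatched
# head weights are discarded and re-initialized instead of raising an error.
# The checkpoint name and num_labels are illustrative assumptions.
from transformers import AutoConfig, AutoModelForSequenceClassification

checkpoint = "textattack/bert-base-uncased-SST-2"  # head trained with 2 labels
config = AutoConfig.from_pretrained(checkpoint, num_labels=3)

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    config=config,
    ignore_mismatched_sizes=True,
)
assert model.config.num_labels == 3
```

Because nothing in this path depends on the model architecture, the electra/roberta-specific head mapping is no longer needed.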
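`load_model` also now resizes the token embeddings of every loaded model to the checkpoint config's `vocab_size`, rather than only in the old sequence-classification branch. A brief illustration of that call, with an assumed checkpoint:

```python
# Make the input embedding matrix match the vocabulary size reported by the
# checkpoint's config; a no-op when they already agree.
from transformers import AutoConfig, AutoModelForSequenceClassification

checkpoint = "textattack/bert-base-uncased-SST-2"  # illustrative assumption
config = AutoConfig.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)

model.resize_token_embeddings(config.vocab_size)
assert model.get_input_embeddings().weight.shape[0] == config.vocab_size
```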
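The nine `test_switch_*_*` wrappers in `test_autohf_classificationhead.py` spell out the 3×3 grid of toy datasets and checkpoints by hand, which keeps each combination selectable as its own named test. A hypothetical `pytest.mark.parametrize` version that generates the same grid (not part of the diff; `_test_switch_classificationhead` is assumed to be the helper defined in that file):

```python
# Hypothetical alternative to test_switch_1_1 ... test_switch_3_3: let pytest
# expand the dataset x checkpoint cross product.
import pytest

data_list = [
    "get_toy_data_regression",
    "get_toy_data_binclassification",
    "get_toy_data_multiclassclassification",
]
model_path_list = [
    "textattack/bert-base-uncased-STS-B",
    "textattack/bert-base-uncased-SST-2",
    "textattack/bert-base-uncased-MNLI",
]


@pytest.mark.parametrize("each_data", data_list)
@pytest.mark.parametrize("each_model_path", model_path_list)
def test_switch_classificationhead(each_data, each_model_path):
    _test_switch_classificationhead(each_data, each_model_path)
```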
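Every test module in the diff now ends with the same `shutil.rmtree("test/data/output/")` block, and it only runs when the test body reaches it; a failing test leaves the checkpoints behind. A sketch of a shared, autouse fixture that would do the same cleanup after every test, pass or fail (hypothetical, e.g. a `test/nlp/conftest.py`, which this diff does not add):

```python
# Hypothetical conftest.py fixture: remove the checkpoint output directory
# after each test in test/nlp, regardless of the test's outcome.
import os
import shutil

import pytest


@pytest.fixture(autouse=True)
def clean_output_dir():
    yield  # run the test first
    if os.path.exists("test/data/output/"):
        shutil.rmtree("test/data/output/")
```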
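The snippet added to `website/docs/Examples/AutoML-NLP.md` calls `os.path.exists` and `shutil.rmtree` without showing their imports. A self-contained version of that cleanup, assuming the example's `output_dir` of `data/output`:

```python
# Delete the intermediate checkpoints written under the example's output_dir.
import os
import shutil

if os.path.exists("data/output/"):
    shutil.rmtree("data/output/")
```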