diff --git a/flaml/model.py b/flaml/model.py
index 24d3ba278..acd42ce6c 100644
--- a/flaml/model.py
+++ b/flaml/model.py
@@ -325,6 +325,7 @@ class TransformersEstimator(BaseEstimator):
             },
             "num_train_epochs": {
                 "domain": tune.loguniform(lower=0.1, upper=10.0),
+                "init_value": 3,
             },
             "per_device_train_batch_size": {
                 "domain": tune.choice([4, 8, 16, 32]),
@@ -536,8 +537,8 @@ class TransformersEstimator(BaseEstimator):
                 evaluate_during_training=True,
                 save_steps=ckpt_freq,
                 save_total_limit=0,
+                metric_for_best_model="loss",
                 fp16=self.custom_hpo_args.fp16,
-                load_best_model_at_end=True,
                 **training_args_config,
             )
         else:
@@ -553,8 +554,8 @@ class TransformersEstimator(BaseEstimator):
                 evaluation_strategy=IntervalStrategy.STEPS,
                 save_steps=ckpt_freq,
                 save_total_limit=0,
+                metric_for_best_model="loss",
                 fp16=self.custom_hpo_args.fp16,
-                load_best_model_at_end=True,
                 **training_args_config,
             )
 
diff --git a/flaml/nlp/huggingface/trainer.py b/flaml/nlp/huggingface/trainer.py
index 52345e467..2bd81bf22 100644
--- a/flaml/nlp/huggingface/trainer.py
+++ b/flaml/nlp/huggingface/trainer.py
@@ -86,28 +86,3 @@ class TrainerForAuto(Seq2SeqTrainer):
             self.ckpt_to_global_step = {ckpt_dir: self.state.global_step}
             self.ckpt_to_metric = {ckpt_dir: metrics} if metrics else {}
         return metrics
-
-# TODO: if your task is SUMMARIZATION, you need a different
-# class Seq2SeqTrainerForAuto, uncomment the code below
-# Note: I have implemented it here,
-# but I don't know whether it's correct, you need to debug
-# Seq2SeqTrainerForAuto to make sure it's correct
-
-
-# class Seq2SeqTrainerForAuto(TrainerForAuto):
-#     def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
-#         """Overriding transformers.Trainer.evaluate by saving metrics and checkpoint path"""
-#         self._is_seq2seq = True
-#         TrainerForAuto.evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
-#         # super(TrainerForAuto, self).evaluate(
-#         #     eval_dataset, ignore_keys, metric_key_prefix
-#         # )
-
-
-# TODO: if your task is QUESTIONANSWERING, uncomment the code below
-# by adapting the code in https://github.com/huggingface/transformers/blob/master/examples/pytorch/question-answering/trainer_qa.py#L28
-
-
-# class QATrainerForAuto(TrainerForAuto):
-#     pass
-#     TODO: if your task is QUESTIONANSWERING, do the post processing here
diff --git a/test/nlp/run_gpu.py b/test/nlp/run_gpu.py
new file mode 100644
index 000000000..8551dd70b
--- /dev/null
+++ b/test/nlp/run_gpu.py
@@ -0,0 +1,82 @@
+import sys
+import pytest
+import pickle
+import shutil
+
+
+@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os")
+def _test_hf_data():
+    from flaml import AutoML
+    import requests
+    from datasets import load_dataset
+
+    try:
+        train_dataset = load_dataset("glue", "mrpc", split="train").to_pandas()
+        dev_dataset = load_dataset("glue", "mrpc", split="validation").to_pandas()
+        test_dataset = load_dataset("glue", "mrpc", split="test").to_pandas()
+    except requests.exceptions.ConnectionError:
+        return
+
+    custom_sent_keys = ["sentence1", "sentence2"]
+    label_key = "label"
+
+    X_train = train_dataset[custom_sent_keys]
+    y_train = train_dataset[label_key]
+
+    X_val = dev_dataset[custom_sent_keys]
+    y_val = dev_dataset[label_key]
+
+    X_test = test_dataset[custom_sent_keys]
+
+    automl = AutoML()
+
+    automl_settings = {
+        "gpu_per_trial": 1,
+        "max_iter": 5,
+        "time_budget": 5000,
+        "task": "seq-classification",
+        "metric": "accuracy",
+        "log_file_name": "seqclass.log",
+        "use_ray": True,
+    }
+
+    automl_settings["custom_hpo_args"] = {
"model_path": "facebook/muppet-roberta-base", + "output_dir": "test/data/output/", + "ckpt_per_epoch": 5, + "fp16": True, + } + + automl.fit( + X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings + ) + + automl = AutoML() + automl.retrain_from_log( + X_train=X_train, + y_train=y_train, + train_full=True, + record_id=0, + **automl_settings + ) + with open("automl.pkl", "wb") as f: + pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL) + with open("automl.pkl", "rb") as f: + automl = pickle.load(f) + shutil.rmtree("test/data/output/") + automl.predict(X_test) + automl.predict(["test test", "test test"]) + automl.predict( + [ + ["test test", "test test"], + ["test test", "test test"], + ["test test", "test test"], + ] + ) + + automl.predict_proba(X_test) + print(automl.classes_) + + +if __name__ == "__main__": + _test_hf_data()