diff --git a/flaml/nlp/huggingface/switch_head_auto.py b/flaml/nlp/huggingface/switch_head_auto.py deleted file mode 100644 index 9a8bcfe16..000000000 --- a/flaml/nlp/huggingface/switch_head_auto.py +++ /dev/null @@ -1,64 +0,0 @@ -from collections import OrderedDict - -import transformers - -if transformers.__version__.startswith("3"): - from transformers.modeling_electra import ElectraClassificationHead - from transformers.modeling_roberta import RobertaClassificationHead - -else: - from transformers.models.electra.modeling_electra import ElectraClassificationHead - from transformers.models.roberta.modeling_roberta import RobertaClassificationHead - -MODEL_CLASSIFICATION_HEAD_MAPPING = OrderedDict( - [ - ("electra", ElectraClassificationHead), - ("roberta", RobertaClassificationHead), - ] -) - - -class AutoSeqClassificationHead: - """ - This is a class for getting classification head class based on the name of the LM - instantiated as one of the ClassificationHead classes of the library when - created with the `AutoSeqClassificationHead.from_model_type_and_config` method. - - This class cannot be instantiated directly using ``__init__()`` (throws an error). - """ - - def __init__(self): - raise EnvironmentError( - "AutoSeqClassificationHead is designed to be instantiated " - "using the `AutoSeqClassificationHead.from_model_type_and_config(cls, model_type, config)` methods." - ) - - @classmethod - def from_model_type_and_config( - cls, model_type: str, config: transformers.PretrainedConfig - ): - """ - Instantiate one of the classification head classes from the mode_type and model configuration. - - Args: - model_type: A string, which desribes the model type, e.g., "electra". - config: The huggingface class of the model's configuration. - - Example: - - ```python - from transformers import AutoConfig - model_config = AutoConfig.from_pretrained("google/electra-base-discriminator") - AutoSeqClassificationHead.from_model_type_and_config("electra", model_config) - ``` - """ - if model_type in MODEL_CLASSIFICATION_HEAD_MAPPING.keys(): - return MODEL_CLASSIFICATION_HEAD_MAPPING[model_type](config) - raise ValueError( - "Unrecognized configuration class {} for class {}.\n" - "Model type should be one of {}.".format( - config.__class__, - cls.__name__, - ", ".join(MODEL_CLASSIFICATION_HEAD_MAPPING.keys()), - ) - ) diff --git a/flaml/nlp/huggingface/utils.py b/flaml/nlp/huggingface/utils.py index 728fded88..13f1931fd 100644 --- a/flaml/nlp/huggingface/utils.py +++ b/flaml/nlp/huggingface/utils.py @@ -404,10 +404,6 @@ def load_model(checkpoint_path, task, num_labels=None): transformers.logging.set_verbosity_error() from transformers import AutoConfig - from ..huggingface.switch_head_auto import ( - AutoSeqClassificationHead, - MODEL_CLASSIFICATION_HEAD_MAPPING, - ) from ...data import SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION def get_this_model(checkpoint_path, task, model_config): @@ -418,7 +414,7 @@ def load_model(checkpoint_path, task, num_labels=None): if task in (SEQCLASSIFICATION, SEQREGRESSION): return AutoModelForSequenceClassification.from_pretrained( - checkpoint_path, config=model_config + checkpoint_path, config=model_config, ignore_mismatched_sizes=True ) elif task == TOKENCLASSIFICATION: return AutoModelForTokenClassification.from_pretrained( @@ -433,9 +429,6 @@ def load_model(checkpoint_path, task, num_labels=None): checkpoint_path, config=model_config ) - def is_pretrained_model_in_classification_head_list(model_type): - return model_type in 
MODEL_CLASSIFICATION_HEAD_MAPPING - def _set_model_config(checkpoint_path): if task in (SEQCLASSIFICATION, SEQREGRESSION, TOKENCLASSIFICATION): model_config = AutoConfig.from_pretrained( @@ -448,40 +441,11 @@ def load_model(checkpoint_path, task, num_labels=None): return model_config current_config = AutoConfig.from_pretrained(checkpoint_path) - this_model_type, this_vocab_size = ( - current_config.model_type, - current_config.vocab_size, - ) + this_vocab_size = current_config.vocab_size - if task == SEQCLASSIFICATION: - num_labels_old = current_config.num_labels - if is_pretrained_model_in_classification_head_list(this_model_type): - model_config_num_labels = num_labels_old - else: - model_config_num_labels = num_labels - new_config = _set_model_config(checkpoint_path) + model_config_num_labels = num_labels + new_config = _set_model_config(checkpoint_path) - if is_pretrained_model_in_classification_head_list(this_model_type): - if num_labels != num_labels_old: - this_model = get_this_model(checkpoint_path, task, new_config) - new_config.num_labels = num_labels - this_model.num_labels = num_labels - this_model.classifier = ( - AutoSeqClassificationHead.from_model_type_and_config( - this_model_type, new_config - ) - ) - else: - this_model = get_this_model(checkpoint_path, task, new_config) - else: - this_model = get_this_model(checkpoint_path, task, new_config) - this_model.resize_token_embeddings(this_vocab_size) - return this_model - else: - if task == SEQREGRESSION: - model_config_num_labels = 1 - elif task == TOKENCLASSIFICATION: - model_config_num_labels = num_labels - model_config = _set_model_config(checkpoint_path) - this_model = get_this_model(checkpoint_path, task, model_config) - return this_model + this_model = get_this_model(checkpoint_path, task, new_config) + this_model.resize_token_embeddings(this_vocab_size) + return this_model diff --git a/test/nlp/test_autohf.py b/test/nlp/test_autohf.py index f21f02543..38c82027c 100644 --- a/test/nlp/test_autohf.py +++ b/test/nlp/test_autohf.py @@ -2,6 +2,8 @@ import sys import pytest import requests from utils import get_toy_data_seqclassification, get_automl_settings +import os +import shutil @pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os") @@ -71,6 +73,9 @@ def test_hf_data(): del automl + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": test_hf_data() diff --git a/test/nlp/test_autohf_classificationhead.py b/test/nlp/test_autohf_classificationhead.py index 526b386c1..d2eab332e 100644 --- a/test/nlp/test_autohf_classificationhead.py +++ b/test/nlp/test_autohf_classificationhead.py @@ -1,14 +1,105 @@ -from utils import get_toy_data_multiclassclassification, get_automl_settings +from utils import ( + get_toy_data_regression, + get_toy_data_binclassification, + get_toy_data_multiclassclassification, + get_automl_settings, +) +import sys +import pytest +import os +import shutil + +data_list = [ + "get_toy_data_regression", + "get_toy_data_binclassification", + "get_toy_data_multiclassclassification", +] +model_path_list = [ + "textattack/bert-base-uncased-STS-B", + "textattack/bert-base-uncased-SST-2", + "textattack/bert-base-uncased-MNLI", +] -def test_classification_head(): +def test_switch_1_1(): + data_idx, model_path_idx = 0, 0 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def test_switch_1_2(): + data_idx, model_path_idx = 0, 1 + _test_switch_classificationhead( + data_list[data_idx], 
model_path_list[model_path_idx] + ) + + +def test_switch_1_3(): + data_idx, model_path_idx = 0, 2 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def test_switch_2_1(): + data_idx, model_path_idx = 1, 0 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def test_switch_2_2(): + data_idx, model_path_idx = 1, 1 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def test_switch_2_3(): + data_idx, model_path_idx = 1, 2 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def test_switch_3_1(): + data_idx, model_path_idx = 2, 0 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def test_switch_3_2(): + data_idx, model_path_idx = 2, 1 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def test_switch_3_3(): + data_idx, model_path_idx = 2, 2 + _test_switch_classificationhead( + data_list[data_idx], model_path_list[model_path_idx] + ) + + +def _test_switch_classificationhead(each_data, each_model_path): from flaml import AutoML import requests - X_train, y_train, X_val, y_val = get_toy_data_multiclassclassification() automl = AutoML() + X_train, y_train, X_val, y_val = globals()[each_data]() automl_settings = get_automl_settings() + automl_settings["model_path"] = each_model_path + + if each_data == "get_toy_data_regression": + automl_settings["task"] = "seq-regression" + automl_settings["metric"] = "pearsonr" + else: + automl_settings["task"] = "seq-classification" + automl_settings["metric"] = "accuracy" try: automl.fit( @@ -21,6 +112,9 @@ def test_classification_head(): except requests.exceptions.HTTPError: return + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": - test_classification_head() + _test_switch_classificationhead(data_list[0], model_path_list[0]) diff --git a/test/nlp/test_autohf_custom_metric.py b/test/nlp/test_autohf_custom_metric.py index ac38039b2..b28e48d35 100644 --- a/test/nlp/test_autohf_custom_metric.py +++ b/test/nlp/test_autohf_custom_metric.py @@ -1,6 +1,8 @@ import sys import pytest from utils import get_toy_data_seqclassification, get_automl_settings +import os +import shutil def custom_metric( @@ -81,6 +83,9 @@ def test_custom_metric(): del automl + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": test_custom_metric() diff --git a/test/nlp/test_autohf_cv.py b/test/nlp/test_autohf_cv.py index 2bc11c880..6e9162dd9 100644 --- a/test/nlp/test_autohf_cv.py +++ b/test/nlp/test_autohf_cv.py @@ -1,6 +1,8 @@ import sys import pytest from utils import get_toy_data_seqclassification, get_automl_settings +import os +import shutil @pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os") @@ -19,6 +21,9 @@ def test_cv(): except requests.exceptions.HTTPError: return + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": test_cv() diff --git a/test/nlp/test_autohf_multichoice_classification.py b/test/nlp/test_autohf_multichoice_classification.py index 4f4cffa75..61691141e 100644 --- a/test/nlp/test_autohf_multichoice_classification.py +++ b/test/nlp/test_autohf_multichoice_classification.py @@ -1,6 +1,8 @@ import sys import pytest from utils import get_toy_data_multiplechoiceclassification, get_automl_settings 
+import os +import shutil @pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os") @@ -46,6 +48,9 @@ def test_mcc(): accuracy = round(true_count / len(y_pred), 5) print("Accuracy: " + str(accuracy)) + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": test_mcc() diff --git a/test/nlp/test_autohf_regression.py b/test/nlp/test_autohf_regression.py index a5f208520..cb01a6152 100644 --- a/test/nlp/test_autohf_regression.py +++ b/test/nlp/test_autohf_regression.py @@ -1,6 +1,8 @@ import sys import pytest from utils import get_toy_data_seqregression, get_automl_settings +import os +import shutil @pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os") @@ -32,6 +34,9 @@ def test_regression(): ) automl.predict(X_val) + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": test_regression() diff --git a/test/nlp/test_autohf_summarization.py b/test/nlp/test_autohf_summarization.py index 8cde806a9..9e21984d6 100644 --- a/test/nlp/test_autohf_summarization.py +++ b/test/nlp/test_autohf_summarization.py @@ -2,6 +2,8 @@ import sys import pytest import requests from utils import get_toy_data_summarization, get_automl_settings +import os +import shutil @pytest.mark.skipif( @@ -48,6 +50,9 @@ def test_summarization(): ) automl.predict(X_test) + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": test_summarization() diff --git a/test/nlp/test_autohf_tokenclassification.py b/test/nlp/test_autohf_tokenclassification.py index cb1c40e9d..051c2bf41 100644 --- a/test/nlp/test_autohf_tokenclassification.py +++ b/test/nlp/test_autohf_tokenclassification.py @@ -1,6 +1,8 @@ import sys import pytest import requests +import os +import shutil from utils import ( get_toy_data_tokenclassification_idlabel, get_toy_data_tokenclassification_tokenlabel, @@ -62,6 +64,9 @@ def test_tokenclassification_idlabel(): if min_inter_result != sys.maxsize: assert val_loss == min_inter_result + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + @pytest.mark.skipif( sys.platform == "darwin" or sys.version < "3.7", @@ -106,6 +111,9 @@ def test_tokenclassification_tokenlabel(): if min_inter_result != sys.maxsize: assert val_loss == min_inter_result + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + if __name__ == "__main__": test_tokenclassification_idlabel() diff --git a/test/nlp/test_default.py b/test/nlp/test_default.py index cf7c02ce5..fcddcda45 100644 --- a/test/nlp/test_default.py +++ b/test/nlp/test_default.py @@ -1,6 +1,8 @@ from utils import get_toy_data_seqclassification, get_automl_settings import sys from flaml.default import portfolio +import os +import shutil def pop_args(fit_kwargs): @@ -80,6 +82,9 @@ def test_starting_point_not_in_search_space(): == "albert-base-v2" ) + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + def test_points_to_evaluate(): from flaml import AutoML @@ -99,6 +104,9 @@ def test_points_to_evaluate(): automl.fit(X_train, y_train, **automl_settings) + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + # TODO: implement _test_zero_shot_model def test_zero_shot_nomodel(): @@ -131,6 +139,9 @@ def test_zero_shot_nomodel(): pop_args(fit_kwargs) model.fit(X_train, y_train, **fit_kwargs) + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") + def 
test_build_error_portfolio(path="./test/nlp/default", strategy="greedy"): import os @@ -159,3 +170,9 @@ def test_build_error_portfolio(path="./test/nlp/default", strategy="greedy"): ) except ValueError: print("Feature not implemented") + + import os + import shutil + + if os.path.exists("test/data/output/"): + shutil.rmtree("test/data/output/") diff --git a/test/nlp/utils.py b/test/nlp/utils.py index 18bb92bc1..f8536b960 100644 --- a/test/nlp/utils.py +++ b/test/nlp/utils.py @@ -70,23 +70,19 @@ def get_toy_data_seqclassification(): return X_train, y_train, X_val, y_val, X_test -def get_toy_data_multiclassclassification(): +def get_toy_data_binclassification(): train_data = { "text": [ "i didnt feel humiliated", "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake", - "im grabbing a minute to post i feel greedy wrong", "i am ever feeling nostalgic about the fireplace i will know that it is still on the property", - "i am feeling grouchy", "ive been feeling a little burdened lately wasnt sure why that was", - "ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny", - "i feel as confused about life as a teenager or as jaded as a year old man", "i have been with petronas for years i feel that petronas has performed well and made a huge profit", "i feel romantic too", "i feel like i have to make the suffering i m seeing mean something", "i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter", ], - "label": [0, 0, 3, 2, 3, 0, 5, 4, 1, 2, 0, 1], + "label": [0, 0, 1, 0, 1, 1, 0, 1], } train_dataset = pd.DataFrame(train_data) @@ -95,9 +91,84 @@ def get_toy_data_multiclassclassification(): "i think it s the easiest time of year to feel dissatisfied", "i feel low energy i m just thirsty", "i have immense sympathy with the general point but as a possible proto writer trying to find time to write in the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious", - "i do not feel reassured anxiety is on each side", ], - "label": [3, 0, 1, 1], + "label": [0, 1, 1], + } + dev_dataset = pd.DataFrame(dev_data) + + custom_sent_keys = ["text"] + label_key = "label" + + X_train = train_dataset[custom_sent_keys] + y_train = train_dataset[label_key] + + X_val = dev_dataset[custom_sent_keys] + y_val = dev_dataset[label_key] + + return X_train, y_train, X_val, y_val + + +def get_toy_data_regression(): + train_data = { + "text": [ + "i didnt feel humiliated", + "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake", + "i am ever feeling nostalgic about the fireplace i will know that it is still on the property", + "ive been feeling a little burdened lately wasnt sure why that was", + "i have been with petronas for years i feel that petronas has performed well and made a huge profit", + "i feel romantic too", + "i feel like i have to make the suffering i m seeing mean something", + "i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter", + ], + "label": [1.0, 1.0, 3.0, 1.0, 5.0, 5.0, 1.0, 3.0], + } + train_dataset = pd.DataFrame(train_data) + + dev_data = { + "text": [ + "i think it s the easiest time of year to feel dissatisfied", + "i feel low energy i m just thirsty", + "i have immense sympathy with the general point but as a possible proto writer trying to find time to write in 
the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious", + ], + "label": [1.0, 3.0, 3.0], + } + dev_dataset = pd.DataFrame(dev_data) + + custom_sent_keys = ["text"] + label_key = "label" + + X_train = train_dataset[custom_sent_keys] + y_train = train_dataset[label_key] + + X_val = dev_dataset[custom_sent_keys] + y_val = dev_dataset[label_key] + + return X_train, y_train, X_val, y_val + + +def get_toy_data_multiclassclassification(): + train_data = { + "text": [ + "i didnt feel humiliated", + "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake", + "i am ever feeling nostalgic about the fireplace i will know that it is still on the property", + "ive been feeling a little burdened lately wasnt sure why that was", + "i have been with petronas for years i feel that petronas has performed well and made a huge profit", + "i feel romantic too", + "i feel like i have to make the suffering i m seeing mean something", + "i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter", + ], + "label": [0, 0, 2, 0, 1, 2, 0, 1], + } + train_dataset = pd.DataFrame(train_data) + + dev_data = { + "text": [ + "i think it s the easiest time of year to feel dissatisfied", + "i feel low energy i m just thirsty", + "i have immense sympathy with the general point but as a possible proto writer trying to find time to write in the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious", + ], + "label": [0, 1, 1], } dev_dataset = pd.DataFrame(dev_data) diff --git a/website/docs/Examples/AutoML-NLP.md b/website/docs/Examples/AutoML-NLP.md index 17181d786..7ef0f6c7a 100644 --- a/website/docs/Examples/AutoML-NLP.md +++ b/website/docs/Examples/AutoML-NLP.md @@ -38,6 +38,13 @@ automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_ automl.predict(X_test) ``` +Notice that after you run `automl.fit`, the intermediate checkpoints are saved under the specified output_dir `data/output`. You can use the following code to clean these outputs if they consume a large storage space: + +```python +if os.path.exists("data/output/"): + shutil.rmtree("data/output/") +``` + #### Sample output ```
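The deleted `flaml/nlp/huggingface/switch_head_auto.py` existed to rebuild the classification head by hand whenever a checkpoint had been fine-tuned with a different number of labels than the target task, and it only knew how to do so for `electra` and `roberta`. The diff replaces that machinery with `ignore_mismatched_sizes=True`, which tells `from_pretrained` to skip any pretrained weights whose shapes disagree with the new config and leave those layers freshly initialized. A minimal sketch of the new loading path, using an illustrative checkpoint from the test list and an assumed label count of 3:

```python
# Load a 2-label SST-2 checkpoint as a 3-label classifier: the shape-mismatched
# head weights are discarded and re-initialized instead of raising an error.
# The checkpoint name and num_labels are illustrative assumptions.
from transformers import AutoConfig, AutoModelForSequenceClassification

checkpoint = "textattack/bert-base-uncased-SST-2"  # head trained with 2 labels
config = AutoConfig.from_pretrained(checkpoint, num_labels=3)

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    config=config,
    ignore_mismatched_sizes=True,
)
assert model.config.num_labels == 3
```

Because nothing in this path depends on the model architecture, the electra/roberta-specific head mapping is no longer needed.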
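`load_model` also now resizes the token embeddings of every loaded model to the checkpoint config's `vocab_size`, rather than only in the old sequence-classification branch. A brief illustration of that call, with an assumed checkpoint:

```python
# Make the input embedding matrix match the vocabulary size reported by the
# checkpoint's config; a no-op when they already agree.
from transformers import AutoConfig, AutoModelForSequenceClassification

checkpoint = "textattack/bert-base-uncased-SST-2"  # illustrative assumption
config = AutoConfig.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)

model.resize_token_embeddings(config.vocab_size)
assert model.get_input_embeddings().weight.shape[0] == config.vocab_size
```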
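The nine `test_switch_*_*` wrappers in `test_autohf_classificationhead.py` spell out the 3×3 grid of toy datasets and checkpoints by hand, which keeps each combination selectable as its own named test. A hypothetical `pytest.mark.parametrize` version that generates the same grid (not part of the diff; `_test_switch_classificationhead` is assumed to be the helper defined in that file):

```python
# Hypothetical alternative to test_switch_1_1 ... test_switch_3_3: let pytest
# expand the dataset x checkpoint cross product.
import pytest

data_list = [
    "get_toy_data_regression",
    "get_toy_data_binclassification",
    "get_toy_data_multiclassclassification",
]
model_path_list = [
    "textattack/bert-base-uncased-STS-B",
    "textattack/bert-base-uncased-SST-2",
    "textattack/bert-base-uncased-MNLI",
]


@pytest.mark.parametrize("each_data", data_list)
@pytest.mark.parametrize("each_model_path", model_path_list)
def test_switch_classificationhead(each_data, each_model_path):
    _test_switch_classificationhead(each_data, each_model_path)
```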
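Every test module in the diff now ends with the same `shutil.rmtree("test/data/output/")` block, and it only runs when the test body reaches it; a failing test leaves the checkpoints behind. A sketch of a shared, autouse fixture that would do the same cleanup after every test, pass or fail (hypothetical, e.g. a `test/nlp/conftest.py`, which this diff does not add):

```python
# Hypothetical conftest.py fixture: remove the checkpoint output directory
# after each test in test/nlp, regardless of the test's outcome.
import os
import shutil

import pytest


@pytest.fixture(autouse=True)
def clean_output_dir():
    yield  # run the test first
    if os.path.exists("test/data/output/"):
        shutil.rmtree("test/data/output/")
```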
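The snippet added to `website/docs/Examples/AutoML-NLP.md` calls `os.path.exists` and `shutil.rmtree` without showing their imports. A self-contained version of that cleanup, assuming the example's `output_dir` of `data/output`:

```python
# Delete the intermediate checkpoints written under the example's output_dir.
import os
import shutil

if os.path.exists("data/output/"):
    shutil.rmtree("data/output/")
```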