diff --git a/apps/language_models/scripts/vicuna.py b/apps/language_models/scripts/vicuna.py
index c851b58c..264056aa 100644
--- a/apps/language_models/scripts/vicuna.py
+++ b/apps/language_models/scripts/vicuna.py
@@ -706,20 +706,13 @@ class ShardedVicuna(SharkLLMBase):
             quantize_model(
                 get_model_impl(vicuna_model).layers,
                 dtype=torch.float32,
-                weight_quant_type="asym",
                 weight_bit_width=weight_bit_width,
                 weight_param_method="stats",
                 weight_scale_precision="float",
+                weight_quant_type="asym",
                 weight_quant_granularity="per_group",
                 weight_group_size=self.weight_group_size,
                 quantize_weight_zero_point=False,
-                input_bit_width=None,
-                input_scale_type="float",
-                input_param_method="stats",
-                input_quant_type="asym",
-                input_quant_granularity="per_tensor",
-                quantize_input_zero_point=False,
-                seqlen=2048,
             )
             print("Weight quantization applied.")
 
diff --git a/apps/language_models/src/model_wrappers/vicuna_model.py b/apps/language_models/src/model_wrappers/vicuna_model.py
index 024d692d..8533656f 100644
--- a/apps/language_models/src/model_wrappers/vicuna_model.py
+++ b/apps/language_models/src/model_wrappers/vicuna_model.py
@@ -26,20 +26,13 @@ class FirstVicuna(torch.nn.Module):
             quantize_model(
                 get_model_impl(self.model).layers,
                 dtype=torch.float32,
-                weight_quant_type="asym",
                 weight_bit_width=weight_bit_width,
                 weight_param_method="stats",
                 weight_scale_precision="float",
+                weight_quant_type="asym",
                 weight_quant_granularity="per_group",
                 weight_group_size=weight_group_size,
                 quantize_weight_zero_point=False,
-                input_bit_width=None,
-                input_scale_type="float",
-                input_param_method="stats",
-                input_quant_type="asym",
-                input_quant_granularity="per_tensor",
-                quantize_input_zero_point=False,
-                seqlen=2048,
             )
             print("Weight quantization applied.")
 
@@ -75,20 +68,13 @@ class SecondVicuna(torch.nn.Module):
             quantize_model(
                 get_model_impl(self.model).layers,
                 dtype=torch.float32,
-                weight_quant_type="asym",
                 weight_bit_width=weight_bit_width,
                 weight_param_method="stats",
                 weight_scale_precision="float",
+                weight_quant_type="asym",
                 weight_quant_granularity="per_group",
                 weight_group_size=weight_group_size,
                 quantize_weight_zero_point=False,
-                input_bit_width=None,
-                input_scale_type="float",
-                input_param_method="stats",
-                input_quant_type="asym",
-                input_quant_granularity="per_tensor",
-                quantize_input_zero_point=False,
-                seqlen=2048,
             )
             print("Weight quantization applied.")
 
diff --git a/requirements.txt b/requirements.txt
index 2f20729e..b0206372 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -39,5 +39,5 @@
 joblib # for langchain
 pefile
 pyinstaller
-# low precision vicuna
-brevitas @ git+https://github.com/Xilinx/brevitas.git@llm
+# vicuna quantization
+brevitas @ git+https://github.com/Xilinx/brevitas.git@dev
diff --git a/setup_venv.sh b/setup_venv.sh
index 05a7b96b..6dbf25a4 100755
--- a/setup_venv.sh
+++ b/setup_venv.sh
@@ -159,5 +159,3 @@ if [[ -z "${CONDA_PREFIX}" && "$SKIP_VENV" != "1" ]]; then
   echo "${Green}Before running examples activate venv with:"
   echo " ${Green}source $VENV_DIR/bin/activate"
 fi
-
-$PYTHON -m pip install git+https://github.com/Xilinx/brevitas.git@llm