mirror of
https://github.com/nod-ai/AMD-SHARK-Studio.git
synced 2026-04-25 03:00:12 -04:00
Fix Langchain multiple device issue (#1688)
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
1.) Install all the dependencies by running:
|
||||
```shell
|
||||
pip install -r apps/language_models/langchain/langchain_requirements.txt
|
||||
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libtesseract-dev libreoffice
|
||||
```
|
||||
|
||||
2.) Create a folder named `user_path` in `apps/language_models/langchain/` directory.
|
||||
|
||||
@@ -687,6 +687,7 @@ class Langchain:
|
||||
langchain_mode1,
|
||||
user_path,
|
||||
hf_embedding_model,
|
||||
device=self.device,
|
||||
kwargs_make_db=locals(),
|
||||
)
|
||||
finally:
|
||||
@@ -811,7 +812,7 @@ class Langchain:
|
||||
)
|
||||
)
|
||||
if base_model1 and not login_mode_if_model0:
|
||||
model0, tokenizer0, device = self.get_model(
|
||||
model0, tokenizer0, _ = self.get_model(
|
||||
reward_type=False,
|
||||
**get_kwargs(
|
||||
self.get_model,
|
||||
@@ -821,7 +822,7 @@ class Langchain:
|
||||
)
|
||||
else:
|
||||
# if empty model, then don't load anything, just get gradio up
|
||||
model0, tokenizer0, device = None, None, None
|
||||
model0, tokenizer0, _ = None, None, None
|
||||
if model0 is None:
|
||||
if fail_if_cannot_connect:
|
||||
raise RuntimeError("Could not connect, see logs")
|
||||
@@ -830,7 +831,7 @@ class Langchain:
|
||||
model_lock.remove(model_dict)
|
||||
continue
|
||||
model_state_trial = dict(
|
||||
model=model0, tokenizer=tokenizer0, device=device
|
||||
model=model0, tokenizer=tokenizer0, device=self.device
|
||||
)
|
||||
model_state_trial.update(model_dict)
|
||||
assert len(model_state_none) == len(model_state_trial)
|
||||
@@ -846,7 +847,7 @@ class Langchain:
|
||||
|
||||
# get score model
|
||||
all_kwargs = locals().copy()
|
||||
smodel, stokenizer, sdevice = self.get_score_model(
|
||||
smodel, stokenizer, _ = self.get_score_model(
|
||||
reward_type=True,
|
||||
**get_kwargs(
|
||||
self.get_score_model,
|
||||
@@ -857,7 +858,7 @@ class Langchain:
|
||||
score_model_state0 = dict(
|
||||
model=smodel,
|
||||
tokenizer=stokenizer,
|
||||
device=sdevice,
|
||||
device=self.device,
|
||||
base_model=score_model,
|
||||
tokenizer_base_model="",
|
||||
lora_weights="",
|
||||
@@ -959,6 +960,7 @@ class Langchain:
|
||||
Ensure model gets on correct device
|
||||
"""
|
||||
|
||||
device_map = None
|
||||
if model is not None:
|
||||
# NOTE: Can specify max_memory={0: max_mem, 1: max_mem}, to shard model
|
||||
# NOTE: Some models require avoiding sharding some layers,
|
||||
@@ -975,25 +977,25 @@ class Langchain:
|
||||
dtype=torch.float16 if load_half else torch.float32,
|
||||
)
|
||||
device_map.update(device_map_model)
|
||||
else:
|
||||
device_map = "auto"
|
||||
|
||||
n_gpus = torch.cuda.device_count() if torch.cuda.is_available else 0
|
||||
|
||||
if n_gpus > 0:
|
||||
if gpu_id >= 0:
|
||||
# FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
|
||||
# So avoid for now, just put on first GPU, unless score_model, put on last
|
||||
if reward_type:
|
||||
device_map = {"": n_gpus - 1}
|
||||
else:
|
||||
device_map = {"": min(n_gpus - 1, gpu_id)}
|
||||
if gpu_id == -1:
|
||||
device_map = {"": "cuda"}
|
||||
else:
|
||||
device_map = {"": "cpu"}
|
||||
model_kwargs["load_in_8bit"] = False
|
||||
model_kwargs["load_in_4bit"] = False
|
||||
if device_map is None:
|
||||
if self.device == "cuda":
|
||||
if n_gpus > 0:
|
||||
if gpu_id >= 0:
|
||||
# FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
|
||||
# So avoid for now, just put on first GPU, unless score_model, put on last
|
||||
if reward_type:
|
||||
device_map = {"": n_gpus - 1}
|
||||
else:
|
||||
device_map = {"": min(n_gpus - 1, gpu_id)}
|
||||
if gpu_id == -1:
|
||||
device_map = {"": "cuda"}
|
||||
else:
|
||||
device_map = {"": "cpu"}
|
||||
model_kwargs["load_in_8bit"] = False
|
||||
model_kwargs["load_in_4bit"] = False
|
||||
print("device_map: %s" % device_map, flush=True)
|
||||
|
||||
load_in_8bit = model_kwargs.get("load_in_8bit", False)
|
||||
@@ -1265,8 +1267,8 @@ class Langchain:
|
||||
if base_model in non_hf_types:
|
||||
from gpt4all_llm import get_model_tokenizer_gpt4all
|
||||
|
||||
model, tokenizer, device = get_model_tokenizer_gpt4all(base_model)
|
||||
return model, tokenizer, device
|
||||
model, tokenizer, _ = get_model_tokenizer_gpt4all(base_model)
|
||||
return model, tokenizer, self.device
|
||||
|
||||
# get local torch-HF model
|
||||
return self.get_hf_model(
|
||||
@@ -1276,7 +1278,7 @@ class Langchain:
|
||||
load_gptq=load_gptq,
|
||||
use_safetensors=use_safetensors,
|
||||
infer_devices=infer_devices,
|
||||
device=device,
|
||||
device=self.device,
|
||||
base_model=base_model,
|
||||
tokenizer_base_model=tokenizer_base_model,
|
||||
lora_weights=lora_weights,
|
||||
@@ -1325,8 +1327,6 @@ class Langchain:
|
||||
if lora_weights is not None and lora_weights.strip():
|
||||
if verbose:
|
||||
print("Get %s lora weights" % lora_weights, flush=True)
|
||||
if device is None:
|
||||
device = get_device()
|
||||
|
||||
if "gpt2" in base_model.lower():
|
||||
# RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Half
|
||||
@@ -1365,19 +1365,19 @@ class Langchain:
|
||||
model = model_loader(
|
||||
tokenizer,
|
||||
model=base_model,
|
||||
device=0 if device == "cuda" else -1,
|
||||
device=0 if self.device == "cuda" else -1,
|
||||
torch_dtype=torch.float16
|
||||
if device == "cuda"
|
||||
if self.device == "cuda"
|
||||
else torch.float32,
|
||||
)
|
||||
else:
|
||||
assert device in ["cuda", "cpu", "mps"], (
|
||||
"Unsupported device %s" % device
|
||||
assert self.device in ["cuda", "cpu", "mps"], (
|
||||
"Unsupported device %s" % self.device
|
||||
)
|
||||
model_kwargs = dict(
|
||||
local_files_only=local_files_only,
|
||||
torch_dtype=torch.float16
|
||||
if device == "cuda"
|
||||
if self.device == "cuda"
|
||||
else torch.float32,
|
||||
resume_download=resume_download,
|
||||
use_auth_token=use_auth_token,
|
||||
@@ -1392,7 +1392,7 @@ class Langchain:
|
||||
infer_devices
|
||||
and gpu_id is not None
|
||||
and gpu_id >= 0
|
||||
and device == "cuda"
|
||||
and self.device == "cuda"
|
||||
):
|
||||
device_map = {"": gpu_id}
|
||||
else:
|
||||
@@ -1412,14 +1412,16 @@ class Langchain:
|
||||
# MPT doesn't support spreading over GPUs
|
||||
model_kwargs.update(
|
||||
dict(
|
||||
device_map={"": gpu_id} if device == "cuda" else "cpu"
|
||||
device_map={"": gpu_id}
|
||||
if self.device == "cuda"
|
||||
else "cpu"
|
||||
)
|
||||
)
|
||||
|
||||
if "OpenAssistant/reward-model".lower() in base_model.lower():
|
||||
# FIXME: could put on other GPUs
|
||||
model_kwargs["device_map"] = (
|
||||
{"": 0} if device == "cuda" else {"": "cpu"}
|
||||
{"": 0} if self.device == "cuda" else {"": "cpu"}
|
||||
)
|
||||
model_kwargs.pop("torch_dtype", None)
|
||||
self.pop_unused_model_kwargs(model_kwargs)
|
||||
@@ -1427,7 +1429,7 @@ class Langchain:
|
||||
if not lora_weights:
|
||||
# torch.device context uses twice memory for AutoGPTQ
|
||||
context = NullContext if load_gptq else torch.device
|
||||
with context(device):
|
||||
with context(self.device):
|
||||
if infer_devices:
|
||||
config, model = self.get_config(
|
||||
base_model,
|
||||
@@ -1472,7 +1474,7 @@ class Langchain:
|
||||
model,
|
||||
lora_weights,
|
||||
torch_dtype=torch.float16
|
||||
if device == "cuda"
|
||||
if self.device == "cuda"
|
||||
else torch.float32,
|
||||
local_files_only=local_files_only,
|
||||
resume_download=resume_download,
|
||||
@@ -1480,11 +1482,11 @@ class Langchain:
|
||||
trust_remote_code=trust_remote_code,
|
||||
offload_folder=offload_folder,
|
||||
device_map={"": 0}
|
||||
if device == "cuda"
|
||||
if self.device == "cuda"
|
||||
else {"": "cpu"}, # seems to be required
|
||||
)
|
||||
else:
|
||||
with torch.device(device):
|
||||
with torch.device(self.device):
|
||||
config, _ = self.get_config(
|
||||
base_model, raise_exception=True, **config_kwargs
|
||||
)
|
||||
@@ -1499,7 +1501,7 @@ class Langchain:
|
||||
model,
|
||||
lora_weights,
|
||||
torch_dtype=torch.float16
|
||||
if device == "cuda"
|
||||
if self.device == "cuda"
|
||||
else torch.float32,
|
||||
local_files_only=local_files_only,
|
||||
resume_download=resume_download,
|
||||
@@ -1535,7 +1537,7 @@ class Langchain:
|
||||
config, tokenizer, verbose=False, reward_type=reward_type
|
||||
)
|
||||
|
||||
return model, tokenizer, device
|
||||
return model, tokenizer, self.device
|
||||
|
||||
def set_model_max_len(
|
||||
self, config, tokenizer, verbose=False, reward_type=False
|
||||
@@ -1609,15 +1611,15 @@ class Langchain:
|
||||
inference_server = ""
|
||||
llama_type = False
|
||||
compile_model = False
|
||||
smodel, stokenizer, sdevice = self.get_model(
|
||||
smodel, stokenizer, _ = self.get_model(
|
||||
reward_type=True,
|
||||
**get_kwargs(
|
||||
self.get_model, exclude_names=["reward_type"], **locals()
|
||||
),
|
||||
)
|
||||
else:
|
||||
smodel, stokenizer, sdevice = None, None, None
|
||||
return smodel, stokenizer, sdevice
|
||||
smodel, stokenizer, _ = None, None, None
|
||||
return smodel, stokenizer, self.device
|
||||
|
||||
def evaluate(
|
||||
self,
|
||||
@@ -1763,7 +1765,6 @@ class Langchain:
|
||||
# get variables
|
||||
model = chosen_model_state["model"]
|
||||
tokenizer = chosen_model_state["tokenizer"]
|
||||
device = chosen_model_state["device"]
|
||||
base_model = chosen_model_state["base_model"]
|
||||
tokenizer_base_model = chosen_model_state["tokenizer_base_model"]
|
||||
lora_weights = chosen_model_state["lora_weights"]
|
||||
@@ -1952,6 +1953,7 @@ class Langchain:
|
||||
lora_weights=lora_weights,
|
||||
auto_reduce_chunks=auto_reduce_chunks,
|
||||
max_chunks=max_chunks,
|
||||
device=self.device,
|
||||
):
|
||||
(
|
||||
outr,
|
||||
@@ -2403,7 +2405,7 @@ class Langchain:
|
||||
prompt_type,
|
||||
prompt_dict,
|
||||
tokenizer,
|
||||
device,
|
||||
self.device,
|
||||
model_max_length=tokenizer.model_max_length,
|
||||
)
|
||||
|
||||
@@ -2412,7 +2414,7 @@ class Langchain:
|
||||
inputs = tokenizer(prompt, return_tensors="pt")
|
||||
if debug and len(inputs["input_ids"]) > 0:
|
||||
print("input_ids length", len(inputs["input_ids"][0]), flush=True)
|
||||
input_ids = inputs["input_ids"].to(device)
|
||||
input_ids = inputs["input_ids"].to(self.device)
|
||||
# CRITICAL LIMIT else will fail
|
||||
max_max_tokens = tokenizer.model_max_length
|
||||
max_input_tokens = max_max_tokens - min_new_tokens
|
||||
@@ -2498,10 +2500,12 @@ class Langchain:
|
||||
have_lora_weights = lora_weights not in [no_lora_str, "", None]
|
||||
context_class_cast = (
|
||||
NullContext
|
||||
if device == "cpu" or have_lora_weights or device == "mps"
|
||||
if self.device == "cpu"
|
||||
or have_lora_weights
|
||||
or self.device == "mps"
|
||||
else torch.autocast
|
||||
)
|
||||
with context_class_cast(device):
|
||||
with context_class_cast(self.device):
|
||||
# protection for gradio not keeping track of closed users,
|
||||
# else hit bitsandbytes lack of thread safety:
|
||||
# https://github.com/h2oai/h2ogpt/issues/104
|
||||
|
||||
@@ -44,7 +44,6 @@ from utils import (
|
||||
makedirs,
|
||||
get_url,
|
||||
flatten_list,
|
||||
get_device,
|
||||
ProgressParallel,
|
||||
remove,
|
||||
hash_file,
|
||||
@@ -92,6 +91,7 @@ from langchain.chains.question_answering import load_qa_chain
|
||||
from langchain.docstore.document import Document
|
||||
from langchain import PromptTemplate, HuggingFaceTextGenInference
|
||||
from langchain.vectorstores import Chroma
|
||||
from apps.stable_diffusion.src import args
|
||||
|
||||
|
||||
def get_db(
|
||||
@@ -371,8 +371,8 @@ def get_embedding(
|
||||
# to ensure can fork without deadlock
|
||||
from langchain.embeddings import HuggingFaceEmbeddings
|
||||
|
||||
device, torch_dtype, context_class = get_device_dtype()
|
||||
model_kwargs = dict(device=device)
|
||||
torch_dtype, context_class = get_dtype()
|
||||
model_kwargs = dict(device=args.device)
|
||||
if "instructor" in hf_embedding_model:
|
||||
encode_kwargs = {"normalize_embeddings": True}
|
||||
embedding = HuggingFaceInstructEmbeddings(
|
||||
@@ -907,7 +907,7 @@ def get_llm(
|
||||
# model_name = 'h2oai/h2ogpt-oig-oasst1-512-6_9b'
|
||||
# model_name = 'h2oai/h2ogpt-oasst1-512-20b'
|
||||
inference_server = ""
|
||||
model, tokenizer, device = Langchain.get_model(
|
||||
model, tokenizer, _ = Langchain.get_model(
|
||||
load_8bit=True,
|
||||
base_model=model_name,
|
||||
inference_server=inference_server,
|
||||
@@ -974,17 +974,15 @@ def get_llm(
|
||||
return llm, model_name, streamer, prompt_type
|
||||
|
||||
|
||||
def get_device_dtype():
|
||||
def get_dtype():
|
||||
# torch.device("cuda") leads to cuda:x cuda:y mismatches for multi-GPU consistently
|
||||
import torch
|
||||
|
||||
n_gpus = torch.cuda.device_count() if torch.cuda.is_available else 0
|
||||
device = "cpu" if n_gpus == 0 else "cuda"
|
||||
# from utils import NullContext
|
||||
# context_class = NullContext if n_gpus > 1 or n_gpus == 0 else context_class
|
||||
context_class = torch.device
|
||||
torch_dtype = torch.float16 if device == "cuda" else torch.float32
|
||||
return device, torch_dtype, context_class
|
||||
torch_dtype = torch.float16 if args.device == "cuda" else torch.float32
|
||||
return torch_dtype, context_class
|
||||
|
||||
|
||||
def get_wiki_data(
|
||||
@@ -1715,7 +1713,7 @@ def path_to_docs(
|
||||
caption_loader
|
||||
and not isinstance(caption_loader, (bool, str))
|
||||
and caption_loader.device != "cpu"
|
||||
or get_device() == "cuda"
|
||||
or args.device == "cuda"
|
||||
):
|
||||
# to avoid deadlocks, presume was preloaded and so can't fork due to cuda context
|
||||
n_jobs_image = 1
|
||||
@@ -2549,15 +2547,15 @@ def _run_qa_db(
|
||||
# context stuff similar to used in evaluate()
|
||||
import torch
|
||||
|
||||
device, torch_dtype, context_class = get_device_dtype()
|
||||
torch_dtype, context_class = get_dtype()
|
||||
with torch.no_grad():
|
||||
have_lora_weights = lora_weights not in [no_lora_str, "", None]
|
||||
context_class_cast = (
|
||||
NullContext
|
||||
if device == "cpu" or have_lora_weights
|
||||
if args.device == "cpu" or have_lora_weights
|
||||
else torch.autocast
|
||||
)
|
||||
with context_class_cast(device):
|
||||
with context_class_cast(args.device):
|
||||
answer = chain()
|
||||
|
||||
if not use_context:
|
||||
|
||||
@@ -28,6 +28,7 @@ global_precision = "fp16"
|
||||
if not args.run_docuchat_web:
|
||||
args.device = global_device
|
||||
args.precision = global_precision
|
||||
tensor_device = "cpu" if args.device == "cpu" else "cuda"
|
||||
|
||||
|
||||
class H2OGPTSHARKModel(torch.nn.Module):
|
||||
@@ -102,7 +103,7 @@ class H2OGPTSHARKModel(torch.nn.Module):
|
||||
"forward",
|
||||
(input_ids.to(device="cpu"), attention_mask.to(device="cpu")),
|
||||
)
|
||||
).to(device=args.device)
|
||||
).to(device=tensor_device)
|
||||
return result
|
||||
|
||||
|
||||
@@ -118,14 +119,14 @@ def pad_or_truncate_inputs(
|
||||
num_add_token = max_padding_length - inp_shape[1]
|
||||
padded_input_ids = torch.cat(
|
||||
[
|
||||
torch.tensor([[11] * num_add_token]).to(device=args.device),
|
||||
torch.tensor([[11] * num_add_token]).to(device=tensor_device),
|
||||
input_ids,
|
||||
],
|
||||
dim=1,
|
||||
)
|
||||
padded_attention_mask = torch.cat(
|
||||
[
|
||||
torch.tensor([[0] * num_add_token]).to(device=args.device),
|
||||
torch.tensor([[0] * num_add_token]).to(device=tensor_device),
|
||||
attention_mask,
|
||||
],
|
||||
dim=1,
|
||||
@@ -455,7 +456,7 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
|
||||
if isinstance(eos_token_id, int):
|
||||
eos_token_id = [eos_token_id]
|
||||
self.eos_token_id_tensor = (
|
||||
torch.tensor(eos_token_id).to(device=args.device)
|
||||
torch.tensor(eos_token_id).to(device=tensor_device)
|
||||
if eos_token_id is not None
|
||||
else None
|
||||
)
|
||||
@@ -533,7 +534,7 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
|
||||
self.input_ids = torch.cat(
|
||||
[
|
||||
torch.tensor(self.truncated_input_ids)
|
||||
.to(device=args.device)
|
||||
.to(device=tensor_device)
|
||||
.unsqueeze(dim=0),
|
||||
self.input_ids,
|
||||
],
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
# for generate (gradio server) and finetune
|
||||
datasets==2.13.0
|
||||
sentencepiece==0.1.99
|
||||
gradio==3.35.2
|
||||
huggingface_hub==0.15.1
|
||||
# gradio==3.37.0
|
||||
huggingface_hub==0.16.4
|
||||
appdirs==1.4.4
|
||||
fire==0.5.0
|
||||
docutils==0.20.1
|
||||
# torch==2.0.1; sys_platform != "darwin" and platform_machine != "arm64"
|
||||
evaluate==0.4.0
|
||||
rouge_score==0.1.2
|
||||
sacrebleu==2.3.1
|
||||
@@ -18,7 +19,9 @@ matplotlib==3.7.1
|
||||
loralib==0.1.1
|
||||
bitsandbytes==0.39.0
|
||||
accelerate==0.20.3
|
||||
git+https://github.com/huggingface/peft.git@0b62b4378b4ce9367932c73540349da9a41bdea8
|
||||
peft==0.4.0
|
||||
# 4.31.0+ breaks load_in_8bit=True (https://github.com/huggingface/transformers/issues/25026)
|
||||
# transformers==4.30.2
|
||||
tokenizers==0.13.3
|
||||
APScheduler==3.10.1
|
||||
|
||||
@@ -33,7 +36,7 @@ tensorboard==2.13.0
|
||||
neptune==1.2.0
|
||||
|
||||
# for gradio client
|
||||
gradio_client==0.2.7
|
||||
gradio_client==0.2.10
|
||||
beautifulsoup4==4.12.2
|
||||
markdown==3.4.3
|
||||
|
||||
@@ -43,8 +46,9 @@ pytest-xdist==3.2.1
|
||||
nltk==3.8.1
|
||||
textstat==0.7.3
|
||||
# pandoc==2.3
|
||||
#pypandoc==1.11
|
||||
pypandoc_binary==1.11
|
||||
pypandoc==1.11; sys_platform == "darwin" and platform_machine == "arm64"
|
||||
pypandoc_binary==1.11; platform_machine == "x86_64"
|
||||
pypandoc_binary==1.11; sys_platform == "win32"
|
||||
openpyxl==3.1.2
|
||||
lm_dataformat==0.0.20
|
||||
bioc==2.0
|
||||
@@ -104,3 +108,15 @@ pip-licenses==4.3.0
|
||||
|
||||
# weaviate vector db
|
||||
weaviate-client==3.22.1
|
||||
|
||||
gpt4all==1.0.5
|
||||
llama-cpp-python==0.1.73
|
||||
|
||||
arxiv==1.4.8
|
||||
pymupdf==1.22.5 # AGPL license
|
||||
# extract-msg==0.41.1 # GPL3
|
||||
|
||||
# sometimes unstructured fails, these work in those cases. See https://github.com/h2oai/h2ogpt/issues/320
|
||||
playwright==1.36.0
|
||||
# requires Chrome binary to be in path
|
||||
selenium==4.10.0
|
||||
|
||||
@@ -51,24 +51,26 @@ def chat(curr_system_message, history, device, precision):
|
||||
|
||||
if h2ogpt_model == 0:
|
||||
if "cuda" in device:
|
||||
device = "cuda"
|
||||
shark_device = "cuda"
|
||||
elif "sync" in device:
|
||||
device = "cpu"
|
||||
shark_device = "cpu"
|
||||
elif "task" in device:
|
||||
device = "cpu"
|
||||
shark_device = "cpu"
|
||||
elif "vulkan" in device:
|
||||
device = "vulkan"
|
||||
shark_device = "vulkan"
|
||||
else:
|
||||
print("unrecognized device")
|
||||
|
||||
args.device = device
|
||||
device = "cpu" if shark_device == "cpu" else "cuda"
|
||||
|
||||
args.device = shark_device
|
||||
args.precision = precision
|
||||
|
||||
from apps.language_models.langchain.gen import Langchain
|
||||
|
||||
langchain = Langchain(device, precision)
|
||||
h2ogpt_model, h2ogpt_tokenizer, _ = langchain.get_model(
|
||||
load_8bit=True
|
||||
load_4bit=True
|
||||
if device == "cuda"
|
||||
else False, # load model in 4bit if device is cuda to save memory
|
||||
load_gptq="",
|
||||
|
||||
Reference in New Issue
Block a user