Fix Langchain multiple device issue (#1688)

Author:       Vivek Khandelwal
Date:         2023-07-24 20:33:46 +05:30
Committed by: GitHub
Parent:       d7092aafaa
Commit:       f3cb63fc9c

6 changed files with 100 additions and 78 deletions
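The commit applies one pattern throughout: instead of re-detecting a device inside each loader and threading the result back through return values, the device chosen once from the SHARK `args` is stored on the `Langchain` instance and reused everywhere. A minimal sketch of that pattern, with a simplified class and a stand-in `_load` helper (illustrative only, not the actual h2ogpt/SHARK code):

```python
class Langchain:
    def __init__(self, device, precision):
        # The device is set once from the SHARK args (e.g. "cuda" or "cpu") and
        # is never re-detected inside the individual loaders.
        self.device = device
        self.precision = precision

    def _load(self, base_model):
        # Stand-in for the real model/tokenizer loading logic.
        return f"<model:{base_model}>", f"<tokenizer:{base_model}>"

    def get_model(self, base_model):
        model, tokenizer = self._load(base_model)
        # Callers that previously consumed a locally detected device returned by
        # this method now get the single instance-wide device instead.
        return model, tokenizer, self.device


langchain = Langchain("cpu", "fp16")
print(langchain.get_model("h2oai/h2ogpt-oig-oasst1-512-6_9b"))
```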


@@ -5,6 +5,7 @@
1.) Install all the dependencies by running:
```shell
pip install -r apps/language_models/langchain/langchain_requirements.txt
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libtesseract-dev libreoffice
```
2.) Create a folder named `user_path` in the `apps/language_models/langchain/` directory.


@@ -687,6 +687,7 @@ class Langchain:
langchain_mode1,
user_path,
hf_embedding_model,
device=self.device,
kwargs_make_db=locals(),
)
finally:
@@ -811,7 +812,7 @@ class Langchain:
)
)
if base_model1 and not login_mode_if_model0:
model0, tokenizer0, device = self.get_model(
model0, tokenizer0, _ = self.get_model(
reward_type=False,
**get_kwargs(
self.get_model,
@@ -821,7 +822,7 @@ class Langchain:
)
else:
# if empty model, then don't load anything, just get gradio up
model0, tokenizer0, device = None, None, None
model0, tokenizer0, _ = None, None, None
if model0 is None:
if fail_if_cannot_connect:
raise RuntimeError("Could not connect, see logs")
@@ -830,7 +831,7 @@ class Langchain:
model_lock.remove(model_dict)
continue
model_state_trial = dict(
model=model0, tokenizer=tokenizer0, device=device
model=model0, tokenizer=tokenizer0, device=self.device
)
model_state_trial.update(model_dict)
assert len(model_state_none) == len(model_state_trial)
@@ -846,7 +847,7 @@ class Langchain:
# get score model
all_kwargs = locals().copy()
smodel, stokenizer, sdevice = self.get_score_model(
smodel, stokenizer, _ = self.get_score_model(
reward_type=True,
**get_kwargs(
self.get_score_model,
@@ -857,7 +858,7 @@ class Langchain:
score_model_state0 = dict(
model=smodel,
tokenizer=stokenizer,
device=sdevice,
device=self.device,
base_model=score_model,
tokenizer_base_model="",
lora_weights="",
@@ -959,6 +960,7 @@ class Langchain:
Ensure model gets on correct device
"""
device_map = None
if model is not None:
# NOTE: Can specify max_memory={0: max_mem, 1: max_mem}, to shard model
# NOTE: Some models require avoiding sharding some layers,
@@ -975,25 +977,25 @@ class Langchain:
dtype=torch.float16 if load_half else torch.float32,
)
device_map.update(device_map_model)
else:
device_map = "auto"
n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
if n_gpus > 0:
if gpu_id >= 0:
# FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
# So avoid for now, just put on first GPU, unless score_model, put on last
if reward_type:
device_map = {"": n_gpus - 1}
else:
device_map = {"": min(n_gpus - 1, gpu_id)}
if gpu_id == -1:
device_map = {"": "cuda"}
else:
device_map = {"": "cpu"}
model_kwargs["load_in_8bit"] = False
model_kwargs["load_in_4bit"] = False
if device_map is None:
if self.device == "cuda":
if n_gpus > 0:
if gpu_id >= 0:
# FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
# So avoid for now, just put on first GPU, unless score_model, put on last
if reward_type:
device_map = {"": n_gpus - 1}
else:
device_map = {"": min(n_gpus - 1, gpu_id)}
if gpu_id == -1:
device_map = {"": "cuda"}
else:
device_map = {"": "cpu"}
model_kwargs["load_in_8bit"] = False
model_kwargs["load_in_4bit"] = False
print("device_map: %s" % device_map, flush=True)
load_in_8bit = model_kwargs.get("load_in_8bit", False)
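For context, a hedged sketch of how a `device_map` of the shape selected above (`{"": gpu_id}`, `{"": "cuda"}`, or `{"": "cpu"}`) is typically consumed by Hugging Face `from_pretrained`, assuming `transformers` and `accelerate` are installed (both appear in the requirements); the model id is a small placeholder, not one used by this repo:

```python
import torch
from transformers import AutoModelForCausalLM

# {"": 0} pins the entire model on GPU 0; {"": "cpu"} keeps every weight on CPU.
device_map = {"": 0} if torch.cuda.is_available() else {"": "cpu"}

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",  # placeholder model id for illustration only
    device_map=device_map,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)
print(model.device)
```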
@@ -1265,8 +1267,8 @@ class Langchain:
if base_model in non_hf_types:
from gpt4all_llm import get_model_tokenizer_gpt4all
model, tokenizer, device = get_model_tokenizer_gpt4all(base_model)
return model, tokenizer, device
model, tokenizer, _ = get_model_tokenizer_gpt4all(base_model)
return model, tokenizer, self.device
# get local torch-HF model
return self.get_hf_model(
@@ -1276,7 +1278,7 @@ class Langchain:
load_gptq=load_gptq,
use_safetensors=use_safetensors,
infer_devices=infer_devices,
device=device,
device=self.device,
base_model=base_model,
tokenizer_base_model=tokenizer_base_model,
lora_weights=lora_weights,
@@ -1325,8 +1327,6 @@ class Langchain:
if lora_weights is not None and lora_weights.strip():
if verbose:
print("Get %s lora weights" % lora_weights, flush=True)
if device is None:
device = get_device()
if "gpt2" in base_model.lower():
# RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Half
@@ -1365,19 +1365,19 @@ class Langchain:
model = model_loader(
tokenizer,
model=base_model,
device=0 if device == "cuda" else -1,
device=0 if self.device == "cuda" else -1,
torch_dtype=torch.float16
if device == "cuda"
if self.device == "cuda"
else torch.float32,
)
else:
assert device in ["cuda", "cpu", "mps"], (
"Unsupported device %s" % device
assert self.device in ["cuda", "cpu", "mps"], (
"Unsupported device %s" % self.device
)
model_kwargs = dict(
local_files_only=local_files_only,
torch_dtype=torch.float16
if device == "cuda"
if self.device == "cuda"
else torch.float32,
resume_download=resume_download,
use_auth_token=use_auth_token,
@@ -1392,7 +1392,7 @@ class Langchain:
infer_devices
and gpu_id is not None
and gpu_id >= 0
and device == "cuda"
and self.device == "cuda"
):
device_map = {"": gpu_id}
else:
@@ -1412,14 +1412,16 @@ class Langchain:
# MPT doesn't support spreading over GPUs
model_kwargs.update(
dict(
device_map={"": gpu_id} if device == "cuda" else "cpu"
device_map={"": gpu_id}
if self.device == "cuda"
else "cpu"
)
)
if "OpenAssistant/reward-model".lower() in base_model.lower():
# FIXME: could put on other GPUs
model_kwargs["device_map"] = (
{"": 0} if device == "cuda" else {"": "cpu"}
{"": 0} if self.device == "cuda" else {"": "cpu"}
)
model_kwargs.pop("torch_dtype", None)
self.pop_unused_model_kwargs(model_kwargs)
@@ -1427,7 +1429,7 @@ class Langchain:
if not lora_weights:
# torch.device context uses twice memory for AutoGPTQ
context = NullContext if load_gptq else torch.device
with context(device):
with context(self.device):
if infer_devices:
config, model = self.get_config(
base_model,
@@ -1472,7 +1474,7 @@ class Langchain:
model,
lora_weights,
torch_dtype=torch.float16
if device == "cuda"
if self.device == "cuda"
else torch.float32,
local_files_only=local_files_only,
resume_download=resume_download,
@@ -1480,11 +1482,11 @@ class Langchain:
trust_remote_code=trust_remote_code,
offload_folder=offload_folder,
device_map={"": 0}
if device == "cuda"
if self.device == "cuda"
else {"": "cpu"}, # seems to be required
)
else:
with torch.device(device):
with torch.device(self.device):
config, _ = self.get_config(
base_model, raise_exception=True, **config_kwargs
)
@@ -1499,7 +1501,7 @@ class Langchain:
model,
lora_weights,
torch_dtype=torch.float16
if device == "cuda"
if self.device == "cuda"
else torch.float32,
local_files_only=local_files_only,
resume_download=resume_download,
@@ -1535,7 +1537,7 @@ class Langchain:
config, tokenizer, verbose=False, reward_type=reward_type
)
return model, tokenizer, device
return model, tokenizer, self.device
def set_model_max_len(
self, config, tokenizer, verbose=False, reward_type=False
@@ -1609,15 +1611,15 @@ class Langchain:
inference_server = ""
llama_type = False
compile_model = False
smodel, stokenizer, sdevice = self.get_model(
smodel, stokenizer, _ = self.get_model(
reward_type=True,
**get_kwargs(
self.get_model, exclude_names=["reward_type"], **locals()
),
)
else:
smodel, stokenizer, sdevice = None, None, None
return smodel, stokenizer, sdevice
smodel, stokenizer, _ = None, None, None
return smodel, stokenizer, self.device
def evaluate(
self,
@@ -1763,7 +1765,6 @@ class Langchain:
# get variables
model = chosen_model_state["model"]
tokenizer = chosen_model_state["tokenizer"]
device = chosen_model_state["device"]
base_model = chosen_model_state["base_model"]
tokenizer_base_model = chosen_model_state["tokenizer_base_model"]
lora_weights = chosen_model_state["lora_weights"]
@@ -1952,6 +1953,7 @@ class Langchain:
lora_weights=lora_weights,
auto_reduce_chunks=auto_reduce_chunks,
max_chunks=max_chunks,
device=self.device,
):
(
outr,
@@ -2403,7 +2405,7 @@ class Langchain:
prompt_type,
prompt_dict,
tokenizer,
device,
self.device,
model_max_length=tokenizer.model_max_length,
)
@@ -2412,7 +2414,7 @@ class Langchain:
inputs = tokenizer(prompt, return_tensors="pt")
if debug and len(inputs["input_ids"]) > 0:
print("input_ids length", len(inputs["input_ids"][0]), flush=True)
input_ids = inputs["input_ids"].to(device)
input_ids = inputs["input_ids"].to(self.device)
# CRITICAL LIMIT else will fail
max_max_tokens = tokenizer.model_max_length
max_input_tokens = max_max_tokens - min_new_tokens
@@ -2498,10 +2500,12 @@ class Langchain:
have_lora_weights = lora_weights not in [no_lora_str, "", None]
context_class_cast = (
NullContext
if device == "cpu" or have_lora_weights or device == "mps"
if self.device == "cpu"
or have_lora_weights
or self.device == "mps"
else torch.autocast
)
with context_class_cast(device):
with context_class_cast(self.device):
# protection for gradio not keeping track of closed users,
# else hit bitsandbytes lack of thread safety:
# https://github.com/h2oai/h2ogpt/issues/104
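A minimal, self-contained sketch of the context selection above, using `contextlib.nullcontext` in place of the repo's `NullContext` helper and a placeholder LoRA flag:

```python
import contextlib
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
have_lora_weights = False  # placeholder flag for this illustration

# autocast only pays off on CUDA; CPU/MPS (and LoRA-patched models) fall back
# to a no-op context manager so generation still runs unchanged.
context_class_cast = (
    contextlib.nullcontext
    if device in ("cpu", "mps") or have_lora_weights
    else torch.autocast
)
with context_class_cast(device):
    pass  # tokenization / generation would run here
```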


@@ -44,7 +44,6 @@ from utils import (
makedirs,
get_url,
flatten_list,
get_device,
ProgressParallel,
remove,
hash_file,
@@ -92,6 +91,7 @@ from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain import PromptTemplate, HuggingFaceTextGenInference
from langchain.vectorstores import Chroma
from apps.stable_diffusion.src import args
def get_db(
@@ -371,8 +371,8 @@ def get_embedding(
# to ensure can fork without deadlock
from langchain.embeddings import HuggingFaceEmbeddings
device, torch_dtype, context_class = get_device_dtype()
model_kwargs = dict(device=device)
torch_dtype, context_class = get_dtype()
model_kwargs = dict(device=args.device)
if "instructor" in hf_embedding_model:
encode_kwargs = {"normalize_embeddings": True}
embedding = HuggingFaceInstructEmbeddings(
@@ -907,7 +907,7 @@ def get_llm(
# model_name = 'h2oai/h2ogpt-oig-oasst1-512-6_9b'
# model_name = 'h2oai/h2ogpt-oasst1-512-20b'
inference_server = ""
model, tokenizer, device = Langchain.get_model(
model, tokenizer, _ = Langchain.get_model(
load_8bit=True,
base_model=model_name,
inference_server=inference_server,
@@ -974,17 +974,15 @@ def get_llm(
return llm, model_name, streamer, prompt_type
def get_device_dtype():
def get_dtype():
# torch.device("cuda") leads to cuda:x cuda:y mismatches for multi-GPU consistently
import torch
n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
device = "cpu" if n_gpus == 0 else "cuda"
# from utils import NullContext
# context_class = NullContext if n_gpus > 1 or n_gpus == 0 else context_class
context_class = torch.device
torch_dtype = torch.float16 if device == "cuda" else torch.float32
return device, torch_dtype, context_class
torch_dtype = torch.float16 if args.device == "cuda" else torch.float32
return torch_dtype, context_class
def get_wiki_data(
@@ -1715,7 +1713,7 @@ def path_to_docs(
caption_loader
and not isinstance(caption_loader, (bool, str))
and caption_loader.device != "cpu"
or get_device() == "cuda"
or args.device == "cuda"
):
# to avoid deadlocks, presume was preloaded and so can't fork due to cuda context
n_jobs_image = 1
@@ -2549,15 +2547,15 @@ def _run_qa_db(
# context stuff similar to used in evaluate()
import torch
device, torch_dtype, context_class = get_device_dtype()
torch_dtype, context_class = get_dtype()
with torch.no_grad():
have_lora_weights = lora_weights not in [no_lora_str, "", None]
context_class_cast = (
NullContext
if device == "cpu" or have_lora_weights
if args.device == "cpu" or have_lora_weights
else torch.autocast
)
with context_class_cast(device):
with context_class_cast(args.device):
answer = chain()
if not use_context:


@@ -28,6 +28,7 @@ global_precision = "fp16"
if not args.run_docuchat_web:
args.device = global_device
args.precision = global_precision
tensor_device = "cpu" if args.device == "cpu" else "cuda"
class H2OGPTSHARKModel(torch.nn.Module):
@@ -102,7 +103,7 @@ class H2OGPTSHARKModel(torch.nn.Module):
"forward",
(input_ids.to(device="cpu"), attention_mask.to(device="cpu")),
)
).to(device=args.device)
).to(device=tensor_device)
return result
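A hedged illustration of the `tensor_device` mapping introduced above: SHARK device strings such as "vulkan" are not valid torch device names, so host-side tensors are staged on "cpu" or "cuda" while the compiled SHARK module targets `args.device` (the value below is a placeholder, not read from the real args):

```python
import torch

args_device = "cpu"  # placeholder; the real value comes from apps.stable_diffusion.src.args
tensor_device = "cpu" if args_device == "cpu" else "cuda"

# Padding tokens are built on the torch-visible device, mirroring the hunks below.
pad_ids = torch.tensor([[11, 11, 11]]).to(device=tensor_device)
print(pad_ids.device)
```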
@@ -118,14 +119,14 @@ def pad_or_truncate_inputs(
num_add_token = max_padding_length - inp_shape[1]
padded_input_ids = torch.cat(
[
torch.tensor([[11] * num_add_token]).to(device=args.device),
torch.tensor([[11] * num_add_token]).to(device=tensor_device),
input_ids,
],
dim=1,
)
padded_attention_mask = torch.cat(
[
torch.tensor([[0] * num_add_token]).to(device=args.device),
torch.tensor([[0] * num_add_token]).to(device=tensor_device),
attention_mask,
],
dim=1,
@@ -455,7 +456,7 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id]
self.eos_token_id_tensor = (
torch.tensor(eos_token_id).to(device=args.device)
torch.tensor(eos_token_id).to(device=tensor_device)
if eos_token_id is not None
else None
)
@@ -533,7 +534,7 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
self.input_ids = torch.cat(
[
torch.tensor(self.truncated_input_ids)
.to(device=args.device)
.to(device=tensor_device)
.unsqueeze(dim=0),
self.input_ids,
],


@@ -1,11 +1,12 @@
# for generate (gradio server) and finetune
datasets==2.13.0
sentencepiece==0.1.99
gradio==3.35.2
huggingface_hub==0.15.1
# gradio==3.37.0
huggingface_hub==0.16.4
appdirs==1.4.4
fire==0.5.0
docutils==0.20.1
# torch==2.0.1; sys_platform != "darwin" and platform_machine != "arm64"
evaluate==0.4.0
rouge_score==0.1.2
sacrebleu==2.3.1
@@ -18,7 +19,9 @@ matplotlib==3.7.1
loralib==0.1.1
bitsandbytes==0.39.0
accelerate==0.20.3
git+https://github.com/huggingface/peft.git@0b62b4378b4ce9367932c73540349da9a41bdea8
peft==0.4.0
# 4.31.0+ breaks load_in_8bit=True (https://github.com/huggingface/transformers/issues/25026)
# transformers==4.30.2
tokenizers==0.13.3
APScheduler==3.10.1
@@ -33,7 +36,7 @@ tensorboard==2.13.0
neptune==1.2.0
# for gradio client
gradio_client==0.2.7
gradio_client==0.2.10
beautifulsoup4==4.12.2
markdown==3.4.3
@@ -43,8 +46,9 @@ pytest-xdist==3.2.1
nltk==3.8.1
textstat==0.7.3
# pandoc==2.3
#pypandoc==1.11
pypandoc_binary==1.11
pypandoc==1.11; sys_platform == "darwin" and platform_machine == "arm64"
pypandoc_binary==1.11; platform_machine == "x86_64"
pypandoc_binary==1.11; sys_platform == "win32"
openpyxl==3.1.2
lm_dataformat==0.0.20
bioc==2.0
@@ -104,3 +108,15 @@ pip-licenses==4.3.0
# weaviate vector db
weaviate-client==3.22.1
gpt4all==1.0.5
llama-cpp-python==0.1.73
arxiv==1.4.8
pymupdf==1.22.5 # AGPL license
# extract-msg==0.41.1 # GPL3
# sometimes unstructured fails, these work in those cases. See https://github.com/h2oai/h2ogpt/issues/320
playwright==1.36.0
# requires Chrome binary to be in path
selenium==4.10.0


@@ -51,24 +51,26 @@ def chat(curr_system_message, history, device, precision):
if h2ogpt_model == 0:
if "cuda" in device:
device = "cuda"
shark_device = "cuda"
elif "sync" in device:
device = "cpu"
shark_device = "cpu"
elif "task" in device:
device = "cpu"
shark_device = "cpu"
elif "vulkan" in device:
device = "vulkan"
shark_device = "vulkan"
else:
print("unrecognized device")
args.device = device
device = "cpu" if shark_device == "cpu" else "cuda"
args.device = shark_device
args.precision = precision
from apps.language_models.langchain.gen import Langchain
langchain = Langchain(device, precision)
h2ogpt_model, h2ogpt_tokenizer, _ = langchain.get_model(
load_8bit=True
load_4bit=True
if device == "cuda"
else False, # load model in 4bit if device is cuda to save memory
load_gptq="",