Fix Langchain multiple device issue (#1688)

Author:       Vivek Khandelwal
Date:         2023-07-24 20:33:46 +05:30
Committed by: GitHub
Parent:       d7092aafaa
Commit:       f3cb63fc9c

6 changed files with 100 additions and 78 deletions
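The commit applies one pattern throughout: instead of re-detecting a device inside each loader and threading the result back through return values, the device chosen once from the SHARK `args` is stored on the `Langchain` instance and reused everywhere. A minimal sketch of that pattern, with a simplified class and a stand-in `_load` helper (illustrative only, not the actual h2ogpt/SHARK code):

```python
class Langchain:
    def __init__(self, device, precision):
        # The device is set once from the SHARK args (e.g. "cuda" or "cpu") and
        # is never re-detected inside the individual loaders.
        self.device = device
        self.precision = precision

    def _load(self, base_model):
        # Stand-in for the real model/tokenizer loading logic.
        return f"<model:{base_model}>", f"<tokenizer:{base_model}>"

    def get_model(self, base_model):
        model, tokenizer = self._load(base_model)
        # Callers that previously consumed a locally detected device returned by
        # this method now get the single instance-wide device instead.
        return model, tokenizer, self.device


langchain = Langchain("cpu", "fp16")
print(langchain.get_model("h2oai/h2ogpt-oig-oasst1-512-6_9b"))
```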


@@ -5,6 +5,7 @@
1.) Install all the dependencies by running:
```shell
pip install -r apps/language_models/langchain/langchain_requirements.txt
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libtesseract-dev libreoffice
```
2.) Create a folder named `user_path` in the `apps/language_models/langchain/` directory.


@@ -687,6 +687,7 @@ class Langchain:
langchain_mode1,
user_path,
hf_embedding_model,
device=self.device,
kwargs_make_db=locals(),
)
finally:
@@ -811,7 +812,7 @@ class Langchain:
)
)
if base_model1 and not login_mode_if_model0:
model0, tokenizer0, device = self.get_model(
model0, tokenizer0, _ = self.get_model(
reward_type=False,
**get_kwargs(
self.get_model,
@@ -821,7 +822,7 @@ class Langchain:
)
else:
# if empty model, then don't load anything, just get gradio up
model0, tokenizer0, device = None, None, None
model0, tokenizer0, _ = None, None, None
if model0 is None:
if fail_if_cannot_connect:
raise RuntimeError("Could not connect, see logs")
@@ -830,7 +831,7 @@ class Langchain:
model_lock.remove(model_dict)
continue
model_state_trial = dict(
model=model0, tokenizer=tokenizer0, device=device
model=model0, tokenizer=tokenizer0, device=self.device
)
model_state_trial.update(model_dict)
assert len(model_state_none) == len(model_state_trial)
@@ -846,7 +847,7 @@ class Langchain:
# get score model
all_kwargs = locals().copy()
smodel, stokenizer, sdevice = self.get_score_model(
smodel, stokenizer, _ = self.get_score_model(
reward_type=True,
**get_kwargs(
self.get_score_model,
@@ -857,7 +858,7 @@ class Langchain:
score_model_state0 = dict(
model=smodel,
tokenizer=stokenizer,
device=sdevice,
device=self.device,
base_model=score_model,
tokenizer_base_model="",
lora_weights="",
@@ -959,6 +960,7 @@ class Langchain:
Ensure model gets on correct device
"""
device_map = None
if model is not None:
# NOTE: Can specify max_memory={0: max_mem, 1: max_mem}, to shard model
# NOTE: Some models require avoiding sharding some layers,
@@ -975,25 +977,25 @@ class Langchain:
dtype=torch.float16 if load_half else torch.float32,
)
device_map.update(device_map_model)
else:
device_map = "auto"
n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
if n_gpus > 0:
if gpu_id >= 0:
# FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
# So avoid for now, just put on first GPU, unless score_model, put on last
if reward_type:
device_map = {"": n_gpus - 1}
else:
device_map = {"": min(n_gpus - 1, gpu_id)}
if gpu_id == -1:
device_map = {"": "cuda"}
else:
device_map = {"": "cpu"}
model_kwargs["load_in_8bit"] = False
model_kwargs["load_in_4bit"] = False
if device_map is None:
if self.device == "cuda":
if n_gpus > 0:
if gpu_id >= 0:
# FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
# So avoid for now, just put on first GPU, unless score_model, put on last
if reward_type:
device_map = {"": n_gpus - 1}
else:
device_map = {"": min(n_gpus - 1, gpu_id)}
if gpu_id == -1:
device_map = {"": "cuda"}
else:
device_map = {"": "cpu"}
model_kwargs["load_in_8bit"] = False
model_kwargs["load_in_4bit"] = False
print("device_map: %s" % device_map, flush=True)
load_in_8bit = model_kwargs.get("load_in_8bit", False)
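For context, a hedged sketch of how a `device_map` of the shape selected above (`{"": gpu_id}`, `{"": "cuda"}`, or `{"": "cpu"}`) is typically consumed by Hugging Face `from_pretrained`, assuming `transformers` and `accelerate` are installed (both appear in the requirements); the model id is a small placeholder, not one used by this repo:

```python
import torch
from transformers import AutoModelForCausalLM

# {"": 0} pins the entire model on GPU 0; {"": "cpu"} keeps every weight on CPU.
device_map = {"": 0} if torch.cuda.is_available() else {"": "cpu"}

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",  # placeholder model id for illustration only
    device_map=device_map,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)
print(model.device)
```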
@@ -1265,8 +1267,8 @@ class Langchain:
if base_model in non_hf_types:
from gpt4all_llm import get_model_tokenizer_gpt4all
model, tokenizer, device = get_model_tokenizer_gpt4all(base_model)
return model, tokenizer, device
model, tokenizer, _ = get_model_tokenizer_gpt4all(base_model)
return model, tokenizer, self.device
# get local torch-HF model
return self.get_hf_model(
@@ -1276,7 +1278,7 @@ class Langchain:
load_gptq=load_gptq,
use_safetensors=use_safetensors,
infer_devices=infer_devices,
device=device,
device=self.device,
base_model=base_model,
tokenizer_base_model=tokenizer_base_model,
lora_weights=lora_weights,
@@ -1325,8 +1327,6 @@ class Langchain:
if lora_weights is not None and lora_weights.strip():
if verbose:
print("Get %s lora weights" % lora_weights, flush=True)
if device is None:
device = get_device()
if "gpt2" in base_model.lower():
# RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Half
@@ -1365,19 +1365,19 @@ class Langchain:
model = model_loader(
tokenizer,
model=base_model,
device=0 if device == "cuda" else -1,
device=0 if self.device == "cuda" else -1,
torch_dtype=torch.float16
if device == "cuda"
if self.device == "cuda"
else torch.float32,
)
else:
assert device in ["cuda", "cpu", "mps"], (
"Unsupported device %s" % device
assert self.device in ["cuda", "cpu", "mps"], (
"Unsupported device %s" % self.device
)
model_kwargs = dict(
local_files_only=local_files_only,
torch_dtype=torch.float16
if device == "cuda"
if self.device == "cuda"
else torch.float32,
resume_download=resume_download,
use_auth_token=use_auth_token,
@@ -1392,7 +1392,7 @@ class Langchain:
infer_devices
and gpu_id is not None
and gpu_id >= 0
and device == "cuda"
and self.device == "cuda"
):
device_map = {"": gpu_id}
else:
@@ -1412,14 +1412,16 @@ class Langchain:
# MPT doesn't support spreading over GPUs
model_kwargs.update(
dict(
device_map={"": gpu_id} if device == "cuda" else "cpu"
device_map={"": gpu_id}
if self.device == "cuda"
else "cpu"
)
)
if "OpenAssistant/reward-model".lower() in base_model.lower():
# FIXME: could put on other GPUs
model_kwargs["device_map"] = (
{"": 0} if device == "cuda" else {"": "cpu"}
{"": 0} if self.device == "cuda" else {"": "cpu"}
)
model_kwargs.pop("torch_dtype", None)
self.pop_unused_model_kwargs(model_kwargs)
@@ -1427,7 +1429,7 @@ class Langchain:
if not lora_weights:
# torch.device context uses twice memory for AutoGPTQ
context = NullContext if load_gptq else torch.device
with context(device):
with context(self.device):
if infer_devices:
config, model = self.get_config(
base_model,
@@ -1472,7 +1474,7 @@ class Langchain:
model,
lora_weights,
torch_dtype=torch.float16
if device == "cuda"
if self.device == "cuda"
else torch.float32,
local_files_only=local_files_only,
resume_download=resume_download,
@@ -1480,11 +1482,11 @@ class Langchain:
trust_remote_code=trust_remote_code,
offload_folder=offload_folder,
device_map={"": 0}
if device == "cuda"
if self.device == "cuda"
else {"": "cpu"}, # seems to be required
)
else:
with torch.device(device):
with torch.device(self.device):
config, _ = self.get_config(
base_model, raise_exception=True, **config_kwargs
)
@@ -1499,7 +1501,7 @@ class Langchain:
model,
lora_weights,
torch_dtype=torch.float16
if device == "cuda"
if self.device == "cuda"
else torch.float32,
local_files_only=local_files_only,
resume_download=resume_download,
@@ -1535,7 +1537,7 @@ class Langchain:
config, tokenizer, verbose=False, reward_type=reward_type
)
return model, tokenizer, device
return model, tokenizer, self.device
def set_model_max_len(
self, config, tokenizer, verbose=False, reward_type=False
@@ -1609,15 +1611,15 @@ class Langchain:
inference_server = ""
llama_type = False
compile_model = False
smodel, stokenizer, sdevice = self.get_model(
smodel, stokenizer, _ = self.get_model(
reward_type=True,
**get_kwargs(
self.get_model, exclude_names=["reward_type"], **locals()
),
)
else:
smodel, stokenizer, sdevice = None, None, None
return smodel, stokenizer, sdevice
smodel, stokenizer, _ = None, None, None
return smodel, stokenizer, self.device
def evaluate(
self,
@@ -1763,7 +1765,6 @@ class Langchain:
# get variables
model = chosen_model_state["model"]
tokenizer = chosen_model_state["tokenizer"]
device = chosen_model_state["device"]
base_model = chosen_model_state["base_model"]
tokenizer_base_model = chosen_model_state["tokenizer_base_model"]
lora_weights = chosen_model_state["lora_weights"]
@@ -1952,6 +1953,7 @@ class Langchain:
lora_weights=lora_weights,
auto_reduce_chunks=auto_reduce_chunks,
max_chunks=max_chunks,
device=self.device,
):
(
outr,
@@ -2403,7 +2405,7 @@ class Langchain:
prompt_type,
prompt_dict,
tokenizer,
device,
self.device,
model_max_length=tokenizer.model_max_length,
)
@@ -2412,7 +2414,7 @@ class Langchain:
inputs = tokenizer(prompt, return_tensors="pt")
if debug and len(inputs["input_ids"]) > 0:
print("input_ids length", len(inputs["input_ids"][0]), flush=True)
input_ids = inputs["input_ids"].to(device)
input_ids = inputs["input_ids"].to(self.device)
# CRITICAL LIMIT else will fail
max_max_tokens = tokenizer.model_max_length
max_input_tokens = max_max_tokens - min_new_tokens
@@ -2498,10 +2500,12 @@ class Langchain:
have_lora_weights = lora_weights not in [no_lora_str, "", None]
context_class_cast = (
NullContext
if device == "cpu" or have_lora_weights or device == "mps"
if self.device == "cpu"
or have_lora_weights
or self.device == "mps"
else torch.autocast
)
with context_class_cast(device):
with context_class_cast(self.device):
# protection for gradio not keeping track of closed users,
# else hit bitsandbytes lack of thread safety:
# https://github.com/h2oai/h2ogpt/issues/104
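A minimal, self-contained sketch of the context selection above, using `contextlib.nullcontext` in place of the repo's `NullContext` helper and a placeholder LoRA flag:

```python
import contextlib
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
have_lora_weights = False  # placeholder flag for this illustration

# autocast only pays off on CUDA; CPU/MPS (and LoRA-patched models) fall back
# to a no-op context manager so generation still runs unchanged.
context_class_cast = (
    contextlib.nullcontext
    if device in ("cpu", "mps") or have_lora_weights
    else torch.autocast
)
with context_class_cast(device):
    pass  # tokenization / generation would run here
```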


@@ -44,7 +44,6 @@ from utils import (
makedirs,
get_url,
flatten_list,
get_device,
ProgressParallel,
remove,
hash_file,
@@ -92,6 +91,7 @@ from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain import PromptTemplate, HuggingFaceTextGenInference
from langchain.vectorstores import Chroma
from apps.stable_diffusion.src import args
def get_db(
@@ -371,8 +371,8 @@ def get_embedding(
# to ensure can fork without deadlock
from langchain.embeddings import HuggingFaceEmbeddings
device, torch_dtype, context_class = get_device_dtype()
model_kwargs = dict(device=device)
torch_dtype, context_class = get_dtype()
model_kwargs = dict(device=args.device)
if "instructor" in hf_embedding_model:
encode_kwargs = {"normalize_embeddings": True}
embedding = HuggingFaceInstructEmbeddings(
@@ -907,7 +907,7 @@ def get_llm(
# model_name = 'h2oai/h2ogpt-oig-oasst1-512-6_9b'
# model_name = 'h2oai/h2ogpt-oasst1-512-20b'
inference_server = ""
model, tokenizer, device = Langchain.get_model(
model, tokenizer, _ = Langchain.get_model(
load_8bit=True,
base_model=model_name,
inference_server=inference_server,
@@ -974,17 +974,15 @@ def get_llm(
return llm, model_name, streamer, prompt_type
def get_device_dtype():
def get_dtype():
# torch.device("cuda") leads to cuda:x cuda:y mismatches for multi-GPU consistently
import torch
n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
device = "cpu" if n_gpus == 0 else "cuda"
# from utils import NullContext
# context_class = NullContext if n_gpus > 1 or n_gpus == 0 else context_class
context_class = torch.device
torch_dtype = torch.float16 if device == "cuda" else torch.float32
return device, torch_dtype, context_class
torch_dtype = torch.float16 if args.device == "cuda" else torch.float32
return torch_dtype, context_class
def get_wiki_data(
@@ -1715,7 +1713,7 @@ def path_to_docs(
caption_loader
and not isinstance(caption_loader, (bool, str))
and caption_loader.device != "cpu"
or get_device() == "cuda"
or args.device == "cuda"
):
# to avoid deadlocks, presume was preloaded and so can't fork due to cuda context
n_jobs_image = 1
@@ -2549,15 +2547,15 @@ def _run_qa_db(
# context stuff similar to used in evaluate()
import torch
device, torch_dtype, context_class = get_device_dtype()
torch_dtype, context_class = get_dtype()
with torch.no_grad():
have_lora_weights = lora_weights not in [no_lora_str, "", None]
context_class_cast = (
NullContext
if device == "cpu" or have_lora_weights
if args.device == "cpu" or have_lora_weights
else torch.autocast
)
with context_class_cast(device):
with context_class_cast(args.device):
answer = chain()
if not use_context:


@@ -28,6 +28,7 @@ global_precision = "fp16"
if not args.run_docuchat_web:
args.device = global_device
args.precision = global_precision
tensor_device = "cpu" if args.device == "cpu" else "cuda"
class H2OGPTSHARKModel(torch.nn.Module):
@@ -102,7 +103,7 @@ class H2OGPTSHARKModel(torch.nn.Module):
"forward",
(input_ids.to(device="cpu"), attention_mask.to(device="cpu")),
)
).to(device=args.device)
).to(device=tensor_device)
return result
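A hedged illustration of the `tensor_device` mapping introduced above: SHARK device strings such as "vulkan" are not valid torch device names, so host-side tensors are staged on "cpu" or "cuda" while the compiled SHARK module targets `args.device` (the value below is a placeholder, not read from the real args):

```python
import torch

args_device = "cpu"  # placeholder; the real value comes from apps.stable_diffusion.src.args
tensor_device = "cpu" if args_device == "cpu" else "cuda"

# Padding tokens are built on the torch-visible device, mirroring the hunks below.
pad_ids = torch.tensor([[11, 11, 11]]).to(device=tensor_device)
print(pad_ids.device)
```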
@@ -118,14 +119,14 @@ def pad_or_truncate_inputs(
num_add_token = max_padding_length - inp_shape[1]
padded_input_ids = torch.cat(
[
torch.tensor([[11] * num_add_token]).to(device=args.device),
torch.tensor([[11] * num_add_token]).to(device=tensor_device),
input_ids,
],
dim=1,
)
padded_attention_mask = torch.cat(
[
torch.tensor([[0] * num_add_token]).to(device=args.device),
torch.tensor([[0] * num_add_token]).to(device=tensor_device),
attention_mask,
],
dim=1,
@@ -455,7 +456,7 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id]
self.eos_token_id_tensor = (
torch.tensor(eos_token_id).to(device=args.device)
torch.tensor(eos_token_id).to(device=tensor_device)
if eos_token_id is not None
else None
)
@@ -533,7 +534,7 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
self.input_ids = torch.cat(
[
torch.tensor(self.truncated_input_ids)
.to(device=args.device)
.to(device=tensor_device)
.unsqueeze(dim=0),
self.input_ids,
],


@@ -1,11 +1,12 @@
# for generate (gradio server) and finetune
datasets==2.13.0
sentencepiece==0.1.99
gradio==3.35.2
huggingface_hub==0.15.1
# gradio==3.37.0
huggingface_hub==0.16.4
appdirs==1.4.4
fire==0.5.0
docutils==0.20.1
# torch==2.0.1; sys_platform != "darwin" and platform_machine != "arm64"
evaluate==0.4.0
rouge_score==0.1.2
sacrebleu==2.3.1
@@ -18,7 +19,9 @@ matplotlib==3.7.1
loralib==0.1.1
bitsandbytes==0.39.0
accelerate==0.20.3
git+https://github.com/huggingface/peft.git@0b62b4378b4ce9367932c73540349da9a41bdea8
peft==0.4.0
# 4.31.0+ breaks load_in_8bit=True (https://github.com/huggingface/transformers/issues/25026)
# transformers==4.30.2
tokenizers==0.13.3
APScheduler==3.10.1
@@ -33,7 +36,7 @@ tensorboard==2.13.0
neptune==1.2.0
# for gradio client
gradio_client==0.2.7
gradio_client==0.2.10
beautifulsoup4==4.12.2
markdown==3.4.3
@@ -43,8 +46,9 @@ pytest-xdist==3.2.1
nltk==3.8.1
textstat==0.7.3
# pandoc==2.3
#pypandoc==1.11
pypandoc_binary==1.11
pypandoc==1.11; sys_platform == "darwin" and platform_machine == "arm64"
pypandoc_binary==1.11; platform_machine == "x86_64"
pypandoc_binary==1.11; sys_platform == "win32"
openpyxl==3.1.2
lm_dataformat==0.0.20
bioc==2.0
@@ -104,3 +108,15 @@ pip-licenses==4.3.0
# weaviate vector db
weaviate-client==3.22.1
gpt4all==1.0.5
llama-cpp-python==0.1.73
arxiv==1.4.8
pymupdf==1.22.5 # AGPL license
# extract-msg==0.41.1 # GPL3
# sometimes unstructured fails, these work in those cases. See https://github.com/h2oai/h2ogpt/issues/320
playwright==1.36.0
# requires Chrome binary to be in path
selenium==4.10.0


@@ -51,24 +51,26 @@ def chat(curr_system_message, history, device, precision):
if h2ogpt_model == 0:
if "cuda" in device:
device = "cuda"
shark_device = "cuda"
elif "sync" in device:
device = "cpu"
shark_device = "cpu"
elif "task" in device:
device = "cpu"
shark_device = "cpu"
elif "vulkan" in device:
device = "vulkan"
shark_device = "vulkan"
else:
print("unrecognized device")
args.device = device
device = "cpu" if shark_device == "cpu" else "cuda"
args.device = shark_device
args.precision = precision
from apps.language_models.langchain.gen import Langchain
langchain = Langchain(device, precision)
h2ogpt_model, h2ogpt_tokenizer, _ = langchain.get_model(
load_8bit=True
load_4bit=True
if device == "cuda"
else False, # load model in 4bit if device is cuda to save memory
load_gptq="",