Final patch for fixing Langchain token streaming issue (#1744)

2026-04-03 03:00:17 -04:00 · 2023-08-09 22:39:41 +05:30
parent 96185c9dc1
commit e4d7abb519
7 changed files with 431 additions and 857 deletions
--- a/apps/language_models/langchain/exp_hf_pipelines.py
+++ b/apps/language_models/langchain/exp_hf_pipelines.py
@@ -1,572 +0,0 @@
-"""Wrapper around HuggingFace Pipeline APIs."""
-import importlib.util
-import logging
-from typing import Any, List, Mapping, Optional
-
-from pydantic import Extra
-
-from langchain.callbacks.manager import CallbackManagerForLLMRun
-from langchain.llms.base import LLM
-from langchain.llms.utils import enforce_stop_tokens
-
-import enum
-import warnings
-from transformers.pipelines.base import PIPELINE_INIT_ARGS, Pipeline
-from transformers.utils import add_end_docstrings
-from transformers import (
-    MODEL_FOR_CAUSAL_LM_MAPPING,
-    TF_MODEL_FOR_CAUSAL_LM_MAPPING,
-)
-
-
-DEFAULT_MODEL_ID = "gpt2"
-DEFAULT_TASK = "text-generation"
-VALID_TASKS = ("text2text-generation", "text-generation", "summarization")
-
-logger = logging.getLogger(__name__)
-
-
-class HuggingFacePipeline(LLM):
-    """Wrapper around HuggingFace Pipeline API.
-
-    To use, you should have the ``transformers`` python package installed.
-
-    Only supports `text-generation`, `text2text-generation` and `summarization` for now.
-
-    Example using from_model_id:
-        .. code-block:: python
-
-            from langchain.llms import HuggingFacePipeline
-            hf = HuggingFacePipeline.from_model_id(
-                model_id="gpt2",
-                task="text-generation",
-                pipeline_kwargs={"max_new_tokens": 10},
-            )
-    Example passing pipeline in directly:
-        .. code-block:: python
-
-            from langchain.llms import HuggingFacePipeline
-            from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-
-            model_id = "gpt2"
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-            model = AutoModelForCausalLM.from_pretrained(model_id)
-            pipe = pipeline(
-                "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=10
-            )
-            hf = HuggingFacePipeline(pipeline=pipe)
-    """
-
-    pipeline: Any  #: :meta private:
-    model_id: str = DEFAULT_MODEL_ID
-    """Model name to use."""
-    model_kwargs: Optional[dict] = None
-    """Key word arguments passed to the model."""
-    pipeline_kwargs: Optional[dict] = None
-    """Key word arguments passed to the pipeline."""
-
-    class Config:
-        """Configuration for this pydantic object."""
-
-        extra = Extra.forbid
-
-    @classmethod
-    def from_model_id(
-        cls,
-        model_id: str,
-        task: str,
-        device: int = -1,
-        model_kwargs: Optional[dict] = None,
-        pipeline_kwargs: Optional[dict] = None,
-        **kwargs: Any,
-    ) -> LLM:
-        """Construct the pipeline object from model_id and task."""
-        try:
-            from transformers import (
-                AutoModelForCausalLM,
-                AutoModelForSeq2SeqLM,
-                AutoTokenizer,
-            )
-            from transformers import pipeline as hf_pipeline
-
-        except ImportError:
-            raise ValueError(
-                "Could not import transformers python package. "
-                "Please install it with `pip install transformers`."
-            )
-
-        _model_kwargs = model_kwargs or {}
-        tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
-
-        try:
-            if task == "text-generation":
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_id, **_model_kwargs
-                )
-            elif task in ("text2text-generation", "summarization"):
-                model = AutoModelForSeq2SeqLM.from_pretrained(
-                    model_id, **_model_kwargs
-                )
-            else:
-                raise ValueError(
-                    f"Got invalid task {task}, "
-                    f"currently only {VALID_TASKS} are supported"
-                )
-        except ImportError as e:
-            raise ValueError(
-                f"Could not load the {task} model due to missing dependencies."
-            ) from e
-
-        if importlib.util.find_spec("torch") is not None:
-            import torch
-
-            cuda_device_count = torch.cuda.device_count()
-            if device < -1 or (device >= cuda_device_count):
-                raise ValueError(
-                    f"Got device=={device}, "
-                    f"device is required to be within [-1, {cuda_device_count})"
-                )
-            if device < 0 and cuda_device_count > 0:
-                logger.warning(
-                    "Device has %d GPUs available. "
-                    "Provide device={deviceId} to `from_model_id` to use available"
-                    "GPUs for execution. deviceId is -1 (default) for CPU and "
-                    "can be a positive integer associated with CUDA device id.",
-                    cuda_device_count,
-                )
-        if "trust_remote_code" in _model_kwargs:
-            _model_kwargs = {
-                k: v
-                for k, v in _model_kwargs.items()
-                if k != "trust_remote_code"
-            }
-        _pipeline_kwargs = pipeline_kwargs or {}
-        pipeline = hf_pipeline(
-            task=task,
-            model=model,
-            tokenizer=tokenizer,
-            device=device,
-            model_kwargs=_model_kwargs,
-            **_pipeline_kwargs,
-        )
-        if pipeline.task not in VALID_TASKS:
-            raise ValueError(
-                f"Got invalid task {pipeline.task}, "
-                f"currently only {VALID_TASKS} are supported"
-            )
-        return cls(
-            pipeline=pipeline,
-            model_id=model_id,
-            model_kwargs=_model_kwargs,
-            pipeline_kwargs=_pipeline_kwargs,
-            **kwargs,
-        )
-
-    @property
-    def _identifying_params(self) -> Mapping[str, Any]:
-        """Get the identifying parameters."""
-        return {
-            "model_id": self.model_id,
-            "model_kwargs": self.model_kwargs,
-            "pipeline_kwargs": self.pipeline_kwargs,
-        }
-
-    @property
-    def _llm_type(self) -> str:
-        return "huggingface_pipeline"
-
-    def _call(
-        self,
-        prompt: str,
-        stop: Optional[List[str]] = None,
-        run_manager: Optional[CallbackManagerForLLMRun] = None,
-        **kwargs: Any,
-    ) -> str:
-        response = self.pipeline(prompt)
-        if self.pipeline.task == "text-generation":
-            # Text generation return includes the starter text.
-            text = response[0]["generated_text"][len(prompt) :]
-        elif self.pipeline.task == "text2text-generation":
-            text = response[0]["generated_text"]
-        elif self.pipeline.task == "summarization":
-            text = response[0]["summary_text"]
-        else:
-            raise ValueError(
-                f"Got invalid task {self.pipeline.task}, "
-                f"currently only {VALID_TASKS} are supported"
-            )
-        if stop is not None:
-            # This is a bit hacky, but I can't figure out a better way to enforce
-            # stop tokens when making calls to huggingface_hub.
-            text = enforce_stop_tokens(text, stop)
-        return text
-
-
-##### TextGenerationPipeline
-
-
-class ReturnType(enum.Enum):
-    TENSORS = 0
-    NEW_TEXT = 1
-    FULL_TEXT = 2
-
-
-@add_end_docstrings(PIPELINE_INIT_ARGS)
-class TextGenerationPipeline(Pipeline):
-    """
-    Language generation pipeline using any `ModelWithLMHead`. This pipeline predicts the words that will follow a
-    specified text prompt.
-
-    Example:
-
-    ```python
-    >>> from transformers import pipeline
-
-    >>> generator = pipeline(model="gpt2")
-    >>> generator("I can't believe you did such a ", do_sample=False)
-    [{'generated_text': "I can't believe you did such a icky thing to me. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I"}]
-
-    >>> # These parameters will return suggestions, and only the newly created text making it easier for prompting suggestions.
-    >>> outputs = generator("My tart needs some", num_return_sequences=4, return_full_text=False)
-    ```
-
-    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
-
-    This language generation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
-    `"text-generation"`.
-
-    The models that this pipeline can use are models that have been trained with an autoregressive language modeling
-    objective, which includes the uni-directional models in the library (e.g. gpt2). See the list of available models
-    on [huggingface.co/models](https://huggingface.co/models?filter=text-generation).
-    """
-
-    # Prefix text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
-    # in https://github.com/rusiaaman/XLNet-gen#methodology
-    # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
-
-    XL_PREFIX = """
-    In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) are discovered. The
-    voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story. 1883 Western
-    Siberia, a young Grigori Rasputin is asked by his father and a group of men to perform magic. Rasputin has a vision
-    and denounces one of the men as a horse thief. Although his father initially slaps him for making such an
-    accusation, Rasputin watches as the man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
-    the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, with people, even a bishop,
-    begging for his blessing. <eod> </s> <eos>
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.check_model_type(
-            TF_MODEL_FOR_CAUSAL_LM_MAPPING
-            if self.framework == "tf"
-            else MODEL_FOR_CAUSAL_LM_MAPPING
-        )
-        if "prefix" not in self._preprocess_params:
-            # This is very specific. The logic is quite complex and needs to be done
-            # as a "default".
-            # It also defines both some preprocess_kwargs and generate_kwargs
-            # which is why we cannot put them in their respective methods.
-            prefix = None
-            if self.model.config.prefix is not None:
-                prefix = self.model.config.prefix
-            if prefix is None and self.model.__class__.__name__ in [
-                "XLNetLMHeadModel",
-                "TransfoXLLMHeadModel",
-                "TFXLNetLMHeadModel",
-                "TFTransfoXLLMHeadModel",
-            ]:
-                # For XLNet and TransformerXL we add an article to the prompt to give more state to the model.
-                prefix = self.XL_PREFIX
-            if prefix is not None:
-                # Recalculate some generate_kwargs linked to prefix.
-                (
-                    preprocess_params,
-                    forward_params,
-                    _,
-                ) = self._sanitize_parameters(
-                    prefix=prefix, **self._forward_params
-                )
-                self._preprocess_params = {
-                    **self._preprocess_params,
-                    **preprocess_params,
-                }
-                self._forward_params = {
-                    **self._forward_params,
-                    **forward_params,
-                }
-
-    def _sanitize_parameters(
-        self,
-        return_full_text=None,
-        return_tensors=None,
-        return_text=None,
-        return_type=None,
-        clean_up_tokenization_spaces=None,
-        prefix=None,
-        handle_long_generation=None,
-        stop_sequence=None,
-        **generate_kwargs,
-    ):
-        preprocess_params = {}
-        if prefix is not None:
-            preprocess_params["prefix"] = prefix
-        if prefix:
-            prefix_inputs = self.tokenizer(
-                prefix,
-                padding=False,
-                add_special_tokens=False,
-                return_tensors=self.framework,
-            )
-            generate_kwargs["prefix_length"] = prefix_inputs[
-                "input_ids"
-            ].shape[-1]
-
-        if handle_long_generation is not None:
-            if handle_long_generation not in {"hole"}:
-                raise ValueError(
-                    f"{handle_long_generation} is not a valid value for `handle_long_generation` parameter expected"
-                    " [None, 'hole']"
-                )
-            preprocess_params[
-                "handle_long_generation"
-            ] = handle_long_generation
-
-        preprocess_params.update(generate_kwargs)
-        forward_params = generate_kwargs
-
-        postprocess_params = {}
-        if return_full_text is not None and return_type is None:
-            if return_text is not None:
-                raise ValueError(
-                    "`return_text` is mutually exclusive with `return_full_text`"
-                )
-            if return_tensors is not None:
-                raise ValueError(
-                    "`return_full_text` is mutually exclusive with `return_tensors`"
-                )
-            return_type = (
-                ReturnType.FULL_TEXT
-                if return_full_text
-                else ReturnType.NEW_TEXT
-            )
-        if return_tensors is not None and return_type is None:
-            if return_text is not None:
-                raise ValueError(
-                    "`return_text` is mutually exclusive with `return_tensors`"
-                )
-            return_type = ReturnType.TENSORS
-        if return_type is not None:
-            postprocess_params["return_type"] = return_type
-        if clean_up_tokenization_spaces is not None:
-            postprocess_params[
-                "clean_up_tokenization_spaces"
-            ] = clean_up_tokenization_spaces
-
-        if stop_sequence is not None:
-            stop_sequence_ids = self.tokenizer.encode(
-                stop_sequence, add_special_tokens=False
-            )
-            if len(stop_sequence_ids) > 1:
-                warnings.warn(
-                    "Stopping on a multiple token sequence is not yet supported on transformers. The first token of"
-                    " the stop sequence will be used as the stop sequence string in the interim."
-                )
-            generate_kwargs["eos_token_id"] = stop_sequence_ids[0]
-
-        return preprocess_params, forward_params, postprocess_params
-
-    # overriding _parse_and_tokenize to allow for unusual language-modeling tokenizer arguments
-    def _parse_and_tokenize(self, *args, **kwargs):
-        """
-        Parse arguments and tokenize
-        """
-        # Parse arguments
-        if self.model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
-            kwargs.update({"add_space_before_punct_symbol": True})
-
-        return super()._parse_and_tokenize(*args, **kwargs)
-
-    def __call__(self, text_inputs, **kwargs):
-        """
-        Complete the prompt(s) given as inputs.
-
-        Args:
-            args (`str` or `List[str]`):
-                One or several prompts (or one list of prompts) to complete.
-            return_tensors (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the tensors of predictions (as token indices) in the outputs. If set to
-                `True`, the decoded text is not returned.
-            return_text (`bool`, *optional*, defaults to `True`):
-                Whether or not to return the decoded texts in the outputs.
-            return_full_text (`bool`, *optional*, defaults to `True`):
-                If set to `False` only added text is returned, otherwise the full text is returned. Only meaningful if
-                *return_text* is set to True.
-            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
-                Whether or not to clean up the potential extra spaces in the text output.
-            prefix (`str`, *optional*):
-                Prefix added to prompt.
-            handle_long_generation (`str`, *optional*):
-                By default, this pipelines does not handle long generation (ones that exceed in one form or the other
-                the model maximum length). There is no perfect way to adress this (more info
-                :https://github.com/huggingface/transformers/issues/14033#issuecomment-948385227). This provides common
-                strategies to work around that problem depending on your use case.
-
-                - `None` : default strategy where nothing in particular happens
-                - `"hole"`: Truncates left of input, and leaves a gap wide enough to let generation happen (might
-                  truncate a lot of the prompt and not suitable when generation exceed the model capacity)
-
-            generate_kwargs:
-                Additional keyword arguments to pass along to the generate method of the model (see the generate method
-                corresponding to your framework [here](./model#generative-models)).
-
-        Return:
-            A list or a list of list of `dict`: Returns one of the following dictionaries (cannot return a combination
-            of both `generated_text` and `generated_token_ids`):
-
-            - **generated_text** (`str`, present when `return_text=True`) -- The generated text.
-            - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token
-              ids of the generated text.
-        """
-        return super().__call__(text_inputs, **kwargs)
-
-    def preprocess(
-        self,
-        prompt_text,
-        prefix="",
-        handle_long_generation=None,
-        **generate_kwargs,
-    ):
-        inputs = self.tokenizer(
-            prefix + prompt_text,
-            padding=False,
-            add_special_tokens=False,
-            return_tensors=self.framework,
-        )
-        inputs["prompt_text"] = prompt_text
-
-        if handle_long_generation == "hole":
-            cur_len = inputs["input_ids"].shape[-1]
-            if "max_new_tokens" in generate_kwargs:
-                new_tokens = generate_kwargs["max_new_tokens"]
-            else:
-                new_tokens = (
-                    generate_kwargs.get(
-                        "max_length", self.model.config.max_length
-                    )
-                    - cur_len
-                )
-                if new_tokens < 0:
-                    raise ValueError(
-                        "We cannot infer how many new tokens are expected"
-                    )
-            if cur_len + new_tokens > self.tokenizer.model_max_length:
-                keep_length = self.tokenizer.model_max_length - new_tokens
-                if keep_length <= 0:
-                    raise ValueError(
-                        "We cannot use `hole` to handle this generation the number of desired tokens exceeds the"
-                        " models max length"
-                    )
-
-                inputs["input_ids"] = inputs["input_ids"][:, -keep_length:]
-                if "attention_mask" in inputs:
-                    inputs["attention_mask"] = inputs["attention_mask"][
-                        :, -keep_length:
-                    ]
-
-        return inputs
-
-    def _forward(self, model_inputs, **generate_kwargs):
-        input_ids = model_inputs["input_ids"]
-        attention_mask = model_inputs.get("attention_mask", None)
-        # Allow empty prompts
-        if input_ids.shape[1] == 0:
-            input_ids = None
-            attention_mask = None
-            in_b = 1
-        else:
-            in_b = input_ids.shape[0]
-        prompt_text = model_inputs.pop("prompt_text")
-
-        # If there is a prefix, we may need to adjust the generation length. Do so without permanently modifying
-        # generate_kwargs, as some of the parameterization may come from the initialization of the pipeline.
-        prefix_length = generate_kwargs.pop("prefix_length", 0)
-        if prefix_length > 0:
-            has_max_new_tokens = "max_new_tokens" in generate_kwargs or (
-                "generation_config" in generate_kwargs
-                and generate_kwargs["generation_config"].max_new_tokens
-                is not None
-            )
-            if not has_max_new_tokens:
-                generate_kwargs["max_length"] = (
-                    generate_kwargs.get("max_length")
-                    or self.model.config.max_length
-                )
-                generate_kwargs["max_length"] += prefix_length
-            has_min_new_tokens = "min_new_tokens" in generate_kwargs or (
-                "generation_config" in generate_kwargs
-                and generate_kwargs["generation_config"].min_new_tokens
-                is not None
-            )
-            if not has_min_new_tokens and "min_length" in generate_kwargs:
-                generate_kwargs["min_length"] += prefix_length
-
-        # BS x SL
-        generated_sequence = self.model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            **generate_kwargs,
-        )
-        out_b = generated_sequence.shape[0]
-        if self.framework == "pt":
-            generated_sequence = generated_sequence.reshape(
-                in_b, out_b // in_b, *generated_sequence.shape[1:]
-            )
-        return {
-            "generated_sequence": generated_sequence,
-            "input_ids": input_ids,
-            "prompt_text": prompt_text,
-        }
-
-    def postprocess(
-        self,
-        model_outputs,
-        return_type=ReturnType.FULL_TEXT,
-        clean_up_tokenization_spaces=True,
-    ):
-        generated_sequence = model_outputs["generated_sequence"][0]
-        input_ids = model_outputs["input_ids"]
-        prompt_text = model_outputs["prompt_text"]
-        generated_sequence = generated_sequence.numpy().tolist()
-        records = []
-        for sequence in generated_sequence:
-            if return_type == ReturnType.TENSORS:
-                record = {"generated_token_ids": sequence}
-            elif return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}:
-                # Decode text
-                text = self.tokenizer.decode(
-                    sequence,
-                    skip_special_tokens=True,
-                    clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-                )
-
-                # Remove PADDING prompt of the sequence if XLNet or Transfo-XL model is used
-                if input_ids is None:
-                    prompt_length = 0
-                else:
-                    prompt_length = len(
-                        self.tokenizer.decode(
-                            input_ids[0],
-                            skip_special_tokens=True,
-                            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-                        )
-                    )
-
-                if return_type == ReturnType.FULL_TEXT:
-                    all_text = prompt_text + text[prompt_length:]
-                else:
-                    all_text = text[prompt_length:]
-
-                record = {"generated_text": all_text}
-            records.append(record)
-
-        return records
--- a/apps/language_models/langchain/expanded_pipelines.py
+++ b/apps/language_models/langchain/expanded_pipelines.py
@@ -154,26 +154,182 @@ class Chain(Serializable, ABC):
            include_run_info: Whether to include run info in the response. Defaults
                to False.
        """
-        inputs = self.prep_inputs(inputs)
+        input_docs = inputs["input_documents"]
+        missing_keys = set(self.input_keys).difference(inputs)
+        if missing_keys:
+            raise ValueError(f"Missing some input keys: {missing_keys}")
+
        callback_manager = CallbackManager.configure(
            callbacks, self.callbacks, self.verbose, tags, self.tags
        )
-        new_arg_supported = inspect.signature(self._call).parameters.get(
-            "run_manager"
-        )
        run_manager = callback_manager.on_chain_start(
            dumpd(self),
            inputs,
        )
-        try:
-            outputs = (
-                self._call(inputs, run_manager=run_manager)
-                if new_arg_supported
-                else self._call(inputs)
+
+        if "is_first" in inputs.keys() and not inputs["is_first"]:
+            run_manager_ = run_manager
+            input_list = [inputs]
+            stop = None
+            prompts = []
+            for inputs in input_list:
+                selected_inputs = {
+                    k: inputs[k] for k in self.prompt.input_variables
+                }
+                prompt = self.prompt.format_prompt(**selected_inputs)
+                _colored_text = get_colored_text(prompt.to_string(), "green")
+                _text = "Prompt after formatting:\n" + _colored_text
+                if run_manager_:
+                    run_manager_.on_text(_text, end="\n", verbose=self.verbose)
+                if "stop" in inputs and inputs["stop"] != stop:
+                    raise ValueError(
+                        "If `stop` is present in any inputs, should be present in all."
+                    )
+                prompts.append(prompt)
+
+            prompt_strings = [p.to_string() for p in prompts]
+            prompts = prompt_strings
+            callbacks = run_manager_.get_child() if run_manager_ else None
+            tags = None
+
+            """Run the LLM on the given prompt and input."""
+            # If string is passed in directly no errors will be raised but outputs will
+            # not make sense.
+            if not isinstance(prompts, list):
+                raise ValueError(
+                    "Argument 'prompts' is expected to be of type List[str], received"
+                    f" argument of type {type(prompts)}."
+                )
+            params = self.llm.dict()
+            params["stop"] = stop
+            options = {"stop": stop}
+            disregard_cache = self.llm.cache is not None and not self.llm.cache
+            callback_manager = CallbackManager.configure(
+                callbacks,
+                self.llm.callbacks,
+                self.llm.verbose,
+                tags,
+                self.llm.tags,
            )
-        except (KeyboardInterrupt, Exception) as e:
-            run_manager.on_chain_error(e)
-            raise e
+            if langchain.llm_cache is None or disregard_cache:
+                # This happens when langchain.cache is None, but self.cache is True
+                if self.llm.cache is not None and self.cache:
+                    raise ValueError(
+                        "Asked to cache, but no cache found at `langchain.cache`."
+                    )
+                run_manager_ = callback_manager.on_llm_start(
+                    dumpd(self),
+                    prompts,
+                    invocation_params=params,
+                    options=options,
+                )
+
+                generations = []
+                for prompt in prompts:
+                    inputs_ = prompt
+                    num_workers = None
+                    batch_size = None
+
+                    if num_workers is None:
+                        if self.llm.pipeline._num_workers is None:
+                            num_workers = 0
+                        else:
+                            num_workers = self.llm.pipeline._num_workers
+                    if batch_size is None:
+                        if self.llm.pipeline._batch_size is None:
+                            batch_size = 1
+                        else:
+                            batch_size = self.llm.pipeline._batch_size
+
+                    preprocess_params = {}
+                    generate_kwargs = {}
+                    preprocess_params.update(generate_kwargs)
+                    forward_params = generate_kwargs
+                    postprocess_params = {}
+                    # Fuse __init__ params and __call__ params without modifying the __init__ ones.
+                    preprocess_params = {
+                        **self.llm.pipeline._preprocess_params,
+                        **preprocess_params,
+                    }
+                    forward_params = {
+                        **self.llm.pipeline._forward_params,
+                        **forward_params,
+                    }
+                    postprocess_params = {
+                        **self.llm.pipeline._postprocess_params,
+                        **postprocess_params,
+                    }
+
+                    self.llm.pipeline.call_count += 1
+                    if (
+                        self.llm.pipeline.call_count > 10
+                        and self.llm.pipeline.framework == "pt"
+                        and self.llm.pipeline.device.type == "cuda"
+                    ):
+                        warnings.warn(
+                            "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a"
+                            " dataset",
+                            UserWarning,
+                        )
+
+                    model_inputs = self.llm.pipeline.preprocess(
+                        inputs_, **preprocess_params
+                    )
+                    model_outputs = self.llm.pipeline.forward(
+                        model_inputs, **forward_params
+                    )
+                    model_outputs["process"] = False
+                    return model_outputs
+                output = LLMResult(generations=generations)
+                run_manager_.on_llm_end(output)
+                if run_manager_:
+                    output.run = RunInfo(run_id=run_manager_.run_id)
+                response = output
+
+            outputs = [
+                # Get the text of the top generated string.
+                {self.output_key: generation[0].text}
+                for generation in response.generations
+            ][0]
+            run_manager.on_chain_end(outputs)
+            final_outputs: Dict[str, Any] = self.prep_outputs(
+                inputs, outputs, return_only_outputs
+            )
+            if include_run_info:
+                final_outputs[RUN_KEY] = RunInfo(run_id=run_manager.run_id)
+            return final_outputs
+        else:
+            _run_manager = (
+                run_manager or CallbackManagerForChainRun.get_noop_manager()
+            )
+            docs = inputs[self.input_key]
+            # Other keys are assumed to be needed for LLM prediction
+            other_keys = {
+                k: v for k, v in inputs.items() if k != self.input_key
+            }
+            doc_strings = [
+                format_document(doc, self.document_prompt) for doc in docs
+            ]
+            # Join the documents together to put them in the prompt.
+            inputs = {
+                k: v
+                for k, v in other_keys.items()
+                if k in self.llm_chain.prompt.input_variables
+            }
+            inputs[self.document_variable_name] = self.document_separator.join(
+                doc_strings
+            )
+            inputs["is_first"] = False
+            inputs["input_documents"] = input_docs
+
+            # Call predict on the LLM.
+            output = self.llm_chain(inputs, callbacks=_run_manager.get_child())
+            if "process" in output.keys() and not output["process"]:
+                return output
+            output = output[self.llm_chain.output_key]
+            extra_return_dict = {}
+        extra_return_dict[self.output_key] = output
+        outputs = extra_return_dict
        run_manager.on_chain_end(outputs)
        final_outputs: Dict[str, Any] = self.prep_outputs(
            inputs, outputs, return_only_outputs
@@ -376,6 +532,24 @@ class BaseCombineDocumentsChain(Chain, ABC):
        return extra_return_dict


+from pydantic import BaseModel
+
+
+class Generation(Serializable):
+    """Output of a single generation: text plus optional provider info."""
+
+    text: str
+    """Generated text output."""
+
+    generation_info: Optional[Dict[str, Any]] = None
+    """Raw generation info response from the provider"""
+    """May include things like reason for finishing (e.g. in OpenAI)"""
+    # TODO: add log probs (no field carries them yet)
+
+
+VALID_TASKS = ("text2text-generation", "text-generation", "summarization")
+
+
 class LLMChain(Chain):
    """Chain to run queries against LLMs.

--- a/apps/language_models/langchain/gen.py
+++ b/apps/language_models/langchain/gen.py
@@ -1129,7 +1129,7 @@ class Langchain:
                max_time=max_time,
                num_return_sequences=num_return_sequences,
            )
-            outr, extra = run_qa_db(
+            out = run_qa_db(
                query=instruction,
                iinput=iinput,
                context=context,
@@ -1171,14 +1171,7 @@ class Langchain:
                max_chunks=max_chunks,
                device=self.device,
            )
-            response = dict(response=outr, sources=extra)
-            if outr or base_model in non_hf_types:
-                # if got no response (e.g. not showing sources and got no sources,
-                # so nothing to give to LLM), then slip through and ask LLM
-                # Or if llama/gptj, then just return since they had no response and can't go down below code path
-                # clear before return, since .then() never done if from API
-                clear_torch_cache()
-            return response
+            return out

    inputs_list_names = list(inspect.signature(evaluate).parameters)
    global inputs_kwargs_list
--- a/apps/language_models/langchain/gpt_langchain.py
+++ b/apps/language_models/langchain/gpt_langchain.py
@@ -968,7 +968,7 @@ def get_llm(
        # not built in prompt removal that is less general and not specific for our model
        pipe.task = "text2text-generation"

-        from exp_hf_pipelines import HuggingFacePipeline
+        from langchain.llms import HuggingFacePipeline

        llm = HuggingFacePipeline(pipeline=pipe)
    return llm, model_name, streamer, prompt_type
@@ -2554,22 +2554,7 @@ def _run_qa_db(
        )
        with context_class_cast(args.device):
            answer = chain()
-
-    if not use_context:
-        ret = answer["output_text"]
-        extra = ""
-        return ret, extra
-    elif answer is not None:
-        ret, extra = get_sources_answer(
-            query,
-            answer,
-            scores,
-            show_rank,
-            answer_with_sources,
-            verbose=verbose,
-        )
-        return ret, extra
-    return
+            return answer


 def get_similarity_chain(
--- a/apps/language_models/langchain/h2oai_pipeline.py
+++ b/apps/language_models/langchain/h2oai_pipeline.py
@@ -6,7 +6,8 @@ import torch_mlir
 from stopping import get_stopping
 from prompter import Prompter, PromptType

-from exp_hf_pipelines import TextGenerationPipeline, ReturnType
+from transformers import TextGenerationPipeline
+from transformers.pipelines.text_generation import ReturnType
 from transformers.generation import (
    GenerationConfig,
    LogitsProcessorList,
@@ -282,7 +283,215 @@ class H2OGPTSHARKModel(torch.nn.Module):
        return result


-h2ogpt_model = H2OGPTSHARKModel()
+def decode_tokens(tokenizer, res_tokens):
+    """Decode `res_tokens` to text, converting non-int entries in place."""
+    for i, token in enumerate(res_tokens):
+        if not isinstance(token, int):
+            res_tokens[i] = int(token[0])
+
+    return tokenizer.decode(res_tokens, skip_special_tokens=True)
+
+
+def generate_token(h2ogpt_shark_model, model, tokenizer, **generate_kwargs):
+    """Sample tokens one at a time, yielding the decoded text so far.
+
+    `generate_kwargs` needs `input_ids` and `attention_mask`; `max_time`
+    is dropped. The final text is yielded once more after the loop.
+    """
+    generate_kwargs.pop("max_time", None)
+    generate_kwargs["input_ids"] = generate_kwargs["input_ids"].to(
+        device=tensor_device
+    )
+    generate_kwargs["attention_mask"] = generate_kwargs["attention_mask"].to(
+        device=tensor_device
+    )
+    stopping_criteria = generate_kwargs.get("stopping_criteria")
+
+    generation_config_ = GenerationConfig.from_model_config(model.config)
+    generation_config = copy.deepcopy(generation_config_)
+    model_kwargs = generation_config.update(**generate_kwargs)
+
+    logits_processor = LogitsProcessorList()
+    stopping_criteria = (
+        stopping_criteria
+        if stopping_criteria is not None
+        else StoppingCriteriaList()
+    )
+
+    eos_token_id = generation_config.eos_token_id
+    generation_config.pad_token_id = eos_token_id
+
+    (
+        inputs_tensor,
+        model_input_name,
+        model_kwargs,
+    ) = model._prepare_model_inputs(
+        None, generation_config.bos_token_id, model_kwargs
+    )
+
+    model_kwargs["output_attentions"] = generation_config.output_attentions
+    model_kwargs[
+        "output_hidden_states"
+    ] = generation_config.output_hidden_states
+    model_kwargs["use_cache"] = generation_config.use_cache
+
+    input_ids = (
+        inputs_tensor
+        if model_input_name == "input_ids"
+        else model_kwargs.pop("input_ids")
+    )
+
+    input_ids_seq_length = input_ids.shape[-1]
+
+    generation_config.max_length = (
+        generation_config.max_new_tokens + input_ids_seq_length
+    )
+
+    logits_processor = model._get_logits_processor(
+        generation_config=generation_config,
+        input_ids_seq_length=input_ids_seq_length,
+        encoder_input_ids=inputs_tensor,
+        prefix_allowed_tokens_fn=None,
+        logits_processor=logits_processor,
+    )
+
+    stopping_criteria = model._get_stopping_criteria(
+        generation_config=generation_config,
+        stopping_criteria=stopping_criteria,
+    )
+
+    logits_warper = model._get_logits_warper(generation_config)
+
+    (
+        input_ids,
+        model_kwargs,
+    ) = model._expand_inputs_for_generation(
+        input_ids=input_ids,
+        expand_size=generation_config.num_return_sequences,  # 1
+        is_encoder_decoder=model.config.is_encoder_decoder,  # False
+        **model_kwargs,
+    )
+
+    if isinstance(eos_token_id, int):
+        eos_token_id = [eos_token_id]
+    eos_token_id_tensor = (
+        torch.tensor(eos_token_id).to(device=tensor_device)
+        if eos_token_id is not None
+        else None
+    )
+
+    pad_token_id = generation_config.pad_token_id
+
+    output_scores = generation_config.output_scores  # False
+    return_dict_in_generate = (
+        generation_config.return_dict_in_generate  # False
+    )
+
+    # init attention / hidden states / scores tuples
+    scores = () if (return_dict_in_generate and output_scores) else None
+
+    # keep track of which sequences are already finished
+    unfinished_sequences = torch.ones(
+        input_ids.shape[0],
+        dtype=torch.long,
+        device=input_ids.device,
+    )
+
+    timesRan = 0
+    import time
+
+    start = time.time()
+    print("\n")
+    res_tokens = []
+    while True:
+        model_inputs = model.prepare_inputs_for_generation(
+            input_ids, **model_kwargs
+        )
+        outputs = h2ogpt_shark_model.forward(
+            model_inputs["input_ids"], model_inputs["attention_mask"]
+        )
+
+        if args.precision == "fp16":
+            outputs = outputs.to(dtype=torch.float32)
+        next_token_logits = outputs
+
+        # pre-process distribution
+        next_token_scores = logits_processor(input_ids, next_token_logits)
+        next_token_scores = logits_warper(input_ids, next_token_scores)
+
+        # sample
+        probs = torch.nn.functional.softmax(next_token_scores, dim=-1)
+        next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
+
+        # finished sentences should have their next token be a padding token
+        if eos_token_id is not None:
+            if pad_token_id is None:
+                raise ValueError(
+                    "If `eos_token_id` is defined, make sure that `pad_token_id` is defined."
+                )
+            next_token = next_token * unfinished_sequences + pad_token_id * (
+                1 - unfinished_sequences
+            )
+
+        input_ids = torch.cat([input_ids, next_token[:, None]], dim=-1)
+
+        model_kwargs["past_key_values"] = None
+        if "attention_mask" in model_kwargs:
+            attention_mask = model_kwargs["attention_mask"]
+            model_kwargs["attention_mask"] = torch.cat(
+                [
+                    attention_mask,
+                    attention_mask.new_ones((attention_mask.shape[0], 1)),
+                ],
+                dim=-1,
+            )
+
+        # drop the oldest position so the window length stays unchanged
+        input_ids = input_ids[:, 1:]
+        model_kwargs["attention_mask"] = model_kwargs["attention_mask"][:, 1:]
+
+        new_word = tokenizer.decode(
+            next_token.cpu().numpy(),
+            add_special_tokens=False,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=True,
+        )
+
+        res_tokens.append(next_token)
+        if new_word == "<0x0A>":
+            print("\n", end="", flush=True)
+        else:
+            print(f"{new_word}", end=" ", flush=True)
+
+        part_str = decode_tokens(tokenizer, res_tokens)
+        yield part_str
+
+        # if eos_token was found in one sentence, set sentence to finished
+        if eos_token_id_tensor is not None:
+            unfinished_sequences = unfinished_sequences.mul(
+                next_token.tile(eos_token_id_tensor.shape[0], 1)
+                .ne(eos_token_id_tensor.unsqueeze(1))
+                .prod(dim=0)
+            )
+        # stop when every sequence is finished or a stopping criterion fires
+        if unfinished_sequences.max() == 0 or stopping_criteria(
+            input_ids, scores
+        ):
+            break
+        timesRan = timesRan + 1
+
+    end = time.time()
+    print(
+        "\n\nTime taken is {:.2f} seconds/token\n".format(
+            (end - start) / max(timesRan, 1)  # timesRan is 0 on 1-token runs
+        )
+    )
+
+    torch.cuda.empty_cache()
+    gc.collect()
+
+    res_str = decode_tokens(tokenizer, res_tokens)
+    yield res_str


 def pad_or_truncate_inputs(
@@ -495,233 +704,6 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
            )
        return records

-    def generate_new_token(self):
-        model_inputs = self.model.prepare_inputs_for_generation(
-            self.input_ids, **self.model_kwargs
-        )
-
-        outputs = h2ogpt_model.forward(
-            model_inputs["input_ids"], model_inputs["attention_mask"]
-        )
-
-        if args.precision == "fp16":
-            outputs = outputs.to(dtype=torch.float32)
-        next_token_logits = outputs
-
-        # pre-process distribution
-        next_token_scores = self.logits_processor(
-            self.input_ids, next_token_logits
-        )
-        next_token_scores = self.logits_warper(
-            self.input_ids, next_token_scores
-        )
-
-        # sample
-        probs = torch.nn.functional.softmax(next_token_scores, dim=-1)
-
-        next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
-
-        # finished sentences should have their next token be a padding token
-        if self.eos_token_id is not None:
-            if self.pad_token_id is None:
-                raise ValueError(
-                    "If `eos_token_id` is defined, make sure that `pad_token_id` is defined."
-                )
-            next_token = (
-                next_token * self.unfinished_sequences
-                + self.pad_token_id * (1 - self.unfinished_sequences)
-            )
-
-        self.input_ids = torch.cat(
-            [self.input_ids, next_token[:, None]], dim=-1
-        )
-
-        self.model_kwargs["past_key_values"] = None
-        if "attention_mask" in self.model_kwargs:
-            attention_mask = self.model_kwargs["attention_mask"]
-            self.model_kwargs["attention_mask"] = torch.cat(
-                [
-                    attention_mask,
-                    attention_mask.new_ones((attention_mask.shape[0], 1)),
-                ],
-                dim=-1,
-            )
-
-        self.truncated_input_ids.append(self.input_ids[:, 0])
-        self.input_ids = self.input_ids[:, 1:]
-        self.model_kwargs["attention_mask"] = self.model_kwargs[
-            "attention_mask"
-        ][:, 1:]
-
-        return next_token
-
-    def generate_token(self, **generate_kwargs):
-        del generate_kwargs["max_time"]
-        self.truncated_input_ids = []
-
-        generation_config_ = GenerationConfig.from_model_config(
-            self.model.config
-        )
-        generation_config = copy.deepcopy(generation_config_)
-        self.model_kwargs = generation_config.update(**generate_kwargs)
-
-        logits_processor = LogitsProcessorList()
-        self.stopping_criteria = (
-            self.stopping_criteria
-            if self.stopping_criteria is not None
-            else StoppingCriteriaList()
-        )
-
-        eos_token_id = generation_config.eos_token_id
-        generation_config.pad_token_id = eos_token_id
-
-        (
-            inputs_tensor,
-            model_input_name,
-            self.model_kwargs,
-        ) = self.model._prepare_model_inputs(
-            None, generation_config.bos_token_id, self.model_kwargs
-        )
-        batch_size = inputs_tensor.shape[0]
-
-        self.model_kwargs[
-            "output_attentions"
-        ] = generation_config.output_attentions
-        self.model_kwargs[
-            "output_hidden_states"
-        ] = generation_config.output_hidden_states
-        self.model_kwargs["use_cache"] = generation_config.use_cache
-
-        self.input_ids = (
-            inputs_tensor
-            if model_input_name == "input_ids"
-            else self.model_kwargs.pop("input_ids")
-        )
-
-        input_ids_seq_length = self.input_ids.shape[-1]
-
-        generation_config.max_length = (
-            generation_config.max_new_tokens + input_ids_seq_length
-        )
-
-        self.logits_processor = self.model._get_logits_processor(
-            generation_config=generation_config,
-            input_ids_seq_length=input_ids_seq_length,
-            encoder_input_ids=inputs_tensor,
-            prefix_allowed_tokens_fn=None,
-            logits_processor=logits_processor,
-        )
-
-        self.stopping_criteria = self.model._get_stopping_criteria(
-            generation_config=generation_config,
-            stopping_criteria=self.stopping_criteria,
-        )
-
-        self.logits_warper = self.model._get_logits_warper(generation_config)
-
-        (
-            self.input_ids,
-            self.model_kwargs,
-        ) = self.model._expand_inputs_for_generation(
-            input_ids=self.input_ids,
-            expand_size=generation_config.num_return_sequences,  # 1
-            is_encoder_decoder=self.model.config.is_encoder_decoder,  # False
-            **self.model_kwargs,
-        )
-
-        if isinstance(eos_token_id, int):
-            eos_token_id = [eos_token_id]
-        self.eos_token_id_tensor = (
-            torch.tensor(eos_token_id).to(device=tensor_device)
-            if eos_token_id is not None
-            else None
-        )
-
-        self.pad_token_id = generation_config.pad_token_id
-        self.eos_token_id = eos_token_id
-
-        output_scores = generation_config.output_scores  # False
-        output_attentions = generation_config.output_attentions  # False
-        output_hidden_states = generation_config.output_hidden_states  # False
-        return_dict_in_generate = (
-            generation_config.return_dict_in_generate  # False
-        )
-
-        # init attention / hidden states / scores tuples
-        self.scores = (
-            () if (return_dict_in_generate and output_scores) else None
-        )
-        decoder_attentions = (
-            () if (return_dict_in_generate and output_attentions) else None
-        )
-        cross_attentions = (
-            () if (return_dict_in_generate and output_attentions) else None
-        )
-        decoder_hidden_states = (
-            () if (return_dict_in_generate and output_hidden_states) else None
-        )
-
-        # keep track of which sequences are already finished
-        self.unfinished_sequences = torch.ones(
-            self.input_ids.shape[0],
-            dtype=torch.long,
-            device=self.input_ids.device,
-        )
-
-        timesRan = 0
-        import time
-
-        start = time.time()
-        print("\n")
-
-        while True:
-            next_token = self.generate_new_token()
-            new_word = self.tokenizer.decode(
-                next_token.cpu().numpy(),
-                add_special_tokens=False,
-                skip_special_tokens=True,
-                clean_up_tokenization_spaces=True,
-            )
-
-            print(f"{new_word}", end="", flush=True)
-
-            # if eos_token was found in one sentence, set sentence to finished
-            if self.eos_token_id_tensor is not None:
-                self.unfinished_sequences = self.unfinished_sequences.mul(
-                    next_token.tile(self.eos_token_id_tensor.shape[0], 1)
-                    .ne(self.eos_token_id_tensor.unsqueeze(1))
-                    .prod(dim=0)
-                )
-                # stop when each sentence is finished
-                if (
-                    self.unfinished_sequences.max() == 0
-                    or self.stopping_criteria(self.input_ids, self.scores)
-                ):
-                    break
-            timesRan = timesRan + 1
-
-        end = time.time()
-        print(
-            "\n\nTime taken is {:.2f} seconds/token\n".format(
-                (end - start) / timesRan
-            )
-        )
-
-        self.input_ids = torch.cat(
-            [
-                torch.tensor(self.truncated_input_ids)
-                .to(device=tensor_device)
-                .unsqueeze(dim=0),
-                self.input_ids,
-            ],
-            dim=-1,
-        )
-
-        torch.cuda.empty_cache()
-        gc.collect()
-
-        return self.input_ids
-
    def _forward(self, model_inputs, **generate_kwargs):
        if self.can_stop:
            stopping_criteria = get_stopping(
@@ -781,19 +763,13 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
        input_ids, attention_mask = pad_or_truncate_inputs(
            input_ids, attention_mask, max_padding_length=max_padding_length
        )
-        self.stopping_criteria = generate_kwargs["stopping_criteria"]

-        generated_sequence = self.generate_token(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            **generate_kwargs,
-        )
-        out_b = generated_sequence.shape[0]
-        generated_sequence = generated_sequence.reshape(
-            in_b, out_b // in_b, *generated_sequence.shape[1:]
-        )
-        return {
-            "generated_sequence": generated_sequence,
+        return_dict = {
+            "model": self.model,
+            "tokenizer": self.tokenizer,
            "input_ids": input_ids,
-            "prompt_text": prompt_text,
+            "attention_mask": attention_mask,
+            "attention_mask": attention_mask,
        }
+        return_dict = {**return_dict, **generate_kwargs}
+        return return_dict
--- a/apps/stable_diffusion/web/index.py
+++ b/apps/stable_diffusion/web/index.py
@@ -249,9 +249,9 @@ if __name__ == "__main__":
                    ]
                )
            # with gr.TabItem(label="DocuChat Upload", id=11):
-            #    h2ogpt_upload.render()
+            #     h2ogpt_upload.render()
            # with gr.TabItem(label="DocuChat(Experimental)", id=12):
-            #    h2ogpt_web.render()
+            #     h2ogpt_web.render()

        # send to buttons
        register_button_click(
--- a/apps/stable_diffusion/web/ui/h2ogpt.py
+++ b/apps/stable_diffusion/web/ui/h2ogpt.py
@@ -37,8 +37,15 @@ start_message = """

 def create_prompt(history):
    system_message = start_message
+    for item in history:
+        print("His item: ", item)

-    conversation = "".join(["".join([item[0], item[1]]) for item in history])
+    conversation = "<|endoftext|>".join(
+        [
+            "<|endoftext|><|answer|>".join([item[0], item[1]])
+            for item in history
+        ]
+    )

    msg = system_message + conversation
    msg = msg.strip()
@@ -48,10 +55,12 @@ def create_prompt(history):
 def chat(curr_system_message, history, device, precision):
    args.run_docuchat_web = True
    global h2ogpt_model
+    global sharkModel
    global h2ogpt_tokenizer
    global model_state
    global langchain
    global userpath_selector
+    from apps.language_models.langchain.h2oai_pipeline import generate_token

    if h2ogpt_model == 0:
        if "cuda" in device:
@@ -106,9 +115,14 @@ def chat(curr_system_message, history, device, precision):
            prompt_type=None,
            prompt_dict=None,
        )
+        from apps.language_models.langchain.h2oai_pipeline import (
+            H2OGPTSHARKModel,
+        )
+
+        sharkModel = H2OGPTSHARKModel()

    prompt = create_prompt(history)
-    output = langchain.evaluate(
+    output_dict = langchain.evaluate(
        model_state=model_state,
        my_db_state=None,
        instruction=prompt,
@@ -168,7 +182,11 @@ def chat(curr_system_message, history, device, precision):
        model_lock=True,
        user_path=userpath_selector.value,
    )
-    history[-1][1] = output["response"]
+
+    output = generate_token(sharkModel, **output_dict)
+    for partial_text in output:
+        history[-1][1] = partial_text
+        yield history
    return history