@@ -25,6 +25,7 @@
 import os
 import sys
 import tempfile
+import time
 import traceback
 from collections import OrderedDict
 from pprint import pformat

@@ -103,6 +104,7 @@
     set_saved_weights_in_checkpoint_flag,
 )
 from ludwig.utils.print_utils import print_boxed
+from ludwig.utils.tokenizers import HFTokenizer
 from ludwig.utils.torch_utils import DEVICE
 from ludwig.utils.trainer_utils import get_training_report
 from ludwig.utils.types import DataFrame, TorchDevice

@@ -332,6 +334,27 @@ def __init__(
         # online training state
         self._online_trainer = None

+        # Zero-shot LLM usage.
+        if (
+            self.config_obj.model_type == MODEL_LLM
+            and self.config_obj.trainer.type == "none"
+            # Category output features require a vocabulary, so for those the LLM LudwigModel should be
+            # initialized with model.train(dataset) instead.
+            and self.config_obj.output_features[0].type == "text"
+        ):
+            self._initialize_llm()
+
+    def _initialize_llm(self, random_seed: int = default_random_seed):
+        """Initialize the LLM.
+
+        Should only be used in a zero-shot (NoneTrainer) setting.
+        """
+        self.model = LudwigModel.create_model(self.config_obj, random_seed=random_seed)
+
+        if self.model.model.device.type == "cpu":
+            logger.warning(f"LLM was initialized on {self.model.model.device}. Moving to GPU for inference.")
+            self.model.model.to(torch.device("cuda"))
+
     def train(
         self,
         dataset: Optional[Union[str, dict, pd.DataFrame]] = None,
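For reference, a minimal sketch (not part of this PR) of how the zero-shot path above might be exercised. The model name, feature names, and config values are illustrative assumptions, not taken from the diff:

```python
from ludwig.api import LudwigModel

# Hypothetical zero-shot LLM config: no trainer, text-to-text features.
config = {
    "model_type": "llm",
    "base_model": "facebook/opt-350m",  # illustrative; any decoder-only HF model
    "input_features": [{"name": "prompt", "type": "text"}],
    "output_features": [{"name": "answer", "type": "text"}],
    "trainer": {"type": "none"},  # zero-shot: no fine-tuning step
}

# Because trainer.type == "none" and the first output feature is text, __init__
# takes the _initialize_llm() path above and moves the model to GPU if it was
# created on CPU.
model = LudwigModel(config)
```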
@@ -891,6 +914,53 @@ def _tune_batch_size(self, trainer, dataset, random_seed: int = default_random_seed):
         trainer.eval_batch_size = self.config_obj.trainer.eval_batch_size
         trainer.gradient_accumulation_steps = self.config_obj.trainer.gradient_accumulation_steps

+    def generate(
+        self,
+        input_strings: Union[str, List[str]],
+        generation_config: Optional[dict] = None,
+    ) -> Union[str, List[str]]:
+        """A simple generate() method that directly uses the underlying transformers library to generate text."""
+        if self.config_obj.model_type != MODEL_LLM:
+            raise ValueError(
+                f"Model type {self.config_obj.model_type} is not supported by this method. Only `llm` model type is "
+                "supported."
+            )
+        if not torch.cuda.is_available() or torch.cuda.device_count() == 0:
+            # A GPU is generally advisable for working with LLMs and is required for loading quantized models, see
+            # https://github.com/ludwig-ai/ludwig/issues/3695.
+            raise ValueError("GPU is not available.")
+
+        # TODO(Justin): Decide if it's worth folding padding_side handling into llm.py's tokenizer initialization.
+        # For batch inference with models like facebook/opt-350m, if the tokenizer padding side is off, HF prints a
+        # warning, e.g.:
+        # "A decoder-only architecture is being used, but right-padding was detected! For correct generation results,
+        # please set `padding_side='left'` when initializing the tokenizer."
+        if not self.model.model.config.is_encoder_decoder:
+            padding_side = "left"
+        else:
+            padding_side = "right"
+        tokenizer = HFTokenizer(self.config_obj.base_model, padding_side=padding_side)
+
+        with self.model.use_generation_config(generation_config):
+            start_time = time.time()
+            inputs = tokenizer.tokenizer(input_strings, return_tensors="pt", padding=True)
+            input_ids = inputs["input_ids"].to("cuda")
+            attention_mask = inputs["attention_mask"].to("cuda")
+            with torch.no_grad():
+                outputs = self.model.model.generate(
+                    input_ids=input_ids,
+                    attention_mask=attention_mask,
+                    # NOTE: self.model.model.generation_config is not used here because it is the default
+                    # generation config that the CausalLM was initialized with, rather than the one set within the
+                    # context manager.
+                    generation_config=self.model.generation,
+                )
+            decoded_outputs = tokenizer.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        logger.info(f"Finished generating in: {(time.time() - start_time):.2f}s.")
+        if len(decoded_outputs) == 1:
+            return decoded_outputs[0]
+        return decoded_outputs
+
     def predict(
         self,
         dataset: Optional[Union[str, dict, pd.DataFrame]] = None,
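A hedged usage sketch of the new generate() entry point; the prompts and generation parameters below are illustrative examples, not values from the PR:

```python
# Illustrative call pattern for the generate() method added above; assumes `model`
# is an `llm`-type LudwigModel (see the earlier sketch) and that a CUDA GPU is
# available, since generate() raises a ValueError otherwise.
single = model.generate("What is the capital of France?")
print(single)

batch = model.generate(
    [
        "Translate to French: good morning",
        "Summarize: Ludwig is a declarative machine learning framework.",
    ],
    # Applied via the model's generation-config context manager; keys follow
    # Hugging Face GenerationConfig (values here are arbitrary examples).
    generation_config={"max_new_tokens": 64, "temperature": 0.1, "do_sample": True},
)
for text in batch:
    print(text)
```

A single input string returns a single decoded string; a list of strings returns a list of decoded outputs in the same order.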
@@ -946,6 +1016,7 @@ def predict(
         self._check_initialization()

         # preprocessing
+        start_time = time.time()
         logger.debug("Preprocessing")
         dataset, _ = preprocess_for_prediction(  # TODO (Connor): Refactor to use self.config_obj
             self.config_obj.to_dict(),

@@ -992,6 +1063,7 @@ def predict(

         logger.info(f"Saved to: {output_directory}")

+        logger.info(f"Finished predicting in: {(time.time() - start_time):.2f}s.")
         return converted_postproc_predictions, output_directory

     def evaluate(