Exclude prompt from generated response (Lightning-AI#1485)
rasbt authored Jun 12, 2024
1 parent 8f65463 commit c0f7686
Showing 4 changed files with 10 additions and 5 deletions.
3 changes: 2 additions & 1 deletion litgpt/deploy/serve.py
```diff
@@ -113,7 +113,8 @@ def predict(self, inputs: torch.Tensor) -> Any:
             temperature=self.temperature,
             top_k=self.top_k,
             top_p=self.top_p,
-            eos_id=self.tokenizer.eos_id
+            eos_id=self.tokenizer.eos_id,
+            include_prompt=False
         )

         for block in self.model.transformer.h:
```
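With `include_prompt=False`, the `/predict` endpoint now returns only the model's continuation instead of echoing the prompt. A minimal client sketch of the new behavior (the host and port are assumptions for a server started locally with `litgpt serve`; adjust them to your setup):

```python
import requests

# Assumes a LitGPT server is running locally, e.g. started via `litgpt serve`;
# the URL below is an assumption and may differ in your setup.
response = requests.post(
    "http://127.0.0.1:8000/predict",
    json={"prompt": "Fix typos in the following sentence: Exampel input"},
)

# After this change, "output" contains only the generated completion,
# not the prompt text itself.
print(response.json()["output"])
```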
7 changes: 6 additions & 1 deletion litgpt/generate/base.py
```diff
@@ -86,6 +86,7 @@ def generate(
     top_k: Optional[int] = None,
     top_p: float = 1.0,
     eos_id: Optional[int] = None,
+    include_prompt: bool = True,
 ) -> torch.Tensor:
     """
     Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
@@ -112,6 +113,7 @@
             For more details, see https://arxiv.org/abs/1904.09751
             or https://huyenchip.com/2024/01/16/sampling.html#top_p
         eos_id: If specified, stop generating any more token once the <eos> token is triggered.
+        include_prompt: If true (default) prepends the prompt (after applying the prompt style) to the output.
     """
     T = prompt.size(0)
     assert max_returned_tokens > T
@@ -122,7 +124,10 @@
         raise NotImplementedError(f"max_seq_length {model.max_seq_length} needs to be >= {max_returned_tokens - 1}")

     device = prompt.device
-    tokens = [prompt]
+    if include_prompt:
+        tokens = [prompt]
+    else:
+        tokens = []
     input_pos = torch.tensor([T], device=device)
     token = next_token(
         model, torch.arange(0, T, device=device), prompt.view(1, -1), temperature=temperature, top_k=top_k, top_p=top_p
```
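For reference, a minimal sketch of what the new flag changes for callers of `generate` (it assumes `model` is a litgpt `GPT` with its KV cache already set up and `tokenizer` is the matching litgpt `Tokenizer`; neither is constructed here):

```python
from litgpt.generate.base import generate

# Sketch only: `model` (a litgpt GPT with its KV cache initialized) and
# `tokenizer` (the matching litgpt Tokenizer) are assumed to exist.
device = next(model.parameters()).device
prompt_ids = tokenizer.encode("Hello world", device=device)

# Default behavior: the returned tensor starts with the prompt tokens.
full = generate(model, prompt_ids, max_returned_tokens=50)

# New behavior used by the server: only the generated continuation.
completion = generate(model, prompt_ids, max_returned_tokens=50, include_prompt=False)
print(tokenizer.decode(completion))
```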
2 changes: 1 addition & 1 deletion tests/test_serve.py
```diff
@@ -39,4 +39,4 @@ def test_simple(tmp_path):
     response = client.post("/predict", json={"prompt": "Hello world"})
     # Model is a small random model, not trained, hence the gibberish.
     # We are just testing that the server works.
-    assert response.json()["output"][:19] == "Hello world statues"
+    assert response.json()["output"][:19] == " statues CAD pierci"
```
3 changes: 1 addition & 2 deletions tutorials/deploy.md
````diff
@@ -44,6 +44,5 @@ print(response.json()["output"])
 Executing the code above prints the following output:

 ```
-Instruct: Fix typos in the following sentence: Exampel input
-Output: Example input.
+Example input.
 ```
````
