[Serve] Add doc for model composition (ray-project#8871)
Co-authored-by: Edward Oakes <[email protected]>
simon-mo and edoakes authored Jun 10, 2020
1 parent 0ba7472 commit cf53b35
Showing 3 changed files with 34 additions and 2 deletions.
22 changes: 22 additions & 0 deletions doc/source/serve/advanced.rst
@@ -185,6 +185,28 @@ The shard key can either be specified via the X-SERVE-SHARD-KEY HTTP header or `
handle = serve.get_handle("api_endpoint")
handle.options(shard_key=session_id).remote(args)
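For the HTTP path, the header-based variant can be sketched in plain Python. The URL and ``session_id`` value below are hypothetical, and the request is only constructed, not sent:

```python
from urllib.request import Request

session_id = "user-123"  # hypothetical shard key value

# Build (but don't send) a request carrying the shard key header.
req = Request(
    "http://127.0.0.1:8000/api_endpoint",  # hypothetical local Serve address
    headers={"X-SERVE-SHARD-KEY": session_id},
)

# urllib normalizes header names to capitalized form internally.
print(req.get_header("X-serve-shard-key"))  # → user-123
```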
Composing Multiple Models
=========================
Ray Serve supports composing individually scalable models into a single model
out of the box. For instance, you can combine multiple models to perform
stacking or ensembles.

To define a higher-level composed model you need to do three things:

1. Define your underlying models (the ones that you will compose together) as
   Ray Serve backends.
2. Define your composed model, using the handles of the underlying models
   (see the example below).
3. Define an endpoint representing this composed model and query it!

In order to avoid blocking in the composed model (synchronous calls to the
underlying models would be very slow), you'll need to make the method
asynchronous by using ``async def``. You'll see this in the example below.
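The role of ``async def`` here can be illustrated without Ray at all. Below is a minimal ``asyncio`` sketch of the same pattern, where the two hypothetical async functions stand in for Serve handles:

```python
import asyncio
from random import random


async def model_one(data):
    # Stand-in for a handle call: pretend to score the input.
    await asyncio.sleep(0)  # yield control, as awaiting a remote call would
    return random()


async def model_two(data):
    await asyncio.sleep(0)
    return "model_two: " + data


async def composed(data):
    # Because these calls are awaited, the event loop can serve other
    # requests while this one waits on the underlying models.
    score = await model_one(data)
    if score > 0.5:
        return await model_two(data)
    return data


result = asyncio.run(composed("hello"))
print(result)
```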

That's it. Let's take a look at an example:

.. literalinclude:: ../../../python/ray/serve/examples/doc/snippet_model_composition.py


.. _serve-faq:

1 change: 1 addition & 0 deletions doc/source/serve/key-concepts.rst
@@ -11,6 +11,7 @@ To follow along, you'll need to make the necessary imports.
from ray import serve
serve.init() # Initializes Ray and Ray Serve.
.. _`serve-backend`:

Backends
========
13 changes: 11 additions & 2 deletions python/ray/serve/examples/doc/snippet_model_composition.py
@@ -1,11 +1,17 @@
from random import random

import requests

from ray import serve

serve.init()

# Our pipeline will be structured as follows:
# - Input comes in, the composed model sends it to model_one
# - model_one outputs a random number between 0 and 1, if the value is
# greater than 0.5, then the data is sent to model_two
# - otherwise, the data is returned to the user.

# Let's define two models that just print out the data they received.


def model_one(_unused_flask_request, data=None):
print("Model 1 called with data ", data)
@@ -22,6 +28,7 @@ def __init__(self):
self.model_one = serve.get_handle("model_one")
self.model_two = serve.get_handle("model_two")

# This method can be called concurrently!
async def __call__(self, flask_request):
data = flask_request.data

@@ -41,6 +48,8 @@ async def __call__(self, flask_request):
serve.create_backend("model_two", model_two)
serve.create_endpoint("model_two", backend="model_two")

# max_concurrent_queries is optional. By default, if you pass in an async
# function, Ray Serve sets the limit to a high number.
serve.create_backend(
"composed_backend", ComposedModel, config={"max_concurrent_queries": 10})
serve.create_endpoint(
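The effect of ``max_concurrent_queries`` is roughly that of a semaphore around the backend. As a loose analogy in plain ``asyncio`` (the names below are illustrative, not Serve APIs), capping concurrency at 10 looks like:

```python
import asyncio

MAX_CONCURRENT_QUERIES = 10  # analogous to the config value above
in_flight = 0
peak = 0


async def handle_query(semaphore, i):
    global in_flight, peak
    async with semaphore:  # at most 10 queries execute at once
        in_flight += 1
        peak = max(peak, in_flight)
        await asyncio.sleep(0.001)  # pretend to do some work
        in_flight -= 1


async def main():
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_QUERIES)
    await asyncio.gather(*(handle_query(semaphore, i) for i in range(50)))


asyncio.run(main())
print(peak)  # never exceeds MAX_CONCURRENT_QUERIES
```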
