[train] Update quickstart example to use dataloader (ray-project#33050)
To simplify the CPU/GPU transition, Ray Train provides the ray.train.torch.prepare_data_loader utility function. Prior to this change, the example did not handle device placement for the data, so it would fail when run with use_gpu=True.

---------

Signed-off-by: Matthew Deng <[email protected]>
matthewdeng authored Mar 7, 2023
1 parent 552bbcf commit d2b855f
Showing 4 changed files with 65 additions and 55 deletions.
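
For readers new to the utility: the sketch below approximates what prepare_data_loader and prepare_model take care of inside each worker. It is an illustration only, not the actual implementation; the helper name manual_prepare is hypothetical, and it assumes the worker's torch.distributed process group is already initialized (Ray Train sets that up before the training function runs).

```python
# Rough, illustrative equivalent of ray.train.torch.prepare_data_loader /
# prepare_model inside a single worker. Not the real implementation; the
# helper name `manual_prepare` is hypothetical.
import torch
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, DistributedSampler


def manual_prepare(dataset, model, batch_size=64):
    # Assumes torch.distributed is already initialized for this worker
    # (Ray Train sets up the process group before calling the train function).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Shard the dataset so each worker iterates over a distinct subset.
    sampler = DistributedSampler(dataset)
    dataloader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)

    # Move the model to this worker's device and wrap it for gradient sync.
    model = DistributedDataParallel(model.to(device))

    # prepare_data_loader also moves each batch to `device` automatically;
    # done by hand, every batch would need an explicit .to(device) in the loop.
    return dataloader, model, device
```

With the prepared dataloader, batches already arrive on the worker's device, which is why the updated training loop in the diff below needs no explicit .to(device) calls and works for both use_gpu=False and use_gpu=True.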
8 changes: 6 additions & 2 deletions doc/source/ray-overview/getting-started.md
@@ -221,6 +221,7 @@ This training function can be executed with:
:language: python
:start-after: __torch_single_run_begin__
:end-before: __torch_single_run_end__
:dedent: 0
```
Now let's convert this to a distributed multi-worker training function!
@@ -237,13 +238,14 @@ and place it on the right device, and add ``DistributedSampler`` to your DataLoa
:end-before: __torch_distributed_end__
```
Then, instantiate a ``Trainer`` that uses a ``"torch"`` backend
Then, instantiate a ``TorchTrainer``
with 4 workers, and use it to run the new training function!
```{literalinclude} /../../python/ray/train/examples/pytorch/torch_quick_start.py
:language: python
:start-after: __torch_trainer_begin__
:end-before: __torch_trainer_end__
:dedent: 0
```
````
@@ -274,6 +276,7 @@ This training function can be executed with:
:language: python
:start-after: __tf_single_run_begin__
:end-before: __tf_single_run_end__
:dedent: 0
```
Now let's convert this to a distributed multi-worker training function!
@@ -290,13 +293,14 @@ All you need to do is:
:end-before: __tf_distributed_end__
```
Then, instantiate a ``Trainer`` that uses a ``"tensorflow"`` backend
Then, instantiate a ``TensorflowTrainer``
with 4 workers, and use it to run the new training function!
```{literalinclude} /../../python/ray/train/examples/tf/tensorflow_quick_start.py
:language: python
:start-after: __tf_trainer_begin__
:end-before: __tf_trainer_end__
:dedent: 0
```
````
4 changes: 4 additions & 0 deletions doc/source/train/getting-started.rst
@@ -107,6 +107,7 @@ Here are examples for some of the commonly used trainers:
:language: python
:start-after: __torch_single_run_begin__
:end-before: __torch_single_run_end__
:dedent:

Now let's convert this to a distributed multi-worker training function!

@@ -128,6 +129,7 @@ Here are examples for some of the commonly used trainers:
:language: python
:start-after: __torch_trainer_begin__
:end-before: __torch_trainer_end__
:dedent:

See :ref:`train-porting-code` for a more comprehensive example.

@@ -156,6 +158,7 @@ Here are examples for some of the commonly used trainers:
:language: python
:start-after: __tf_single_run_begin__
:end-before: __tf_single_run_end__
:dedent:

Now let's convert this to a distributed multi-worker training function!
All you need to do is:
@@ -177,6 +180,7 @@ Here are examples for some of the commonly used trainers:
:language: python
:start-after: __tf_trainer_begin__
:end-before: __tf_trainer_end__
:dedent:

See :ref:`train-porting-code` for a more comprehensive example.

98 changes: 55 additions & 43 deletions python/ray/train/examples/pytorch/torch_quick_start.py
@@ -5,79 +5,93 @@
# __torch_setup_begin__
import torch
import torch.nn as nn

num_samples = 20
input_size = 10
layer_size = 15
output_size = 5
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

def get_dataset():
return datasets.FashionMNIST(
root="/tmp/data",
train=True,
download=True,
transform=ToTensor(),
)

class NeuralNetwork(nn.Module):
def __init__(self):
super(NeuralNetwork, self).__init__()
self.layer1 = nn.Linear(input_size, layer_size)
self.relu = nn.ReLU()
self.layer2 = nn.Linear(layer_size, output_size)

def forward(self, input):
return self.layer2(self.relu(self.layer1(input)))

# In this example we use a randomly generated dataset.
input = torch.randn(num_samples, input_size)
labels = torch.randn(num_samples, output_size)

super().__init__()
self.flatten = nn.Flatten()
self.linear_relu_stack = nn.Sequential(
nn.Linear(28 * 28, 512),
nn.ReLU(),
nn.Linear(512, 512),
nn.ReLU(),
nn.Linear(512, 10),
)

def forward(self, inputs):
inputs = self.flatten(inputs)
logits = self.linear_relu_stack(inputs)
return logits
# __torch_setup_end__

# __torch_single_begin__

import torch.optim as optim

def train_func():
num_epochs = 3
batch_size = 64

dataset = get_dataset()
dataloader = DataLoader(dataset, batch_size=batch_size)

model = NeuralNetwork()
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(num_epochs):
output = model(input)
loss = loss_fn(output, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
for inputs, labels in dataloader:
optimizer.zero_grad()
pred = model(inputs)
loss = criterion(pred, labels)
loss.backward()
optimizer.step()
print(f"epoch: {epoch}, loss: {loss.item()}")

# __torch_single_end__

# __torch_distributed_begin__

from ray import train

def train_func_distributed():
num_epochs = 3
batch_size = 64

dataset = get_dataset()
dataloader = DataLoader(dataset, batch_size=batch_size)
dataloader = train.torch.prepare_data_loader(dataloader)

model = NeuralNetwork()
model = train.torch.prepare_model(model)
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(num_epochs):
output = model(input)
loss = loss_fn(output, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
for inputs, labels in dataloader:
optimizer.zero_grad()
pred = model(inputs)
loss = criterion(pred, labels)
loss.backward()
optimizer.step()
print(f"epoch: {epoch}, loss: {loss.item()}")

# __torch_distributed_end__


if __name__ == "__main__":
# __torch_single_run_begin__

train_func()

# __torch_single_run_end__

# __torch_trainer_begin__

from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig

@@ -86,10 +100,8 @@ def train_func_distributed():

trainer = TorchTrainer(
train_func_distributed,
scaling_config=ScalingConfig(
num_workers=4, use_gpu=use_gpu)
scaling_config=ScalingConfig(num_workers=4, use_gpu=use_gpu)
)

results = trainer.fit()

# __torch_trainer_end__
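
For reference, a hedged note on the scaling settings used above: with use_gpu=True, each of the four workers is scheduled with one GPU by default. Resources can also be requested explicitly through ScalingConfig; the resources_per_worker field below refers to ray.air.config.ScalingConfig in the AIR releases around this commit, and the values are illustrative only.

```python
# Hedged sketch: explicitly requesting resources per worker.
# Treat the exact values as illustrative, not a recommendation.
from ray.air.config import ScalingConfig

scaling_config = ScalingConfig(
    num_workers=4,
    use_gpu=True,
    resources_per_worker={"CPU": 2, "GPU": 1},  # illustrative values
)
```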
10 changes: 0 additions & 10 deletions python/ray/train/examples/tf/tensorflow_quick_start.py
@@ -3,7 +3,6 @@
# isort: skip_file

# __tf_setup_begin__

import numpy as np
import tensorflow as tf

@@ -32,21 +31,17 @@ def build_and_compile_cnn_model():
optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
metrics=['accuracy'])
return model

# __tf_setup_end__

# __tf_single_begin__

def train_func():
batch_size = 64
single_worker_dataset = mnist_dataset(batch_size)
single_worker_model = build_and_compile_cnn_model()
single_worker_model.fit(single_worker_dataset, epochs=3, steps_per_epoch=70)

# __tf_single_end__

# __tf_distributed_begin__

import json
import os

@@ -66,18 +61,14 @@ def train_func_distributed():
multi_worker_model = build_and_compile_cnn_model()

multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70)

# __tf_distributed_end__

if __name__ == "__main__":
# __tf_single_run_begin__

train_func()

# __tf_single_run_end__

# __tf_trainer_begin__

from ray.train.tensorflow import TensorflowTrainer
from ray.air.config import ScalingConfig

@@ -87,5 +78,4 @@ def train_func_distributed():
trainer = TensorflowTrainer(train_func_distributed, scaling_config=ScalingConfig(num_workers=4, use_gpu=use_gpu))

trainer.fit()

# __tf_trainer_end__
