[train] Update quickstart example to use dataloader (ray-project#33050)
To simplify the CPU/GPU transition, Ray Train provides the ray.train.torch.prepare_data_loader utility function. Prior to this change, the example did not handle device placement for the data, so it would fail when run with use_gpu=True.

---------

Signed-off-by: Matthew Deng <[email protected]>
matthewdeng authored Mar 7, 2023
1 parent 552bbcf commit d2b855f
Showing 4 changed files with 65 additions and 55 deletions.
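
For readers new to the utility: the sketch below approximates what prepare_data_loader and prepare_model take care of inside each worker. It is an illustration only, not the actual implementation; the helper name manual_prepare is hypothetical, and it assumes the worker's torch.distributed process group is already initialized (Ray Train sets that up before the training function runs).

```python
# Rough, illustrative equivalent of ray.train.torch.prepare_data_loader /
# prepare_model inside a single worker. Not the real implementation; the
# helper name `manual_prepare` is hypothetical.
import torch
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, DistributedSampler


def manual_prepare(dataset, model, batch_size=64):
    # Assumes torch.distributed is already initialized for this worker
    # (Ray Train sets up the process group before calling the train function).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Shard the dataset so each worker iterates over a distinct subset.
    sampler = DistributedSampler(dataset)
    dataloader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)

    # Move the model to this worker's device and wrap it for gradient sync.
    model = DistributedDataParallel(model.to(device))

    # prepare_data_loader also moves each batch to `device` automatically;
    # done by hand, every batch would need an explicit .to(device) in the loop.
    return dataloader, model, device
```

With the prepared dataloader, batches already arrive on the worker's device, which is why the updated training loop in the diff below needs no explicit .to(device) calls and works for both use_gpu=False and use_gpu=True.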
8 changes: 6 additions & 2 deletions doc/source/ray-overview/getting-started.md
@@ -221,6 +221,7 @@ This training function can be executed with:
:language: python
:start-after: __torch_single_run_begin__
:end-before: __torch_single_run_end__
:dedent: 0
```
Now let's convert this to a distributed multi-worker training function!
@@ -237,13 +238,14 @@ and place it on the right device, and add ``DistributedSampler`` to your DataLoa
:end-before: __torch_distributed_end__
```
Then, instantiate a ``Trainer`` that uses a ``"torch"`` backend
Then, instantiate a ``TorchTrainer``
with 4 workers, and use it to run the new training function!
```{literalinclude} /../../python/ray/train/examples/pytorch/torch_quick_start.py
:language: python
:start-after: __torch_trainer_begin__
:end-before: __torch_trainer_end__
:dedent: 0
```
````
@@ -274,6 +276,7 @@ This training function can be executed with:
:language: python
:start-after: __tf_single_run_begin__
:end-before: __tf_single_run_end__
:dedent: 0
```
Now let's convert this to a distributed multi-worker training function!
@@ -290,13 +293,14 @@ All you need to do is:
:end-before: __tf_distributed_end__
```
Then, instantiate a ``Trainer`` that uses a ``"tensorflow"`` backend
Then, instantiate a ``TensorflowTrainer``
with 4 workers, and use it to run the new training function!
```{literalinclude} /../../python/ray/train/examples/tf/tensorflow_quick_start.py
:language: python
:start-after: __tf_trainer_begin__
:end-before: __tf_trainer_end__
:dedent: 0
```
````
4 changes: 4 additions & 0 deletions doc/source/train/getting-started.rst
@@ -107,6 +107,7 @@ Here are examples for some of the commonly used trainers:
:language: python
:start-after: __torch_single_run_begin__
:end-before: __torch_single_run_end__
:dedent:

Now let's convert this to a distributed multi-worker training function!

@@ -128,6 +129,7 @@ Here are examples for some of the commonly used trainers:
:language: python
:start-after: __torch_trainer_begin__
:end-before: __torch_trainer_end__
:dedent:

See :ref:`train-porting-code` for a more comprehensive example.

@@ -156,6 +158,7 @@ Here are examples for some of the commonly used trainers:
:language: python
:start-after: __tf_single_run_begin__
:end-before: __tf_single_run_end__
:dedent:

Now let's convert this to a distributed multi-worker training function!
All you need to do is:
@@ -177,6 +180,7 @@ Here are examples for some of the commonly used trainers:
:language: python
:start-after: __tf_trainer_begin__
:end-before: __tf_trainer_end__
:dedent:

See :ref:`train-porting-code` for a more comprehensive example.

98 changes: 55 additions & 43 deletions python/ray/train/examples/pytorch/torch_quick_start.py
@@ -5,79 +5,93 @@
# __torch_setup_begin__
import torch
import torch.nn as nn

num_samples = 20
input_size = 10
layer_size = 15
output_size = 5
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

def get_dataset():
return datasets.FashionMNIST(
root="/tmp/data",
train=True,
download=True,
transform=ToTensor(),
)

class NeuralNetwork(nn.Module):
def __init__(self):
super(NeuralNetwork, self).__init__()
self.layer1 = nn.Linear(input_size, layer_size)
self.relu = nn.ReLU()
self.layer2 = nn.Linear(layer_size, output_size)

def forward(self, input):
return self.layer2(self.relu(self.layer1(input)))

# In this example we use a randomly generated dataset.
input = torch.randn(num_samples, input_size)
labels = torch.randn(num_samples, output_size)

super().__init__()
self.flatten = nn.Flatten()
self.linear_relu_stack = nn.Sequential(
nn.Linear(28 * 28, 512),
nn.ReLU(),
nn.Linear(512, 512),
nn.ReLU(),
nn.Linear(512, 10),
)

def forward(self, inputs):
inputs = self.flatten(inputs)
logits = self.linear_relu_stack(inputs)
return logits
# __torch_setup_end__

# __torch_single_begin__

import torch.optim as optim

def train_func():
num_epochs = 3
batch_size = 64

dataset = get_dataset()
dataloader = DataLoader(dataset, batch_size=batch_size)

model = NeuralNetwork()
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(num_epochs):
output = model(input)
loss = loss_fn(output, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
for inputs, labels in dataloader:
optimizer.zero_grad()
pred = model(inputs)
loss = criterion(pred, labels)
loss.backward()
optimizer.step()
print(f"epoch: {epoch}, loss: {loss.item()}")

# __torch_single_end__

# __torch_distributed_begin__

from ray import train

def train_func_distributed():
num_epochs = 3
batch_size = 64

dataset = get_dataset()
dataloader = DataLoader(dataset, batch_size=batch_size)
dataloader = train.torch.prepare_data_loader(dataloader)

model = NeuralNetwork()
model = train.torch.prepare_model(model)
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(num_epochs):
output = model(input)
loss = loss_fn(output, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
for inputs, labels in dataloader:
optimizer.zero_grad()
pred = model(inputs)
loss = criterion(pred, labels)
loss.backward()
optimizer.step()
print(f"epoch: {epoch}, loss: {loss.item()}")

# __torch_distributed_end__


if __name__ == "__main__":
# __torch_single_run_begin__

train_func()

# __torch_single_run_end__

# __torch_trainer_begin__

from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig

@@ -86,10 +100,8 @@ def train_func_distributed():

trainer = TorchTrainer(
train_func_distributed,
scaling_config=ScalingConfig(
num_workers=4, use_gpu=use_gpu)
scaling_config=ScalingConfig(num_workers=4, use_gpu=use_gpu)
)

results = trainer.fit()

# __torch_trainer_end__
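
For reference, a hedged note on the scaling settings used above: with use_gpu=True, each of the four workers is scheduled with one GPU by default. Resources can also be requested explicitly through ScalingConfig; the resources_per_worker field below refers to ray.air.config.ScalingConfig in the AIR releases around this commit, and the values are illustrative only.

```python
# Hedged sketch: explicitly requesting resources per worker.
# Treat the exact values as illustrative, not a recommendation.
from ray.air.config import ScalingConfig

scaling_config = ScalingConfig(
    num_workers=4,
    use_gpu=True,
    resources_per_worker={"CPU": 2, "GPU": 1},  # illustrative values
)
```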
10 changes: 0 additions & 10 deletions python/ray/train/examples/tf/tensorflow_quick_start.py
@@ -3,7 +3,6 @@
# isort: skip_file

# __tf_setup_begin__

import numpy as np
import tensorflow as tf

@@ -32,21 +31,17 @@ def build_and_compile_cnn_model():
optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
metrics=['accuracy'])
return model

# __tf_setup_end__

# __tf_single_begin__

def train_func():
batch_size = 64
single_worker_dataset = mnist_dataset(batch_size)
single_worker_model = build_and_compile_cnn_model()
single_worker_model.fit(single_worker_dataset, epochs=3, steps_per_epoch=70)

# __tf_single_end__

# __tf_distributed_begin__

import json
import os

@@ -66,18 +61,14 @@ def train_func_distributed():
multi_worker_model = build_and_compile_cnn_model()

multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70)

# __tf_distributed_end__

if __name__ == "__main__":
# __tf_single_run_begin__

train_func()

# __tf_single_run_end__

# __tf_trainer_begin__

from ray.train.tensorflow import TensorflowTrainer
from ray.air.config import ScalingConfig

@@ -87,5 +78,4 @@ def train_func_distributed():
trainer = TensorflowTrainer(train_func_distributed, scaling_config=ScalingConfig(num_workers=4, use_gpu=use_gpu))

trainer.fit()

# __tf_trainer_end__
