Skip to content

Commit

Permalink
No image works
Browse files Browse the repository at this point in the history
  • Loading branch information
iucario committed Jul 1, 2022
1 parent 206bee7 commit beb8abd
Show file tree
Hide file tree
Showing 13 changed files with 323 additions and 269 deletions.
3 changes: 0 additions & 3 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
[submodule "mmaction2"]
path = mmaction2
url = https://github.com/iucario/mmaction2
18 changes: 10 additions & 8 deletions docker/dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-devel
FROM pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel

# Remove any third-party apt sources to avoid issues with expiring keys.
RUN rm -f /etc/apt/sources.list.d/*.list
Expand All @@ -13,12 +13,12 @@ RUN apt-get update && apt-get install -y \
libx11-6 \
&& rm -rf /var/lib/apt/lists/*

RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y
RUN apt-get clean && \
rm -rf /var/lib/apt

ENV TZ=Japan/Tokyo
RUN sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime
# Timezone
RUN apt-get update && \
apt-get install -yq tzdata && \
dpkg-reconfigure -f noninteractive tzdata &&\
rm /etc/localtime && \
ln -fs /usr/share/zoneinfo/Asia/Tokyo /etc/localtime

# Create a working directory
RUN mkdir /app
Expand Down Expand Up @@ -65,7 +65,9 @@ RUN pip install pytorch_lightning \
fvcore \
timm \
scikit-learn \
scipy
scipy \
pandas \
matplotlib

# Set the default command to python3
CMD ["python3"]
1 change: 0 additions & 1 deletion run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,3 @@ docker run --rm -it \
--user="$(id -u):$(id -g)" \
--volume="$PWD:/app" \
work:latest python3 tmp/multi_gpu.py
# --ipc=host \
9 changes: 9 additions & 0 deletions start.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Start an interactive container for this project:
#   - all host GPUs exposed, 4 GB shared memory (DataLoader workers need it)
#   - runs as the invoking user so files created in /app keep your ownership
#   - mounts the current directory at /app and drops into bash
# NOTE(review): no --rm, so the container persists under the name "ani";
# rerunning requires `docker rm ani` first — confirm that is intentional.
docker run -it \
--gpus=all \
--shm-size=4gb \
--user="$(id -u):$(id -g)" \
--volume="$PWD:/app" \
-w /app \
--entrypoint bash \
--name ani \
anibali/pytorch:1.10.2-cuda11.3
84 changes: 84 additions & 0 deletions tmp/cnn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import os

# Debugging/visibility knobs for NCCL, torch.distributed and CUDA.
# Set via os.environ before torch is imported so the runtimes pick them up.
os.environ['NCCL_DEBUG'] = 'INFO'
os.environ['NCCL_DEBUG_SUBSYS'] = 'ENV'
os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'DETAIL'
# NOTE(review): 'LOC' presumably restricts NCCL peer-to-peer to co-located
# devices — confirm against the NCCL environment-variable docs.
os.environ['NCCL_P2P_LEVEL'] = 'LOC'
os.environ['CUDA_LAUNCH_BLOCKING'] = '0'  # async kernel launches (default)
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'  # expose only the first two GPUs

from typing import Optional
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import random_split, DataLoader, Subset
from torchvision import transforms as T
from torchvision.datasets import CIFAR10, MNIST
from torchvision.models import resnet50


def train_epoch(model, batch, loss_fn, optimizer, device):
    """Run one optimisation step on a single mini-batch.

    Despite the name, this processes one batch, not a full epoch.
    Returns a tuple ``(loss_value, num_correct_predictions)``.
    """
    model.train()
    optimizer.zero_grad()
    inputs, targets = batch
    inputs = inputs.to(device)
    targets = targets.to(device)
    logits = model(inputs)
    loss = loss_fn(logits, targets)
    loss.backward()
    optimizer.step()
    num_correct = (logits.argmax(dim=1) == targets).sum().item()
    return loss.item(), num_correct


def val_epoch(model, batch, loss_fn, device):
    """Evaluate a single mini-batch without gradient tracking.

    Despite the name, this processes one batch, not a full epoch.
    Returns a tuple ``(loss_value, num_correct_predictions)``.
    """
    model.eval()
    with torch.no_grad():
        inputs, targets = batch
        inputs = inputs.to(device)
        targets = targets.to(device)
        logits = model(inputs)
        batch_loss = loss_fn(logits, targets)
        num_correct = (logits.argmax(dim=1) == targets).sum().item()
    return batch_loss.item(), num_correct


def main():
    """Single-GPU sanity check: fine-tune ResNet-50 on a 500-image subset of
    CIFAR-10 and report per-epoch train/validation loss and accuracy.
    """
    device = 'cuda'

    # FIX: CIFAR10's second positional parameter is the boolean `train` flag;
    # the original passed the string 'train', which only worked because any
    # non-empty string is truthy.
    dataset = CIFAR10('./data',
                      train=True,
                      download=True,
                      transform=T.Compose([
                          T.ToTensor(),
                          T.Resize(size=(224, 224)),
                      ]))
    # Fixed small subset so an epoch finishes quickly.
    train_set, val_set = random_split(Subset(dataset, range(500)), [400, 100])
    train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=16, shuffle=False)
    print(dataset[0][0].shape)

    model = resnet50(pretrained=True)
    model = model.to(device)

    # Smoke-test the forward pass before training.
    y = model(torch.randn(1, 3, 224, 224).to(device))
    print(y.shape)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(10):
        # FIX: the original printed only the LAST batch's loss under the
        # "Train loss"/"Val loss" labels; report the mean over the epoch.
        train_loss_sum = 0.0
        train_correct = 0
        for batch in train_loader:
            loss, correct = train_epoch(model, batch, loss_fn, optimizer, device)
            train_loss_sum += loss
            train_correct += correct
        train_acc = train_correct / len(train_loader.dataset)
        print(f"Train loss: {train_loss_sum / len(train_loader):.4f}, Train acc: {train_acc:.4f}")

        val_loss_sum = 0.0
        val_correct = 0
        for batch in val_loader:
            loss, correct = val_epoch(model, batch, loss_fn, device)
            val_loss_sum += loss
            val_correct += correct
        val_acc = val_correct / len(val_loader.dataset)
        print(f"Val loss: {val_loss_sum / len(val_loader):.4f}, Val acc: {val_acc:.4f}")


if __name__ == '__main__':
    main()
31 changes: 20 additions & 11 deletions tmp/cpu.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
from typing import Optional
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import random_split, DataLoader, Subset
from torchvision import transforms as T
from torchvision.datasets import MNIST
from torchvision.datasets import CIFAR10, MNIST
from torchvision.models import resnet50


class DataParallelModel(nn.Module):
Expand Down Expand Up @@ -35,7 +38,7 @@ def train_epoch(model, batch, loss_fn, optimizer, device):
model.train()
optimizer.zero_grad()
x, y = batch
x = x.view(x.size(0), -1).to(device)
x = x.to(device)
y = y.to(device)
z = model(x)
loss = loss_fn(z, y)
Expand All @@ -48,29 +51,35 @@ def val_epoch(model, batch, loss_fn, device):
model.eval()
with torch.no_grad():
x, y = batch
x = x.view(x.size(0), -1).to(device)
x = x.to(device)
y = y.to(device)
z = model(x)
loss = loss_fn(z, y)
return loss.item(), (z.argmax(dim=1) == y).sum().item()


def main():
device = 'cpu'

dataset = MNIST('./data',
'train',
download=True,
transform=T.Compose([T.ToTensor(),
T.Normalize((0.1307,), (0.3081,))]))
device = 'cuda:1'

dataset = CIFAR10('./data',
'train',
download=True,
transform=T.Compose([
T.ToTensor(),
T.Resize(size=(224, 224)),
]))
train_set, val_set = random_split(Subset(dataset, range(500)), [400, 100])
train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
val_loader = DataLoader(val_set, batch_size=16, shuffle=False)
print(dataset[0][0].shape)

model = Model(28*28, 10)
# model = Model(28 * 28, 10)
model = resnet50(pretrained=True)
model = model.to(device)

y = model(torch.randn(1, 3, 224, 224).to(device))
print(y.shape)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

Expand Down
30 changes: 18 additions & 12 deletions tmp/ddp.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,24 +74,30 @@ def forward(self, x):
return self.model(x)


class Net(nn.Module):
    """ResNet-18 with its final fully-connected layer replaced by a
    ``num_class``-way classifier head."""

    def __init__(self, num_class: int = 10):
        super(Net, self).__init__()
        # pretrained=True loads ImageNet weights (downloaded on first use).
        fx = resnet18(pretrained=True)
        # resnet18's penultimate feature width is 512; swap in a new head.
        fx.fc = nn.Linear(512, num_class)
        self.model = fx

    def forward(self, x):
        return self.model(x)


def example(rank, world_size):
# create default process group
dist.init_process_group("gloo", rank=rank, world_size=world_size)
# create local model
# model = resnet18(weights='ResNet18_Weights.IMAGENET1K_V1').to(rank)
model = CNN(1000).to(rank)
# construct DDP model
model = Net(10).to(rank)
ddp_model = DDP(model, device_ids=[rank])
# define loss function and optimizer

loss_fn = nn.MSELoss()
optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

# forward pass
outputs = ddp_model(torch.randn(10, 3, 224, 224).to(rank))
labels = torch.randn(10, 1000).to(rank)
# backward pass
outputs = ddp_model(torch.randn(8, 3, 224, 224).to(rank))
labels = torch.randn(8, 10).to(rank)
loss_fn(outputs, labels).backward()
# update parameters
optimizer.step()


Expand All @@ -100,9 +106,9 @@ def main():
print("We have available ", torch.cuda.device_count(), "GPUs! Using ", world_size,
" GPUs")

y = CNN(1000)(torch.randn(1, 3, 224, 224))
y = Net(10)(torch.randn(1, 3, 224, 224))
print(y.shape)

mp.spawn(example, args=(world_size,), nprocs=world_size, join=True)


Expand Down
80 changes: 80 additions & 0 deletions tmp/dp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import os

# Debugging/visibility knobs for NCCL, torch.distributed and CUDA.
# Set via os.environ before torch is imported so the runtimes pick them up.
os.environ['NCCL_DEBUG'] = 'INFO'
os.environ['NCCL_DEBUG_SUBSYS'] = 'ENV'
os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'DETAIL'
# NOTE(review): 'LOC' presumably restricts NCCL peer-to-peer to co-located
# devices — confirm against the NCCL environment-variable docs.
os.environ['NCCL_P2P_LEVEL'] = 'LOC'
os.environ['CUDA_LAUNCH_BLOCKING'] = '0'  # async kernel launches (default)
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'  # expose only the first two GPUs

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms as T
# NOTE(review): DataLoader is imported twice (here and above).
from torch.utils.data import random_split, DataLoader, Subset
from torchvision.datasets import CIFAR10, MNIST
from torchvision.models import resnet18

# Surface autograd errors at the op that produced them (debug aid; slows
# training).
torch.autograd.set_detect_anomaly(True)


class Net(nn.Module):
    """ResNet-18 backbone whose classifier head is replaced with a fresh
    ``num_class``-way linear layer."""

    def __init__(self, num_class: int = 10):
        super().__init__()
        backbone = resnet18(pretrained=True)
        backbone.fc = nn.Linear(512, num_class)
        self.model = backbone

    def forward(self, x):
        return self.model(x)


def train(model, device, train_loader, optimizer):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = F.cross_entropy(output, target)
loss.backward()
optimizer.step()
print(f'{batch_idx}/{len(train_loader)}, loss={loss.item():.4f}')


def main():
    """DataParallel sanity check: fine-tune ResNet-18 for 3 epochs on a
    200-image subset of CIFAR-10, using every visible GPU.
    """
    device = 'cuda'

    # FIX: CIFAR10's second positional parameter is the boolean `train` flag;
    # the original passed the string 'train', which only worked because any
    # non-empty string is truthy.
    dataset = CIFAR10('./data',
                      train=True,
                      download=True,
                      transform=T.Compose([
                          T.ToTensor(),
                          T.Resize(size=(224, 224)),
                          # ImageNet normalisation, matching the pretrained
                          # backbone's training statistics.
                          T.Normalize(mean=[0.485, 0.456, 0.406],
                                      std=[0.229, 0.224, 0.225]),
                      ]))
    # Fixed small subset so an epoch finishes quickly.
    train_set = Subset(dataset, range(200))
    train_loader = DataLoader(train_set, batch_size=20, shuffle=True, num_workers=1)
    print(dataset[0][0].shape)

    model = Net(num_class=10)
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model).to(device)

    # Smoke-test the forward pass before training.
    y = model(torch.randn(1, 3, 224, 224).to(device))
    print(y.shape)

    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
    print('start training')
    for epoch in range(3):
        train(model, device, train_loader, optimizer)

    print('Done')


if __name__ == '__main__':
    main()
Loading

0 comments on commit beb8abd

Please sign in to comment.