Load models from Hugging Face
LiheYoung authored Jan 23, 2024
1 parent a61bb5a commit c3390b8
Showing 5 changed files with 45 additions and 35 deletions.
README.md: 13 additions & 9 deletions
@@ -59,22 +59,26 @@ We highlight the **best** and *second best* results in **bold** and *italic* res

## Pre-trained models

We provide three models of varying scales for robust relatve depth estimation:
We provide three models of varying scales for robust relative depth estimation:

- Depth-Anything-ViT-Small (24.8M)

- Depth-Anything-ViT-Base (97.5M)

- Depth-Anything-ViT-Large (335.3M)

Download our pre-trained models [here](https://huggingface.co/spaces/LiheYoung/Depth-Anything/tree/main/checkpoints), and put them under the ``checkpoints`` directory.
You can easily load our pre-trained models by:
```python
from depth_anything.dpt import DepthAnything

encoder = 'vits' # can also be 'vitb' or 'vitl'
depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{:}14'.format(encoder))
```
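
Continuing from the snippet above, a minimal sanity-check sketch; the 518x518 size and the dummy tensor are illustrative assumptions (real images should go through the repository's transforms, and the spatial size just needs to be a multiple of 14):

```python
import torch

depth_anything.eval()

# Dummy batch of one 3-channel image; 518 = 37 * 14 matches the ViT-14 patch size.
dummy = torch.randn(1, 3, 518, 518)
with torch.no_grad():
    depth = depth_anything(dummy)

print(depth.shape)  # expected to be a per-pixel relative depth map, e.g. (1, 518, 518)
```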

## Usage

### Installation

The setup is very simple. Just make sure ``torch``, ``torchvision``, and ``cv2`` are available in your environment.

```bash
git clone https://github.com/LiheYoung/Depth-Anything
cd Depth-Anything
```
@@ -84,13 +88,13 @@ pip install -r requirements.txt
### Running

```bash
python run.py --encoder <vits | vitb | vitl> --load-from <pretrained-model> --img-path <img-directory | single-img | txt-file> --outdir <outdir> --localhub
python run.py --encoder <vits | vitb | vitl> --img-path <img-directory | single-img | txt-file> --outdir <outdir>
```
For the ``img-path`` argument, you can 1) point it to an image directory containing all images of interest, 2) point it to a single image, or 3) point it to a text file listing the image paths.

For example:
```bash
python run.py --encoder vitl --load-from checkpoints/depth_anything_vitl14.pth --img-path demo_images --outdir depth_visualization --localhub
python run.py --encoder vitl --img-path demo_images --outdir depth_visualization
```


@@ -112,14 +116,14 @@ If you want to use Depth Anything in your own project, you can simply follow [``
<summary>Code snippet (note the difference between our data pre-processing and that of MiDaS)</summary>

```python
from depth_anything.dpt import DPT_DINOv2
from depth_anything.dpt import DepthAnything
from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet

import cv2
import torch
from torchvision.transforms import Compose

depth_anything = DPT_DINOv2(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024], localhub=True)
depth_anything.load_state_dict(torch.load('checkpoints/depth_anything_vitl14.pth'))
encoder = 'vits' # can also be 'vitb' or 'vitl'
depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{:}14'.format(encoder))

transform = Compose([
    Resize(
```
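
The hunk above is truncated inside the `Resize(...)` call. For reference, a sketch of how the rest of the snippet typically continues; the transform settings and the image path below are assumptions (the MiDaS-style defaults used elsewhere in this repository), not part of this diff:

```python
import cv2
import torch
from torchvision.transforms import Compose

from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet

transform = Compose([
    Resize(
        width=518,
        height=518,
        resize_target=False,
        keep_aspect_ratio=True,
        ensure_multiple_of=14,
        resize_method='lower_bound',
        image_interpolation_method=cv2.INTER_CUBIC,
    ),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

# 'your_image.jpg' is a placeholder path.
image = cv2.cvtColor(cv2.imread('your_image.jpg'), cv2.COLOR_BGR2RGB) / 255.0
image = transform({'image': image})['image']
image = torch.from_numpy(image).unsqueeze(0)

with torch.no_grad():
    depth = depth_anything(image)  # relative depth prediction, shape (1, H, W)
```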
app.py: 3 additions & 5 deletions
@@ -9,7 +9,7 @@
import tempfile
from gradio_imageslider import ImageSlider

from depth_anything.dpt import DPT_DINOv2
from depth_anything.dpt import DepthAnything
from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet

css = """
@@ -24,8 +24,7 @@
}
"""
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model = DPT_DINOv2(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024]).to(DEVICE).eval()
model.load_state_dict(torch.load('checkpoints/depth_anything_vitl14.pth'))
model = DepthAnything.from_pretrained('LiheYoung/depth_anything_vitl14').to(DEVICE).eval()

title = "# Depth Anything"
description = """Official demo for **Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data**.
@@ -49,7 +48,6 @@
def predict_depth(model, image):
    return model(image)


with gr.Blocks(css=css) as demo:
    gr.Markdown(title)
    gr.Markdown(description)
@@ -93,4 +91,4 @@ def on_submit(image):


if __name__ == '__main__':
    demo.queue().launch()
    demo.queue().launch()
depth_anything/dpt.py: 21 additions & 4 deletions
@@ -1,8 +1,10 @@
import argparse
import torch
import torch.nn as nn

from .blocks import FeatureFusionBlock, _make_scratch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin, hf_hub_download

from depth_anything.blocks import FeatureFusionBlock, _make_scratch


def _make_fusion_block(features, use_bn, size = None):
@@ -164,7 +166,22 @@ def forward(self, x):
        return depth.squeeze(1)


class DepthAnything(DPT_DINOv2, PyTorchModelHubMixin):
    def __init__(self, config):
        super().__init__(**config)


if __name__ == '__main__':
    depth_anything = DPT_DINOv2()
    depth_anything.load_state_dict(torch.load('checkpoints/depth_anything_dinov2_vitl14.pth'))
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--encoder",
        default="vits",
        type=str,
        choices=["vits", "vitb", "vitl"],
    )
    args = parser.parse_args()

    model = DepthAnything.from_pretrained("LiheYoung/depth_anything_{:}14".format(args.encoder))

    print(model)
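
For context on the new `DepthAnything` class: `PyTorchModelHubMixin` is what supplies `from_pretrained`/`save_pretrained`. Broadly, it stores the weights alongside a `config.json`, and on loading it passes that config dict back into `__init__` before restoring the state dict. A rough, self-contained sketch of the same pattern with a hypothetical toy module (class name, directory, and config keys are illustrative, and the exact files written can vary across `huggingface_hub` versions):

```python
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin


class TinyRegressor(nn.Module, PyTorchModelHubMixin):
    # Hypothetical toy model: same pattern as DepthAnything above,
    # taking a plain config dict and unpacking it for construction.
    def __init__(self, config):
        super().__init__()
        self.proj = nn.Linear(config['in_dim'], 1)

    def forward(self, x):
        return self.proj(x)


model = TinyRegressor({'in_dim': 384})

# Saving with an explicit config writes the weights plus a config.json;
# from_pretrained later recreates the class with that config and loads the weights.
model.save_pretrained('tiny-regressor', config={'in_dim': 384})
reloaded = TinyRegressor.from_pretrained('tiny-regressor')
```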

requirements.txt: 2 additions & 1 deletion
@@ -2,4 +2,5 @@ gradio_imageslider
gradio==4.14.0
torch
torchvision
opencv-python
opencv-python
huggingface_hub
run.py: 6 additions & 16 deletions
@@ -7,18 +7,15 @@
from torchvision.transforms import Compose
from tqdm import tqdm

from depth_anything.dpt import DPT_DINOv2
from depth_anything.dpt import DepthAnything
from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--img-path', type=str)
    parser.add_argument('--outdir', type=str, default='./vis_depth')

    parser.add_argument('--encoder', type=str, default='vitl')
    parser.add_argument('--load-from', type=str, required=True)
    parser.add_argument('--localhub', dest='localhub', action='store_true', default=False)
    parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl'])

    args = parser.parse_args()

@@ -29,19 +26,13 @@
    font_scale = 1
    font_thickness = 2

    assert args.encoder in ['vits', 'vitb', 'vitl']
    if args.encoder == 'vits':
        depth_anything = DPT_DINOv2(encoder='vits', features=64, out_channels=[48, 96, 192, 384], localhub=args.localhub).cuda()
    elif args.encoder == 'vitb':
        depth_anything = DPT_DINOv2(encoder='vitb', features=128, out_channels=[96, 192, 384, 768], localhub=args.localhub).cuda()
    else:
        depth_anything = DPT_DINOv2(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024], localhub=args.localhub).cuda()
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

    depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{}14'.format(args.encoder)).to(DEVICE)

    total_params = sum(param.numel() for param in depth_anything.parameters())
    print('Total parameters: {:.2f}M'.format(total_params / 1e6))

    depth_anything.load_state_dict(torch.load(args.load_from, map_location='cpu'), strict=True)

    depth_anything.eval()

    transform = Compose([
@@ -76,7 +67,7 @@
        h, w = image.shape[:2]

        image = transform({'image': image})['image']
        image = torch.from_numpy(image).unsqueeze(0).cuda()
        image = torch.from_numpy(image).unsqueeze(0).to(DEVICE)

        with torch.no_grad():
            depth = depth_anything(image)
@@ -109,4 +100,3 @@
        final_result = cv2.vconcat([caption_space, combined_results])

        cv2.imwrite(os.path.join(args.outdir, filename[:filename.find('.')] + '_img_depth.png'), final_result)
