Depth Anything: update conversion script for V2 by pcuenca · Pull Request #31522 · huggingface/transformers

Hi @LiheYoung, thanks for checking!

Yes, I could replicate exactly the results from the small version of the model, applying the same inputs to both the original and the transformers implementations. The reference implementation I used was the one from your demo Space. I saved the depth output from the second image example (the sunflowers) as a numpy array, and verified transformers inference with the following code:

from transformers import AutoModelForDepthEstimation, AutoProcessor
from PIL import Image
import torch
import torch.nn.functional as F
import numpy as np
from torchvision.transforms import Compose

# Copied from source code
from depth_anything_transform import *

# Load the converted DA2-small checkpoint and its processor from the Hub.
model_id = "pcuenq/Depth-Anything-V2-Small-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForDepthEstimation.from_pretrained(model_id).eval()  # eval(): disable dropout etc. for inference

# Test image: the sunflowers example (demo02) from the original demo Space.
image = Image.open("space/Depth-Anything-V2/examples/demo02.jpg")
# PIL reports (width, height); kept to resize the predicted depth back later.
w, h = image.size

# Manually pre-process to match the original source code
# The transformers pre-processor produces slightly different values for some reason

# NOTE(review): Resize / NormalizeImage / PrepareForNet come from the star
# import of `depth_anything_transform` above; `cv2` is never imported
# explicitly here — presumably it also leaks in via that star import. Verify,
# or add an explicit `import cv2`.
transform = Compose([
    Resize(
        width=518,
        height=518,
        resize_target=False,
        keep_aspect_ratio=True,
        ensure_multiple_of=14,          # ViT patch size: dims must be multiples of 14
        resize_method='lower_bound',
        image_interpolation_method=cv2.INTER_CUBIC,
    ),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])
# Scale to [0, 1] before the transform, as the original pipeline expects.
pixel_values = np.array(image) / 255.0
pixel_values = transform({'image': pixel_values})['image']
# Add a batch dimension: (C, H, W) -> (1, C, H, W).
pixel_values = torch.from_numpy(pixel_values).unsqueeze(0)

# Run the model twice: once on the manually pre-processed tensor (matching the
# original DA2 pipeline) and once on the transformers processor's output, so
# the two pre-processing paths can be compared against the same reference.
with torch.inference_mode():
    # DA2 processor
    outputs = model(pixel_values=pixel_values, output_hidden_states=False)

    # Transformers Processor
    inputs = processor(images=image, return_tensors="pt")
    outputs_transformers = model(**inputs, output_hidden_states=False)

# Compare with results from the same image obtained with https://huggingface.co/spaces/depth-anything/Depth-Anything-V2
def compare_with_reference(outputs, reference_depth, filename, size=None):
    """Compare a predicted depth map against a reference and save a grayscale render.

    Args:
        outputs: model output mapping containing ``"predicted_depth"``, a
            float tensor of shape ``(1, H, W)``.
        reference_depth: reference depth as a numpy array of shape ``size``.
        filename: path to write the normalized grayscale depth PNG; pass a
            falsy value (``None``/``""``) to skip saving.
        size: ``(height, width)`` to interpolate the prediction to before
            comparing. Defaults to the module-level ``(h, w)`` of the input
            image for backward compatibility.

    Returns:
        ``(max_diff, mean_diff)`` of the absolute differences, as floats.
    """
    target_size = (h, w) if size is None else size
    depth = outputs["predicted_depth"]
    # (1, H, W) -> (1, 1, H, W) for interpolate, then back to (h, w).
    depth = F.interpolate(depth[:, None], target_size, mode="bilinear", align_corners=True)[0, 0]

    # Work in numpy from here on; compute the absolute-difference map once.
    diff = np.abs(depth.numpy() - reference_depth)
    max_diff = float(diff.max())
    mean_diff = float(diff.mean())
    print(f"Sum of absolute differences vs baseline: {diff.sum()}")
    # Generic label: this function is called for both pre-processing paths.
    print(f"Difference vs reference, max: {max_diff}, mean: {mean_diff}")

    # Normalize to [0, 255] and save as an 8-bit grayscale image for visual inspection.
    depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
    depth = depth.numpy().astype(np.uint8)
    if filename:
        Image.fromarray(depth).save(filename)

    return max_diff, mean_diff

# Reference depth saved from the original demo Space (numpy array of shape (h, w)).
reference_depth = np.load("space/Depth-Anything-V2/depth_gradio.npy")
# Manual (original-style) pre-processing vs. reference.
compare_with_reference(outputs, reference_depth, "gray_depth.png")
# Transformers processor pre-processing vs. the same reference.
compare_with_reference(outputs_transformers, reference_depth, "gray_depth_transformers.png")

Results are identical when the same pre-processing steps are used, but differ when using the transformers pre-processor. I assume most of the difference comes from the resampling algorithms (the original code uses OpenCV, while transformers uses PIL). I also assume (but didn't verify) that the same processor differences affect the v1 version as well.

cc @NielsRogge in case he has additional insight