sample/predict.py

import os
import subprocess
from typing import Any, List, Optional
from argparse import Namespace

import torch
from cog import BasePredictor, Input, Path, BaseModel

import data_loaders.humanml.utils.paramUtil as paramUtil
from data_loaders.get_data import get_dataset_loader
from data_loaders.humanml.scripts.motion_process import recover_from_ric
from data_loaders.humanml.utils.plot_script import plot_3d_motion
from data_loaders.tensors import collate
from model.cfg_sampler import ClassifierFreeSampleModel
from utils import dist_util
from utils.model_util import create_model_and_diffusion, load_model_wo_clip
from visualize.motions2hik import motions2hik
from sample.generate import construct_template_variables

"""
In case of matplotlib issues, it may be necessary to delete lines 89~92 of
"model/data_loaders/humanml/utils/plot_script.py", as suggested in
https://github.com/GuyTevet/motion-diffusion-model/issues/6
"""


class ModelOutput(BaseModel):
    """Result returned by ``Predictor.predict``.

    Only one of the two fields is populated per prediction, depending on the
    requested ``output_format``; the other one stays ``None``.
    """

    # Explicit ``None`` defaults keep both fields optional at construction
    # time regardless of how the underlying BaseModel treats ``Optional``
    # annotations that have no default value.

    # Output of ``motions2hik`` when ``output_format == 'json_file'``.
    json_file: Optional[Any] = None
    # One rendered animation file per repetition when
    # ``output_format == 'animation'``.
    animation: Optional[List[Path]] = None


def get_args():
    """Return the fixed settings used by the predictor as a ``Namespace``.

    Every value is hard-coded; nothing is read from the command line. The
    resulting namespace is stored on the predictor in ``Predictor.setup`` and
    handed to ``create_model_and_diffusion``.
    """
    # Values read directly in this file (frame rate, checkpoint location,
    # guidance scale, dataset name, unconstrained flag).
    runtime_settings = {
        'fps': 20,
        'model_path': './save/humanml_trans_enc_512/model000200000.pt',
        'guidance_param': 2.5,
        'unconstrained': False,
        'dataset': 'humanml',
    }

    # Presumably model-architecture options consumed by
    # create_model_and_diffusion -- confirm against utils.model_util.
    network_settings = {
        'cond_mask_prob': 1,
        'emb_trans_dec': False,
        'latent_dim': 512,
        'layers': 8,
        'arch': 'trans_enc',
    }

    # Presumably diffusion / loss-weight options consumed by
    # create_model_and_diffusion -- confirm against utils.model_util.
    diffusion_settings = {
        'noise_schedule': 'cosine',
        'sigma_small': True,
        'lambda_vel': 0.0,
        'lambda_rcxyz': 0.0,
        'lambda_fc': 0.0,
    }

    return Namespace(**runtime_settings, **network_settings, **diffusion_settings)


class Predictor(BasePredictor):
    """Cog predictor that generates human motion from a text prompt.

    ``setup`` builds the dataset loader, model and diffusion objects once.
    ``predict`` samples ``num_repetitions`` motions for a prompt and returns
    them either as rendered animation files or as JSON-style animation data.
    """

    def setup(self):
        """Load the dataset, model and checkpoint reused by every prediction.

        Populates ``self.args``, ``self.num_frames``, ``self.data``,
        ``self.model`` and ``self.diffusion``.
        """
        # Copy the bundled CLIP weights into /root/.cache/clip (presumably so
        # CLIP picks them up from its cache instead of downloading -- confirm).
        # ``-p`` creates missing parent directories and does not complain when
        # the directory already exists. Both commands stay best-effort: their
        # exit status is intentionally not checked.
        subprocess.run(["mkdir", "-p", "/root/.cache/clip"])
        subprocess.run(["cp", "-r", "ViT-B-32.pt", "/root/.cache/clip"])

        self.args = get_args()
        # Every generated motion is six seconds long at the configured fps.
        self.num_frames = self.args.fps * 6
        print('Loading dataset...')

        # temporary data
        self.data = get_dataset_loader(name=self.args.dataset,
                                       batch_size=1,
                                       num_frames=196,
                                       split='test',
                                       hml_mode='text_only')

        self.data.fixed_length = float(self.num_frames)

        print("Creating model and diffusion...")
        self.model, self.diffusion = create_model_and_diffusion(self.args, self.data)

        # Report which checkpoint is actually being loaded.
        print(f"Loading checkpoints from [{self.args.model_path}]...")
        state_dict = torch.load(self.args.model_path, map_location='cpu')
        load_model_wo_clip(self.model, state_dict)

        if self.args.guidance_param != 1:
            # wrapping model with the classifier-free sampler
            self.model = ClassifierFreeSampleModel(self.model)
        self.model.to(dist_util.dev())
        self.model.eval()  # disable random masking

    def predict(
            self,
            prompt: str = Input(default="the person walked forward and is picking up his toolbox."),
            num_repetitions: int = Input(default=3, description="How many"),
            output_format: str = Input(
                description='Choose the format of the output, either an animation or a json file of the animation data.\
                The json format is: {"thetas": [...], "root_translation": [...], "joint_map": [...]}, where "thetas" \
                is an [nframes x njoints x 3] array of joint rotations in degrees, "root_translation" is an [nframes x 3] \
                array of (X, Y, Z) positions of the root, and "joint_map" is a list mapping the SMPL joint index to the\
                corresponding HumanIK joint name',
                default="animation",
                choices=["animation", "json_file"],
            ),
    ) -> ModelOutput:
        """Sample motions for ``prompt`` and return them in the chosen format.

        Args:
            prompt: Text description used to condition the model.
            num_repetitions: Number of motions sampled for the prompt.
            output_format: ``"animation"`` renders one file per repetition;
                ``"json_file"`` returns the result of ``motions2hik``.

        Returns:
            A ``ModelOutput`` with either ``animation`` or ``json_file`` set.
        """
        args = self.args
        args.num_repetitions = int(num_repetitions)

        # The loader is rebuilt on every call with the requested batch size;
        # below, only its dataset's ``inv_transform`` is used.
        self.data = get_dataset_loader(name=self.args.dataset,
                                       batch_size=args.num_repetitions,
                                       num_frames=self.num_frames,
                                       split='test',
                                       hml_mode='text_only')

        # NOTE(review): the conditioning batch holds a single entry while the
        # sample batch below has ``num_repetitions`` entries -- this appears to
        # rely on broadcasting inside the model; confirm.
        collate_args = [{'inp': torch.zeros(self.num_frames), 'tokens': None, 'lengths': self.num_frames,
                         'text': str(prompt)}]
        _, model_kwargs = collate(collate_args)

        # add CFG scale to batch
        if args.guidance_param != 1:
            model_kwargs['y']['scale'] = torch.ones(args.num_repetitions, device=dist_util.dev()) * args.guidance_param

        sample_fn = self.diffusion.p_sample_loop
        sample = sample_fn(
            self.model,
            (args.num_repetitions, self.model.njoints, self.model.nfeats, self.num_frames),
            clip_denoised=False,
            model_kwargs=model_kwargs,
            skip_timesteps=0,  # 0 is the default value - i.e. don't skip any step
            init_image=None,
            progress=True,
            dump_steps=None,
            noise=None,
            const_noise=False,
        )

        # Recover XYZ *positions* from HumanML3D vector representation
        if self.model.data_rep == 'hml_vec':
            n_joints = 22 if sample.shape[1] == 263 else 21
            sample = self.data.dataset.t2m_dataset.inv_transform(sample.cpu().permute(0, 2, 3, 1)).float()
            sample = recover_from_ric(sample, n_joints)
            sample = sample.view(-1, *sample.shape[2:]).permute(0, 2, 3, 1)

        # 'xyz' and 'hml_vec' samples are handed to rot2xyz as 'xyz' without a
        # mask; any other representation keeps its name and uses the batch mask.
        rot2xyz_pose_rep = 'xyz' if self.model.data_rep in ['xyz', 'hml_vec'] else self.model.data_rep
        rot2xyz_mask = None if rot2xyz_pose_rep == 'xyz' else model_kwargs['y']['mask'].reshape(args.num_repetitions,
                                                                                                self.num_frames).bool()
        sample = self.model.rot2xyz(x=sample, mask=rot2xyz_mask, pose_rep=rot2xyz_pose_rep, glob=True, translation=True,
                                    jointstype='smpl', vertstrans=True, betas=None, beta=0, glob_rot=None,
                                    get_rotations_back=False)

        all_motions = sample.cpu().numpy()

        if output_format == 'json_file':
            data_dict = motions2hik(all_motions)
            return ModelOutput(json_file=data_dict)

        caption = str(prompt)

        skeleton = paramUtil.t2m_kinematic_chain

        # Only the per-sample print and file templates are needed here.
        sample_print_template, _, _, sample_file_template, _, _ = construct_template_variables(
            args.unconstrained)

        replicate_fnames = []
        for rep_i in range(args.num_repetitions):
            motion = all_motions[rep_i].transpose(2, 0, 1)[:self.num_frames]
            # There is a single prompt, so the sample index is fixed to 1.
            save_file = sample_file_template.format(1, rep_i)
            print(sample_print_template.format(caption, 1, rep_i, save_file))
            plot_3d_motion(save_file, skeleton, motion, dataset=args.dataset, title=caption, fps=args.fps)
            # Credit for visualization: https://github.com/EricGuo5513/text-to-motion

            replicate_fnames.append(Path(save_file))

        return ModelOutput(animation=replicate_fnames)