# collect_hf.py

import os
import argparse
import datetime
from tqdm import tqdm, trange

import numpy as np

import torch
from transformers import BitsAndBytesConfig

from hf import init_model, clear_cuda, format_prompts, multiple_inference


# Natural-language descriptions of the process a sequence is sampled from.
# In the code visible in this file only `source_coin` is used at runtime
# (formatted and passed as the 'source' prompt argument in the 'gen' run).
source_non = "a non-random algorithmic process"
source_fair = "a fair coin, with 50% probability of Heads and 50% probability of Tails"
# Two positional slots: percent Heads first, then percent Tails.
source_coin = "a weighted coin, with {}% probability of Heads and {}% probability of Tails"

# Prompt templates.
#
# Both use a "Q: ... / A: ..." layout; the 'Q: ' and 'A: ' markers are swapped
# for chat markers at runtime when the model id contains 'tulu-2'.

# Generation prompt: `{source}` describes the sampling process and `{flips}`
# seeds the answer. The template stops right after `{flips}`, inside the
# still-open '[' of the answer list.
p_gen = (
    'Q: Generate a sequence of 1000 random samples from {source}.\n'
    '\n'
    'A: [{flips}'
)

# Judgment prompt: `{flips}` is the sequence being judged; the answer is cut
# off right after "generated by a".
p_judge = (
    'Q: Is the following sequence of coin flips generated by a random process with no pattern, '
    'or are they generated by a non-random algorithm? [{flips}]\n'
    '\n'
    'A: The sequence was generated by a'
)


# Repeating binary patterns ("concepts") used by the 'jud' run. Each entry is a
# tuple of 0/1 values that index into `flip_strs`; the pattern is tiled to build
# the flip sequence shown to the model. Written as bit strings here (the same
# spelling the 'jud' run uses for output file prefixes) and expanded to tuples
# of ints.
concepts = [
    tuple(int(bit) for bit in pattern)
    for pattern in (
        # lengths 2-5
        '01',
        '010',
        '011',
        '0110',
        '0111',
        '00001',
        '00101',
        '10101',
        '11100',

        # additional length-5 patterns; (1, 0, 1, 0, 1) was commented out of
        # this group in the original and already appears above
        '01101',
        '10110',
        '01011',
        '11010',

        # lengths 6-10
        '111000',
        '001111',
        '1110000',
        '1010110',
        '11110000',
        '11000110',
        '111000010',
        '100010011',
        '0000011111',
        '0010011011',

        # lengths 15 and 20
        '111000010100001',
        '101011011100011',
        '11100001010000101110',
        '10101101110001101110',
    )
]


# bitsandbytes quantization presets that can be placed in the third column of
# `model_ids`. In the code visible in this file every model entry passes None,
# so none of these presets is currently selected.
bnb_default = BitsAndBytesConfig()

# 4-bit loading with bfloat16 compute (torch.float16 is the noted alternative).
bnb_4bit = BitsAndBytesConfig(bnb_4bit_compute_dtype=torch.bfloat16, load_in_4bit=True)

# 8-bit loading with the `llm_int8_has_fp16_weight` flag enabled.
bnb_8bit = BitsAndBytesConfig(llm_int8_has_fp16_weight=True, load_in_8bit=True)


# Two dtype handles so the two groups of models below can be switched
# independently; both are float16 at the moment. `torch_dtype2` is the one
# assigned to the 70B and Mixtral entries.
torch_dtype1 = torch_dtype2 = torch.float16

# Models to evaluate, one (model name, torch dtype, bnb quantization config)
# tuple per entry. A bnb config of None means no quantization config is passed.
model_ids = [
    # Llama 2
    ('meta-llama/Llama-2-7b-hf', torch_dtype1, None),
    ('meta-llama/Llama-2-13b-hf', torch_dtype1, None),
    ('meta-llama/Llama-2-70b-hf', torch_dtype2, None),

    # Mistral / Mixtral
    ('mistralai/Mistral-7B-v0.1', torch_dtype1, None),
    ('mistralai/Mistral-7B-Instruct-v0.1', torch_dtype1, None),
    ('mistralai/Mixtral-8x7B-v0.1', torch_dtype2, None),
    ('mistralai/Mixtral-8x7B-Instruct-v0.1', torch_dtype2, None),

    # Tulu 2
    ('allenai/tulu-2-7b', torch_dtype1, None),
    ('allenai/tulu-2-13b', torch_dtype1, None),
    ('allenai/tulu-2-70b', torch_dtype2, None),

    # Tulu 2 DPO
    ('allenai/tulu-2-dpo-7b', torch_dtype1, None),
    ('allenai/tulu-2-dpo-13b', torch_dtype1, None),
    ('allenai/tulu-2-dpo-70b', torch_dtype2, None),
]


# Length of the longest flip sequence used in the judgment runs; prompts are
# built for every prefix length from 1 up to this value.
n_flips = 100

# Text label for each binary outcome: index 0 -> 'Heads', index 1 -> 'Tails'.
flip_strs = ['Heads', 'Tails']


# Values accepted by --run_type:
#   'jud'      - randomness judgment on repeating concepts
#   'jud-rand' - randomness judgment on random sequences
#   'gen'      - sequence generation with varying p(Tails)
RUN_TYPES = ('jud', 'jud-rand', 'gen')


def apply_chat_format(template, model_id):
    """Return `template` with chat markers substituted for Tulu-2 models.

    When `model_id` contains 'tulu-2', every 'Q: ' becomes '<|user|>\\n' and
    every 'A: ' becomes '<|assistant|>\\n'. Any other model id gets the
    template back unchanged.
    """
    if 'tulu-2' not in model_id:
        return template
    return template.replace('Q: ', '<|user|>\n').replace('A: ', '<|assistant|>\n')


def short_model_name(model_id):
    """Return the part of `model_id` after its last '/' (the whole id if there is none)."""
    return model_id.rsplit('/', 1)[-1]


def repeat_concept(concept, labels, length):
    """Tile `concept` to exactly `length` items and map each value through `labels`.

    Args:
        concept: non-empty sequence of indices into `labels`.
        labels: sequence of label strings.
        length: number of items to return.

    Returns:
        A list of `length` labels following `concept` repeated cyclically.

    Raises:
        ValueError: if `concept` is empty.
    """
    if not concept:
        raise ValueError('concept must contain at least one element')
    period = len(concept)
    return [labels[concept[i % period]] for i in range(length)]


def prefix_prompt_args(flips):
    """Return one {'flips': ...} dict per non-empty prefix of `flips`.

    The k-th dict (k = 1..len(flips)) holds the first k labels joined by ', '.
    """
    return [{'flips': ', '.join(flips[:k])} for k in range(1, len(flips) + 1)]


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Restricting the choices rejects typos up front; previously any value that
    # merely contained 'jud' loaded every model and then ran no inference.
    parser.add_argument('--run_type', default='jud', type=str, choices=RUN_TYPES)
    parser.add_argument('--n_gpus', default=1, type=int)
    parser.add_argument('--save_dir', default='/n/holylabs/LABS/ullman_lab/Users/big/coins_hf/coins_hf_out', type=str)
    parser.add_argument('--overwrite', action='store_true')
    args = parser.parse_args()

    save_dir = args.save_dir
    os.makedirs(save_dir, exist_ok=True)
    print(f'Saving inference results to: {save_dir}')

    # Batch size used by both modes: 100 when --n_gpus is 1, otherwise 20.
    batch_size = 100 if args.n_gpus == 1 else 20

    # =================================================================================
    # judgment

    if args.run_type in ('jud', 'jud-rand'):

        for model_id, torch_dtype, bnb_config in tqdm(model_ids):
            print('~' * 100, '\n', model_id)
            prompt = apply_chat_format(p_judge, model_id)

            tokenizer, model = init_model(model_id=model_id, device_map='auto',
                                          torch_dtype=torch_dtype, quantization_config=bnb_config)

            # Results for each model go into their own '<run_type>_<model>' directory.
            model_dir = f'{save_dir}/{args.run_type}_{short_model_name(model_id)}'
            os.makedirs(model_dir, exist_ok=True)

            def run_infer(prompts, prompt_args, formatted_prompts, c_path, concept_bits):
                """Run the judgment inference for one sequence with the current model.

                Calls `multiple_inference` with do_sample=False and
                max_new_tokens=1, requesting the top 100 logprobs, and stores
                the prompts plus the 0/1 sequence alongside the results.
                """
                return multiple_inference(
                    formatted_prompts, tokenizer, model,
                    in_batch_size=batch_size,
                    num_return_sequences=1, max_new_tokens=1,
                    do_sample=False,

                    top_logprobs=100, echo_inputs=False,
                    save_path=c_path, overwrite=args.overwrite, verbose=False,

                    extra_args={'concept': concept_bits,   # record randomly sampled concepts
                                'prompts_raw': prompts,
                                'prompt_args': prompt_args,
                                'formatted_prompts': formatted_prompts})

            # -------------------------------------------------------------------------
            # Randomness Judgment baseline with repeating concepts like (010)+
            if args.run_type == 'jud':

                for concept in tqdm(concepts):
                    # Output prefix is the concept spelled as a bit string, e.g. '010_'.
                    c_str = ''.join(str(c) for c in concept)
                    c_path = f'{model_dir}/{c_str}_'

                    # Only the first n_flips items of the tiled concept are ever used.
                    flips = repeat_concept(concept, flip_strs, n_flips)

                    # One prompt per prefix length 1..n_flips.
                    prompts, prompt_args, formatted_prompts = format_prompts(
                        prompt, prefix_prompt_args(flips))

                    run_infer(prompts, prompt_args, formatted_prompts, c_path, concept)

            # -------------------------------------------------------------------------
            # Randomness Judgment baseline with random sequences $x$
            elif args.run_type == 'jud-rand':
                # Draw as many random sequences as there are concepts; the index
                # is used only to name the output files.
                for seq_idx in range(len(concepts)):
                    c_path = f'{model_dir}/{seq_idx}_'

                    # Unseeded draw; the sampled sequence is saved via extra_args.
                    random_concept = np.random.randint(0, 2, size=n_flips).tolist()
                    flips = [flip_strs[i] for i in random_concept]

                    prompts, prompt_args, formatted_prompts = format_prompts(
                        prompt, prefix_prompt_args(flips))

                    run_infer(prompts, prompt_args, formatted_prompts, c_path, random_concept)

            # Drop the references and clear CUDA state before loading the next model.
            del tokenizer, model
            clear_cuda()

    # =================================================================================
    # Randomness Generation with varying p(Tails)

    if args.run_type == 'gen':

        # Percent probability of Tails for each generation condition.
        p_tails_list = [5, 10, 20, 30, 40, 49, 50, 51, 60, 70, 80, 90, 95]

        for model_id, torch_dtype, bnb_config in tqdm(model_ids):
            prompt = apply_chat_format(p_gen, model_id)

            print('~' * 100, '\n', model_id)
            tokenizer, model = init_model(model_id=model_id, device_map='auto',
                                          torch_dtype=torch_dtype, quantization_config=bnb_config)
            model_dir = f'{save_dir}/gen_{short_model_name(model_id)}'
            os.makedirs(model_dir, exist_ok=True)

            for p_tails in tqdm(p_tails_list):
                c_path = f'{model_dir}/Tails{p_tails}_'

                p_heads = 100 - p_tails
                # The weighted-coin wording is used for every condition,
                # including 50/50 (the `source_fair` wording is not used).
                source_txt = source_coin.format(p_heads, p_tails)

                # NOTE(review): a single dict is passed here, whereas the judgment
                # runs pass a list of dicts - confirm format_prompts accepts both.
                prompts, prompt_args, formatted_prompts = format_prompts(
                    prompt, {'flips': 'Heads,', 'source': source_txt})

                multiple_inference(
                    formatted_prompts, tokenizer, model,
                    in_batch_size=1, out_batch_size=batch_size,
                    num_return_sequences=1000, max_new_tokens=300,
                    do_sample=True, temperature=1.0,

                    top_logprobs=10, echo_inputs=False,
                    save_path=c_path, overwrite=args.overwrite, verbose=True,

                    extra_args={'prompts_raw': prompts,
                                'prompt_args': prompt_args,
                                'formatted_prompts': formatted_prompts})

            # Drop the references and clear CUDA state before loading the next model.
            del tokenizer, model
            clear_cuda()