# gradio_service.py

import os
import gc
import torch
import argparse
import traceback
from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    StoppingCriteria,
)
import gradio as gr
from queue import Queue
from threading import Thread
from datetime import datetime
from transformers import GenerationConfig

# System prompt used when none is supplied on the command line.
# The trailing space is part of the value.
DEFAULT_SYSTEM_PROMPT = "You are now working as an excellent expert in chemistry and molecule discovery. "

# Prompt layout that wraps the system prompt in <<SYS>> markers ahead of the
# instruction. Filled in with `system_prompt` and `instruction`.
TEMPLATE_WITH_SYSTEM_PROMPT = "\n".join([
    "[INST] <<SYS>>",
    "{system_prompt}",
    "<</SYS>>",
    "",
    "{instruction} [/INST]",
])

# Prompt layout without a system prompt. Filled in with `instruction` only.
TEMPLATE_WITHOUT_SYSTEM_PROMPT = "[INST] " + "{instruction}" + " [/INST]"

# Parse command-line arguments
def _str2bool(value):
    """Convert a command-line token such as ``true``, ``0`` or ``no`` to a bool.

    Without this converter argparse keeps the raw string, and any non-empty
    string (including ``"False"``) is truthy.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', 'off'):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


parser = argparse.ArgumentParser()
parser.add_argument(
    '--base_model',
    default=None,
    type=str,
    required=True,
    help='Base model path')
parser.add_argument('--lora_model', default=None, type=str,
                    help="If None, perform inference on the base model")
parser.add_argument(
    '--tokenizer_path',
    default=None,
    type=str,
    help='If None, lora model path or base model path will be used')
parser.add_argument(
    '--gpus',
    default="0",
    type=str,
    help='If None, cuda:0 will be used. Inference using multi-cards: --gpus=0,1,... ')
# `--share` alone or `--share true` enables sharing; `--share false` disables it.
parser.add_argument(
    '--share',
    default=True,
    type=_str2bool,
    nargs='?',
    const=True,
    help='Share gradio domain name')
parser.add_argument('--ip', default='0.0.0.0', type=str, help='IP of gradio demo')
parser.add_argument('--port', default=19324, type=int, help='Port of gradio demo')
parser.add_argument(
    '--max_memory',
    default=1024,
    type=int,
    help='Maximum number of input tokens (including system prompt) to keep. If exceeded, earlier history will be discarded.')
parser.add_argument(
    '--load_in_8bit',
    action='store_true',
    help='Use 8 bit quantified model')
parser.add_argument(
    '--only_cpu',
    action='store_true',
    help='Only use CPU for inference')
parser.add_argument(
    '--alpha',
    type=str,
    default="1.0",
    help="The scaling factor of NTK method, can be a float or 'auto'. ")
parser.add_argument(
    '--system_prompt',
    type=str,
    default=DEFAULT_SYSTEM_PROMPT,
    help="The system prompt of the prompt template."
)
args = parser.parse_args()
# CPU-only mode: blank the GPU list so the later CUDA_VISIBLE_DEVICES
# assignment exposes no devices.
if args.only_cpu:
    args.gpus = ""

import sys

# Add the parent of this file's directory to the import path; the `utils`
# import below presumably resolves from there.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
from utils.attn_and_long_ctx_patches import apply_attention_patch, apply_ntk_scaling_patch

# Project-local patches, applied before any model is loaded. Their exact effect
# is defined in utils.attn_and_long_ctx_patches and is not visible here.
apply_attention_patch(use_memory_efficient_attention=True)
# args.alpha is passed through as a string (a float literal or 'auto', per the CLI help).
apply_ntk_scaling_patch(args.alpha)

# Restrict the visible CUDA devices to the --gpus list
# (an empty string when --only_cpu was given).
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

# Peft library can only import after setting CUDA devices
from peft import PeftModel


# Build the tokenizer and model and publish them as module globals
def setup():
    """Load the tokenizer and model and expose the serving settings as globals.

    Reads the parsed command-line ``args`` and sets the module globals
    ``tokenizer``, ``model``, ``device``, ``share``, ``ip``, ``port`` and
    ``max_memory``. When no tokenizer path was given, ``args.tokenizer_path``
    is filled in with the LoRA path if present, otherwise the base model path.
    """
    global tokenizer, model, device, share, ip, port, max_memory
    share, ip, port = args.share, args.ip, args.port
    max_memory = args.max_memory
    weight_dtype = torch.float16
    device = torch.device(0) if torch.cuda.is_available() else torch.device('cpu')

    # Tokenizer location fallback: explicit path, then LoRA path, then base model path.
    if args.tokenizer_path is None:
        args.tokenizer_path = args.base_model if args.lora_model is None else args.lora_model
    tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path, legacy=True)

    base_model = LlamaForCausalLM.from_pretrained(
        args.base_model,
        load_in_8bit=args.load_in_8bit,
        torch_dtype=weight_dtype,
        low_cpu_mem_usage=True,
        device_map='auto',
    )

    # Resize the embedding table when the tokenizer and the base model
    # disagree on vocabulary size.
    model_vocab_size = base_model.get_input_embeddings().weight.size(0)
    tokenizer_vocab_size = len(tokenizer)
    print(f"Vocab of the base model: {model_vocab_size}")
    print(f"Vocab of the tokenizer: {tokenizer_vocab_size}")
    if model_vocab_size != tokenizer_vocab_size:
        print("Resize model embeddings to fit tokenizer")
        base_model.resize_token_embeddings(tokenizer_vocab_size)

    if args.lora_model is None:
        model = base_model
    else:
        print("loading peft model")
        model = PeftModel.from_pretrained(
            base_model,
            args.lora_model,
            torch_dtype=weight_dtype,
            device_map='auto',
        )

    # The weights were loaded as float16; convert to float32 when running on CPU.
    if device.type == 'cpu':
        model.float()

    model.eval()


# Blank out the user input
def reset_user_input():
    """Return a Gradio update that sets the target component's value to ''."""
    cleared = gr.update(value='')
    return cleared


# Start over with an empty conversation
def reset_state():
    """Return a new, empty history list."""
    fresh_history = list()
    return fresh_history


def generate_prompt(instruction, response="", with_system_prompt=True):
    """Format one conversation turn with the module's prompt templates.

    Args:
        instruction: The user message to place inside the ``[INST]`` markers.
        response: The reply for this turn. When non-empty it is appended after
            a single space. ``None`` is treated like an empty reply, since chat
            history pairs may carry ``None`` as the bot message.
        with_system_prompt: When truthy, use the template that embeds
            ``args.system_prompt`` (falling back to ``DEFAULT_SYSTEM_PROMPT``
            if that is empty); otherwise use the bare instruction template.

    Returns:
        The formatted prompt string.
    """
    if with_system_prompt:
        system_prompt = args.system_prompt or DEFAULT_SYSTEM_PROMPT
        prompt = TEMPLATE_WITH_SYSTEM_PROMPT.format_map(
            {'instruction': instruction, 'system_prompt': system_prompt})
    else:
        prompt = TEMPLATE_WITHOUT_SYSTEM_PROMPT.format_map({'instruction': instruction})
    # Truthiness covers both "" and None; len(None) would raise TypeError.
    if response:
        prompt += " " + response
    return prompt


# Move the submitted message from the textbox into the chat history
def user(user_message, history):
    """Queue the user's message as a new, unanswered turn.

    Returns a pair: a Gradio update that empties the textbox and makes it
    non-interactive, and a new history list with ``[user_message, None]``
    appended (the input list is not modified).
    """
    locked_empty_box = gr.update(value="", interactive=False)
    pending_turn = [user_message, None]
    return locked_empty_box, history + [pending_turn]


class Stream(StoppingCriteria):
    """Stopping criterion used only as a per-step hook; it never stops generation.

    Each time the criterion is evaluated, the first sequence of the batch is
    handed to ``callback_func`` (if one was given).
    """

    def __init__(self, callback_func=None):
        # Callable receiving the token ids of the first sequence, or None to do nothing.
        self.callback_func = callback_func

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        """Forward ``input_ids[0]`` to the callback and report "do not stop".

        ``**kwargs`` is accepted (and ignored) to match the base-class call
        signature, so extra keywords forwarded by the caller do not raise.
        """
        if self.callback_func is not None:
            self.callback_func(input_ids[0])
        return False


class Iteratorize:
    """
    Transforms a function that takes a callback
    into a lazy iterator (generator).

    ``func`` is run on a background thread as ``func(callback=..., **kwargs)``.
    Every value it passes to the callback is queued and handed out by
    iteration; iteration ends when ``func`` returns or raises. Used as a
    context manager, leaving the ``with`` block asks the producer to stop at
    its next callback invocation.

    Adapted from: https://stackoverflow.com/a/9969000
    """

    class _Cancelled(Exception):
        """Raised inside the callback to abort the producer after the consumer left."""

    def __init__(self, func, kwargs=None, callback=None):
        self.mfunc = func
        self.c_callback = callback  # optional completion hook, called with func's return value
        self.q = Queue()
        self.sentinel = object()  # unique end-of-stream marker
        self.kwargs = kwargs or {}
        self.stop_now = False

        cancelled = self._Cancelled

        def _callback(val):
            # A dedicated exception type is the stop signal, so a genuine
            # ValueError raised by `func` is no longer mistaken for a
            # cancellation and silently dropped.
            if self.stop_now:
                raise cancelled
            self.q.put(val)

        def gentask():
            ret = None  # stays None when func raises, so the completion hook still gets a value
            try:
                ret = self.mfunc(callback=_callback, **self.kwargs)
            except cancelled:
                pass
            except Exception:
                traceback.print_exc()
            finally:
                # Always queue the sentinel, even if cache cleanup fails;
                # otherwise the consumer would block forever in __next__.
                try:
                    clear_torch_cache()
                finally:
                    self.q.put(self.sentinel)
            if self.c_callback:
                self.c_callback(ret)

        self.thread = Thread(target=gentask)
        self.thread.start()

    def __iter__(self):
        return self

    def __next__(self):
        # Block until the producer delivers a value or the end marker.
        obj = self.q.get(True, None)
        if obj is self.sentinel:
            raise StopIteration
        else:
            return obj

    def __del__(self):
        clear_torch_cache()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Ask the producer to abort at its next callback invocation.
        self.stop_now = True
        clear_torch_cache()


def clear_torch_cache():
    """Run the garbage collector, then call ``torch.cuda.empty_cache()`` if any CUDA device is visible."""
    gc.collect()
    has_cuda_device = torch.cuda.device_count() > 0
    if not has_cuda_device:
        return
    torch.cuda.empty_cache()


def _compose_dialogue_prompt(turns):
    """Build the full model prompt from a list of ``[user, reply]`` turns.

    The first turn carries the system prompt. Every completed turn is closed
    with ``</s>`` and every later turn is opened with ``<s>``. The last turn
    contributes only its user message, with an empty reply.
    """
    opening = turns[0]
    if len(turns) == 1:
        return generate_prompt(opening[0], response="", with_system_prompt=True)

    segments = [generate_prompt(opening[0], response=opening[1], with_system_prompt=True) + '</s>']
    segments.extend(
        '<s>' + generate_prompt(turn[0], response=turn[1], with_system_prompt=False) + '</s>'
        for turn in turns[1:-1]
    )
    segments.append('<s>' + generate_prompt(turns[-1][0], response="", with_system_prompt=False))
    return ''.join(segments)


# Stream a reply for the latest user turn of the web chat
@torch.no_grad()
def predict(
        history,
        max_new_tokens=128,
        top_p=0.75,
        temperature=0.1,
        top_k=40,
        do_sample=True,
        repetition_penalty=1.0
):
    """Generate the reply to the last turn of ``history``, yielding partial results.

    The prompt is rebuilt from the whole history. While it tokenizes to more
    than ``max_memory`` tokens and more than one turn remains, the oldest turn
    is dropped. Generation runs on a background thread; after each step the
    text decoded so far is written into ``history[-1][1]`` and the history is
    yielded.

    Args:
        history: List of ``[user_message, reply]`` pairs; the last pair is the
            turn to answer.
        max_new_tokens, top_p, temperature, top_k, do_sample,
        repetition_penalty: Passed through to ``model.generate``.

    Yields:
        The (possibly truncated) history with the last reply filled in so far.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"\n---------[{stamp}] from web---------")

    while True:
        history[-1][1] = ""
        for turn_no, turn in enumerate(history, start=1):
            print(f'[{turn_no}]- Human: {turn[0]}')
            print(f'[{turn_no}]- Machine: {turn[1]}')

        prompt = _compose_dialogue_prompt(history)
        input_length = len(tokenizer.encode(prompt, add_special_tokens=True))
        # Stop trimming once the prompt fits or only the current turn is left.
        if input_length <= max_memory or len(history) <= 1:
            break
        print(
            f"The input length ({input_length}) exceeds the max memory ({max_memory}). The earlier history will be discarded.")
        history = history[1:]
        print("history: ", history)

    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)
    prompt_len = len(input_ids[0])

    generate_params = dict(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        temperature=temperature,
        top_k=top_k,
        do_sample=do_sample,
        repetition_penalty=repetition_penalty,
    )

    def generate_with_callback(callback=None, **kwargs):
        # Attach a Stream criterion so every generation step reports the ids so far.
        kwargs.setdefault('stopping_criteria', []).append(Stream(callback_func=callback))
        clear_torch_cache()
        # This runs on the worker thread, so gradients are disabled here explicitly.
        with torch.no_grad():
            model.generate(**kwargs)

    with Iteratorize(generate_with_callback, generate_params, callback=None) as generator:
        for output in generator:
            # Everything after the prompt is newly generated.
            next_token_ids = output[prompt_len:]
            if next_token_ids[0] == tokenizer.eos_token_id:
                break
            new_tokens = tokenizer.decode(
                next_token_ids, skip_special_tokens=True)
            # Put back the leading space when the first generated token
            # starts with the '▁' marker.
            if (isinstance(tokenizer, LlamaTokenizer)
                    and len(next_token_ids) > 0
                    and tokenizer.convert_ids_to_tokens(int(next_token_ids[0])).startswith('▁')):
                new_tokens = ' ' + new_tokens

            history[-1][1] = new_tokens
            yield history
            if len(next_token_ids) >= max_new_tokens:
                break


@torch.no_grad()
def predict_for_client(msg, history):
    """Answer ``msg`` in one shot (no streaming) for API clients.

    ``msg`` is appended to ``history`` as a new turn, the prompt is built from
    the whole history, and the reply is generated with fixed sampling settings.

    Args:
        msg: The new user message.
        history: List of ``[user_message, reply]`` pairs from earlier turns.
            ``None`` is treated as an empty history. The list is extended in
            place with the new turn.

    Returns:
        ``history`` with the new turn and its reply appended. If the prompt
        tokenizes to more than ``max_memory`` tokens, a single-entry list
        holding an error message is returned instead and nothing is generated.
    """
    formatted_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"\n---------[{formatted_time}] from client---------")

    if history is None:
        history = []
    history.append([msg, ""])

    # The first turn carries the system prompt; completed turns are closed with
    # </s> and later turns opened with <s>. A reply of None is formatted as an
    # empty reply.
    first_turn = history[0]
    if len(history) == 1:
        prompt = generate_prompt(first_turn[0], response="", with_system_prompt=True)
    else:
        segments = [
            generate_prompt(first_turn[0], response=first_turn[1] or "", with_system_prompt=True) + '</s>'
        ]
        for past_turn in history[1:-1]:
            segments.append(
                '<s>' + generate_prompt(past_turn[0], response=past_turn[1] or "", with_system_prompt=False) + '</s>')
        segments.append('<s>' + generate_prompt(history[-1][0], response="", with_system_prompt=False))
        prompt = ''.join(segments)

    input_length = len(tokenizer.encode(prompt, add_special_tokens=True))
    if input_length > max_memory:
        # Unlike the web path, nothing is trimmed here: the request is refused.
        print(
            f"The input length ({input_length}) exceeds the max memory ({max_memory}). The request is rejected.")
        return [
            [f'Error, The input length ({input_length}) exceeds the max memory ({max_memory}). history: {history}']]

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)

    generation_config = GenerationConfig(
        temperature=0.2,
        top_k=40,
        top_p=0.9,
        do_sample=True,
        num_beams=1,
        repetition_penalty=1.1,
        max_new_tokens=400
    )

    generation_output = model.generate(
        input_ids=input_ids,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        generation_config=generation_config
    )

    # Decode only the tokens after the prompt. Splitting the full text on
    # "[/INST]" would cut off a reply that itself contains that marker.
    generated_ids = generation_output[0][input_ids.shape[-1]:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    history[-1][1] = response

    for turn_no, turn in enumerate(history, start=1):
        print(f'[{turn_no}]- Human: {turn[0]}')
        print(f'[{turn_no}]- Machine: {turn[1]}')

    return history


# Load the tokenizer/model and populate the module globals (share, ip, port, ...)
# that the UI and launch code below rely on.
setup()

# Create the Gradio interface
with gr.Blocks() as demo:
    demo.title = "DrugAssist"
    chatbot = gr.Chatbot()
    with gr.Row():
        # Left side: message textbox and submit button.
        with gr.Column(scale=4):
            with gr.Column(scale=12):
                # NOTE(review): `.style(...)` appears to be a Gradio 3.x API — confirm the pinned version.
                user_input = gr.Textbox(
                    show_label=False,
                    placeholder="Shift + Enter[Send message]...",
                    lines=10).style(
                    container=False)
            with gr.Column(min_width=32, scale=1):
                submitBtn = gr.Button("Submit", variant="primary")
        # Right side: clear button and the generation controls passed to `predict`.
        with gr.Column(scale=1):
            emptyBtn = gr.Button("Clear History")
            max_new_token = gr.Slider(
                0,
                4096,
                value=512,
                step=1.0,
                label="Maximum New Token Length",
                interactive=True)
            top_p = gr.Slider(0, 1, value=0.9, step=0.01,
                              label="Top P", interactive=True)
            temperature = gr.Slider(
                0,
                1,
                value=0.5,
                step=0.01,
                label="Temperature",
                interactive=True)
            top_k = gr.Slider(1, 40, value=40, step=1,
                              label="Top K", interactive=True)
            do_sample = gr.Checkbox(
                value=True,
                label="Do Sample",
                info="use random sample strategy",
                interactive=True)
            repetition_penalty = gr.Slider(
                1.0,
                3.0,
                value=1.1,
                step=0.1,
                label="Repetition Penalty",
                interactive=True)

    # Inputs/outputs of `user`: the textbox and the chat history.
    params = [user_input, chatbot]
    # Inputs of `predict`, in the order of its positional parameters.
    predict_params = [
        chatbot,
        max_new_token,
        top_p,
        temperature,
        top_k,
        do_sample,
        repetition_penalty]

    # Button click: queue the message (locking the textbox), stream the reply
    # into the chatbot, then make the textbox interactive again.
    submitBtn.click(
        user,
        params,
        params,
        queue=False).then(
        predict,
        predict_params,
        chatbot).then(
        lambda: gr.update(
            interactive=True),
        None,
        [user_input],
        queue=False)

    # Textbox submit: same three-step chain as the button click above.
    user_input.submit(
        user,
        params,
        params,
        queue=False).then(
        predict,
        predict_params,
        chatbot).then(
        lambda: gr.update(
            interactive=True),
        None,
        [user_input],
        queue=False)

    # Second click listener that clears the textbox (`user` already returns an
    # empty value for it as well).
    submitBtn.click(reset_user_input, [], [user_input])

    emptyBtn.click(reset_state, outputs=[chatbot], show_progress=True)

    # Exposes `predict_for_client` under the API name 'drugassist'.
    # NOTE(review): this is a second listener on the same textbox submit event as
    # the chain above, so it presumably also runs on every UI submit — confirm
    # whether an API-only endpoint was intended.
    user_input.submit(predict_for_client, [user_input, chatbot], chatbot, api_name='drugassist')

# Launch the Gradio interface
demo.queue().launch(
    share=share,
    inbrowser=True,
    server_name=ip,
    server_port=port)