llm memory free
gokayfem committed Nov 1, 2024
1 parent 80d1f30 commit 63e9e20
Showing 1 changed file with 107 additions and 1 deletion.
nodes/suggest.py (107 additions, 1 deletion)
@@ -14,7 +14,8 @@
from typing import Any, List
from pydantic import BaseModel, Field, create_model
from typing_extensions import Literal

import torch
import gc


supported_LLava_extensions = set(['.gguf'])
@@ -611,6 +612,107 @@ def keyword_extract(self, prompt, model, temperature, attribute_name, attribute_
        parsed_response = json.loads(response)

        return (next(iter(parsed_response.values())),)

class LLMOptionalMemoryFreeSimple:
    def __init__(self):
        self.llm = None  # Store the model instance

    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "ckpt_name": (folder_paths.get_filename_list("LLavacheckpoints"), ),
                "max_ctx": ("INT", {"default": 4096, "min": 128, "max": 128000, "step": 64}),
                "gpu_layers": ("INT", {"default": 27, "min": 0, "max": 100, "step": 1}),
                "n_threads": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}),
                "prompt": ("STRING", {"forceInput": True}),
                "temperature": ("FLOAT", {"default": 0.1, "min": 0.01, "max": 1.0, "step": 0.01}),
                "unload": ("BOOLEAN", {"default": False}),  # Add unload parameter
            }
        }

    RETURN_TYPES = ("STRING",)
    FUNCTION = "generate_text"
    CATEGORY = "VLM Nodes/LLM"

    def generate_text(self, ckpt_name, max_ctx, gpu_layers, n_threads, prompt, temperature, unload):
        # Load model
        ckpt_path = folder_paths.get_full_path("LLavacheckpoints", ckpt_name)
        self.llm = Llama(model_path=ckpt_path, offload_kqv=True, f16_kv=True, use_mlock=False, embedding=False, n_batch=1024, last_n_tokens_size=1024, verbose=True, seed=42, n_ctx=max_ctx, n_gpu_layers=gpu_layers, n_threads=n_threads, logits_all=True, echo=False)

        response = self.llm.create_chat_completion(
            messages=[
                {"role": "system", "content": "You are a helpful AI assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=temperature,
        )

        if unload and self.llm is not None:
            del self.llm  # Unload the model
            self.llm = None  # Remove reference to the model
            gc.collect()
            torch.cuda.empty_cache()

        return (f"{response['choices'][0]['message']['content']}", )

class LLMOptionalMemoryFreeAdvanced:
    def __init__(self):
        self.llm = None  # Store the model instance

    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "ckpt_name": (folder_paths.get_filename_list("LLavacheckpoints"), ),
                "max_ctx": ("INT", {"default": 4096, "min": 128, "max": 128000, "step": 64}),
                "gpu_layers": ("INT", {"default": 27, "min": 0, "max": 100, "step": 1}),
                "n_threads": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}),
                "system_msg": ("STRING", {"default": "You are a helpful AI assistant."}),
                "prompt": ("STRING", {"forceInput": True, "default": ""}),
                "max_tokens": ("INT", {"default": 512, "min": 1, "max": 2048, "step": 1}),
                "temperature": ("FLOAT", {"default": 0.1, "min": 0.01, "max": 1.0, "step": 0.01}),
                "top_p": ("FLOAT", {"default": 0.95, "min": 0.1, "max": 1.0, "step": 0.01}),
                "top_k": ("INT", {"default": 40, "step": 1}),
                "frequency_penalty": ("FLOAT", {"default": 0.0, "step": 0.01}),
                "presence_penalty": ("FLOAT", {"default": 0.0, "step": 0.01}),
                "repeat_penalty": ("FLOAT", {"default": 1.1, "step": 0.01}),
                "seed": ("INT", {"default": 42, "step": 1}),
                "unload": ("BOOLEAN", {"default": False}),  # Add unload parameter
            }
        }

    RETURN_TYPES = ("STRING",)
    FUNCTION = "generate_text_advanced"
    CATEGORY = "VLM Nodes/LLM"

    def generate_text_advanced(self, ckpt_name, max_ctx, gpu_layers, n_threads, system_msg, prompt, max_tokens, temperature, top_p, top_k, frequency_penalty, presence_penalty, repeat_penalty, seed, unload):
        # Load model
        ckpt_path = folder_paths.get_full_path("LLavacheckpoints", ckpt_name)
        self.llm = Llama(model_path=ckpt_path, offload_kqv=True, f16_kv=True, use_mlock=False, embedding=False, n_batch=1024, last_n_tokens_size=1024, verbose=True, seed=seed, n_ctx=max_ctx, n_gpu_layers=gpu_layers, n_threads=n_threads, logits_all=True, echo=False)

        response = self.llm.create_chat_completion(
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            repeat_penalty=repeat_penalty,
            seed=seed,
        )

        if unload and self.llm is not None:
            del self.llm  # Unload the model
            self.llm = None  # Remove reference to the model
            gc.collect()
            torch.cuda.empty_cache()

        return (f"{response['choices'][0]['message']['content']}", )


NODE_CLASS_MAPPINGS = {
@@ -624,6 +726,8 @@ def keyword_extract(self, prompt, model, temperature, attribute_name, attribute_
"CreativeArtPromptGenerator": CreativeArtPromptGenerator,
"ChatMusician": ChatMusician,
"StructuredOutput": StructuredOutput,
"LLMOptionalMemoryFreeSimple": LLMOptionalMemoryFreeSimple,
"LLMOptionalMemoryFreeAdvanced": LLMOptionalMemoryFreeAdvanced,
}
# A dictionary that contains the friendly/humanly readable titles for the nodes
NODE_DISPLAY_NAME_MAPPINGS = {
@@ -637,4 +741,6 @@ def keyword_extract(self, prompt, model, temperature, attribute_name, attribute_
"CreativeArtPromptGenerator": "Creative Art PromptGenerator",
"ChatMusician": "ChatMusician",
"StructuredOutput": "Structured Output",
"LLMOptionalMemoryFreeSimple": "LLM Simple (Memory Optional)",
"LLMOptionalMemoryFreeAdvanced": "LLM Advanced (Memory Optional)",
}
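
For reference, a minimal standalone sketch of the load, generate, and optional-unload pattern these nodes wrap, assuming llama-cpp-python and PyTorch are available; the model path and prompt below are placeholders, not part of the commit:

import gc

import torch
from llama_cpp import Llama

# Load a GGUF checkpoint (placeholder path).
llm = Llama(
    model_path="models/LLavacheckpoints/example-model.gguf",
    n_ctx=4096,
    n_gpu_layers=27,
    n_threads=8,
)

# Run a single chat completion.
response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "Describe a sunset over the ocean."},
    ],
    temperature=0.1,
)
print(response["choices"][0]["message"]["content"])

# Optional unload: drop the only reference, collect garbage, and release
# cached CUDA memory so VRAM is available for the next node in the workflow.
del llm
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

Because each node call constructs a fresh Llama instance, enabling unload trades reload time on the next run for freed VRAM between generations.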
