llm memory free
gokayfem committed Nov 1, 2024
1 parent 80d1f30 commit 63e9e20
Showing 1 changed file with 107 additions and 1 deletion.
nodes/suggest.py (107 additions, 1 deletion)
@@ -14,7 +14,8 @@
from typing import Any, List
from pydantic import BaseModel, Field, create_model
from typing_extensions import Literal

import torch
import gc


supported_LLava_extensions = set(['.gguf'])
@@ -611,6 +612,107 @@ def keyword_extract(self, prompt, model, temperature, attribute_name, attribute_
        parsed_response = json.loads(response)

        return (next(iter(parsed_response.values())),)

class LLMOptionalMemoryFreeSimple:
    def __init__(self):
        self.llm = None  # Store the model instance

    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "ckpt_name": (folder_paths.get_filename_list("LLavacheckpoints"), ),
                "max_ctx": ("INT", {"default": 4096, "min": 128, "max": 128000, "step": 64}),
                "gpu_layers": ("INT", {"default": 27, "min": 0, "max": 100, "step": 1}),
                "n_threads": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}),
                "prompt": ("STRING", {"forceInput": True}),
                "temperature": ("FLOAT", {"default": 0.1, "min": 0.01, "max": 1.0, "step": 0.01}),
                "unload": ("BOOLEAN", {"default": False}),  # Add unload parameter
            }
        }

    RETURN_TYPES = ("STRING",)
    FUNCTION = "generate_text"
    CATEGORY = "VLM Nodes/LLM"

    def generate_text(self, ckpt_name, max_ctx, gpu_layers, n_threads, prompt, temperature, unload):
        # Load model
        ckpt_path = folder_paths.get_full_path("LLavacheckpoints", ckpt_name)
        self.llm = Llama(model_path=ckpt_path, offload_kqv=True, f16_kv=True, use_mlock=False, embedding=False, n_batch=1024, last_n_tokens_size=1024, verbose=True, seed=42, n_ctx=max_ctx, n_gpu_layers=gpu_layers, n_threads=n_threads, logits_all=True, echo=False)

        response = self.llm.create_chat_completion(
            messages=[
                {"role": "system", "content": "You are a helpful AI assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=temperature,
        )

        if unload and self.llm is not None:
            del self.llm  # Unload the model
            self.llm = None  # Remove reference to the model
            gc.collect()
            torch.cuda.empty_cache()

        return (f"{response['choices'][0]['message']['content']}", )

class LLMOptionalMemoryFreeAdvanced:
    def __init__(self):
        self.llm = None  # Store the model instance

    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "ckpt_name": (folder_paths.get_filename_list("LLavacheckpoints"), ),
                "max_ctx": ("INT", {"default": 4096, "min": 128, "max": 128000, "step": 64}),
                "gpu_layers": ("INT", {"default": 27, "min": 0, "max": 100, "step": 1}),
                "n_threads": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}),
                "system_msg": ("STRING", {"default": "You are a helpful AI assistant."}),
                "prompt": ("STRING", {"forceInput": True, "default": ""}),
                "max_tokens": ("INT", {"default": 512, "min": 1, "max": 2048, "step": 1}),
                "temperature": ("FLOAT", {"default": 0.1, "min": 0.01, "max": 1.0, "step": 0.01}),
                "top_p": ("FLOAT", {"default": 0.95, "min": 0.1, "max": 1.0, "step": 0.01}),
                "top_k": ("INT", {"default": 40, "step": 1}),
                "frequency_penalty": ("FLOAT", {"default": 0.0, "step": 0.01}),
                "presence_penalty": ("FLOAT", {"default": 0.0, "step": 0.01}),
                "repeat_penalty": ("FLOAT", {"default": 1.1, "step": 0.01}),
                "seed": ("INT", {"default": 42, "step": 1}),
                "unload": ("BOOLEAN", {"default": False}),  # Add unload parameter
            }
        }

    RETURN_TYPES = ("STRING",)
    FUNCTION = "generate_text_advanced"
    CATEGORY = "VLM Nodes/LLM"

    def generate_text_advanced(self, ckpt_name, max_ctx, gpu_layers, n_threads, system_msg, prompt, max_tokens, temperature, top_p, top_k, frequency_penalty, presence_penalty, repeat_penalty, seed, unload):
        # Load model
        ckpt_path = folder_paths.get_full_path("LLavacheckpoints", ckpt_name)
        self.llm = Llama(model_path=ckpt_path, offload_kqv=True, f16_kv=True, use_mlock=False, embedding=False, n_batch=1024, last_n_tokens_size=1024, verbose=True, seed=seed, n_ctx=max_ctx, n_gpu_layers=gpu_layers, n_threads=n_threads, logits_all=True, echo=False)

        response = self.llm.create_chat_completion(
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            repeat_penalty=repeat_penalty,
            seed=seed,
        )

        if unload and self.llm is not None:
            del self.llm  # Unload the model
            self.llm = None  # Remove reference to the model
            gc.collect()
            torch.cuda.empty_cache()

        return (f"{response['choices'][0]['message']['content']}", )


NODE_CLASS_MAPPINGS = {
@@ -624,6 +726,8 @@ def keyword_extract(self, prompt, model, temperature, attribute_name, attribute_
"CreativeArtPromptGenerator": CreativeArtPromptGenerator,
"ChatMusician": ChatMusician,
"StructuredOutput": StructuredOutput,
"LLMOptionalMemoryFreeSimple": LLMOptionalMemoryFreeSimple,
"LLMOptionalMemoryFreeAdvanced": LLMOptionalMemoryFreeAdvanced,
}
# A dictionary that contains the friendly/humanly readable titles for the nodes
NODE_DISPLAY_NAME_MAPPINGS = {
@@ -637,4 +741,6 @@ def keyword_extract(self, prompt, model, temperature, attribute_name, attribute_
"CreativeArtPromptGenerator": "Creative Art PromptGenerator",
"ChatMusician": "ChatMusician",
"StructuredOutput": "Structured Output",
"LLMOptionalMemoryFreeSimple": "LLM Simple (Memory Optional)",
"LLMOptionalMemoryFreeAdvanced": "LLM Advanced (Memory Optional)",
}
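
For reference, a minimal standalone sketch of the load, generate, and optional-unload pattern these nodes wrap, assuming llama-cpp-python and PyTorch are available; the model path and prompt below are placeholders, not part of the commit:

import gc

import torch
from llama_cpp import Llama

# Load a GGUF checkpoint (placeholder path).
llm = Llama(
    model_path="models/LLavacheckpoints/example-model.gguf",
    n_ctx=4096,
    n_gpu_layers=27,
    n_threads=8,
)

# Run a single chat completion.
response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "Describe a sunset over the ocean."},
    ],
    temperature=0.1,
)
print(response["choices"][0]["message"]["content"])

# Optional unload: drop the only reference, collect garbage, and release
# cached CUDA memory so VRAM is available for the next node in the workflow.
del llm
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

Because each node call constructs a fresh Llama instance, enabling unload trades reload time on the next run for freed VRAM between generations.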
