Files
dark_hal/llm_runtime/loaders/llamacpp_loader.py

91 lines
3.4 KiB
Python
Raw Normal View History

2026-03-13 12:56:43 -07:00
from typing import Any, Iterator, List
from llm_runtime.types import UnifiedModel, GenerateConfig
class _LlamaCppUnified:
    """Adapter exposing a llama.cpp GGUF model via generate/stream/tokenize/detokenize.

    The underlying model object is created lazily on first use (see
    ``_get_model``) and then cached on the instance in ``self._llama``.
    """

    def __init__(self, model_path: str, **kwargs: Any):
        """Record the model path and loader options; the model is not loaded yet.

        Args:
            model_path: Path to the model file. Must end in ``.gguf``
                (case-insensitive).
            **kwargs: Loader options stored as-is. ``_get_model`` later reads
                ``n_ctx``, ``n_gpu_layers``, ``lora_path``, ``n_threads`` and
                ``verbose`` from them.

        Raises:
            ValueError: If ``model_path`` does not end in ``.gguf``.
            ImportError: If ``llama_cpp`` cannot be imported.
        """
        # Validate before importing so a bad path is reported as a path error
        # even on machines where llama_cpp is not installed.
        if not model_path.lower().endswith(".gguf"):
            raise ValueError(f"Not a valid GGUF model: {model_path}")

        # The imported name is not used here; the import is kept so that a
        # missing llama-cpp-python install still fails at construction time.
        from llama_cpp import Llama  # noqa: F401

        self.model_path = model_path
        self.kwargs = kwargs
        self._llama = None

    def _get_model(self):
        """Return the model object, creating and caching it on the first call.

        The primary path imports ``_get_llama`` from the ``main`` module (the
        directory three levels above this file is put at the front of
        ``sys.path`` first). If an ``ImportError`` is raised anywhere on that
        path, including inside ``_get_llama`` itself, the model is built
        directly with ``llama_cpp.Llama`` instead.
        """
        if self._llama is not None:
            return self._llama

        options = self.kwargs
        n_ctx = options.get("n_ctx", 8192)
        n_gpu_layers = options.get("n_gpu_layers", 0)
        n_threads = options.get("n_threads")
        lora_path = options.get("lora_path")

        try:
            import os
            import sys

            root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
            # Keep the root at the front of sys.path (same priority as before)
            # without appending a duplicate entry for every instance.
            if not sys.path or sys.path[0] != root:
                sys.path.insert(0, root)
            from main import _get_llama

            self._llama = _get_llama(
                self.model_path,
                n_ctx=n_ctx,
                n_gpu_layers=n_gpu_layers,
                lora_path=lora_path,
                n_threads=n_threads,
            )
        except ImportError:
            # Fallback to direct llama-cpp-python if the main import fails.
            from llama_cpp import Llama

            self._llama = Llama(
                model_path=self.model_path,
                n_ctx=n_ctx,
                n_gpu_layers=n_gpu_layers,
                verbose=options.get("verbose", False),
                n_threads=n_threads,
                # Forward the adapter here as well; previously a configured
                # lora_path was silently dropped on this path.
                lora_path=lora_path,
            )
        return self._llama

    @staticmethod
    def _sampling_kwargs(cfg) -> dict:
        """Translate a GenerateConfig-like object into llama-cpp-python keyword arguments."""
        return {
            "max_tokens": cfg.max_tokens,
            "temperature": cfg.temperature,
            "top_p": cfg.top_p,
            # An empty/None stop collection is passed as None; anything else as a list.
            "stop": list(cfg.stop) if cfg.stop else None,
        }

    def generate(self, prompt: str, cfg: "GenerateConfig | None" = None, **kwargs: Any) -> str:
        """Run one completion and return the generated text.

        Args:
            prompt: Prompt text passed to the model.
            cfg: Sampling settings (``max_tokens``, ``temperature``, ``top_p``,
                ``stop``). A fresh ``GenerateConfig()`` is created per call
                when omitted, so no instance is shared between calls.
            **kwargs: Accepted but not used by this method.

        Returns:
            The ``text`` field of the first choice in the model's result.
        """
        if cfg is None:
            cfg = GenerateConfig()
        llama = self._get_model()
        result = llama(prompt, echo=False, **self._sampling_kwargs(cfg))
        return result["choices"][0]["text"]

    def stream(self, prompt: str, cfg: "GenerateConfig | None" = None, **kwargs: Any) -> Iterator[str]:
        """Yield generated text pieces as the model produces them.

        Args:
            prompt: Prompt text passed to the model.
            cfg: Sampling settings; a fresh ``GenerateConfig()`` is created
                when omitted.
            **kwargs: Accepted but not used by this method.

        Yields:
            The ``text`` of the first choice of each streamed chunk, skipping
            chunks whose text is empty.
        """
        if cfg is None:
            cfg = GenerateConfig()
        llama = self._get_model()
        chunks = llama.create_completion(
            prompt,
            stream=True,
            echo=False,
            **self._sampling_kwargs(cfg),
        )
        for chunk in chunks:
            text = chunk["choices"][0]["text"]
            if text:
                yield text

    def tokenize(self, text: str) -> List[int]:
        """Encode ``text`` as UTF-8 and tokenize it with ``add_bos=False``."""
        llama = self._get_model()
        return llama.tokenize(text.encode("utf-8"), add_bos=False)

    def detokenize(self, ids: List[int]) -> str:
        """Convert token ids to text, dropping bytes that are not valid UTF-8."""
        llama = self._get_model()
        return llama.detokenize(ids).decode("utf-8", errors="ignore")
class LlamaCppLoader:
    """Loader that recognises ``.gguf`` sources and wraps them in ``_LlamaCppUnified``."""

    name = "llamacpp"

    def can_load(self, source: str, **kwargs: Any) -> bool:
        """Return True when ``source`` ends in ``.gguf``, ignoring case.

        ``kwargs`` are accepted but not consulted.
        """
        lowered = source.lower()
        return lowered.endswith(".gguf")

    def load(self, source: str, **kwargs: Any) -> UnifiedModel:
        """Build a ``_LlamaCppUnified`` for ``source``, forwarding ``kwargs`` to it."""
        model = _LlamaCppUnified(source, **kwargs)
        return model