91 lines
3.4 KiB
Python
91 lines
3.4 KiB
Python
from typing import Any, Iterator, List, Optional
|
|
from llm_runtime.types import UnifiedModel, GenerateConfig
|
|
|
|
class _LlamaCppUnified:
    """Adapter that exposes a llama.cpp GGUF model through a small text API.

    The backing model object is created lazily on first use (see
    ``_get_model``) and then reused for every later call on this instance.
    """

    def __init__(self, model_path: str, **kwargs: Any):
        """Record the model location and loader options without loading it.

        Args:
            model_path: Path of the model file; must end in ``.gguf``
                (checked case-insensitively).
            **kwargs: Loader options kept as-is. ``n_ctx``, ``n_gpu_layers``,
                ``lora_path``, ``n_threads`` and ``verbose`` are the keys read
                later when the model is actually loaded.

        Raises:
            ImportError: If the ``llama_cpp`` package cannot be imported.
            ValueError: If ``model_path`` does not end in ``.gguf``.
        """
        # Imported from llama_cpp directly (not through main.py) to avoid
        # circular imports. The name is not used here; the import only makes
        # a missing package fail at construction time.
        from llama_cpp import Llama  # noqa: F401

        if not model_path.lower().endswith(".gguf"):
            raise ValueError(f"Not a valid GGUF model: {model_path}")

        self.model_path = model_path
        self.kwargs = kwargs
        self._llama = None

    def _get_model(self):
        """Return the backing model object, creating it on the first call.

        ``main._get_llama`` is tried first to stay compatible with the
        existing implementation. If an ``ImportError`` is raised while
        importing or calling it, ``llama_cpp.Llama`` is constructed directly
        with the same options.
        """
        if self._llama is not None:
            return self._llama

        import os
        import sys

        options = self.kwargs
        n_ctx = options.get("n_ctx", 8192)
        n_gpu_layers = options.get("n_gpu_layers", 0)
        lora_path = options.get("lora_path")
        n_threads = options.get("n_threads")

        # Make the directory three levels above this file importable so that
        # ``main`` can be found. Only add it once: inserting unconditionally
        # would append a duplicate entry for every instance that loads.
        project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
        if project_root not in sys.path:
            sys.path.insert(0, project_root)

        try:
            from main import _get_llama

            self._llama = _get_llama(
                self.model_path,
                n_ctx=n_ctx,
                n_gpu_layers=n_gpu_layers,
                lora_path=lora_path,
                n_threads=n_threads,
            )
        except ImportError:
            # Fallback to direct llama-cpp-python if the main import fails.
            from llama_cpp import Llama

            self._llama = Llama(
                model_path=self.model_path,
                n_ctx=n_ctx,
                n_gpu_layers=n_gpu_layers,
                # Forward the adapter here too, so a configured LoRA is not
                # silently dropped when the fallback path is taken.
                lora_path=lora_path,
                verbose=options.get("verbose", False),
                n_threads=n_threads,
            )
        return self._llama

    @staticmethod
    def _completion_params(cfg: Any) -> dict:
        """Translate a generation config into llama-cpp-python keyword arguments."""
        return {
            "max_tokens": cfg.max_tokens,
            "temperature": cfg.temperature,
            "top_p": cfg.top_p,
            # An empty/absent stop collection is passed through as None.
            "stop": list(cfg.stop) if cfg.stop else None,
            "echo": False,
        }

    def generate(self, prompt: str, cfg: Optional["GenerateConfig"] = None, **kwargs: Any) -> str:
        """Run one completion and return the text of its first choice.

        Args:
            prompt: Prompt text passed to the model.
            cfg: Generation settings; ``max_tokens``, ``temperature``,
                ``top_p`` and ``stop`` are read. A fresh ``GenerateConfig()``
                is used when omitted.
            **kwargs: Accepted but not used by this implementation.
        """
        if cfg is None:
            cfg = GenerateConfig()
        llama = self._get_model()
        result = llama(prompt, **self._completion_params(cfg))
        return result["choices"][0]["text"]

    def stream(self, prompt: str, cfg: Optional["GenerateConfig"] = None, **kwargs: Any) -> Iterator[str]:
        """Yield the non-empty text pieces of a streamed completion.

        Args:
            prompt: Prompt text passed to the model.
            cfg: Generation settings; ``max_tokens``, ``temperature``,
                ``top_p`` and ``stop`` are read. A fresh ``GenerateConfig()``
                is used when omitted.
            **kwargs: Accepted but not used by this implementation.
        """
        if cfg is None:
            cfg = GenerateConfig()
        llama = self._get_model()
        chunks = llama.create_completion(
            prompt,
            stream=True,
            **self._completion_params(cfg),
        )
        for chunk in chunks:
            text = chunk["choices"][0]["text"]
            # Chunks whose text is empty are skipped.
            if text:
                yield text

    def tokenize(self, text: str) -> List[int]:
        """Tokenize ``text`` (encoded as UTF-8), passing ``add_bos=False``."""
        llama = self._get_model()
        return llama.tokenize(text.encode("utf-8"), add_bos=False)

    def detokenize(self, ids: List[int]) -> str:
        """Turn token ids back into text; undecodable UTF-8 bytes are dropped."""
        llama = self._get_model()
        return llama.detokenize(ids).decode("utf-8", errors="ignore")
|
|
|
|
class LlamaCppLoader:
    """Loader that wraps ``.gguf`` sources in a ``_LlamaCppUnified`` adapter."""

    name = "llamacpp"

    def can_load(self, source: str, **kwargs: Any) -> bool:
        """Report whether ``source`` ends in ``.gguf``, ignoring letter case.

        Extra keyword arguments are accepted but not inspected.
        """
        lowered = source.lower()
        return lowered.endswith(".gguf")

    def load(self, source: str, **kwargs: Any) -> UnifiedModel:
        """Build the adapter for ``source``, forwarding all keyword options."""
        model = _LlamaCppUnified(source, **kwargs)
        return model