# llm_runtime/loaders/llamacpp_loader.py

from typing import Any, Iterator, List, Optional

from llm_runtime.types import UnifiedModel, GenerateConfig

class _LlamaCppUnified:
    """Wrapper exposing generate/stream/tokenize/detokenize over a GGUF model.

    Constructing the wrapper only validates the path and stores the load
    options; the underlying llama.cpp model is created lazily on first use
    by ``_get_model``.
    """

    def __init__(self, model_path: str, **kwargs: Any):
        """Validate *model_path* and remember the load options.

        Args:
            model_path: Path to the model file; must end in ``.gguf``
                (case-insensitive).
            **kwargs: Load options read later by ``_get_model``: ``n_ctx``,
                ``n_gpu_layers``, ``lora_path``, ``n_threads`` and (direct
                ``Llama`` fallback only) ``verbose``.

        Raises:
            ValueError: If *model_path* does not end in ``.gguf``.
            ImportError: If ``llama_cpp`` cannot be imported.
        """
        # Check the cheap argument first so a bad path is reported as such
        # even when llama-cpp-python is not installed.
        if not model_path.lower().endswith(".gguf"):
            raise ValueError(f"Not a valid GGUF model: {model_path}")

        # Fail fast when llama-cpp-python is missing. It is imported directly
        # (not via main.py) to avoid circular imports; the name is unused here.
        from llama_cpp import Llama  # noqa: F401

        self.model_path = model_path
        self.kwargs = kwargs
        self._llama = None

    def _get_model(self) -> Any:
        """Return the loaded model, creating it on the first call.

        ``main._get_llama`` is tried first; to make ``main`` importable, the
        directory obtained by applying ``dirname`` three times to this file's
        path is placed at the front of ``sys.path``. If that import, or the
        ``_get_llama`` call itself, raises ``ImportError``, the model is built
        directly with ``llama_cpp.Llama`` using the same options.
        """
        if self._llama is not None:
            return self._llama

        import os
        import sys

        opts = self.kwargs
        n_ctx = opts.get("n_ctx", 8192)
        n_gpu_layers = opts.get("n_gpu_layers", 0)
        lora_path = opts.get("lora_path")
        n_threads = opts.get("n_threads")

        try:
            project_root = os.path.dirname(
                os.path.dirname(os.path.dirname(__file__))
            )
            # Keep the root first on sys.path (same import priority as before)
            # without stacking a duplicate entry for every wrapper instance.
            if not sys.path or sys.path[0] != project_root:
                sys.path.insert(0, project_root)
            from main import _get_llama

            self._llama = _get_llama(
                self.model_path,
                n_ctx=n_ctx,
                n_gpu_layers=n_gpu_layers,
                lora_path=lora_path,
                n_threads=n_threads,
            )
        except ImportError:
            # Fallback to direct llama-cpp-python if the main import fails.
            from llama_cpp import Llama

            self._llama = Llama(
                model_path=self.model_path,
                n_ctx=n_ctx,
                n_gpu_layers=n_gpu_layers,
                verbose=opts.get("verbose", False),
                n_threads=n_threads,
                # Previously dropped here, so a requested LoRA adapter was
                # silently ignored whenever the fallback path was taken.
                lora_path=lora_path,
            )
        return self._llama

    @staticmethod
    def _completion_kwargs(cfg: Any) -> dict:
        """Translate a generation config into llama-cpp-python keyword args."""
        return {
            "max_tokens": cfg.max_tokens,
            "temperature": cfg.temperature,
            "top_p": cfg.top_p,
            # An empty/None stop collection is passed as None.
            "stop": list(cfg.stop) if cfg.stop else None,
            "echo": False,
        }

    def generate(self, prompt: str, cfg: Optional["GenerateConfig"] = None, **kwargs: Any) -> str:
        """Run a single completion and return the generated text.

        Args:
            prompt: Text to complete.
            cfg: Sampling settings (``max_tokens``, ``temperature``, ``top_p``,
                ``stop``). A fresh ``GenerateConfig()`` is created per call
                when omitted, instead of sharing one default instance.
            **kwargs: Accepted but not used by this implementation.

        Returns:
            The ``text`` field of the first choice in the completion result.
        """
        if cfg is None:
            cfg = GenerateConfig()
        llama = self._get_model()
        result = llama(prompt, **self._completion_kwargs(cfg))
        return result["choices"][0]["text"]

    def stream(self, prompt: str, cfg: Optional["GenerateConfig"] = None, **kwargs: Any) -> Iterator[str]:
        """Yield the completion incrementally as text pieces.

        Args:
            prompt: Text to complete.
            cfg: Sampling settings; a fresh ``GenerateConfig()`` is created
                per call when omitted.
            **kwargs: Accepted but not used by this implementation.

        Yields:
            Each non-empty ``text`` piece from the streamed chunks.
        """
        if cfg is None:
            cfg = GenerateConfig()
        llama = self._get_model()
        chunks = llama.create_completion(
            prompt, stream=True, **self._completion_kwargs(cfg)
        )
        for chunk in chunks:
            text = chunk["choices"][0]["text"]
            if text:  # skip empty pieces
                yield text

    def tokenize(self, text: str) -> List[int]:
        """Return the model's token ids for *text* (UTF-8 encoded, no BOS)."""
        llama = self._get_model()
        return llama.tokenize(text.encode("utf-8"), add_bos=False)

    def detokenize(self, ids: List[int]) -> str:
        """Return the text for *ids*, dropping bytes that are not valid UTF-8."""
        llama = self._get_model()
        return llama.detokenize(ids).decode("utf-8", errors="ignore")

class LlamaCppLoader:
    """Loader that hands ``.gguf`` sources to the llama.cpp model wrapper."""

    name = "llamacpp"

    def can_load(self, source: str, **kwargs: Any) -> bool:
        """Return True when *source* ends with ``.gguf`` (case-insensitive)."""
        lowered = source.lower()
        return lowered.endswith(".gguf")

    def load(self, source: str, **kwargs: Any) -> UnifiedModel:
        """Build a ``_LlamaCppUnified`` for *source*, forwarding *kwargs*."""
        model = _LlamaCppUnified(source, **kwargs)
        return model
first commit 2026-03-13 12:56:43 -07:00			`from typing import Any, Iterator, List`
			`from llm_runtime.types import UnifiedModel, GenerateConfig`

			`class _LlamaCppUnified:`
			`def __init__(self, model_path: str, **kwargs: Any):`
			`# Import llama_cpp directly instead of from main.py to avoid circular imports`
			`from llama_cpp import Llama`

			`if not model_path.lower().endswith(".gguf"):`
			`raise ValueError(f"Not a valid GGUF model: {model_path}")`

			`self.model_path = model_path`
			`self.kwargs = kwargs`
			`self._llama = None`

			`def _get_model(self):`
			`"""Lazy load the model using existing implementation"""`
			`if self._llama is None:`
			`# Import the _get_llama function from main module to maintain compatibility`
			`try:`
			`import sys`
			`import os`
			`sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))`
			`from main import _get_llama`
			`self._llama = _get_llama(`
			`self.model_path,`
			`n_ctx=self.kwargs.get("n_ctx", 8192),`
			`n_gpu_layers=self.kwargs.get("n_gpu_layers", 0),`
			`lora_path=self.kwargs.get("lora_path"),`
			`n_threads=self.kwargs.get("n_threads"),`
			`)`
			`except ImportError:`
			`# Fallback to direct llama-cpp-python if main import fails`
			`from llama_cpp import Llama`
			`self._llama = Llama(`
			`model_path=self.model_path,`
			`n_ctx=self.kwargs.get("n_ctx", 8192),`
			`n_gpu_layers=self.kwargs.get("n_gpu_layers", 0),`
			`verbose=self.kwargs.get("verbose", False),`
			`n_threads=self.kwargs.get("n_threads")`
			`)`
			`return self._llama`

			`def generate(self, prompt: str, cfg: GenerateConfig = GenerateConfig(), **kwargs: Any) -> str:`
			`llama = self._get_model()`

			`# Convert GenerateConfig to llama-cpp-python format`
			`result = llama(`
			`prompt,`
			`max_tokens=cfg.max_tokens,`
			`temperature=cfg.temperature,`
			`top_p=cfg.top_p,`
			`stop=list(cfg.stop) if cfg.stop else None,`
			`echo=False`
			`)`

			`return result["choices"][0]["text"]`

			`def stream(self, prompt: str, cfg: GenerateConfig = GenerateConfig(), **kwargs: Any) -> Iterator[str]:`
			`llama = self._get_model()`

			`# Use streaming generation`
			`for chunk in llama.create_completion(`
			`prompt,`
			`max_tokens=cfg.max_tokens,`
			`temperature=cfg.temperature,`
			`top_p=cfg.top_p,`
			`stop=list(cfg.stop) if cfg.stop else None,`
			`stream=True,`
			`echo=False`
			`):`
			`text = chunk["choices"][0]["text"]`
			`if text:`
			`yield text`

			`def tokenize(self, text: str) -> List[int]:`
			`llama = self._get_model()`
			`return llama.tokenize(text.encode("utf-8"), add_bos=False)`

			`def detokenize(self, ids: List[int]) -> str:`
			`llama = self._get_model()`
			`return llama.detokenize(ids).decode("utf-8", errors="ignore")`

			`class LlamaCppLoader:`
			`name = "llamacpp"`

			`def can_load(self, source: str, **kwargs: Any) -> bool:`
			`return source.lower().endswith(".gguf")`

			`def load(self, source: str, **kwargs: Any) -> UnifiedModel:`
			`return _LlamaCppUnified(source, **kwargs)`