Full security platform with web dashboard, 16 Flask blueprints, 26 modules, autonomous AI agent, WebUSB hardware support, and Archon Android companion app. Includes Hash Toolkit, debug console, anti-stalkerware shield, Metasploit/RouterSploit integration, WireGuard VPN, OSINT reconnaissance, and multi-backend LLM support. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1466 lines
51 KiB
Python
1466 lines
51 KiB
Python
"""
|
|
AUTARCH LLM Integration Module
|
|
Wrapper for llama-cpp-python to interface with llama.cpp models
|
|
"""
|
|
|
|
import logging
|
|
import sys
|
|
from typing import Optional, Generator, List, Dict, Any
|
|
from pathlib import Path
|
|
|
|
from .config import get_config
|
|
from .banner import Colors
|
|
|
|
_llm_logger = logging.getLogger('autarch.llm')
|
|
|
|
|
|
class LLMError(Exception):
|
|
"""Exception raised for LLM-related errors."""
|
|
pass
|
|
|
|
|
|
class LLM:
|
|
"""Wrapper class for llama-cpp-python integration."""
|
|
|
|
def __init__(self, config=None):
|
|
"""Initialize the LLM wrapper.
|
|
|
|
Args:
|
|
config: Optional Config instance. Uses global config if not provided.
|
|
"""
|
|
self.config = config or get_config()
|
|
self._model = None
|
|
self._model_path = None
|
|
self._metadata_dir = None
|
|
self._special_tokens = {}
|
|
self._chat_format = None
|
|
self._chat_history: List[Dict[str, str]] = []
|
|
|
|
@property
|
|
def is_loaded(self) -> bool:
|
|
"""Check if a model is currently loaded."""
|
|
return self._model is not None
|
|
|
|
@property
|
|
def model_name(self) -> str:
|
|
"""Get the name of the currently loaded model."""
|
|
if self._model_path:
|
|
return Path(self._model_path).name
|
|
return "No model loaded"
|
|
|
|
def load_model(self, model_path: str = None, verbose: bool = False) -> bool:
|
|
"""Load a GGUF model.
|
|
|
|
Args:
|
|
model_path: Path to the model file. Uses config if not provided.
|
|
verbose: Whether to show loading progress.
|
|
|
|
Returns:
|
|
True if model loaded successfully.
|
|
|
|
Raises:
|
|
LLMError: If model loading fails.
|
|
"""
|
|
try:
|
|
from llama_cpp import Llama
|
|
except ImportError as e:
|
|
raise LLMError(f"llama-cpp-python not installed: {e}")
|
|
|
|
# Get model path from config if not provided
|
|
if model_path is None:
|
|
model_path = self.config.get('llama', 'model_path', '')
|
|
|
|
if not model_path:
|
|
raise LLMError("No model path configured. Run setup first.")
|
|
|
|
model_path = Path(model_path).expanduser()
|
|
if not model_path.exists():
|
|
raise LLMError(f"Model file not found: {model_path}")
|
|
|
|
# Get settings from config
|
|
settings = self.config.get_llama_settings()
|
|
|
|
if verbose:
|
|
print(f"{Colors.CYAN}[*] Loading model: {model_path.name}{Colors.RESET}")
|
|
print(f"{Colors.DIM} Context: {settings['n_ctx']} | Threads: {settings['n_threads']} | GPU Layers: {settings['n_gpu_layers']}{Colors.RESET}")
|
|
|
|
# Look for tokenizer/config files in the model directory or parent
|
|
model_dir = model_path.parent
|
|
chat_format, metadata_dir, special_tokens = self._detect_chat_format(model_dir, verbose)
|
|
|
|
# If not found in same dir, try parent directory
|
|
if not metadata_dir and model_dir.name.lower() in ('gguf', 'guff', 'models'):
|
|
chat_format, metadata_dir, special_tokens = self._detect_chat_format(model_dir.parent, verbose)
|
|
|
|
try:
|
|
llama_kwargs = {
|
|
'model_path': str(model_path),
|
|
'n_ctx': settings['n_ctx'],
|
|
'n_threads': settings['n_threads'],
|
|
'n_gpu_layers': settings['n_gpu_layers'],
|
|
'seed': settings['seed'] if settings['seed'] != -1 else None,
|
|
'verbose': verbose,
|
|
}
|
|
|
|
# Add chat format if detected
|
|
if chat_format:
|
|
llama_kwargs['chat_format'] = chat_format
|
|
if verbose:
|
|
print(f"{Colors.DIM} Chat format: {chat_format}{Colors.RESET}")
|
|
|
|
self._model = Llama(**llama_kwargs)
|
|
self._model_path = str(model_path)
|
|
self._metadata_dir = metadata_dir
|
|
self._special_tokens = special_tokens
|
|
self._chat_format = chat_format
|
|
|
|
if verbose:
|
|
print(f"{Colors.GREEN}[+] Model loaded successfully{Colors.RESET}")
|
|
|
|
return True
|
|
|
|
except Exception as e:
|
|
self._model = None
|
|
self._model_path = None
|
|
raise LLMError(f"Failed to load model: {e}")
|
|
|
|
def _detect_chat_format(self, directory: Path, verbose: bool = False) -> tuple:
|
|
"""Detect chat format and special tokens from tokenizer config files.
|
|
|
|
Args:
|
|
directory: Directory to search for config files
|
|
verbose: Whether to print status
|
|
|
|
Returns:
|
|
Tuple of (chat_format, metadata_dir, special_tokens) or (None, None, {})
|
|
"""
|
|
import json
|
|
|
|
if not directory.exists():
|
|
return None, None, {}
|
|
|
|
# Look for tokenizer_config.json
|
|
tokenizer_config = directory / 'tokenizer_config.json'
|
|
config_json = directory / 'config.json'
|
|
special_tokens_file = directory / 'special_tokens_map.json'
|
|
|
|
chat_format = None
|
|
metadata_dir = None
|
|
special_tokens = {}
|
|
|
|
# Check for tokenizer files
|
|
has_tokenizer = (directory / 'tokenizer.json').exists()
|
|
has_tokenizer_config = tokenizer_config.exists()
|
|
has_config = config_json.exists()
|
|
has_special_tokens = special_tokens_file.exists()
|
|
|
|
if has_tokenizer or has_tokenizer_config or has_config or has_special_tokens:
|
|
metadata_dir = str(directory)
|
|
if verbose:
|
|
found_files = []
|
|
if has_tokenizer:
|
|
found_files.append('tokenizer.json')
|
|
if has_tokenizer_config:
|
|
found_files.append('tokenizer_config.json')
|
|
if has_special_tokens:
|
|
found_files.append('special_tokens_map.json')
|
|
if has_config:
|
|
found_files.append('config.json')
|
|
print(f"{Colors.DIM} Found model metadata in: {directory.name}/{Colors.RESET}")
|
|
print(f"{Colors.DIM} Files: {', '.join(found_files)}{Colors.RESET}")
|
|
|
|
# Load special tokens
|
|
if has_special_tokens:
|
|
try:
|
|
with open(special_tokens_file, 'r') as f:
|
|
st = json.load(f)
|
|
# Extract token strings
|
|
for key, value in st.items():
|
|
if isinstance(value, dict):
|
|
special_tokens[key] = value.get('content', '')
|
|
else:
|
|
special_tokens[key] = value
|
|
if verbose and special_tokens:
|
|
tokens_str = ', '.join(f"{k}={v}" for k, v in special_tokens.items() if v)
|
|
print(f"{Colors.DIM} Special tokens: {tokens_str}{Colors.RESET}")
|
|
except (json.JSONDecodeError, IOError):
|
|
pass
|
|
|
|
# Try to detect chat format from tokenizer_config.json
|
|
if has_tokenizer_config:
|
|
try:
|
|
with open(tokenizer_config, 'r') as f:
|
|
tc = json.load(f)
|
|
|
|
# Check chat_template field
|
|
chat_template = tc.get('chat_template', '')
|
|
|
|
# Detect format from chat_template content
|
|
if 'chatml' in chat_template.lower() or '<|im_start|>' in chat_template:
|
|
chat_format = 'chatml'
|
|
elif 'llama-2' in chat_template.lower() or '[INST]' in chat_template:
|
|
chat_format = 'llama-2'
|
|
elif 'mistral' in chat_template.lower():
|
|
chat_format = 'mistral-instruct'
|
|
elif 'vicuna' in chat_template.lower():
|
|
chat_format = 'vicuna'
|
|
elif 'alpaca' in chat_template.lower():
|
|
chat_format = 'alpaca'
|
|
elif 'zephyr' in chat_template.lower():
|
|
chat_format = 'zephyr'
|
|
|
|
# Also check model_type or other fields
|
|
if not chat_format:
|
|
model_type = tc.get('model_type', '').lower()
|
|
if 'llama' in model_type:
|
|
chat_format = 'llama-2'
|
|
elif 'mistral' in model_type:
|
|
chat_format = 'mistral-instruct'
|
|
|
|
except (json.JSONDecodeError, IOError):
|
|
pass
|
|
|
|
# If still no format, try config.json
|
|
if not chat_format and has_config:
|
|
try:
|
|
with open(config_json, 'r') as f:
|
|
cfg = json.load(f)
|
|
|
|
model_type = cfg.get('model_type', '').lower()
|
|
architectures = cfg.get('architectures', [])
|
|
|
|
# Detect from model_type or architectures
|
|
arch_str = ' '.join(architectures).lower()
|
|
|
|
if 'llama' in model_type or 'llama' in arch_str:
|
|
chat_format = 'llama-2'
|
|
elif 'mistral' in model_type or 'mistral' in arch_str:
|
|
chat_format = 'mistral-instruct'
|
|
elif 'qwen' in model_type or 'qwen' in arch_str:
|
|
chat_format = 'chatml'
|
|
|
|
except (json.JSONDecodeError, IOError):
|
|
pass
|
|
|
|
return chat_format, metadata_dir, special_tokens
|
|
|
|
def unload_model(self):
|
|
"""Unload the current model and free resources."""
|
|
if self._model is not None:
|
|
del self._model
|
|
self._model = None
|
|
self._model_path = None
|
|
self._metadata_dir = None
|
|
self._special_tokens = {}
|
|
self._chat_format = None
|
|
self._chat_history.clear()
|
|
|
|
def generate(
|
|
self,
|
|
prompt: str,
|
|
max_tokens: int = None,
|
|
temperature: float = None,
|
|
top_p: float = None,
|
|
top_k: int = None,
|
|
repeat_penalty: float = None,
|
|
stop: List[str] = None,
|
|
stream: bool = False
|
|
) -> str | Generator[str, None, None]:
|
|
"""Generate text completion.
|
|
|
|
Args:
|
|
prompt: The input prompt.
|
|
max_tokens: Maximum tokens to generate. Uses config default if None.
|
|
temperature: Sampling temperature. Uses config default if None.
|
|
top_p: Nucleus sampling parameter. Uses config default if None.
|
|
top_k: Top-k sampling parameter. Uses config default if None.
|
|
repeat_penalty: Repetition penalty. Uses config default if None.
|
|
stop: List of stop sequences.
|
|
stream: If True, yields tokens as they're generated.
|
|
|
|
Returns:
|
|
Generated text string, or generator if stream=True.
|
|
|
|
Raises:
|
|
LLMError: If no model is loaded or generation fails.
|
|
"""
|
|
if not self.is_loaded:
|
|
raise LLMError("No model loaded. Call load_model() first.")
|
|
|
|
# Get defaults from config
|
|
settings = self.config.get_llama_settings()
|
|
|
|
params = {
|
|
'max_tokens': max_tokens or settings['max_tokens'],
|
|
'temperature': temperature if temperature is not None else settings['temperature'],
|
|
'top_p': top_p if top_p is not None else settings['top_p'],
|
|
'top_k': top_k if top_k is not None else settings['top_k'],
|
|
'repeat_penalty': repeat_penalty if repeat_penalty is not None else settings['repeat_penalty'],
|
|
'stop': stop or [],
|
|
'stream': stream,
|
|
}
|
|
|
|
try:
|
|
if stream:
|
|
return self._stream_generate(prompt, params)
|
|
else:
|
|
response = self._model(prompt, **params)
|
|
return response['choices'][0]['text']
|
|
|
|
except Exception as e:
|
|
raise LLMError(f"Generation failed: {e}")
|
|
|
|
def _stream_generate(self, prompt: str, params: dict) -> Generator[str, None, None]:
|
|
"""Internal streaming generation method.
|
|
|
|
Args:
|
|
prompt: The input prompt.
|
|
params: Generation parameters.
|
|
|
|
Yields:
|
|
Token strings as they're generated.
|
|
"""
|
|
try:
|
|
for chunk in self._model(prompt, **params):
|
|
token = chunk['choices'][0]['text']
|
|
yield token
|
|
except Exception as e:
|
|
raise LLMError(f"Streaming generation failed: {e}")
|
|
|
|
def chat(
|
|
self,
|
|
message: str,
|
|
system_prompt: str = None,
|
|
stream: bool = False,
|
|
**kwargs
|
|
) -> str | Generator[str, None, None]:
|
|
"""Chat-style interaction with conversation history.
|
|
|
|
Args:
|
|
message: User message.
|
|
system_prompt: Optional system prompt (used on first message).
|
|
stream: If True, yields tokens as they're generated.
|
|
**kwargs: Additional parameters passed to generate().
|
|
|
|
Returns:
|
|
Assistant response string, or generator if stream=True.
|
|
"""
|
|
if not self.is_loaded:
|
|
raise LLMError("No model loaded. Call load_model() first.")
|
|
|
|
# Initialize with system prompt if provided and history is empty
|
|
if system_prompt and not self._chat_history:
|
|
self._chat_history.append({
|
|
'role': 'system',
|
|
'content': system_prompt
|
|
})
|
|
|
|
# Add user message to history
|
|
self._chat_history.append({
|
|
'role': 'user',
|
|
'content': message
|
|
})
|
|
|
|
# Build prompt from history
|
|
prompt = self._build_chat_prompt()
|
|
|
|
# Generate response
|
|
if stream:
|
|
return self._stream_chat(prompt, kwargs)
|
|
else:
|
|
response = self.generate(prompt, stream=False, **kwargs)
|
|
# Clean up response and add to history
|
|
response = response.strip()
|
|
self._chat_history.append({
|
|
'role': 'assistant',
|
|
'content': response
|
|
})
|
|
return response
|
|
|
|
def _stream_chat(self, prompt: str, kwargs: dict) -> Generator[str, None, None]:
|
|
"""Internal streaming chat method.
|
|
|
|
Args:
|
|
prompt: The formatted prompt.
|
|
kwargs: Generation parameters.
|
|
|
|
Yields:
|
|
Token strings as they're generated.
|
|
"""
|
|
full_response = []
|
|
for token in self.generate(prompt, stream=True, **kwargs):
|
|
full_response.append(token)
|
|
yield token
|
|
|
|
# Add complete response to history
|
|
response = ''.join(full_response).strip()
|
|
self._chat_history.append({
|
|
'role': 'assistant',
|
|
'content': response
|
|
})
|
|
|
|
def _build_chat_prompt(self) -> str:
|
|
"""Build a chat prompt from conversation history.
|
|
|
|
Returns:
|
|
Formatted prompt string.
|
|
"""
|
|
# ChatML-style format (works with many models)
|
|
prompt_parts = []
|
|
|
|
for msg in self._chat_history:
|
|
role = msg['role']
|
|
content = msg['content']
|
|
|
|
if role == 'system':
|
|
prompt_parts.append(f"<|im_start|>system\n{content}<|im_end|>")
|
|
elif role == 'user':
|
|
prompt_parts.append(f"<|im_start|>user\n{content}<|im_end|>")
|
|
elif role == 'assistant':
|
|
prompt_parts.append(f"<|im_start|>assistant\n{content}<|im_end|>")
|
|
|
|
# Add assistant prompt for generation
|
|
prompt_parts.append("<|im_start|>assistant\n")
|
|
|
|
return "\n".join(prompt_parts)
|
|
|
|
def clear_history(self):
|
|
"""Clear the conversation history."""
|
|
self._chat_history.clear()
|
|
|
|
def get_history(self) -> List[Dict[str, str]]:
|
|
"""Get the current conversation history.
|
|
|
|
Returns:
|
|
List of message dictionaries with 'role' and 'content' keys.
|
|
"""
|
|
return self._chat_history.copy()
|
|
|
|
def set_history(self, history: List[Dict[str, str]]):
|
|
"""Set the conversation history.
|
|
|
|
Args:
|
|
history: List of message dictionaries.
|
|
"""
|
|
self._chat_history = history.copy()
|
|
|
|
def get_model_info(self) -> Dict[str, Any]:
|
|
"""Get information about the loaded model.
|
|
|
|
Returns:
|
|
Dictionary with model information.
|
|
"""
|
|
if not self.is_loaded:
|
|
return {'loaded': False}
|
|
|
|
return {
|
|
'loaded': True,
|
|
'model_path': self._model_path,
|
|
'model_name': self.model_name,
|
|
'n_ctx': self._model.n_ctx(),
|
|
'n_vocab': self._model.n_vocab(),
|
|
}
|
|
|
|
|
|
class TransformersLLM:
|
|
"""HuggingFace Transformers backend for safetensors models."""
|
|
|
|
def __init__(self, config=None):
|
|
self.config = config or get_config()
|
|
self._model = None
|
|
self._tokenizer = None
|
|
self._model_path = None
|
|
self._device = None
|
|
self._chat_history: List[Dict[str, str]] = []
|
|
|
|
@property
|
|
def is_loaded(self) -> bool:
|
|
return self._model is not None and self._tokenizer is not None
|
|
|
|
@property
|
|
def model_name(self) -> str:
|
|
if self._model_path:
|
|
return Path(self._model_path).name
|
|
return "No model loaded"
|
|
|
|
def load_model(self, model_path: str = None, verbose: bool = False) -> bool:
|
|
"""Load a safetensors model using HuggingFace Transformers.
|
|
|
|
Args:
|
|
model_path: Path to model directory OR HuggingFace model ID
|
|
(e.g., 'segolilylabs/Lily-Cybersecurity-7B-v0.2').
|
|
Uses config if not provided.
|
|
verbose: Whether to show loading progress.
|
|
|
|
Returns:
|
|
True if model loaded successfully.
|
|
|
|
Raises:
|
|
LLMError: If model loading fails.
|
|
"""
|
|
try:
|
|
import torch
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
except ImportError as e:
|
|
raise LLMError(f"transformers/torch not installed: {e}\nInstall with: pip install transformers torch")
|
|
|
|
# Get model path from config if not provided
|
|
if model_path is None:
|
|
model_path = self.config.get('transformers', 'model_path', '')
|
|
|
|
if not model_path:
|
|
raise LLMError("No model path configured. Run setup first.")
|
|
|
|
# Determine if this is a local path or HuggingFace model ID
|
|
model_id = model_path # For from_pretrained()
|
|
is_local = False
|
|
|
|
local_path = Path(model_path).expanduser()
|
|
if local_path.exists():
|
|
if self._is_valid_model_dir(local_path):
|
|
is_local = True
|
|
model_id = str(local_path)
|
|
else:
|
|
raise LLMError(f"Invalid model directory. Expected safetensors files in: {local_path}")
|
|
elif '/' in model_path and not model_path.startswith('/'):
|
|
# Looks like a HuggingFace model ID (e.g., 'org/model-name')
|
|
is_local = False
|
|
model_id = model_path
|
|
else:
|
|
raise LLMError(f"Model not found: {model_path}\nProvide a local path or HuggingFace model ID (e.g., 'segolilylabs/Lily-Cybersecurity-7B-v0.2')")
|
|
|
|
settings = self.config.get_transformers_settings()
|
|
|
|
if verbose:
|
|
display_name = Path(model_id).name if is_local else model_id
|
|
print(f"{Colors.CYAN}[*] Loading model: {display_name}{Colors.RESET}")
|
|
if not is_local:
|
|
print(f"{Colors.DIM} (from HuggingFace Hub/cache){Colors.RESET}")
|
|
|
|
try:
|
|
# Determine device
|
|
if settings['device'] == 'auto':
|
|
if torch.cuda.is_available():
|
|
self._device = 'cuda'
|
|
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
|
|
self._device = 'mps'
|
|
else:
|
|
self._device = 'cpu'
|
|
else:
|
|
self._device = settings['device']
|
|
|
|
if verbose:
|
|
print(f"{Colors.DIM} Device: {self._device}{Colors.RESET}")
|
|
|
|
# Determine dtype
|
|
if settings['torch_dtype'] == 'auto':
|
|
torch_dtype = torch.float16 if self._device != 'cpu' else torch.float32
|
|
elif settings['torch_dtype'] == 'float16':
|
|
torch_dtype = torch.float16
|
|
elif settings['torch_dtype'] == 'bfloat16':
|
|
torch_dtype = torch.bfloat16
|
|
else:
|
|
torch_dtype = torch.float32
|
|
|
|
# Load tokenizer
|
|
if verbose:
|
|
print(f"{Colors.DIM} Loading tokenizer...{Colors.RESET}")
|
|
self._tokenizer = AutoTokenizer.from_pretrained(
|
|
model_id,
|
|
trust_remote_code=settings['trust_remote_code']
|
|
)
|
|
|
|
# Prepare model loading kwargs
|
|
device_map_cfg = settings.get('device_map', 'auto') or 'auto'
|
|
model_kwargs = {
|
|
'torch_dtype': torch_dtype,
|
|
'trust_remote_code': settings['trust_remote_code'],
|
|
'device_map': device_map_cfg if self._device != 'cpu' else None,
|
|
}
|
|
|
|
# Handle quantization — requires bitsandbytes (Linux/CUDA only)
|
|
_bnb_ok = False
|
|
try:
|
|
import bitsandbytes # noqa: F401
|
|
_bnb_ok = True
|
|
except (ImportError, Exception):
|
|
pass
|
|
|
|
if settings['load_in_8bit'] or settings['load_in_4bit']:
|
|
if _bnb_ok:
|
|
if settings['load_in_8bit']:
|
|
model_kwargs['load_in_8bit'] = True
|
|
# Enable FP32 CPU offload if requested — required when model layers
|
|
# are dispatched to CPU/disk during 8-bit quantization
|
|
if settings.get('llm_int8_enable_fp32_cpu_offload', False):
|
|
model_kwargs['llm_int8_enable_fp32_cpu_offload'] = True
|
|
_llm_logger.info("[LLM] llm_int8_enable_fp32_cpu_offload=True enabled")
|
|
if verbose:
|
|
print(f"{Colors.DIM} Loading in 8-bit quantization...{Colors.RESET}")
|
|
elif settings['load_in_4bit']:
|
|
model_kwargs['load_in_4bit'] = True
|
|
if verbose:
|
|
print(f"{Colors.DIM} Loading in 4-bit quantization...{Colors.RESET}")
|
|
else:
|
|
_llm_logger.warning(
|
|
"[LLM] load_in_8bit/load_in_4bit requested but bitsandbytes is not installed "
|
|
"(Windows is not supported). Loading without quantization."
|
|
)
|
|
|
|
# Load model
|
|
if verbose:
|
|
print(f"{Colors.DIM} Loading model weights...{Colors.RESET}")
|
|
self._model = AutoModelForCausalLM.from_pretrained(
|
|
model_id,
|
|
**model_kwargs
|
|
)
|
|
|
|
# Move to device if not using device_map
|
|
if 'device_map' not in model_kwargs or model_kwargs['device_map'] is None:
|
|
self._model = self._model.to(self._device)
|
|
|
|
self._model.eval()
|
|
self._model_path = model_id
|
|
|
|
if verbose:
|
|
print(f"{Colors.GREEN}[+] Model loaded successfully{Colors.RESET}")
|
|
|
|
return True
|
|
|
|
except Exception as e:
|
|
self._model = None
|
|
self._tokenizer = None
|
|
self._model_path = None
|
|
raise LLMError(f"Failed to load model: {e}")
|
|
|
|
def _is_valid_model_dir(self, path: Path) -> bool:
|
|
"""Check if directory contains a valid safetensors model."""
|
|
if not path.is_dir():
|
|
return False
|
|
|
|
# Check for safetensors files
|
|
safetensor_files = list(path.glob("*.safetensors"))
|
|
if safetensor_files:
|
|
return True
|
|
|
|
# Check for model index
|
|
index_file = path / "model.safetensors.index.json"
|
|
if index_file.exists():
|
|
return True
|
|
|
|
# Check for config.json (indicates HF model)
|
|
config_file = path / "config.json"
|
|
if config_file.exists():
|
|
return True
|
|
|
|
return False
|
|
|
|
def unload_model(self):
|
|
"""Unload the current model and free resources."""
|
|
if self._model is not None:
|
|
del self._model
|
|
self._model = None
|
|
if self._tokenizer is not None:
|
|
del self._tokenizer
|
|
self._tokenizer = None
|
|
self._model_path = None
|
|
self._device = None
|
|
self._chat_history.clear()
|
|
|
|
# Clear GPU cache if available
|
|
try:
|
|
import torch
|
|
if torch.cuda.is_available():
|
|
torch.cuda.empty_cache()
|
|
except ImportError:
|
|
pass
|
|
|
|
def generate(
|
|
self,
|
|
prompt: str,
|
|
max_tokens: int = None,
|
|
temperature: float = None,
|
|
top_p: float = None,
|
|
top_k: int = None,
|
|
repeat_penalty: float = None,
|
|
stop: List[str] = None,
|
|
stream: bool = False
|
|
) -> str | Generator[str, None, None]:
|
|
"""Generate text completion using transformers."""
|
|
if not self.is_loaded:
|
|
raise LLMError("No model loaded. Call load_model() first.")
|
|
|
|
try:
|
|
import torch
|
|
except ImportError:
|
|
raise LLMError("torch not installed")
|
|
|
|
settings = self.config.get_transformers_settings()
|
|
|
|
# Tokenize input
|
|
inputs = self._tokenizer(prompt, return_tensors="pt")
|
|
inputs = {k: v.to(self._device) for k, v in inputs.items()}
|
|
|
|
# Generation parameters
|
|
gen_kwargs = {
|
|
'max_new_tokens': max_tokens or settings['max_tokens'],
|
|
'temperature': temperature if temperature is not None else settings['temperature'],
|
|
'top_p': top_p if top_p is not None else settings['top_p'],
|
|
'top_k': top_k if top_k is not None else settings['top_k'],
|
|
'repetition_penalty': repeat_penalty if repeat_penalty is not None else settings['repetition_penalty'],
|
|
'do_sample': True,
|
|
'pad_token_id': self._tokenizer.eos_token_id,
|
|
}
|
|
|
|
# Handle temperature=0
|
|
if gen_kwargs['temperature'] == 0:
|
|
gen_kwargs['do_sample'] = False
|
|
del gen_kwargs['temperature']
|
|
del gen_kwargs['top_p']
|
|
del gen_kwargs['top_k']
|
|
|
|
try:
|
|
if stream:
|
|
return self._stream_generate(inputs, gen_kwargs, stop)
|
|
else:
|
|
with torch.no_grad():
|
|
outputs = self._model.generate(**inputs, **gen_kwargs)
|
|
# Decode only the new tokens
|
|
response = self._tokenizer.decode(
|
|
outputs[0][inputs['input_ids'].shape[1]:],
|
|
skip_special_tokens=True
|
|
)
|
|
# Handle stop sequences
|
|
if stop:
|
|
for stop_seq in stop:
|
|
if stop_seq in response:
|
|
response = response.split(stop_seq)[0]
|
|
return response
|
|
|
|
except Exception as e:
|
|
raise LLMError(f"Generation failed: {e}")
|
|
|
|
def _stream_generate(self, inputs: dict, gen_kwargs: dict, stop: List[str] = None) -> Generator[str, None, None]:
|
|
"""Internal streaming generation using TextIteratorStreamer."""
|
|
try:
|
|
import torch
|
|
from transformers import TextIteratorStreamer
|
|
from threading import Thread
|
|
except ImportError as e:
|
|
raise LLMError(f"Required packages not installed: {e}")
|
|
|
|
streamer = TextIteratorStreamer(
|
|
self._tokenizer,
|
|
skip_prompt=True,
|
|
skip_special_tokens=True
|
|
)
|
|
gen_kwargs['streamer'] = streamer
|
|
|
|
# Run generation in background thread
|
|
thread = Thread(target=lambda: self._model.generate(**inputs, **gen_kwargs))
|
|
thread.start()
|
|
|
|
# Yield tokens as they're generated
|
|
full_text = ""
|
|
for text in streamer:
|
|
# Check for stop sequences
|
|
if stop:
|
|
for stop_seq in stop:
|
|
if stop_seq in text:
|
|
text = text.split(stop_seq)[0]
|
|
yield text
|
|
return
|
|
full_text += text
|
|
yield text
|
|
|
|
thread.join()
|
|
|
|
def chat(
|
|
self,
|
|
message: str,
|
|
system_prompt: str = None,
|
|
stream: bool = False,
|
|
**kwargs
|
|
) -> str | Generator[str, None, None]:
|
|
"""Chat-style interaction with conversation history."""
|
|
if not self.is_loaded:
|
|
raise LLMError("No model loaded. Call load_model() first.")
|
|
|
|
# Initialize with system prompt if provided and history is empty
|
|
if system_prompt and not self._chat_history:
|
|
self._chat_history.append({
|
|
'role': 'system',
|
|
'content': system_prompt
|
|
})
|
|
|
|
# Add user message to history
|
|
self._chat_history.append({
|
|
'role': 'user',
|
|
'content': message
|
|
})
|
|
|
|
# Build prompt from history
|
|
prompt = self._build_chat_prompt()
|
|
|
|
# Generate response
|
|
if stream:
|
|
return self._stream_chat(prompt, kwargs)
|
|
else:
|
|
response = self.generate(prompt, stream=False, **kwargs)
|
|
response = response.strip()
|
|
self._chat_history.append({
|
|
'role': 'assistant',
|
|
'content': response
|
|
})
|
|
return response
|
|
|
|
def _stream_chat(self, prompt: str, kwargs: dict) -> Generator[str, None, None]:
|
|
"""Internal streaming chat method."""
|
|
full_response = []
|
|
for token in self.generate(prompt, stream=True, **kwargs):
|
|
full_response.append(token)
|
|
yield token
|
|
|
|
response = ''.join(full_response).strip()
|
|
self._chat_history.append({
|
|
'role': 'assistant',
|
|
'content': response
|
|
})
|
|
|
|
def _build_chat_prompt(self) -> str:
|
|
"""Build a chat prompt from conversation history."""
|
|
# Try to use the tokenizer's chat template if available
|
|
if hasattr(self._tokenizer, 'apply_chat_template'):
|
|
try:
|
|
return self._tokenizer.apply_chat_template(
|
|
self._chat_history,
|
|
tokenize=False,
|
|
add_generation_prompt=True
|
|
)
|
|
except Exception:
|
|
pass
|
|
|
|
# Fallback to ChatML format
|
|
prompt_parts = []
|
|
for msg in self._chat_history:
|
|
role = msg['role']
|
|
content = msg['content']
|
|
if role == 'system':
|
|
prompt_parts.append(f"<|im_start|>system\n{content}<|im_end|>")
|
|
elif role == 'user':
|
|
prompt_parts.append(f"<|im_start|>user\n{content}<|im_end|>")
|
|
elif role == 'assistant':
|
|
prompt_parts.append(f"<|im_start|>assistant\n{content}<|im_end|>")
|
|
|
|
prompt_parts.append("<|im_start|>assistant\n")
|
|
return "\n".join(prompt_parts)
|
|
|
|
def clear_history(self):
|
|
"""Clear the conversation history."""
|
|
self._chat_history.clear()
|
|
|
|
def get_history(self) -> List[Dict[str, str]]:
|
|
"""Get the current conversation history."""
|
|
return self._chat_history.copy()
|
|
|
|
def set_history(self, history: List[Dict[str, str]]):
|
|
"""Set the conversation history."""
|
|
self._chat_history = history.copy()
|
|
|
|
def get_model_info(self) -> Dict[str, Any]:
|
|
"""Get information about the loaded model."""
|
|
if not self.is_loaded:
|
|
return {'loaded': False}
|
|
|
|
info = {
|
|
'loaded': True,
|
|
'model_path': self._model_path,
|
|
'model_name': self.model_name,
|
|
'device': self._device,
|
|
'backend': 'transformers',
|
|
}
|
|
|
|
# Add vocab size if available
|
|
if hasattr(self._tokenizer, 'vocab_size'):
|
|
info['vocab_size'] = self._tokenizer.vocab_size
|
|
|
|
return info
|
|
|
|
|
|
class ClaudeLLM:
|
|
"""Claude API backend implementing the same interface as LLM."""
|
|
|
|
def __init__(self, config=None):
|
|
self.config = config or get_config()
|
|
self._client = None
|
|
self._model = None
|
|
self._chat_history: List[Dict[str, str]] = []
|
|
|
|
@property
|
|
def is_loaded(self) -> bool:
|
|
return self._client is not None
|
|
|
|
@property
|
|
def model_name(self) -> str:
|
|
if self._model:
|
|
return self._model
|
|
return "No model loaded"
|
|
|
|
def load_model(self, model_path: str = None, verbose: bool = False) -> bool:
|
|
"""Initialize the Anthropic client.
|
|
|
|
Args:
|
|
model_path: Ignored for Claude (model set in config).
|
|
verbose: Whether to show status messages.
|
|
|
|
Returns:
|
|
True if client initialized successfully.
|
|
|
|
Raises:
|
|
LLMError: If initialization fails.
|
|
"""
|
|
try:
|
|
import anthropic
|
|
except ImportError as e:
|
|
raise LLMError(f"anthropic package not installed: {e}")
|
|
|
|
import os
|
|
settings = self.config.get_claude_settings()
|
|
api_key = settings['api_key'] or os.environ.get('ANTHROPIC_API_KEY', '')
|
|
|
|
if not api_key:
|
|
raise LLMError(
|
|
"No Claude API key found. Set it in autarch_settings.conf [claude] section "
|
|
"or export ANTHROPIC_API_KEY environment variable."
|
|
)
|
|
|
|
self._model = settings['model']
|
|
|
|
if verbose:
|
|
print(f"{Colors.CYAN}[*] Initializing Claude API: {self._model}{Colors.RESET}")
|
|
|
|
try:
|
|
self._client = anthropic.Anthropic(api_key=api_key)
|
|
if verbose:
|
|
print(f"{Colors.GREEN}[+] Claude API ready{Colors.RESET}")
|
|
return True
|
|
except Exception as e:
|
|
self._client = None
|
|
self._model = None
|
|
raise LLMError(f"Failed to initialize Claude client: {e}")
|
|
|
|
def unload_model(self):
|
|
"""Clear the client and history."""
|
|
self._client = None
|
|
self._model = None
|
|
self._chat_history.clear()
|
|
|
|
def generate(
|
|
self,
|
|
prompt: str,
|
|
max_tokens: int = None,
|
|
temperature: float = None,
|
|
top_p: float = None,
|
|
top_k: int = None,
|
|
repeat_penalty: float = None,
|
|
stop: List[str] = None,
|
|
stream: bool = False
|
|
) -> str | Generator[str, None, None]:
|
|
"""Generate text from a prompt via Claude API.
|
|
|
|
The prompt is sent as a single user message.
|
|
"""
|
|
if not self.is_loaded:
|
|
raise LLMError("Claude client not initialized. Call load_model() first.")
|
|
|
|
settings = self.config.get_claude_settings()
|
|
|
|
params = {
|
|
'model': self._model,
|
|
'max_tokens': max_tokens or settings['max_tokens'],
|
|
'messages': [{'role': 'user', 'content': prompt}],
|
|
}
|
|
|
|
temp = temperature if temperature is not None else settings['temperature']
|
|
if temp is not None:
|
|
params['temperature'] = temp
|
|
if top_p is not None:
|
|
params['top_p'] = top_p
|
|
if top_k is not None:
|
|
params['top_k'] = top_k
|
|
if stop:
|
|
params['stop_sequences'] = stop
|
|
|
|
try:
|
|
if stream:
|
|
return self._stream_generate(params)
|
|
else:
|
|
response = self._client.messages.create(**params)
|
|
return response.content[0].text
|
|
except Exception as e:
|
|
raise LLMError(f"Claude generation failed: {e}")
|
|
|
|
def _stream_generate(self, params: dict) -> Generator[str, None, None]:
|
|
"""Internal streaming generation."""
|
|
try:
|
|
with self._client.messages.stream(**params) as stream:
|
|
for text in stream.text_stream:
|
|
yield text
|
|
except Exception as e:
|
|
raise LLMError(f"Claude streaming failed: {e}")
|
|
|
|
def chat(
|
|
self,
|
|
message: str,
|
|
system_prompt: str = None,
|
|
stream: bool = False,
|
|
**kwargs
|
|
) -> str | Generator[str, None, None]:
|
|
"""Chat-style interaction with conversation history via Claude API."""
|
|
if not self.is_loaded:
|
|
raise LLMError("Claude client not initialized. Call load_model() first.")
|
|
|
|
# Store system prompt in history for tracking (same as LLM)
|
|
if system_prompt and not self._chat_history:
|
|
self._chat_history.append({
|
|
'role': 'system',
|
|
'content': system_prompt
|
|
})
|
|
|
|
# Add user message to history
|
|
self._chat_history.append({
|
|
'role': 'user',
|
|
'content': message
|
|
})
|
|
|
|
# Build API call from history
|
|
system_text = None
|
|
messages = []
|
|
for msg in self._chat_history:
|
|
if msg['role'] == 'system':
|
|
system_text = msg['content']
|
|
else:
|
|
messages.append({'role': msg['role'], 'content': msg['content']})
|
|
|
|
settings = self.config.get_claude_settings()
|
|
|
|
params = {
|
|
'model': self._model,
|
|
'max_tokens': kwargs.get('max_tokens', settings['max_tokens']),
|
|
'messages': messages,
|
|
}
|
|
|
|
if system_text:
|
|
params['system'] = system_text
|
|
|
|
temp = kwargs.get('temperature', settings['temperature'])
|
|
if temp is not None:
|
|
params['temperature'] = temp
|
|
if 'top_p' in kwargs:
|
|
params['top_p'] = kwargs['top_p']
|
|
if 'top_k' in kwargs:
|
|
params['top_k'] = kwargs['top_k']
|
|
if 'stop' in kwargs and kwargs['stop']:
|
|
params['stop_sequences'] = kwargs['stop']
|
|
|
|
try:
|
|
if stream:
|
|
return self._stream_chat(params)
|
|
else:
|
|
response = self._client.messages.create(**params)
|
|
text = response.content[0].text.strip()
|
|
self._chat_history.append({
|
|
'role': 'assistant',
|
|
'content': text
|
|
})
|
|
return text
|
|
except Exception as e:
|
|
raise LLMError(f"Claude chat failed: {e}")
|
|
|
|
def _stream_chat(self, params: dict) -> Generator[str, None, None]:
|
|
"""Internal streaming chat method."""
|
|
full_response = []
|
|
try:
|
|
with self._client.messages.stream(**params) as stream:
|
|
for text in stream.text_stream:
|
|
full_response.append(text)
|
|
yield text
|
|
except Exception as e:
|
|
raise LLMError(f"Claude streaming chat failed: {e}")
|
|
|
|
response = ''.join(full_response).strip()
|
|
self._chat_history.append({
|
|
'role': 'assistant',
|
|
'content': response
|
|
})
|
|
|
|
def clear_history(self):
|
|
"""Clear the conversation history."""
|
|
self._chat_history.clear()
|
|
|
|
def get_history(self) -> List[Dict[str, str]]:
|
|
"""Get the current conversation history."""
|
|
return self._chat_history.copy()
|
|
|
|
def set_history(self, history: List[Dict[str, str]]):
|
|
"""Set the conversation history."""
|
|
self._chat_history = history.copy()
|
|
|
|
def get_model_info(self) -> Dict[str, Any]:
|
|
"""Get information about the Claude model."""
|
|
if not self.is_loaded:
|
|
return {'loaded': False}
|
|
|
|
return {
|
|
'loaded': True,
|
|
'model_path': 'Claude API',
|
|
'model_name': self.model_name,
|
|
'backend': 'claude',
|
|
}
|
|
|
|
|
|
class HuggingFaceLLM:
|
|
"""HuggingFace Inference API backend implementing the same interface as LLM.
|
|
|
|
Uses the huggingface_hub InferenceClient to call HF-hosted models
|
|
or any compatible text-generation-inference endpoint.
|
|
"""
|
|
|
|
def __init__(self, config=None):
|
|
self.config = config or get_config()
|
|
self._client = None
|
|
self._model = None
|
|
self._chat_history: List[Dict[str, str]] = []
|
|
|
|
@property
|
|
def is_loaded(self) -> bool:
|
|
return self._client is not None
|
|
|
|
@property
|
|
def model_name(self) -> str:
|
|
if self._model:
|
|
return self._model
|
|
return "No model loaded"
|
|
|
|
def load_model(self, model_path: str = None, verbose: bool = False) -> bool:
|
|
"""Initialize the HuggingFace Inference client."""
|
|
try:
|
|
from huggingface_hub import InferenceClient
|
|
except ImportError as e:
|
|
raise LLMError(f"huggingface_hub package not installed: {e}")
|
|
|
|
import os
|
|
settings = self._get_settings()
|
|
api_key = settings.get('api_key', '') or os.environ.get('HF_TOKEN', '') or os.environ.get('HUGGING_FACE_HUB_TOKEN', '')
|
|
model = model_path or settings.get('model', 'mistralai/Mistral-7B-Instruct-v0.3')
|
|
endpoint = settings.get('endpoint', '')
|
|
|
|
self._model = model
|
|
|
|
if verbose:
|
|
print(f"{Colors.CYAN}[*] Initializing HuggingFace Inference: {self._model}{Colors.RESET}")
|
|
if endpoint:
|
|
print(f"{Colors.DIM} Endpoint: {endpoint}{Colors.RESET}")
|
|
|
|
try:
|
|
kwargs = {}
|
|
if api_key:
|
|
kwargs['token'] = api_key
|
|
if endpoint:
|
|
kwargs['model'] = endpoint
|
|
else:
|
|
kwargs['model'] = model
|
|
|
|
self._client = InferenceClient(**kwargs)
|
|
|
|
if verbose:
|
|
print(f"{Colors.GREEN}[+] HuggingFace Inference ready{Colors.RESET}")
|
|
return True
|
|
except Exception as e:
|
|
self._client = None
|
|
self._model = None
|
|
raise LLMError(f"Failed to initialize HuggingFace client: {e}")
|
|
|
|
def _get_settings(self) -> dict:
|
|
"""Get HuggingFace settings from config."""
|
|
return {
|
|
'api_key': self.config.get('huggingface', 'api_key', fallback=''),
|
|
'model': self.config.get('huggingface', 'model', fallback='mistralai/Mistral-7B-Instruct-v0.3'),
|
|
'endpoint': self.config.get('huggingface', 'endpoint', fallback=''),
|
|
'max_tokens': int(self.config.get('huggingface', 'max_tokens', fallback='1024')),
|
|
'temperature': float(self.config.get('huggingface', 'temperature', fallback='0.7')),
|
|
'top_p': float(self.config.get('huggingface', 'top_p', fallback='0.9')),
|
|
}
|
|
|
|
def unload_model(self):
|
|
"""Clear the client and history."""
|
|
self._client = None
|
|
self._model = None
|
|
self._chat_history.clear()
|
|
|
|
def generate(
|
|
self,
|
|
prompt: str,
|
|
max_tokens: int = None,
|
|
temperature: float = None,
|
|
top_p: float = None,
|
|
top_k: int = None,
|
|
repeat_penalty: float = None,
|
|
stop: List[str] = None,
|
|
stream: bool = False
|
|
) -> str | Generator[str, None, None]:
|
|
"""Generate text via HuggingFace Inference API."""
|
|
if not self.is_loaded:
|
|
raise LLMError("HuggingFace client not initialized. Call load_model() first.")
|
|
|
|
settings = self._get_settings()
|
|
|
|
params = {
|
|
'max_new_tokens': max_tokens or settings['max_tokens'],
|
|
'temperature': temperature if temperature is not None else settings['temperature'],
|
|
'top_p': top_p if top_p is not None else settings['top_p'],
|
|
}
|
|
if top_k is not None:
|
|
params['top_k'] = top_k
|
|
if repeat_penalty is not None:
|
|
params['repetition_penalty'] = repeat_penalty
|
|
if stop:
|
|
params['stop_sequences'] = stop
|
|
|
|
try:
|
|
if stream:
|
|
return self._stream_generate(prompt, params)
|
|
else:
|
|
response = self._client.text_generation(
|
|
prompt,
|
|
**params
|
|
)
|
|
return response
|
|
except Exception as e:
|
|
raise LLMError(f"HuggingFace generation failed: {e}")
|
|
|
|
def _stream_generate(self, prompt: str, params: dict) -> Generator[str, None, None]:
|
|
"""Internal streaming generation."""
|
|
try:
|
|
for token in self._client.text_generation(
|
|
prompt,
|
|
stream=True,
|
|
**params
|
|
):
|
|
yield token
|
|
except Exception as e:
|
|
raise LLMError(f"HuggingFace streaming failed: {e}")
|
|
|
|
def chat(
|
|
self,
|
|
message: str,
|
|
system_prompt: str = None,
|
|
stream: bool = False,
|
|
**kwargs
|
|
) -> str | Generator[str, None, None]:
|
|
"""Chat-style interaction via HuggingFace Inference API."""
|
|
if not self.is_loaded:
|
|
raise LLMError("HuggingFace client not initialized. Call load_model() first.")
|
|
|
|
if system_prompt and not self._chat_history:
|
|
self._chat_history.append({
|
|
'role': 'system',
|
|
'content': system_prompt
|
|
})
|
|
|
|
self._chat_history.append({
|
|
'role': 'user',
|
|
'content': message
|
|
})
|
|
|
|
# Build messages for chat completion
|
|
messages = []
|
|
for msg in self._chat_history:
|
|
messages.append({'role': msg['role'], 'content': msg['content']})
|
|
|
|
settings = self._get_settings()
|
|
|
|
try:
|
|
if stream:
|
|
return self._stream_chat(messages, settings, kwargs)
|
|
else:
|
|
response = self._client.chat_completion(
|
|
messages=messages,
|
|
max_tokens=kwargs.get('max_tokens', settings['max_tokens']),
|
|
temperature=kwargs.get('temperature', settings['temperature']),
|
|
)
|
|
text = response.choices[0].message.content.strip()
|
|
self._chat_history.append({
|
|
'role': 'assistant',
|
|
'content': text
|
|
})
|
|
return text
|
|
except Exception as e:
|
|
raise LLMError(f"HuggingFace chat failed: {e}")
|
|
|
|
def _stream_chat(self, messages: list, settings: dict, kwargs: dict) -> Generator[str, None, None]:
|
|
"""Internal streaming chat."""
|
|
full_response = []
|
|
try:
|
|
stream = self._client.chat_completion(
|
|
messages=messages,
|
|
max_tokens=kwargs.get('max_tokens', settings['max_tokens']),
|
|
temperature=kwargs.get('temperature', settings['temperature']),
|
|
stream=True,
|
|
)
|
|
for chunk in stream:
|
|
if chunk.choices and chunk.choices[0].delta.content:
|
|
text = chunk.choices[0].delta.content
|
|
full_response.append(text)
|
|
yield text
|
|
except Exception as e:
|
|
raise LLMError(f"HuggingFace streaming chat failed: {e}")
|
|
|
|
response = ''.join(full_response).strip()
|
|
self._chat_history.append({
|
|
'role': 'assistant',
|
|
'content': response
|
|
})
|
|
|
|
def clear_history(self):
|
|
self._chat_history.clear()
|
|
|
|
def get_history(self) -> List[Dict[str, str]]:
|
|
return self._chat_history.copy()
|
|
|
|
def set_history(self, history: List[Dict[str, str]]):
|
|
self._chat_history = history.copy()
|
|
|
|
def get_model_info(self) -> Dict[str, Any]:
|
|
if not self.is_loaded:
|
|
return {'loaded': False}
|
|
settings = self._get_settings()
|
|
return {
|
|
'loaded': True,
|
|
'model_path': settings.get('endpoint', '') or 'HuggingFace Hub',
|
|
'model_name': self.model_name,
|
|
'backend': 'huggingface',
|
|
}
|
|
|
|
|
|
# Global LLM instance
|
|
_llm_instance = None
|
|
|
|
|
|
def get_llm():
|
|
"""Get the global LLM instance, auto-loading the model if needed.
|
|
|
|
Returns the appropriate backend (LLM, TransformersLLM, ClaudeLLM, or HuggingFaceLLM) based on config.
|
|
"""
|
|
global _llm_instance
|
|
if _llm_instance is None:
|
|
config = get_config()
|
|
backend = config.get('autarch', 'llm_backend', 'local')
|
|
_llm_logger.info(f"[LLM] Initializing backend: {backend}")
|
|
|
|
try:
|
|
if backend == 'claude':
|
|
settings = config.get_claude_settings()
|
|
_llm_logger.info(f"[LLM] Claude model: {settings['model']} | API key set: {bool(settings['api_key'])}")
|
|
_llm_instance = ClaudeLLM(config)
|
|
_llm_instance.load_model()
|
|
_llm_logger.info(f"[LLM] Claude client ready: {settings['model']}")
|
|
|
|
elif backend == 'transformers':
|
|
settings = config.get_transformers_settings()
|
|
_llm_logger.info(f"[LLM] Transformers model: {settings['model_path']} | device: {settings['device']}")
|
|
_llm_instance = TransformersLLM(config)
|
|
if settings['model_path']:
|
|
_llm_instance.load_model()
|
|
_llm_logger.info(f"[LLM] Transformers model loaded: {settings['model_path']}")
|
|
else:
|
|
_llm_logger.warning("[LLM] No transformers model path configured — set one in LLM Settings")
|
|
|
|
elif backend == 'huggingface':
|
|
hf = config.get_huggingface_settings()
|
|
_llm_logger.info(f"[LLM] HuggingFace model: {hf['model']} | provider: {hf.get('provider','auto')} | API key set: {bool(hf['api_key'])}")
|
|
_llm_instance = HuggingFaceLLM(config)
|
|
_llm_instance.load_model()
|
|
_llm_logger.info(f"[LLM] HuggingFace client ready: {hf['model']}")
|
|
|
|
else: # local / llama.cpp
|
|
settings = config.get_llama_settings()
|
|
_llm_logger.info(f"[LLM] llama.cpp model: {settings['model_path']} | n_ctx: {settings['n_ctx']} | n_gpu_layers: {settings['n_gpu_layers']} | threads: {settings['n_threads']}")
|
|
_llm_instance = LLM(config)
|
|
if settings['model_path']:
|
|
_llm_instance.load_model()
|
|
_llm_logger.info(f"[LLM] llama.cpp model loaded: {settings['model_path']}")
|
|
else:
|
|
_llm_logger.warning("[LLM] No local model path configured — set one in LLM Settings")
|
|
|
|
except Exception as exc:
|
|
_llm_logger.error(f"[LLM] Failed to load backend '{backend}': {exc}", exc_info=True)
|
|
_llm_instance = None
|
|
raise
|
|
|
|
return _llm_instance
|
|
|
|
|
|
def detect_model_type(path: str) -> str:
|
|
"""Detect the type of model at the given path.
|
|
|
|
Args:
|
|
path: Path to model file or directory
|
|
|
|
Returns:
|
|
'gguf' for GGUF files, 'transformers' for safetensors directories,
|
|
'unknown' if cannot be determined
|
|
"""
|
|
path = Path(path).expanduser()
|
|
|
|
if not path.exists():
|
|
return 'unknown'
|
|
|
|
# Check for GGUF file
|
|
if path.is_file():
|
|
if path.suffix.lower() == '.gguf':
|
|
return 'gguf'
|
|
# Some GGUF files might not have .gguf extension
|
|
# Check magic bytes
|
|
try:
|
|
with open(path, 'rb') as f:
|
|
magic = f.read(4)
|
|
if magic == b'GGUF':
|
|
return 'gguf'
|
|
except Exception:
|
|
pass
|
|
|
|
# Check for transformers/safetensors directory
|
|
if path.is_dir():
|
|
# Check for safetensors files
|
|
safetensor_files = list(path.glob("*.safetensors"))
|
|
if safetensor_files:
|
|
return 'transformers'
|
|
|
|
# Check for model index
|
|
index_file = path / "model.safetensors.index.json"
|
|
if index_file.exists():
|
|
return 'transformers'
|
|
|
|
# Check for config.json (indicates HF model)
|
|
config_file = path / "config.json"
|
|
if config_file.exists():
|
|
# Could be safetensors or pytorch
|
|
if list(path.glob("*.safetensors")) or (path / "model.safetensors.index.json").exists():
|
|
return 'transformers'
|
|
# Check for pytorch files too
|
|
if list(path.glob("*.bin")) or (path / "pytorch_model.bin").exists():
|
|
return 'transformers'
|
|
|
|
return 'unknown'
|
|
|
|
|
|
def reset_llm():
|
|
"""Reset the global LLM instance (used when switching backends)."""
|
|
global _llm_instance
|
|
if _llm_instance is not None:
|
|
_llm_logger.info("[LLM] Unloading current model instance")
|
|
_llm_instance.unload_model()
|
|
_llm_instance = None
|
|
_llm_logger.info("[LLM] Instance reset — next call to get_llm() will reload")
|