718 lines
26 KiB
Python
718 lines
26 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Multi-Model MCP Server for LLM_Train
|
||
|
|
|
||
|
|
This server provides MCP (Model Context Protocol) access to multiple local models
|
||
|
|
managed by the LLM_Train application. It supports model discovery, switching,
|
||
|
|
and inference through a standardized interface.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import asyncio
|
||
|
|
import json
|
||
|
|
import logging
|
||
|
|
import os
|
||
|
|
import sys
|
||
|
|
import platform
|
||
|
|
import subprocess
|
||
|
|
from typing import Any, Dict, List, Optional, Sequence
|
||
|
|
import argparse
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
try:
|
||
|
|
from mcp.server import Server
|
||
|
|
from mcp.server.models import InitializationOptions
|
||
|
|
from mcp.server.stdio import stdio_server
|
||
|
|
from mcp.types import (
|
||
|
|
CallToolRequestParams,
|
||
|
|
GetPromptRequestParams,
|
||
|
|
ListPromptsRequestParams,
|
||
|
|
ListToolsRequestParams,
|
||
|
|
Prompt,
|
||
|
|
PromptMessage,
|
||
|
|
Resource,
|
||
|
|
TextContent,
|
||
|
|
Tool,
|
||
|
|
EmbeddedResource,
|
||
|
|
)
|
||
|
|
except ImportError:
|
||
|
|
print("MCP library not found. Install with: pip install mcp", file=sys.stderr)
|
||
|
|
sys.exit(1)
|
||
|
|
|
||
|
|
# Import our local modules
|
||
|
|
try:
|
||
|
|
from model_library import ModelLibrary, ModelInfo
|
||
|
|
from settings_manager import SettingsManager
|
||
|
|
from llama_cpp import Llama
|
||
|
|
except ImportError as e:
|
||
|
|
print(f"Required modules not found: {e}", file=sys.stderr)
|
||
|
|
sys.exit(1)
|
||
|
|
|
||
|
|
|
||
|
|
class MultiModelMCPServer:
|
||
|
|
"""MCP Server for managing multiple local models with CUDA support."""
|
||
|
|
|
||
|
|
def __init__(self, settings_path: str = "settings.json"):
|
||
|
|
self.settings = SettingsManager(settings_path)
|
||
|
|
self.library = None
|
||
|
|
self.current_model = None
|
||
|
|
self.current_llm = None
|
||
|
|
self.model_cache = {} # Cache for loaded models
|
||
|
|
|
||
|
|
# Initialize model library if configured
|
||
|
|
library_root = self.settings.get('library.root_folder', '')
|
||
|
|
if library_root and os.path.exists(library_root):
|
||
|
|
max_depth = self.settings.get('library.max_depth', 3)
|
||
|
|
self.library = ModelLibrary(library_root, max_depth)
|
||
|
|
self.library._load_index()
|
||
|
|
|
||
|
|
# Setup logging
|
||
|
|
logging.basicConfig(level=logging.INFO)
|
||
|
|
self.logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
# Detect system capabilities
|
||
|
|
self.system_info = self._detect_system_capabilities()
|
||
|
|
self.logger.info(f"System capabilities: {self.system_info}")
|
||
|
|
|
||
|
|
def _detect_system_capabilities(self) -> Dict[str, Any]:
|
||
|
|
"""Detect system capabilities including CUDA, ROCm, and Metal support."""
|
||
|
|
capabilities = {
|
||
|
|
"platform": platform.system(),
|
||
|
|
"architecture": platform.machine(),
|
||
|
|
"cuda_available": False,
|
||
|
|
"cuda_version": None,
|
||
|
|
"cuda_devices": 0,
|
||
|
|
"rocm_available": False,
|
||
|
|
"metal_available": False,
|
||
|
|
"intel_gpu_available": False,
|
||
|
|
"recommended_layers": 0
|
||
|
|
}
|
||
|
|
|
||
|
|
try:
|
||
|
|
# Check for CUDA (NVIDIA)
|
||
|
|
if capabilities["platform"] in ["Windows", "Linux"]:
|
||
|
|
try:
|
||
|
|
# Try nvidia-smi command
|
||
|
|
result = subprocess.run(
|
||
|
|
["nvidia-smi", "--query-gpu=count,driver_version", "--format=csv,noheader,nounits"],
|
||
|
|
capture_output=True, text=True, timeout=10
|
||
|
|
)
|
||
|
|
if result.returncode == 0:
|
||
|
|
lines = result.stdout.strip().split('\n')
|
||
|
|
if lines and lines[0]:
|
||
|
|
parts = lines[0].split(', ')
|
||
|
|
if len(parts) >= 2:
|
||
|
|
capabilities["cuda_devices"] = len(lines)
|
||
|
|
capabilities["cuda_version"] = parts[1]
|
||
|
|
capabilities["cuda_available"] = True
|
||
|
|
|
||
|
|
# Recommend using most GPU layers for CUDA
|
||
|
|
capabilities["recommended_layers"] = 35 # Good default for most models
|
||
|
|
self.logger.info(f"CUDA detected: {capabilities['cuda_devices']} device(s), driver {capabilities['cuda_version']}")
|
||
|
|
except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError):
|
||
|
|
pass
|
||
|
|
|
||
|
|
# Check for Intel GPU on Windows (Arc/Iris Xe)
|
||
|
|
if capabilities["platform"] == "Windows":
|
||
|
|
try:
|
||
|
|
result = subprocess.run(
|
||
|
|
["wmic", "path", "win32_VideoController", "get", "name"],
|
||
|
|
capture_output=True, text=True, timeout=10
|
||
|
|
)
|
||
|
|
if result.returncode == 0 and "intel" in result.stdout.lower():
|
||
|
|
capabilities["intel_gpu_available"] = True
|
||
|
|
if not capabilities["cuda_available"]:
|
||
|
|
capabilities["recommended_layers"] = 15 # Conservative for Intel GPU
|
||
|
|
self.logger.info("Intel GPU detected")
|
||
|
|
except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError):
|
||
|
|
pass
|
||
|
|
|
||
|
|
# Check for ROCm (AMD) on Linux
|
||
|
|
elif capabilities["platform"] == "Linux":
|
||
|
|
try:
|
||
|
|
result = subprocess.run(
|
||
|
|
["rocm-smi", "--showproductname"],
|
||
|
|
capture_output=True, text=True, timeout=10
|
||
|
|
)
|
||
|
|
if result.returncode == 0:
|
||
|
|
capabilities["rocm_available"] = True
|
||
|
|
capabilities["recommended_layers"] = 25 # Good default for ROCm
|
||
|
|
self.logger.info("ROCm (AMD GPU) detected")
|
||
|
|
except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError):
|
||
|
|
pass
|
||
|
|
|
||
|
|
# Check for Metal (Apple Silicon) on macOS
|
||
|
|
elif capabilities["platform"] == "Darwin":
|
||
|
|
# Check if running on Apple Silicon
|
||
|
|
if "arm" in capabilities["architecture"].lower() or "apple" in platform.processor().lower():
|
||
|
|
capabilities["metal_available"] = True
|
||
|
|
capabilities["recommended_layers"] = 30 # Good default for Apple Silicon
|
||
|
|
self.logger.info("Apple Silicon (Metal) detected")
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
self.logger.warning(f"Error detecting system capabilities: {e}")
|
||
|
|
|
||
|
|
return capabilities
|
||
|
|
|
||
|
|
def get_available_models(self) -> List[ModelInfo]:
|
||
|
|
"""Get list of available models."""
|
||
|
|
if self.library:
|
||
|
|
return list(self.library.models.values())
|
||
|
|
return []
|
||
|
|
|
||
|
|
def load_model(self, model_path: str, **kwargs) -> bool:
|
||
|
|
"""Load a model for inference with optimized GPU acceleration."""
|
||
|
|
try:
|
||
|
|
# Check if model is already loaded
|
||
|
|
if model_path in self.model_cache:
|
||
|
|
self.current_model = model_path
|
||
|
|
self.current_llm = self.model_cache[model_path]
|
||
|
|
return True
|
||
|
|
|
||
|
|
# Get parameters with smart defaults based on system capabilities
|
||
|
|
n_ctx = kwargs.get('n_ctx', 4096)
|
||
|
|
n_threads = kwargs.get('n_threads', min(os.cpu_count() or 4, 8)) # Cap threads for stability
|
||
|
|
|
||
|
|
# Smart GPU layer detection
|
||
|
|
n_gpu_layers = kwargs.get('n_gpu_layers')
|
||
|
|
if n_gpu_layers is None:
|
||
|
|
# Auto-detect optimal GPU layers
|
||
|
|
if self.system_info["cuda_available"]:
|
||
|
|
n_gpu_layers = self.system_info["recommended_layers"]
|
||
|
|
elif self.system_info["rocm_available"]:
|
||
|
|
n_gpu_layers = self.system_info["recommended_layers"]
|
||
|
|
elif self.system_info["metal_available"]:
|
||
|
|
n_gpu_layers = self.system_info["recommended_layers"]
|
||
|
|
elif self.system_info["intel_gpu_available"]:
|
||
|
|
n_gpu_layers = self.system_info["recommended_layers"]
|
||
|
|
else:
|
||
|
|
n_gpu_layers = 0 # CPU only
|
||
|
|
|
||
|
|
# Additional optimizations based on platform
|
||
|
|
llm_kwargs = {
|
||
|
|
"model_path": model_path,
|
||
|
|
"n_ctx": n_ctx,
|
||
|
|
"n_gpu_layers": n_gpu_layers,
|
||
|
|
"n_threads": n_threads,
|
||
|
|
"verbose": False
|
||
|
|
}
|
||
|
|
|
||
|
|
# Platform-specific optimizations
|
||
|
|
if self.system_info["platform"] == "Windows":
|
||
|
|
# Windows optimizations
|
||
|
|
if self.system_info["cuda_available"]:
|
||
|
|
llm_kwargs["n_batch"] = 512 # Good batch size for CUDA on Windows
|
||
|
|
elif self.system_info["intel_gpu_available"]:
|
||
|
|
llm_kwargs["n_batch"] = 256 # Conservative for Intel GPU
|
||
|
|
|
||
|
|
elif self.system_info["platform"] == "Linux":
|
||
|
|
# Linux optimizations
|
||
|
|
if self.system_info["cuda_available"]:
|
||
|
|
llm_kwargs["n_batch"] = 512
|
||
|
|
llm_kwargs["use_mmap"] = True # Better memory management on Linux
|
||
|
|
elif self.system_info["rocm_available"]:
|
||
|
|
llm_kwargs["n_batch"] = 256 # Conservative for ROCm
|
||
|
|
|
||
|
|
elif self.system_info["platform"] == "Darwin":
|
||
|
|
# macOS optimizations
|
||
|
|
if self.system_info["metal_available"]:
|
||
|
|
llm_kwargs["n_batch"] = 512
|
||
|
|
llm_kwargs["use_mmap"] = True
|
||
|
|
|
||
|
|
self.logger.info(f"Loading model with: {n_gpu_layers} GPU layers, {n_threads} threads")
|
||
|
|
|
||
|
|
llm = Llama(**llm_kwargs)
|
||
|
|
|
||
|
|
# Cache the model (limit cache size)
|
||
|
|
if len(self.model_cache) >= 3: # Max 3 models in cache
|
||
|
|
# Remove oldest model
|
||
|
|
oldest_key = next(iter(self.model_cache))
|
||
|
|
del self.model_cache[oldest_key]
|
||
|
|
|
||
|
|
self.model_cache[model_path] = llm
|
||
|
|
self.current_model = model_path
|
||
|
|
self.current_llm = llm
|
||
|
|
|
||
|
|
self.logger.info(f"Successfully loaded model: {model_path}")
|
||
|
|
if n_gpu_layers > 0:
|
||
|
|
acceleration = "CUDA" if self.system_info["cuda_available"] else \
|
||
|
|
"ROCm" if self.system_info["rocm_available"] else \
|
||
|
|
"Metal" if self.system_info["metal_available"] else \
|
||
|
|
"Intel GPU" if self.system_info["intel_gpu_available"] else "GPU"
|
||
|
|
self.logger.info(f"Using {acceleration} acceleration with {n_gpu_layers} layers")
|
||
|
|
|
||
|
|
return True
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
self.logger.error(f"Failed to load model {model_path}: {e}")
|
||
|
|
return False
|
||
|
|
|
||
|
|
def generate_text(self, prompt: str, **kwargs) -> Dict[str, Any]:
|
||
|
|
"""Generate text using the current model."""
|
||
|
|
if not self.current_llm:
|
||
|
|
return {"error": "No model loaded"}
|
||
|
|
|
||
|
|
try:
|
||
|
|
max_tokens = kwargs.get('max_tokens', 256)
|
||
|
|
temperature = kwargs.get('temperature', 0.7)
|
||
|
|
stream = kwargs.get('stream', False)
|
||
|
|
|
||
|
|
if stream:
|
||
|
|
# For MCP, we'll collect the stream and return the full result
|
||
|
|
result = ""
|
||
|
|
for chunk in self.current_llm.create_completion(
|
||
|
|
prompt,
|
||
|
|
max_tokens=max_tokens,
|
||
|
|
temperature=temperature,
|
||
|
|
stream=True
|
||
|
|
):
|
||
|
|
if 'choices' in chunk and chunk['choices']:
|
||
|
|
text = chunk['choices'][0].get('text', '')
|
||
|
|
result += text
|
||
|
|
|
||
|
|
return {
|
||
|
|
"text": result,
|
||
|
|
"model": self.current_model,
|
||
|
|
"tokens": len(result.split())
|
||
|
|
}
|
||
|
|
else:
|
||
|
|
response = self.current_llm.create_completion(
|
||
|
|
prompt,
|
||
|
|
max_tokens=max_tokens,
|
||
|
|
temperature=temperature
|
||
|
|
)
|
||
|
|
|
||
|
|
return {
|
||
|
|
"text": response['choices'][0]['text'],
|
||
|
|
"model": self.current_model,
|
||
|
|
"tokens": response['usage']['total_tokens']
|
||
|
|
}
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
self.logger.error(f"Generation failed: {e}")
|
||
|
|
return {"error": str(e)}
|
||
|
|
|
||
|
|
def chat_completion(self, messages: List[Dict[str, str]], **kwargs) -> Dict[str, Any]:
|
||
|
|
"""Generate chat completion using the current model."""
|
||
|
|
if not self.current_llm:
|
||
|
|
return {"error": "No model loaded"}
|
||
|
|
|
||
|
|
try:
|
||
|
|
max_tokens = kwargs.get('max_tokens', 256)
|
||
|
|
temperature = kwargs.get('temperature', 0.7)
|
||
|
|
|
||
|
|
response = self.current_llm.create_chat_completion(
|
||
|
|
messages=messages,
|
||
|
|
max_tokens=max_tokens,
|
||
|
|
temperature=temperature
|
||
|
|
)
|
||
|
|
|
||
|
|
return {
|
||
|
|
"message": response['choices'][0]['message'],
|
||
|
|
"model": self.current_model,
|
||
|
|
"tokens": response['usage']['total_tokens']
|
||
|
|
}
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
self.logger.error(f"Chat completion failed: {e}")
|
||
|
|
return {"error": str(e)}
|
||
|
|
|
||
|
|
|
||
|
|
# Global server instance
|
||
|
|
mcp_server = MultiModelMCPServer()
|
||
|
|
|
||
|
|
# Create MCP server
|
||
|
|
server = Server("llm-train-models")
|
||
|
|
|
||
|
|
|
||
|
|
@server.list_tools()
|
||
|
|
async def handle_list_tools() -> List[Tool]:
|
||
|
|
"""List available tools."""
|
||
|
|
return [
|
||
|
|
Tool(
|
||
|
|
name="list_models",
|
||
|
|
description="List all available models in the library",
|
||
|
|
inputSchema={
|
||
|
|
"type": "object",
|
||
|
|
"properties": {},
|
||
|
|
"required": []
|
||
|
|
}
|
||
|
|
),
|
||
|
|
Tool(
|
||
|
|
name="load_model",
|
||
|
|
description="Load a specific model for inference",
|
||
|
|
inputSchema={
|
||
|
|
"type": "object",
|
||
|
|
"properties": {
|
||
|
|
"model_path": {
|
||
|
|
"type": "string",
|
||
|
|
"description": "Path to the model file"
|
||
|
|
},
|
||
|
|
"n_ctx": {
|
||
|
|
"type": "integer",
|
||
|
|
"description": "Context window size",
|
||
|
|
"default": 4096
|
||
|
|
},
|
||
|
|
"n_gpu_layers": {
|
||
|
|
"type": "integer",
|
||
|
|
"description": "Number of GPU layers",
|
||
|
|
"default": 0
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"required": ["model_path"]
|
||
|
|
}
|
||
|
|
),
|
||
|
|
Tool(
|
||
|
|
name="generate_text",
|
||
|
|
description="Generate text using the current model",
|
||
|
|
inputSchema={
|
||
|
|
"type": "object",
|
||
|
|
"properties": {
|
||
|
|
"prompt": {
|
||
|
|
"type": "string",
|
||
|
|
"description": "The text prompt for generation"
|
||
|
|
},
|
||
|
|
"max_tokens": {
|
||
|
|
"type": "integer",
|
||
|
|
"description": "Maximum tokens to generate",
|
||
|
|
"default": 256
|
||
|
|
},
|
||
|
|
"temperature": {
|
||
|
|
"type": "number",
|
||
|
|
"description": "Sampling temperature",
|
||
|
|
"default": 0.7
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"required": ["prompt"]
|
||
|
|
}
|
||
|
|
),
|
||
|
|
Tool(
|
||
|
|
name="chat_completion",
|
||
|
|
description="Generate chat completion using the current model",
|
||
|
|
inputSchema={
|
||
|
|
"type": "object",
|
||
|
|
"properties": {
|
||
|
|
"messages": {
|
||
|
|
"type": "array",
|
||
|
|
"description": "Chat messages",
|
||
|
|
"items": {
|
||
|
|
"type": "object",
|
||
|
|
"properties": {
|
||
|
|
"role": {"type": "string"},
|
||
|
|
"content": {"type": "string"}
|
||
|
|
},
|
||
|
|
"required": ["role", "content"]
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"max_tokens": {
|
||
|
|
"type": "integer",
|
||
|
|
"description": "Maximum tokens to generate",
|
||
|
|
"default": 256
|
||
|
|
},
|
||
|
|
"temperature": {
|
||
|
|
"type": "number",
|
||
|
|
"description": "Sampling temperature",
|
||
|
|
"default": 0.7
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"required": ["messages"]
|
||
|
|
}
|
||
|
|
),
|
||
|
|
Tool(
|
||
|
|
name="get_current_model",
|
||
|
|
description="Get information about the currently loaded model",
|
||
|
|
inputSchema={
|
||
|
|
"type": "object",
|
||
|
|
"properties": {},
|
||
|
|
"required": []
|
||
|
|
}
|
||
|
|
),
|
||
|
|
Tool(
|
||
|
|
name="get_system_info",
|
||
|
|
description="Get system capabilities and GPU acceleration status",
|
||
|
|
inputSchema={
|
||
|
|
"type": "object",
|
||
|
|
"properties": {},
|
||
|
|
"required": []
|
||
|
|
}
|
||
|
|
)
|
||
|
|
]
|
||
|
|
|
||
|
|
|
||
|
|
@server.call_tool()
|
||
|
|
async def handle_call_tool(name: str, arguments: Dict[str, Any]) -> List[TextContent]:
|
||
|
|
"""Handle tool calls."""
|
||
|
|
|
||
|
|
if name == "list_models":
|
||
|
|
models = mcp_server.get_available_models()
|
||
|
|
model_list = []
|
||
|
|
|
||
|
|
for model in models:
|
||
|
|
model_info = {
|
||
|
|
"name": model.name,
|
||
|
|
"path": model.path,
|
||
|
|
"type": model.file_type,
|
||
|
|
"size_mb": round(model.size_mb, 1),
|
||
|
|
"modified": model.modified_date,
|
||
|
|
"tags": model.tags,
|
||
|
|
"metadata": model.metadata
|
||
|
|
}
|
||
|
|
model_list.append(model_info)
|
||
|
|
|
||
|
|
return [TextContent(
|
||
|
|
type="text",
|
||
|
|
text=json.dumps(model_list, indent=2)
|
||
|
|
)]
|
||
|
|
|
||
|
|
elif name == "load_model":
|
||
|
|
model_path = arguments.get("model_path")
|
||
|
|
n_ctx = arguments.get("n_ctx", 4096)
|
||
|
|
n_gpu_layers = arguments.get("n_gpu_layers", 0)
|
||
|
|
|
||
|
|
if not model_path:
|
||
|
|
return [TextContent(type="text", text="Error: model_path is required")]
|
||
|
|
|
||
|
|
success = mcp_server.load_model(
|
||
|
|
model_path,
|
||
|
|
n_ctx=n_ctx,
|
||
|
|
n_gpu_layers=n_gpu_layers
|
||
|
|
)
|
||
|
|
|
||
|
|
if success:
|
||
|
|
return [TextContent(
|
||
|
|
type="text",
|
||
|
|
text=f"Successfully loaded model: {model_path}"
|
||
|
|
)]
|
||
|
|
else:
|
||
|
|
return [TextContent(
|
||
|
|
type="text",
|
||
|
|
text=f"Failed to load model: {model_path}"
|
||
|
|
)]
|
||
|
|
|
||
|
|
elif name == "generate_text":
|
||
|
|
prompt = arguments.get("prompt")
|
||
|
|
max_tokens = arguments.get("max_tokens", 256)
|
||
|
|
temperature = arguments.get("temperature", 0.7)
|
||
|
|
|
||
|
|
if not prompt:
|
||
|
|
return [TextContent(type="text", text="Error: prompt is required")]
|
||
|
|
|
||
|
|
result = mcp_server.generate_text(
|
||
|
|
prompt,
|
||
|
|
max_tokens=max_tokens,
|
||
|
|
temperature=temperature
|
||
|
|
)
|
||
|
|
|
||
|
|
return [TextContent(
|
||
|
|
type="text",
|
||
|
|
text=json.dumps(result, indent=2)
|
||
|
|
)]
|
||
|
|
|
||
|
|
elif name == "chat_completion":
|
||
|
|
messages = arguments.get("messages", [])
|
||
|
|
max_tokens = arguments.get("max_tokens", 256)
|
||
|
|
temperature = arguments.get("temperature", 0.7)
|
||
|
|
|
||
|
|
if not messages:
|
||
|
|
return [TextContent(type="text", text="Error: messages are required")]
|
||
|
|
|
||
|
|
result = mcp_server.chat_completion(
|
||
|
|
messages,
|
||
|
|
max_tokens=max_tokens,
|
||
|
|
temperature=temperature
|
||
|
|
)
|
||
|
|
|
||
|
|
return [TextContent(
|
||
|
|
type="text",
|
||
|
|
text=json.dumps(result, indent=2)
|
||
|
|
)]
|
||
|
|
|
||
|
|
elif name == "get_current_model":
|
||
|
|
if mcp_server.current_model:
|
||
|
|
# Find model info
|
||
|
|
models = mcp_server.get_available_models()
|
||
|
|
current_info = None
|
||
|
|
|
||
|
|
for model in models:
|
||
|
|
if model.path == mcp_server.current_model:
|
||
|
|
current_info = {
|
||
|
|
"name": model.name,
|
||
|
|
"path": model.path,
|
||
|
|
"type": model.file_type,
|
||
|
|
"size_mb": round(model.size_mb, 1),
|
||
|
|
"metadata": model.metadata
|
||
|
|
}
|
||
|
|
break
|
||
|
|
|
||
|
|
if current_info:
|
||
|
|
return [TextContent(
|
||
|
|
type="text",
|
||
|
|
text=json.dumps(current_info, indent=2)
|
||
|
|
)]
|
||
|
|
|
||
|
|
return [TextContent(type="text", text="No model currently loaded")]
|
||
|
|
|
||
|
|
elif name == "get_system_info":
|
||
|
|
system_info = {
|
||
|
|
"platform": mcp_server.system_info["platform"],
|
||
|
|
"architecture": mcp_server.system_info["architecture"],
|
||
|
|
"acceleration": {
|
||
|
|
"cuda_available": mcp_server.system_info["cuda_available"],
|
||
|
|
"cuda_version": mcp_server.system_info["cuda_version"],
|
||
|
|
"cuda_devices": mcp_server.system_info["cuda_devices"],
|
||
|
|
"rocm_available": mcp_server.system_info["rocm_available"],
|
||
|
|
"metal_available": mcp_server.system_info["metal_available"],
|
||
|
|
"intel_gpu_available": mcp_server.system_info["intel_gpu_available"],
|
||
|
|
"recommended_layers": mcp_server.system_info["recommended_layers"]
|
||
|
|
},
|
||
|
|
"current_model_acceleration": "Unknown"
|
||
|
|
}
|
||
|
|
|
||
|
|
# Add current model acceleration info
|
||
|
|
if mcp_server.current_llm:
|
||
|
|
if mcp_server.system_info["cuda_available"]:
|
||
|
|
system_info["current_model_acceleration"] = "CUDA (NVIDIA)"
|
||
|
|
elif mcp_server.system_info["rocm_available"]:
|
||
|
|
system_info["current_model_acceleration"] = "ROCm (AMD)"
|
||
|
|
elif mcp_server.system_info["metal_available"]:
|
||
|
|
system_info["current_model_acceleration"] = "Metal (Apple)"
|
||
|
|
elif mcp_server.system_info["intel_gpu_available"]:
|
||
|
|
system_info["current_model_acceleration"] = "Intel GPU"
|
||
|
|
else:
|
||
|
|
system_info["current_model_acceleration"] = "CPU Only"
|
||
|
|
|
||
|
|
return [TextContent(
|
||
|
|
type="text",
|
||
|
|
text=json.dumps(system_info, indent=2)
|
||
|
|
)]
|
||
|
|
|
||
|
|
else:
|
||
|
|
return [TextContent(type="text", text=f"Unknown tool: {name}")]
|
||
|
|
|
||
|
|
|
||
|
|
@server.list_prompts()
|
||
|
|
async def handle_list_prompts() -> List[Prompt]:
|
||
|
|
"""List available prompts."""
|
||
|
|
return [
|
||
|
|
Prompt(
|
||
|
|
name="model_comparison",
|
||
|
|
description="Compare multiple models on the same prompt",
|
||
|
|
arguments=[
|
||
|
|
{
|
||
|
|
"name": "prompt",
|
||
|
|
"description": "The prompt to test across models",
|
||
|
|
"required": True
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"name": "models",
|
||
|
|
"description": "List of model paths to compare",
|
||
|
|
"required": True
|
||
|
|
}
|
||
|
|
]
|
||
|
|
),
|
||
|
|
Prompt(
|
||
|
|
name="model_benchmark",
|
||
|
|
description="Benchmark a model with standard prompts",
|
||
|
|
arguments=[
|
||
|
|
{
|
||
|
|
"name": "model_path",
|
||
|
|
"description": "Path to the model to benchmark",
|
||
|
|
"required": True
|
||
|
|
}
|
||
|
|
]
|
||
|
|
)
|
||
|
|
]
|
||
|
|
|
||
|
|
|
||
|
|
@server.get_prompt()
|
||
|
|
async def handle_get_prompt(name: str, arguments: Dict[str, str]) -> Prompt:
|
||
|
|
"""Handle prompt requests."""
|
||
|
|
|
||
|
|
if name == "model_comparison":
|
||
|
|
prompt_text = arguments.get("prompt", "")
|
||
|
|
models = arguments.get("models", "").split(",")
|
||
|
|
|
||
|
|
messages = [
|
||
|
|
PromptMessage(
|
||
|
|
role="user",
|
||
|
|
content=TextContent(
|
||
|
|
type="text",
|
||
|
|
text=f"Compare the following models on this prompt: '{prompt_text}'\n\n"
|
||
|
|
f"Models to test: {', '.join(models)}\n\n"
|
||
|
|
f"For each model, load it and generate a response, then provide a comparison."
|
||
|
|
)
|
||
|
|
)
|
||
|
|
]
|
||
|
|
|
||
|
|
return Prompt(
|
||
|
|
name=name,
|
||
|
|
description="Compare multiple models",
|
||
|
|
messages=messages
|
||
|
|
)
|
||
|
|
|
||
|
|
elif name == "model_benchmark":
|
||
|
|
model_path = arguments.get("model_path", "")
|
||
|
|
|
||
|
|
benchmark_prompts = [
|
||
|
|
"Explain quantum computing in simple terms.",
|
||
|
|
"Write a Python function to calculate fibonacci numbers.",
|
||
|
|
"What are the main causes of climate change?",
|
||
|
|
"Describe the process of photosynthesis.",
|
||
|
|
"Write a short story about a robot learning to paint."
|
||
|
|
]
|
||
|
|
|
||
|
|
messages = [
|
||
|
|
PromptMessage(
|
||
|
|
role="user",
|
||
|
|
content=TextContent(
|
||
|
|
type="text",
|
||
|
|
text=f"Benchmark the model at: {model_path}\n\n"
|
||
|
|
f"Test it with these prompts:\n" +
|
||
|
|
"\n".join(f"{i+1}. {p}" for i, p in enumerate(benchmark_prompts)) +
|
||
|
|
"\n\nProvide the model's response to each prompt and evaluate quality."
|
||
|
|
)
|
||
|
|
)
|
||
|
|
]
|
||
|
|
|
||
|
|
return Prompt(
|
||
|
|
name=name,
|
||
|
|
description="Benchmark model performance",
|
||
|
|
messages=messages
|
||
|
|
)
|
||
|
|
|
||
|
|
else:
|
||
|
|
raise ValueError(f"Unknown prompt: {name}")
|
||
|
|
|
||
|
|
|
||
|
|
async def main():
|
||
|
|
"""Main entry point."""
|
||
|
|
parser = argparse.ArgumentParser(description="Multi-Model MCP Server")
|
||
|
|
parser.add_argument(
|
||
|
|
"--settings",
|
||
|
|
default="settings.json",
|
||
|
|
help="Path to settings file"
|
||
|
|
)
|
||
|
|
|
||
|
|
args = parser.parse_args()
|
||
|
|
|
||
|
|
# Initialize server with settings
|
||
|
|
global mcp_server
|
||
|
|
mcp_server = MultiModelMCPServer(args.settings)
|
||
|
|
|
||
|
|
# Run the server
|
||
|
|
async with stdio_server() as (read_stream, write_stream):
|
||
|
|
await server.run(
|
||
|
|
read_stream,
|
||
|
|
write_stream,
|
||
|
|
InitializationOptions(
|
||
|
|
server_name="llm-train-models",
|
||
|
|
server_version="1.0.0",
|
||
|
|
capabilities=server.get_capabilities(
|
||
|
|
notification_options=None,
|
||
|
|
experimental_capabilities=None
|
||
|
|
)
|
||
|
|
)
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
|
||
|
|
asyncio.run(main())
|