# Autarch/modules/llm_trainer.py
# 1448 lines, 58 KiB, Python
# (Raw / Normal View / History — web-viewer header preserved as a comment)

"""
AUTARCH LLM Trainer Module
Fine-tune language models on the AUTARCH codebase and convert to GGUF.
Generates training datasets from source code, trains LoRA adapters,
merges weights, and quantizes to GGUF format for local inference.
"""
import os
import sys
import subprocess
import json
import re
import ast
import time
import platform
import shutil
from pathlib import Path
from datetime import datetime
sys.path.insert(0, str(Path(__file__).parent.parent))
# Module metadata
DESCRIPTION = "LLM fine-tuning & GGUF training pipeline"
AUTHOR = "darkHal"
VERSION = "1.0"
CATEGORY = "analyze"
_is_win = platform.system() == 'Windows'
_PROJECT_ROOT = Path(__file__).parent.parent
_DATA_DIR = _PROJECT_ROOT / 'data'
_MODELS_DIR = _PROJECT_ROOT / 'models'
_TRAINING_DIR = _DATA_DIR / 'training'
class LLMTrainer:
"""Fine-tuning pipeline: dataset generation, LoRA training, GGUF conversion."""
def __init__(self):
self._training_dir = _TRAINING_DIR
self._training_dir.mkdir(parents=True, exist_ok=True)
self._models_dir = _MODELS_DIR
self._project_root = _PROJECT_ROOT
self._status = {
'phase': 'idle',
'progress': 0,
'message': '',
'log': [],
}
self._training_process = None
def _log(self, msg, level='info'):
entry = {'time': datetime.now().strftime('%H:%M:%S'), 'msg': msg, 'level': level}
self._status['log'].append(entry)
# Keep last 200 entries
if len(self._status['log']) > 200:
self._status['log'] = self._status['log'][-200:]
def get_status(self):
return dict(self._status)
# ==================== DEPENDENCY CHECK ====================
    def check_dependencies(self):
        """Check what training dependencies are installed.

        Probes each Python package in a fresh subprocess (so a broken or
        heavyweight import cannot poison this process), then looks for a
        llama.cpp checkout and GPU backends.

        Returns:
            Dict keyed by dependency name. Package entries hold
            {'installed': bool, 'version': str|None}; 'llama_cpp' holds a
            path, 'cuda'/'xpu' hold availability flags.
        """
        deps = {}
        # Each probe prints the package version; non-zero exit means missing.
        checks = {
            'torch': 'import torch; print(torch.__version__)',
            'transformers': 'import transformers; print(transformers.__version__)',
            'peft': 'import peft; print(peft.__version__)',
            'datasets': 'import datasets; print(datasets.__version__)',
            'unsloth': 'import unsloth; print(unsloth.__version__)',
            'bitsandbytes': 'import bitsandbytes; print(bitsandbytes.__version__)',
            'trl': 'import trl; print(trl.__version__)',
            'accelerate': 'import accelerate; print(accelerate.__version__)',
        }
        for name, cmd in checks.items():
            try:
                result = subprocess.run(
                    [sys.executable, '-c', cmd],
                    capture_output=True, text=True, timeout=15
                )
                if result.returncode == 0:
                    deps[name] = {'installed': True, 'version': result.stdout.strip()}
                else:
                    deps[name] = {'installed': False, 'version': None}
            except Exception:
                # Timeout or spawn failure — treat as not installed.
                deps[name] = {'installed': False, 'version': None}
        # Check for llama.cpp convert script (first existing path wins).
        llama_cpp_paths = [
            _PROJECT_ROOT / 'tools' / 'llama.cpp',
            Path.home() / 'llama.cpp',
            Path('/usr/local/bin/llama-quantize'),
        ]
        deps['llama_cpp'] = {'installed': False, 'path': None}
        for p in llama_cpp_paths:
            if p.exists():
                deps['llama_cpp'] = {'installed': True, 'path': str(p)}
                break
        # Check GPU (CUDA) availability via torch in a subprocess.
        try:
            result = subprocess.run(
                [sys.executable, '-c',
                 'import torch; print(torch.cuda.is_available()); print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "none")'],
                capture_output=True, text=True, timeout=15
            )
            if result.returncode == 0:
                # First line: "True"/"False"; second line: device name or "none".
                lines = result.stdout.strip().split('\n')
                deps['cuda'] = {
                    'available': lines[0].strip() == 'True',
                    'device': lines[1].strip() if len(lines) > 1 else 'none',
                }
            else:
                deps['cuda'] = {'available': False, 'device': 'none'}
        except Exception:
            deps['cuda'] = {'available': False, 'device': 'none'}
        # Check Intel XPU (requires intel_extension_for_pytorch to import).
        try:
            result = subprocess.run(
                [sys.executable, '-c',
                 'import torch; import intel_extension_for_pytorch; print(torch.xpu.is_available())'],
                capture_output=True, text=True, timeout=15
            )
            deps['xpu'] = {'available': result.returncode == 0 and 'True' in result.stdout}
        except Exception:
            deps['xpu'] = {'available': False}
        return deps
def install_dependencies(self):
"""Install training dependencies via pip."""
self._status['phase'] = 'installing'
self._status['progress'] = 0
self._log('Installing training dependencies...')
packages = [
'torch', 'transformers', 'peft', 'datasets',
'trl', 'accelerate', 'bitsandbytes',
]
results = []
for i, pkg in enumerate(packages):
self._status['progress'] = int((i / len(packages)) * 100)
self._status['message'] = f'Installing {pkg}...'
self._log(f'pip install {pkg}')
try:
result = subprocess.run(
[sys.executable, '-m', 'pip', 'install', pkg, '--quiet'],
capture_output=True, text=True, timeout=300
)
results.append({
'package': pkg,
'success': result.returncode == 0,
'output': result.stdout.strip() or result.stderr.strip(),
})
except Exception as e:
results.append({'package': pkg, 'success': False, 'output': str(e)})
self._status['phase'] = 'idle'
self._status['progress'] = 100
self._status['message'] = 'Dependencies installed'
return results
# ==================== CODEBASE SCANNING ====================
    def scan_codebase(self):
        """Scan the AUTARCH codebase and return file inventory.

        Walks modules/, core/, web/routes/, web/templates/, project-root
        config files, data/*.txt, autarch.py and web/static/js, recording
        name, relative path, byte size, and newline count for each file.
        Unreadable files are silently skipped.

        Returns:
            Dict with 'inventory' (categorized file lists), 'total_files',
            and 'total_lines'.
        """
        inventory = {
            'modules': [],
            'core': [],
            'routes': [],
            'templates': [],
            'configs': [],
            'other': [],
        }
        scan_dirs = {
            'modules': self._project_root / 'modules',
            'core': self._project_root / 'core',
            'routes': self._project_root / 'web' / 'routes',
            'templates': self._project_root / 'web' / 'templates',
        }
        for category, scan_dir in scan_dirs.items():
            if not scan_dir.exists():
                continue
            # Templates are HTML; every other category is Python.
            for f in sorted(scan_dir.glob('*.py' if category != 'templates' else '*.html')):
                try:
                    size = f.stat().st_size
                    lines = f.read_text(encoding='utf-8', errors='replace').count('\n')
                    inventory[category].append({
                        'name': f.name,
                        'path': str(f.relative_to(self._project_root)),
                        'size': size,
                        'lines': lines,
                    })
                except Exception:
                    pass
        # Config files at the project root (hidden files excluded)
        for pattern in ['*.conf', '*.json', '*.txt']:
            for f in self._project_root.glob(pattern):
                if f.name.startswith('.'):
                    continue
                try:
                    inventory['configs'].append({
                        'name': f.name,
                        'path': str(f.relative_to(self._project_root)),
                        'size': f.stat().st_size,
                        'lines': f.read_text(encoding='utf-8', errors='replace').count('\n'),
                    })
                except Exception:
                    pass
        # Text files under data/ also count as configs.
        for f in (_DATA_DIR).glob('*.txt'):
            try:
                inventory['configs'].append({
                    'name': f'data/{f.name}',
                    'path': str(f.relative_to(self._project_root)),
                    'size': f.stat().st_size,
                    'lines': f.read_text(encoding='utf-8', errors='replace').count('\n'),
                })
            except Exception:
                pass
        # Entry point
        entry = self._project_root / 'autarch.py'
        if entry.exists():
            inventory['other'].append({
                'name': 'autarch.py',
                'path': 'autarch.py',
                'size': entry.stat().st_size,
                'lines': entry.read_text(encoding='utf-8', errors='replace').count('\n'),
            })
        # JS assets
        js_dir = self._project_root / 'web' / 'static' / 'js'
        if js_dir.exists():
            for f in js_dir.glob('*.js'):
                try:
                    inventory['other'].append({
                        'name': f'static/js/{f.name}',
                        'path': str(f.relative_to(self._project_root)),
                        'size': f.stat().st_size,
                        'lines': f.read_text(encoding='utf-8', errors='replace').count('\n'),
                    })
                except Exception:
                    pass
        total_files = sum(len(v) for v in inventory.values())
        total_lines = sum(item['lines'] for v in inventory.values() for item in v)
        return {
            'inventory': inventory,
            'total_files': total_files,
            'total_lines': total_lines,
        }
# ==================== PYTHON MODULE EXTRACTION ====================
def _extract_module_info(self, filepath):
"""Extract structured info from a Python module file."""
try:
source = Path(filepath).read_text(encoding='utf-8', errors='replace')
except Exception:
return None
info = {
'file': str(Path(filepath).relative_to(self._project_root)),
'source': source,
'docstring': '',
'classes': [],
'functions': [],
'metadata': {},
}
try:
tree = ast.parse(source)
except SyntaxError:
return info
# Module docstring
if (tree.body and isinstance(tree.body[0], ast.Expr)
and isinstance(tree.body[0].value, (ast.Constant, ast.Str))):
info['docstring'] = getattr(tree.body[0].value, 'value',
getattr(tree.body[0].value, 's', ''))
# Module-level assignments (DESCRIPTION, AUTHOR, etc.)
for node in ast.walk(tree):
if isinstance(node, ast.Assign):
for target in node.targets:
if isinstance(target, ast.Name) and isinstance(node.value, (ast.Constant, ast.Str)):
val = getattr(node.value, 'value', getattr(node.value, 's', ''))
if target.id in ('DESCRIPTION', 'AUTHOR', 'VERSION', 'CATEGORY', 'NAME'):
info['metadata'][target.id] = val
# Classes and methods
for node in ast.iter_child_nodes(tree):
if isinstance(node, ast.ClassDef):
cls_info = {
'name': node.name,
'docstring': ast.get_docstring(node) or '',
'methods': [],
}
for item in node.body:
if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
args = [a.arg for a in item.args.args if a.arg != 'self']
cls_info['methods'].append({
'name': item.name,
'args': args,
'docstring': ast.get_docstring(item) or '',
'lineno': item.lineno,
})
info['classes'].append(cls_info)
elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
args = [a.arg for a in node.args.args if a.arg != 'self']
info['functions'].append({
'name': node.name,
'args': args,
'docstring': ast.get_docstring(node) or '',
'lineno': node.lineno,
})
return info
# ==================== DATASET GENERATION ====================
    def generate_dataset(self, format='sharegpt', include_source=True,
                         include_qa=True, include_module_creation=True):
        """Generate training dataset from the AUTARCH codebase.

        Runs up to four phases (code understanding, architecture Q&A,
        module-creation examples, identity samples), updating self._status
        as it goes, then writes the samples to a timestamped .jsonl file.

        Args:
            format: 'sharegpt' (conversations) or 'instruction' (alpaca-style)
            include_source: Include code understanding pairs
            include_qa: Include Q&A about architecture
            include_module_creation: Include module creation examples

        Returns:
            Dict with dataset path, sample count, preview
        """
        self._status['phase'] = 'generating'
        self._status['progress'] = 0
        self._status['message'] = 'Scanning codebase...'
        self._log('Starting dataset generation...')
        samples = []
        scan = self.scan_codebase()
        # Flatten the categorized inventory into (category, file-info) pairs.
        all_files = []
        for category, files in scan['inventory'].items():
            for f in files:
                all_files.append((category, f))
        total = len(all_files)
        # ── Phase 1: Code understanding pairs ──
        if include_source:
            self._log(f'Generating code understanding pairs from {total} files...')
            for i, (category, finfo) in enumerate(all_files):
                # Phase 1 owns the 0-30% progress band.
                self._status['progress'] = int((i / total) * 30)
                filepath = self._project_root / finfo['path']
                if not filepath.exists():
                    continue
                if filepath.suffix == '.py':
                    mod_info = self._extract_module_info(filepath)
                    if not mod_info:
                        continue
                    # "What does this file do?" pair
                    desc = mod_info.get('docstring') or mod_info['metadata'].get('DESCRIPTION', '')
                    if desc:
                        samples.append(self._make_sample(
                            f"What does the file `{finfo['path']}` do in AUTARCH?",
                            f"`{finfo['path']}` — {desc}\n\n"
                            f"Category: {mod_info['metadata'].get('CATEGORY', 'core')}\n"
                            f"It contains {len(mod_info['classes'])} class(es) and "
                            f"{len(mod_info['functions'])} top-level function(s).",
                            format
                        ))
                    # Class/method documentation
                    for cls in mod_info['classes']:
                        if cls['methods']:
                            # Public methods only (leading underscore excluded).
                            method_list = ', '.join(m['name'] for m in cls['methods']
                                                    if not m['name'].startswith('_'))
                            samples.append(self._make_sample(
                                f"What methods does the `{cls['name']}` class in "
                                f"`{finfo['path']}` provide?",
                                f"The `{cls['name']}` class provides these methods: "
                                f"{method_list}\n\n"
                                + (f"Class description: {cls['docstring']}" if cls['docstring'] else ''),
                                format
                            ))
                        # Individual method docs
                        for method in cls['methods']:
                            if method['docstring'] and not method['name'].startswith('_'):
                                samples.append(self._make_sample(
                                    f"What does `{cls['name']}.{method['name']}()` do?",
                                    f"`{method['name']}({', '.join(method['args'])})` — "
                                    f"{method['docstring']}",
                                    format
                                ))
                elif filepath.suffix == '.html':
                    try:
                        content = filepath.read_text(encoding='utf-8', errors='replace')
                        # Extract template purpose from title block
                        title_match = re.search(r'{%\s*block\s+title\s*%}(.+?){%', content)
                        if title_match:
                            samples.append(self._make_sample(
                                f"What is the `{finfo['path']}` template for?",
                                f"The template `{finfo['path']}` renders the "
                                f"'{title_match.group(1).strip()}' page in the AUTARCH web dashboard.",
                                format
                            ))
                    except Exception:
                        pass
        # ── Phase 2: Architecture Q&A ──
        if include_qa:
            self._status['progress'] = 30
            self._status['message'] = 'Generating architecture Q&A...'
            self._log('Generating architecture Q&A pairs...')
            samples.extend(self._generate_architecture_qa(format, scan))
        # ── Phase 3: Module creation examples ──
        if include_module_creation:
            self._status['progress'] = 60
            self._status['message'] = 'Generating module creation examples...'
            self._log('Generating module creation training data...')
            samples.extend(self._generate_module_creation_samples(format))
        # ── Phase 4: System prompt and identity (always included) ──
        self._status['progress'] = 80
        self._status['message'] = 'Adding identity and system context...'
        samples.extend(self._generate_identity_samples(format))
        # ── Save dataset as JSON Lines (one sample per line) ──
        self._status['progress'] = 90
        self._status['message'] = 'Saving dataset...'
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        dataset_path = self._training_dir / f'autarch_dataset_{timestamp}.jsonl'
        with open(dataset_path, 'w', encoding='utf-8') as f:
            for sample in samples:
                f.write(json.dumps(sample, ensure_ascii=False) + '\n')
        self._status['phase'] = 'idle'
        self._status['progress'] = 100
        self._status['message'] = f'Dataset generated: {len(samples)} samples'
        self._log(f'Dataset saved to {dataset_path} ({len(samples)} samples)')
        return {
            'path': str(dataset_path),
            'filename': dataset_path.name,
            'sample_count': len(samples),
            'format': format,
            'preview': samples[:5],
            'size_bytes': dataset_path.stat().st_size,
        }
def _make_sample(self, instruction, response, format='sharegpt'):
"""Create a training sample in the specified format."""
if format == 'sharegpt':
return {
'conversations': [
{'from': 'human', 'value': instruction},
{'from': 'gpt', 'value': response},
]
}
else: # alpaca/instruction format
return {
'instruction': instruction,
'input': '',
'output': response,
}
    def _generate_architecture_qa(self, format, scan):
        """Generate Q&A pairs about AUTARCH architecture.

        Args:
            format: sample format, passed through to _make_sample.
            scan: result of scan_codebase(); only used for the final
                file/line statistics pair.

        Returns:
            List of sample dicts.
        """
        pairs = []
        # Project overview
        pairs.append(self._make_sample(
            "What is AUTARCH?",
            "AUTARCH (Autonomous Tactical Agent for Reconnaissance, Counterintelligence, "
            "and Hacking) is an autonomous security platform built by darkHal Security Group. "
            "It provides a web-based dashboard with modular tools for defense, offense, "
            "counter-intelligence, analysis, OSINT, and attack simulation. "
            "It features an AI agent (Hal) that can create new modules on demand.",
            format
        ))
        # Directory structure
        pairs.append(self._make_sample(
            "What is the directory structure of AUTARCH?",
            "AUTARCH has this structure:\n"
            "- `modules/` — Plugin modules (Python), each is a standalone tool\n"
            "- `core/` — Framework internals (llm.py, agent.py, tools.py, config.py, wireshark.py)\n"
            "- `web/` — Flask web dashboard (routes/, templates/, static/)\n"
            "- `data/` — Databases, configs, JSON files\n"
            "- `models/` — LLM model files (GGUF)\n"
            "- `autarch.py` — Main entry point\n"
            "- `autarch_settings.conf` — Configuration file",
            format
        ))
        # Module categories
        pairs.append(self._make_sample(
            "What module categories does AUTARCH support?",
            "AUTARCH supports 6 module categories:\n"
            "1. **defense** (Blue) — Security hardening, monitoring, firewalls\n"
            "2. **offense** (Red) — Penetration testing, exploitation\n"
            "3. **counter** (Purple) — Counter-intelligence, threat response\n"
            "4. **analyze** (Cyan) — Analysis, forensics, packet inspection\n"
            "5. **osint** (Green) — Open source intelligence gathering\n"
            "6. **simulate** (Yellow) — Attack simulation, red team exercises",
            format
        ))
        # Web architecture
        pairs.append(self._make_sample(
            "How does the AUTARCH web dashboard work?",
            "The web dashboard is built with Flask and uses Jinja2 templates with vanilla "
            "JavaScript. It runs on port 8181 with HTTPS. Routes are organized as Flask "
            "Blueprints in `web/routes/`. The frontend uses SSE (Server-Sent Events) for "
            "real-time streaming. The sidebar menu links to category pages (Defense, Offense, "
            "Analyze, etc.) which load their respective modules and tools.",
            format
        ))
        # LLM integration
        pairs.append(self._make_sample(
            "How does the LLM system work in AUTARCH?",
            "AUTARCH supports multiple LLM backends:\n"
            "1. **Local GGUF** — llama-cpp-python loads .gguf models from the models/ directory\n"
            "2. **HuggingFace Transformers** — loads full models with optional 4-bit quantization\n"
            "3. **Claude API** — Anthropic's API for cloud inference\n"
            "4. **HuggingFace API** — Inference API for cloud models\n\n"
            "The `core/llm.py` module wraps all backends with a unified interface. "
            "The AI agent (Hal) uses the local GGUF model with a tool-calling loop defined "
            "in `core/agent.py`. It can execute shell commands, read/write files, search code, "
            "and create new modules via the `create_module` tool in `core/tools.py`.",
            format
        ))
        # Config system
        pairs.append(self._make_sample(
            "How is AUTARCH configured?",
            "AUTARCH uses `autarch_settings.conf` (INI format) with sections for: "
            "[llama] (GGUF model settings), [autarch] (general), [msf] (Metasploit RPC), "
            "[osint] (OSINT settings), [transformers] (HuggingFace models), [claude] (API key), "
            "[web] (dashboard host/port/secret), [wireguard] (VPN), [upnp] (port forwarding), "
            "and more. The `core/config.py` module reads and writes this file.",
            format
        ))
        # Module stats — the only pair that depends on the live scan results.
        mod_count = len(scan['inventory'].get('modules', []))
        core_count = len(scan['inventory'].get('core', []))
        pairs.append(self._make_sample(
            "How many modules and core files does AUTARCH have?",
            f"AUTARCH has {mod_count} plugin modules in `modules/` and {core_count} core "
            f"framework files in `core/`. Total codebase is {scan['total_files']} files "
            f"with {scan['total_lines']} lines of code.",
            format
        ))
        return pairs
    def _generate_module_creation_samples(self, format):
        """Generate training data for teaching the LLM how to create modules.

        Produces three groups of samples: the canonical module skeleton,
        real examples lifted from modules/ (source truncated to 3000 chars),
        and a few synthetic creation scenarios.
        """
        pairs = []
        # Module creation pattern (canonical skeleton shown as a code block)
        pairs.append(self._make_sample(
            "How do I create a new AUTARCH module?",
            "Every AUTARCH module in `modules/` must have these elements:\n\n"
            "1. **Module-level metadata**: DESCRIPTION, AUTHOR, VERSION, CATEGORY\n"
            "2. **A `run()` function** — Entry point for CLI mode\n"
            "3. **Imports**: `from core.banner import Colors` for terminal colors\n\n"
            "```python\n"
            '"""\nModule description\n"""\n'
            "import os\nimport sys\nimport subprocess\nfrom pathlib import Path\n\n"
            "DESCRIPTION = \"What this module does\"\n"
            "AUTHOR = \"darkHal\"\nVERSION = \"1.0\"\n"
            "CATEGORY = \"defense\" # defense/offense/counter/analyze/osint/simulate\n\n"
            "sys.path.insert(0, str(Path(__file__).parent.parent))\n"
            "from core.banner import Colors\n\n\n"
            "class MyModule:\n"
            "    def print_status(self, message, status=\"info\"):\n"
            "        colors = {\"info\": Colors.CYAN, \"success\": Colors.GREEN, "
            "\"warning\": Colors.YELLOW, \"error\": Colors.RED}\n"
            "        symbols = {\"info\": \"*\", \"success\": \"+\", \"warning\": \"!\", \"error\": \"X\"}\n"
            "        print(f\"{colors.get(status, Colors.WHITE)}"
            "[{symbols.get(status, '*')}] {message}{Colors.RESET}\")\n\n"
            "    def run_cmd(self, cmd, timeout=30):\n"
            "        try:\n"
            "            r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)\n"
            "            return r.returncode == 0, r.stdout.strip()\n"
            "        except Exception as e:\n"
            "            return False, str(e)\n\n\n"
            "def run():\n"
            "    mod = MyModule()\n"
            "    # Interactive menu or direct execution\n"
            "```",
            format
        ))
        # Scan existing modules for real examples
        modules_dir = self._project_root / 'modules'
        if modules_dir.exists():
            for mod_file in sorted(modules_dir.glob('*.py')):
                if mod_file.name.startswith('__'):
                    continue
                info = self._extract_module_info(mod_file)
                if not info or not info['metadata'].get('DESCRIPTION'):
                    continue
                # "Create a module like X" example
                desc = info['metadata'].get('DESCRIPTION', '')
                cat = info['metadata'].get('CATEGORY', 'analyze')
                source = info['source']
                # Only use first 3000 chars to keep training samples reasonable
                if len(source) > 3000:
                    source = source[:3000] + '\n# ... (truncated for training)\n'
                pairs.append(self._make_sample(
                    f"Create an AUTARCH module for: {desc}",
                    f"Here's a {cat} module that {desc.lower()}:\n\n```python\n{source}\n```",
                    format
                ))
        # Specific module creation scenarios: (prompt, module name, category, description)
        scenarios = [
            ("Create a defense module that monitors port 5555 for incoming connections",
             "port_monitor", "defense",
             "Monitors a specific port for incoming TCP connections and alerts on new connections."),
            ("Create an OSINT module that looks up domain WHOIS information",
             "whois_lookup", "osint",
             "Performs WHOIS lookups on domains to gather registration information."),
            ("Create an analyze module that checks for open S3 buckets",
             "s3_checker", "analyze",
             "Checks if AWS S3 buckets are publicly accessible."),
        ]
        for prompt, name, cat, desc in scenarios:
            pairs.append(self._make_sample(
                prompt,
                f"I'll create the `{name}.py` module in the `{cat}` category.\n\n"
                f"```python\n"
                f'"""\n{desc}\n"""\n'
                f"import os\nimport sys\nimport subprocess\nimport socket\n"
                f"from pathlib import Path\n\n"
                f"DESCRIPTION = \"{desc}\"\n"
                f"AUTHOR = \"darkHal\"\nVERSION = \"1.0\"\n"
                f"CATEGORY = \"{cat}\"\n\n"
                f"sys.path.insert(0, str(Path(__file__).parent.parent))\n"
                f"from core.banner import Colors\n\n\n"
                f"class {name.title().replace('_', '')}:\n"
                f"    def print_status(self, msg, status='info'):\n"
                f"        colors = {{'info': Colors.CYAN, 'success': Colors.GREEN, "
                f"'warning': Colors.YELLOW, 'error': Colors.RED}}\n"
                f"        symbols = {{'info': '*', 'success': '+', 'warning': '!', 'error': 'X'}}\n"
                f"        print(f\"{{colors.get(status, Colors.WHITE)}}"
                f"[{{symbols.get(status, '*')}}] {{msg}}{{Colors.RESET}}\")\n\n"
                f"    def run_cmd(self, cmd, timeout=30):\n"
                f"        try:\n"
                f"            r = subprocess.run(cmd, shell=True, capture_output=True, "
                f"text=True, timeout=timeout)\n"
                f"            return r.returncode == 0, r.stdout.strip()\n"
                f"        except Exception as e:\n"
                f"            return False, str(e)\n\n\n"
                f"def run():\n"
                f"    mod = {name.title().replace('_', '')}()\n"
                f"    mod.print_status('Starting {desc.lower()}')\n"
                f"```\n\n"
                f"This module is saved as `modules/{name}.py` and will appear in the "
                f"{cat} category on the dashboard.",
                format
            ))
        return pairs
    def _generate_identity_samples(self, format):
        """Generate identity and behavior training data.

        These pairs teach the model its persona (Hal) and its tool inventory.
        """
        pairs = []
        pairs.append(self._make_sample(
            "Who are you?",
            "I'm Hal, the AI agent powering Project AUTARCH — an autonomous security "
            "platform built by darkHal Security Group. I can create new tools and modules, "
            "analyze code, run system commands, and help with security operations.",
            format
        ))
        pairs.append(self._make_sample(
            "What can you do?",
            "I can:\n"
            "- Create new AUTARCH modules on demand (defense, offense, analyze, etc.)\n"
            "- Read and analyze source code files\n"
            "- Execute shell commands\n"
            "- Search the codebase for patterns and files\n"
            "- Help with security analysis and tool development\n"
            "- Explain how AUTARCH works\n"
            "- Write Python code following AUTARCH's module patterns",
            format
        ))
        pairs.append(self._make_sample(
            "What tools do you have access to?",
            "I have these tools:\n"
            "- `shell` — Execute system commands\n"
            "- `read_file` — Read file contents\n"
            "- `write_file` — Write files\n"
            "- `list_dir` — List directory contents\n"
            "- `search_files` — Search for files by name\n"
            "- `search_content` — Search file contents (grep)\n"
            "- `create_module` — Create a new AUTARCH module (validates and saves to modules/)\n\n"
            "When asked to create a module, I use the `create_module` tool which validates "
            "the code has the required metadata (DESCRIPTION, AUTHOR, VERSION, CATEGORY) and "
            "a `run()` function, then saves it to the `modules/` directory.",
            format
        ))
        return pairs
# ==================== LIST DATASETS ====================
def list_datasets(self):
"""List generated training datasets."""
datasets = []
if self._training_dir.exists():
for f in sorted(self._training_dir.glob('*.jsonl'), reverse=True):
try:
line_count = sum(1 for _ in open(f, encoding='utf-8'))
datasets.append({
'filename': f.name,
'path': str(f),
'size_bytes': f.stat().st_size,
'sample_count': line_count,
'created': datetime.fromtimestamp(f.stat().st_mtime).isoformat(),
})
except Exception:
pass
return datasets
def preview_dataset(self, filename, limit=10):
"""Preview samples from a dataset file."""
filepath = self._training_dir / filename
if not filepath.exists():
return {'error': 'Dataset not found'}
samples = []
try:
with open(filepath, 'r', encoding='utf-8') as f:
for i, line in enumerate(f):
if i >= limit:
break
samples.append(json.loads(line))
except Exception as e:
return {'error': str(e)}
return {'filename': filename, 'samples': samples, 'total': i + 1 if samples else 0}
def delete_dataset(self, filename):
"""Delete a dataset file."""
filepath = self._training_dir / filename
if filepath.exists() and filepath.suffix == '.jsonl':
filepath.unlink()
return True
return False
# ==================== TRAINING ====================
def get_training_config(self):
"""Get default training configuration."""
return {
'base_model': '',
'dataset': '',
'output_dir': str(self._training_dir / 'output'),
'lora_r': 16,
'lora_alpha': 32,
'lora_dropout': 0.05,
'num_epochs': 3,
'batch_size': 4,
'gradient_accumulation_steps': 4,
'learning_rate': 2e-4,
'max_seq_length': 2048,
'warmup_ratio': 0.03,
'use_4bit': True,
'use_unsloth': False,
'save_steps': 50,
'logging_steps': 10,
}
def browse_models(self, directory=''):
"""Browse local directories for model files (HuggingFace format)."""
if not directory:
directory = str(self._models_dir)
target = Path(directory)
if not target.exists():
return {'error': f'Directory not found: {directory}', 'entries': []}
entries = []
try:
for item in sorted(target.iterdir()):
if item.name.startswith('.'):
continue
entry = {
'name': item.name,
'path': str(item).replace('\\', '/'),
'is_dir': item.is_dir(),
}
if item.is_dir():
# Check if it looks like a HuggingFace model directory
has_config = (item / 'config.json').exists()
has_model = any(item.glob('*.safetensors')) or any(item.glob('*.bin'))
entry['is_model'] = has_config and has_model
elif item.suffix in ('.gguf', '.bin', '.safetensors'):
entry['size_gb'] = round(item.stat().st_size / (1024**3), 2)
entries.append(entry)
except PermissionError:
return {'error': f'Permission denied: {directory}', 'entries': []}
return {
'current_dir': str(target).replace('\\', '/'),
'parent_dir': str(target.parent).replace('\\', '/') if target.parent != target else None,
'entries': entries,
}
    def start_training(self, config):
        """Start LoRA fine-tuning in a background process.

        Verifies no run is already active and required packages are present,
        writes a standalone training script, then launches it with the
        current interpreter, streaming stdout/stderr into training.log.

        Args:
            config: training config dict (see get_training_config); its
                'output_dir' entry is normalized in place.

        Returns:
            {'success', 'pid', 'log_path', 'output_dir'} on launch, or
            {'error': ...} on refusal/failure.
        """
        # Refuse to start a second run while the previous one is alive.
        if self._training_process and self._training_process.poll() is None:
            return {'error': 'Training already in progress'}
        # Check critical dependencies before starting
        deps = self.check_dependencies()
        missing = []
        for pkg in ['torch', 'transformers', 'peft', 'datasets', 'trl']:
            if not deps.get(pkg, {}).get('installed'):
                missing.append(pkg)
        if missing:
            return {'error': f'Missing required packages: {", ".join(missing)}. Go to the Dependencies tab to install them.'}
        self._status['phase'] = 'training'
        self._status['progress'] = 0
        self._status['message'] = 'Starting training...'
        self._log('Starting LoRA fine-tuning...')
        # Generate the training script
        script_path = self._training_dir / 'train_lora.py'
        output_dir = Path(config.get('output_dir', str(self._training_dir / 'output')))
        output_dir.mkdir(parents=True, exist_ok=True)
        config['output_dir'] = str(output_dir)
        script = self._generate_training_script(config)
        script_path.write_text(script, encoding='utf-8')
        self._log(f'Training script written to {script_path}')
        # Run in background; the child inherits the log file descriptor,
        # so closing our handle after Popen is safe.
        log_path = self._training_dir / 'training.log'
        try:
            with open(log_path, 'w') as log_file:
                self._training_process = subprocess.Popen(
                    [sys.executable, str(script_path)],
                    stdout=log_file,
                    stderr=subprocess.STDOUT,
                    cwd=str(self._project_root),
                )
            self._log(f'Training started (PID: {self._training_process.pid})')
            return {
                'success': True,
                'pid': self._training_process.pid,
                'log_path': str(log_path),
                'output_dir': str(output_dir),
            }
        except Exception as e:
            self._status['phase'] = 'idle'
            self._log(f'Failed to start training: {e}', 'error')
            return {'error': str(e)}
    def _generate_training_script(self, config):
        """Generate the LoRA training Python script as a string.

        Emits a standalone script: an Unsloth variant when
        config['use_unsloth'] is truthy, otherwise a Transformers+PEFT
        variant. The script reads the JSONL dataset, formats samples into
        ChatML-style text, runs SFTTrainer, and saves the LoRA adapter
        under <output_dir>/lora_adapter.
        """
        # Use forward slashes for all paths to avoid Python escape sequence issues
        dataset_path = config.get('dataset', '').replace('\\', '/')
        base_model = config.get('base_model', '').replace('\\', '/')
        output_dir = config.get('output_dir', str(self._training_dir / 'output')).replace('\\', '/')
        use_unsloth = config.get('use_unsloth', False)
        if use_unsloth:
            # NOTE: {{...}} below are literal braces in the generated script;
            # single {...} are substituted now from config.
            return f'''#!/usr/bin/env python3
"""AUTARCH LoRA Training Script (Unsloth)"""
import json
from unsloth import FastLanguageModel
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments
# Load model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="{base_model}",
    max_seq_length={config.get('max_seq_length', 2048)},
    load_in_4bit={config.get('use_4bit', True)},
)
# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r={config.get('lora_r', 16)},
    lora_alpha={config.get('lora_alpha', 32)},
    lora_dropout={config.get('lora_dropout', 0.05)},
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)
# Load dataset
samples = []
with open("{dataset_path}", "r") as f:
    for line in f:
        samples.append(json.loads(line))
def format_sample(sample):
    if "conversations" in sample:
        msgs = sample["conversations"]
        text = ""
        for msg in msgs:
            role = "user" if msg["from"] == "human" else "assistant"
            text += f"<|im_start|>{{role}}\\n{{msg['value']}}<|im_end|>\\n"
        return {{"text": text}}
    else:
        return {{"text": f"<|im_start|>user\\n{{sample['instruction']}}\\n{{sample.get('input','')}}<|im_end|>\\n<|im_start|>assistant\\n{{sample['output']}}<|im_end|>\\n"}}
dataset = Dataset.from_list([format_sample(s) for s in samples])
# Train
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length={config.get('max_seq_length', 2048)},
    args=TrainingArguments(
        output_dir="{output_dir}",
        num_train_epochs={config.get('num_epochs', 3)},
        per_device_train_batch_size={config.get('batch_size', 4)},
        gradient_accumulation_steps={config.get('gradient_accumulation_steps', 4)},
        learning_rate={config.get('learning_rate', 2e-4)},
        warmup_ratio={config.get('warmup_ratio', 0.03)},
        save_steps={config.get('save_steps', 50)},
        logging_steps={config.get('logging_steps', 10)},
        fp16=True,
        optim="adamw_8bit",
    ),
)
print("Starting training...")
trainer.train()
print("Training complete!")
# Save
model.save_pretrained("{output_dir}/lora_adapter")
tokenizer.save_pretrained("{output_dir}/lora_adapter")
print(f"LoRA adapter saved to {output_dir}/lora_adapter")
'''
        else:
            # Same escaping convention as the Unsloth template above.
            return f'''#!/usr/bin/env python3
"""AUTARCH LoRA Training Script (Transformers + PEFT)"""
import json
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, TrainingArguments,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit={config.get('use_4bit', True)},
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
) if {config.get('use_4bit', True)} else None
print("Loading base model: {base_model}")
model = AutoModelForCausalLM.from_pretrained(
    "{base_model}",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=False,
)
tokenizer = AutoTokenizer.from_pretrained("{base_model}", trust_remote_code=False)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if {config.get('use_4bit', True)}:
    model = prepare_model_for_kbit_training(model)
# LoRA config
lora_config = LoraConfig(
    r={config.get('lora_r', 16)},
    lora_alpha={config.get('lora_alpha', 32)},
    lora_dropout={config.get('lora_dropout', 0.05)},
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Load dataset
samples = []
with open("{dataset_path}", "r") as f:
    for line in f:
        samples.append(json.loads(line))
def format_sample(sample):
    if "conversations" in sample:
        msgs = sample["conversations"]
        text = ""
        for msg in msgs:
            role = "user" if msg["from"] == "human" else "assistant"
            text += f"<|im_start|>{{role}}\\n{{msg['value']}}<|im_end|>\\n"
        return {{"text": text}}
    else:
        return {{"text": f"<|im_start|>user\\n{{sample['instruction']}}\\n{{sample.get('input','')}}<|im_end|>\\n<|im_start|>assistant\\n{{sample['output']}}<|im_end|>\\n"}}
dataset = Dataset.from_list([format_sample(s) for s in samples])
print(f"Dataset: {{len(dataset)}} samples")
# Train
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length={config.get('max_seq_length', 2048)},
    args=TrainingArguments(
        output_dir="{output_dir}",
        num_train_epochs={config.get('num_epochs', 3)},
        per_device_train_batch_size={config.get('batch_size', 4)},
        gradient_accumulation_steps={config.get('gradient_accumulation_steps', 4)},
        learning_rate={config.get('learning_rate', 2e-4)},
        warmup_ratio={config.get('warmup_ratio', 0.03)},
        save_steps={config.get('save_steps', 50)},
        logging_steps={config.get('logging_steps', 10)},
        fp16=True,
        optim="adamw_8bit",
        report_to="none",
    ),
)
print("Starting training...")
trainer.train()
print("Training complete!")
# Save
model.save_pretrained("{output_dir}/lora_adapter")
tokenizer.save_pretrained("{output_dir}/lora_adapter")
print(f"LoRA adapter saved to {output_dir}/lora_adapter")
'''
def get_training_status(self):
"""Get current training status including log tail."""
result = dict(self._status)
if self._training_process:
poll = self._training_process.poll()
if poll is None:
result['training_running'] = True
result['pid'] = self._training_process.pid
else:
result['training_running'] = False
result['exit_code'] = poll
if self._status['phase'] == 'training':
self._status['phase'] = 'idle'
self._status['message'] = 'Training finished' if poll == 0 else f'Training failed (exit {poll})'
else:
result['training_running'] = False
# Read training log tail
log_path = self._training_dir / 'training.log'
if log_path.exists():
try:
lines = log_path.read_text(encoding='utf-8', errors='replace').split('\n')
result['training_log'] = '\n'.join(lines[-50:])
except Exception:
result['training_log'] = ''
else:
result['training_log'] = ''
return result
def stop_training(self):
"""Stop the running training process."""
if self._training_process and self._training_process.poll() is None:
self._training_process.terminate()
self._training_process.wait(timeout=10)
self._status['phase'] = 'idle'
self._status['message'] = 'Training stopped by user'
self._log('Training stopped by user', 'warning')
return True
return False
# ==================== GGUF CONVERSION ====================
def list_adapters(self):
"""List saved LoRA adapters."""
adapters = []
output_dir = self._training_dir / 'output'
if output_dir.exists():
for d in output_dir.iterdir():
if d.is_dir():
config_path = d / 'adapter_config.json'
if config_path.exists():
try:
config = json.loads(config_path.read_text())
adapters.append({
'name': d.name,
'path': str(d),
'base_model': config.get('base_model_name_or_path', ''),
'r': config.get('r', 0),
'lora_alpha': config.get('lora_alpha', 0),
})
except Exception:
adapters.append({'name': d.name, 'path': str(d)})
return adapters
    def merge_and_convert(self, adapter_path, output_name, quantization='Q5_K_M'):
        """Merge LoRA adapter with base model and convert to GGUF.

        This is a multi-step process:
        1. Load base model + LoRA adapter
        2. Merge weights
        3. Save merged model
        4. Convert to GGUF format
        5. Quantize

        Args:
            adapter_path: Path to a saved adapter directory; must contain an
                adapter_config.json with base_model_name_or_path.
            output_name: Basename (no extension) for the .gguf file written
                into the models directory.
            quantization: llama.cpp quantization preset (e.g. 'Q5_K_M');
                'F16' skips the quantize step.

        Returns:
            dict: ``{'success': True, 'output_path', 'size_bytes'}`` on full
            success; ``{'partial': True, 'merged_path', 'message'}`` when the
            merge succeeded but no GGUF converter was found; ``{'error': ...}``
            on any failure or timeout.
        """
        self._status['phase'] = 'converting'
        self._status['progress'] = 0
        self._status['message'] = 'Starting merge and conversion...'
        self._log(f'Starting merge: adapter={adapter_path}, quant={quantization}')
        merged_dir = self._training_dir / 'merged'
        merged_dir.mkdir(parents=True, exist_ok=True)
        output_path = self._models_dir / f'{output_name}.gguf'
        # Generate merge+convert script
        # NOTE(review): paths are interpolated as plain string literals into the
        # generated script; on Windows, backslashes in adapter_path/merged_dir
        # would become (invalid) escape sequences -- confirm whether callers
        # only ever pass POSIX-style paths.
        script = f'''#!/usr/bin/env python3
"""Merge LoRA adapter and convert to GGUF."""
import json, sys
from pathlib import Path
adapter_path = Path("{adapter_path}")
config_path = adapter_path / "adapter_config.json"
if not config_path.exists():
    print("ERROR: adapter_config.json not found")
    sys.exit(1)
config = json.loads(config_path.read_text())
base_model = config.get("base_model_name_or_path", "")
if not base_model:
    print("ERROR: No base_model_name_or_path in adapter config")
    sys.exit(1)
print(f"Base model: {{base_model}}")
print(f"Adapter: {{adapter_path}}")
# Step 1: Load and merge
print("Loading base model...")
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
model = AutoModelForCausalLM.from_pretrained(base_model, device_map="cpu")
tokenizer = AutoTokenizer.from_pretrained(base_model)
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(model, str(adapter_path))
print("Merging weights...")
model = model.merge_and_unload()
merged_path = "{merged_dir}"
print(f"Saving merged model to {{merged_path}}")
model.save_pretrained(merged_path)
tokenizer.save_pretrained(merged_path)
print("Merge complete!")
'''
        script_path = self._training_dir / 'merge_model.py'
        script_path.write_text(script, encoding='utf-8')
        # Run merge
        # Run the merge in a child interpreter so heavyweight ML imports
        # (transformers/peft) never load into this process.
        self._status['message'] = 'Merging LoRA adapter with base model...'
        self._status['progress'] = 10
        try:
            result = subprocess.run(
                [sys.executable, str(script_path)],
                capture_output=True, text=True, timeout=1800  # 30 min max
            )
            if result.returncode != 0:
                self._log(f'Merge failed: {result.stderr}', 'error')
                self._status['phase'] = 'idle'
                # Only the last 500 chars of stderr are surfaced to the caller.
                return {'error': f'Merge failed: {result.stderr[-500:]}'}
            self._log('Merge complete')
        except subprocess.TimeoutExpired:
            self._status['phase'] = 'idle'
            return {'error': 'Merge timed out (30 min limit)'}
        # Convert to GGUF using llama.cpp convert script
        self._status['message'] = 'Converting to GGUF format...'
        self._status['progress'] = 60
        # Try to find llama.cpp convert script
        convert_script = None
        search_paths = [
            self._project_root / 'tools' / 'llama.cpp' / 'convert_hf_to_gguf.py',
            Path.home() / 'llama.cpp' / 'convert_hf_to_gguf.py',
        ]
        for p in search_paths:
            if p.exists():
                convert_script = p
                break
        if not convert_script:
            # Try pip-installed llama-cpp-python convert
            # Fallback: best-effort attempt via the pip package; any exception
            # here (missing module, timeout) drops through to the 'partial'
            # result below.
            self._log('llama.cpp convert script not found, trying pip package...', 'warning')
            try:
                result = subprocess.run(
                    [sys.executable, '-m', 'llama_cpp.convert',
                     str(merged_dir), '--outfile', str(output_path),
                     '--outtype', quantization.lower()],
                    capture_output=True, text=True, timeout=1800
                )
                if result.returncode == 0:
                    self._status['phase'] = 'idle'
                    self._status['progress'] = 100
                    self._log(f'GGUF saved to {output_path}')
                    return {
                        'success': True,
                        'output_path': str(output_path),
                        'size_bytes': output_path.stat().st_size if output_path.exists() else 0,
                    }
            except Exception:
                pass
            # Merge succeeded but no converter is available: report partial
            # success with manual instructions rather than an error.
            self._status['phase'] = 'idle'
            self._status['message'] = 'Merged model saved but GGUF conversion requires llama.cpp'
            return {
                'partial': True,
                'merged_path': str(merged_dir),
                'message': 'Model merged successfully. To convert to GGUF, install llama.cpp '
                           'and run: python convert_hf_to_gguf.py <merged_path> --outfile <output.gguf>',
            }
        # Run convert script
        # Always convert to f16 first; quantization (if requested) happens below.
        try:
            result = subprocess.run(
                [sys.executable, str(convert_script),
                 str(merged_dir), '--outfile', str(output_path),
                 '--outtype', 'f16'],
                capture_output=True, text=True, timeout=1800
            )
            if result.returncode != 0:
                self._status['phase'] = 'idle'
                return {'error': f'GGUF conversion failed: {result.stderr[-500:]}'}
        except subprocess.TimeoutExpired:
            self._status['phase'] = 'idle'
            return {'error': 'GGUF conversion timed out'}
        # Quantize if not f16
        if quantization.upper() != 'F16':
            self._status['message'] = f'Quantizing to {quantization}...'
            self._status['progress'] = 80
            quantize_bin = None
            for p in [self._project_root / 'tools' / 'llama.cpp' / 'llama-quantize',
                      Path.home() / 'llama.cpp' / 'llama-quantize',
                      Path('/usr/local/bin/llama-quantize')]:
                if p.exists():
                    quantize_bin = p
                    break
                # Check .exe variant on Windows
                p_exe = p.with_suffix('.exe')
                if p_exe.exists():
                    quantize_bin = p_exe
                    break
            if quantize_bin:
                # Path.with_stem requires Python 3.9+ -- presumably the
                # project's minimum version; verify against setup metadata.
                quant_output = output_path.with_stem(f'{output_name}_{quantization}')
                try:
                    result = subprocess.run(
                        [str(quantize_bin), str(output_path),
                         str(quant_output), quantization],
                        capture_output=True, text=True, timeout=1800
                    )
                    if result.returncode == 0:
                        # Replace f16 with quantized version
                        output_path.unlink()
                        shutil.move(str(quant_output), str(output_path))
                        self._log(f'Quantized to {quantization}')
                except Exception as e:
                    # Quantization is best-effort: on failure the f16 GGUF at
                    # output_path is kept and still reported as success.
                    self._log(f'Quantization failed: {e}', 'warning')
        self._status['phase'] = 'idle'
        self._status['progress'] = 100
        self._status['message'] = f'GGUF model saved: {output_path.name}'
        self._log(f'GGUF model saved to {output_path}')
        return {
            'success': True,
            'output_path': str(output_path),
            'size_bytes': output_path.stat().st_size if output_path.exists() else 0,
        }
def list_models(self):
"""List available GGUF models."""
models = []
if self._models_dir.exists():
for f in sorted(self._models_dir.glob('*.gguf')):
models.append({
'name': f.stem,
'filename': f.name,
'path': str(f),
'size_bytes': f.stat().st_size,
'size_gb': round(f.stat().st_size / (1024**3), 2),
'modified': datetime.fromtimestamp(f.stat().st_mtime).isoformat(),
})
return models
# ==================== EVALUATION ====================
def evaluate_model(self, model_path, test_prompts=None):
"""Quick evaluation of a GGUF model with test prompts."""
if not test_prompts:
test_prompts = [
"What is AUTARCH?",
"How do I create a new defense module?",
"What module categories does AUTARCH support?",
"Create a module that scans for open ports on localhost.",
]
self._status['phase'] = 'evaluating'
self._status['message'] = 'Loading model for evaluation...'
self._log(f'Evaluating model: {model_path}')
results = []
try:
from core.llm import LLM
llm = LLM()
llm.load_model(model_path)
for i, prompt in enumerate(test_prompts):
self._status['progress'] = int((i / len(test_prompts)) * 100)
self._status['message'] = f'Testing prompt {i+1}/{len(test_prompts)}...'
response = llm.generate(prompt, max_tokens=512)
results.append({
'prompt': prompt,
'response': response,
'length': len(response),
})
except Exception as e:
self._status['phase'] = 'idle'
return {'error': str(e)}
self._status['phase'] = 'idle'
self._status['progress'] = 100
self._status['message'] = 'Evaluation complete'
return {'results': results, 'model': model_path}
# ==================== SINGLETON ====================
_trainer_instance = None


def get_trainer():
    """Return the shared LLMTrainer, constructing it lazily on first use."""
    global _trainer_instance
    # An LLMTrainer instance is always truthy, so `or` only constructs once.
    _trainer_instance = _trainer_instance or LLMTrainer()
    return _trainer_instance
# ==================== CLI ====================
def run():
    """CLI entry point."""
    from core.banner import Colors, clear_screen, display_banner
    clear_screen()
    display_banner()
    print(f"\n{Colors.BOLD}{Colors.CYAN}LLM Trainer{Colors.RESET}\n")
    trainer = LLMTrainer()

    def show_deps(deps, missing_label):
        # Render one line per dependency with a colored install status.
        for name, info in deps.items():
            if isinstance(info, dict) and 'installed' in info:
                status = f"{Colors.GREEN}v{info['version']}{Colors.RESET}" if info['installed'] else f"{Colors.RED}{missing_label}{Colors.RESET}"
                print(f"  {name}: {status}")

    print(f"{Colors.CYAN}[*] Checking dependencies...{Colors.RESET}")
    show_deps(trainer.check_dependencies(), 'Not installed')
    print(f"\n{Colors.CYAN}[*] Scanning codebase...{Colors.RESET}")
    scan = trainer.scan_codebase()
    print(f"  Files: {scan['total_files']}")
    print(f"  Lines: {scan['total_lines']}")
    while True:
        print(f"\n{Colors.BOLD}Options:{Colors.RESET}")
        print("  1. Generate training dataset")
        print("  2. List datasets")
        print("  3. Check dependencies")
        print("  4. Install dependencies")
        print("  0. Exit")
        choice = input(f"\n{Colors.CYAN}Select: {Colors.RESET}").strip()
        if choice == '0':
            break
        if choice == '1':
            result = trainer.generate_dataset()
            print(f"\n{Colors.GREEN}[+] Generated {result['sample_count']} samples{Colors.RESET}")
            print(f"  File: {result['path']}")
        elif choice == '2':
            for d in trainer.list_datasets():
                print(f"  {d['filename']}{d['sample_count']} samples, "
                      f"{d['size_bytes']//1024}KB")
        elif choice == '3':
            show_deps(trainer.check_dependencies(), 'Missing')
        elif choice == '4':
            for r in trainer.install_dependencies():
                status = f"{Colors.GREEN}OK{Colors.RESET}" if r['success'] else f"{Colors.RED}FAIL{Colors.RESET}"
                print(f"  {r['package']}: {status}")
        input("\nPress Enter to continue...")