scripts/build_training_data.py

#!/usr/bin/env python3
"""
AUTARCH LoRA Training Data Generator
Extracts instruction/input/output triplets from the codebase
for fine-tuning LLMs on AUTARCH module creation patterns.

Run: python scripts/build_training_data.py
Output: data/codex/autarch_training.jsonl

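Generates training pairs for:
- Module creation (description → code)
- Route creation (feature description → Flask blueprint)
- Config patterns (section description → config code) — not yet generated by this script
- Template patterns (feature → Jinja2 template)
- Core API reference (public function signature + docstring → function source)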
"""

import ast
import json
import sys
import re
from pathlib import Path
from datetime import datetime

# Repository root: two levels up from this file (the script is run as
# scripts/build_training_data.py).
FRAMEWORK_DIR = Path(__file__).parent.parent
# Put the repository root at the front of sys.path so it is searched first
# for imports.
sys.path.insert(0, str(FRAMEWORK_DIR))

# Destination of the generated JSON Lines training set.
OUTPUT_PATH = FRAMEWORK_DIR / 'data' / 'codex' / 'autarch_training.jsonl'


def extract_module_pair(filepath: Path) -> dict:
    """Build a 'module_creation' training pair from a module file.

    The file is parsed with ``ast`` (it is never imported or executed) and
    scanned for ``DESCRIPTION`` and ``CATEGORY`` assignments whose value is a
    literal constant. If a name is assigned more than once, the assignment
    visited last by ``ast.walk`` wins.

    Args:
        filepath: Path to the module's ``.py`` file. It must live under
            ``FRAMEWORK_DIR`` so a relative ``source_file`` can be recorded.

    Returns:
        A dict with ``instruction``, ``input``, ``output`` (the full file
        text), ``type``, ``category`` and ``source_file`` keys, or ``None``
        when the file cannot be read or parsed, when ``DESCRIPTION`` is not a
        non-empty string literal, or when ``CATEGORY`` is missing or empty.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        tree = ast.parse(source)
    except (OSError, SyntaxError, ValueError):
        # OSError: the file could not be read. ValueError: ast.parse can
        # reject some inputs this way (it also covers UnicodeDecodeError).
        return None

    wanted = ('DESCRIPTION', 'CATEGORY')
    found = {}
    for node in ast.walk(tree):
        if not (isinstance(node, ast.Assign) and isinstance(node.value, ast.Constant)):
            continue
        for target in node.targets:
            if isinstance(target, ast.Name) and target.id in wanted:
                found[target.id] = node.value.value

    description = found.get('DESCRIPTION')
    category = found.get('CATEGORY')

    # The description is lower-cased and stripped below, so it has to be a
    # real string; a numeric or bytes literal would otherwise raise.
    if not isinstance(description, str) or not description or not category:
        return None

    # Build the instruction
    instruction = (
        f"Create an AUTARCH module in the '{category}' category that {description.lower().rstrip('.')}. "
        f"The module should follow AUTARCH conventions with DESCRIPTION, AUTHOR, VERSION, CATEGORY "
        f"attributes and a run() entry point function."
    )

    return {
        'instruction': instruction,
        'input': f"Module name: {filepath.stem}\nCategory: {category}\nDescription: {description}",
        'output': source,
        'type': 'module_creation',
        'category': category,
        'source_file': str(filepath.relative_to(FRAMEWORK_DIR)),
    }


def extract_route_pair(filepath: Path) -> dict:
    """Build a 'route_creation' training pair from a Flask route file.

    The file is parsed with ``ast`` and searched for an assignment of the
    form ``name = Blueprint(...)`` (or ``name = something.Blueprint(...)``).
    If several exist, the one visited last by ``ast.walk`` is used, and the
    reported prefix is always taken from that same call.

    Args:
        filepath: Path to the route's ``.py`` file. It must live under
            ``FRAMEWORK_DIR`` so a relative ``source_file`` can be recorded.

    Returns:
        A dict with ``instruction``, ``input``, ``output`` (the full file
        text), ``type`` and ``source_file`` keys, or ``None`` when the file
        cannot be read or parsed or defines no blueprint.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        tree = ast.parse(source)
    except (OSError, SyntaxError, ValueError):
        # OSError: the file could not be read. ValueError: ast.parse can
        # reject some inputs this way (it also covers UnicodeDecodeError).
        return None

    # Find blueprint name and prefix
    bp_name = None
    bp_prefix = None
    for node in ast.walk(tree):
        if not (isinstance(node, ast.Assign) and isinstance(node.value, ast.Call)):
            continue
        func = node.value.func
        if isinstance(func, ast.Name):
            callee = func.id
        elif isinstance(func, ast.Attribute):
            callee = func.attr
        else:
            continue
        if callee != 'Blueprint':
            continue

        # Only a literal url_prefix can be read statically.
        prefix = None
        for kw in node.value.keywords:
            if kw.arg == 'url_prefix' and isinstance(kw.value, ast.Constant):
                prefix = kw.value.value

        for target in node.targets:
            if isinstance(target, ast.Name):
                bp_name = target.id
                # Keep name and prefix from the same Blueprint(...) call.
                bp_prefix = prefix

    if not bp_name:
        return None

    # Count routes: one per ``@<something>.route(...)`` decorator on a
    # top-level sync or async function.
    route_count = 0
    for node in ast.iter_child_nodes(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        for deco in node.decorator_list:
            if (isinstance(deco, ast.Call)
                    and isinstance(deco.func, ast.Attribute)
                    and deco.func.attr == 'route'):
                route_count += 1

    feature_name = filepath.stem.replace('_', ' ').title()
    # NOTE: the instruction states @login_required usage as a convention; the
    # source file is not checked for it.
    instruction = (
        f"Create a Flask blueprint route file for AUTARCH's '{feature_name}' feature. "
        f"It should have a blueprint with url_prefix='{bp_prefix or '/' + filepath.stem}', "
        f"use @login_required on all routes, and follow AUTARCH web route conventions. "
        f"It needs {route_count} route handlers."
    )

    return {
        'instruction': instruction,
        'input': f"Feature: {feature_name}\nBlueprint: {bp_name}\nPrefix: {bp_prefix}\nRoutes: {route_count}",
        'output': source,
        'type': 'route_creation',
        'source_file': str(filepath.relative_to(FRAMEWORK_DIR)),
    }


def extract_template_pair(filepath: Path) -> dict:
    """Build a 'template_creation' training pair from a Jinja2 template.

    Only templates that contain both ``{% extends`` and
    ``{% block content %}`` are used. A few substring counts (section and
    tab class attributes, forms and ``fetch(`` calls, a ``<script>`` tag)
    are recorded as the pair's input.

    Returns:
        The training-pair dict, or ``None`` when the file cannot be read or
        does not contain both markers.
    """
    try:
        markup = filepath.read_text(encoding='utf-8', errors='ignore')
    except Exception:
        return None

    is_page = '{% extends' in markup and '{% block content %}' in markup
    if not is_page:
        return None

    def _occurrences(*needles):
        # Total number of times any of the given substrings appears.
        return sum(markup.count(needle) for needle in needles)

    section_total = _occurrences('class="section"', "class='section'")
    tab_total = _occurrences('class="tab"', "class='tab'")
    form_total = _occurrences('<form', 'fetch(')
    uses_script = '<script>' in markup

    page_title = filepath.stem.replace('_', ' ').title()

    summary_lines = (
        f"Template: {filepath.name}\n"
        f"Sections: {section_total}\nTabs: {tab_total}\nForms/API calls: {form_total}\n"
        f"Has JavaScript: {uses_script}"
    )
    prompt = (
        f"Create an AUTARCH web template for the '{page_title}' page. "
        f"It should extend base.html, have a page header, and use AUTARCH's "
        f"CSS variables and UI patterns (sections, tab bars, data tables, buttons)."
    )

    return {
        'instruction': prompt,
        'input': summary_lines,
        'output': markup,
        'type': 'template_creation',
        'source_file': str(filepath.relative_to(FRAMEWORK_DIR)),
    }


def _describe_parameters(arguments) -> list:
    """Return the parameter names of an ``ast.arguments`` node in signature order.

    Positional-only parameters are followed by ``/``, variadic parameters keep
    their ``*`` / ``**`` prefix, and a bare ``*`` is inserted before
    keyword-only parameters when there is no ``*args``. ``self`` is omitted.
    """
    # posonlyargs is absent on interpreters older than 3.8.
    positional_only = [a.arg for a in getattr(arguments, 'posonlyargs', []) if a.arg != 'self']
    names = list(positional_only)
    if positional_only:
        names.append('/')
    names.extend(a.arg for a in arguments.args if a.arg != 'self')
    if arguments.vararg is not None:
        names.append('*' + arguments.vararg.arg)
    elif arguments.kwonlyargs:
        names.append('*')
    names.extend(a.arg for a in arguments.kwonlyargs)
    if arguments.kwarg is not None:
        names.append('**' + arguments.kwarg.arg)
    return names


def extract_core_api_pairs(filepath: Path) -> list:
    """Build 'api_reference' training pairs from a core module's public functions.

    One pair is produced for every top-level function (sync or async) whose
    name does not start with an underscore and which has a docstring. The
    pair's output is the function's source text, starting at its ``def`` line.

    Args:
        filepath: Path to a ``.py`` file; only its name and stem are used to
            label the pairs (as ``core/<name>``).

    Returns:
        A list of training-pair dicts; empty when the file cannot be read or
        parsed or contains no qualifying function.
    """
    pairs = []
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        tree = ast.parse(source)
    except (OSError, SyntaxError, ValueError):
        # OSError: the file could not be read. ValueError: ast.parse can
        # reject some inputs this way (it also covers UnicodeDecodeError).
        return pairs

    # Split once; every function slices the same list.
    lines = source.split('\n')
    module_name = filepath.stem

    for node in ast.iter_child_nodes(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        if node.name.startswith('_'):
            continue
        doc = ast.get_docstring(node) or ''
        if not doc:
            continue

        # Extract the function source
        start = node.lineno - 1
        # Fall back to a fixed 20-line window if the node has no end line.
        end = getattr(node, 'end_lineno', None) or start + 20
        func_source = '\n'.join(lines[start:end])

        signature = ', '.join(_describe_parameters(node.args))
        doc_summary = doc.split('\n')[0]

        pairs.append({
            'instruction': f"Show how to implement the `{node.name}` function in core/{module_name}.py",
            'input': f"Function: {node.name}({signature})\nDocstring: {doc_summary}",
            'output': func_source,
            'type': 'api_reference',
            'source_file': f"core/{filepath.name}",
        })

    return pairs


def build_training_data():
    """Scan the framework tree and write the training set plus a summary.

    Pairs are collected from modules/, web/routes/, web/templates/ and core/
    (in that order), written as JSON Lines to OUTPUT_PATH, and counted in a
    sibling '.summary.json' file. Progress is printed to stdout.
    """
    print("[training] Scanning codebase for training pairs...")

    def _python_files(directory):
        # Sorted listing of a directory's .py files, minus the package marker.
        return [p for p in sorted(directory.glob('*.py')) if p.name != '__init__.py']

    records = []

    # modules/ -> module_creation pairs (extractors return None to skip a file)
    records.extend(filter(None, map(extract_module_pair, _python_files(FRAMEWORK_DIR / 'modules'))))
    module_count = len(records)
    print(f"  Modules: {module_count} pairs")

    # web/routes/ -> route_creation pairs
    mark = len(records)
    records.extend(filter(None, map(extract_route_pair, _python_files(FRAMEWORK_DIR / 'web' / 'routes'))))
    route_count = len(records) - mark
    print(f"  Routes: {route_count} pairs")

    # web/templates/ -> template_creation pairs
    mark = len(records)
    html_files = sorted((FRAMEWORK_DIR / 'web' / 'templates').glob('*.html'))
    records.extend(filter(None, map(extract_template_pair, html_files)))
    template_count = len(records) - mark
    print(f"  Templates: {template_count} pairs")

    # core/ -> api_reference pairs (each file may contribute several)
    mark = len(records)
    for core_file in _python_files(FRAMEWORK_DIR / 'core'):
        records.extend(extract_core_api_pairs(core_file))
    api_count = len(records) - mark
    print(f"  Core API: {api_count} pairs")

    # One JSON object per line
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as sink:
        sink.writelines(json.dumps(record, ensure_ascii=False) + '\n' for record in records)

    total_size = OUTPUT_PATH.stat().st_size
    print(f"\n[training] Written {len(records)} training pairs ({total_size:,} bytes) to {OUTPUT_PATH}")
    print(f"[training] Breakdown: {module_count} modules, {route_count} routes, "
          f"{template_count} templates, {api_count} core API functions")

    # Tally pairs by their 'type' field for the summary file
    type_totals = {}
    for record in records:
        kind = record['type']
        type_totals[kind] = type_totals.get(kind, 0) + 1

    summary = {
        'generated': datetime.now().isoformat(),
        'total_pairs': len(records),
        'modules': module_count,
        'routes': route_count,
        'templates': template_count,
        'core_api': api_count,
        'output_bytes': total_size,
        'types': type_totals,
    }
    summary_path = OUTPUT_PATH.with_suffix('.summary.json')
    summary_path.write_text(json.dumps(summary, indent=2), encoding='utf-8')
    print(f"[training] Summary: {summary_path}")


# Generate the training data only when this file is run as a script, not
# when it is imported.
if __name__ == '__main__':
    build_training_data()
AUTARCH v1.9 — remote monitoring, SSH manager, daemon, vault, cleanup - Add Remote Monitoring Station with PIAP device profile system - Add SSH/SSHD manager with fail2ban integration - Add privileged daemon architecture for safe root operations - Add encrypted vault, HAL memory, HAL auto-analyst - Add network security suite, module creator, codex training - Add start.sh launcher script and GTK3 desktop launcher - Remove Output/ build artifacts, installer files, loose docs - Update .gitignore for runtime data and build artifacts - Update README for v1.9 with new launch method, screenshots, and features Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-24 06:59:06 -07:00			`#!/usr/bin/env python3`
			`"""`
			`AUTARCH LoRA Training Data Generator`
			`Extracts instruction/input/output triplets from the codebase`
			`for fine-tuning LLMs on AUTARCH module creation patterns.`

			`Run: python scripts/build_training_data.py`
			`Output: data/codex/autarch_training.jsonl`

			`Generates training pairs for:`
			`- Module creation (description → code)`
			`- Route creation (feature description → Flask blueprint)`
			`- Config patterns (section description → config code)`
			`- Template patterns (feature → Jinja2 template)`
			`"""`

			`import ast`
			`import json`
			`import sys`
			`import re`
			`from pathlib import Path`
			`from datetime import datetime`

			`FRAMEWORK_DIR = Path(__file__).parent.parent`
			`sys.path.insert(0, str(FRAMEWORK_DIR))`

			`OUTPUT_PATH = FRAMEWORK_DIR / 'data' / 'codex' / 'autarch_training.jsonl'`


			`def extract_module_pair(filepath: Path) -> dict:`
			`"""Extract a training pair from a module file."""`
			`try:`
			`source = filepath.read_text(encoding='utf-8', errors='ignore')`
			`tree = ast.parse(source)`
			`except (SyntaxError, UnicodeDecodeError):`
			`return None`

			`description = None`
			`category = None`
			`author = None`
			`version = None`
			`docstring = ast.get_docstring(tree) or ''`

			`for node in ast.walk(tree):`
			`if isinstance(node, ast.Assign):`
			`for target in node.targets:`
			`if isinstance(target, ast.Name) and isinstance(node.value, ast.Constant):`
			`if target.id == 'DESCRIPTION':`
			`description = node.value.value`
			`elif target.id == 'CATEGORY':`
			`category = node.value.value`
			`elif target.id == 'AUTHOR':`
			`author = node.value.value`
			`elif target.id == 'VERSION':`
			`version = node.value.value`

			`if not description or not category:`
			`return None`

			`# Build the instruction`
			`instruction = (`
			`f"Create an AUTARCH module in the '{category}' category that {description.lower().rstrip('.')}. "`
			`f"The module should follow AUTARCH conventions with DESCRIPTION, AUTHOR, VERSION, CATEGORY "`
			`f"attributes and a run() entry point function."`
			`)`

			`return {`
			`'instruction': instruction,`
			`'input': f"Module name: {filepath.stem}\nCategory: {category}\nDescription: {description}",`
			`'output': source,`
			`'type': 'module_creation',`
			`'category': category,`
			`'source_file': str(filepath.relative_to(FRAMEWORK_DIR)),`
			`}`


			`def extract_route_pair(filepath: Path) -> dict:`
			`"""Extract a training pair from a route file."""`
			`try:`
			`source = filepath.read_text(encoding='utf-8', errors='ignore')`
			`tree = ast.parse(source)`
			`except (SyntaxError, UnicodeDecodeError):`
			`return None`

			`docstring = ast.get_docstring(tree) or ''`

			`# Find blueprint name and prefix`
			`bp_name = None`
			`bp_prefix = None`
			`for node in ast.walk(tree):`
			`if isinstance(node, ast.Assign):`
			`for target in node.targets:`
			`if isinstance(target, ast.Name) and isinstance(node.value, ast.Call):`
			`if hasattr(node.value, 'func'):`
			`func_name = ''`
			`if hasattr(node.value.func, 'id'):`
			`func_name = node.value.func.id`
			`elif hasattr(node.value.func, 'attr'):`
			`func_name = node.value.func.attr`
			`if func_name == 'Blueprint':`
			`bp_name = target.id`
			`for kw in node.value.keywords:`
			`if kw.arg == 'url_prefix' and isinstance(kw.value, ast.Constant):`
			`bp_prefix = kw.value.value`

			`if not bp_name:`
			`return None`

			`# Count routes`
			`routes = []`
			`for node in ast.iter_child_nodes(tree):`
			`if isinstance(node, ast.FunctionDef):`
			`for deco in node.decorator_list:`
			`if isinstance(deco, ast.Call) and hasattr(deco, 'func'):`
			`if hasattr(deco.func, 'attr') and deco.func.attr == 'route':`
			`doc = ast.get_docstring(node) or ''`
			`routes.append({`
			`'handler': node.name,`
			`'doc': doc.split('\n')[0] if doc else '',`
			`})`

			`feature_name = filepath.stem.replace('_', ' ').title()`
			`instruction = (`
			`f"Create a Flask blueprint route file for AUTARCH's '{feature_name}' feature. "`
			`f"It should have a blueprint with url_prefix='{bp_prefix or '/' + filepath.stem}', "`
			`f"use @login_required on all routes, and follow AUTARCH web route conventions. "`
			`f"It needs {len(routes)} route handlers."`
			`)`

			`return {`
			`'instruction': instruction,`
			`'input': f"Feature: {feature_name}\nBlueprint: {bp_name}\nPrefix: {bp_prefix}\nRoutes: {len(routes)}",`
			`'output': source,`
			`'type': 'route_creation',`
			`'source_file': str(filepath.relative_to(FRAMEWORK_DIR)),`
			`}`


			`def extract_template_pair(filepath: Path) -> dict:`
			`"""Extract a training pair from a template file."""`
			`try:`
			`source = filepath.read_text(encoding='utf-8', errors='ignore')`
			`except Exception:`
			`return None`

			`if '{% extends' not in source or '{% block content %}' not in source:`
			`return None`

			`# Count sections, tabs, buttons, forms`
			`sections = source.count('class="section"') + source.count("class='section'")`
			`tabs = source.count('class="tab"') + source.count("class='tab'")`
			`forms = source.count('<form') + source.count('fetch(')`
			`has_script = '<script>' in source`

			`feature_name = filepath.stem.replace('_', ' ').title()`
			`instruction = (`
			`f"Create an AUTARCH web template for the '{feature_name}' page. "`
			`f"It should extend base.html, have a page header, and use AUTARCH's "`
			`f"CSS variables and UI patterns (sections, tab bars, data tables, buttons)."`
			`)`

			`return {`
			`'instruction': instruction,`
			`'input': (`
			`f"Template: {filepath.name}\n"`
			`f"Sections: {sections}\nTabs: {tabs}\nForms/API calls: {forms}\n"`
			`f"Has JavaScript: {has_script}"`
			`),`
			`'output': source,`
			`'type': 'template_creation',`
			`'source_file': str(filepath.relative_to(FRAMEWORK_DIR)),`
			`}`


			`def extract_core_api_pairs(filepath: Path) -> list:`
			`"""Extract training pairs showing how to use core APIs."""`
			`pairs = []`
			`try:`
			`source = filepath.read_text(encoding='utf-8', errors='ignore')`
			`tree = ast.parse(source)`
			`except (SyntaxError, UnicodeDecodeError):`
			`return pairs`

			`for node in ast.iter_child_nodes(tree):`
			`if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):`
			`if node.name.startswith('_'):`
			`continue`
			`doc = ast.get_docstring(node) or ''`
			`if not doc:`
			`continue`

			`# Extract the function source`
			`lines = source.split('\n')`
			`start = node.lineno - 1`
			`end = node.end_lineno if hasattr(node, 'end_lineno') else start + 20`
			`func_source = '\n'.join(lines[start:end])`

			`args = [a.arg for a in node.args.args if a.arg != 'self']`
			`module_name = filepath.stem`

			`pairs.append({`
			'instruction': f"Show how to implement the `{node.name}` function in core/{module_name}.py",
			`'input': f"Function: {node.name}({', '.join(args)})\nDocstring: {doc.split(chr(10))[0]}",`
			`'output': func_source,`
			`'type': 'api_reference',`
			`'source_file': f"core/{filepath.name}",`
			`})`

			`return pairs`


			`def build_training_data():`
			`"""Generate training data from the codebase."""`
			`print("[training] Scanning codebase for training pairs...")`

			`pairs = []`

			`# Module pairs`
			`modules_dir = FRAMEWORK_DIR / 'modules'`
			`for f in sorted(modules_dir.glob('*.py')):`
			`if f.name == '__init__.py':`
			`continue`
			`pair = extract_module_pair(f)`
			`if pair:`
			`pairs.append(pair)`

			`module_count = len(pairs)`
			`print(f" Modules: {module_count} pairs")`

			`# Route pairs`
			`routes_dir = FRAMEWORK_DIR / 'web' / 'routes'`
			`for f in sorted(routes_dir.glob('*.py')):`
			`if f.name == '__init__.py':`
			`continue`
			`pair = extract_route_pair(f)`
			`if pair:`
			`pairs.append(pair)`

			`route_count = len(pairs) - module_count`
			`print(f" Routes: {route_count} pairs")`

			`# Template pairs`
			`templates_dir = FRAMEWORK_DIR / 'web' / 'templates'`
			`for f in sorted(templates_dir.glob('*.html')):`
			`pair = extract_template_pair(f)`
			`if pair:`
			`pairs.append(pair)`

			`template_count = len(pairs) - module_count - route_count`
			`print(f" Templates: {template_count} pairs")`

			`# Core API pairs`
			`core_dir = FRAMEWORK_DIR / 'core'`
			`api_start = len(pairs)`
			`for f in sorted(core_dir.glob('*.py')):`
			`if f.name == '__init__.py':`
			`continue`
			`api_pairs = extract_core_api_pairs(f)`
			`pairs.extend(api_pairs)`

			`api_count = len(pairs) - api_start`
			`print(f" Core API: {api_count} pairs")`

			`# Write JSONL`
			`OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)`
			`with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:`
			`for pair in pairs:`
			`f.write(json.dumps(pair, ensure_ascii=False) + '\n')`

			`total_size = OUTPUT_PATH.stat().st_size`
			`print(f"\n[training] Written {len(pairs)} training pairs ({total_size:,} bytes) to {OUTPUT_PATH}")`
			`print(f"[training] Breakdown: {module_count} modules, {route_count} routes, "`
			`f"{template_count} templates, {api_count} core API functions")`

			`# Also output a summary`
			`summary_path = OUTPUT_PATH.with_suffix('.summary.json')`
			`summary = {`
			`'generated': datetime.now().isoformat(),`
			`'total_pairs': len(pairs),`
			`'modules': module_count,`
			`'routes': route_count,`
			`'templates': template_count,`
			`'core_api': api_count,`
			`'output_bytes': total_size,`
			`'types': {},`
			`}`
			`for p in pairs:`
			`t = p['type']`
			`summary['types'][t] = summary['types'].get(t, 0) + 1`
			`summary_path.write_text(json.dumps(summary, indent=2), encoding='utf-8')`
			`print(f"[training] Summary: {summary_path}")`


			`if __name__ == '__main__':`
			`build_training_data()`