#!/usr/bin/env python3
"""
AUTARCH LoRA Training Data Generator
Extracts instruction/input/output triplets from the codebase
for fine-tuning LLMs on AUTARCH module creation patterns.

Run: python scripts/build_training_data.py
Output: data/codex/autarch_training.jsonl

Generates training pairs for:
- Module creation (description → code)
- Route creation (feature description → Flask blueprint)
- Config patterns (section description → config code)
- Template patterns (feature → Jinja2 template)
"""

import ast
import json
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

FRAMEWORK_DIR = Path(__file__).parent.parent
sys.path.insert(0, str(FRAMEWORK_DIR))

OUTPUT_PATH = FRAMEWORK_DIR / 'data' / 'codex' / 'autarch_training.jsonl'


def _read_and_parse(filepath: Path) -> Optional[tuple]:
    """Read *filepath* and parse it as Python source.

    Returns:
        A ``(source, tree)`` tuple, or ``None`` when the file cannot be read
        or is not parseable Python.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        tree = ast.parse(source)
    except (OSError, SyntaxError, ValueError):
        # OSError: unreadable/missing file. ValueError: raised by ast.parse on
        # some interpreters for NUL bytes; it also covers UnicodeDecodeError.
        return None
    return source, tree


def _relative_source_path(filepath: Path) -> str:
    """Return *filepath* relative to FRAMEWORK_DIR, or unchanged if outside it."""
    try:
        return str(filepath.relative_to(FRAMEWORK_DIR))
    except ValueError:
        # relative_to() raises when filepath is not under FRAMEWORK_DIR.
        return str(filepath)


def _is_route_decorator(deco: ast.expr) -> bool:
    """Return True for a call decorator of the form ``@<something>.route(...)``."""
    return isinstance(deco, ast.Call) and getattr(deco.func, 'attr', None) == 'route'


def extract_module_pair(filepath: Path) -> Optional[dict]:
    """Extract a training pair from a module file.

    The file is parsed and scanned for ``DESCRIPTION`` and ``CATEGORY``
    assignments whose right-hand side is a constant. ``ast.walk`` is used, so
    assignments in nested scopes are visited as well; the last one visited
    wins.

    Args:
        filepath: Path to the Python module to inspect.

    Returns:
        A dict with ``instruction``, ``input``, ``output`` (the full file
        source), ``type``, ``category`` and ``source_file`` keys, or ``None``
        when the file cannot be read or parsed, when DESCRIPTION or CATEGORY
        is missing or empty, or when DESCRIPTION is not a string.
    """
    parsed = _read_and_parse(filepath)
    if parsed is None:
        return None
    source, tree = parsed

    metadata = {}
    for node in ast.walk(tree):
        if not isinstance(node, ast.Assign) or not isinstance(node.value, ast.Constant):
            continue
        for target in node.targets:
            if isinstance(target, ast.Name) and target.id in ('DESCRIPTION', 'CATEGORY'):
                metadata[target.id] = node.value.value

    description = metadata.get('DESCRIPTION')
    category = metadata.get('CATEGORY')
    if not description or not category:
        return None
    if not isinstance(description, str):
        # A non-string constant (e.g. DESCRIPTION = 42) has no .lower();
        # treat it as missing metadata rather than crashing.
        return None

    # Build the instruction
    instruction = (
        f"Create an AUTARCH module in the '{category}' category that {description.lower().rstrip('.')}. "
        f"The module should follow AUTARCH conventions with DESCRIPTION, AUTHOR, VERSION, CATEGORY "
        f"attributes and a run() entry point function."
    )

    return {
        'instruction': instruction,
        'input': f"Module name: {filepath.stem}\nCategory: {category}\nDescription: {description}",
        'output': source,
        'type': 'module_creation',
        'category': category,
        'source_file': _relative_source_path(filepath),
    }


def extract_route_pair(filepath: Path) -> Optional[dict]:
    """Extract a training pair from a route file.

    Looks for an assignment of the form ``name = Blueprint(...)`` (or
    ``name = something.Blueprint(...)``) anywhere in the file, then counts the
    module-level functions decorated with ``@<something>.route(...)``.

    Args:
        filepath: Path to the Python route file to inspect.

    Returns:
        A dict with ``instruction``, ``input``, ``output`` (the full file
        source), ``type`` and ``source_file`` keys, or ``None`` when the file
        cannot be read or parsed or contains no Blueprint assignment.
    """
    parsed = _read_and_parse(filepath)
    if parsed is None:
        return None
    source, tree = parsed

    # Find blueprint name and prefix
    bp_name = None
    bp_prefix = None
    for node in ast.walk(tree):
        if not isinstance(node, ast.Assign) or not isinstance(node.value, ast.Call):
            continue
        callee = node.value.func
        if isinstance(callee, ast.Name):
            callee_name = callee.id
        elif isinstance(callee, ast.Attribute):
            callee_name = callee.attr
        else:
            continue
        if callee_name != 'Blueprint':
            continue
        for target in node.targets:
            if not isinstance(target, ast.Name):
                continue
            bp_name = target.id
            for kw in node.value.keywords:
                if kw.arg == 'url_prefix' and isinstance(kw.value, ast.Constant):
                    bp_prefix = kw.value.value

    if not bp_name:
        return None

    # Count route handlers: each module-level function (sync or async) counts
    # once, no matter how many @route decorators are stacked on it.
    handlers = [
        node.name
        for node in ast.iter_child_nodes(tree)
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
        and any(_is_route_decorator(deco) for deco in node.decorator_list)
    ]

    feature_name = filepath.stem.replace('_', ' ').title()
    # Use one effective prefix for both the instruction and the input so the
    # two never disagree (the input used to show "None" when no prefix was set).
    prefix = bp_prefix or '/' + filepath.stem

    instruction = (
        f"Create a Flask blueprint route file for AUTARCH's '{feature_name}' feature. "
        f"It should have a blueprint with url_prefix='{prefix}', "
        f"use @login_required on all routes, and follow AUTARCH web route conventions. "
        f"It needs {len(handlers)} route handlers."
    )

    return {
        'instruction': instruction,
        'input': f"Feature: {feature_name}\nBlueprint: {bp_name}\nPrefix: {prefix}\nRoutes: {len(handlers)}",
        'output': source,
        'type': 'route_creation',
        'source_file': _relative_source_path(filepath),
    }


def extract_template_pair(filepath: Path) -> dict:
    """Extract a training pair from a template file."""
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
    except Exception:
        return None

    if '{% extends' not in source or '{% block content %}' not in source:
        return None

    # Count sections, tabs, buttons, forms
    sections = source.count('class="section"') + source.count("class='section'")
    tabs = source.count('class="tab"') + source.count("class='tab'")
    # NOTE(review): the available source is truncated in the middle of the next
    # statement; the rest of this function was not visible and must be restored
    # from the original file. The truncated fragment was:
    #     forms = source.count('