#!/usr/bin/env python3
"""
AUTARCH LoRA Training Data Generator

Extracts instruction/input/output triplets from the codebase for
fine-tuning LLMs on AUTARCH module creation patterns.

Run:    python scripts/build_training_data.py
Output: data/codex/autarch_training.jsonl

Generates training pairs for:
- Module creation (description → code)
- Route creation (feature description → Flask blueprint)
- Config patterns (section description → config code)
- Template patterns (feature → Jinja2 template)
"""

import ast
import json
import re
import sys
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple

FRAMEWORK_DIR = Path(__file__).parent.parent
sys.path.insert(0, str(FRAMEWORK_DIR))

OUTPUT_PATH = FRAMEWORK_DIR / 'data' / 'codex' / 'autarch_training.jsonl'


def _load_python(filepath: Path) -> Optional[Tuple[str, ast.Module]]:
    """Read *filepath* and parse it; return ``(source, tree)`` or ``None``.

    Extraction is best-effort: a file that cannot be read or parsed is
    skipped instead of aborting the whole scan.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        return source, ast.parse(source)
    except (OSError, SyntaxError, ValueError):
        # OSError: unreadable/missing file.  ValueError: ast.parse rejects
        # source containing null bytes on Python < 3.12 (SyntaxError later).
        return None


def _relative_source(filepath: Path) -> str:
    """Return *filepath* relative to FRAMEWORK_DIR, or as given if outside it."""
    try:
        return str(filepath.relative_to(FRAMEWORK_DIR))
    except ValueError:
        return str(filepath)


def _python_files(directory: Path) -> list:
    """Return the sorted ``*.py`` files of *directory*, minus ``__init__.py``."""
    return [f for f in sorted(directory.glob('*.py')) if f.name != '__init__.py']


def _call_name(call: ast.Call) -> str:
    """Return the called name for ``f(...)`` / ``x.f(...)``, else ``''``."""
    func = call.func
    if isinstance(func, ast.Name):
        return func.id
    if isinstance(func, ast.Attribute):
        return func.attr
    return ''


def extract_module_pair(filepath: Path) -> Optional[dict]:
    """Extract a training pair from a module file.

    The file must assign constant values to ``DESCRIPTION`` and ``CATEGORY``;
    when a name is assigned more than once, the last assignment visited by
    ``ast.walk`` wins.

    Returns:
        A ``module_creation`` pair whose ``output`` is the full file source,
        or ``None`` if the file cannot be read/parsed, lacks either
        attribute, or has a non-string ``DESCRIPTION``.
    """
    loaded = _load_python(filepath)
    if loaded is None:
        return None
    source, tree = loaded

    description = None
    category = None
    for node in ast.walk(tree):
        if not isinstance(node, ast.Assign) or not isinstance(node.value, ast.Constant):
            continue
        for target in node.targets:
            if not isinstance(target, ast.Name):
                continue
            if target.id == 'DESCRIPTION':
                description = node.value.value
            elif target.id == 'CATEGORY':
                category = node.value.value

    # DESCRIPTION is lower-cased below, so it has to be a real string.
    if not description or not category or not isinstance(description, str):
        return None

    # Build the instruction
    instruction = (
        f"Create an AUTARCH module in the '{category}' category that {description.lower().rstrip('.')}. "
        f"The module should follow AUTARCH conventions with DESCRIPTION, AUTHOR, VERSION, CATEGORY "
        f"attributes and a run() entry point function."
    )

    return {
        'instruction': instruction,
        'input': f"Module name: {filepath.stem}\nCategory: {category}\nDescription: {description}",
        'output': source,
        'type': 'module_creation',
        'category': category,
        'source_file': _relative_source(filepath),
    }


def extract_route_pair(filepath: Path) -> Optional[dict]:
    """Extract a training pair from a route file.

    A route file is one that assigns the result of a ``Blueprint(...)`` call
    to a name.  Route handlers are top-level functions (sync or async)
    carrying a ``@<something>.route(...)`` decorator; each such decorator is
    counted once.

    Returns:
        A ``route_creation`` pair whose ``output`` is the full file source,
        or ``None`` if the file cannot be read/parsed or defines no blueprint.
    """
    loaded = _load_python(filepath)
    if loaded is None:
        return None
    source, tree = loaded

    # Find blueprint name and prefix
    bp_name = None
    bp_prefix = None
    for node in ast.walk(tree):
        if not isinstance(node, ast.Assign) or not isinstance(node.value, ast.Call):
            continue
        if _call_name(node.value) != 'Blueprint':
            continue
        for target in node.targets:
            if not isinstance(target, ast.Name):
                continue
            bp_name = target.id
            for kw in node.value.keywords:
                if kw.arg == 'url_prefix' and isinstance(kw.value, ast.Constant):
                    bp_prefix = kw.value.value

    if not bp_name:
        return None

    # Count routes
    routes = []
    for node in ast.iter_child_nodes(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        for deco in node.decorator_list:
            is_route = (
                isinstance(deco, ast.Call)
                and isinstance(deco.func, ast.Attribute)
                and deco.func.attr == 'route'
            )
            if is_route:
                doc = ast.get_docstring(node) or ''
                routes.append({
                    'handler': node.name,
                    'doc': doc.split('\n')[0] if doc else '',
                })

    feature_name = filepath.stem.replace('_', ' ').title()
    instruction = (
        f"Create a Flask blueprint route file for AUTARCH's '{feature_name}' feature. "
        f"It should have a blueprint with url_prefix='{bp_prefix or '/' + filepath.stem}', "
        f"use @login_required on all routes, and follow AUTARCH web route conventions. "
        f"It needs {len(routes)} route handlers."
    )

    return {
        'instruction': instruction,
        'input': f"Feature: {feature_name}\nBlueprint: {bp_name}\nPrefix: {bp_prefix}\nRoutes: {len(routes)}",
        'output': source,
        'type': 'route_creation',
        'source_file': _relative_source(filepath),
    }


def extract_template_pair(filepath: Path) -> Optional[dict]:
    """Extract a training pair from a template file.

    Only templates containing both ``{% extends`` and
    ``{% block content %}`` are used; anything else returns ``None``.

    NOTE(review): the copy of this file under review was truncated inside
    this function (an HTML-like span was stripped during extraction).
    Everything from the ``forms`` count onwards is a reconstruction modelled
    on the sibling extractors -- confirm the count needle, instruction/input
    wording and the ``type`` value against the original source.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
    except OSError:
        return None

    if '{% extends' not in source or '{% block content %}' not in source:
        return None

    # Count sections, tabs, buttons, forms
    sections = source.count('class="section"') + source.count("class='section'")
    tabs = source.count('class="tab"') + source.count("class='tab'")
    forms = source.count('<form')  # NOTE(review): needle reconstructed -- confirm

    # NOTE(review): reconstructed from here to the end of the function.
    feature_name = filepath.stem.replace('_', ' ').title()
    instruction = (
        f"Create a Jinja2 template for AUTARCH's '{feature_name}' page. "
        f"It should extend the base layout and fill the content block, "
        f"with {sections} sections, {tabs} tabs and {forms} forms."
    )

    return {
        'instruction': instruction,
        'input': f"Template: {filepath.name}\nSections: {sections}\nTabs: {tabs}\nForms: {forms}",
        'output': source,
        'type': 'template_creation',
        'source_file': _relative_source(filepath),
    }


def extract_core_api_pairs(filepath: Path) -> list:
    """Extract training pairs showing how to use core APIs.

    One pair is produced per public (no leading underscore), documented,
    top-level function.  The ``output`` snippet runs from the function's
    first decorator (or its ``def`` line) through its last line.

    Returns:
        A list of ``api_reference`` pairs; empty if the file cannot be
        read/parsed or has no qualifying functions.
    """
    loaded = _load_python(filepath)
    if loaded is None:
        return []
    source, tree = loaded

    pairs = []
    lines = source.split('\n')  # split once, not once per function
    module_name = filepath.stem

    for node in ast.iter_child_nodes(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        if node.name.startswith('_'):
            continue
        doc = ast.get_docstring(node) or ''
        if not doc:
            continue

        # Extract the function source.  node.lineno is the `def` line, so
        # start at the earliest decorator to keep decorators in the snippet.
        first_line = min([node.lineno] + [d.lineno for d in node.decorator_list])
        start = first_line - 1
        end = getattr(node, 'end_lineno', None)
        if end is None:
            # No end position available: fall back to a 20-line window.
            end = node.lineno - 1 + 20
        func_source = '\n'.join(lines[start:end])

        args = [a.arg for a in node.args.args if a.arg != 'self']
        first_doc_line = doc.split('\n')[0]

        pairs.append({
            'instruction': f"Show how to implement the `{node.name}` function in core/{module_name}.py",
            'input': f"Function: {node.name}({', '.join(args)})\nDocstring: {first_doc_line}",
            'output': func_source,
            'type': 'api_reference',
            'source_file': f"core/{filepath.name}",
        })

    return pairs


def build_training_data():
    """Generate training data from the codebase.

    Scans ``modules/``, ``web/routes/``, ``web/templates/`` and ``core/``
    under FRAMEWORK_DIR, writes one JSON object per line to OUTPUT_PATH, and
    writes a ``.summary.json`` file with per-source and per-type counts next
    to it.  Progress is printed to stdout.
    """
    print("[training] Scanning codebase for training pairs...")

    pairs = []

    # Module pairs
    for f in _python_files(FRAMEWORK_DIR / 'modules'):
        pair = extract_module_pair(f)
        if pair:
            pairs.append(pair)
    module_count = len(pairs)
    print(f" Modules: {module_count} pairs")

    # Route pairs
    for f in _python_files(FRAMEWORK_DIR / 'web' / 'routes'):
        pair = extract_route_pair(f)
        if pair:
            pairs.append(pair)
    route_count = len(pairs) - module_count
    print(f" Routes: {route_count} pairs")

    # Template pairs
    templates_dir = FRAMEWORK_DIR / 'web' / 'templates'
    for f in sorted(templates_dir.glob('*.html')):
        pair = extract_template_pair(f)
        if pair:
            pairs.append(pair)
    template_count = len(pairs) - module_count - route_count
    print(f" Templates: {template_count} pairs")

    # Core API pairs
    api_start = len(pairs)
    for f in _python_files(FRAMEWORK_DIR / 'core'):
        pairs.extend(extract_core_api_pairs(f))
    api_count = len(pairs) - api_start
    print(f" Core API: {api_count} pairs")

    # Write JSONL
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(pair, ensure_ascii=False) + '\n' for pair in pairs)

    total_size = OUTPUT_PATH.stat().st_size
    print(f"\n[training] Written {len(pairs)} training pairs ({total_size:,} bytes) to {OUTPUT_PATH}")
    print(f"[training] Breakdown: {module_count} modules, {route_count} routes, "
          f"{template_count} templates, {api_count} core API functions")

    # Also output a summary
    summary_path = OUTPUT_PATH.with_suffix('.summary.json')
    summary = {
        'generated': datetime.now().isoformat(),
        'total_pairs': len(pairs),
        'modules': module_count,
        'routes': route_count,
        'templates': template_count,
        'core_api': api_count,
        'output_bytes': total_size,
        # Counter keeps first-seen order, matching a manual tally.
        'types': dict(Counter(p['type'] for p in pairs)),
    }
    summary_path.write_text(json.dumps(summary, indent=2), encoding='utf-8')
    print(f"[training] Summary: {summary_path}")


if __name__ == '__main__':
    build_training_data()