Files
autarch/scripts/build_training_data.py

296 lines
10 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
"""
AUTARCH LoRA Training Data Generator
Extracts instruction/input/output triplets from the codebase
for fine-tuning LLMs on AUTARCH module creation patterns.
Run: python scripts/build_training_data.py
Output: data/codex/autarch_training.jsonl
Generates training pairs for:
- Module creation (description -> code)
- Route creation (feature description -> Flask blueprint)
- Config patterns (section description -> config code)
- Template patterns (feature -> Jinja2 template)
"""
import ast
import json
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
# Directory two levels above this file; the module docstring runs the script
# as scripts/build_training_data.py, which makes this the repository root.
FRAMEWORK_DIR = Path(__file__).parent.parent
# Put that directory first on sys.path. NOTE(review): presumably so project
# packages can be imported; no such import appears in the code visible here.
sys.path.insert(0, str(FRAMEWORK_DIR))
# Destination of the generated JSONL training file.
OUTPUT_PATH = FRAMEWORK_DIR / 'data' / 'codex' / 'autarch_training.jsonl'
def extract_module_pair(filepath: Path) -> Optional[dict]:
    """Build a module-creation training pair from one module file.

    The file is parsed with ``ast`` and every assignment of a constant to a
    plain name ``DESCRIPTION`` or ``CATEGORY`` is recorded. ``ast.walk`` is
    used, so assignments anywhere in the file are seen, and the last one
    visited wins.

    Args:
        filepath: Path of the module's ``.py`` file. It must live under
            ``FRAMEWORK_DIR`` because the pair records a relative path.

    Returns:
        A dict with the keys ``instruction``, ``input``, ``output`` (the
        complete file source), ``type``, ``category`` and ``source_file``,
        or ``None`` when the file cannot be read or parsed, when
        ``DESCRIPTION`` is not a non-empty string, or when ``CATEGORY`` is
        missing or empty.

    Raises:
        ValueError: If ``filepath`` is not located under ``FRAMEWORK_DIR``.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        tree = ast.parse(source)
    except (OSError, SyntaxError, ValueError):
        # OSError: missing or unreadable file.
        # ValueError: some Python versions raise it from ast.parse for source
        # containing null bytes (it also covers UnicodeDecodeError).
        return None

    wanted = ('DESCRIPTION', 'CATEGORY')
    metadata = {}
    for node in ast.walk(tree):
        if not isinstance(node, ast.Assign):
            continue
        if not isinstance(node.value, ast.Constant):
            continue
        for target in node.targets:
            if isinstance(target, ast.Name) and target.id in wanted:
                metadata[target.id] = node.value.value

    description = metadata.get('DESCRIPTION')
    category = metadata.get('CATEGORY')
    # The description is lower-cased below, so anything but a real string
    # (e.g. DESCRIPTION = 123) has to be rejected rather than crash the run.
    if not isinstance(description, str) or not description:
        return None
    if not category:
        return None

    # Build the instruction
    instruction = (
        f"Create an AUTARCH module in the '{category}' category that {description.lower().rstrip('.')}. "
        f"The module should follow AUTARCH conventions with DESCRIPTION, AUTHOR, VERSION, CATEGORY "
        f"attributes and a run() entry point function."
    )
    return {
        'instruction': instruction,
        'input': f"Module name: {filepath.stem}\nCategory: {category}\nDescription: {description}",
        'output': source,
        'type': 'module_creation',
        'category': category,
        'source_file': str(filepath.relative_to(FRAMEWORK_DIR)),
    }
def extract_route_pair(filepath: Path) -> Optional[dict]:
    """Build a route-creation training pair from one Flask route file.

    The file must assign the result of a ``Blueprint(...)`` call (bare or
    attribute-qualified, e.g. ``flask.Blueprint``) to a plain name. Routes
    are counted as ``<something>.route(...)`` decorators on top-level
    ``def`` and ``async def`` functions.

    Args:
        filepath: Path of the route ``.py`` file. It must live under
            ``FRAMEWORK_DIR`` because the pair records a relative path.

    Returns:
        A dict with the keys ``instruction``, ``input``, ``output`` (the
        complete file source), ``type`` and ``source_file``, or ``None``
        when the file cannot be read or parsed or defines no blueprint.

    Raises:
        ValueError: If ``filepath`` is not located under ``FRAMEWORK_DIR``.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        tree = ast.parse(source)
    except (OSError, SyntaxError, ValueError):
        # OSError: missing or unreadable file.
        # ValueError: some Python versions raise it from ast.parse for source
        # containing null bytes (it also covers UnicodeDecodeError).
        return None

    # Find blueprint name and prefix
    bp_name = None
    bp_prefix = None
    for node in ast.walk(tree):
        if not isinstance(node, ast.Assign) or not isinstance(node.value, ast.Call):
            continue
        callee = node.value.func
        if isinstance(callee, ast.Name):
            callee_name = callee.id
        elif isinstance(callee, ast.Attribute):
            callee_name = callee.attr
        else:
            continue
        if callee_name != 'Blueprint':
            continue
        for target in node.targets:
            if not isinstance(target, ast.Name):
                continue
            bp_name = target.id
            for kw in node.value.keywords:
                if kw.arg == 'url_prefix' and isinstance(kw.value, ast.Constant):
                    bp_prefix = kw.value.value
    if not bp_name:
        return None

    # Count routes. Async handlers are ordinary Flask views too, so both
    # function node types are inspected.
    route_count = 0
    for node in ast.iter_child_nodes(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        for deco in node.decorator_list:
            if not isinstance(deco, ast.Call):
                continue
            if isinstance(deco.func, ast.Attribute) and deco.func.attr == 'route':
                route_count += 1

    feature_name = filepath.stem.replace('_', ' ').title()
    # The instruction falls back to '/<file stem>' when no constant
    # url_prefix was found; the 'input' field reports the raw value.
    shown_prefix = bp_prefix or '/' + filepath.stem
    instruction = (
        f"Create a Flask blueprint route file for AUTARCH's '{feature_name}' feature. "
        f"It should have a blueprint with url_prefix='{shown_prefix}', "
        f"use @login_required on all routes, and follow AUTARCH web route conventions. "
        f"It needs {route_count} route handlers."
    )
    return {
        'instruction': instruction,
        'input': f"Feature: {feature_name}\nBlueprint: {bp_name}\nPrefix: {bp_prefix}\nRoutes: {route_count}",
        'output': source,
        'type': 'route_creation',
        'source_file': str(filepath.relative_to(FRAMEWORK_DIR)),
    }
def extract_template_pair(filepath: Path) -> Optional[dict]:
    """Build a template-creation training pair from one HTML template.

    Only templates containing both ``{% extends`` and
    ``{% block content %}`` are used. The ``input`` field carries simple
    substring counts taken from the template text.

    Args:
        filepath: Path of the template file. It must live under
            ``FRAMEWORK_DIR`` because the pair records a relative path.

    Returns:
        A dict with the keys ``instruction``, ``input``, ``output`` (the
        complete template source), ``type`` and ``source_file``, or ``None``
        when the file cannot be read or lacks the required Jinja markers.

    Raises:
        ValueError: If ``filepath`` is not located under ``FRAMEWORK_DIR``.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
    except OSError:
        # With errors='ignore' decoding cannot fail, so I/O errors are the
        # only expected failure; anything else should surface as a bug.
        return None

    if '{% extends' not in source or '{% block content %}' not in source:
        return None

    # Count sections, tabs, buttons, forms
    sections = source.count('class="section"') + source.count("class='section'")
    tabs = source.count('class="tab"') + source.count("class='tab'")
    forms = source.count('<form') + source.count('fetch(')
    # Match the tag prefix so <script src=...> and <script type=...> are
    # detected as well as a bare <script>.
    has_script = '<script' in source

    feature_name = filepath.stem.replace('_', ' ').title()
    instruction = (
        f"Create an AUTARCH web template for the '{feature_name}' page. "
        f"It should extend base.html, have a page header, and use AUTARCH's "
        f"CSS variables and UI patterns (sections, tab bars, data tables, buttons)."
    )
    return {
        'instruction': instruction,
        'input': (
            f"Template: {filepath.name}\n"
            f"Sections: {sections}\nTabs: {tabs}\nForms/API calls: {forms}\n"
            f"Has JavaScript: {has_script}"
        ),
        'output': source,
        'type': 'template_creation',
        'source_file': str(filepath.relative_to(FRAMEWORK_DIR)),
    }
def extract_core_api_pairs(filepath: Path) -> list:
    """Build API-reference training pairs from one core module.

    One pair is produced for every top-level ``def`` / ``async def`` whose
    name does not start with an underscore and which has a docstring.
    Functions nested in classes or other functions are not visited.

    Args:
        filepath: Path of the ``.py`` file to scan.

    Returns:
        A list of dicts with the keys ``instruction``, ``input``,
        ``output`` (the function's source lines, starting at its ``def``
        line), ``type`` and ``source_file``. The list is empty when the file
        cannot be read or parsed.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        tree = ast.parse(source)
    except (OSError, SyntaxError, ValueError):
        # OSError: missing or unreadable file.
        # ValueError: some Python versions raise it from ast.parse for source
        # containing null bytes (it also covers UnicodeDecodeError).
        return []

    # Split once for the whole file; node.lineno is a 1-based index into it.
    lines = source.split('\n')
    module_name = filepath.stem
    pairs = []
    for node in ast.iter_child_nodes(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        if node.name.startswith('_'):
            continue
        doc = ast.get_docstring(node)
        if not doc:
            continue

        # Extract the function source
        start = node.lineno - 1
        # Fall back to a fixed 20-line window when no end line is recorded.
        end = getattr(node, 'end_lineno', None) or start + 20
        func_source = '\n'.join(lines[start:end])

        signature = ', '.join(_signature_names(node.args))
        first_doc_line = doc.split('\n')[0]
        pairs.append({
            'instruction': f"Show how to implement the `{node.name}` function in core/{module_name}.py",
            'input': f"Function: {node.name}({signature})\nDocstring: {first_doc_line}",
            'output': func_source,
            'type': 'api_reference',
            'source_file': f"core/{filepath.name}",
        })
    return pairs


def _signature_names(arguments: ast.arguments) -> list:
    """Return parameter names in declaration order, omitting ``self``.

    ``*args`` / ``**kwargs`` keep their star prefix, and a bare ``*`` is
    inserted before keyword-only parameters when there is no ``*args``.
    """
    positional = list(getattr(arguments, 'posonlyargs', [])) + list(arguments.args)
    names = [param.arg for param in positional if param.arg != 'self']
    if arguments.vararg is not None:
        names.append('*' + arguments.vararg.arg)
    elif arguments.kwonlyargs:
        names.append('*')
    names.extend(param.arg for param in arguments.kwonlyargs)
    if arguments.kwarg is not None:
        names.append('**' + arguments.kwarg.arg)
    return names
def build_training_data():
    """Scan the framework tree, then write the training set and a summary.

    Pairs are gathered, in this order, from ``modules/*.py``,
    ``web/routes/*.py``, ``web/templates/*.html`` and ``core/*.py`` below
    ``FRAMEWORK_DIR`` (``__init__.py`` files are skipped). They are written
    one JSON object per line to ``OUTPUT_PATH``; a JSON summary with the
    per-source and per-type counts is written next to it. Progress is
    printed to stdout.
    """
    print("[training] Scanning codebase for training pairs...")

    def python_sources(directory):
        # Sorted *.py files of `directory`, without the package marker.
        return [path for path in sorted(directory.glob('*.py'))
                if path.name != '__init__.py']

    # Module pairs
    module_pairs = []
    for path in python_sources(FRAMEWORK_DIR / 'modules'):
        found = extract_module_pair(path)
        if found:
            module_pairs.append(found)
    module_count = len(module_pairs)
    print(f" Modules: {module_count} pairs")

    # Route pairs
    route_pairs = []
    for path in python_sources(FRAMEWORK_DIR / 'web' / 'routes'):
        found = extract_route_pair(path)
        if found:
            route_pairs.append(found)
    route_count = len(route_pairs)
    print(f" Routes: {route_count} pairs")

    # Template pairs
    template_pairs = []
    for path in sorted((FRAMEWORK_DIR / 'web' / 'templates').glob('*.html')):
        found = extract_template_pair(path)
        if found:
            template_pairs.append(found)
    template_count = len(template_pairs)
    print(f" Templates: {template_count} pairs")

    # Core API pairs
    api_pairs = []
    for path in python_sources(FRAMEWORK_DIR / 'core'):
        api_pairs.extend(extract_core_api_pairs(path))
    api_count = len(api_pairs)
    print(f" Core API: {api_count} pairs")

    all_pairs = module_pairs + route_pairs + template_pairs + api_pairs

    # Write JSONL
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in all_pairs
        )

    total_size = OUTPUT_PATH.stat().st_size
    print(f"\n[training] Written {len(all_pairs)} training pairs ({total_size:,} bytes) to {OUTPUT_PATH}")
    print(f"[training] Breakdown: {module_count} modules, {route_count} routes, "
          f"{template_count} templates, {api_count} core API functions")

    # Also output a summary
    summary_path = OUTPUT_PATH.with_suffix('.summary.json')
    summary = {
        'generated': datetime.now().isoformat(),
        'total_pairs': len(all_pairs),
        'modules': module_count,
        'routes': route_count,
        'templates': template_count,
        'core_api': api_count,
        'output_bytes': total_size,
        'types': {},
    }
    # Tally pairs per 'type' value, keeping first-seen order.
    per_type = summary['types']
    for record in all_pairs:
        kind = record['type']
        per_type.setdefault(kind, 0)
        per_type[kind] += 1

    summary_path.write_text(json.dumps(summary, indent=2), encoding='utf-8')
    print(f"[training] Summary: {summary_path}")
# Generate the training data only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    build_training_data()