296 lines
10 KiB
Python
296 lines
10 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
AUTARCH LoRA Training Data Generator
|
||
|
|
Extracts instruction/input/output triplets from the codebase
|
||
|
|
for fine-tuning LLMs on AUTARCH module creation patterns.
|
||
|
|
|
||
|
|
Run: python scripts/build_training_data.py
|
||
|
|
Output: data/codex/autarch_training.jsonl
|
||
|
|
|
||
|
|
Generates training pairs for:
|
||
|
|
- Module creation (description → code)
|
||
|
|
- Route creation (feature description → Flask blueprint)
|
||
|
|
- Config patterns (section description → config code)
|
||
|
|
- Template patterns (feature → Jinja2 template)
|
||
|
|
"""
|
||
|
|
|
||
|
|
import ast
|
||
|
|
import json
|
||
|
|
import sys
|
||
|
|
import re
|
||
|
|
from pathlib import Path
|
||
|
|
from datetime import datetime
|
||
|
|
|
||
|
|
# Framework root, taken as two levels above this file (the module docstring
# runs it as scripts/build_training_data.py). resolve() makes the path
# absolute first, so a relative __file__ cannot collapse parent.parent to '.'.
FRAMEWORK_DIR = Path(__file__).resolve().parent.parent
# Put the framework root first on the import path.
sys.path.insert(0, str(FRAMEWORK_DIR))

# Destination of the generated JSONL training set.
OUTPUT_PATH = FRAMEWORK_DIR / 'data' / 'codex' / 'autarch_training.jsonl'
|
||
|
|
|
||
|
|
|
||
|
|
def extract_module_pair(filepath: Path) -> 'dict | None':
    """Extract a module-creation training pair from a module file.

    The file is parsed with ``ast`` and every assignment of a constant to a
    plain name is checked for ``DESCRIPTION`` and ``CATEGORY``. Assignments
    anywhere in the file count, and the last one visited wins.

    Args:
        filepath: Path of the module source file. It must be located under
            ``FRAMEWORK_DIR`` so a relative ``source_file`` can be recorded.

    Returns:
        A dict with ``instruction``, ``input``, ``output`` (the full file
        source), ``type``, ``category`` and ``source_file`` keys, or ``None``
        when the file cannot be read or parsed, or when it does not define
        both ``DESCRIPTION`` and ``CATEGORY`` as non-empty strings.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        tree = ast.parse(source)
    except (OSError, SyntaxError, ValueError):
        # ValueError also covers UnicodeDecodeError; some interpreter versions
        # raise it from ast.parse for source that contains NUL bytes.
        return None

    metadata = {'DESCRIPTION': None, 'CATEGORY': None}
    for node in ast.walk(tree):
        if not (isinstance(node, ast.Assign) and isinstance(node.value, ast.Constant)):
            continue
        for target in node.targets:
            if isinstance(target, ast.Name) and target.id in metadata:
                metadata[target.id] = node.value.value

    description = metadata['DESCRIPTION']
    category = metadata['CATEGORY']
    # Only non-empty strings are usable: description.lower() below would
    # raise on a numeric or other non-string constant.
    if not (isinstance(description, str) and description):
        return None
    if not (isinstance(category, str) and category):
        return None

    # Build the instruction
    summary = description.lower().rstrip('.')
    instruction = (
        f"Create an AUTARCH module in the '{category}' category that {summary}. "
        f"The module should follow AUTARCH conventions with DESCRIPTION, AUTHOR, VERSION, CATEGORY "
        f"attributes and a run() entry point function."
    )

    return {
        'instruction': instruction,
        'input': f"Module name: {filepath.stem}\nCategory: {category}\nDescription: {description}",
        'output': source,
        'type': 'module_creation',
        'category': category,
        'source_file': str(filepath.relative_to(FRAMEWORK_DIR)),
    }
|
||
|
|
|
||
|
|
|
||
|
|
def extract_route_pair(filepath: Path) -> 'dict | None':
    """Extract a route-creation training pair from a route file.

    A file qualifies when it assigns the result of a ``Blueprint(...)`` call
    (called by bare name or as an attribute) to a plain name. Route handlers
    are counted as ``.route(...)`` decorator calls on top-level functions,
    both ``def`` and ``async def``.

    Args:
        filepath: Path of the route source file. It must be located under
            ``FRAMEWORK_DIR`` so a relative ``source_file`` can be recorded.

    Returns:
        A dict with ``instruction``, ``input``, ``output`` (the full file
        source), ``type`` and ``source_file`` keys, or ``None`` when the file
        cannot be read or parsed or contains no blueprint assignment.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        tree = ast.parse(source)
    except (OSError, SyntaxError, ValueError):
        # ValueError also covers UnicodeDecodeError; some interpreter versions
        # raise it from ast.parse for source that contains NUL bytes.
        return None

    # Find blueprint name and prefix
    bp_name = None
    bp_prefix = None
    for node in ast.walk(tree):
        if not (isinstance(node, ast.Assign) and isinstance(node.value, ast.Call)):
            continue
        func = node.value.func
        if isinstance(func, ast.Name):
            func_name = func.id
        elif isinstance(func, ast.Attribute):
            func_name = func.attr
        else:
            func_name = ''
        if func_name != 'Blueprint':
            continue
        for target in node.targets:
            if not isinstance(target, ast.Name):
                continue
            bp_name = target.id
            for kw in node.value.keywords:
                if kw.arg == 'url_prefix' and isinstance(kw.value, ast.Constant):
                    bp_prefix = kw.value.value

    if not bp_name:
        return None

    # Count routes: one per .route(...) decorator, so a handler registered
    # under two URLs counts twice.
    route_total = 0
    for node in ast.iter_child_nodes(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        for deco in node.decorator_list:
            if (
                isinstance(deco, ast.Call)
                and isinstance(deco.func, ast.Attribute)
                and deco.func.attr == 'route'
            ):
                route_total += 1

    feature_name = filepath.stem.replace('_', ' ').title()
    # The instruction falls back to '/<file stem>' when no literal url_prefix
    # was found; the 'input' field still reports the raw value.
    prefix_hint = bp_prefix or '/' + filepath.stem
    instruction = (
        f"Create a Flask blueprint route file for AUTARCH's '{feature_name}' feature. "
        f"It should have a blueprint with url_prefix='{prefix_hint}', "
        f"use @login_required on all routes, and follow AUTARCH web route conventions. "
        f"It needs {route_total} route handlers."
    )

    return {
        'instruction': instruction,
        'input': f"Feature: {feature_name}\nBlueprint: {bp_name}\nPrefix: {bp_prefix}\nRoutes: {route_total}",
        'output': source,
        'type': 'route_creation',
        'source_file': str(filepath.relative_to(FRAMEWORK_DIR)),
    }
|
||
|
|
|
||
|
|
|
||
|
|
def extract_template_pair(filepath: Path) -> 'dict | None':
    """Extract a template-creation training pair from a template file.

    Only templates containing both ``{% extends`` and ``{% block content %}``
    are used. Simple substring counts describe the page in the ``input``
    field: ``class="section"`` / ``class="tab"`` attributes, ``<form`` tags
    plus ``fetch(`` calls, and whether any ``<script`` tag is present.

    Args:
        filepath: Path of the template file. It must be located under
            ``FRAMEWORK_DIR`` so a relative ``source_file`` can be recorded.

    Returns:
        A dict with ``instruction``, ``input``, ``output`` (the full template
        source), ``type`` and ``source_file`` keys, or ``None`` when the file
        cannot be read or lacks the two required markers.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
    except (OSError, ValueError):
        return None

    if '{% extends' not in source or '{% block content %}' not in source:
        return None

    # Count sections, tabs, buttons, forms
    sections = source.count('class="section"') + source.count("class='section'")
    tabs = source.count('class="tab"') + source.count("class='tab'")
    forms = source.count('<form') + source.count('fetch(')
    # Match the tag prefix so script tags with attributes
    # (<script type="module">, <script src="...">) are detected as well.
    has_script = '<script' in source

    feature_name = filepath.stem.replace('_', ' ').title()
    instruction = (
        f"Create an AUTARCH web template for the '{feature_name}' page. "
        f"It should extend base.html, have a page header, and use AUTARCH's "
        f"CSS variables and UI patterns (sections, tab bars, data tables, buttons)."
    )

    return {
        'instruction': instruction,
        'input': (
            f"Template: {filepath.name}\n"
            f"Sections: {sections}\nTabs: {tabs}\nForms/API calls: {forms}\n"
            f"Has JavaScript: {has_script}"
        ),
        'output': source,
        'type': 'template_creation',
        'source_file': str(filepath.relative_to(FRAMEWORK_DIR)),
    }
|
||
|
|
|
||
|
|
|
||
|
|
def extract_core_api_pairs(filepath: Path) -> list:
    """Extract training pairs showing how to use core APIs.

    One pair is produced for every top-level function (``def`` or
    ``async def``) whose name does not start with an underscore and that has
    a docstring. The pair's ``output`` is the function's source text,
    including any decorator lines.

    Args:
        filepath: Path of the source file to scan. Only its name and stem are
            used to label the pairs (always under ``core/``).

    Returns:
        A list of dicts with ``instruction``, ``input``, ``output``, ``type``
        and ``source_file`` keys; empty when the file cannot be read or
        parsed, or has no qualifying functions.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        tree = ast.parse(source)
    except (OSError, SyntaxError, ValueError):
        # ValueError also covers UnicodeDecodeError; some interpreter versions
        # raise it from ast.parse for source that contains NUL bytes.
        return []

    # Split once: the same line table serves every function in the file.
    lines = source.split('\n')
    module_name = filepath.stem
    pairs = []

    for node in ast.iter_child_nodes(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        if node.name.startswith('_'):
            continue
        doc = ast.get_docstring(node) or ''
        if not doc:
            continue

        # Extract the function source. Start at the earliest decorator line
        # so decorators are kept in the emitted snippet.
        first_line = min([node.lineno] + [deco.lineno for deco in node.decorator_list])
        last_line = getattr(node, 'end_lineno', None)
        if last_line is None:
            # No end position recorded on the node: take a fixed 20-line window.
            last_line = node.lineno - 1 + 20
        func_source = '\n'.join(lines[first_line - 1:last_line])

        args = [a.arg for a in node.args.args if a.arg != 'self']
        summary = doc.split('\n', 1)[0]

        pairs.append({
            'instruction': f"Show how to implement the `{node.name}` function in core/{module_name}.py",
            'input': f"Function: {node.name}({', '.join(args)})\nDocstring: {summary}",
            'output': func_source,
            'type': 'api_reference',
            'source_file': f"core/{filepath.name}",
        })

    return pairs
|
||
|
|
|
||
|
|
|
||
|
|
def build_training_data():
    """Generate training data from the codebase.

    Scans ``modules/``, ``web/routes/``, ``web/templates/`` and ``core/``
    under ``FRAMEWORK_DIR``, writes every extracted pair as one JSON object
    per line to ``OUTPUT_PATH``, and writes a ``.summary.json`` file with the
    per-category counts next to it. Progress is printed to stdout.
    """
    print("[training] Scanning codebase for training pairs...")

    def python_sources(directory):
        # Sorted *.py files directly inside `directory`, minus __init__.py.
        return [path for path in sorted(directory.glob('*.py')) if path.name != '__init__.py']

    # Module pairs
    module_pairs = [
        found
        for found in map(extract_module_pair, python_sources(FRAMEWORK_DIR / 'modules'))
        if found
    ]
    module_count = len(module_pairs)
    print(f" Modules: {module_count} pairs")

    # Route pairs
    route_pairs = [
        found
        for found in map(extract_route_pair, python_sources(FRAMEWORK_DIR / 'web' / 'routes'))
        if found
    ]
    route_count = len(route_pairs)
    print(f" Routes: {route_count} pairs")

    # Template pairs
    template_files = sorted((FRAMEWORK_DIR / 'web' / 'templates').glob('*.html'))
    template_pairs = [found for found in map(extract_template_pair, template_files) if found]
    template_count = len(template_pairs)
    print(f" Templates: {template_count} pairs")

    # Core API pairs (each file may contribute several)
    api_pairs = [
        found
        for path in python_sources(FRAMEWORK_DIR / 'core')
        for found in extract_core_api_pairs(path)
    ]
    api_count = len(api_pairs)
    print(f" Core API: {api_count} pairs")

    pairs = module_pairs + route_pairs + template_pairs + api_pairs

    # Write JSONL
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        out.writelines(json.dumps(record, ensure_ascii=False) + '\n' for record in pairs)

    total_size = OUTPUT_PATH.stat().st_size
    print(f"\n[training] Written {len(pairs)} training pairs ({total_size:,} bytes) to {OUTPUT_PATH}")
    print(f"[training] Breakdown: {module_count} modules, {route_count} routes, "
          f"{template_count} templates, {api_count} core API functions")

    # Also output a summary
    type_counts = {}
    for record in pairs:
        kind = record['type']
        type_counts[kind] = type_counts.get(kind, 0) + 1

    summary_path = OUTPUT_PATH.with_suffix('.summary.json')
    summary = {
        'generated': datetime.now().isoformat(),
        'total_pairs': len(pairs),
        'modules': module_count,
        'routes': route_count,
        'templates': template_count,
        'core_api': api_count,
        'output_bytes': total_size,
        'types': type_counts,
    }
    summary_path.write_text(json.dumps(summary, indent=2), encoding='utf-8')
    print(f"[training] Summary: {summary_path}")
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: generate the training data when run directly.
if __name__ == '__main__':
    build_training_data()
|