AUTARCH v1.9 — remote monitoring, SSH manager, daemon, vault, cleanup
- Add Remote Monitoring Station with PIAP device profile system - Add SSH/SSHD manager with fail2ban integration - Add privileged daemon architecture for safe root operations - Add encrypted vault, HAL memory, HAL auto-analyst - Add network security suite, module creator, codex training - Add start.sh launcher script and GTK3 desktop launcher - Remove Output/ build artifacts, installer files, loose docs - Update .gitignore for runtime data and build artifacts - Update README for v1.9 with new launch method, screenshots, and features Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
295
scripts/build_training_data.py
Normal file
295
scripts/build_training_data.py
Normal file
@@ -0,0 +1,295 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AUTARCH LoRA Training Data Generator
|
||||
Extracts instruction/input/output triplets from the codebase
|
||||
for fine-tuning LLMs on AUTARCH module creation patterns.
|
||||
|
||||
Run: python scripts/build_training_data.py
|
||||
Output: data/codex/autarch_training.jsonl
|
||||
|
||||
Generates training pairs for:
|
||||
- Module creation (description → code)
|
||||
- Route creation (feature description → Flask blueprint)
|
||||
- Config patterns (section description → config code)
|
||||
- Template patterns (feature → Jinja2 template)
|
||||
"""
|
||||
|
||||
import ast
|
||||
import json
|
||||
import sys
|
||||
import re
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Repository root: this script lives in <root>/scripts/, so go up two levels.
FRAMEWORK_DIR = Path(__file__).parent.parent
# Make the framework's packages importable when running this script directly.
sys.path.insert(0, str(FRAMEWORK_DIR))

# Destination for the generated JSONL training data.
OUTPUT_PATH = FRAMEWORK_DIR / 'data' / 'codex' / 'autarch_training.jsonl'
|
||||
|
||||
|
||||
def extract_module_pair(filepath: Path) -> Optional[dict]:
    """Build a module-creation training pair from an AUTARCH module file.

    Parses the file with ``ast`` and looks for module-level
    ``DESCRIPTION = "..."`` and ``CATEGORY = "..."`` constant assignments;
    both are required for a pair to be produced.

    Args:
        filepath: Path to a module ``.py`` file.

    Returns:
        An instruction/input/output dict suitable for JSONL training data,
        or ``None`` if the file cannot be parsed or the metadata is missing.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        tree = ast.parse(source)
    except (SyntaxError, UnicodeDecodeError):
        return None

    description = None
    category = None

    # Scan simple constant assignments anywhere in the tree
    # (e.g. DESCRIPTION = "...", CATEGORY = "...").
    for node in ast.walk(tree):
        if isinstance(node, ast.Assign):
            for target in node.targets:
                if isinstance(target, ast.Name) and isinstance(node.value, ast.Constant):
                    if target.id == 'DESCRIPTION':
                        description = node.value.value
                    elif target.id == 'CATEGORY':
                        category = node.value.value

    if not description or not category:
        return None

    # Build the instruction
    instruction = (
        f"Create an AUTARCH module in the '{category}' category that {description.lower().rstrip('.')}. "
        f"The module should follow AUTARCH conventions with DESCRIPTION, AUTHOR, VERSION, CATEGORY "
        f"attributes and a run() entry point function."
    )

    return {
        'instruction': instruction,
        'input': f"Module name: {filepath.stem}\nCategory: {category}\nDescription: {description}",
        'output': source,
        'type': 'module_creation',
        'category': category,
        'source_file': str(filepath.relative_to(FRAMEWORK_DIR)),
    }
|
||||
|
||||
|
||||
def extract_route_pair(filepath: Path) -> Optional[dict]:
    """Build a route-creation training pair from a Flask route file.

    Locates the ``Blueprint(...)`` assignment (blueprint variable name and
    optional ``url_prefix`` keyword) and counts top-level functions decorated
    with a ``.route(...)`` call.

    Args:
        filepath: Path to a route ``.py`` file.

    Returns:
        A training-pair dict, or ``None`` if the file cannot be parsed or no
        blueprint assignment is found.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        tree = ast.parse(source)
    except (SyntaxError, UnicodeDecodeError):
        return None

    # Find blueprint name and prefix
    bp_name = None
    bp_prefix = None
    for node in ast.walk(tree):
        if isinstance(node, ast.Assign):
            for target in node.targets:
                if isinstance(target, ast.Name) and isinstance(node.value, ast.Call):
                    # Handle both `Blueprint(...)` and `flask.Blueprint(...)`.
                    func_name = ''
                    if hasattr(node.value.func, 'id'):
                        func_name = node.value.func.id
                    elif hasattr(node.value.func, 'attr'):
                        func_name = node.value.func.attr
                    if func_name == 'Blueprint':
                        bp_name = target.id
                        for kw in node.value.keywords:
                            if kw.arg == 'url_prefix' and isinstance(kw.value, ast.Constant):
                                bp_prefix = kw.value.value

    if not bp_name:
        return None

    # Count routes: top-level functions decorated with @<bp>.route(...)
    routes = []
    for node in ast.iter_child_nodes(tree):
        if isinstance(node, ast.FunctionDef):
            for deco in node.decorator_list:
                if isinstance(deco, ast.Call) and hasattr(deco, 'func'):
                    if hasattr(deco.func, 'attr') and deco.func.attr == 'route':
                        doc = ast.get_docstring(node) or ''
                        routes.append({
                            'handler': node.name,
                            'doc': doc.split('\n')[0] if doc else '',
                        })

    feature_name = filepath.stem.replace('_', ' ').title()
    instruction = (
        f"Create a Flask blueprint route file for AUTARCH's '{feature_name}' feature. "
        f"It should have a blueprint with url_prefix='{bp_prefix or '/' + filepath.stem}', "
        f"use @login_required on all routes, and follow AUTARCH web route conventions. "
        f"It needs {len(routes)} route handlers."
    )

    return {
        'instruction': instruction,
        'input': f"Feature: {feature_name}\nBlueprint: {bp_name}\nPrefix: {bp_prefix}\nRoutes: {len(routes)}",
        'output': source,
        'type': 'route_creation',
        'source_file': str(filepath.relative_to(FRAMEWORK_DIR)),
    }
|
||||
|
||||
|
||||
def extract_template_pair(filepath: Path) -> Optional[dict]:
    """Build a template-creation training pair from a Jinja2 template.

    Only templates that extend a base layout (``{% extends``) and define a
    ``{% block content %}`` are used.

    Args:
        filepath: Path to a ``.html`` template file.

    Returns:
        A training-pair dict, or ``None`` for unreadable or non-conforming
        templates.
    """
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
    except Exception:
        return None

    if '{% extends' not in source or '{% block content %}' not in source:
        return None

    # Count sections, tabs, buttons, forms — rough string-match heuristics
    # used only to describe the template's complexity in the prompt input.
    sections = source.count('class="section"') + source.count("class='section'")
    tabs = source.count('class="tab"') + source.count("class='tab'")
    forms = source.count('<form') + source.count('fetch(')
    has_script = '<script>' in source

    feature_name = filepath.stem.replace('_', ' ').title()
    instruction = (
        f"Create an AUTARCH web template for the '{feature_name}' page. "
        f"It should extend base.html, have a page header, and use AUTARCH's "
        f"CSS variables and UI patterns (sections, tab bars, data tables, buttons)."
    )

    return {
        'instruction': instruction,
        'input': (
            f"Template: {filepath.name}\n"
            f"Sections: {sections}\nTabs: {tabs}\nForms/API calls: {forms}\n"
            f"Has JavaScript: {has_script}"
        ),
        'output': source,
        'type': 'template_creation',
        'source_file': str(filepath.relative_to(FRAMEWORK_DIR)),
    }
|
||||
|
||||
|
||||
def extract_core_api_pairs(filepath: Path) -> list:
    """Extract api-reference training pairs from a core module.

    One pair is produced for each public (no leading underscore), documented,
    top-level function or async function in the file.

    Args:
        filepath: Path to a core ``.py`` file.

    Returns:
        A list of training-pair dicts (empty if the file cannot be parsed or
        contains no qualifying functions).
    """
    pairs = []
    try:
        source = filepath.read_text(encoding='utf-8', errors='ignore')
        tree = ast.parse(source)
    except (SyntaxError, UnicodeDecodeError):
        return pairs

    # Loop invariants hoisted out of the per-function loop.
    lines = source.split('\n')
    module_name = filepath.stem

    for node in ast.iter_child_nodes(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            if node.name.startswith('_'):
                continue
            doc = ast.get_docstring(node) or ''
            if not doc:
                continue

            # Slice out the function's own source. `end_lineno` exists on
            # Python 3.8+; fall back to a 20-line window otherwise.
            start = node.lineno - 1
            end = node.end_lineno if hasattr(node, 'end_lineno') else start + 20
            func_source = '\n'.join(lines[start:end])

            args = [a.arg for a in node.args.args if a.arg != 'self']

            pairs.append({
                'instruction': f"Show how to implement the `{node.name}` function in core/{module_name}.py",
                # chr(10) == '\n': backslashes are not allowed inside f-string
                # expressions before Python 3.12.
                'input': f"Function: {node.name}({', '.join(args)})\nDocstring: {doc.split(chr(10))[0]}",
                'output': func_source,
                'type': 'api_reference',
                'source_file': f"core/{filepath.name}",
            })

    return pairs
|
||||
|
||||
|
||||
def build_training_data():
    """Scan the codebase and emit JSONL training pairs plus a summary JSON."""
    print("[training] Scanning codebase for training pairs...")

    pairs = []

    # Module pairs
    for path in sorted((FRAMEWORK_DIR / 'modules').glob('*.py')):
        if path.name == '__init__.py':
            continue
        candidate = extract_module_pair(path)
        if candidate:
            pairs.append(candidate)
    module_count = len(pairs)
    print(f" Modules: {module_count} pairs")

    # Route pairs
    for path in sorted((FRAMEWORK_DIR / 'web' / 'routes').glob('*.py')):
        if path.name == '__init__.py':
            continue
        candidate = extract_route_pair(path)
        if candidate:
            pairs.append(candidate)
    route_count = len(pairs) - module_count
    print(f" Routes: {route_count} pairs")

    # Template pairs
    for path in sorted((FRAMEWORK_DIR / 'web' / 'templates').glob('*.html')):
        candidate = extract_template_pair(path)
        if candidate:
            pairs.append(candidate)
    template_count = len(pairs) - module_count - route_count
    print(f" Templates: {template_count} pairs")

    # Core API pairs
    before_api = len(pairs)
    for path in sorted((FRAMEWORK_DIR / 'core').glob('*.py')):
        if path.name == '__init__.py':
            continue
        pairs.extend(extract_core_api_pairs(path))
    api_count = len(pairs) - before_api
    print(f" Core API: {api_count} pairs")

    # Write JSONL
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        out.writelines(json.dumps(p, ensure_ascii=False) + '\n' for p in pairs)

    total_size = OUTPUT_PATH.stat().st_size
    print(f"\n[training] Written {len(pairs)} training pairs ({total_size:,} bytes) to {OUTPUT_PATH}")
    print(f"[training] Breakdown: {module_count} modules, {route_count} routes, "
          f"{template_count} templates, {api_count} core API functions")

    # Tally pair types (insertion order of first occurrence is preserved,
    # so the summary JSON matches the order pairs were generated in).
    type_counts = {}
    for entry in pairs:
        type_counts[entry['type']] = type_counts.get(entry['type'], 0) + 1

    # Also output a summary
    summary = {
        'generated': datetime.now().isoformat(),
        'total_pairs': len(pairs),
        'modules': module_count,
        'routes': route_count,
        'templates': template_count,
        'core_api': api_count,
        'output_bytes': total_size,
        'types': type_counts,
    }
    summary_path = OUTPUT_PATH.with_suffix('.summary.json')
    summary_path.write_text(json.dumps(summary, indent=2), encoding='utf-8')
    print(f"[training] Summary: {summary_path}")
|
||||
|
||||
|
||||
# Script entry point: run directly as `python scripts/build_training_data.py`.
if __name__ == '__main__':
    build_training_data()
|
||||
Reference in New Issue
Block a user