Autarch Will Control The Internet

DigiJ committed 2026-03-13 15:17:15 -07:00
commit 4d3570781e
401 changed files with 484494 additions and 0 deletions

.config/amd_rx6700xt.conf

@@ -0,0 +1,46 @@
# AUTARCH LLM Configuration Template
# Hardware: AMD Radeon RX 6700 XT (12GB VRAM)
# Optimized for: GPU inference with ROCm/HIP support
#
# This configuration is optimized for AMD GPUs using ROCm.
# The RX 6700 XT has 12GB VRAM, excellent for quantized 7B-13B models.
# Requires ROCm drivers and PyTorch with ROCm support.
[llama]
# GGUF Model Settings (llama.cpp)
# Note: llama.cpp requires HIP/ROCm build for AMD GPU support
# Build with: CMAKE_ARGS="-DLLAMA_HIPBLAS=on" pip install llama-cpp-python
model_path =
n_ctx = 8192
n_threads = 8
n_gpu_layers = -1
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
max_tokens = 4096
seed = -1
[transformers]
# SafeTensors Model Settings (HuggingFace)
# ROCm uses 'cuda' device identifier in PyTorch
model_path =
device = cuda
torch_dtype = float16
load_in_8bit = false
load_in_4bit = false
trust_remote_code = false
max_tokens = 4096
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1
# Notes:
# - With transformers, 12GB VRAM fits 7B models at 8-bit or 13B at 4-bit;
#   13B at float16 (~26GB of weights) does not fit
# - 33B+ models exceed 12GB even at 4-bit (~17GB); pick a smaller model
# - ROCm support requires specific PyTorch version:
# pip install torch --index-url https://download.pytorch.org/whl/rocm5.6
# - llama.cpp needs HIP build for GPU acceleration
# - If GPU not detected, falls back to CPU (check ROCm installation)
# - n_ctx = 8192 works well with 12GB VRAM
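
A quick way to sanity-check this config is the minimal Python sketch below. It assumes llama-cpp-python built with the HIPBLAS flag above, a ROCm build of PyTorch, and a hypothetical model path:

import torch
from llama_cpp import Llama

# ROCm builds of PyTorch expose the RX 6700 XT under the 'cuda' device name.
print("GPU visible to PyTorch:", torch.cuda.is_available())

llm = Llama(
    model_path="/models/llama-13b.Q5_K_M.gguf",  # hypothetical path
    n_ctx=8192,
    n_threads=8,
    n_gpu_layers=-1,  # offload every layer to the GPU
    seed=-1,
)
out = llm("Q: What is ROCm? A:", max_tokens=64, temperature=0.7,
          top_p=0.9, top_k=40, repeat_penalty=1.1)
print(out["choices"][0]["text"])

If torch.cuda.is_available() returns False, recheck the ROCm installation before suspecting the model.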


@@ -0,0 +1,41 @@
# AUTARCH LLM Configuration Template
# Hardware: NVIDIA GeForce RTX 4070 Mobile (8GB VRAM)
# Optimized for: GPU inference with good VRAM management
#
# This configuration balances performance and memory usage for mobile RTX 4070.
# The 4070 Mobile has 8GB VRAM, which suits quantized 7B models (Q4/Q5
# GGUF or 8-bit); a 7B model at float16 (~14GB of weights) already
# exceeds it, and 13B models need 4-bit quantization.
[llama]
# GGUF Model Settings (llama.cpp)
model_path =
n_ctx = 8192
n_threads = 8
n_gpu_layers = -1
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
max_tokens = 4096
seed = -1
[transformers]
# SafeTensors Model Settings (HuggingFace)
model_path =
device = cuda
torch_dtype = float16
load_in_8bit = false
load_in_4bit = false
trust_remote_code = false
max_tokens = 4096
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1
# Notes:
# - n_gpu_layers = -1 offloads all layers to GPU
# - For 13B+ models, enable load_in_4bit = true
# - float16 halves memory versus float32, but a 7B model at float16 still
#   exceeds 8GB; enable load_in_8bit or load_in_4bit with transformers
# - n_ctx = 8192 uses ~2GB VRAM overhead
# - Reduce n_ctx to 4096 if running out of VRAM
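
For the transformers section, a minimal loading sketch matching the settings above; the model name is a hypothetical pick for 8GB VRAM, and the sketch turns on 8-bit loading (requires the bitsandbytes package) so a 7B model actually fits:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"  # hypothetical model choice
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    load_in_8bit=True,  # keeps a 7B model inside 8GB VRAM
)
inputs = tok("Hello", return_tensors="pt").to(model.device)
print(tok.decode(model.generate(
    **inputs, max_new_tokens=40, do_sample=True,
    temperature=0.7, top_p=0.9, top_k=40,
    repetition_penalty=1.1)[0]))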

.config/orangepi5plus_cpu.conf

@@ -0,0 +1,46 @@
# AUTARCH LLM Configuration Template
# Hardware: Orange Pi 5 Plus (RK3588 SoC, 8-core ARM, 16GB RAM)
# Optimized for: CPU-only inference on ARM64
#
# This configuration is optimized for the Orange Pi 5 Plus running
# CPU-only inference. The RK3588 has 4x Cortex-A76 + 4x Cortex-A55 cores.
# Best with quantized GGUF models (Q4_K_M or Q5_K_M).
[llama]
# GGUF Model Settings (llama.cpp)
# Recommended: Use Q4_K_M or Q5_K_M quantized models
model_path =
n_ctx = 2048
n_threads = 4
n_gpu_layers = 0
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
max_tokens = 1024
seed = -1
[transformers]
# SafeTensors Model Settings (HuggingFace)
# Note: CPU inference is slow with transformers, GGUF recommended
model_path =
device = cpu
torch_dtype = float32
load_in_8bit = false
load_in_4bit = false
trust_remote_code = false
max_tokens = 1024
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1
# Notes:
# - n_threads = 4 keeps work on the four fast A76 cores; running all 8
#   threads is usually slower because the A55 cores bottleneck the A76s
# - n_ctx = 2048 balances memory usage and capability
# - n_gpu_layers = 0 for pure CPU inference
# - Strongly recommend GGUF Q4_K_M models for best speed
# - 7B Q4 models use ~4GB RAM, leaving room for system
# - max_tokens = 1024 keeps generation times reasonable
# - For transformers: CPU with float32 is slow but works
# - Avoid 13B+ models unless heavily quantized
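
A minimal CPU-only sketch under these settings (the Q4_K_M model path is hypothetical):

from llama_cpp import Llama

llm = Llama(
    model_path="/models/mistral-7b.Q4_K_M.gguf",  # hypothetical path
    n_ctx=2048,
    n_threads=4,     # keep work on the four Cortex-A76 cores
    n_gpu_layers=0,  # pure CPU inference
)
out = llm("Q: What is big.LITTLE? A:", max_tokens=64, temperature=0.7,
          top_p=0.9, top_k=40, repeat_penalty=1.1)
print(out["choices"][0]["text"])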


@@ -0,0 +1,67 @@
# AUTARCH LLM Configuration Template
# Hardware: Orange Pi 5 Plus with ARM Mali-G610 MP4 GPU
# Status: EXPERIMENTAL - Mali GPU support for LLMs is limited
#
# WARNING: This configuration is experimental!
# The Mali-G610 GPU has limited LLM support. Most frameworks
# fall back to CPU. This config attempts to leverage what GPU
# acceleration is available.
[llama]
# GGUF Model Settings (llama.cpp)
# Note: llama.cpp OpenCL backend may provide some acceleration
# Build with: CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
# Requires: libclblast-dev, opencl-headers, ocl-icd-opencl-dev
model_path =
n_ctx = 2048
n_threads = 4
n_gpu_layers = 8
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
max_tokens = 1024
seed = -1
[transformers]
# SafeTensors Model Settings (HuggingFace)
# Note: PyTorch has experimental Vulkan backend for mobile GPUs
# This is highly experimental and may not work
model_path =
device = cpu
torch_dtype = float32
load_in_8bit = false
load_in_4bit = true
trust_remote_code = false
max_tokens = 1024
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1
# EXPERIMENTAL NOTES:
#
# Mali-G610 GPU Support Status:
# - OpenCL: Partial support via CLBlast, may accelerate some layers
# - Vulkan: PyTorch vulkan backend is experimental
# - Direct Mali: No native support in major LLM frameworks
#
# To enable OpenCL acceleration for llama.cpp:
# 1. Install dependencies:
# sudo apt install libclblast-dev opencl-headers ocl-icd-opencl-dev
# 2. Install Mali OpenCL driver (if available for your distro)
# 3. Rebuild llama-cpp-python with CLBlast:
# CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python --force-reinstall
#
# n_gpu_layers = 8: Offloads only some layers (conservative)
# - Increase if stable, decrease if crashes
# - Set to 0 if OpenCL not working
#
# For transformers:
# - load_in_4bit = true lowers memory pressure, but bitsandbytes 4-bit
#   generally requires a CUDA GPU and will likely fail or be ignored on CPU
# - CPU inference is the reliable fallback
#
# Performance Expectations:
# - Best case: 20-30% speedup over pure CPU
# - Likely case: Similar to CPU or unstable
# - Use orangepi5plus_cpu.conf for stable operation
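
One way to make the partial offload fail-safe is to probe for an OpenCL device and drop to n_gpu_layers = 0 when none is found. A sketch, assuming pyopencl as an extra dependency and a hypothetical model path:

from llama_cpp import Llama

gpu_layers = 0  # safe default: pure CPU
try:
    import pyopencl as cl
    if any(d for p in cl.get_platforms() for d in p.get_devices()):
        gpu_layers = 8  # conservative partial offload to the Mali-G610
except Exception:
    pass  # no usable OpenCL runtime; stay on CPU

llm = Llama(
    model_path="/models/mistral-7b.Q4_K_M.gguf",  # hypothetical path
    n_ctx=2048,
    n_threads=4,
    n_gpu_layers=gpu_layers,
)

If loading still crashes with layers offloaded, lower the count as noted above or switch to orangepi5plus_cpu.conf.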