Autarch Will Control The Internet

DigiJ committed 2026-03-13 15:17:15 -07:00
commit 4d3570781e
401 changed files with 484494 additions and 0 deletions

.config/amd_rx6700xt.conf

@@ -0,0 +1,46 @@
# AUTARCH LLM Configuration Template
# Hardware: AMD Radeon RX 6700 XT (12GB VRAM)
# Optimized for: GPU inference with ROCm/HIP support
#
# This configuration is optimized for AMD GPUs using ROCm.
# The RX 6700 XT has 12GB VRAM, excellent for quantized 7B-13B models.
# Requires ROCm drivers and PyTorch with ROCm support.
[llama]
# GGUF Model Settings (llama.cpp)
# Note: llama.cpp requires HIP/ROCm build for AMD GPU support
# Build with: CMAKE_ARGS="-DLLAMA_HIPBLAS=on" pip install llama-cpp-python
model_path =
n_ctx = 8192
n_threads = 8
n_gpu_layers = -1
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
max_tokens = 4096
seed = -1
[transformers]
# SafeTensors Model Settings (HuggingFace)
# ROCm uses 'cuda' device identifier in PyTorch
model_path =
device = cuda
torch_dtype = float16
load_in_8bit = false
load_in_4bit = false
trust_remote_code = false
max_tokens = 4096
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1
# Notes:
# - With transformers, 12GB VRAM fits 7B models at 8-bit or 13B at 4-bit;
#   13B at float16 (~26GB of weights) does not fit
# - 33B+ models exceed 12GB even at 4-bit (~17GB); pick a smaller model
# - ROCm support requires specific PyTorch version:
# pip install torch --index-url https://download.pytorch.org/whl/rocm5.6
# - llama.cpp needs HIP build for GPU acceleration
# - If GPU not detected, falls back to CPU (check ROCm installation)
# - n_ctx = 8192 works well with 12GB VRAM
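
A quick way to sanity-check this config is the minimal Python sketch below. It assumes llama-cpp-python built with the HIPBLAS flag above, a ROCm build of PyTorch, and a hypothetical model path:

import torch
from llama_cpp import Llama

# ROCm builds of PyTorch expose the RX 6700 XT under the 'cuda' device name.
print("GPU visible to PyTorch:", torch.cuda.is_available())

llm = Llama(
    model_path="/models/llama-13b.Q5_K_M.gguf",  # hypothetical path
    n_ctx=8192,
    n_threads=8,
    n_gpu_layers=-1,  # offload every layer to the GPU
    seed=-1,
)
out = llm("Q: What is ROCm? A:", max_tokens=64, temperature=0.7,
          top_p=0.9, top_k=40, repeat_penalty=1.1)
print(out["choices"][0]["text"])

If torch.cuda.is_available() returns False, recheck the ROCm installation before suspecting the model.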


@@ -0,0 +1,41 @@
# AUTARCH LLM Configuration Template
# Hardware: NVIDIA GeForce RTX 4070 Mobile (8GB VRAM)
# Optimized for: GPU inference with good VRAM management
#
# This configuration balances performance and memory usage for mobile RTX 4070.
# The 4070 Mobile has 8GB VRAM, which suits quantized 7B models (Q4/Q5
# GGUF or 8-bit); a 7B model at float16 (~14GB of weights) already
# exceeds it, and 13B models need 4-bit quantization.
[llama]
# GGUF Model Settings (llama.cpp)
model_path =
n_ctx = 8192
n_threads = 8
n_gpu_layers = -1
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
max_tokens = 4096
seed = -1
[transformers]
# SafeTensors Model Settings (HuggingFace)
model_path =
device = cuda
torch_dtype = float16
load_in_8bit = false
load_in_4bit = false
trust_remote_code = false
max_tokens = 4096
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1
# Notes:
# - n_gpu_layers = -1 offloads all layers to GPU
# - For 13B+ models, enable load_in_4bit = true
# - float16 halves memory versus float32, but a 7B model at float16 still
#   exceeds 8GB; enable load_in_8bit or load_in_4bit with transformers
# - n_ctx = 8192 uses ~2GB VRAM overhead
# - Reduce n_ctx to 4096 if running out of VRAM
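
For the transformers section, a minimal loading sketch matching the settings above; the model name is a hypothetical pick for 8GB VRAM, and the sketch turns on 8-bit loading (requires the bitsandbytes package) so a 7B model actually fits:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"  # hypothetical model choice
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    load_in_8bit=True,  # keeps a 7B model inside 8GB VRAM
)
inputs = tok("Hello", return_tensors="pt").to(model.device)
print(tok.decode(model.generate(
    **inputs, max_new_tokens=40, do_sample=True,
    temperature=0.7, top_p=0.9, top_k=40,
    repetition_penalty=1.1)[0]))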

.config/orangepi5plus_cpu.conf

@@ -0,0 +1,46 @@
# AUTARCH LLM Configuration Template
# Hardware: Orange Pi 5 Plus (RK3588 SoC, 8-core ARM, 16GB RAM)
# Optimized for: CPU-only inference on ARM64
#
# This configuration is optimized for the Orange Pi 5 Plus running
# CPU-only inference. The RK3588 has 4x Cortex-A76 + 4x Cortex-A55 cores.
# Best with quantized GGUF models (Q4_K_M or Q5_K_M).
[llama]
# GGUF Model Settings (llama.cpp)
# Recommended: Use Q4_K_M or Q5_K_M quantized models
model_path =
n_ctx = 2048
n_threads = 4
n_gpu_layers = 0
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
max_tokens = 1024
seed = -1
[transformers]
# SafeTensors Model Settings (HuggingFace)
# Note: CPU inference is slow with transformers, GGUF recommended
model_path =
device = cpu
torch_dtype = float32
load_in_8bit = false
load_in_4bit = false
trust_remote_code = false
max_tokens = 1024
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1
# Notes:
# - n_threads = 4 keeps work on the four fast A76 cores; running all 8
#   threads is usually slower because the A55 cores bottleneck the A76s
# - n_ctx = 2048 balances memory usage and capability
# - n_gpu_layers = 0 for pure CPU inference
# - Strongly recommend GGUF Q4_K_M models for best speed
# - 7B Q4 models use ~4GB RAM, leaving room for system
# - max_tokens = 1024 keeps generation times reasonable
# - For transformers: CPU with float32 is slow but works
# - Avoid 13B+ models unless heavily quantized
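
A minimal CPU-only sketch under these settings (the Q4_K_M model path is hypothetical):

from llama_cpp import Llama

llm = Llama(
    model_path="/models/mistral-7b.Q4_K_M.gguf",  # hypothetical path
    n_ctx=2048,
    n_threads=4,     # keep work on the four Cortex-A76 cores
    n_gpu_layers=0,  # pure CPU inference
)
out = llm("Q: What is big.LITTLE? A:", max_tokens=64, temperature=0.7,
          top_p=0.9, top_k=40, repeat_penalty=1.1)
print(out["choices"][0]["text"])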


@@ -0,0 +1,67 @@
# AUTARCH LLM Configuration Template
# Hardware: Orange Pi 5 Plus with ARM Mali-G610 MP4 GPU
# Status: EXPERIMENTAL - Mali GPU support for LLMs is limited
#
# WARNING: This configuration is experimental!
# The Mali-G610 GPU has limited LLM support. Most frameworks
# fall back to CPU. This config attempts to leverage what GPU
# acceleration is available.
[llama]
# GGUF Model Settings (llama.cpp)
# Note: llama.cpp OpenCL backend may provide some acceleration
# Build with: CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
# Requires: libclblast-dev, opencl-headers, ocl-icd-opencl-dev
model_path =
n_ctx = 2048
n_threads = 4
n_gpu_layers = 8
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
max_tokens = 1024
seed = -1
[transformers]
# SafeTensors Model Settings (HuggingFace)
# Note: PyTorch has experimental Vulkan backend for mobile GPUs
# This is highly experimental and may not work
model_path =
device = cpu
torch_dtype = float32
load_in_8bit = false
load_in_4bit = true
trust_remote_code = false
max_tokens = 1024
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1
# EXPERIMENTAL NOTES:
#
# Mali-G610 GPU Support Status:
# - OpenCL: Partial support via CLBlast, may accelerate some layers
# - Vulkan: PyTorch vulkan backend is experimental
# - Direct Mali: No native support in major LLM frameworks
#
# To enable OpenCL acceleration for llama.cpp:
# 1. Install dependencies:
# sudo apt install libclblast-dev opencl-headers ocl-icd-opencl-dev
# 2. Install Mali OpenCL driver (if available for your distro)
# 3. Rebuild llama-cpp-python with CLBlast:
# CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python --force-reinstall
#
# n_gpu_layers = 8: Offloads only some layers (conservative)
# - Increase if stable, decrease if crashes
# - Set to 0 if OpenCL not working
#
# For transformers:
# - load_in_4bit = true lowers memory pressure, but bitsandbytes 4-bit
#   generally requires a CUDA GPU and will likely fail or be ignored on CPU
# - CPU inference is the reliable fallback
#
# Performance Expectations:
# - Best case: 20-30% speedup over pure CPU
# - Likely case: Similar to CPU or unstable
# - Use orangepi5plus_cpu.conf for stable operation
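
One way to make the partial offload fail-safe is to probe for an OpenCL device and drop to n_gpu_layers = 0 when none is found. A sketch, assuming pyopencl as an extra dependency and a hypothetical model path:

from llama_cpp import Llama

gpu_layers = 0  # safe default: pure CPU
try:
    import pyopencl as cl
    if any(d for p in cl.get_platforms() for d in p.get_devices()):
        gpu_layers = 8  # conservative partial offload to the Mali-G610
except Exception:
    pass  # no usable OpenCL runtime; stay on CPU

llm = Llama(
    model_path="/models/mistral-7b.Q4_K_M.gguf",  # hypothetical path
    n_ctx=2048,
    n_threads=4,
    n_gpu_layers=gpu_layers,
)

If loading still crashes with layers offloaded, lower the count as noted above or switch to orangepi5plus_cpu.conf.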