Autarch Will Control The Internet
This commit is contained in:
46
.config/amd_rx6700xt.conf
Normal file
46
.config/amd_rx6700xt.conf
Normal file
@@ -0,0 +1,46 @@
# AUTARCH LLM Configuration Template
# Hardware: AMD Radeon RX 6700 XT (12GB VRAM)
# Optimized for: GPU inference with ROCm/HIP support
#
# This configuration is optimized for AMD GPUs using ROCm.
# The RX 6700 XT has 12GB VRAM, excellent for 7B-13B models.
# Requires ROCm drivers and PyTorch with ROCm support.

[llama]
# GGUF Model Settings (llama.cpp)
# Note: llama.cpp requires HIP/ROCm build for AMD GPU support
# Build with: CMAKE_ARGS="-DLLAMA_HIPBLAS=on" pip install llama-cpp-python
model_path =
n_ctx = 8192
n_threads = 8
n_gpu_layers = -1
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
max_tokens = 4096
seed = -1

[transformers]
# SafeTensors Model Settings (HuggingFace)
# ROCm uses 'cuda' device identifier in PyTorch
model_path =
device = cuda
torch_dtype = float16
load_in_8bit = false
load_in_4bit = false
trust_remote_code = false
max_tokens = 4096
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1

# Notes:
# - 12GB VRAM allows running 13B models at float16
# - For 33B+ models, enable load_in_4bit = true
# - ROCm support requires specific PyTorch version:
#   pip install torch --index-url https://download.pytorch.org/whl/rocm5.6
# - llama.cpp needs HIP build for GPU acceleration
# - If GPU not detected, falls back to CPU (check ROCm installation)
# - n_ctx = 8192 works well with 12GB VRAM
41
.config/nvidia_4070_mobile.conf
Normal file
41
.config/nvidia_4070_mobile.conf
Normal file
@@ -0,0 +1,41 @@
# AUTARCH LLM Configuration Template
# Hardware: NVIDIA GeForce RTX 4070 Mobile (8GB VRAM)
# Optimized for: GPU inference with good VRAM management
#
# This configuration balances performance and memory usage for mobile RTX 4070.
# The 4070 Mobile has 8GB VRAM, suitable for 7B models at full precision
# or 13B models with quantization.

[llama]
# GGUF Model Settings (llama.cpp)
model_path =
n_ctx = 8192
n_threads = 8
n_gpu_layers = -1
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
max_tokens = 4096
seed = -1

[transformers]
# SafeTensors Model Settings (HuggingFace)
model_path =
device = cuda
torch_dtype = float16
load_in_8bit = false
load_in_4bit = false
trust_remote_code = false
max_tokens = 4096
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1

# Notes:
# - n_gpu_layers = -1 offloads all layers to GPU
# - For 13B+ models, enable load_in_4bit = true
# - float16 is optimal for RTX 4070
# - n_ctx = 8192 uses ~2GB VRAM overhead
# - Reduce n_ctx to 4096 if running out of VRAM
46
.config/orangepi5plus_cpu.conf
Normal file
46
.config/orangepi5plus_cpu.conf
Normal file
@@ -0,0 +1,46 @@
# AUTARCH LLM Configuration Template
# Hardware: Orange Pi 5 Plus (RK3588 SoC, 8-core ARM, 16GB RAM)
# Optimized for: CPU-only inference on ARM64
#
# This configuration is optimized for the Orange Pi 5 Plus running
# CPU-only inference. The RK3588 has 4x Cortex-A76 + 4x Cortex-A55 cores.
# Best with quantized GGUF models (Q4_K_M or Q5_K_M).

[llama]
# GGUF Model Settings (llama.cpp)
# Recommended: Use Q4_K_M or Q5_K_M quantized models
model_path =
n_ctx = 2048
n_threads = 4
n_gpu_layers = 0
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
max_tokens = 1024
seed = -1

[transformers]
# SafeTensors Model Settings (HuggingFace)
# Note: CPU inference is slow with transformers, GGUF recommended
model_path =
device = cpu
torch_dtype = float32
load_in_8bit = false
load_in_4bit = false
trust_remote_code = false
max_tokens = 1024
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1

# Notes:
# - n_threads = 4 uses only the fast A76 cores (better perf than all 8)
# - n_ctx = 2048 balances memory usage and capability
# - n_gpu_layers = 0 for pure CPU inference
# - Strongly recommend GGUF Q4_K_M models for best speed
# - 7B Q4 models use ~4GB RAM, leaving room for system
# - max_tokens = 1024 keeps generation times reasonable
# - For transformers: CPU with float32 is slow but works
# - Avoid 13B+ models unless heavily quantized
67
.config/orangepi5plus_mali.conf
Normal file
67
.config/orangepi5plus_mali.conf
Normal file
@@ -0,0 +1,67 @@
# AUTARCH LLM Configuration Template
# Hardware: Orange Pi 5 Plus with ARM Mali-G610 MP4 GPU
# Status: EXPERIMENTAL - Mali GPU support for LLMs is limited
#
# WARNING: This configuration is experimental!
# The Mali-G610 GPU has limited LLM support. Most frameworks
# fall back to CPU. This config attempts to leverage what GPU
# acceleration is available.

[llama]
# GGUF Model Settings (llama.cpp)
# Note: llama.cpp OpenCL backend may provide some acceleration
# Build with: CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
# Requires: libclblast-dev, opencl-headers, ocl-icd-opencl-dev
model_path =
n_ctx = 2048
n_threads = 4
n_gpu_layers = 8
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
max_tokens = 1024
seed = -1

[transformers]
# SafeTensors Model Settings (HuggingFace)
# Note: PyTorch has experimental Vulkan backend for mobile GPUs
# This is highly experimental and may not work
model_path =
device = cpu
torch_dtype = float32
load_in_8bit = false
load_in_4bit = true
trust_remote_code = false
max_tokens = 1024
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1

# EXPERIMENTAL NOTES:
#
# Mali-G610 GPU Support Status:
# - OpenCL: Partial support via CLBlast, may accelerate some layers
# - Vulkan: PyTorch vulkan backend is experimental
# - Direct Mali: No native support in major LLM frameworks
#
# To enable OpenCL acceleration for llama.cpp:
# 1. Install dependencies:
#    sudo apt install libclblast-dev opencl-headers ocl-icd-opencl-dev
# 2. Install Mali OpenCL driver (if available for your distro)
# 3. Rebuild llama-cpp-python with CLBlast:
#    CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python --force-reinstall
#
# n_gpu_layers = 8: Offloads only some layers (conservative)
# - Increase if stable, decrease if crashes
# - Set to 0 if OpenCL not working
#
# For transformers:
# - load_in_4bit = true reduces memory pressure
# - CPU inference is the reliable fallback
#
# Performance Expectations:
# - Best case: 20-30% speedup over pure CPU
# - Likely case: Similar to CPU or unstable
# - Use orangepi5plus_cpu.conf for stable operation
Reference in New Issue
Block a user