# AUTARCH LLM Configuration Template
# Hardware: Orange Pi 5 Plus (RK3588 SoC, 8-core ARM, 16GB RAM)
# Optimized for: CPU-only inference on ARM64
#
# This configuration is optimized for the Orange Pi 5 Plus running
# CPU-only inference. The RK3588 has 4x Cortex-A76 + 4x Cortex-A55 cores.
# Best with quantized GGUF models (Q4_K_M or Q5_K_M).

[llama]
# GGUF Model Settings (llama.cpp)
# Recommended: Use Q4_K_M or Q5_K_M quantized models
# Path to the .gguf model file (required; intentionally left blank in template)
model_path =
# Context window size in tokens; 2048 balances memory usage and capability
n_ctx = 2048
# Use only the 4 fast Cortex-A76 cores (faster than all 8 cores; see Notes)
n_threads = 4
# 0 = pure CPU inference, no layers offloaded to GPU
n_gpu_layers = 0
# Sampling parameters
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
# Cap on tokens generated per request; keeps CPU generation times reasonable
max_tokens = 1024
# -1 = random seed each run (llama.cpp convention)
seed = -1

[transformers]
# SafeTensors Model Settings (HuggingFace)
# Note: CPU inference is slow with transformers, GGUF recommended
# Model directory or HuggingFace repo id (required; intentionally left blank)
model_path =
device = cpu
torch_dtype = float32
# NOTE(review): 8-bit/4-bit loading typically requires GPU/bitsandbytes
# support — confirm before enabling on this CPU-only board
load_in_8bit = false
load_in_4bit = false
# Keep disabled unless the model repo is trusted (executes arbitrary code)
trust_remote_code = false
max_tokens = 1024
# Sampling parameters (repetition_penalty mirrors llama.cpp's repeat_penalty)
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1

# Notes:
# - n_threads = 4 uses only the fast A76 cores (better perf than all 8)
# - n_ctx = 2048 balances memory usage and capability
# - n_gpu_layers = 0 for pure CPU inference
# - Strongly recommend GGUF Q4_K_M models for best speed
# - 7B Q4 models use ~4GB RAM, leaving room for system
# - max_tokens = 1024 keeps generation times reasonable
# - For transformers: CPU with float32 is slow but works
# - Avoid 13B+ models unless heavily quantized