# AUTARCH LLM Configuration Template
# Hardware: NVIDIA GeForce RTX 4070 Mobile (8GB VRAM)
# Optimized for: GPU inference with good VRAM management
#
# This configuration balances performance and memory usage for mobile RTX 4070.
# The 4070 Mobile has 8GB VRAM, suitable for 7B models at full precision
# or 13B models with quantization.

[llama]
# GGUF Model Settings (llama.cpp)
# Absolute path to the .gguf model file — fill in before use (template placeholder)
model_path =
# Context window size in tokens; reduce to 4096 if VRAM runs out (see Notes)
n_ctx = 8192
# CPU threads for non-offloaded work
n_threads = 8
# -1 = offload all layers to GPU
n_gpu_layers = -1
# Sampling parameters
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
# Maximum tokens to generate per completion
max_tokens = 4096
# -1 = random seed on each run
seed = -1

[transformers]
# SafeTensors Model Settings (HuggingFace)
# Model directory or hub ID — fill in before use (template placeholder)
model_path =
device = cuda
# float16 halves memory vs float32; optimal for RTX 4070
torch_dtype = float16
# Quantization toggles; enable load_in_4bit for 13B+ models (see Notes)
load_in_8bit = false
load_in_4bit = false
# Keep false unless the model explicitly requires custom code
trust_remote_code = false
# Sampling parameters (mirrors [llama] section)
max_tokens = 4096
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1

# Notes:
# - n_gpu_layers = -1 offloads all layers to GPU
# - For 13B+ models, enable load_in_4bit = true
# - float16 is optimal for RTX 4070
# - n_ctx = 8192 uses ~2GB VRAM overhead
# - Reduce n_ctx to 4096 if running out of VRAM