# AUTARCH LLM Configuration Template
# Hardware: NVIDIA GeForce RTX 4070 Mobile (8GB VRAM)
# Optimized for: GPU inference with good VRAM management
#
# This configuration balances performance and memory usage for mobile RTX 4070.
# The 4070 Mobile has 8GB VRAM, suitable for 7B models at full precision
# or 13B models with quantization.

[llama]
# GGUF Model Settings (llama.cpp)
# Absolute path to the .gguf model file — fill in before use (template placeholder)
model_path =
# Context window size in tokens; reduce to 4096 if VRAM runs out (see Notes)
n_ctx = 8192
# CPU threads for non-offloaded work
n_threads = 8
# -1 = offload all layers to GPU
n_gpu_layers = -1
# Sampling parameters
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
# Maximum tokens to generate per completion
max_tokens = 4096
# -1 = random seed on each run
seed = -1

[transformers]
# SafeTensors Model Settings (HuggingFace)
# Model directory or hub ID — fill in before use (template placeholder)
model_path =
device = cuda
# float16 halves memory vs float32; optimal for RTX 4070
torch_dtype = float16
# Quantization toggles; enable load_in_4bit for 13B+ models (see Notes)
load_in_8bit = false
load_in_4bit = false
# Keep false unless the model explicitly requires custom code
trust_remote_code = false
# Sampling parameters (mirrors [llama] section)
max_tokens = 4096
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1

# Notes:
# - n_gpu_layers = -1 offloads all layers to GPU
# - For 13B+ models, enable load_in_4bit = true
# - float16 is optimal for RTX 4070
# - n_ctx = 8192 uses ~2GB VRAM overhead
# - Reduce n_ctx to 4096 if running out of VRAM