Autarch Will Control The Internet
This commit is contained in:
41
.config/nvidia_4070_mobile.conf
Normal file
41
.config/nvidia_4070_mobile.conf
Normal file
@@ -0,0 +1,41 @@
# AUTARCH LLM Configuration Template
# Hardware: NVIDIA GeForce RTX 4070 Mobile (8GB VRAM)
# Optimized for: GPU inference with good VRAM management
#
# This configuration balances performance and memory usage for the mobile RTX 4070.
# The 4070 Mobile has 8GB VRAM, suitable for 7B models at full precision
# or 13B models with quantization.

[llama]
# GGUF Model Settings (llama.cpp)
model_path =
n_ctx = 8192
n_threads = 8
n_gpu_layers = -1
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
max_tokens = 4096
seed = -1

[transformers]
# SafeTensors Model Settings (HuggingFace)
model_path =
device = cuda
torch_dtype = float16
load_in_8bit = false
load_in_4bit = false
trust_remote_code = false
max_tokens = 4096
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1

# Notes:
# - n_gpu_layers = -1 offloads all layers to GPU
# - For 13B+ models, enable load_in_4bit = true
# - float16 is optimal for RTX 4070
# - n_ctx = 8192 uses ~2GB VRAM overhead
# - Reduce n_ctx to 4096 if running out of VRAM
Reference in New Issue
Block a user