# AUTARCH LLM Configuration Template
# Hardware: Orange Pi 5 Plus with ARM Mali-G610 MP4 GPU
# Status: EXPERIMENTAL - Mali GPU support for LLMs is limited
#
# WARNING: This configuration is experimental!
# The Mali-G610 GPU has limited LLM support. Most frameworks
# fall back to CPU. This config attempts to leverage what GPU
# acceleration is available.

[llama]
# GGUF Model Settings (llama.cpp)
# Note: llama.cpp OpenCL backend may provide some acceleration
# Build with: CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
# Requires: libclblast-dev, opencl-headers, ocl-icd-opencl-dev
model_path =
n_ctx = 2048
n_threads = 4
n_gpu_layers = 8
temperature = 0.7
top_p = 0.9
top_k = 40
repeat_penalty = 1.1
max_tokens = 1024
seed = -1

[transformers]
# SafeTensors Model Settings (HuggingFace)
# Note: PyTorch has experimental Vulkan backend for mobile GPUs
# This is highly experimental and may not work
model_path =
device = cpu
torch_dtype = float32
load_in_8bit = false
load_in_4bit = true
trust_remote_code = false
max_tokens = 1024
temperature = 0.7
top_p = 0.9
top_k = 40
repetition_penalty = 1.1

# EXPERIMENTAL NOTES:
#
# Mali-G610 GPU Support Status:
# - OpenCL: Partial support via CLBlast, may accelerate some layers
# - Vulkan: PyTorch vulkan backend is experimental
# - Direct Mali: No native support in major LLM frameworks
#
# To enable OpenCL acceleration for llama.cpp:
# 1. Install dependencies:
#    sudo apt install libclblast-dev opencl-headers ocl-icd-opencl-dev
# 2. Install Mali OpenCL driver (if available for your distro)
# 3. Rebuild llama-cpp-python with CLBlast:
#    CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python --force-reinstall
#
# n_gpu_layers = 8: Offloads only some layers (conservative)
# - Increase if stable, decrease if crashes
# - Set to 0 if OpenCL not working
#
# For transformers:
# - load_in_4bit = true reduces memory pressure
# - CPU inference is the reliable fallback
#
# Performance Expectations:
# - Best case: 20-30% speedup over pure CPU
# - Likely case: Similar to CPU or unstable
# - Use orangepi5plus_cpu.conf for stable operation