#!/usr/bin/env python3
"""
Test GPTQ model loader (requires auto-gptq package)
"""
|
|
import os
|
|
import traceback
|
|
|
|
def test_gptq_loader(model_id="TheBloke/Llama-2-7B-Chat-GPTQ", device=0):
    """Smoke-test loading a GPTQ model and running a short generation.

    Loads the tokenizer and the quantized model for ``model_id``, generates
    16 new tokens greedily from a fixed prompt, and prints progress along
    the way. The Hugging Face token is read from the ``HF_TOKEN``
    environment variable (``None`` if unset).

    Args:
        model_id: Hub ID (or path) of the GPTQ-quantized checkpoint.
        device: Device passed to ``from_quantized``. The default is the
            integer GPU index ``0`` (an index, not the string ``"cuda:0"``).

    Returns:
        ``True`` if loading and generation completed, ``False`` if the
        required packages could not be imported or any step raised.
        Failures are printed, never re-raised.
    """
    print(f"=== Testing GPTQ Loader: {model_id} ===")

    # Only the imports are guarded by the ImportError handler, so that an
    # ImportError raised later (while loading or generating) is reported
    # with a full traceback instead of as "auto-gptq not available".
    try:
        from auto_gptq import AutoGPTQForCausalLM
        from transformers import AutoTokenizer
    except ImportError as e:
        print(f"✗ auto-gptq not available: {e}")
        print("Install with: pip install auto-gptq")
        return False

    hf_token = os.getenv("HF_TOKEN")

    try:
        # Load tokenizer
        print("Loading tokenizer...")
        tok = AutoTokenizer.from_pretrained(
            model_id,
            use_fast=True,
            token=hf_token,
        )
        print("✓ Tokenizer loaded")

        # Load GPTQ model with integer device
        print("Loading GPTQ model...")
        # NOTE(review): trust_remote_code=True allows code shipped with the
        # model repository to run; only use this with trusted model IDs.
        model = AutoGPTQForCausalLM.from_quantized(
            model_id,
            device=device,  # integer index, not "cuda:0"
            use_safetensors=True,
            trust_remote_code=True,
            token=hf_token,
        )
        print(f"✓ GPTQ model loaded on GPU {device}")

        # Test generation
        prompt = "The benefits of GPU inference are"
        # Put the inputs on the same device the model was loaded on rather
        # than on the bare "cuda" (current) device, which may differ.
        input_device = f"cuda:{device}" if isinstance(device, int) else device
        inputs = tok(prompt, return_tensors="pt").to(input_device)

        print("Testing generation...")
        outputs = model.generate(
            **inputs,
            max_new_tokens=16,
            do_sample=False,
            pad_token_id=tok.eos_token_id,
        )

        result = tok.decode(outputs[0], skip_special_tokens=True)
        print("✓ Generation test passed:")
        print(f"Output: {result}")

        return True
    except Exception as e:
        # Top-level boundary of the smoke test: report the failure and signal
        # it through the return value.
        print(f"✗ Error: {type(e).__name__} - {e}")
        traceback.print_exc()
        return False
|
|
|
|
if __name__ == "__main__":
    import sys

    import torch

    # Print the environment first so a failure below can be read in context.
    print("torch:", torch.__version__)
    print("cuda available:", torch.cuda.is_available())
    print()

    # Propagate the result as the process exit status so shells and CI see a
    # failed test (previously the return value was discarded and the script
    # always exited 0).
    sys.exit(0 if test_gptq_loader() else 1)