first commit
This commit is contained in:
35
tools/debug_torch.py
Normal file
35
tools/debug_torch.py
Normal file
@@ -0,0 +1,35 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick sanity check for PyTorch and CUDA setup
|
||||
"""
|
||||
import torch
|
||||
|
||||
def check_torch_setup():
    """Print a diagnostic report about the local PyTorch / CUDA installation.

    The report lists the torch version, whether CUDA is usable, and the CUDA
    runtime version torch reports. When CUDA is available every visible GPU is
    listed with its total memory; otherwise an installation hint is printed.
    A tiny tensor smoke test runs last. Nothing is returned.
    """
    print("=== PyTorch & CUDA Debug Info ===")
    print("torch:", torch.__version__)
    has_cuda = torch.cuda.is_available()
    print("cuda available:", has_cuda)
    print("cuda runtime:", torch.version.cuda)

    if not has_cuda:
        print("WARNING: CUDA not available - you may have CPU-only torch")
        print("To fix: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121")
    else:
        gpu_total = torch.cuda.device_count()
        print("gpu count:", gpu_total, "name:", torch.cuda.get_device_name(0))
        print("cuda arch list:", torch.cuda.get_arch_list())
        for index in range(gpu_total):
            info = torch.cuda.get_device_properties(index)
            mem_gb = info.total_memory / 1024**3
            print(f"GPU {index}: {info.name}, Memory: {mem_gb:.1f}GB")

    # Test basic tensor operations
    try:
        sample = torch.randn(2, 2)
        if has_cuda:
            # Copying to the GPU is the whole check; the copy itself is discarded.
            sample.cuda()
            print("✓ Basic CUDA tensor operations work")
        print("✓ Basic CPU tensor operations work")
    except Exception as exc:
        print(f"✗ Tensor operations failed: {exc}")
|
||||
|
||||
# Script entry point: print the diagnostic report when executed directly.
if __name__ == "__main__":
    check_torch_setup()
|
||||
160
tools/inspect_devices.py
Normal file
160
tools/inspect_devices.py
Normal file
@@ -0,0 +1,160 @@
|
||||
import torch
|
||||
import re
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
def dtype_nbytes(dt: torch.dtype) -> float:
    """Return the storage size, in bytes, of one element of dtype ``dt``.

    Common dtypes are looked up in a fixed table. The 4-bit entry is added
    only when the installed torch defines ``torch.int4``: not every torch
    build does, and referencing it unconditionally made every call fail with
    ``AttributeError`` on those builds. That entry maps to 0.5, which is why
    the return type is ``float`` rather than ``int``.

    Dtypes missing from the table report their own ``itemsize`` when the
    dtype object has that attribute, and fall back to 4 bytes otherwise.
    """
    sizes = {
        torch.float32: 4,  # torch.float is the same dtype object as torch.float32
        torch.float16: 2,
        torch.bfloat16: 2,
        torch.int8: 1,
        torch.uint8: 1,
    }

    int4 = getattr(torch, "int4", None)
    if int4 is not None:
        sizes[int4] = 0.5  # pseudo for 4-bit quant libs

    if dt in sizes:
        return sizes[dt]
    # Prefer the real element size over a blanket guess (e.g. float64 is 8 bytes).
    return getattr(dt, "itemsize", 4)
|
||||
|
||||
def pretty_bytes(n: float) -> str:
    """Format a byte count with two decimals and a binary-scaled unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, so anything of a pebibyte or more is still shown
    in TB.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    last = len(units) - 1
    idx = 0
    # "not n < 1024" (rather than "n >= 1024") keeps the NaN behaviour of the
    # original comparison.
    while idx < last and not n < 1024:
        n /= 1024
        idx += 1
    return f"{n:.2f} {units[idx]}"
|
||||
|
||||
def inspect_model_devices(model_path_or_id: str) -> str:
    """Load a causal-LM checkpoint and report where its parameters ended up.

    The model is loaded with ``device_map="auto"`` and ``torch_dtype="auto"``.
    The report then covers the ``hf_device_map`` (first 20 entries) when the
    model has one, parameter bytes per device and per dtype, any parameters
    left on the meta device, CUDA memory usage, and a summary verdict.

    Args:
        model_path_or_id: Value passed straight to
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        The report as one newline-joined string. Exceptions are not raised;
        they are appended to the report as an error line.
    """
    output = []
    model = None

    try:
        output.append(f"=== Inspecting Model: {model_path_or_id} ===\n")

        # Placement is delegated via device_map="auto"; the report below shows
        # what that placement actually turned out to be.
        model = AutoModelForCausalLM.from_pretrained(
            model_path_or_id,
            torch_dtype="auto",
            device_map="auto",
            low_cpu_mem_usage=True,
        )

        has_map = hasattr(model, "hf_device_map")
        output.append(f">>> hf_device_map present: {has_map}")
        if has_map:
            device_map = model.hf_device_map
            output.append(">>> device_map (first 20 entries):")
            for k, v in list(device_map.items())[:20]:
                output.append(f" {k:40s} -> {v}")
            if len(device_map) > 20:
                output.append(f" ... and {len(device_map) - 20} more entries")

        # Single pass over the parameters; everything below derives from it.
        totals = {}
        by_dtype = {}
        on_meta = []
        for name, param in model.named_parameters():
            dev = str(param.device)
            nbytes = param.numel() * param.element_size()
            totals[dev] = totals.get(dev, 0) + nbytes
            by_dtype[param.dtype] = by_dtype.get(param.dtype, 0) + nbytes
            if dev == "meta":
                on_meta.append(name)

        output.append("\n=== Bytes by device ===")
        for dev, b in totals.items():
            output.append(f" {dev:10s} : {pretty_bytes(b)}")

        output.append("\n=== Bytes by dtype ===")
        for dt, b in by_dtype.items():
            output.append(f" {str(dt):12s} : {pretty_bytes(b)}")

        if on_meta:
            output.append(f"\n⚠️ WARNING: {len(on_meta)} parameters on META (not really loaded). Examples:")
            for name in on_meta[:10]:
                output.append(f" - {name}")
            if len(on_meta) > 10:
                output.append(f" ... and {len(on_meta) - 10} more")

        if torch.cuda.is_available():
            free, total = torch.cuda.mem_get_info()
            used = total - free
            output.append(f"\n=== CUDA Memory ===")
            output.append(f" Used: {pretty_bytes(used)} / Total: {pretty_bytes(total)} on cuda:0")
            output.append(f" Free: {pretty_bytes(free)} ({(free/total)*100:.1f}%)")
        else:
            output.append("\n❌ CUDA not available.")

        # Quick check if fully on GPU: any device name not starting with
        # "cuda" (cpu, meta, ...) means the model is not fully GPU-resident.
        off_gpu = [dev for dev in totals if not dev.startswith("cuda")]

        output.append(f"\n=== Summary ===")
        if not off_gpu:
            output.append("✅ All parameters are on CUDA")
        else:
            output.append("❌ Model is NOT fully on GPU")
            if on_meta:
                output.append(" - Some parameters are on META device")
            # Only claim CPU residency when a CPU device was actually seen;
            # previously meta-only offload was also reported as "on CPU".
            if any(dev.startswith("cpu") for dev in off_gpu):
                output.append(" - Some parameters are on CPU")
            other = [dev for dev in off_gpu if dev != "meta" and not dev.startswith("cpu")]
            if other:
                output.append(f" - Some parameters are on: {', '.join(other)}")

    except Exception as e:
        output.append(f"❌ Error inspecting model: {str(e)}")
    finally:
        # Clean up model to free memory, also when the inspection failed
        # part-way through after the model had been loaded.
        model = None
        try:
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            # Keep the "never raises" contract even if the cache release fails.
            output.append(f"⚠️ WARNING: could not release cached CUDA memory: {e}")

    return "\n".join(output)
|
||||
|
||||
def inspect_loaded_model(model) -> str:
    """Report device and dtype placement for a model that is already loaded.

    Walks ``model.named_parameters()`` and reports parameter bytes per device
    and per dtype, up to five parameters found on the meta device, CUDA
    memory usage when CUDA is available, and a one-line verdict.

    Returns:
        The report as one newline-joined string. Exceptions are not raised;
        they are appended to the report as an error line.
    """
    report = []

    try:
        report.append("=== Inspecting Currently Loaded Model ===\n")

        bytes_per_device = {}
        bytes_per_dtype = {}
        meta_names = []

        for name, param in model.named_parameters():
            where = str(param.device)
            size = param.numel() * param.element_size()
            bytes_per_device[where] = bytes_per_device.get(where, 0) + size
            bytes_per_dtype[param.dtype] = bytes_per_dtype.get(param.dtype, 0) + size
            if where == "meta":
                meta_names.append(name)

        report.append("=== Bytes by device ===")
        for where, size in bytes_per_device.items():
            report.append(f" {where:10s} : {pretty_bytes(size)}")

        report.append("\n=== Bytes by dtype ===")
        for kind, size in bytes_per_dtype.items():
            report.append(f" {str(kind):12s} : {pretty_bytes(size)}")

        if meta_names:
            report.append(f"\n⚠️ WARNING: {len(meta_names)} parameters on META. Examples:")
            for name in meta_names[:5]:
                report.append(f" - {name}")

        if torch.cuda.is_available():
            free, total = torch.cuda.mem_get_info()
            used = total - free
            report.append("\n=== CUDA Memory ===")
            report.append(f" Used: {pretty_bytes(used)} / Total: {pretty_bytes(total)}")
            report.append(f" Free: {pretty_bytes(free)} ({(free/total)*100:.1f}%)")

        # Quick check: every device seen must be a CUDA device and nothing
        # may be left on meta.
        fully_on_gpu = (
            all(where.startswith("cuda") for where in bytes_per_device)
            and not meta_names
        )

        report.append("\n=== Summary ===")
        if fully_on_gpu:
            report.append("✅ All parameters are on CUDA")
        else:
            report.append("❌ Model is NOT fully on GPU")

    except Exception as exc:
        report.append(f"❌ Error: {str(exc)}")

    return "\n".join(report)
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # The first positional argument is the model path or id; any further
    # arguments are ignored.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python inspect_devices.py <model_path_or_id>")
    else:
        print(inspect_model_devices(cli_args[0]))
|
||||
69
tools/test_gptq.py
Normal file
69
tools/test_gptq.py
Normal file
@@ -0,0 +1,69 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test GPTQ model loader (requires auto-gptq package)
|
||||
"""
|
||||
import os
|
||||
import traceback
|
||||
|
||||
def test_gptq_loader(model_id="TheBloke/Llama-2-7B-Chat-GPTQ"):
    """Smoke-test loading a GPTQ checkpoint on GPU 0 and generating from it.

    Loads the tokenizer and the quantized model (reading the ``HF_TOKEN``
    environment variable for both), then runs a greedy 16-token generation.

    Returns:
        True when every step completes; False after printing the error when
        an import fails or any other exception is raised.
    """
    print(f"=== Testing GPTQ Loader: {model_id} ===")

    try:
        from auto_gptq import AutoGPTQForCausalLM
        from transformers import AutoTokenizer

        # Load tokenizer
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            use_fast=True,
            token=os.getenv("HF_TOKEN"),
        )
        print("✓ Tokenizer loaded")

        # Load GPTQ model with integer device
        print("Loading GPTQ model...")
        quantized = AutoGPTQForCausalLM.from_quantized(
            model_id,
            device=0,  # integer index, not "cuda:0"
            use_safetensors=True,
            trust_remote_code=True,
            token=os.getenv("HF_TOKEN"),
        )
        print("✓ GPTQ model loaded on GPU 0")

        # Test generation
        encoded = tokenizer("The benefits of GPU inference are", return_tensors="pt").to("cuda")

        print("Testing generation...")
        generated = quantized.generate(
            **encoded,
            max_new_tokens=16,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

        text = tokenizer.decode(generated[0], skip_special_tokens=True)
        print("✓ Generation test passed:")
        print(f"Output: {text}")

    except ImportError as err:
        print(f"✗ auto-gptq not available: {err}")
        print("Install with: pip install auto-gptq")
        return False
    except Exception as err:
        print(f"✗ Error: {type(err).__name__} - {err}")
        traceback.print_exc()
        return False
    else:
        return True
|
||||
|
||||
if __name__ == "__main__":
    import torch

    # Show the environment first so a failure below is easier to interpret.
    print(f"torch: {torch.__version__}")
    print(f"cuda available: {torch.cuda.is_available()}")
    print()

    test_gptq_loader()
|
||||
69
tools/test_meta_fp16.py
Normal file
69
tools/test_meta_fp16.py
Normal file
@@ -0,0 +1,69 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Minimal loader test for Meta's FP16/BF16 Llama models (no GPTQ)
|
||||
"""
|
||||
import os
|
||||
import traceback
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
def test_meta_fp16_loader():
    """Smoke-test loading a Llama checkpoint with ``device_map="auto"``.

    Loads the tokenizer and model for ``MODEL_ID`` (reading the ``HF_TOKEN``
    environment variable for both), prints the device of the first
    parameter, then runs a greedy 16-token generation under
    ``torch.no_grad()``.

    Returns:
        True when every step completes; False after printing the error and
        traceback when any exception is raised.
    """
    MODEL_ID = "meta-llama/Meta-Llama-3.1-8B"  # Meta's repo

    print(f"=== Testing Meta FP16 Loader: {MODEL_ID} ===")

    try:
        # torch is otherwise only imported inside this file's __main__ guard,
        # so calling this function from an importing module used to fail with
        # NameError at torch.no_grad(). Import it here so the function stands
        # on its own.
        import torch

        # Load tokenizer
        print("Loading tokenizer...")
        tok = AutoTokenizer.from_pretrained(
            MODEL_ID,
            use_fast=True,
            token=os.getenv("HF_TOKEN"),
        )
        print("✓ Tokenizer loaded")

        # Load model with device_map="auto"
        print("Loading model...")
        # NOTE(review): trust_remote_code=True allows code shipped with the
        # checkpoint to run; confirm it is actually needed for this repo.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype="auto",
            device_map="auto",  # let accelerate place it
            trust_remote_code=True,
            token=os.getenv("HF_TOKEN"),
        )

        # Check device placement; inputs are moved to the first parameter's device.
        device = next(model.parameters()).device
        print(f"✓ Model loaded on device: {device}")

        # Test generation
        prompt = "The benefits of GPU inference are"
        inputs = tok(prompt, return_tensors="pt").to(device)

        print("Testing generation...")
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=16,
                do_sample=False,
                pad_token_id=tok.eos_token_id,
            )

        result = tok.decode(outputs[0], skip_special_tokens=True)
        print("✓ Generation test passed:")
        print(f"Output: {result}")

        return True

    except Exception as e:
        print(f"✗ Error: {type(e).__name__} - {e}")
        traceback.print_exc()
        return False
|
||||
|
||||
if __name__ == "__main__":
    import torch

    # Show the environment first so a failure below is easier to interpret.
    print(f"torch: {torch.__version__}")
    print(f"cuda available: {torch.cuda.is_available()}")
    print()

    test_meta_fp16_loader()
|
||||
Reference in New Issue
Block a user