first commit

2026-03-13 12:56:43 -07:00
commit 159cf9fcfe
309 changed files with 64584 additions and 0 deletions
--- a/tools/debug_torch.py
+++ b/tools/debug_torch.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+"""
+Quick sanity check for PyTorch and CUDA setup
+"""
+import torch
+
+def check_torch_setup():
+    """Print PyTorch/CUDA diagnostics and smoke-test tensor math.
+
+    Reports the torch version, CUDA availability and the CUDA runtime the
+    build targets, and, when CUDA is available, every visible GPU with its
+    total memory.  It then runs a small matmul on the CPU and (if available)
+    on the GPU, reporting each device independently.  Nothing is returned;
+    all results go to stdout.
+    """
+    print("=== PyTorch & CUDA Debug Info ===")
+    print("torch:", torch.__version__)
+    cuda_ok = torch.cuda.is_available()
+    print("cuda available:", cuda_ok)
+    print("cuda runtime:", torch.version.cuda)
+
+    if cuda_ok:
+        print("gpu count:", torch.cuda.device_count(), "name:", torch.cuda.get_device_name(0))
+        print("cuda arch list:", torch.cuda.get_arch_list())
+        for i in range(torch.cuda.device_count()):
+            props = torch.cuda.get_device_properties(i)
+            print(f"GPU {i}: {props.name}, Memory: {props.total_memory/1024**3:.1f}GB")
+    else:
+        print("WARNING: CUDA not available - you may have CPU-only torch")
+        print("To fix: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121")
+
+    # Test basic tensor operations.  CPU and CUDA are checked separately so a
+    # GPU failure cannot hide the fact that the CPU path works.
+    try:
+        x = torch.randn(2, 2)
+        (x @ x).sum().item()
+        print("✓ Basic CPU tensor operations work")
+    except Exception as e:
+        print(f"✗ Tensor operations failed: {e}")
+        return
+
+    if cuda_ok:
+        try:
+            x_gpu = x.cuda()
+            # A host-to-device copy launches no compute kernel.  Run a matmul
+            # and read the result back with .item() so kernel/launch errors
+            # surface inside this try block.
+            (x_gpu @ x_gpu).sum().item()
+            print("✓ Basic CUDA tensor operations work")
+        except Exception as e:
+            print(f"✗ CUDA tensor operations failed: {e}")
+
+# Run the diagnostics only when this file is executed as a script.
+if __name__ == "__main__":
+    check_torch_setup()
--- a/tools/inspect_devices.py
+++ b/tools/inspect_devices.py
@@ -0,0 +1,160 @@
+import torch
+import re
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+def dtype_nbytes(dt: torch.dtype) -> float:
+    """Return the storage size, in bytes, of one element of dtype ``dt``.
+
+    4-bit integers are reported as 0.5 bytes (a pseudo value for 4-bit
+    quantization libraries), which is why the result is typed as float.
+    Dtypes missing from the table are measured from an empty scalar tensor;
+    if that is not possible either, 4 is returned.
+    """
+    sizes = {
+        torch.float32: 4,  # torch.float is the same object as torch.float32
+        torch.float16: 2, torch.bfloat16: 2,
+        torch.int8: 1, torch.uint8: 1,
+    }
+    # Not every PyTorch release defines torch.int4; look it up defensively so
+    # building the table cannot raise AttributeError.
+    int4 = getattr(torch, "int4", None)
+    if int4 is not None:
+        sizes[int4] = 0.5  # pseudo for 4-bit quant libs
+    if dt in sizes:
+        return sizes[dt]
+    try:
+        # Ask torch for the real width (e.g. 8 for float64/int64).
+        return torch.empty((), dtype=dt).element_size()
+    except (TypeError, RuntimeError):
+        return 4  # fallback for dtypes torch cannot materialize
+
+def pretty_bytes(n: float) -> str:
+    """Format a byte count with two decimals and a 1024-based unit.
+
+    The value is divided by 1024 until it drops below 1024 or the largest
+    unit (TB) is reached, so 1024 TB and above are still expressed in TB.
+    """
+    units = ("B", "KB", "MB", "GB", "TB")
+    idx = 0
+    while idx < len(units) - 1 and not n < 1024:
+        n /= 1024
+        idx += 1
+    return f"{n:.2f} {units[idx]}"
+
+def inspect_model_devices(model_path_or_id: str) -> str:
+    """Load a causal-LM checkpoint and report where its parameters ended up.
+
+    The model is loaded with ``device_map="auto"`` so the report reflects the
+    placement chosen by the loader rather than one forced here.  The report
+    lists the first 20 ``hf_device_map`` entries (when the attribute exists),
+    bytes per device and per dtype, parameters left on the ``meta`` device,
+    memory usage of every visible CUDA device, and a summary verdict.
+
+    Args:
+        model_path_or_id: Local path or hub id passed to ``from_pretrained``.
+
+    Returns:
+        The report as a newline-joined string.  Exceptions are not raised;
+        they are appended to the report as an error line.
+    """
+    output = []
+    model = None
+
+    try:
+        output.append(f"=== Inspecting Model: {model_path_or_id} ===\n")
+
+        # Load model as-is (don't force a map yet—show reality)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path_or_id,
+            torch_dtype="auto",
+            device_map="auto",
+            low_cpu_mem_usage=True,
+        )
+
+        has_map = hasattr(model, "hf_device_map")
+        output.append(f">>> hf_device_map present: {has_map}")
+        if has_map:
+            entries = list(model.hf_device_map.items())
+            output.append(">>> device_map (first 20 entries):")
+            for k, v in entries[:20]:
+                output.append(f"  {k:40s} -> {v}")
+            if len(entries) > 20:
+                output.append(f"  ... and {len(entries) - 20} more entries")
+
+        # Single pass over the parameters; everything below is derived from
+        # these three accumulators.
+        totals = {}
+        by_dtype = {}
+        on_meta = []
+        for n, p in model.named_parameters():
+            dev = str(p.device)
+            nbytes = p.numel() * p.element_size()
+            totals[dev] = totals.get(dev, 0) + nbytes
+            by_dtype[p.dtype] = by_dtype.get(p.dtype, 0) + nbytes
+            if dev == "meta":
+                on_meta.append(n)
+
+        output.append("\n=== Bytes by device ===")
+        for dev, b in totals.items():
+            output.append(f"  {dev:10s} : {pretty_bytes(b)}")
+
+        output.append("\n=== Bytes by dtype ===")
+        for dt, b in by_dtype.items():
+            output.append(f"  {str(dt):12s} : {pretty_bytes(b)}")
+
+        if on_meta:
+            output.append(f"\n⚠️  WARNING: {len(on_meta)} parameters on META (not really loaded). Examples:")
+            for n in on_meta[:10]:
+                output.append(f"    - {n}")
+            if len(on_meta) > 10:
+                output.append(f"    ... and {len(on_meta) - 10} more")
+
+        if torch.cuda.is_available():
+            output.append("\n=== CUDA Memory ===")
+            # device_map="auto" may spread the model over several GPUs, so
+            # report each device with its real index instead of only the
+            # current device under a fixed "cuda:0" label.
+            for idx in range(torch.cuda.device_count()):
+                free, total = torch.cuda.mem_get_info(idx)
+                used = total - free
+                output.append(f"  Used: {pretty_bytes(used)} / Total: {pretty_bytes(total)} on cuda:{idx}")
+                output.append(f"  Free: {pretty_bytes(free)} ({(free/total)*100:.1f}%)")
+        else:
+            output.append("\n❌ CUDA not available.")
+
+        # Quick check if fully on GPU (a meta parameter is by definition not
+        # on CUDA, so one test over the device names covers both conditions).
+        all_cuda = all(dev.startswith("cuda") for dev in totals)
+
+        output.append("\n=== Summary ===")
+        if all_cuda:
+            output.append("✅ All parameters are on CUDA")
+        else:
+            output.append("❌ Model is NOT fully on GPU")
+            if on_meta:
+                output.append("   - Some parameters are on META device")
+            # Name the actual off-GPU devices rather than assuming CPU: the
+            # stragglers may be solely on the meta device.
+            others = sorted(d for d in totals if d != "meta" and not d.startswith("cuda"))
+            if others:
+                output.append(f"   - Some parameters are on {', '.join(d.upper() for d in others)}")
+
+    except Exception as e:
+        output.append(f"❌ Error inspecting model: {str(e)}")
+    finally:
+        # Drop the model and release cached CUDA blocks even when the report
+        # failed midway.
+        model = None
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+    return "\n".join(output)
+
+def inspect_loaded_model(model) -> str:
+    """Build a text report on where an in-memory model's parameters live.
+
+    Walks ``model.named_parameters()`` and totals the bytes held per device
+    and per dtype, names up to five parameters found on the ``meta`` device,
+    adds the figures from ``torch.cuda.mem_get_info()`` when CUDA is
+    available, and ends with a verdict on whether every parameter is on a
+    CUDA device.  Any exception is caught and reported as an error line in
+    the returned text instead of being raised.
+    """
+    lines = []
+
+    try:
+        lines.append("=== Inspecting Currently Loaded Model ===\n")
+
+        bytes_per_device = {}
+        bytes_per_dtype = {}
+        meta_names = []
+        for name, param in model.named_parameters():
+            size = param.numel() * param.element_size()
+            device = str(param.device)
+            bytes_per_device[device] = bytes_per_device.get(device, 0) + size
+            bytes_per_dtype[param.dtype] = bytes_per_dtype.get(param.dtype, 0) + size
+            if device == "meta":
+                meta_names.append(name)
+
+        lines.append("=== Bytes by device ===")
+        for device, size in bytes_per_device.items():
+            lines.append(f"  {device:10s} : {pretty_bytes(size)}")
+
+        lines.append("\n=== Bytes by dtype ===")
+        for dtype, size in bytes_per_dtype.items():
+            lines.append(f"  {str(dtype):12s} : {pretty_bytes(size)}")
+
+        if meta_names:
+            lines.append(f"\n⚠️  WARNING: {len(meta_names)} parameters on META. Examples:")
+            for name in meta_names[:5]:
+                lines.append(f"    - {name}")
+
+        if torch.cuda.is_available():
+            free, total = torch.cuda.mem_get_info()
+            lines.append("\n=== CUDA Memory ===")
+            lines.append(f"  Used: {pretty_bytes(total - free)} / Total: {pretty_bytes(total)}")
+            lines.append(f"  Free: {pretty_bytes(free)} ({(free/total)*100:.1f}%)")
+
+        # Every parameter's device is a key of bytes_per_device, so the
+        # verdict can be read from the keys alone.
+        fully_on_gpu = (
+            "meta" not in bytes_per_device
+            and all(device.startswith("cuda") for device in bytes_per_device)
+        )
+
+        lines.append("\n=== Summary ===")
+        if fully_on_gpu:
+            lines.append("✅ All parameters are on CUDA")
+        else:
+            lines.append("❌ Model is NOT fully on GPU")
+
+    except Exception as err:
+        lines.append(f"❌ Error: {str(err)}")
+
+    return "\n".join(lines)
+
+if __name__ == "__main__":
+    # CLI entry point: the first positional argument is the model path or id.
+    import sys
+
+    cli_args = sys.argv[1:]
+    if not cli_args:
+        print("Usage: python inspect_devices.py <model_path_or_id>")
+    else:
+        print(inspect_model_devices(cli_args[0]))
--- a/tools/test_gptq.py
+++ b/tools/test_gptq.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""
+Test GPTQ model loader (requires auto-gptq package)
+"""
+import os
+import traceback
+
+def test_gptq_loader(model_id="TheBloke/Llama-2-7B-Chat-GPTQ", device=0):
+    """Load a GPTQ-quantized model on one GPU and run a short greedy generation.
+
+    Args:
+        model_id: Hub id or local path of the GPTQ checkpoint.
+        device: Integer CUDA device index handed to ``from_quantized``; the
+            prompt tensors are moved to the same device.
+
+    Returns:
+        True if loading and generation succeeded, False otherwise.  Errors
+        are printed (with a traceback for non-import failures), never raised.
+    """
+    print(f"=== Testing GPTQ Loader: {model_id} ===")
+
+    # Only a failed auto_gptq import gets the "install auto-gptq" hint; other
+    # ImportErrors (transformers, optional backends hit while loading) fall
+    # through to the generic handler below with a full traceback.
+    try:
+        from auto_gptq import AutoGPTQForCausalLM
+    except ImportError as e:
+        print(f"✗ auto-gptq not available: {e}")
+        print("Install with: pip install auto-gptq")
+        return False
+
+    try:
+        from transformers import AutoTokenizer
+
+        hf_token = os.getenv("HF_TOKEN")
+
+        # Load tokenizer
+        print("Loading tokenizer...")
+        tok = AutoTokenizer.from_pretrained(
+            model_id,
+            use_fast=True,
+            token=hf_token,
+        )
+        print("✓ Tokenizer loaded")
+
+        # Load GPTQ model with integer device
+        print("Loading GPTQ model...")
+        # NOTE(review): trust_remote_code=True executes Python shipped in the
+        # model repository; only use with repositories you trust.
+        model = AutoGPTQForCausalLM.from_quantized(
+            model_id,
+            device=device,             # integer index, not "cuda:0"
+            use_safetensors=True,
+            trust_remote_code=True,
+            token=hf_token,
+        )
+        print(f"✓ GPTQ model loaded on GPU {device}")
+
+        # Test generation; inputs go to the same GPU index as the model.
+        prompt = "The benefits of GPU inference are"
+        inputs = tok(prompt, return_tensors="pt").to(f"cuda:{device}")
+
+        print("Testing generation...")
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=16,
+            do_sample=False,
+            pad_token_id=tok.eos_token_id,
+        )
+
+        result = tok.decode(outputs[0], skip_special_tokens=True)
+        print("✓ Generation test passed:")
+        print(f"Output: {result}")
+
+        return True
+
+    except Exception as e:
+        print(f"✗ Error: {type(e).__name__} - {e}")
+        traceback.print_exc()
+        return False
+
+if __name__ == "__main__":
+    # Print the environment first so a failing run shows which torch build
+    # was in use.
+    import torch
+
+    print("torch:", torch.__version__)
+    print("cuda available:", torch.cuda.is_available())
+    print()
+
+    # Exit non-zero when the loader test fails so shells and CI can detect it.
+    raise SystemExit(0 if test_gptq_loader() else 1)
--- a/tools/test_meta_fp16.py
+++ b/tools/test_meta_fp16.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""
+Minimal loader test for Meta's FP16/BF16 Llama models (no GPTQ)
+"""
+import os
+import traceback
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+def test_meta_fp16_loader(model_id="meta-llama/Meta-Llama-3.1-8B"):
+    """Load an unquantized Llama checkpoint with device_map="auto" and generate.
+
+    Args:
+        model_id: Hub id or local path of the model; defaults to Meta's
+            Llama 3.1 8B repository.
+
+    Returns:
+        True if the tokenizer and model loaded and a 16-token greedy
+        generation completed, False otherwise.  Errors are printed together
+        with a traceback, never raised.
+    """
+    print(f"=== Testing Meta FP16 Loader: {model_id} ===")
+
+    try:
+        # Imported locally: at module level torch is only imported inside the
+        # __main__ guard, so without this the torch.no_grad() call below
+        # raises NameError whenever the function is called from an importing
+        # module.
+        import torch
+
+        hf_token = os.getenv("HF_TOKEN")
+
+        # Load tokenizer
+        print("Loading tokenizer...")
+        tok = AutoTokenizer.from_pretrained(
+            model_id,
+            use_fast=True,
+            token=hf_token,
+        )
+        print("✓ Tokenizer loaded")
+
+        # Load model with device_map="auto"
+        print("Loading model...")
+        # NOTE(review): trust_remote_code=True executes Python shipped in the
+        # model repository; only use with repositories you trust.
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype="auto",
+            device_map="auto",          # let accelerate place it
+            trust_remote_code=True,
+            token=hf_token,
+        )
+
+        # Check device placement; the prompt is sent to the device of the
+        # first parameter.
+        device = next(model.parameters()).device
+        print(f"✓ Model loaded on device: {device}")
+
+        # Test generation
+        prompt = "The benefits of GPU inference are"
+        inputs = tok(prompt, return_tensors="pt").to(device)
+
+        print("Testing generation...")
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=16,
+                do_sample=False,
+                pad_token_id=tok.eos_token_id,
+            )
+
+        result = tok.decode(outputs[0], skip_special_tokens=True)
+        print("✓ Generation test passed:")
+        print(f"Output: {result}")
+
+        return True
+
+    except Exception as e:
+        print(f"✗ Error: {type(e).__name__} - {e}")
+        traceback.print_exc()
+        return False
+
+if __name__ == "__main__":
+    # Print the environment first so a failing run shows which torch build
+    # was in use.
+    import torch
+
+    print("torch:", torch.__version__)
+    print("cuda available:", torch.cuda.is_available())
+    print()
+
+    # Exit non-zero when the loader test fails so shells and CI can detect it.
+    raise SystemExit(0 if test_meta_fp16_loader() else 1)