llm_runtime/chat_session.py

"""
Chat Session Management for KV Caching

This module provides chat session management with persistent KV cache
for efficient multi-turn conversations across different model formats.
"""

from typing import Any, Dict, List, Optional, Tuple, Union
from dataclasses import dataclass
import threading


@dataclass
class ChatSessionConfig:
    """Configuration for chat sessions"""
    max_context_length: int = 4096
    cache_warmup: bool = True  # Pre-fill system prompt
    streaming: bool = True
    

class ChatSession:
    """
    Manages persistent KV cache state for multi-turn conversations.
    
    Supports different model formats:
    - GGUF models: Rely on llama-cpp-python's built-in caching
    - Transformers models: Manual KV cache management with past_key_values
    """
    
    def __init__(self, session_id: str, config: ChatSessionConfig = None):
        self.session_id = session_id
        self.config = config or ChatSessionConfig()
        self.lock = threading.RLock()
        
        # KV cache state (transformers models)
        self.past_key_values: Optional[Tuple] = None
        self.cached_input_ids: Optional[List[int]] = None
        self.context_length: int = 0
        
        # Conversation history
        self.messages: List[Dict[str, str]] = []
        self.system_prompt: Optional[str] = None
        
        # State tracking
        self.is_prefilled: bool = False
        self.last_prompt_hash: Optional[str] = None
        self.last_prompt: Optional[str] = None
        
    def add_message(self, role: str, content: str) -> None:
        """Add a message to the conversation history"""
        with self.lock:
            if role == "system" and not self.messages:
                self.system_prompt = content
            self.messages.append({"role": role, "content": content})
    
    def clear_messages(self) -> None:
        """Clear conversation history but preserve system prompt"""
        with self.lock:
            if self.system_prompt:
                self.messages = [{"role": "system", "content": self.system_prompt}]
            else:
                self.messages = []
            self.invalidate_cache()
    
    def invalidate_cache(self) -> None:
        """Invalidate KV cache - forces re-prefill on next generation"""
        with self.lock:
            self.past_key_values = None
            self.cached_input_ids = None
            self.context_length = 0
            self.is_prefilled = False
            self.last_prompt_hash = None
            self.last_prompt = None
    
    def should_invalidate(self, new_prompt: str, tokenizer=None) -> bool:
        """Check if cache should be invalidated based on prompt changes with proper token-level validation"""
        import hashlib
        
        current_hash = hashlib.md5(new_prompt.encode()).hexdigest()
        
        # Always invalidate if no cache exists
        if self.past_key_values is None:
            return True
            
        # Always invalidate if we don't have a tokenizer for proper validation
        if tokenizer is None:
            return True
            
        # Always invalidate if no previous prompt exists
        if not hasattr(self, 'last_prompt') or self.last_prompt is None:
            return True
            
        # Check for exact token-level prefix match
        if self._has_valid_token_prefix(new_prompt, self.last_prompt, tokenizer):
            # Additional safety checks for turn boundaries
            if self._violates_turn_boundaries(new_prompt, self.last_prompt):
                print("[KV_CACHE] Invalidating cache: turn boundary violation detected")
                return True
            return False
        
        # Invalidate if no valid prefix match
        print("[KV_CACHE] Invalidating cache: no valid token prefix match")
        return True
    
    def _has_valid_token_prefix(self, new_prompt: str, old_prompt: str, tokenizer) -> bool:
        """Check if new prompt has exact token-level prefix match with cached prompt"""
        try:
            # Tokenize both prompts
            old_tokens = tokenizer.encode(old_prompt, add_special_tokens=False)
            new_tokens = tokenizer.encode(new_prompt, add_special_tokens=False)
            
            # New prompt must be longer than or equal to old
            if len(new_tokens) < len(old_tokens):
                return False
                
            # Check exact token-by-token match for the prefix
            for i, (old_token, new_token) in enumerate(zip(old_tokens, new_tokens)):
                if old_token != new_token:
                    print(f"[KV_CACHE] Token mismatch at position {i}: old={old_token}, new={new_token}")
                    return False
            
            return True
        except Exception as e:
            print(f"[KV_CACHE] Error in token prefix validation: {e}")
            return False
    
    def _violates_turn_boundaries(self, new_prompt: str, old_prompt: str) -> bool:
        """Check if the prompt violates turn boundaries (conversation integrity)"""
        # Look for end-of-turn markers that indicate conversation corruption
        eot_markers = ["<|eot_id|>", "<|end_of_turn|>", "</s>", "<|endoftext|>"]
        
        # If the old prompt ended with an EOT marker, we should start fresh
        for marker in eot_markers:
            if old_prompt.rstrip().endswith(marker):
                # Check if new prompt is a proper continuation (should start with User: or similar)
                continuation_part = new_prompt[len(old_prompt):].lstrip()
                if not (continuation_part.startswith("User:") or continuation_part.startswith("Human:") or continuation_part.startswith("\nUser:") or continuation_part.startswith("\nHuman:")):
                    return True
        
        # Check for conversation format corruption (duplicate roles, malformed structure)
        if self._has_conversation_corruption(new_prompt):
            return True
            
        return False
    
    def _has_conversation_corruption(self, prompt: str) -> bool:
        """Detect conversation format corruption that indicates cache should be invalidated"""
        # Look for signs of corrupted conversation format
        corruption_patterns = [
            "Assistant: Assistant:",  # Duplicate assistant labels
            "User: User:",  # Duplicate user labels  
            "Assistant: User",  # Role confusion
            "User: Assistant:",  # Role confusion
            "of of of",  # Repetitive token generation (sign of corruption)
            "151 of 151",  # Specific corruption pattern we observed
        ]
        
        for pattern in corruption_patterns:
            if pattern in prompt:
                print(f"[KV_CACHE] Detected conversation corruption: '{pattern}'")
                return True
                
        return False
    
    def update_cache(self, past_key_values: Tuple, input_ids: List[int], prompt: str) -> None:
        """Update the KV cache state with turn boundary validation"""
        import hashlib
        
        with self.lock:
            # Validate that we're ending on a complete turn boundary
            if not self._is_complete_turn(prompt):
                print("[KV_CACHE] Warning: Caching incomplete turn - this may cause issues")
            
            self.past_key_values = past_key_values
            self.cached_input_ids = input_ids
            self.context_length = len(input_ids)
            self.is_prefilled = True
            self.last_prompt_hash = hashlib.md5(prompt.encode()).hexdigest()
            self.last_prompt = prompt  # Store the actual prompt for comparison
    
    def _is_complete_turn(self, prompt: str) -> bool:
        """Check if the prompt ends on a complete turn boundary"""
        # Look for proper turn endings
        prompt_trimmed = prompt.rstrip()
        
        # Should end with Assistant response, not mid-generation
        valid_endings = [
            "</s>",
            "<|eot_id|>", 
            "<|end_of_turn|>",
            "<|endoftext|>",
        ]
        
        # Or should end with a complete sentence/response
        if any(prompt_trimmed.endswith(ending) for ending in valid_endings):
            return True
            
        # For non-marked conversations, check if it looks like a complete response
        # (ends with punctuation and doesn't look cut off)
        if prompt_trimmed.endswith(('.', '!', '?', ':', ';')):
            return True
            
        # If it ends with "Assistant:" it's ready for generation
        if prompt_trimmed.endswith("Assistant:"):
            return True
        
        return False
    
    def get_cache_info(self) -> Dict[str, Any]:
        """Get information about current cache state"""
        with self.lock:
            return {
                "session_id": self.session_id,
                "has_cache": self.past_key_values is not None,
                "context_length": self.context_length,
                "is_prefilled": self.is_prefilled,
                "message_count": len(self.messages),
                "max_context": self.config.max_context_length
            }


class ChatSessionManager:
    """Manages multiple chat sessions"""
    
    def __init__(self):
        self.sessions: Dict[str, ChatSession] = {}
        self.lock = threading.RLock()
        self.default_session_id = "default"
    
    def get_session(self, session_id: str = None, config: ChatSessionConfig = None) -> ChatSession:
        """Get or create a chat session"""
        if session_id is None:
            session_id = self.default_session_id
            
        with self.lock:
            if session_id not in self.sessions:
                self.sessions[session_id] = ChatSession(session_id, config)
            return self.sessions[session_id]
    
    def clear_session(self, session_id: str = None) -> None:
        """Clear a specific session"""
        if session_id is None:
            session_id = self.default_session_id
            
        with self.lock:
            if session_id in self.sessions:
                self.sessions[session_id].invalidate_cache()
                self.sessions[session_id].clear_messages()
    
    def remove_session(self, session_id: str) -> None:
        """Remove a session completely"""
        with self.lock:
            if session_id in self.sessions:
                del self.sessions[session_id]
    
    def get_all_sessions(self) -> List[str]:
        """Get list of all session IDs"""
        with self.lock:
            return list(self.sessions.keys())


# Global session manager instance
_session_manager = ChatSessionManager()


def get_session_manager() -> ChatSessionManager:
    """Get the global chat session manager"""
    return _session_manager
first commit 2026-03-13 12:56:43 -07:00			`"""`
			`Chat Session Management for KV Caching`

			`This module provides chat session management with persistent KV cache`
			`for efficient multi-turn conversations across different model formats.`
			`"""`

			`from typing import Any, Dict, List, Optional, Tuple, Union`
			`from dataclasses import dataclass`
			`import threading`


			`@dataclass`
			`class ChatSessionConfig:`
			`"""Configuration for chat sessions"""`
			`max_context_length: int = 4096`
			`cache_warmup: bool = True # Pre-fill system prompt`
			`streaming: bool = True`


			`class ChatSession:`
			`"""`
			`Manages persistent KV cache state for multi-turn conversations.`

			`Supports different model formats:`
			`- GGUF models: Rely on llama-cpp-python's built-in caching`
			`- Transformers models: Manual KV cache management with past_key_values`
			`"""`

			`def __init__(self, session_id: str, config: ChatSessionConfig = None):`
			`self.session_id = session_id`
			`self.config = config or ChatSessionConfig()`
			`self.lock = threading.RLock()`

			`# KV cache state (transformers models)`
			`self.past_key_values: Optional[Tuple] = None`
			`self.cached_input_ids: Optional[List[int]] = None`
			`self.context_length: int = 0`

			`# Conversation history`
			`self.messages: List[Dict[str, str]] = []`
			`self.system_prompt: Optional[str] = None`

			`# State tracking`
			`self.is_prefilled: bool = False`
			`self.last_prompt_hash: Optional[str] = None`
			`self.last_prompt: Optional[str] = None`

			`def add_message(self, role: str, content: str) -> None:`
			`"""Add a message to the conversation history"""`
			`with self.lock:`
			`if role == "system" and not self.messages:`
			`self.system_prompt = content`
			`self.messages.append({"role": role, "content": content})`

			`def clear_messages(self) -> None:`
			`"""Clear conversation history but preserve system prompt"""`
			`with self.lock:`
			`if self.system_prompt:`
			`self.messages = [{"role": "system", "content": self.system_prompt}]`
			`else:`
			`self.messages = []`
			`self.invalidate_cache()`

			`def invalidate_cache(self) -> None:`
			`"""Invalidate KV cache - forces re-prefill on next generation"""`
			`with self.lock:`
			`self.past_key_values = None`
			`self.cached_input_ids = None`
			`self.context_length = 0`
			`self.is_prefilled = False`
			`self.last_prompt_hash = None`
			`self.last_prompt = None`

			`def should_invalidate(self, new_prompt: str, tokenizer=None) -> bool:`
			`"""Check if cache should be invalidated based on prompt changes with proper token-level validation"""`
			`import hashlib`

			`current_hash = hashlib.md5(new_prompt.encode()).hexdigest()`

			`# Always invalidate if no cache exists`
			`if self.past_key_values is None:`
			`return True`

			`# Always invalidate if we don't have a tokenizer for proper validation`
			`if tokenizer is None:`
			`return True`

			`# Always invalidate if no previous prompt exists`
			`if not hasattr(self, 'last_prompt') or self.last_prompt is None:`
			`return True`

			`# Check for exact token-level prefix match`
			`if self._has_valid_token_prefix(new_prompt, self.last_prompt, tokenizer):`
			`# Additional safety checks for turn boundaries`
			`if self._violates_turn_boundaries(new_prompt, self.last_prompt):`
			`print("[KV_CACHE] Invalidating cache: turn boundary violation detected")`
			`return True`
			`return False`

			`# Invalidate if no valid prefix match`
			`print("[KV_CACHE] Invalidating cache: no valid token prefix match")`
			`return True`

			`def _has_valid_token_prefix(self, new_prompt: str, old_prompt: str, tokenizer) -> bool:`
			`"""Check if new prompt has exact token-level prefix match with cached prompt"""`
			`try:`
			`# Tokenize both prompts`
			`old_tokens = tokenizer.encode(old_prompt, add_special_tokens=False)`
			`new_tokens = tokenizer.encode(new_prompt, add_special_tokens=False)`

			`# New prompt must be longer than or equal to old`
			`if len(new_tokens) < len(old_tokens):`
			`return False`

			`# Check exact token-by-token match for the prefix`
			`for i, (old_token, new_token) in enumerate(zip(old_tokens, new_tokens)):`
			`if old_token != new_token:`
			`print(f"[KV_CACHE] Token mismatch at position {i}: old={old_token}, new={new_token}")`
			`return False`

			`return True`
			`except Exception as e:`
			`print(f"[KV_CACHE] Error in token prefix validation: {e}")`
			`return False`

			`def _violates_turn_boundaries(self, new_prompt: str, old_prompt: str) -> bool:`
			`"""Check if the prompt violates turn boundaries (conversation integrity)"""`
			`# Look for end-of-turn markers that indicate conversation corruption`
			`eot_markers = ["<\|eot_id\|>", "<\|end_of_turn\|>", "</s>", "<\|endoftext\|>"]`

			`# If the old prompt ended with an EOT marker, we should start fresh`
			`for marker in eot_markers:`
			`if old_prompt.rstrip().endswith(marker):`
			`# Check if new prompt is a proper continuation (should start with User: or similar)`
			`continuation_part = new_prompt[len(old_prompt):].lstrip()`
			`if not (continuation_part.startswith("User:") or continuation_part.startswith("Human:") or continuation_part.startswith("\nUser:") or continuation_part.startswith("\nHuman:")):`
			`return True`

			`# Check for conversation format corruption (duplicate roles, malformed structure)`
			`if self._has_conversation_corruption(new_prompt):`
			`return True`

			`return False`

			`def _has_conversation_corruption(self, prompt: str) -> bool:`
			`"""Detect conversation format corruption that indicates cache should be invalidated"""`
			`# Look for signs of corrupted conversation format`
			`corruption_patterns = [`
			`"Assistant: Assistant:", # Duplicate assistant labels`
			`"User: User:", # Duplicate user labels`
			`"Assistant: User", # Role confusion`
			`"User: Assistant:", # Role confusion`
			`"of of of", # Repetitive token generation (sign of corruption)`
			`"151 of 151", # Specific corruption pattern we observed`
			`]`

			`for pattern in corruption_patterns:`
			`if pattern in prompt:`
			`print(f"[KV_CACHE] Detected conversation corruption: '{pattern}'")`
			`return True`

			`return False`

			`def update_cache(self, past_key_values: Tuple, input_ids: List[int], prompt: str) -> None:`
			`"""Update the KV cache state with turn boundary validation"""`
			`import hashlib`

			`with self.lock:`
			`# Validate that we're ending on a complete turn boundary`
			`if not self._is_complete_turn(prompt):`
			`print("[KV_CACHE] Warning: Caching incomplete turn - this may cause issues")`

			`self.past_key_values = past_key_values`
			`self.cached_input_ids = input_ids`
			`self.context_length = len(input_ids)`
			`self.is_prefilled = True`
			`self.last_prompt_hash = hashlib.md5(prompt.encode()).hexdigest()`
			`self.last_prompt = prompt # Store the actual prompt for comparison`

			`def _is_complete_turn(self, prompt: str) -> bool:`
			`"""Check if the prompt ends on a complete turn boundary"""`
			`# Look for proper turn endings`
			`prompt_trimmed = prompt.rstrip()`

			`# Should end with Assistant response, not mid-generation`
			`valid_endings = [`
			`"</s>",`
			`"<\|eot_id\|>",`
			`"<\|end_of_turn\|>",`
			`"<\|endoftext\|>",`
			`]`

			`# Or should end with a complete sentence/response`
			`if any(prompt_trimmed.endswith(ending) for ending in valid_endings):`
			`return True`

			`# For non-marked conversations, check if it looks like a complete response`
			`# (ends with punctuation and doesn't look cut off)`
			`if prompt_trimmed.endswith(('.', '!', '?', ':', ';')):`
			`return True`

			`# If it ends with "Assistant:" it's ready for generation`
			`if prompt_trimmed.endswith("Assistant:"):`
			`return True`

			`return False`

			`def get_cache_info(self) -> Dict[str, Any]:`
			`"""Get information about current cache state"""`
			`with self.lock:`
			`return {`
			`"session_id": self.session_id,`
			`"has_cache": self.past_key_values is not None,`
			`"context_length": self.context_length,`
			`"is_prefilled": self.is_prefilled,`
			`"message_count": len(self.messages),`
			`"max_context": self.config.max_context_length`
			`}`


			`class ChatSessionManager:`
			`"""Manages multiple chat sessions"""`

			`def __init__(self):`
			`self.sessions: Dict[str, ChatSession] = {}`
			`self.lock = threading.RLock()`
			`self.default_session_id = "default"`

			`def get_session(self, session_id: str = None, config: ChatSessionConfig = None) -> ChatSession:`
			`"""Get or create a chat session"""`
			`if session_id is None:`
			`session_id = self.default_session_id`

			`with self.lock:`
			`if session_id not in self.sessions:`
			`self.sessions[session_id] = ChatSession(session_id, config)`
			`return self.sessions[session_id]`

			`def clear_session(self, session_id: str = None) -> None:`
			`"""Clear a specific session"""`
			`if session_id is None:`
			`session_id = self.default_session_id`

			`with self.lock:`
			`if session_id in self.sessions:`
			`self.sessions[session_id].invalidate_cache()`
			`self.sessions[session_id].clear_messages()`

			`def remove_session(self, session_id: str) -> None:`
			`"""Remove a session completely"""`
			`with self.lock:`
			`if session_id in self.sessions:`
			`del self.sessions[session_id]`

			`def get_all_sessions(self) -> List[str]:`
			`"""Get list of all session IDs"""`
			`with self.lock:`
			`return list(self.sessions.keys())`


			`# Global session manager instance`
			`_session_manager = ChatSessionManager()`


			`def get_session_manager() -> ChatSessionManager:`
			`"""Get the global chat session manager"""`
			`return _session_manager`