Source code for src.llm

"""
LLM Interface

Handles local LLM loading, prompt formatting, and answer generation.
Supports multiple backends: transformers, llama-cpp, and OpenAI (optional).
"""

import logging
import time
from dataclasses import dataclass
from typing import Any

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from src.query import QueryResult

logger = logging.getLogger(__name__)


@dataclass
class LLMConfig:
    """Configuration for LLM settings."""

    backend: str
    model_path: str
    temperature: float
    max_tokens: int
    top_p: float
    repeat_penalty: float
    context_window: int


@dataclass
class LLMResponse:
    """Response from LLM with metadata."""

    answer: str
    prompt_tokens: int
    response_tokens: int
    generation_time_ms: float
    model_used: str
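

# Illustrative only: an LLMConfig built by hand. The values mirror the
# defaults applied in LLMInterface.__init__ below; any other values are
# equally valid as long as the chosen backend understands them.
#
#   LLMConfig(
#       backend="transformers",
#       model_path="microsoft/DialoGPT-medium",
#       temperature=0.2,
#       max_tokens=1024,
#       top_p=0.9,
#       repeat_penalty=1.1,
#       context_window=4096,
#   )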


class BaseLLM:
    """Base class for LLM implementations."""

    def __init__(self, config: LLMConfig):
        """
        Initialize LLM with configuration.

        Args:
            config: LLM configuration
        """
        self.config = config
        self.model = None
        self.tokenizer = None
        self._load_model()
        logger.info(f"Initialized {config.backend} LLM: {config.model_path}")

    def _load_model(self) -> None:
        """Load the LLM model. To be implemented by subclasses."""
        raise NotImplementedError

    def generate(self, prompt: str) -> LLMResponse:
        """
        Generate response from prompt.

        Args:
            prompt: Input prompt

        Returns:
            LLMResponse with answer and metadata
        """
        raise NotImplementedError


class TransformersLLM(BaseLLM):
    """LLM implementation using transformers library."""

    def _load_model(self) -> None:
        """Load transformers model and tokenizer."""
        try:
            logger.info(f"Loading transformers model: {self.config.model_path}")

            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.config.model_path, trust_remote_code=True
            )

            # Add padding token if not present
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            # Load model
            self.model = AutoModelForCausalLM.from_pretrained(
                self.config.model_path,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto" if torch.cuda.is_available() else None,
                trust_remote_code=True,
            )

            logger.info(f"Loaded transformers model on {self.model.device}")

        except Exception as e:
            logger.error(f"Failed to load transformers model: {e}")
            raise

    def generate(self, prompt: str) -> LLMResponse:
        """
        Generate response using transformers.

        Args:
            prompt: Input prompt

        Returns:
            LLMResponse with answer and metadata
        """
        start_time = time.time()

        try:
            # Tokenize input
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=self.config.context_window,
            )

            # Move to same device as model
            if hasattr(self.model, "device"):
                inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            logger.debug(f"Input tokens: {inputs['input_ids'].shape[1]}")

            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=self.config.max_tokens,
                    temperature=self.config.temperature,
                    top_p=self.config.top_p,
                    repetition_penalty=self.config.repeat_penalty,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    early_stopping=True,
                )

            logger.debug(f"Output tokens: {outputs.shape[1]}")

            # Decode response - get only the new tokens
            input_length = inputs["input_ids"].shape[1]
            response_tokens = outputs[0][input_length:]

            logger.debug(f"Response tokens shape: {response_tokens.shape}")
            logger.debug(f"Response tokens: {response_tokens}")

            # Decode the response tokens
            answer = self.tokenizer.decode(response_tokens, skip_special_tokens=True)

            logger.debug(f"Decoded answer: '{answer}'")

            # Calculate metadata
            generation_time_ms = (time.time() - start_time) * 1000
            prompt_tokens = input_length
            response_tokens_count = len(response_tokens)

            return LLMResponse(
                answer=answer.strip(),
                prompt_tokens=prompt_tokens,
                response_tokens=response_tokens_count,
                generation_time_ms=generation_time_ms,
                model_used=f"transformers:{self.config.model_path}",
            )

        except Exception as e:
            logger.error(f"Error during generation: {e}")
            raise


class LlamaCppLLM(BaseLLM):
    """LLM implementation using llama-cpp-python."""

    def _load_model(self) -> None:
        """Load llama-cpp model."""
        try:
            logger.info(f"Loading llama-cpp model: {self.config.model_path}")

            # Import llama-cpp-python
            from llama_cpp import Llama

            self.model = Llama(
                model_path=self.config.model_path,
                n_ctx=self.config.context_window,
                n_threads=4,  # Configurable
                n_gpu_layers=0,  # CPU only for now
            )

            logger.info("Loaded llama-cpp model")

        except ImportError:
            logger.error(
                "llama-cpp-python not installed. Install with: pip install llama-cpp-python"
            )
            raise
        except Exception as e:
            logger.error(f"Failed to load llama-cpp model: {e}")
            raise

    def generate(self, prompt: str) -> LLMResponse:
        """
        Generate response using llama-cpp.

        Args:
            prompt: Input prompt

        Returns:
            LLMResponse with answer and metadata
        """
        start_time = time.time()

        try:
            # Generate response
            response = self.model(
                prompt,
                max_tokens=self.config.max_tokens,
                temperature=self.config.temperature,
                top_p=self.config.top_p,
                repeat_penalty=self.config.repeat_penalty,
                stop=["</s>", "\n\n\n"],  # Common stop tokens
            )

            answer = response["choices"][0]["text"].strip()

            # Calculate metadata
            generation_time_ms = (time.time() - start_time) * 1000
            prompt_tokens = response["usage"]["prompt_tokens"]
            response_tokens = response["usage"]["completion_tokens"]

            return LLMResponse(
                answer=answer,
                prompt_tokens=prompt_tokens,
                response_tokens=response_tokens,
                generation_time_ms=generation_time_ms,
                model_used=f"llama-cpp:{self.config.model_path}",
            )

        except Exception as e:
            logger.error(f"Error during generation: {e}")
            raise


class OpenAILLM(BaseLLM):
    """LLM implementation using OpenAI API (optional)."""

    def _load_model(self) -> None:
        """Initialize OpenAI client."""
        try:
            import openai

            self.client = openai.OpenAI(
                api_key=self.config.model_path  # model_path contains API key
            )
            logger.info("Initialized OpenAI client")

        except ImportError:
            logger.error("openai not installed. Install with: pip install openai")
            raise
        except Exception as e:
            logger.error(f"Failed to initialize OpenAI client: {e}")
            raise

    def generate(self, prompt: str) -> LLMResponse:
        """
        Generate response using OpenAI API.

        Args:
            prompt: Input prompt

        Returns:
            LLMResponse with answer and metadata
        """
        start_time = time.time()

        try:
            # Generate response
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=self.config.max_tokens,
                temperature=self.config.temperature,
                top_p=self.config.top_p,
            )

            answer = response.choices[0].message.content.strip()

            # Calculate metadata
            generation_time_ms = (time.time() - start_time) * 1000
            prompt_tokens = response.usage.prompt_tokens
            response_tokens = response.usage.completion_tokens

            return LLMResponse(
                answer=answer,
                prompt_tokens=prompt_tokens,
                response_tokens=response_tokens,
                generation_time_ms=generation_time_ms,
                model_used="openai:gpt-3.5-turbo",
            )

        except Exception as e:
            logger.error(f"Error during OpenAI generation: {e}")
            raise


class LLMInterface:
    """Main interface for LLM operations."""

    def __init__(self, config: dict[str, Any]):
        """
        Initialize LLM interface.

        Args:
            config: Configuration dictionary
        """
        self.config = config

        # Create LLM config
        llm_config = LLMConfig(
            backend=config.get("llm", {}).get("backend", "transformers"),
            model_path=config.get("llm", {}).get(
                "model_path", "microsoft/DialoGPT-medium"
            ),
            temperature=config.get("llm", {}).get("temperature", 0.2),
            max_tokens=config.get("llm", {}).get("max_tokens", 1024),
            top_p=config.get("llm", {}).get("top_p", 0.9),
            repeat_penalty=config.get("llm", {}).get("repeat_penalty", 1.1),
            context_window=config.get("llm", {}).get("context_window", 4096),
        )

        # Initialize appropriate LLM
        if llm_config.backend == "transformers":
            self.llm = TransformersLLM(llm_config)
        elif llm_config.backend == "llama-cpp":
            self.llm = LlamaCppLLM(llm_config)
        elif llm_config.backend == "openai":
            self.llm = OpenAILLM(llm_config)
        else:
            raise ValueError(f"Unsupported LLM backend: {llm_config.backend}")

        # Load prompt templates
        self.prompts = config.get("prompts", {})

        logger.info("Initialized LLMInterface")
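
    # Expected shape of the configuration dictionary — a sketch inferred from
    # the lookups above; only the "llm" and "prompts" keys are read here, and
    # every field falls back to the default shown in __init__:
    #
    #   {
    #       "llm": {
    #           "backend": "transformers" | "llama-cpp" | "openai",
    #           "model_path": "...",          # HF repo id, GGUF path, or API key
    #           "temperature": 0.2,
    #           "max_tokens": 1024,
    #           "top_p": 0.9,
    #           "repeat_penalty": 1.1,
    #           "context_window": 4096,
    #       },
    #       "prompts": {
    #           "query_template": "...",      # optional override
    #           "no_answer_template": "...",  # optional override
    #       },
    #   }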

    def format_prompt(self, query: str, context: str) -> str:
        """
        Format prompt with query and context.

        Args:
            query: User query
            context: Retrieved document context

        Returns:
            Formatted prompt
        """
        # Mistral instruction format
        template = self.prompts.get(
            "query_template",
            "<s>[INST] Based on the following context, answer the question. "
            "If the context doesn't contain enough information to answer the question, "
            "respond with 'I don't have enough information to answer this question.'"
            "\n\nContext:\n{context}\n\nQuestion: {question} [/INST]",
        )

        return template.format(context=context, question=query)
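
    # Rendered example of the default template (the context and question are
    # made up for illustration):
    #
    #   <s>[INST] Based on the following context, answer the question. ...
    #
    #   Context:
    #   [Document: manual.pdf, Page: 3]
    #   The warranty period is 24 months.
    #
    #   Question: How long is the warranty? [/INST]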

    def generate_answer(self, query: str, query_result: QueryResult) -> LLMResponse:
        """
        Generate answer from query and retrieved chunks.

        Args:
            query: User query
            query_result: QueryResult with retrieved chunks

        Returns:
            LLMResponse with generated answer
        """
        if not query_result.chunks:
            # No relevant chunks found
            no_answer_template = self.prompts.get(
                "no_answer_template",
                "I don't have enough information to answer this question based on the available documents.",
            )
            return LLMResponse(
                answer=no_answer_template,
                prompt_tokens=0,
                response_tokens=0,
                generation_time_ms=0.0,
                model_used=self.llm.config.backend,
            )

        # Format context from chunks
        context_parts = []
        for chunk in query_result.chunks:
            context_parts.append(
                f"[Document: {chunk.metadata.file_name}, Page: {chunk.metadata.page_number}]\n{chunk.text}"
            )

        context = "\n\n".join(context_parts)

        # Format prompt
        prompt = self.format_prompt(query, context)

        # Generate answer
        response = self.llm.generate(prompt)

        logger.info(f"Generated answer in {response.generation_time_ms:.2f}ms")
        logger.info(
            f"Used {response.prompt_tokens} prompt tokens, {response.response_tokens} response tokens"
        )

        return response

    def get_model_info(self) -> dict[str, Any]:
        """
        Get information about the loaded model.

        Returns:
            Dictionary with model information
        """
        return {
            "backend": self.llm.config.backend,
            "model_path": self.llm.config.model_path,
            "temperature": self.llm.config.temperature,
            "max_tokens": self.llm.config.max_tokens,
            "context_window": self.llm.config.context_window,
        }


def create_llm_interface(config: dict[str, Any]) -> LLMInterface:
    """
    Create LLM interface from configuration.

    Args:
        config: Configuration dictionary

    Returns:
        LLMInterface instance
    """
    return LLMInterface(config)


def generate_answer_from_query(
    query: str, query_result: QueryResult, config: dict[str, Any]
) -> str:
    """
    Generate answer from query and query result.

    Args:
        query: User query
        query_result: QueryResult with retrieved chunks
        config: Configuration dictionary

    Returns:
        Generated answer string
    """
    # Create LLM interface
    llm_interface = create_llm_interface(config)

    # Generate answer
    response = llm_interface.generate_answer(query, query_result)

    return response.answer


def format_llm_response(response: LLMResponse, verbose: bool = False) -> str:
    """
    Format LLM response for output.

    Args:
        response: LLMResponse to format
        verbose: Whether to include metadata

    Returns:
        Formatted output string
    """
    if verbose:
        lines = []
        lines.append("=== LLM Response ===")
        lines.append(f"Answer: {response.answer}")
        lines.append(f"Model: {response.model_used}")
        lines.append(f"Generation time: {response.generation_time_ms:.2f}ms")
        lines.append(f"Prompt tokens: {response.prompt_tokens}")
        lines.append(f"Response tokens: {response.response_tokens}")
        return "\n".join(lines)
    else:
        return response.answer
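

# A minimal usage sketch, assuming a QueryResult has already been produced by
# the retrieval step in src.query. The config values are illustrative rather
# than required, and this function is never called by the module itself.
def _example_usage(query_result: QueryResult) -> None:
    """Wire the pieces together end to end (illustrative sketch only)."""
    config = {
        "llm": {
            "backend": "transformers",
            "model_path": "microsoft/DialoGPT-medium",
            "max_tokens": 256,
        }
    }

    # Creating the interface loads the model, so build it once and reuse it.
    llm_interface = create_llm_interface(config)

    # Generate an answer and print it with token/timing metadata.
    response = llm_interface.generate_answer(
        "What does the manual say about setup?", query_result
    )
    print(format_llm_response(response, verbose=True))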