Source code for main

#!/usr/bin/env python3
"""
Document-based Question Answering System

A local, modular RAG (retrieval-augmented generation) system that processes
PDF documents and enables natural language queries.

Usage:
    python main.py --mode ingest --documents ./data/
    python main.py --mode query --query "What is Consult+ prediction for Tesla stock?"
"""

import argparse
import logging
import sys
import copy
from pathlib import Path
from typing import Optional, Dict, Any

import yaml
from dotenv import load_dotenv

# Import utility functions
from src.utils import setup_logging, get_logger, log_system_info

# Load environment variables
load_dotenv()

# Get logger
logger = get_logger(__name__)


def load_config(config_path: str = "config.yaml") -> dict:
    """
    Load configuration from a YAML file.

    Args:
        config_path: Path to the configuration file

    Returns:
        Dictionary containing configuration parameters

    Note:
        Exits the program (sys.exit(1)) if the config file is missing or malformed.
    """
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
        logger.info(f"Configuration loaded from {config_path}")
        return config
    except FileNotFoundError:
        logger.error(f"Configuration file {config_path} not found")
        sys.exit(1)
    except yaml.YAMLError as e:
        logger.error(f"Error parsing configuration file: {e}")
        sys.exit(1)
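
# A config.yaml with roughly the following layout satisfies the keys this module
# reads (section and key names are taken from the code in this file; the values
# shown are illustrative placeholders, not the project's actual defaults):
#
#     system:
#       log_level: INFO            # passed to setup_logging()
#       log_file: null             # optional log file path
#     storage:
#       index_dir: ./index         # where chunks.json and embeddings are written
#     pdf:
#       engine: pymupdf            # pymupdf | pdfplumber | pdfminer
#       chunk_size: 1000
#       chunk_overlap: 200
#     embedding:
#       model_name: <embedding-model-name>
#       top_k: 5
#       similarity_threshold: 0.5
#     llm:
#       backend: transformers      # transformers | llama-cpp | openai
#       model_path: <model-path-or-name>
#       temperature: 0.7
#       max_tokens: 512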


def validate_ingest_args(args: argparse.Namespace) -> None:
    """
    Validate arguments for ingest mode.

    Args:
        args: Parsed command line arguments

    Raises:
        ValueError: If validation fails
    """
    if not args.documents:
        raise ValueError("--documents is required for ingest mode")

    documents_path = Path(args.documents)

    if not documents_path.exists():
        raise ValueError(f"Documents directory {documents_path} does not exist")

    if not documents_path.is_dir():
        raise ValueError(f"{documents_path} is not a directory")

    # Check whether the directory contains any PDF files
    pdf_files = list(documents_path.glob("*.pdf"))
    if not pdf_files:
        logger.warning(f"No PDF files found in {documents_path}")

    logger.info(f"Found {len(pdf_files)} PDF files in {documents_path}")


def validate_query_args(args: argparse.Namespace) -> None:
    """
    Validate arguments for query mode.

    Args:
        args: Parsed command line arguments

    Raises:
        ValueError: If validation fails
    """
    if not args.query or not args.query.strip():
        raise ValueError("Query cannot be empty")

    # Check that an index has already been built
    index_path = Path("index")
    if not index_path.exists():
        raise ValueError("Index directory does not exist. Please run ingest mode first.")

    logger.info(f"Query: {args.query}")


def merge_config_with_args(config: Dict[str, Any], args: argparse.Namespace) -> Dict[str, Any]:
    """
    Merge CLI arguments with the configuration; CLI args take precedence.

    Args:
        config: Configuration dictionary
        args: Parsed command line arguments

    Returns:
        Updated configuration dictionary
    """
    # Create a deep copy to avoid modifying the original
    merged_config = copy.deepcopy(config)

    # Merge PDF processing settings
    if args.chunk_size is not None:
        merged_config.setdefault("pdf", {})["chunk_size"] = args.chunk_size
    if args.chunk_overlap is not None:
        merged_config.setdefault("pdf", {})["chunk_overlap"] = args.chunk_overlap
    if args.pdf_engine is not None:
        merged_config.setdefault("pdf", {})["engine"] = args.pdf_engine

    # Merge embedding settings
    if args.embedding_model is not None:
        merged_config.setdefault("embedding", {})["model_name"] = args.embedding_model
    if args.top_k is not None:
        merged_config.setdefault("embedding", {})["top_k"] = args.top_k
    if args.similarity_threshold is not None:
        merged_config.setdefault("embedding", {})["similarity_threshold"] = args.similarity_threshold

    # Merge LLM settings
    if args.llm_backend is not None:
        merged_config.setdefault("llm", {})["backend"] = args.llm_backend
    if args.llm_model is not None:
        merged_config.setdefault("llm", {})["model_path"] = args.llm_model
    if args.temperature is not None:
        merged_config.setdefault("llm", {})["temperature"] = args.temperature
    if args.max_tokens is not None:
        merged_config.setdefault("llm", {})["max_tokens"] = args.max_tokens

    return merged_config
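
# Illustrative precedence check (hypothetical values): CLI flags override the
# file-based settings, while keys not supplied on the command line keep their
# configured values.
#
#     >>> base = {"pdf": {"chunk_size": 1000, "chunk_overlap": 200}}
#     >>> ns = argparse.Namespace(
#     ...     chunk_size=256, chunk_overlap=None, pdf_engine=None,
#     ...     embedding_model=None, top_k=None, similarity_threshold=None,
#     ...     llm_backend=None, llm_model=None, temperature=None, max_tokens=None)
#     >>> merge_config_with_args(base, ns)["pdf"]
#     {'chunk_size': 256, 'chunk_overlap': 200}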


def main() -> None:
    """
    Main entry point for the document-based question answering system.

    Handles command line argument parsing and routes to the appropriate
    functionality based on the selected mode.
    """
    parser = argparse.ArgumentParser(
        description="Document-based Question Answering System",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    python main.py --mode ingest --documents ./data/
    python main.py --mode query --query "What is Consult+ prediction for Tesla stock?"
        """
    )

    parser.add_argument(
        "--mode",
        choices=["ingest", "query"],
        required=True,
        help="Operation mode: ingest documents or query the system"
    )
    parser.add_argument(
        "--documents",
        type=str,
        help="Path to directory containing PDF documents (required for ingest mode)"
    )
    parser.add_argument(
        "--query",
        type=str,
        help="Natural language query (required for query mode)"
    )
    parser.add_argument(
        "--config",
        type=str,
        default="config.yaml",
        help="Path to configuration file (default: config.yaml)"
    )
    parser.add_argument(
        "--chunk-size",
        type=int,
        help="Size of text chunks for processing (overrides config)"
    )
    parser.add_argument(
        "--chunk-overlap",
        type=int,
        help="Overlap between text chunks (overrides config)"
    )
    parser.add_argument(
        "--top-k",
        type=int,
        help="Number of top similar chunks to retrieve (overrides config)"
    )
    parser.add_argument(
        "--similarity-threshold",
        type=float,
        help="Minimum similarity threshold for chunk selection (overrides config)"
    )
    parser.add_argument(
        "--embedding-model",
        type=str,
        help="Embedding model to use (overrides config)"
    )
    parser.add_argument(
        "--llm-backend",
        type=str,
        choices=["transformers", "llama-cpp", "openai"],
        help="LLM backend to use (overrides config)"
    )
    parser.add_argument(
        "--llm-model",
        type=str,
        help="LLM model path or name (overrides config)"
    )
    parser.add_argument(
        "--temperature",
        type=float,
        help="LLM temperature (overrides config)"
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        help="Maximum tokens for LLM generation (overrides config)"
    )
    parser.add_argument(
        "--pdf-engine",
        type=str,
        choices=["pymupdf", "pdfplumber", "pdfminer"],
        help="PDF processing engine (overrides config)"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args()

    # Set up logging based on configuration
    try:
        # Load configuration for logging setup
        config = load_config(args.config)
        system_config = config.get("system", {})

        # Set up logging
        setup_logging(
            log_level=system_config.get("log_level", "INFO"),
            log_file=system_config.get("log_file", None)
        )

        # Log system information
        log_system_info(logger)
    except Exception as e:
        # Fall back to basic logging if config loading fails
        setup_logging(log_level="INFO")
        logger.warning(f"Could not load configuration for logging setup: {e}")

    # Set logging level based on verbose flag
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logger.debug("Verbose logging enabled")

    try:
        # Load configuration
        config = load_config(args.config)

        # Merge CLI arguments with configuration
        config = merge_config_with_args(config, args)

        # Validate arguments based on mode
        if args.mode == "ingest":
            validate_ingest_args(args)
            logger.info("Starting document ingestion...")
            from src.ingest import ingest_documents
            ingest_documents(args.documents, config, args)

            # After ingestion, create embeddings
            logger.info("Creating embeddings from ingested chunks...")
            from src.embed import create_embeddings_from_chunks_file
            chunks_file = Path(config.get("storage", {}).get("index_dir", "./index")) / "chunks.json"
            output_path = Path(config.get("storage", {}).get("index_dir", "./index"))
            create_embeddings_from_chunks_file(chunks_file, config, output_path)

        elif args.mode == "query":
            validate_query_args(args)
            logger.info("Starting query processing...")
            from src.query import process_query, format_query_output
            from src.llm import generate_answer_from_query, format_llm_response

            # Process the query to retrieve relevant chunks
            result = process_query(args.query, config, args)

            # Generate an answer using the LLM
            logger.info("Generating answer using LLM...")
            answer = generate_answer_from_query(args.query, result, config)

            # Format output
            if args.verbose:
                # Show both the retrieved chunks and the LLM answer
                chunks_output = format_query_output(result, verbose=True)
                # format_llm_response expects a response-like object; build a
                # minimal stand-in since only the answer text is available here
                llm_output = format_llm_response(
                    type('obj', (object,), {
                        'answer': answer,
                        'prompt_tokens': 0,
                        'response_tokens': 0,
                        'generation_time_ms': 0.0,
                        'model_used': 'llm'
                    }),
                    verbose=True
                )
                print(f"{chunks_output}\n\n{llm_output}")
            else:
                # Show only the answer
                print(answer)

    except ValueError as e:
        logger.error(f"Validation error: {e}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()