commit c7d85e7e5d77c458afa5297597bf0762099f6356 Author: nsa Date: Tue Dec 17 10:47:33 2024 +0100 Genesis commit diff --git a/README.md b/README.md new file mode 100644 index 0000000..4335380 --- /dev/null +++ b/README.md @@ -0,0 +1,132 @@ + +# **XBO Product Query System** + +This repository contains a **PDF-based product query assistant** built using **LangChain**, **OpenAI** models, and **FAISS** for vector-based search. It processes PDF documents, creates a searchable FAISS index, and allows querying product-related information interactively. + +--- + +## **Features** + +- Processes PDF documents and extracts text. +- Creates and saves a FAISS index for fast, vector-based querying. +- Utilizes OpenAI's GPT models to answer natural language questions about products. +- Provides an interactive CLI-based query interface. +- Allows toggling FAISS index creation for optimized usage. + +--- + +## **Requirements** + +- Python 3.8+ +- An OpenAI API key +- A folder containing PDF documents + +--- + +## **Installation** + +1. **Clone the Repository**: + ```bash + git clone https://gitea.digital-bridge.net/nasim/CodeChallenge.git + cd CodeChallenge + ``` + +2. **Set Up a Virtual Environment**: + ```bash + python -m venv venv + source venv/bin/activate + ``` + +3. **Install Dependencies**: + Install all required Python libraries using the `requirements.txt` file: + ```bash + pip install -r requirements.txt + ``` + +--- + +## **Configuration** + +The application configuration is defined in **`app/config.py`**. 
+ +### **Setting Up the OpenAI API Key** +Replace the placeholder API key or use environment variables for production: +- Open **`app/config.py`** and add your OpenAI API key: + ```python + OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "your-openai-api-key") + ``` + +- Alternatively, create a `.env` file at the root of your project and add: + ```dotenv + OPENAI_API_KEY=your-openai-api-key + PDF_FOLDER=./app/pdfs + ``` + +### **CREATE_FAISS_INDEX Flag** +- This flag determines whether to create a new FAISS index from PDFs or load an existing one. + +- In **`app/config.py`**, set: + ```python + CREATE_FAISS_INDEX = True # To create a new index + CREATE_FAISS_INDEX = False # To load an existing index + ``` + On the first run, set this to `True` so that the index is built. +--- + +## **Usage** + +1. Place all PDF files in the folder specified in `PDF_FOLDER` (default: `./app/pdfs`). + +2. Run the application: + ```bash + python -m app.main + ``` + +3. Interact with the assistant using natural-language questions: + - Example: *"Welche Leuchtmittel haben eine Lebensdauer von mehr als 3000 Stunden?"* + - To exit, type `exit`. + +--- + +## **Folder Structure** + +``` +xbo-product-query/ +│ +├── app/ +│ ├── main.py # Application entry point +│ ├── config.py # Configuration settings +│ ├── services/ +│ │ ├── file_service.py # Handles PDF file processing +│ │ ├── faiss_service.py # Handles FAISS index creation/loading +│ │ ├── dependencies.py # Dependency injection for services +│ │ +│ ├── pdfs/ # Directory to store PDF files +│ +├── requirements.txt # Python dependencies +└── README.md # Documentation +``` + +## **Scalability** + +When scaling up, the following optimizations can be applied: + +1. **Update FAISS** + + - Allow appending new documents to the existing FAISS index without rebuilding it entirely. + +2. **Memory and Disk Management** + - Use a persistent FAISS index stored on disk, which can be loaded as needed. 
+   - Use FAISS's on-disk storage options (memory-mapped index files, on-disk inverted lists) so that only the necessary portions of the index are held in memory, reducing RAM consumption.
+
+3. **Batch Processing of Queries**
+   - Break large queries into smaller batches to improve response times.
+   - Distribute query processing across multiple threads or workers to enable parallel computation.
+
+4. **Efficient Chunking Strategy**
+   - Divide large PDF texts into smaller, manageable chunks.
+   - Implement a consistent chunking algorithm to ensure relevant information is efficiently embedded and retrieved.
+
+5. **Map-Reduce Strategy**
+   - Use LangChain's Map-Reduce approach for processing large datasets. The Map step processes individual document chunks, and the Reduce step combines intermediate results into a final response.
+
diff --git a/app/__init__.py b/app/__init__.py
new file mode 100644
index 0000000..7c5e420
--- /dev/null
+++ b/app/__init__.py
@@ -0,0 +1,25 @@
+# app/__init__.py
+"""Application package exposing the FastAPI application factory."""
+
+from fastapi import FastAPI
+
+from app.config import get_config
+
+
+def create_app(env_name: str) -> FastAPI:
+    """
+    Build the FastAPI application for the given environment.
+
+    :param env_name: Name of the environment (e.g., 'development', 'production').
+    :return: FastAPI instance with the loaded configuration attached.
+    :raises ValueError: If the environment name is unknown.
+    """
+    config = get_config(env_name)
+    app = FastAPI(title="Code Challenge")
+
+    # app.state is FastAPI's documented home for application-wide objects;
+    # app.config is still set so existing callers keep working.
+    app.state.config = config
+    app.config = config
+
+    return app
diff --git a/app/config.py b/app/config.py
new file mode 100644
index 0000000..4bb0f9f
--- /dev/null
+++ b/app/config.py
@@ -0,0 +1,75 @@
+import os
+
+try:
+    from dotenv import load_dotenv
+except ImportError:  # python-dotenv missing: fall back to the process environment
+    load_dotenv = None
+
+if load_dotenv is not None:
+    # Without this call the .env file described in the README is never read.
+    load_dotenv()
+
+# Placeholder shipped with the repository; it is never a usable key.
+PLACEHOLDER_API_KEY = "your-api-key-here"
+
+
+class BaseConfig:
+    """
+    Base configuration class that holds default settings for the application.
+    Environment-specific configurations will inherit from this class.
+    """
+    PDF_FOLDER = os.getenv("PDF_FOLDER", "./app/pdfs")
+    ENV = "base"
+    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", PLACEHOLDER_API_KEY)
+    CREATE_FAISS_INDEX = True
+
+    def __init__(self):
+        """
+        Validates required configurations and ensures all necessary environment variables are set.
+
+        :raises ValueError: If OPENAI_API_KEY is empty or still the placeholder.
+        """
+        # The placeholder default is truthy, so a plain emptiness check never
+        # fires; reject the placeholder explicitly to fail before any API call.
+        if not self.OPENAI_API_KEY or self.OPENAI_API_KEY == PLACEHOLDER_API_KEY:
+            raise ValueError("OPENAI_API_KEY environment variable must be set.")
+
+
+class DevelopmentConfig(BaseConfig):
+    """
+    Configuration class for the development environment.
+    Inherits defaults from BaseConfig.
+    """
+    ENV = "development"
+    DEBUG = True
+
+
+class ProductionConfig(BaseConfig):
+    """
+    Configuration class for the production environment.
+    Inherits defaults from BaseConfig but overrides production-specific settings.
+    """
+    ENV = "production"
+    DEBUG = False
+
+
+def get_config(env_name: str = "development"):
+    """
+    Retrieves the appropriate configuration instance based on the environment name.
+
+    :param env_name: Name of the environment (e.g., 'development', 'production').
+    :return: An instance of the selected configuration class.
+    :raises ValueError: If the environment is unknown or the configuration is invalid.
+    """
+    configs = {
+        "development": DevelopmentConfig,
+        "production": ProductionConfig,
+    }
+    config_class = configs.get(env_name.lower())
+
+    if not config_class:
+        raise ValueError(f"Unknown environment '{env_name}'. Valid options are 'development' or 'production'.")
+
+    config_instance = config_class()
+    print(f"[INFO] Loaded configuration for environment: {config_instance.ENV}")
+    return config_instance
diff --git a/app/main.py b/app/main.py
new file mode 100644
index 0000000..3699869
--- /dev/null
+++ b/app/main.py
@@ -0,0 +1,122 @@
+import asyncio
+import sys
+from app.services.dependencies import get_file_service, get_faiss_service
+from app.config import get_config
+from langchain_openai import ChatOpenAI
+from langchain.schema import Document
+from langchain.prompts import PromptTemplate
+
+# Number of documents handed to the LLM for each question.
+RETRIEVER_K = 21
+
+# Built once at import time: the template is the same for every question.
+ANSWER_PROMPT = PromptTemplate(
+    template="""
+    Du bist ein Assistent, der Fragen zu Produktinformationen beantwortet.
+
+    Kontext:
+    {context}
+
+    Frage:
+    {question}
+
+    Antwort:
+    """,
+    input_variables=["context", "question"],
+)
+
+
+async def main():
+    """
+    Entry point for the XBO product assistant.
+
+    Creates or loads the FAISS index (depending on CREATE_FAISS_INDEX) and then
+    answers questions read from stdin until the user types 'exit' or the input
+    stream is closed.
+    """
+    try:
+        config = get_config(env_name="development")
+        file_service = get_file_service(config=config)
+    except ValueError as e:
+        # Invalid configuration: report it the same way as the other start-up errors.
+        print(f"[ERROR] {e}")
+        sys.exit(1)
+
+    openai_api_key = config.OPENAI_API_KEY
+    create_faiss_index = config.CREATE_FAISS_INDEX
+    print("Create FAISS Index: ", create_faiss_index)
+    print("Willkommen zum XBO Kaufberater!")
+    # The first parameter of get_faiss_service is the file service, not the API key.
+    faiss_service = get_faiss_service(file_service)
+
+    try:
+        if create_faiss_index:
+            print("[INFO] Creating a new FAISS index...")
+            pdfs = file_service.load_pdfs()
+            if not pdfs:
+                print("[ERROR] No PDFs found.")
+                sys.exit(1)
+
+            all_documents = []
+            for pdf in pdfs:
+                print(f"Processing PDF: {pdf}")
+                text = file_service.extract_text_from_pdf(pdf)
+                if not text.strip():
+                    # Nothing extractable (e.g. a scanned PDF); an empty document only adds noise.
+                    print(f"[WARN] No text extracted from {pdf}; skipping.")
+                    continue
+                all_documents.append(Document(page_content=text, metadata={"source": pdf}))
+
+            if not all_documents:
+                print("[ERROR] None of the PDFs contained extractable text.")
+                sys.exit(1)
+
+            vectorstore = faiss_service.create_faiss_index(all_documents)
+        else:
+            vectorstore = faiss_service.load_faiss_index()
+    except Exception as e:
+        print(f"[ERROR] {e}")
+        sys.exit(1)
+
+    llm = ChatOpenAI(model="gpt-4o", openai_api_key=openai_api_key)
+    # MMR selects its k results from the fetch_k nearest candidates (default 20),
+    # so fetch_k has to be at least k or fewer than k documents are returned.
+    retriever = vectorstore.as_retriever(
+        search_type="mmr",
+        search_kwargs={"k": RETRIEVER_K, "fetch_k": RETRIEVER_K},
+    )
+
+    while True:
+        try:
+            user_input = input("\nWas möchten Sie wissen? (type 'exit' to quit): ").strip()
+        except (EOFError, KeyboardInterrupt):
+            # End of input or an interrupt: leave the loop instead of ending with a traceback.
+            print("\nAuf Wiedersehen!")
+            break
+
+        if user_input.lower() == "exit":
+            print("Auf Wiedersehen!")
+            break
+        if not user_input:
+            # Nothing to search for; prompt again.
+            continue
+
+        try:
+            print("[INFO] Retrieving relevant documents...")
+            docs = retriever.invoke(user_input)
+
+            if not docs:
+                print("\n[ANSWER]: Keine passenden Informationen gefunden.")
+                continue
+
+            context = "\n\n".join(doc.page_content for doc in docs)
+            response = llm.invoke(ANSWER_PROMPT.format(context=context, question=user_input))
+
+            print("\n[ANSWER]:")
+            print(response.content)
+        except Exception as e:
+            print(f"[ERROR] Failed to process query: {e}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/app/pdfs/ZMP_1007177_XBO_1600_W_HSC_XL_OFR.pdf b/app/pdfs/ZMP_1007177_XBO_1600_W_HSC_XL_OFR.pdf
new file mode 100644
index 0000000..f01341c
Binary files /dev/null and b/app/pdfs/ZMP_1007177_XBO_1600_W_HSC_XL_OFR.pdf differ
diff --git a/app/pdfs/ZMP_1007179_XBO_1600_W_XL_OFR.pdf b/app/pdfs/ZMP_1007179_XBO_1600_W_XL_OFR.pdf
new file mode 100644
index 0000000..2e7f92c
Binary files /dev/null and b/app/pdfs/ZMP_1007179_XBO_1600_W_XL_OFR.pdf differ
diff --git a/app/pdfs/ZMP_1007184_XBO_2000_W_H_XL_OFR.pdf b/app/pdfs/ZMP_1007184_XBO_2000_W_H_XL_OFR.pdf
new file mode 100644
index 0000000..71140ec
Binary files /dev/null and b/app/pdfs/ZMP_1007184_XBO_2000_W_H_XL_OFR.pdf differ
diff --git a/app/pdfs/ZMP_1007187_XBO_2000_W_HTP_XL_OFR.pdf b/app/pdfs/ZMP_1007187_XBO_2000_W_HTP_XL_OFR.pdf
new file mode 100644
index 0000000..081b93b
Binary files /dev/null and b/app/pdfs/ZMP_1007187_XBO_2000_W_HTP_XL_OFR.pdf differ
diff --git a/app/pdfs/ZMP_1007189_XBO_2500_W_HS_XL_OFR.pdf b/app/pdfs/ZMP_1007189_XBO_2500_W_HS_XL_OFR.pdf
new file mode 100644
index 0000000..d375494
Binary files /dev/null and b/app/pdfs/ZMP_1007189_XBO_2500_W_HS_XL_OFR.pdf differ
diff --git a/app/pdfs/ZMP_1007191_XBO_3000_W_H_XL_OFR.pdf b/app/pdfs/ZMP_1007191_XBO_3000_W_H_XL_OFR.pdf
new file mode 100644
index 0000000..bb43b31
Binary files /dev/null and b/app/pdfs/ZMP_1007191_XBO_3000_W_H_XL_OFR.pdf differ
diff --git a/app/pdfs/ZMP_1007193_XBO_3000_W_HS_XL_OFR.pdf b/app/pdfs/ZMP_1007193_XBO_3000_W_HS_XL_OFR.pdf
new file mode 100644
index 0000000..85c53f6
Binary files /dev/null and b/app/pdfs/ZMP_1007193_XBO_3000_W_HS_XL_OFR.pdf differ
diff
--git a/app/pdfs/ZMP_1007195_XBO_3000_W_HTP_XL_OFR.pdf b/app/pdfs/ZMP_1007195_XBO_3000_W_HTP_XL_OFR.pdf new file mode 100644 index 0000000..2221a18 Binary files /dev/null and b/app/pdfs/ZMP_1007195_XBO_3000_W_HTP_XL_OFR.pdf differ diff --git a/app/pdfs/ZMP_1007197_XBO_4000_W_HTP_XL_OFR.pdf b/app/pdfs/ZMP_1007197_XBO_4000_W_HTP_XL_OFR.pdf new file mode 100644 index 0000000..547cea5 Binary files /dev/null and b/app/pdfs/ZMP_1007197_XBO_4000_W_HTP_XL_OFR.pdf differ diff --git a/app/pdfs/ZMP_1007199_XBO_4000_W_HS_XL_OFR.pdf b/app/pdfs/ZMP_1007199_XBO_4000_W_HS_XL_OFR.pdf new file mode 100644 index 0000000..e402df7 Binary files /dev/null and b/app/pdfs/ZMP_1007199_XBO_4000_W_HS_XL_OFR.pdf differ diff --git a/app/pdfs/ZMP_1007201_XBO_4500_W_HS_XL_OFR.pdf b/app/pdfs/ZMP_1007201_XBO_4500_W_HS_XL_OFR.pdf new file mode 100644 index 0000000..71b9bb6 Binary files /dev/null and b/app/pdfs/ZMP_1007201_XBO_4500_W_HS_XL_OFR.pdf differ diff --git a/app/pdfs/ZMP_1007203_XBO_5000_W_H_XL_OFR.pdf b/app/pdfs/ZMP_1007203_XBO_5000_W_H_XL_OFR.pdf new file mode 100644 index 0000000..46fb6ec Binary files /dev/null and b/app/pdfs/ZMP_1007203_XBO_5000_W_H_XL_OFR.pdf differ diff --git a/app/pdfs/ZMP_1007207_XBO_6000_W_HS_XL_OFR.pdf b/app/pdfs/ZMP_1007207_XBO_6000_W_HS_XL_OFR.pdf new file mode 100644 index 0000000..94d81b3 Binary files /dev/null and b/app/pdfs/ZMP_1007207_XBO_6000_W_HS_XL_OFR.pdf differ diff --git a/app/pdfs/ZMP_1007209_XBO_7000_W_HS_XL_OFR.pdf b/app/pdfs/ZMP_1007209_XBO_7000_W_HS_XL_OFR.pdf new file mode 100644 index 0000000..28d5609 Binary files /dev/null and b/app/pdfs/ZMP_1007209_XBO_7000_W_HS_XL_OFR.pdf differ diff --git a/app/pdfs/ZMP_1200637_XBO_2000_W_HS_OFR.pdf b/app/pdfs/ZMP_1200637_XBO_2000_W_HS_OFR.pdf new file mode 100644 index 0000000..5ebf4ed Binary files /dev/null and b/app/pdfs/ZMP_1200637_XBO_2000_W_HS_OFR.pdf differ diff --git a/app/pdfs/ZMP_55851_XBO_10000_W_HS_OFR.pdf b/app/pdfs/ZMP_55851_XBO_10000_W_HS_OFR.pdf new file mode 100644 index 0000000..a107672 
Binary files /dev/null and b/app/pdfs/ZMP_55851_XBO_10000_W_HS_OFR.pdf differ
diff --git a/app/pdfs/ZMP_55852_XBO_1000_W_HS_OFR.pdf b/app/pdfs/ZMP_55852_XBO_1000_W_HS_OFR.pdf
new file mode 100644
index 0000000..8bb7fb6
Binary files /dev/null and b/app/pdfs/ZMP_55852_XBO_1000_W_HS_OFR.pdf differ
diff --git a/app/pdfs/ZMP_55853_XBO_1000_W_HSC_OFR.pdf b/app/pdfs/ZMP_55853_XBO_1000_W_HSC_OFR.pdf
new file mode 100644
index 0000000..9e42638
Binary files /dev/null and b/app/pdfs/ZMP_55853_XBO_1000_W_HSC_OFR.pdf differ
diff --git a/app/pdfs/ZMP_55854_XBO_1000_W_HTP_OFR.pdf b/app/pdfs/ZMP_55854_XBO_1000_W_HTP_OFR.pdf
new file mode 100644
index 0000000..5000f92
Binary files /dev/null and b/app/pdfs/ZMP_55854_XBO_1000_W_HTP_OFR.pdf differ
diff --git a/app/pdfs/ZMP_55864_XBO_2000_W_SHSC_OFR.pdf b/app/pdfs/ZMP_55864_XBO_2000_W_SHSC_OFR.pdf
new file mode 100644
index 0000000..c01b3ae
Binary files /dev/null and b/app/pdfs/ZMP_55864_XBO_2000_W_SHSC_OFR.pdf differ
diff --git a/app/pdfs/ZMP_55877_XBO_4000_W_HSA_OFR.pdf b/app/pdfs/ZMP_55877_XBO_4000_W_HSA_OFR.pdf
new file mode 100644
index 0000000..81aa2d7
Binary files /dev/null and b/app/pdfs/ZMP_55877_XBO_4000_W_HSA_OFR.pdf differ
diff --git a/app/services/dependencies.py b/app/services/dependencies.py
new file mode 100644
index 0000000..2a9aac0
--- /dev/null
+++ b/app/services/dependencies.py
@@ -0,0 +1,38 @@
+from fastapi import Depends
+from app.config import get_config
+from app.services.file_service import FileService
+from app.services.faiss_service import FAISSService
+
+
+def get_file_service(config=Depends(get_config)) -> FileService:
+    """
+    Dependency function to provide a FileService instance.
+
+    :param config: Configuration object obtained via dependency injection.
+    :return: An instance of FileService initialized with the PDF folder path.
+    :raises ValueError: If the configuration has no (or an empty) PDF_FOLDER.
+    """
+    # getattr with a default covers both a missing and an empty setting.
+    pdf_folder = getattr(config, "PDF_FOLDER", None)
+    if not pdf_folder:
+        raise ValueError("PDF_FOLDER is not configured in the application settings.")
+    return FileService(folder_path=pdf_folder)
+
+
+def get_faiss_service(file_service=Depends(get_file_service)) -> FAISSService:
+    """
+    Dependency function to provide a FAISSService instance.
+
+    The API key is read from the default configuration (``get_config()``).
+
+    :param file_service: FileService instance; accepted for dependency wiring
+        but not used when building the FAISSService.
+    :return: An instance of FAISSService configured with the OpenAI API key
+        and the "local_faiss_index" index path.
+    """
+    config = get_config()
+
+    return FAISSService(
+        openai_api_key=config.OPENAI_API_KEY,
+        index_path="local_faiss_index",
+    )
diff --git a/app/services/faiss_service.py b/app/services/faiss_service.py
new file mode 100644
index 0000000..ad68e9e
--- /dev/null
+++ b/app/services/faiss_service.py
@@ -0,0 +1,73 @@
+import os
+
+from langchain_community.vectorstores import FAISS
+from langchain_openai import OpenAIEmbeddings
+from langchain.schema import Document
+
+# One model name for both building and loading: query vectors must come from
+# the same embedding model as the vectors stored in the index.
+EMBEDDING_MODEL = "text-embedding-ada-002"
+
+
+class FAISSService:
+    """
+    A service for creating and loading FAISS indexes for document embeddings.
+    """
+
+    def __init__(self, openai_api_key, index_path="local_faiss_index"):
+        """
+        Initialize the FAISS service.
+
+        :param openai_api_key: OpenAI API key for embeddings.
+        :param index_path: Path to save or load the FAISS index.
+        """
+        self.openai_api_key = openai_api_key
+        self.index_path = index_path
+
+    def _embeddings(self):
+        """Return the embedding client shared by index creation and loading."""
+        return OpenAIEmbeddings(
+            model=EMBEDDING_MODEL,
+            openai_api_key=self.openai_api_key,
+        )
+
+    def create_faiss_index(self, documents):
+        """
+        Create a FAISS index from a list of documents.
+
+        :param documents: List of langchain Document objects.
+        :return: FAISS vectorstore instance.
+        :raises ValueError: If no documents are given.
+        """
+        if not documents:
+            raise ValueError("Cannot create a FAISS index without documents.")
+
+        print("[INFO] Creating FAISS index...")
+        vectorstore = FAISS.from_documents(documents, self._embeddings())
+        vectorstore.save_local(self.index_path)
+        print(f"[INFO] FAISS index saved to {self.index_path}.")
+        return vectorstore
+
+    def load_faiss_index(self):
+        """
+        Load an existing FAISS index.
+
+        :return: Loaded FAISS vectorstore instance.
+        :raises FileNotFoundError: If no index has been saved at the index path.
+        """
+        if not os.path.isdir(self.index_path):
+            raise FileNotFoundError(
+                f"No FAISS index found at {self.index_path}. "
+                "Run once with CREATE_FAISS_INDEX = True to build it."
+            )
+
+        print("[INFO] Loading FAISS index...")
+        # SECURITY: the stored docstore is a pickle, so loading executes whatever
+        # the file contains. Only load index folders this application wrote itself.
+        vectorstore = FAISS.load_local(
+            self.index_path,
+            self._embeddings(),
+            allow_dangerous_deserialization=True
+        )
+        print(f"[INFO] FAISS index loaded from {self.index_path}.")
+        return vectorstore
diff --git a/app/services/file_service.py b/app/services/file_service.py
new file mode 100644
index 0000000..fa40565
--- /dev/null
+++ b/app/services/file_service.py
@@ -0,0 +1,52 @@
+# app/services/file_service.py
+
+import os
+import pdfplumber
+
+
+class FileService:
+    """
+    A service to handle file-related operations, including loading PDFs from a folder.
+    """
+    def __init__(self, folder_path: str):
+        """
+        Initialize the FileService with the folder path to read files from.
+
+        :param folder_path: Folder containing the PDF files; stored as an absolute path.
+        """
+        self.folder_path = os.path.abspath(folder_path)
+
+    def load_pdfs(self):
+        """
+        Reads all PDF files from the folder and returns their paths.
+
+        :return: Sorted list of paths to PDF files in the folder.
+        :raises FileNotFoundError: If the folder does not exist or contains no PDF files.
+        """
+        if not os.path.isdir(self.folder_path):
+            raise FileNotFoundError(f"The folder {self.folder_path} does not exist.")
+
+        # sorted(): os.listdir order is arbitrary, and the index should be built
+        # in the same order on every run. lower(): also accept ".PDF".
+        pdf_files = [
+            os.path.join(self.folder_path, f)
+            for f in sorted(os.listdir(self.folder_path))
+            if f.lower().endswith(".pdf")
+        ]
+
+        if not pdf_files:
+            raise FileNotFoundError(f"No PDF files found in the folder {self.folder_path}.")
+
+        return pdf_files
+
+    def extract_text_from_pdf(self, pdf_path):
+        """
+        Extracts text from the PDF file using pdfplumber.
+
+        :param pdf_path: Path to the PDF file.
+        :return: Extracted text as a string; each page's text is followed by a newline,
+            pages without extractable text are skipped.
+        """
+        with pdfplumber.open(pdf_path) as pdf:
+            page_texts = [page.extract_text() for page in pdf.pages]
+        return "".join(f"{page_text}\n" for page_text in page_texts if page_text)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..214ce3b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,23 @@
+# FastAPI and ASGI server
+fastapi
+
+# LangChain and associated dependencies
+langchain
+langchain-community
+langchain-openai
+
+# OpenAI API client
+openai
+
+# Embeddings and Vector Stores
+faiss-cpu
+
+# Utility Libraries
+python-dotenv==1.0.1 # For environment variable management
+pydantic==2.7.1 # Data validation library (dependency for FastAPI and LangChain)
+
+# PDF text extraction
+pdfplumber
+
+# NOTE: "asyncio" is deliberately not listed. It is part of the standard library;
+# the PyPI package of that name is an obsolete backport that must not be installed.