diff --git a/app.py b/app.py
new file mode 100644
index 0000000..aea0635
--- /dev/null
+++ b/app.py
@@ -0,0 +1,112 @@
+import os
+import uuid
+import numpy as np
+import faiss
+import httpx
+from fastapi import FastAPI, UploadFile, Form
+from langchain_community.document_loaders import Docx2txtLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from pydantic import BaseModel
+from dotenv import load_dotenv
+from sentence_transformers import SentenceTransformer
+
+# Load environment variables
+load_dotenv()
+
+# Initialize the embedding model
+MODEL = SentenceTransformer("all-MiniLM-L6-v2")
+OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")  # never hardcode secrets
+
+app = FastAPI(title="RAG-via-FastAPI")
+
+# --- helpers ---------------------------------------------------------
+
+def docx_to_chunks(file_path):
+    pages = Docx2txtLoader(file_path).load()
+    chunks = RecursiveCharacterTextSplitter(
+        chunk_size=1000, chunk_overlap=200
+    ).split_documents(pages)
+    return [c.page_content for c in chunks]
+
+def get_embeddings(texts):
+    # Generate embeddings using SentenceTransformer
+    embeddings = MODEL.encode(texts, convert_to_tensor=False)
+    return np.array(embeddings, dtype="float32")
+
+def build_faiss_index(vectors):
+    dim = vectors.shape[1]
+    index = faiss.IndexFlatL2(dim)
+    index.add(vectors)
+    return index
+
+async def openrouter_chat(prompt):
+    headers = {
+        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
+        "HTTP-Referer": "https://yourapp.example",
+        "Content-Type": "application/json"
+    }
+    data = {
+        "model": "google/gemma-3-27b-it:free",  # any OpenRouter-hosted model
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt}
+        ],
+        "temperature": 0.3,
+        "max_tokens": 512
+    }
+    async with httpx.AsyncClient(timeout=120) as client:
+        r = await client.post(
+            "https://openrouter.ai/api/v1/chat/completions",
+            headers=headers,
+            json=data
+        )
+        r.raise_for_status()
+        return r.json()["choices"][0]["message"]["content"]
+
+# --- request / response models --------------------------------------
+
+class QueryRequest(BaseModel):
+    query: str
+
+class RAGResponse(BaseModel):
+    answer: str
+    sources: list[int]
+
+# --- endpoints ------------------------------------------------------
+
+@app.post("/rag", response_model=RAGResponse)
+async def rag_endpoint(file: UploadFile, query: str = Form(...)):
+    # Create temp directory if it doesn't exist
+    os.makedirs("temp", exist_ok=True)
+
+    # Use a local temp directory instead of /tmp for Windows compatibility
+    tmp_path = f"temp/{uuid.uuid4()}.docx"
+
+    try:
+        # Save uploaded file
+        with open(tmp_path, "wb") as f:
+            f.write(await file.read())
+
+        # 1. chunk
+        chunks = docx_to_chunks(tmp_path)
+
+        # 2. embed
+        embeddings = get_embeddings(chunks)
+
+        # 3. search
+        index = build_faiss_index(embeddings)
+        q_vec = get_embeddings([query])
+        distances, idxs = index.search(q_vec, k=3)
+        context = "\n\n".join(chunks[i] for i in idxs[0])
+
+        # 4. generate
+        prompt = (f"Use ONLY the context to answer the question.\n\n"
+                  f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:")
+        answer = await openrouter_chat(prompt)
+
+        return {"answer": answer, "sources": idxs[0].tolist()}
+
+    finally:
+        # Cleanup temp file
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
diff --git a/app/__init__.py b/app/__init__.py
new file mode 100644
index 0000000..d967854
--- /dev/null
+++ b/app/__init__.py
@@ -0,0 +1,3 @@
+"""
+RAG (Retrieval Augmented Generation) application package.
+"""
\ No newline at end of file
diff --git a/app/api/__init__.py b/app/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/api/routes.py b/app/api/routes.py
new file mode 100644
index 0000000..03ded41
--- /dev/null
+++ b/app/api/routes.py
@@ -0,0 +1,72 @@
+import os
+import uuid
+import time
+import logging
+from fastapi import APIRouter, UploadFile, Form
+from app.core.models import RAGResponse
+from app.services.document import docx_to_chunks
+from app.services.embedding import embedding_service
+from app.services.llm import chat_completion
+from app.core.config import settings
+
+logging.basicConfig(level=logging.INFO)
+router = APIRouter()
+
+def log_latency(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        latency_ms = (end_time - start_time) * 1000
+        logging.info(f"Latency for {func.__name__}: {latency_ms:.2f} ms")
+        return result
+    return wrapper
+
+@log_latency
+def search_faiss(index, query_vector, k, chunks):  # Added chunks parameter
+    """Search the FAISS index for similar vectors."""
+    distances, idxs = index.search(query_vector, k)
+    # Log top k results with their distances and a text preview
+    for i in range(k):
+        chunk_text = chunks[idxs[0][i]]
+        preview = ' '.join(chunk_text.split()[:20])  # Get first 20 words
+        logging.info(f"Top {i+1} result - Index: {idxs[0][i]}, Distance: {distances[0][i]:.4f}")
+        logging.info(f"Text preview: {preview}...")
+    return distances, idxs
+
+@router.post("/rag", response_model=RAGResponse)
+async def rag_endpoint(file: UploadFile, query: str = Form(...)):
+    # Create temp directory if it doesn't exist
+    os.makedirs("temp", exist_ok=True)
+
+    # Use a local temp directory for Windows compatibility
+    tmp_path = f"temp/{uuid.uuid4()}.docx"
+
+    try:
+        # Save uploaded file
+        with open(tmp_path, "wb") as f:
+            f.write(await file.read())
+
+        # 1. chunk
+        chunks = docx_to_chunks(tmp_path)
+
+        # 2. embed
+        embeddings = embedding_service.get_embeddings(chunks)
+
+        # 3. search
+        index = embedding_service.build_faiss_index(embeddings)
+        q_vec = embedding_service.get_embeddings([query])
+        distances, idxs = search_faiss(index, q_vec, settings.TOP_K_RESULTS, chunks)
+
+        # 4. generate
+        context = "\n\n".join(chunks[i] for i in idxs[0])
+        prompt = (f"Use ONLY the context to answer the question.\n\n"
+                  f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:")
+        answer = await chat_completion(prompt)
+
+        return {"answer": answer, "sources": idxs[0].tolist()}
+
+    finally:
+        # Cleanup temp file
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
\ No newline at end of file
diff --git a/app/core/__init__.py b/app/core/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/core/config.py b/app/core/config.py
new file mode 100644
index 0000000..2a98211
--- /dev/null
+++ b/app/core/config.py
@@ -0,0 +1,13 @@
+from pydantic_settings import BaseSettings
+
+class Settings(BaseSettings):
+    OPENROUTER_API_KEY: str = ""  # supplied via environment or .env, never committed
+    MODEL_NAME: str = "all-MiniLM-L6-v2"
+    CHUNK_SIZE: int = 1000
+    CHUNK_OVERLAP: int = 200
+    TOP_K_RESULTS: int = 3
+
+    class Config:
+        env_file = ".env"
+
+settings = Settings()
\ No newline at end of file
diff --git a/app/core/models.py b/app/core/models.py
new file mode 100644
index 0000000..63703e5
--- /dev/null
+++ b/app/core/models.py
@@ -0,0 +1,8 @@
+from pydantic import BaseModel
+
+class QueryRequest(BaseModel):
+    query: str
+
+class RAGResponse(BaseModel):
+    answer: str
+    sources: list[int]
\ No newline at end of file
diff --git a/app/services/__init__.py b/app/services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/services/document.py b/app/services/document.py
new file mode 100644
index 0000000..382c64d
--- /dev/null
+++ b/app/services/document.py
@@ -0,0 +1,27 @@
+from langchain_community.document_loaders import Docx2txtLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from app.core.config import settings
+import time
+import logging
+
+logging.basicConfig(level=logging.INFO)
+
+def log_latency(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        latency_ms = (end_time - start_time) * 1000
+        logging.info(f"Latency for {func.__name__}: {latency_ms:.2f} ms")
+        return result
+    return wrapper
+
+@log_latency
+def docx_to_chunks(file_path: str) -> list[str]:
+    """Convert a DOCX file to text chunks."""
+    pages = Docx2txtLoader(file_path).load()
+    chunks = RecursiveCharacterTextSplitter(
+        chunk_size=settings.CHUNK_SIZE,
+        chunk_overlap=settings.CHUNK_OVERLAP
+    ).split_documents(pages)
+    return [c.page_content for c in chunks]
\ No newline at end of file
diff --git a/app/services/embedding.py b/app/services/embedding.py
new file mode 100644
index 0000000..ee97890
--- /dev/null
+++ b/app/services/embedding.py
@@ -0,0 +1,39 @@
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+from app.core.config import settings
+import logging
+import time
+
+logging.basicConfig(level=logging.INFO)
+
+def log_latency(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        latency_ms = (end_time - start_time) * 1000
+        logging.info(f"Latency for {func.__name__}: {latency_ms:.2f} ms")
+        return result
+    return wrapper
+
+class EmbeddingService:
+
+    def __init__(self):
+        self.model = SentenceTransformer(settings.MODEL_NAME)
+
+    @log_latency
+    def get_embeddings(self, texts: list[str]) -> np.ndarray:
+        """Generate embeddings for a list of texts."""
+        embeddings = self.model.encode(texts, convert_to_tensor=False)
+        return np.array(embeddings, dtype="float32")
+
+    @log_latency
+    def build_faiss_index(self, vectors: np.ndarray) -> faiss.Index:
+        """Build a FAISS index from vectors."""
+        dim = vectors.shape[1]
+        index = faiss.IndexFlatL2(dim)
+        index.add(vectors)
+        return index
+
+embedding_service = EmbeddingService()
\ No newline at end of file
diff --git a/app/services/llm.py b/app/services/llm.py
new file mode 100644
index 0000000..8ed6856
--- /dev/null
+++ b/app/services/llm.py
@@ -0,0 +1,42 @@
+import httpx
+from app.core.config import settings
+import logging
+import time
+
+logging.basicConfig(level=logging.INFO)
+
+def log_latency(func):
+    # chat_completion is a coroutine function, so the wrapper must await it
+    async def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = await func(*args, **kwargs)
+        end_time = time.time()
+        latency_ms = (end_time - start_time) * 1000
+        logging.info(f"Latency for {func.__name__}: {latency_ms:.2f} ms")
+        return result
+    return wrapper
+
+@log_latency
+async def chat_completion(prompt: str) -> str:
+    """Get a completion from the OpenRouter API."""
+    headers = {
+        "Authorization": f"Bearer {settings.OPENROUTER_API_KEY}",
+        "Content-Type": "application/json"
+    }
+    data = {
+        "model": "google/gemma-3-27b-it:free",
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt}
+        ],
+        "temperature": 0.1,
+    }
+    async with httpx.AsyncClient(timeout=120) as client:
+        r = await client.post(
+            "https://openrouter.ai/api/v1/chat/completions",
+            headers=headers,
+            json=data
+        )
+        r.raise_for_status()
+        return r.json()["choices"][0]["message"]["content"]
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..a09e9a0
--- /dev/null
+++ b/main.py
@@ -0,0 +1,5 @@
+from fastapi import FastAPI
+from app.api.routes import router
+
+app = FastAPI(title="RAG-via-FastAPI")
+app.include_router(router)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f598c0d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,25 @@
+annotated-types==0.7.0
+anyio==4.9.0
+click==8.2.0
+colorama==0.4.6
+fastapi==0.115.12
+h11==0.16.0
+idna==3.10
+pydantic==2.11.4
+pydantic_core==2.33.2
+sniffio==1.3.1
+starlette==0.46.2
+typing-inspection==0.4.0
+typing_extensions==4.13.2
+uvicorn==0.34.2
+python-multipart
+langchain
+langchain-community
+docx2txt
+python-docx
+faiss-cpu
+sentence-transformers
+numpy
+httpx
+python-dotenv
+pydantic-settings
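
Both `app.py` and `app/core/config.py` expect `OPENROUTER_API_KEY` to come from the environment; a git-ignored `.env` file in the project root is one way to supply it locally. The value below is a placeholder:

```
OPENROUTER_API_KEY=sk-or-v1-...
```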
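
For reference, a minimal sketch of how the `/rag` endpoint could be exercised once the server is running (e.g. `uvicorn main:app --reload`). The host/port, the file name `report.docx`, and the question are placeholders, not part of the diff:

```python
# client_example.py - hypothetical smoke test for the /rag endpoint.
# Assumes the API is listening on localhost:8000 and a DOCX file exists
# at the given path; httpx is already in requirements.txt.
import httpx

def ask(docx_path: str, question: str) -> dict:
    with open(docx_path, "rb") as f:
        r = httpx.post(
            "http://127.0.0.1:8000/rag",
            files={"file": (docx_path, f)},   # multipart upload of the DOCX
            data={"query": question},         # matches query: str = Form(...)
            timeout=180,                      # embed + LLM round-trip is slow on first run
        )
    r.raise_for_status()
    return r.json()

if __name__ == "__main__":
    res = ask("report.docx", "What are the key findings?")
    print(res["answer"])
    print("source chunk indices:", res["sources"])
```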