Feature done

Average latency 25ms
Joydeep Pandey 2025-05-19 17:20:17 +05:30
parent b9de7c5ecf
commit 6e16fc99c9
13 changed files with 342 additions and 0 deletions

app.py Normal file

@@ -0,0 +1,112 @@
import os
import uuid

import numpy as np
import faiss
import httpx
from fastapi import FastAPI, UploadFile, Form
from langchain_community.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pydantic import BaseModel
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer

# Load environment variables
load_dotenv()

# Initialize the embedding model
MODEL = SentenceTransformer("all-MiniLM-L6-v2")

# Read the key from the environment (.env); never hardcode the secret
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

app = FastAPI(title="RAG-via-FastAPI")

# --- helpers ---------------------------------------------------------

def docx_to_chunks(file_path):
    """Load a DOCX file and split it into overlapping text chunks."""
    pages = Docx2txtLoader(file_path).load()
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200
    ).split_documents(pages)
    return [c.page_content for c in chunks]


def get_embeddings(texts):
    # Generate embeddings using SentenceTransformer
    embeddings = MODEL.encode(texts, convert_to_tensor=False)
    return np.array(embeddings, dtype="float32")


def build_faiss_index(vectors):
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index


async def openrouter_chat(prompt):
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "HTTP-Referer": "https://yourapp.example",
        "Content-Type": "application/json"
    }
    data = {
        "model": "google/gemma-3-27b-it:free",  # any OpenRouter-hosted model
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.3,
        "max_tokens": 512
    }
    async with httpx.AsyncClient(timeout=120) as client:
        r = await client.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            json=data
        )
        r.raise_for_status()
        return r.json()["choices"][0]["message"]["content"]

# --- request / response models --------------------------------------

class QueryRequest(BaseModel):
    query: str


class RAGResponse(BaseModel):
    answer: str
    sources: list[int]

# --- endpoints ------------------------------------------------------

@app.post("/rag", response_model=RAGResponse)
async def rag_endpoint(file: UploadFile, query: str = Form(...)):
    # Create temp directory if it doesn't exist; a local temp directory is
    # used instead of /tmp for Windows compatibility
    os.makedirs("temp", exist_ok=True)
    tmp_path = f"temp/{uuid.uuid4()}.docx"
    try:
        # Save uploaded file
        with open(tmp_path, "wb") as f:
            f.write(await file.read())
        # 1. chunk
        chunks = docx_to_chunks(tmp_path)
        # 2. embed
        embeddings = get_embeddings(chunks)
        # 3. search (clamp k so short documents don't yield -1 indices)
        index = build_faiss_index(embeddings)
        q_vec = get_embeddings([query])
        distances, idxs = index.search(q_vec, k=min(3, len(chunks)))
        context = "\n\n".join(chunks[i] for i in idxs[0])
        # 4. generate
        prompt = (f"Use ONLY the context to answer the question.\n\n"
                  f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:")
        answer = await openrouter_chat(prompt)
        return {"answer": answer, "sources": idxs[0].tolist()}
    finally:
        # Cleanup temp file
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
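
A hypothetical smoke test for the endpoint (not part of the commit; assumes the server was started with uvicorn app:app and that a sample.docx exists locally):

# smoke_test.py - hypothetical client for /rag
import httpx

with open("sample.docx", "rb") as f:
    r = httpx.post(
        "http://localhost:8000/rag",
        files={"file": ("sample.docx", f)},
        data={"query": "What is the document about?"},
        timeout=120,
    )
r.raise_for_status()
print(r.json())  # e.g. {"answer": "...", "sources": [0, 2, 1]}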

app/__init__.py Normal file

@@ -0,0 +1,3 @@
"""
RAG (Retrieval Augmented Generation) application package.
"""

app/api/__init__.py Normal file

app/api/routes.py Normal file

@@ -0,0 +1,72 @@
import os
import uuid
import time
import logging
import functools

from fastapi import APIRouter, UploadFile, Form

from app.core.models import RAGResponse
from app.services.document import docx_to_chunks
from app.services.embedding import embedding_service
from app.services.llm import chat_completion
from app.core.config import settings

logging.basicConfig(level=logging.INFO)

router = APIRouter()


def log_latency(func):
    @functools.wraps(func)  # preserve the wrapped function's metadata
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        latency_ms = (time.time() - start_time) * 1000
        logging.info(f"Latency for {func.__name__}: {latency_ms:.2f} ms")
        return result
    return wrapper


@log_latency
def search_faiss(index, query_vector, k, chunks):
    """Search the FAISS index and log the top-k hits with a text preview."""
    distances, idxs = index.search(query_vector, k)
    for i in range(k):
        chunk_text = chunks[idxs[0][i]]
        preview = ' '.join(chunk_text.split()[:20])  # first 20 words
        logging.info(f"Top {i+1} result - Index: {idxs[0][i]}, Distance: {distances[0][i]:.4f}")
        logging.info(f"Text preview: {preview}...")
    return distances, idxs


@router.post("/rag", response_model=RAGResponse)
async def rag_endpoint(file: UploadFile, query: str = Form(...)):
    # Create temp directory if it doesn't exist; a local temp directory is
    # used for Windows compatibility
    os.makedirs("temp", exist_ok=True)
    tmp_path = f"temp/{uuid.uuid4()}.docx"
    try:
        # Save uploaded file
        with open(tmp_path, "wb") as f:
            f.write(await file.read())
        # 1. chunk
        chunks = docx_to_chunks(tmp_path)
        # 2. embed
        embeddings = embedding_service.get_embeddings(chunks)
        # 3. search (clamp k so short documents don't yield -1 indices)
        index = embedding_service.build_faiss_index(embeddings)
        q_vec = embedding_service.get_embeddings([query])
        k = min(settings.TOP_K_RESULTS, len(chunks))
        distances, idxs = search_faiss(index, q_vec, k, chunks)
        # 4. generate
        context = "\n\n".join(chunks[i] for i in idxs[0])
        prompt = (f"Use ONLY the context to answer the question.\n\n"
                  f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:")
        answer = await chat_completion(prompt)
        return {"answer": answer, "sources": idxs[0].tolist()}
    finally:
        # Cleanup temp file
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
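
log_latency is duplicated verbatim in app/services/document.py, app/services/embedding.py, and app/services/llm.py below, so a natural follow-up would hoist it into one shared module. A minimal sketch, assuming a new app/core/timing.py (a hypothetical path, not part of this commit):

# app/core/timing.py - hypothetical shared module
import functools
import logging
import time


def log_latency(func):
    """Log the wall-clock latency of a synchronous call in milliseconds."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        logging.info("Latency for %s: %.2f ms", func.__name__, (time.time() - start) * 1000)
        return result
    return wrapper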

app/core/__init__.py Normal file

app/core/config.py Normal file

@@ -0,0 +1,13 @@
from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    # Loaded from the environment / .env; never hardcode the secret here
    OPENROUTER_API_KEY: str = ""
    MODEL_NAME: str = "all-MiniLM-L6-v2"
    CHUNK_SIZE: int = 1000
    CHUNK_OVERLAP: int = 200
    TOP_K_RESULTS: int = 3

    class Config:
        env_file = ".env"


settings = Settings()
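
With env_file = ".env", pydantic-settings reads the key from a local .env file at startup. A minimal sketch (placeholder value; keep this file out of version control):

# .env
OPENROUTER_API_KEY=your-openrouter-key-here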

app/core/models.py Normal file

@@ -0,0 +1,8 @@
from pydantic import BaseModel


class QueryRequest(BaseModel):
    query: str


class RAGResponse(BaseModel):
    answer: str
    sources: list[int]
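
For reference, a /rag response body that validates against RAGResponse looks like this (values are illustrative):

{"answer": "The report covers Q3 revenue growth.", "sources": [0, 2, 1]}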

app/services/__init__.py Normal file

app/services/document.py Normal file

@@ -0,0 +1,27 @@
import time
import logging

from langchain_community.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from app.core.config import settings

logging.basicConfig(level=logging.INFO)


def log_latency(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        latency_ms = (time.time() - start_time) * 1000
        logging.info(f"Latency for {func.__name__}: {latency_ms:.2f} ms")
        return result
    return wrapper


@log_latency
def docx_to_chunks(file_path: str) -> list[str]:
    """Convert a DOCX file to overlapping text chunks."""
    pages = Docx2txtLoader(file_path).load()
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=settings.CHUNK_SIZE,
        chunk_overlap=settings.CHUNK_OVERLAP
    ).split_documents(pages)
    return [c.page_content for c in chunks]
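
Purely as a hypothetical illustration of what these settings do (not part of the commit): splitting roughly 2,500 characters with the same parameters yields about three chunks of at most 1,000 characters, each overlapping its neighbour by up to 200.

# chunking_demo.py - hypothetical, standalone
from langchain.text_splitter import RecursiveCharacterTextSplitter

text = "word " * 500  # ~2,500 characters
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_text(text)
print(len(chunks), [len(c) for c in chunks])  # about 3 chunks, each <= 1000 chars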

app/services/embedding.py Normal file

@@ -0,0 +1,39 @@
import time
import logging

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

from app.core.config import settings

logging.basicConfig(level=logging.INFO)


def log_latency(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        latency_ms = (time.time() - start_time) * 1000
        logging.info(f"Latency for {func.__name__}: {latency_ms:.2f} ms")
        return result
    return wrapper


class EmbeddingService:
    def __init__(self):
        self.model = SentenceTransformer(settings.MODEL_NAME)

    @log_latency
    def get_embeddings(self, texts: list[str]) -> np.ndarray:
        """Generate embeddings for a list of texts."""
        embeddings = self.model.encode(texts, convert_to_tensor=False)
        return np.array(embeddings, dtype="float32")

    @log_latency
    def build_faiss_index(self, vectors: np.ndarray) -> faiss.Index:
        """Build a flat L2 FAISS index from vectors."""
        dim = vectors.shape[1]
        index = faiss.IndexFlatL2(dim)
        index.add(vectors)
        return index


embedding_service = EmbeddingService()
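
A hypothetical sanity check for the index construction (not in the commit): IndexFlatL2 performs exact nearest-neighbour search, so a stored vector's closest match is itself. 384 is the output dimension of all-MiniLM-L6-v2.

# faiss_demo.py - hypothetical, standalone
import numpy as np
import faiss

vecs = np.random.rand(10, 384).astype("float32")  # 384 = all-MiniLM-L6-v2 dim
index = faiss.IndexFlatL2(384)
index.add(vecs)
distances, idxs = index.search(vecs[:1], k=3)
print(idxs[0], distances[0])  # idxs[0][0] == 0: a stored vector matches itself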

app/services/llm.py Normal file

@@ -0,0 +1,41 @@
import time
import logging
import functools

import httpx

from app.core.config import settings

logging.basicConfig(level=logging.INFO)


def log_latency(func):
    # chat_completion is a coroutine function, so the wrapper must await it;
    # a synchronous wrapper would only time coroutine creation (~0 ms).
    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        start_time = time.time()
        result = await func(*args, **kwargs)
        latency_ms = (time.time() - start_time) * 1000
        logging.info(f"Latency for {func.__name__}: {latency_ms:.2f} ms")
        return result
    return wrapper


@log_latency
async def chat_completion(prompt: str) -> str:
    """Get a completion from the OpenRouter API."""
    headers = {
        "Authorization": f"Bearer {settings.OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "google/gemma-3-27b-it:free",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.1,
    }
    async with httpx.AsyncClient(timeout=120) as client:
        r = await client.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            json=data
        )
        r.raise_for_status()
        return r.json()["choices"][0]["message"]["content"]
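
A hypothetical one-off check of chat_completion outside FastAPI (assumes OPENROUTER_API_KEY is set in the environment / .env):

# llm_demo.py - hypothetical, standalone
import asyncio
from app.services.llm import chat_completion

print(asyncio.run(chat_completion("Reply with the single word: pong")))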

main.py Normal file

@@ -0,0 +1,5 @@
from fastapi import FastAPI
from app.api.routes import router
app = FastAPI(title="RAG-via-FastAPI")
app.include_router(router)

requirements.txt Normal file

@@ -0,0 +1,22 @@
annotated-types==0.7.0
anyio==4.9.0
click==8.2.0
colorama==0.4.6
fastapi==0.115.12
h11==0.16.0
idna==3.10
pydantic==2.11.4
pydantic_core==2.33.2
sniffio==1.3.1
starlette==0.46.2
typing-inspection==0.4.0
typing_extensions==4.13.2
uvicorn==0.34.2
python-multipart
langchain-community
python-docx
docx2txt  # Docx2txtLoader imports docx2txt, not python-docx
numpy  # imported directly in app.py and app/services/embedding.py
faiss-cpu
sentence-transformers
httpx
python-dotenv
pydantic-settings