27 lines
920 B
Python
27 lines
920 B
Python
import functools
import logging
import time

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import Docx2txtLoader

from app.core.config import settings
|
|
|
|
# Configure the root logger at import time so INFO-level records (such as the
# latency lines emitted through ``logging.info`` in this module) are output.
logging.basicConfig(level=logging.INFO)
|
|
|
|
def log_latency(func):
    """Decorate *func* so each successful call logs its latency in milliseconds.

    The returned wrapper forwards all positional and keyword arguments to
    *func* unchanged and returns whatever *func* returns. After the call
    completes, an INFO record of the form
    ``Latency for <name>: <ms> ms`` is emitted on the root logger.

    If *func* raises, the exception propagates and no latency is logged.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same name, docstring and other metadata as *func*.
    """

    # functools.wraps copies __name__, __doc__, etc. onto the wrapper so the
    # decorated function stays introspectable (and is not reported as
    # "wrapper" by tooling, tracebacks, or other decorators).
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        latency_ms = (time.perf_counter() - start_time) * 1000
        # Lazy %-formatting: the message is only built if INFO is enabled.
        logging.info("Latency for %s: %.2f ms", func.__name__, latency_ms)
        return result

    return wrapper
|
|
|
|
@log_latency
def docx_to_chunks(file_path: str) -> list[str]:
    """Load a DOCX file and return its text as a list of chunk strings.

    The file is read with ``Docx2txtLoader`` and the loaded documents are
    split by a ``RecursiveCharacterTextSplitter`` whose chunk size and
    overlap come from ``settings.CHUNK_SIZE`` and ``settings.CHUNK_OVERLAP``.

    Args:
        file_path: Path of the DOCX file to load.

    Returns:
        The ``page_content`` of every chunk produced by the splitter.
    """
    # Load first, then build the splitter, matching the original order.
    documents = Docx2txtLoader(file_path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=settings.CHUNK_SIZE,
        chunk_overlap=settings.CHUNK_OVERLAP,
    )
    pieces = splitter.split_documents(documents)
    return [piece.page_content for piece in pieces]