Phase 4: Documents & Memory — upload, FTS, AI tools, context injection

Backend:
- Document + MemoryEntry models with Alembic migration (GIN FTS index)
- File upload endpoint with path traversal protection (sanitized filenames)
- Background document text extraction (PyMuPDF)
- Full-text search on extracted_text via PostgreSQL tsvector/tsquery
- Memory CRUD with enum-validated categories/importance, field allow-list
- AI tools: save_memory, search_documents, get_memory (Claude function calling)
- Tool execution loop in stream_ai_response (multi-turn tool use)
- Context assembly: injects critical memory + relevant doc excerpts
- File storage abstraction (local filesystem, S3-swappable)
- Secure file deletion (DB flush before disk delete)

Frontend:
- Document upload dialog (drag-and-drop + file picker)
- Document list with status badges, search, download (via authenticated blob)
- Document viewer with extracted text preview
- Memory list grouped by category with importance color coding
- Memory editor with category/importance dropdowns
- Documents + Memory pages with full CRUD
- Enabled sidebar navigation for both sections

Review fixes applied:
- Sanitized upload filenames (path traversal prevention)
- Download via axios blob (not bare <a href>, preserves auth)
- Route ordering: /search before /{id}/reindex
- Memory update allows is_active=False + field allow-list
- MemoryEditor form resets on mode switch
- Literal enum validation on category/importance schemas
- DB flush before file deletion for data integrity

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-19 13:46:59 +03:00
parent 03afb7a075
commit 8b8fe916f0
37 changed files with 1921 additions and 26 deletions

View File

@@ -0,0 +1,126 @@
import asyncio
import uuid
from pathlib import PurePosixPath
from typing import Annotated
from fastapi import APIRouter, Depends, Query, UploadFile, File, HTTPException, status
from fastapi.responses import FileResponse
from sqlalchemy.ext.asyncio import AsyncSession
from app.api.deps import get_current_user
from app.config import settings
from app.database import get_db
from app.models.user import User
from app.schemas.document import DocumentListResponse, DocumentResponse, DocumentSearchRequest
from app.services import document_service
from app.utils.file_storage import save_upload, get_file_path
from app.workers.document_processor import process_document
# Router for the document endpoints in this module; every route registered on
# it is served under the "/documents" prefix and tagged "documents".
router = APIRouter(prefix="/documents", tags=["documents"])
# Content types the upload endpoint accepts, grouped by kind.
_PDF_MIME_TYPES = ["application/pdf"]
_IMAGE_MIME_TYPES = [
    "image/jpeg",
    "image/png",
    "image/tiff",
    "image/webp",
]

# Allow-list checked against the uploaded file's declared content type.
ALLOWED_MIME_TYPES = _PDF_MIME_TYPES + _IMAGE_MIME_TYPES
# Strong references to in-flight processing tasks started by upload_document.
# The event loop only keeps weak references to tasks, so a task whose result
# is discarded may be garbage-collected before it finishes.
_upload_tasks: set[asyncio.Task] = set()


@router.post("/", response_model=DocumentResponse, status_code=status.HTTP_201_CREATED)
async def upload_document(
    user: Annotated[User, Depends(get_current_user)],
    db: Annotated[AsyncSession, Depends(get_db)],
    file: UploadFile = File(...),
    doc_type: str = Query(default="other"),
):
    """Store an uploaded file, create its document record, and start processing.

    Args:
        user: The authenticated user, resolved by ``get_current_user``.
        db: Database session, resolved by ``get_db``.
        file: The uploaded file; its declared content type must be listed in
            ``ALLOWED_MIME_TYPES``.
        doc_type: Free-form document type passed through to
            ``document_service.create_document`` (defaults to ``"other"``).

    Returns:
        ``DocumentResponse`` validated from the created document.

    Raises:
        HTTPException: 400 when the content type is not allowed or the file
            exceeds ``settings.MAX_UPLOAD_SIZE_MB`` megabytes.
    """
    mime_type = file.content_type
    if mime_type not in ALLOWED_MIME_TYPES:
        raise HTTPException(status_code=400, detail=f"Unsupported file type: {mime_type}")

    max_bytes = int(settings.MAX_UPLOAD_SIZE_MB * 1024 * 1024)
    # Read at most one byte past the limit: enough to detect an oversized
    # upload without pulling an arbitrarily large file into memory.
    content = await file.read(max_bytes + 1)
    if len(content) > max_bytes:
        raise HTTPException(status_code=400, detail=f"File too large. Max {settings.MAX_UPLOAD_SIZE_MB}MB")

    # Keep only the final path component of the client-supplied name.
    # Backslashes are normalised first so Windows-style paths are stripped
    # too, and degenerate names fall back to a fixed placeholder.
    raw_name = (file.filename or "").replace("\\", "/")
    safe_name = PurePosixPath(raw_name).name
    if safe_name in ("", ".", ".."):
        safe_name = "upload"

    doc_id = uuid.uuid4()
    filename = f"{doc_id}_{safe_name}"
    storage_path = await save_upload(user.id, doc_id, filename, content)

    # NOTE(review): doc_id is used for the storage path and filename but is
    # not passed to create_document, so doc.id may differ from it - confirm.
    # mime_type passed the allow-list check above, so it is a non-empty string.
    doc = await document_service.create_document(
        db, user.id, filename, safe_name,
        storage_path, mime_type,
        len(content), doc_type,
    )

    # Trigger background processing.
    # NOTE(review): the task may start before this request's DB session is
    # committed - confirm process_document tolerates a not-yet-visible row.
    task = asyncio.create_task(process_document(doc.id, storage_path, mime_type))
    _upload_tasks.add(task)
    task.add_done_callback(_upload_tasks.discard)

    return DocumentResponse.model_validate(doc)
@router.get("/", response_model=DocumentListResponse)
async def list_documents(
    user: Annotated[User, Depends(get_current_user)],
    db: Annotated[AsyncSession, Depends(get_db)],
    doc_type: str | None = Query(default=None),
    processing_status: str | None = Query(default=None),
):
    """List documents for the authenticated user.

    ``doc_type`` and ``processing_status`` are optional query parameters
    (both default to ``None``) forwarded unchanged to
    ``document_service.get_user_documents``.

    Returns:
        ``DocumentListResponse`` wrapping one ``DocumentResponse`` per
        document returned by the service.
    """
    rows = await document_service.get_user_documents(
        db,
        user.id,
        doc_type,
        processing_status,
    )
    serialized = list(map(DocumentResponse.model_validate, rows))
    return DocumentListResponse(documents=serialized)
@router.get("/{doc_id}", response_model=DocumentResponse)
async def get_document(
    doc_id: uuid.UUID,
    user: Annotated[User, Depends(get_current_user)],
    db: Annotated[AsyncSession, Depends(get_db)],
):
    """Return one document looked up by ``doc_id`` and the caller's user id.

    NOTE(review): presumably ``document_service.get_document`` raises when the
    document is missing or belongs to another user - confirm in the service.
    """
    record = await document_service.get_document(db, doc_id, user.id)
    response = DocumentResponse.model_validate(record)
    return response
@router.get("/{doc_id}/download")
async def download_document(
    doc_id: uuid.UUID,
    user: Annotated[User, Depends(get_current_user)],
    db: Annotated[AsyncSession, Depends(get_db)],
):
    """Send the stored file for a document back to the caller.

    The document is looked up by ``doc_id`` and the caller's user id, and its
    ``storage_path`` is resolved through ``get_file_path``.

    Returns:
        ``FileResponse`` served under the document's original filename and
        recorded MIME type.

    Raises:
        HTTPException: 404 when the resolved path does not exist.
    """
    document = await document_service.get_document(db, doc_id, user.id)
    stored_file = get_file_path(document.storage_path)

    if stored_file.exists():
        return FileResponse(
            path=str(stored_file),
            filename=document.original_filename,
            media_type=document.mime_type,
        )

    # The database row exists but its backing file is gone.
    raise HTTPException(status_code=404, detail="File not found on disk")
@router.delete("/{doc_id}", status_code=status.HTTP_204_NO_CONTENT)
async def delete_document(
    doc_id: uuid.UUID,
    user: Annotated[User, Depends(get_current_user)],
    db: Annotated[AsyncSession, Depends(get_db)],
):
    """Delete a document via ``document_service.delete_document``.

    The service receives ``doc_id`` and the caller's user id; the endpoint
    responds with 204 and no body.
    """
    owner_id = user.id
    await document_service.delete_document(db, doc_id, owner_id)
    return None
@router.post("/search", response_model=DocumentListResponse)
async def search_documents(
    data: DocumentSearchRequest,
    user: Annotated[User, Depends(get_current_user)],
    db: Annotated[AsyncSession, Depends(get_db)],
):
    """Search the caller's documents with the query in the request body.

    ``data.query`` and the caller's user id are handed to
    ``document_service.search_documents``.

    Returns:
        ``DocumentListResponse`` wrapping one ``DocumentResponse`` per match
        returned by the service.
    """
    matches = await document_service.search_documents(db, user.id, data.query)
    results = []
    for match in matches:
        results.append(DocumentResponse.model_validate(match))
    return DocumentListResponse(documents=results)
# Strong references to in-flight processing tasks started by reindex_document.
# The event loop only keeps weak references to tasks, so a task whose result
# is discarded may be garbage-collected before it finishes.
_reindex_tasks: set[asyncio.Task] = set()


@router.post("/{doc_id}/reindex", response_model=DocumentResponse)
async def reindex_document(
    doc_id: uuid.UUID,
    user: Annotated[User, Depends(get_current_user)],
    db: Annotated[AsyncSession, Depends(get_db)],
):
    """Re-run background processing for an existing document.

    The document is looked up by ``doc_id`` and the caller's user id, then
    ``process_document`` is scheduled with its id, storage path and MIME type.

    Returns:
        ``DocumentResponse`` validated from the document as it was loaded,
        before the background task has run.
    """
    doc = await document_service.get_document(db, doc_id, user.id)

    # Hold the task in a module-level set until it completes so it cannot be
    # garbage-collected mid-run; the done callback releases the reference.
    task = asyncio.create_task(process_document(doc.id, doc.storage_path, doc.mime_type))
    _reindex_tasks.add(task)
    task.add_done_callback(_reindex_tasks.discard)

    return DocumentResponse.model_validate(doc)