Phase 4: Documents & Memory — upload, FTS, AI tools, context injection

Backend:
- Document + MemoryEntry models with Alembic migration (GIN FTS index)
- File upload endpoint with path traversal protection (sanitized filenames)
- Background document text extraction (PyMuPDF)
- Full-text search on extracted_text via PostgreSQL tsvector/tsquery
- Memory CRUD with enum-validated categories/importance, field allow-list
- AI tools: save_memory, search_documents, get_memory (Claude function calling)
- Tool execution loop in stream_ai_response (multi-turn tool use)
- Context assembly: injects critical memory + relevant doc excerpts
- File storage abstraction (local filesystem, S3-swappable)
- Secure file deletion (DB flush before disk delete)

Frontend:
- Document upload dialog (drag-and-drop + file picker)
- Document list with status badges, search, download (via authenticated blob)
- Document viewer with extracted text preview
- Memory list grouped by category with importance color coding
- Memory editor with category/importance dropdowns
- Documents + Memory pages with full CRUD
- Enabled sidebar navigation for both sections

Review fixes applied:
- Sanitized upload filenames (path traversal prevention)
- Download via axios blob (not bare <a href>, preserves auth)
- Route ordering: /search before /{id}/reindex
- Memory update allows is_active=False + field allow-list
- MemoryEditor form resets on mode switch
- Literal enum validation on category/importance schemas
- DB flush before file deletion for data integrity

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-19 13:46:59 +03:00
parent 03afb7a075
commit 8b8fe916f0
37 changed files with 1921 additions and 26 deletions
--- a/backend/app/api/v1/documents.py
+++ b/backend/app/api/v1/documents.py
@@ -0,0 +1,126 @@
+import asyncio
+import uuid
+from pathlib import PurePosixPath
+from typing import Annotated
+
+from fastapi import APIRouter, Depends, Query, UploadFile, File, HTTPException, status
+from fastapi.responses import FileResponse
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.api.deps import get_current_user
+from app.config import settings
+from app.database import get_db
+from app.models.user import User
+from app.schemas.document import DocumentListResponse, DocumentResponse, DocumentSearchRequest
+from app.services import document_service
+from app.utils.file_storage import save_upload, get_file_path
+from app.workers.document_processor import process_document
+
+# Router for the document endpoints in this module; route paths are relative to /documents.
+router = APIRouter(prefix="/documents", tags=["documents"])
+
+# Content types the upload endpoint accepts; any other type is rejected with HTTP 400.
+ALLOWED_MIME_TYPES = [
+    "application/pdf",
+    "image/jpeg",
+    "image/png",
+    "image/tiff",
+    "image/webp",
+]
+
+
+@router.post("/", response_model=DocumentResponse, status_code=status.HTTP_201_CREATED)
+async def upload_document(
+    user: Annotated[User, Depends(get_current_user)],
+    db: Annotated[AsyncSession, Depends(get_db)],
+    file: UploadFile = File(...),
+    doc_type: str = Query(default="other"),
+):
+    if file.content_type not in ALLOWED_MIME_TYPES:
+        raise HTTPException(status_code=400, detail=f"Unsupported file type: {file.content_type}")
+
+    content = await file.read()
+    if len(content) > settings.MAX_UPLOAD_SIZE_MB * 1024 * 1024:
+        raise HTTPException(status_code=400, detail=f"File too large. Max {settings.MAX_UPLOAD_SIZE_MB}MB")
+
+    doc_id = uuid.uuid4()
+    safe_name = PurePosixPath(file.filename or "upload").name
+    filename = f"{doc_id}_{safe_name}"
+    storage_path = await save_upload(user.id, doc_id, filename, content)
+
+    doc = await document_service.create_document(
+        db, user.id, filename, safe_name,
+        storage_path, file.content_type or "application/octet-stream",
+        len(content), doc_type,
+    )
+
+    # Trigger background processing
+    asyncio.create_task(process_document(doc.id, storage_path, file.content_type or ""))
+
+    return DocumentResponse.model_validate(doc)
+
+
+@router.get("/", response_model=DocumentListResponse)
+async def list_documents(
+    user: Annotated[User, Depends(get_current_user)],
+    db: Annotated[AsyncSession, Depends(get_db)],
+    doc_type: str | None = Query(default=None),
+    processing_status: str | None = Query(default=None),
+):
+    docs = await document_service.get_user_documents(db, user.id, doc_type, processing_status)
+    return DocumentListResponse(documents=[DocumentResponse.model_validate(d) for d in docs])
+
+
+@router.get("/{doc_id}", response_model=DocumentResponse)
+async def get_document(
+    doc_id: uuid.UUID,
+    user: Annotated[User, Depends(get_current_user)],
+    db: Annotated[AsyncSession, Depends(get_db)],
+):
+    doc = await document_service.get_document(db, doc_id, user.id)
+    return DocumentResponse.model_validate(doc)
+
+
+@router.get("/{doc_id}/download")
+async def download_document(
+    doc_id: uuid.UUID,
+    user: Annotated[User, Depends(get_current_user)],
+    db: Annotated[AsyncSession, Depends(get_db)],
+):
+    doc = await document_service.get_document(db, doc_id, user.id)
+    file_path = get_file_path(doc.storage_path)
+    if not file_path.exists():
+        raise HTTPException(status_code=404, detail="File not found on disk")
+    return FileResponse(
+        path=str(file_path),
+        filename=doc.original_filename,
+        media_type=doc.mime_type,
+    )
+
+
+@router.delete("/{doc_id}", status_code=status.HTTP_204_NO_CONTENT)
+async def delete_document(
+    doc_id: uuid.UUID,
+    user: Annotated[User, Depends(get_current_user)],
+    db: Annotated[AsyncSession, Depends(get_db)],
+):
+    await document_service.delete_document(db, doc_id, user.id)
+
+
+@router.post("/search", response_model=DocumentListResponse)
+async def search_documents(
+    data: DocumentSearchRequest,
+    user: Annotated[User, Depends(get_current_user)],
+    db: Annotated[AsyncSession, Depends(get_db)],
+):
+    docs = await document_service.search_documents(db, user.id, data.query)
+    return DocumentListResponse(documents=[DocumentResponse.model_validate(d) for d in docs])
+
+
+@router.post("/{doc_id}/reindex", response_model=DocumentResponse)
+async def reindex_document(
+    doc_id: uuid.UUID,
+    user: Annotated[User, Depends(get_current_user)],
+    db: Annotated[AsyncSession, Depends(get_db)],
+):
+    doc = await document_service.get_document(db, doc_id, user.id)
+    asyncio.create_task(process_document(doc.id, doc.storage_path, doc.mime_type))
+    return DocumentResponse.model_validate(doc)