from pathlib import Path


def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The per-page text joined with ``"\\n"``, with leading and trailing
        whitespace stripped from the combined result.

    Raises:
        ImportError: If PyMuPDF (``fitz``) is not installed.
        Exception: Whatever ``fitz.open`` raises for a missing or unreadable
            file is propagated unchanged.
    """
    # Function-scope import: PyMuPDF is only loaded when this function runs.
    import fitz  # PyMuPDF

    with fitz.open(file_path) as doc:
        page_texts = [page.get_text() for page in doc]
    return "\n".join(page_texts).strip()


def extract_text(file_path: str, mime_type: str) -> str:
    """Extract text from a file according to its MIME type.

    Only ``application/pdf`` is handled; every other type yields an empty
    string. The MIME type is matched case-insensitively and any parameters
    after ``;`` (e.g. ``"application/pdf; charset=binary"``) are ignored.

    Args:
        file_path: Path of the file to read.
        mime_type: Media type of the file. A non-string value (such as
            ``None``) is treated as an unsupported type.

    Returns:
        The extracted text for PDFs, otherwise ``""``.
    """
    if not isinstance(mime_type, str):
        return ""

    # Media type names are case-insensitive and may carry parameters, so
    # compare only the normalized "type/subtype" part.
    media_type = mime_type.partition(";")[0].strip().lower()
    if media_type == "application/pdf":
        return extract_text_from_pdf(file_path)

    # For images, we'd use pytesseract but skip for now as it requires system deps
    # For other types, return empty
    return ""