""" Comprehensive storage management with chunked uploads, multiple backends, and security. Supports local storage, S3-compatible storage, and async operations with Redis caching. """ import asyncio import hashlib import mimetypes import os from datetime import datetime, timedelta from pathlib import Path from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple from uuid import UUID, uuid4 import aiofiles import aiofiles.os from sqlalchemy import select, update from sqlalchemy.orm import selectinload from app.core.config import get_settings from app.core.database import db_manager, get_cache_manager from app.core.logging import get_logger from app.core.models.content_models import Content, ContentChunk from app.core.security import encrypt_file, decrypt_file, generate_access_token logger = get_logger(__name__) settings = get_settings() class StorageBackend: """Abstract base class for storage backends.""" async def store_chunk(self, upload_id: UUID, chunk_index: int, data: bytes) -> str: """Store a file chunk and return its identifier.""" raise NotImplementedError async def retrieve_chunk(self, chunk_id: str) -> bytes: """Retrieve a file chunk by its identifier.""" raise NotImplementedError async def delete_chunk(self, chunk_id: str) -> bool: """Delete a file chunk.""" raise NotImplementedError async def assemble_file(self, upload_id: UUID, chunks: List[str]) -> str: """Assemble chunks into final file and return file path.""" raise NotImplementedError async def delete_file(self, file_path: str) -> bool: """Delete a complete file.""" raise NotImplementedError async def get_file_stream(self, file_path: str) -> AsyncGenerator[bytes, None]: """Get async file stream for download.""" raise NotImplementedError class LocalStorageBackend(StorageBackend): """Local filesystem storage backend with encryption support.""" def __init__(self): self.base_path = Path(settings.STORAGE_PATH) self.chunks_path = self.base_path / "chunks" self.files_path = self.base_path / "files" # Create directories if they don't exist self.chunks_path.mkdir(parents=True, exist_ok=True) self.files_path.mkdir(parents=True, exist_ok=True) async def store_chunk(self, upload_id: UUID, chunk_index: int, data: bytes) -> str: """Store chunk to local filesystem with optional encryption.""" try: chunk_id = f"{upload_id}_{chunk_index:06d}" chunk_path = self.chunks_path / f"{chunk_id}.chunk" # Encrypt chunk if encryption is enabled if settings.ENCRYPT_FILES: data = encrypt_file(data, str(upload_id)) async with aiofiles.open(chunk_path, 'wb') as f: await f.write(data) await logger.adebug( "Chunk stored successfully", upload_id=str(upload_id), chunk_index=chunk_index, chunk_size=len(data) ) return chunk_id except Exception as e: await logger.aerror( "Failed to store chunk", upload_id=str(upload_id), chunk_index=chunk_index, error=str(e) ) raise async def retrieve_chunk(self, chunk_id: str) -> bytes: """Retrieve and optionally decrypt chunk from local filesystem.""" try: chunk_path = self.chunks_path / f"{chunk_id}.chunk" if not chunk_path.exists(): raise FileNotFoundError(f"Chunk {chunk_id} not found") async with aiofiles.open(chunk_path, 'rb') as f: data = await f.read() # Decrypt chunk if encryption is enabled if settings.ENCRYPT_FILES: upload_id = chunk_id.split('_')[0] data = decrypt_file(data, upload_id) return data except Exception as e: await logger.aerror("Failed to retrieve chunk", chunk_id=chunk_id, error=str(e)) raise async def delete_chunk(self, chunk_id: str) -> bool: """Delete chunk file from local filesystem.""" 
        try:
            chunk_path = self.chunks_path / f"{chunk_id}.chunk"

            if chunk_path.exists():
                await aiofiles.os.remove(chunk_path)
                return True

            return False

        except Exception as e:
            await logger.aerror("Failed to delete chunk", chunk_id=chunk_id, error=str(e))
            return False

    async def assemble_file(self, upload_id: UUID, chunks: List[str]) -> str:
        """Assemble chunks into the final file."""
        try:
            file_id = str(uuid4())
            file_path = self.files_path / file_id

            async with aiofiles.open(file_path, 'wb') as output_file:
                for chunk_id in chunks:
                    chunk_data = await self.retrieve_chunk(chunk_id)
                    await output_file.write(chunk_data)

            # Clean up chunks after assembly
            for chunk_id in chunks:
                await self.delete_chunk(chunk_id)

            await logger.ainfo(
                "File assembled successfully",
                upload_id=str(upload_id),
                file_path=str(file_path),
                chunks_count=len(chunks)
            )

            return str(file_path)

        except Exception as e:
            await logger.aerror(
                "Failed to assemble file",
                upload_id=str(upload_id),
                error=str(e)
            )
            raise

    async def delete_file(self, file_path: str) -> bool:
        """Delete file from local filesystem."""
        try:
            path = Path(file_path)

            if path.exists() and path.is_file():
                await aiofiles.os.remove(path)
                return True

            return False

        except Exception as e:
            await logger.aerror("Failed to delete file", file_path=file_path, error=str(e))
            return False

    async def get_file_stream(self, file_path: str) -> AsyncGenerator[bytes, None]:
        """Stream file content for download."""
        try:
            path = Path(file_path)

            if not path.exists():
                raise FileNotFoundError(f"File {file_path} not found")

            async with aiofiles.open(path, 'rb') as f:
                while True:
                    chunk = await f.read(65536)  # 64KB chunks
                    if not chunk:
                        break
                    yield chunk

        except Exception as e:
            await logger.aerror("Failed to stream file", file_path=file_path, error=str(e))
            raise
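

# --- Optional S3-compatible backend -----------------------------------------
# The module docstring advertises S3-compatible storage, and StorageManager
# notes its backend "can be extended to support S3", but only the local backend
# is implemented above. The class below is a minimal illustrative sketch of how
# the chunk operations of the StorageBackend interface could map onto an
# S3-compatible object store. It assumes boto3 is available and that the
# hypothetical settings S3_BUCKET and S3_PREFIX exist; blocking boto3 calls are
# pushed onto the default thread-pool executor to preserve the async interface.
# Encryption, file assembly, and streaming downloads are omitted for brevity,
# and the class is not wired into StorageManager.
class S3StorageBackend(StorageBackend):
    """Illustrative S3-compatible chunk backend (sketch, not used by default)."""

    def __init__(self):
        import boto3  # assumed optional dependency; imported lazily

        self.client = boto3.client("s3")
        self.bucket = settings.S3_BUCKET    # assumed setting name
        self.prefix = settings.S3_PREFIX    # assumed setting name, e.g. "chunks/"

    async def _run(self, fn, *args, **kwargs):
        # boto3 is blocking, so run each call on the default executor.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, lambda: fn(*args, **kwargs))

    async def store_chunk(self, upload_id: UUID, chunk_index: int, data: bytes) -> str:
        chunk_id = f"{upload_id}_{chunk_index:06d}"
        await self._run(
            self.client.put_object,
            Bucket=self.bucket,
            Key=f"{self.prefix}{chunk_id}",
            Body=data,
        )
        return chunk_id

    async def retrieve_chunk(self, chunk_id: str) -> bytes:
        response = await self._run(
            self.client.get_object,
            Bucket=self.bucket,
            Key=f"{self.prefix}{chunk_id}",
        )
        return response["Body"].read()

    async def delete_chunk(self, chunk_id: str) -> bool:
        await self._run(
            self.client.delete_object,
            Bucket=self.bucket,
            Key=f"{self.prefix}{chunk_id}",
        )
        return True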


class StorageManager:
    """Main storage manager with upload session management and caching."""

    def __init__(self):
        self.backend = LocalStorageBackend()  # Can be extended to support S3, etc.
        self.cache_manager = get_cache_manager()

    async def create_upload_session(self, content_id: UUID, total_size: int) -> Dict[str, Any]:
        """Create a new upload session with chunked upload support."""
        try:
            upload_id = uuid4()
            session_data = {
                "upload_id": str(upload_id),
                "content_id": str(content_id),
                "total_size": total_size,
                "chunk_size": settings.CHUNK_SIZE,
                "total_chunks": (total_size + settings.CHUNK_SIZE - 1) // settings.CHUNK_SIZE,
                "uploaded_chunks": [],
                "created_at": datetime.utcnow().isoformat(),
                "expires_at": (datetime.utcnow() + timedelta(hours=24)).isoformat(),
                "status": "active"
            }

            # Store session in cache
            session_key = f"upload_session:{upload_id}"
            await self.cache_manager.set(session_key, session_data, ttl=86400)  # 24 hours

            # Store in database for persistence
            async with db_manager.get_session() as session:
                upload_session = ContentUploadSession(
                    id=upload_id,
                    content_id=str(content_id),
                    total_size=total_size,
                    chunk_size=settings.CHUNK_SIZE,
                    total_chunks=session_data["total_chunks"],
                    expires_at=datetime.fromisoformat(session_data["expires_at"])
                )
                session.add(upload_session)
                await session.commit()

            await logger.ainfo(
                "Upload session created",
                upload_id=str(upload_id),
                content_id=str(content_id),
                total_size=total_size
            )

            return {
                "upload_id": str(upload_id),
                "chunk_size": settings.CHUNK_SIZE,
                "total_chunks": session_data["total_chunks"],
                "upload_url": f"/api/v1/storage/upload/{upload_id}",
                "expires_at": session_data["expires_at"]
            }

        except Exception as e:
            await logger.aerror(
                "Failed to create upload session",
                content_id=str(content_id),
                error=str(e)
            )
            raise

    async def upload_chunk(
        self,
        upload_id: UUID,
        chunk_index: int,
        chunk_data: bytes,
        chunk_hash: str
    ) -> Dict[str, Any]:
        """Upload and validate a file chunk."""
        try:
            # Verify chunk hash
            calculated_hash = hashlib.sha256(chunk_data).hexdigest()
            if calculated_hash != chunk_hash:
                raise ValueError("Chunk hash mismatch")

            # Get upload session
            session_data = await self._get_upload_session(upload_id)
            if not session_data:
                raise ValueError("Upload session not found or expired")

            # Check if chunk already uploaded
            if chunk_index in session_data.get("uploaded_chunks", []):
                return {"status": "already_uploaded", "chunk_index": chunk_index}

            # Store chunk
            chunk_id = await self.backend.store_chunk(upload_id, chunk_index, chunk_data)

            # Update session data
            session_data["uploaded_chunks"].append(chunk_index)
            session_data["uploaded_chunks"].sort()

            session_key = f"upload_session:{upload_id}"
            await self.cache_manager.set(session_key, session_data, ttl=86400)

            # Store chunk info in database
            async with db_manager.get_session() as session:
                chunk_record = ContentChunk(
                    upload_id=upload_id,
                    chunk_index=chunk_index,
                    chunk_id=chunk_id,
                    chunk_hash=chunk_hash,
                    chunk_size=len(chunk_data)
                )
                session.add(chunk_record)
                await session.commit()

            await logger.adebug(
                "Chunk uploaded successfully",
                upload_id=str(upload_id),
                chunk_index=chunk_index,
                chunk_size=len(chunk_data)
            )

            return {
                "status": "uploaded",
                "chunk_index": chunk_index,
                "uploaded_chunks": len(session_data["uploaded_chunks"]),
                "total_chunks": session_data["total_chunks"]
            }

        except Exception as e:
            await logger.aerror(
                "Failed to upload chunk",
                upload_id=str(upload_id),
                chunk_index=chunk_index,
                error=str(e)
            )
            raise

    async def finalize_upload(self, upload_id: UUID) -> Dict[str, Any]:
        """Finalize an upload by assembling its chunks into the final file."""
        try:
            # Get upload session
            session_data = await self._get_upload_session(upload_id)
            if not session_data:
                raise ValueError("Upload session not found")

            # Verify all chunks are uploaded
            uploaded_chunks = session_data.get("uploaded_chunks", [])
            total_chunks = session_data["total_chunks"]

            if len(uploaded_chunks) != total_chunks:
                missing_chunks = set(range(total_chunks)) - set(uploaded_chunks)
                raise ValueError(f"Missing chunks: {missing_chunks}")

            # Get chunk IDs in order
            async with db_manager.get_session() as session:
                stmt = (
                    select(ContentChunk)
                    .where(ContentChunk.upload_id == upload_id)
                    .order_by(ContentChunk.chunk_index)
                )
                result = await session.execute(stmt)
                chunks = result.scalars().all()
                chunk_ids = [chunk.chunk_id for chunk in chunks]

            # Assemble file
            file_path = await self.backend.assemble_file(upload_id, chunk_ids)

            # Update content record
            async with db_manager.get_session() as session:
                stmt = (
                    update(Content)
                    .where(Content.id == UUID(session_data["content_id"]))
                    .values(
                        file_path=file_path,
                        status="completed",
                        updated_at=datetime.utcnow()
                    )
                )
                await session.execute(stmt)
                await session.commit()

            # Clean up session
            session_key = f"upload_session:{upload_id}"
            await self.cache_manager.delete(session_key)

            await logger.ainfo(
                "Upload finalized successfully",
                upload_id=str(upload_id),
                file_path=file_path,
                total_chunks=total_chunks
            )

            return {
                "status": "completed",
                "file_path": file_path,
                "content_id": session_data["content_id"]
            }

        except Exception as e:
            await logger.aerror(
                "Failed to finalize upload",
                upload_id=str(upload_id),
                error=str(e)
            )
            raise

    async def get_file_stream(self, file_path: str) -> AsyncGenerator[bytes, None]:
        """Get a file stream for download."""
        try:
            async for chunk in self.backend.get_file_stream(file_path):
                yield chunk

        except Exception as e:
            await logger.aerror("Failed to get file stream", file_path=file_path, error=str(e))
            raise

    async def delete_content_files(self, content_id: UUID) -> bool:
        """Delete all files associated with content."""
        try:
            async with db_manager.get_session() as session:
                # Get content
                stmt = select(Content).where(Content.id == content_id)
                result = await session.execute(stmt)
                content = result.scalar_one_or_none()

                if not content or not content.file_path:
                    return True

                # Delete main file
                await self.backend.delete_file(content.file_path)

                # Delete any remaining chunks. Chunks are keyed by upload session id,
                # so resolve the content's upload sessions first.
                session_stmt = select(ContentUploadSession.id).where(
                    ContentUploadSession.content_id == str(content_id)
                )
                session_result = await session.execute(session_stmt)
                upload_ids = list(session_result.scalars().all())

                if upload_ids:
                    chunk_stmt = select(ContentChunk).where(
                        ContentChunk.upload_id.in_(upload_ids)
                    )
                    chunk_result = await session.execute(chunk_stmt)
                    chunks = chunk_result.scalars().all()

                    for chunk in chunks:
                        await self.backend.delete_chunk(chunk.chunk_id)

                # Update content record
                update_stmt = (
                    update(Content)
                    .where(Content.id == content_id)
                    .values(file_path=None, status="deleted")
                )
                await session.execute(update_stmt)
                await session.commit()

            await logger.ainfo(
                "Content files deleted",
                content_id=str(content_id)
            )

            return True

        except Exception as e:
            await logger.aerror(
                "Failed to delete content files",
                content_id=str(content_id),
                error=str(e)
            )
            return False

    async def get_storage_stats(self) -> Dict[str, Any]:
        """Get storage usage statistics."""
        try:
            async with db_manager.get_session() as session:
                # Get total files and size
                stmt = select(
                    func.count(Content.id).label('total_files'),
                    func.sum(Content.file_size).label('total_size')
                ).where(Content.status == 'completed')

                result = await session.execute(stmt)
                stats = result.first()

                # Get storage by type
                type_stmt = select(
                    Content.content_type,
                    func.count(Content.id).label('count'),
                    func.sum(Content.file_size).label('size')
                ).where(Content.status == 'completed').group_by(Content.content_type)

                type_result = await session.execute(type_stmt)

                type_stats = {
                    row.content_type: {
                        'count': row.count,
                        'size': row.size or 0
                    }
                    for row in type_result
                }

                return {
                    'total_files': stats.total_files or 0,
                    'total_size': stats.total_size or 0,
                    'by_type': type_stats,
                    'updated_at': datetime.utcnow().isoformat()
                }

        except Exception as e:
            await logger.aerror("Failed to get storage stats", error=str(e))
            return {}

    async def _get_upload_session(self, upload_id: UUID) -> Optional[Dict[str, Any]]:
        """Get upload session from cache or database."""
        # Try cache first
        session_key = f"upload_session:{upload_id}"
        session_data = await self.cache_manager.get(session_key)

        if session_data:
            # Check if session is expired
            expires_at = datetime.fromisoformat(session_data["expires_at"])
            if expires_at > datetime.utcnow():
                return session_data

        # Fall back to database
        try:
            async with db_manager.get_session() as session:
                stmt = (
                    select(ContentUploadSession)
                    .where(ContentUploadSession.id == upload_id)
                )
                result = await session.execute(stmt)
                upload_session = result.scalar_one_or_none()

                if upload_session and upload_session.expires_at > datetime.utcnow():
                    # Rebuild session data
                    chunk_stmt = select(ContentChunk).where(
                        ContentChunk.upload_id == upload_id
                    )
                    chunk_result = await session.execute(chunk_stmt)
                    chunks = chunk_result.scalars().all()

                    session_data = {
                        "upload_id": str(upload_session.id),
                        "content_id": str(upload_session.content_id),
                        "total_size": upload_session.total_size,
                        "chunk_size": upload_session.chunk_size,
                        "total_chunks": upload_session.total_chunks,
                        "uploaded_chunks": [chunk.chunk_index for chunk in chunks],
                        "created_at": upload_session.created_at.isoformat(),
                        "expires_at": upload_session.expires_at.isoformat(),
                        "status": "active"
                    }

                    # Update cache
                    await self.cache_manager.set(session_key, session_data, ttl=86400)

                    return session_data

        except Exception as e:
            await logger.aerror(
                "Failed to get upload session from database",
                upload_id=str(upload_id),
                error=str(e)
            )

        return None


# Additional model for upload sessions
from app.core.models.base import BaseModel
from sqlalchemy import BigInteger, Column, DateTime, Integer, String


class ContentUploadSession(BaseModel):
    """Model for tracking upload sessions."""

    __tablename__ = "content_upload_sessions"

    content_id = Column("content_id", String(36), nullable=False)
    total_size = Column(BigInteger, nullable=False)  # BigInteger: uploads may exceed 2 GB
    chunk_size = Column(Integer, nullable=False, default=1048576)  # 1MB
    total_chunks = Column(Integer, nullable=False)
    expires_at = Column(DateTime, nullable=False)
    completed_at = Column(DateTime, nullable=True)
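

# --- Usage sketch ------------------------------------------------------------
# Minimal illustration of the chunked-upload flow exposed by StorageManager
# (create_upload_session -> upload_chunk -> finalize_upload). It assumes the
# application's settings, database, and cache are already initialised and that
# a Content row with the given content_id exists; upload_example is an
# illustrative helper, not part of the original API.
async def upload_example(data: bytes, content_id: UUID) -> str:
    manager = StorageManager()

    # 1. Create an upload session sized for the payload.
    session_info = await manager.create_upload_session(content_id, total_size=len(data))
    upload_id = UUID(session_info["upload_id"])
    chunk_size = session_info["chunk_size"]

    # 2. Send each chunk with its SHA-256 hash for server-side verification.
    for index in range(session_info["total_chunks"]):
        chunk = data[index * chunk_size:(index + 1) * chunk_size]
        await manager.upload_chunk(upload_id, index, chunk, hashlib.sha256(chunk).hexdigest())

    # 3. Assemble the chunks into the final file and update the Content row.
    result = await manager.finalize_upload(upload_id)
    return result["file_path"]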