uploader-bot/app/core/background/convert_v3_service.py

525 lines
22 KiB
Python

import asyncio
import os
import json
import shutil
import tempfile
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Tuple
from sqlalchemy import select, and_, or_
from app.core.logger import make_log
from app.core.storage import db_session
from app.core._config import UPLOADS_DIR, BACKEND_LOGS_DIR_HOST
from app.core.models.content_v3 import (
EncryptedContent,
ContentKey,
ContentDerivative,
UploadSession,
)
from app.core.models.node_storage import StoredContent
from app.core.ipfs_client import cat_stream
from app.core.crypto.encf_stream import decrypt_encf_auto
from app.core.crypto.keywrap import unwrap_dek, wrap_dek, KeyWrapError
from app.core.network.key_client import request_key_from_peer
from app.core.models.my_network import KnownNode
from app.core._utils.resolve_content import resolve_content
from app.core.content.content_id import ContentId
# Maximum number of conversion jobs processed concurrently by the worker loop.
CONCURRENCY = int(os.getenv("CONVERT_V3_MAX_CONCURRENCY", "3"))
# Subdirectory under UPLOADS_DIR where decrypted plaintext is staged temporarily.
STAGING_SUBDIR = os.getenv("CONVERT_V3_STAGING_SUBDIR", "convert-staging")
UPLOADS_PATH = Path(UPLOADS_DIR).resolve()
# Host-side path corresponding to UPLOADS_PATH; required to build docker -v
# bind mounts (None when BACKEND_DATA_DIR_HOST is not configured).
_host_uploads_env = os.getenv("BACKEND_DATA_DIR_HOST")
HOST_UPLOADS_PATH = Path(_host_uploads_env).resolve() if _host_uploads_env else None
@dataclass
class PlainStaging:
    """Location of a decrypted (plaintext) staging file.

    Paths are kept as plain strings because they are interpolated directly
    into docker ``-v`` mount arguments.
    """
    # Path of the staged file as seen inside this backend container.
    container_path: str
    # Path of the same file as seen on the docker host (for bind mounts).
    host_path: str
def _container_to_host(path: str) -> str:
    """Map a container path under UPLOADS_DIR to the host path for docker -v.

    Args:
        path: Filesystem path as seen inside this container.

    Returns:
        The host-side equivalent when *path* lies under UPLOADS_PATH,
        otherwise the resolved path itself as a best-effort fallback.

    Raises:
        RuntimeError: If BACKEND_DATA_DIR_HOST is not configured.
    """
    if not HOST_UPLOADS_PATH:
        raise RuntimeError("BACKEND_DATA_DIR_HOST is not configured for convert_v3")
    real_path = Path(path).resolve()
    try:
        # Compute the relative part once; ValueError means the path lies
        # outside the uploads tree.
        rel = real_path.relative_to(UPLOADS_PATH)
    except ValueError:
        # Not under uploads; best effort fallback to original string
        return str(real_path)
    return str(HOST_UPLOADS_PATH / rel)
# Optional docker resource limits applied to the media_converter container.
MEDIA_CONVERTER_CPU_LIMIT = os.getenv("MEDIA_CONVERTER_CPU_LIMIT")
MEDIA_CONVERTER_MEM_LIMIT = os.getenv("MEDIA_CONVERTER_MEM_LIMIT")
# CPU pinning: MEDIA_CONVERTER_CPUSET takes precedence, CONVERT_CPUSET is the fallback name.
MEDIA_CONVERTER_CPUSET = os.getenv("MEDIA_CONVERTER_CPUSET") or os.getenv("CONVERT_CPUSET")
# Maximum length of error strings persisted to the DB (see _short_error).
ERROR_TRUNCATE_LIMIT = 512
def _ensure_dir(path: str):
try:
os.makedirs(path, exist_ok=True)
except Exception:
pass
async def _sha256_b58(file_path: str) -> str:
    """Return the base58-encoded SHA-256 digest of the file at *file_path*.

    NOTE: reads the file with blocking I/O despite being a coroutine; callers
    simply await it on the event loop.
    """
    import hashlib
    import base58
    digest = hashlib.sha256()
    with open(file_path, 'rb') as fh:
        # Hash in 2 MiB chunks to bound memory usage on large files.
        while True:
            chunk = fh.read(2 * 1024 * 1024)
            if not chunk:
                break
            digest.update(chunk)
    return base58.b58encode(digest.digest()).decode()
async def _save_derivative(file_path: str, filename: str) -> Tuple[str, int]:
    """Move file into UPLOADS_DIR under sha256 b58 name; return (hash_b58, size).

    NOTE: *filename* is accepted for interface compatibility but is not used;
    the destination name is always the content hash.
    """
    file_hash = await _sha256_b58(file_path)
    destination = os.path.join(UPLOADS_DIR, file_hash)
    try:
        # Drop any previous copy stored under the same content hash.
        os.remove(destination)
    except FileNotFoundError:
        pass
    shutil.move(file_path, destination)
    return file_hash, os.path.getsize(destination)
async def _run_media_converter(staging: PlainStaging, input_ext: str, quality: str, trim_value: Optional[str], is_audio: bool):
    """Run the dockerized media_converter over a staged plaintext file.

    Args:
        staging: Container/host paths of the decrypted input file.
        input_ext: Input file extension passed to the converter (--ext).
        quality: Target quality profile (--quality).
        trim_value: Optional "start-end" seconds range (--trim) for previews.
        is_audio: Accepted for interface compatibility; currently unused —
            the converter is invoked identically for audio and video.

    Returns:
        Tuple of (path to the single produced media file inside this
        container, ffprobe metadata dict from output.json or {}).

    Raises:
        FileNotFoundError: If the staged input is missing.
        RuntimeError: If the converter fails or its output is malformed.
    """
    import uuid

    if not os.path.exists(staging.container_path):
        raise FileNotFoundError(f"Plain input missing at {staging.container_path}")

    host_input_path = staging.host_path
    if not host_input_path or not host_input_path.startswith('/'):
        # docker -v requires an absolute host path.
        host_input_path = os.path.abspath(host_input_path)

    # Unique per-run output directory, visible both in-container and on host.
    rid = uuid.uuid4().hex[:8]
    output_dir_container = UPLOADS_PATH / "convert-output" / f"conv_{rid}"
    output_dir_host = _container_to_host(output_dir_container)
    _ensure_dir(str(output_dir_container))

    logs_dir_host = _resolve_converter_logs_dir()

    cmd = [
        "docker", "run", "--rm",
        "-v", f"{host_input_path}:/app/input:ro",
        "-v", f"{output_dir_host}:/app/output",
        "-v", f"{logs_dir_host}:/app/logs",
    ]
    # Optional resource limits for the converter container.
    if MEDIA_CONVERTER_CPU_LIMIT:
        cmd.extend(["--cpus", str(MEDIA_CONVERTER_CPU_LIMIT)])
    if MEDIA_CONVERTER_MEM_LIMIT:
        cmd.extend(["--memory", str(MEDIA_CONVERTER_MEM_LIMIT)])
    if MEDIA_CONVERTER_CPUSET:
        cmd.extend(["--cpuset-cpus", MEDIA_CONVERTER_CPUSET])
    cmd.append("media_converter")
    cmd.extend(["--ext", input_ext, "--quality", quality])
    if trim_value:
        cmd.extend(["--trim", trim_value])

    make_log('convert_v3', f"Run media_converter cmd: {' '.join(cmd)}")
    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(f"media_converter failed: {stderr.decode()}")

    # Find produced media file and optional output.json
    try:
        files = os.listdir(output_dir_container)
    except Exception as e:
        raise RuntimeError(f"Read output dir error: {e}")
    media_files = [f for f in files if f != "output.json"]
    if len(media_files) != 1:
        raise RuntimeError(f"Expected one media file, found {len(media_files)}: {media_files}")
    output_media = os.path.join(output_dir_container, media_files[0])

    ffprobe_meta = {}
    out_json = os.path.join(output_dir_container, "output.json")
    if os.path.exists(out_json):
        try:
            with open(out_json, 'r') as f:
                ffprobe_meta = json.load(f)
        except Exception:
            # Metadata is optional; ignore a malformed output.json.
            ffprobe_meta = {}
    return output_media, ffprobe_meta


def _resolve_converter_logs_dir() -> str:
    """Pick a host directory for converter logs, creating it when possible.

    Preference order: BACKEND_LOGS_DIR_HOST env var, then
    HOST_UPLOADS_PATH/logs/converter, then /tmp/converter-logs.
    """
    logs_dir_candidate = os.getenv("BACKEND_LOGS_DIR_HOST", "")
    if logs_dir_candidate:
        logs_dir_host = logs_dir_candidate
    elif HOST_UPLOADS_PATH:
        logs_dir_host = str(HOST_UPLOADS_PATH / "logs" / "converter")
    else:
        logs_dir_host = "/tmp/converter-logs"
    if not logs_dir_host.startswith('/'):
        logs_dir_host = os.path.join(os.getcwd(), logs_dir_host)
    try:
        os.makedirs(logs_dir_host, exist_ok=True)
    except Exception:
        # Fall back to a path we expect to be writable.
        fallback_logs = HOST_UPLOADS_PATH / "logs" / "converter" if HOST_UPLOADS_PATH else Path("/tmp/converter-logs")
        logs_dir_host = str(fallback_logs)
        os.makedirs(logs_dir_host, exist_ok=True)
    return logs_dir_host
async def _update_upload_session(ec: EncryptedContent, all_success: bool, errors: List[str]):
    """Reflect the conversion outcome on the matching UploadSession row, if any."""
    async with db_session() as session:
        result = await session.execute(
            select(UploadSession).where(UploadSession.encrypted_cid == ec.encrypted_cid)
        )
        upload_row = result.scalars().first()
        if upload_row:
            if all_success:
                upload_row.state = 'converted'
                upload_row.error = None
            elif upload_row.state != 'converted':
                # Never downgrade a session that already reached 'converted'.
                upload_row.state = 'conversion_failed'
                if errors:
                    upload_row.error = _short_error(errors[0])
        await session.commit()
async def _convert_content(ec: EncryptedContent, staging: PlainStaging):
    """Produce decrypted derivatives for one EncryptedContent.

    Non-media content gets a single 'decrypted_original' derivative; audio
    and video are converted to high/low/preview variants via the dockerized
    media_converter. The related UploadSession is updated at the end.
    """
    content_kind = 'audio' if ec.content_type.startswith('audio/') else ('video' if ec.content_type.startswith('video/') else 'other')
    input_ext = (ec.content_type.split('/')[-1] or 'bin')
    is_audio = content_kind == 'audio'
    encrypted_hash_b58 = ContentId.deserialize(ec.encrypted_cid).content_hash_b58
    if content_kind == 'other':
        errors: List[str] = []
        all_success = True
        try:
            # NOTE(review): the second argument is the staging path, not a
            # filename — _save_derivative ignores it, so this is harmless.
            file_hash, size_bytes = await _save_derivative(staging.container_path, staging.container_path)
            plain_path = os.path.join(UPLOADS_DIR, file_hash)
            plain_filename = f"{ec.encrypted_cid}.{input_ext}" if input_ext else ec.encrypted_cid
            async with db_session() as session:
                # Upsert the StoredContent row for the decrypted original.
                existing = (await session.execute(select(StoredContent).where(StoredContent.hash == file_hash))).scalars().first()
                if existing:
                    sc = existing
                    sc.type = sc.type or "local/content_bin"
                    sc.filename = plain_filename
                    sc.meta = {
                        **(sc.meta or {}),
                        'encrypted_cid': ec.encrypted_cid,
                        'kind': 'original',
                        'content_type': ec.content_type,
                    }
                    sc.updated = datetime.utcnow()
                else:
                    sc = StoredContent(
                        type="local/content_bin",
                        hash=file_hash,
                        user_id=None,
                        filename=plain_filename,
                        meta={
                            'encrypted_cid': ec.encrypted_cid,
                            'kind': 'original',
                            'content_type': ec.content_type,
                        },
                        created=datetime.utcnow(),
                    )
                    session.add(sc)
                await session.flush()
                # Link every encrypted StoredContent record to the decrypted copy.
                encrypted_records = (await session.execute(select(StoredContent).where(StoredContent.hash == encrypted_hash_b58))).scalars().all()
                for encrypted_sc in encrypted_records:
                    meta = dict(encrypted_sc.meta or {})
                    converted = dict(meta.get('converted_content') or {})
                    converted['original'] = file_hash
                    meta['converted_content'] = converted
                    if 'content_type' not in meta:
                        meta['content_type'] = ec.content_type
                    encrypted_sc.meta = meta
                    encrypted_sc.decrypted_content_id = sc.id
                    encrypted_sc.updated = datetime.utcnow()
                derivative = ContentDerivative(
                    content_id=ec.id,
                    kind='decrypted_original',
                    local_path=plain_path,
                    content_type=ec.content_type,
                    size_bytes=size_bytes,
                    status='ready',
                )
                session.add(derivative)
                await session.commit()
            make_log('convert_v3', f"Stored original derivative for {ec.encrypted_cid}")
        except Exception as e:
            all_success = False
            errors.append(str(e))
            make_log('convert_v3', f"Convert error {ec.encrypted_cid} opt=original: {e}", level='error')
        await _update_upload_session(ec, all_success, errors)
        return
    # audio/video path
    required = ['high', 'low', 'low_preview']
    conf = ec.preview_conf or {}
    # Default preview window: first 30s when no intervals are configured.
    intervals = conf.get('intervals') or [[0, int(conf.get('duration_ms', 30000))]]
    main_interval = intervals[0]
    start_s = max(0, int(main_interval[0]) // 1000)
    dur_s = max(1, int((main_interval[1] - main_interval[0]) // 1000) or 30)
    trim_value = f"{start_s}-{start_s + dur_s}"
    qualities = {
        'high': 'high',
        'low': 'low',
        'low_preview': 'low',
    }
    all_success = True
    errors: List[str] = []
    for opt in required:
        derivative_kind = f"decrypted_{opt if opt != 'low_preview' else 'preview'}"
        derivative_id: Optional[int] = None
        try:
            # Create a 'processing' derivative row first so progress is visible.
            async with db_session() as session:
                cd = ContentDerivative(
                    content_id=ec.id,
                    kind=derivative_kind,
                    interval_start_ms=main_interval[0] if opt == 'low_preview' else None,
                    interval_end_ms=main_interval[1] if opt == 'low_preview' else None,
                    local_path="",
                    status='processing',
                )
                session.add(cd)
                await session.flush()
                derivative_id = cd.id
                await session.commit()
            out_path, ffprobe = await _run_media_converter(
                staging=staging,
                input_ext=input_ext,
                quality=qualities[opt],
                trim_value=trim_value if opt == 'low_preview' else None,
                is_audio=is_audio,
            )
            file_hash, size_bytes = await _save_derivative(out_path, os.path.basename(out_path))
            async with db_session() as session:
                # Upsert StoredContent for the converted file.
                sc = (await session.execute(select(StoredContent).where(StoredContent.hash == file_hash))).scalars().first()
                meta_payload = {'encrypted_cid': ec.encrypted_cid, 'kind': opt, 'ffprobe_meta': ffprobe}
                if sc:
                    sc.type = sc.type or "local/content_bin"
                    sc.filename = os.path.basename(out_path)
                    sc.meta = meta_payload
                    sc.updated = datetime.utcnow()
                else:
                    sc = StoredContent(
                        type="local/content_bin",
                        hash=file_hash,
                        user_id=None,
                        filename=os.path.basename(out_path),
                        meta=meta_payload,
                        created=datetime.utcnow(),
                    )
                    session.add(sc)
                await session.flush()
                # Record the converted hash on the encrypted content record.
                encrypted_sc = (await session.execute(select(StoredContent).where(StoredContent.hash == encrypted_hash_b58))).scalars().first()
                if encrypted_sc:
                    meta = dict(encrypted_sc.meta or {})
                    converted = dict(meta.get('converted_content') or {})
                    converted[opt] = file_hash
                    meta['converted_content'] = converted
                    encrypted_sc.meta = meta
                    if opt == 'high':
                        # 'high' acts as the canonical decrypted representation.
                        encrypted_sc.decrypted_content_id = sc.id
                    encrypted_sc.updated = datetime.utcnow()
                cd = await session.get(ContentDerivative, derivative_id) if derivative_id else None
                if cd:
                    cd.local_path = os.path.join(UPLOADS_DIR, file_hash)
                    cd.size_bytes = size_bytes
                    if is_audio:
                        cd.content_type = 'audio/flac' if opt == 'high' else 'audio/mpeg'
                    else:
                        cd.content_type = ec.content_type if opt == 'high' else 'video/mp4'
                    cd.status = 'ready'
                    cd.error = None
                await session.commit()
            # Converter output dir is no longer needed once the file is moved.
            output_parent = Path(out_path).parent
            shutil.rmtree(output_parent, ignore_errors=True)
            make_log('convert_v3', f"Converted {ec.encrypted_cid} opt={opt} -> {file_hash}")
        except Exception as e:
            make_log('convert_v3', f"Convert error {ec.encrypted_cid} opt={opt}: {e}", level='error')
            all_success = False
            errors.append(_short_error(e))
            # Mark the derivative row failed (create one if the initial insert failed).
            async with db_session() as session:
                cd = await session.get(ContentDerivative, derivative_id) if derivative_id else None
                if cd:
                    cd.status = 'failed'
                    cd.error = _short_error(e)
                else:
                    session.add(ContentDerivative(
                        content_id=ec.id,
                        kind=derivative_kind,
                        status='failed',
                        error=_short_error(e),
                        local_path="",
                    ))
                await session.commit()
    await _update_upload_session(ec, all_success, errors)
async def _pick_pending(limit: int) -> List[Tuple[EncryptedContent, PlainStaging]]:
    """Pick up to *limit* EncryptedContent rows that still need derivatives.

    For each candidate the plaintext is staged from IPFS, decrypting with a
    locally stored key or, failing that, a DEK requested from known peer
    nodes. Returns (content, staging) pairs ready for _convert_content.
    """
    async with db_session() as session:
        # Include preview-enabled media and non-media content that need decrypted originals
        non_media_filter = and_(
            EncryptedContent.content_type.isnot(None),
            ~EncryptedContent.content_type.like('audio/%'),
            ~EncryptedContent.content_type.like('video/%'),
        )
        ecs = (await session.execute(
            select(EncryptedContent)
            .where(or_(EncryptedContent.preview_enabled == True, non_media_filter))
            .order_by(EncryptedContent.created_at.desc())
        )).scalars().all()
        picked: List[Tuple[EncryptedContent, PlainStaging]] = []
        for ec in ecs:
            try:
                cid_obj, cid_err = resolve_content(ec.encrypted_cid)
                if cid_err:
                    make_log('convert_v3', f"Skip {ec.encrypted_cid}: resolve error {cid_err}", level='debug')
                    continue
                encrypted_hash_b58 = cid_obj.content_hash_b58
            except Exception as exc:
                make_log('convert_v3', f"Skip {ec.encrypted_cid}: resolve exception {exc}", level='warning')
                continue
            # Only process content that is stored locally and indexed on-chain.
            sc = (await session.execute(select(StoredContent).where(StoredContent.hash == encrypted_hash_b58))).scalars().first()
            if not sc or sc.onchain_index is None:
                continue
            # Check if derivatives already ready
            rows = (await session.execute(select(ContentDerivative).where(ContentDerivative.content_id == ec.id))).scalars().all()
            kinds_ready = {r.kind for r in rows if r.status == 'ready'}
            if ec.content_type.startswith('audio/'):
                required = {'decrypted_low', 'decrypted_high'}
            elif ec.content_type.startswith('video/'):
                required = {'decrypted_low', 'decrypted_high', 'decrypted_preview'}
            else:
                required = {'decrypted_original'}
            if required.issubset(kinds_ready):
                continue
            # Always decrypt from IPFS using local or remote key
            staging: Optional[PlainStaging] = None
            ck = (await session.execute(select(ContentKey).where(ContentKey.content_id == ec.id))).scalars().first()
            if ck:
                staging = await stage_plain_from_ipfs(ec, ck.key_ciphertext_b64)
            if not staging:
                # No local key (or local decryption failed): ask known peers.
                peers = (await session.execute(select(KnownNode))).scalars().all()
                for peer in peers:
                    meta = peer.meta or {}
                    public_host = meta.get('public_host')
                    if not public_host:
                        # Fall back to the host the peer advertised in its last response.
                        last_resp = (meta.get('last_response') or {}).get('node', {}) if isinstance(meta, dict) else {}
                        public_host = last_resp.get('public_host')
                    base_url = public_host or f"http://{peer.ip}:{peer.port}"
                    dek = await request_key_from_peer(base_url, ec.encrypted_cid)
                    if not dek:
                        continue
                    try:
                        # Re-wrap the peer-provided DEK for local persistence.
                        dek_b64 = wrap_dek(dek)
                    except KeyWrapError as exc:
                        make_log('convert_v3', f"wrap failed for peer DEK: {exc}", level='error')
                        continue
                    session_ck = ContentKey(
                        content_id=ec.id,
                        key_ciphertext_b64=dek_b64,
                        key_fingerprint=peer.public_key,
                        issuer_node_id=peer.public_key,
                        allow_auto_grant=True,
                    )
                    session.add(session_ck)
                    await session.commit()
                    staging = await stage_plain_from_ipfs(ec, dek_b64)
                    if staging:
                        break
            if not staging or not os.path.exists(staging.container_path):
                continue
            picked.append((ec, staging))
            if len(picked) >= limit:
                break
        return picked
async def worker_loop():
    """Poll for pending conversions forever, running at most CONCURRENCY jobs at once."""
    semaphore = asyncio.Semaphore(CONCURRENCY)

    async def _process(ec: EncryptedContent, staging: PlainStaging):
        # Bound parallelism with the shared semaphore.
        async with semaphore:
            try:
                await _convert_content(ec, staging)
                # After successful conversion, attempt to remove staging file to avoid duplicates
                try:
                    if staging and staging.container_path and os.path.exists(staging.container_path):
                        os.remove(staging.container_path)
                except Exception:
                    pass
            except Exception as e:
                make_log('convert_v3', f"job error {ec.encrypted_cid}: {e}", level='error')

    while True:
        try:
            batch = await _pick_pending(limit=CONCURRENCY * 2)
            if not batch:
                # Nothing to do; back off briefly before polling again.
                await asyncio.sleep(3)
                continue
            await asyncio.gather(*(asyncio.create_task(_process(ec, staging)) for ec, staging in batch))
        except Exception as e:
            make_log('convert_v3', f"loop error: {e}", level='error')
            await asyncio.sleep(2)
async def main_fn(memory):
    """Service entry point; *memory* is passed by the runner but unused here."""
    make_log('convert_v3', f"Service started with concurrency={CONCURRENCY}", level='info')
    await worker_loop()
async def stage_plain_from_ipfs(ec: EncryptedContent, dek_wrapped: str) -> Optional[PlainStaging]:
    """Download encrypted ENCF stream from IPFS and decrypt on the fly into shared staging."""
    staging_dir = UPLOADS_PATH / STAGING_SUBDIR
    os.makedirs(staging_dir, exist_ok=True)
    try:
        dek = unwrap_dek(dek_wrapped)
    except KeyWrapError as exc:
        make_log('convert_v3', f"unwrap failed for {ec.encrypted_cid}: {exc}", level='error')
        return None
    # Reserve a unique staging file; keep it on disk after the handle closes.
    tmp = tempfile.NamedTemporaryFile(
        prefix=f"dec_{ec.encrypted_cid[:8]}_",
        dir=staging_dir,
        delete=False,
    )
    tmp_path = tmp.name
    tmp.close()
    try:
        async def _chunks():
            # Re-yield the IPFS stream so decryption consumes an async generator.
            async for piece in cat_stream(ec.encrypted_cid):
                yield piece
        await decrypt_encf_auto(_chunks(), dek, tmp_path)
        return PlainStaging(container_path=tmp_path, host_path=_container_to_host(tmp_path))
    except Exception as e:
        make_log('convert_v3', f"decrypt from ipfs failed: {e}", level='error')
        # Clean up the partial plaintext on failure.
        try:
            os.remove(tmp_path)
        except Exception:
            pass
        return None
def _short_error(message: str, limit: int = ERROR_TRUNCATE_LIMIT) -> str:
    """Coerce *message* to str and truncate it to *limit* chars with an ellipsis."""
    if not message:
        # Preserve falsy inputs (None, "") unchanged.
        return message
    text = str(message)
    if len(text) <= limit:
        return text
    return text[: limit - 3] + '...'