"""Index scout (v3).

Background service that periodically:

* applies pending ``content_indexed`` node events by pinning the referenced CID;
* polls every known node's content index (full or delta), mirrors it into
  ``RemoteContentIndex`` / ``EncryptedContent`` and pins the advertised CIDs.
"""
import asyncio
import os
import random
import shutil
import tempfile
from contextlib import suppress
from datetime import datetime
from typing import List, Optional
from urllib.parse import urlparse

import httpx
from sqlalchemy import select

from app.core.logger import make_log
from app.core.storage import db_session
from app.core.models.my_network import KnownNode, RemoteContentIndex
from app.core.models.events import NodeEvent
from app.core.models.content_v3 import EncryptedContent, ContentDerivative
from app.core.ipfs_client import pin_add, pin_ls, find_providers, swarm_connect, add_streamed_file
from app.core.events.service import LOCAL_PUBLIC_KEY


# Pause between two full scout passes.
INTERVAL_SEC = 60
# Environment fallbacks used when ServiceConfig has no value for the key.
ENV_PIN_CONCURRENCY = int(os.getenv('SYNC_MAX_CONCURRENT_PINS', '4'))
ENV_DISK_WATERMARK_PCT = int(os.getenv('SYNC_DISK_LOW_WATERMARK_PCT', '90'))


async def fetch_index(base_url: str, etag: Optional[str], since: Optional[str]) -> tuple[List[dict], Optional[str]]:
    """Fetch a peer's content index.

    Requests ``/api/v1/content.delta`` (with a ``since`` query parameter) when
    *since* is set, otherwise ``/api/v1/content.index``. *etag*, when set, is
    sent as ``If-None-Match``.

    Returns:
        ``(items, marker)``. On any non-200 response (including 304) or on any
        error the result is ``([], etag)``, i.e. "nothing new". On success the
        marker is the response's ``next_since``, else its ``ETag`` header,
        else the *etag* that was passed in.
    """
    try:
        headers = {}
        params = {}
        if since:
            params['since'] = since
        root = base_url.rstrip('/')
        url = f"{root}/api/v1/content.delta" if since else f"{root}/api/v1/content.index"
        if etag:
            headers['If-None-Match'] = etag
        # follow_redirects handles peers that force HTTPS and issue 301s
        async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
            r = await client.get(url, headers=headers, params=params)
            if r.status_code != 200:
                # 304 Not Modified and every other non-200 are treated alike.
                return [], etag
            j = r.json()
            new_etag = r.headers.get('ETag') or etag
            return j.get('items') or [], (j.get('next_since') or new_etag or etag)
    except Exception:
        # Best effort: an unreachable or misbehaving peer must not break the loop.
        return [], etag


async def upsert_content(item: dict):
    """Create or update the local ``EncryptedContent`` row for an index item.

    Items without ``encrypted_cid`` are ignored. On update, falsy values in
    *item* leave the existing column untouched. When the item carries a
    ``cover_url`` and no ready ``decrypted_thumbnail`` derivative exists, the
    cover is downloaded and stored as one; failures there are only logged.
    """
    cid = item.get('encrypted_cid')
    if not cid:
        return
    async with db_session() as session:
        row = (await session.execute(
            select(EncryptedContent).where(EncryptedContent.encrypted_cid == cid)
        )).scalars().first()
        if not row:
            row = EncryptedContent(
                encrypted_cid=cid,
                title=item.get('title') or cid,
                description=item.get('description') or '',
                content_type=item.get('content_type') or 'application/octet-stream',
                enc_size_bytes=item.get('size_bytes'),
                preview_enabled=bool(item.get('preview_enabled')),
                preview_conf=item.get('preview_conf') or {},
                salt_b64=item.get('salt_b64'),
            )
            session.add(row)
        else:
            row.title = item.get('title') or row.title
            row.description = item.get('description') or row.description
            row.content_type = item.get('content_type') or row.content_type
            row.enc_size_bytes = item.get('size_bytes') or row.enc_size_bytes
            # An explicit False must be honoured, so test against None here.
            if item.get('preview_enabled') is not None:
                row.preview_enabled = bool(item.get('preview_enabled'))
            if item.get('preview_conf'):
                row.preview_conf = item['preview_conf']
            if item.get('salt_b64'):
                row.salt_b64 = item['salt_b64']
        await session.commit()

    # Fetch thumbnail via HTTP if provided and not present locally
    cover_url = item.get('cover_url')
    if not cover_url:
        return
    try:
        async with db_session() as session:
            ec = (await session.execute(
                select(EncryptedContent).where(EncryptedContent.encrypted_cid == cid)
            )).scalars().first()
            have_thumb = (await session.execute(
                select(ContentDerivative).where(
                    ContentDerivative.content_id == ec.id,
                    ContentDerivative.kind == 'decrypted_thumbnail',
                    ContentDerivative.status == 'ready',
                )
            )).scalars().first()
            if have_thumb:
                return
            async with httpx.AsyncClient(timeout=30) as client:
                r = await client.get(cover_url)
                r.raise_for_status()
            # delete=False: the file must outlive the handle so that
            # _save_derivative can read it by path.
            with tempfile.NamedTemporaryFile(delete=False) as tmp:
                tmp.write(r.content)
            try:
                # Save into store
                from app.core.background.convert_v3_service import _save_derivative
                h, size = await _save_derivative(tmp.name, os.path.basename(cover_url) or 'thumb.jpg')
            finally:
                # Do not leak the temp file; harmless if the helper already
                # moved or removed it.
                with suppress(OSError):
                    os.unlink(tmp.name)
            cd = ContentDerivative(
                content_id=ec.id,
                kind='decrypted_thumbnail',
                local_path=os.path.join(os.getenv('UPLOADS_DIR', '/app/data'), h),
                content_type=r.headers.get('Content-Type') or 'image/jpeg',
                size_bytes=size,
                status='ready',
            )
            session.add(cd)
            await session.commit()
    except Exception as e:
        make_log('index_scout_v3', f"thumbnail fetch failed for {cid}: {e}", level='warning')


def _node_base_url(node: KnownNode) -> Optional[str]:
    """Build the HTTP base URL of *node*, or ``None`` if it has no address.

    ``meta['public_host']`` wins over ``node.ip``. A public host that already
    carries ``http://`` / ``https://`` is returned as-is (minus trailing
    slashes). Otherwise the scheme is ``https`` when ``node.port == 443`` and
    ``http`` in every other case; the port is appended only for the IP form
    and only when it differs from the scheme's default.
    """
    meta = node.meta or {}
    scheme = 'https' if node.port == 443 else 'http'

    public_host = (meta.get('public_host') or '').strip()
    if public_host:
        base = public_host.rstrip('/')
        if base.startswith(('http://', 'https://')):
            return base
        return f"{scheme}://{base.lstrip('/')}"

    host = (node.ip or '').strip()
    if not host:
        return None
    default_port = 443 if scheme == 'https' else 80
    if node.port and node.port != default_port:
        return f"{scheme}://{host}:{node.port}"
    return f"{scheme}://{host}"


async def _update_remote_index(node_id: int, items: List[dict], *, incremental: bool):
    """Mirror a peer's index items into ``RemoteContentIndex``.

    Existing rows (matched on ``encrypted_hash``) get their content type and
    metadata merged; unknown CIDs are inserted. When *incremental* is false
    the batch is treated as the full index and rows of this node that are not
    part of it are deleted. An empty *items* list is a no-op.
    """
    if not items:
        return
    async with db_session() as session:
        existing_rows = (await session.execute(
            select(RemoteContentIndex).where(RemoteContentIndex.remote_node_id == node_id)
        )).scalars().all()
        existing_map = {row.encrypted_hash: row for row in existing_rows if row.encrypted_hash}
        seen = set()
        now = datetime.utcnow()
        for item in items:
            cid = item.get('encrypted_cid')
            if not cid:
                continue
            seen.add(cid)
            payload_meta = {
                'title': item.get('title'),
                'description': item.get('description'),
                'size_bytes': item.get('size_bytes'),
                'preview_enabled': item.get('preview_enabled'),
                'preview_conf': item.get('preview_conf'),
                'issuer_node_id': item.get('issuer_node_id'),
                'salt_b64': item.get('salt_b64'),
            }
            # Drop absent fields so a merge never overwrites known values with None.
            meta_clean = {k: v for k, v in payload_meta.items() if v is not None}
            row = existing_map.get(cid)
            if row:
                row.content_type = item.get('content_type') or row.content_type
                row.meta = {**(row.meta or {}), **meta_clean}
                row.last_updated = now
            else:
                row = RemoteContentIndex(
                    remote_node_id=node_id,
                    content_type=item.get('content_type') or 'application/octet-stream',
                    encrypted_hash=cid,
                    meta=meta_clean,
                    last_updated=now,
                )
                session.add(row)
        if not incremental and existing_map:
            for hash_value, row in existing_map.items():
                if hash_value not in seen:
                    await session.delete(row)
        await session.commit()


async def _set_event_status(event_id, status: str) -> None:
    """Persist a new status for one ``NodeEvent``; stamps ``applied_at`` for 'applied'."""
    async with db_session() as session:
        ref = await session.get(NodeEvent, event_id)
        if ref:
            ref.status = status
            if status == 'applied':
                ref.applied_at = datetime.utcnow()
            await session.commit()


async def _pin_via_gateway(node: KnownNode, cid: str) -> bool:
    """Download *cid* from the node's HTTP gateway and re-add it locally.

    Returns ``True`` when the content was added and the resulting hash equals
    *cid*, ``False`` when no gateway host can be derived from the node.
    Raises on download/add errors and on a CID mismatch.
    """
    node_host = node.meta.get('public_host') if isinstance(node.meta, dict) else None
    # Derive gateway host: prefer public_host domain if present
    parsed = urlparse(node_host) if node_host else None
    gateway_host = parsed.hostname if parsed and parsed.hostname else (node.ip or '').split(':')[0]
    gateway_port = parsed.port if (parsed and parsed.port not in (None, 80, 443)) else 8080
    if not gateway_host:
        return False

    gateway_url = f"http://{gateway_host}:{gateway_port}/ipfs/{cid}"
    make_log('index_scout_v3', f"fallback download start {cid} via {gateway_url}", level='debug')
    async with httpx.AsyncClient(timeout=None) as client:
        resp = await client.get(gateway_url)
        resp.raise_for_status()
        data = resp.content
    chunk_bytes = int(os.getenv('CRYPTO_CHUNK_BYTES', '1048576'))
    add_params = {
        'cid-version': 1,
        'raw-leaves': 'true',
        'chunker': f'size-{chunk_bytes}',
        'hash': 'sha2-256',
        'pin': 'true',
    }
    result = await add_streamed_file([data], filename=f'{cid}.bin', params=add_params)
    if str(result.get('Hash')) != str(cid):
        raise ValueError(f"gateway add returned mismatched CID {result.get('Hash')}")
    make_log('index_scout_v3', f"pin {cid} fetched via gateway {gateway_host}:{gateway_port}", level='info')
    return True


async def _pin_one(node: KnownNode, cid: str, sem: asyncio.Semaphore) -> None:
    """Pin *cid* locally, holding *sem* for the whole attempt.

    Order of attempts: connect to the node's advertised multiaddrs, return
    early if the pin already exists, pre-connect to discovered providers,
    ``pin_add`` with a 60 s timeout, and finally the HTTP gateway fallback.
    Pin failures are logged, never raised.
    """
    async with sem:
        try:
            node_ipfs_meta = (node.meta or {}).get('ipfs') or {}
            multiaddrs = node_ipfs_meta.get('multiaddrs') or []
            for addr in multiaddrs:
                try:
                    await swarm_connect(addr)
                except Exception:
                    pass  # connection hints are optional
            try:
                existing = await pin_ls(cid)
                if existing and existing.get('Keys'):
                    make_log('index_scout_v3', f"pin {cid} already present", level='debug')
                    return
            except Exception:
                pass  # cannot tell; fall through and try to pin
            # Try to pre-connect to discovered providers
            try:
                provs = await find_providers(cid, max_results=5)
                for p in provs:
                    for addr in (p.get('addrs') or [])[:2]:
                        try:
                            await swarm_connect(addr)
                        except Exception:
                            pass
            except Exception:
                pass
            try:
                await asyncio.wait_for(pin_add(cid, recursive=True), timeout=60)
                return
            except httpx.HTTPStatusError as http_err:
                body = (http_err.response.text or '').lower() if http_err.response else ''
                if 'already pinned' in body or 'pin already set' in body:
                    make_log('index_scout_v3', f"pin {cid} already present", level='debug')
                    return
                raise
        except Exception as e:
            # Attempt HTTP gateway fallback before logging failure
            fallback_sources = []
            try:
                if await _pin_via_gateway(node, cid):
                    return
                fallback_sources.append('gateway-host-missing')
            except Exception as fallback_err:
                fallback_sources.append(str(fallback_err))
            make_log('index_scout_v3', f"pin {cid} failed: {e}; fallback={'; '.join(fallback_sources) if fallback_sources else 'none'}", level='warning')


async def _process_pending_events(node_by_pk: dict, sem: asyncio.Semaphore) -> None:
    """Claim up to 25 pending ``content_indexed`` events and pin their CIDs.

    *node_by_pk* maps public key -> ``KnownNode`` and is extended with nodes
    looked up on demand. Events from this node or without a CID are marked
    applied straight away; events from unknown nodes go back to 'recorded'.
    """
    async with db_session() as session:
        pending_events = (await session.execute(
            select(NodeEvent)
            .where(NodeEvent.event_type == 'content_indexed', NodeEvent.status.in_(('recorded', 'local', 'processing')))
            .order_by(NodeEvent.created_at.asc())
            .limit(25)
        )).scalars().all()
        for ev in pending_events:
            if ev.status != 'processing':
                ev.status = 'processing'
        await session.commit()

    for ev in pending_events:
        payload = ev.payload or {}
        cid = payload.get('encrypted_cid') or payload.get('content_cid')
        # Own events and events without a CID need no pinning.
        if ev.origin_public_key == LOCAL_PUBLIC_KEY or not cid:
            await _set_event_status(ev.id, 'applied')
            continue

        node = node_by_pk.get(ev.origin_public_key)
        if not node:
            async with db_session() as session:
                node = (await session.execute(
                    select(KnownNode).where(KnownNode.public_key == ev.origin_public_key)
                )).scalar_one_or_none()
            if node:
                node_by_pk[node.public_key] = node
        if not node:
            make_log('index_scout_v3', f"Event {ev.uid} refers to unknown node {ev.origin_public_key}", level='debug')
            await _set_event_status(ev.id, 'recorded')
            continue

        try:
            # _pin_one logs and swallows pin failures, so the event counts as
            # applied once the attempt has completed; the except branch only
            # covers unexpected errors (e.g. the status update itself).
            await _pin_one(node, cid, sem)
            await _set_event_status(ev.id, 'applied')
        except Exception as exc:
            make_log('index_scout_v3', f"Event pin failed for {cid}: {exc}", level='warning')
            await _set_event_status(ev.id, 'recorded')


async def _sync_node(n: KnownNode, sem: asyncio.Semaphore, disk_pct: int) -> None:
    """Poll one node's index, mirror it locally and pin the advertised CIDs."""
    base = _node_base_url(n)
    if not base:
        return
    # jitter 0..30s per node to reduce stampede
    await asyncio.sleep(random.uniform(0, 30))
    etag = (n.meta or {}).get('index_etag')
    since = (n.meta or {}).get('index_since')
    items, marker = await fetch_index(base, etag, since)
    if not items and marker == etag:
        return

    # update node markers
    try:
        async with db_session() as session:
            row = (await session.execute(select(KnownNode).where(KnownNode.id == n.id))).scalars().first()
            if row:
                # Work on a copy: assigning back the same, in-place-mutated
                # dict may not be detected as a change by the ORM.
                meta = dict(row.meta or {})
                meta['index_etag'] = marker
                # Only markers containing 'T' are stored as the 'since' cursor
                # (presumably ISO timestamps from next_since — TODO confirm).
                meta['index_since'] = marker if (marker and 'T' in str(marker)) else meta.get('index_since')
                row.meta = meta
                await session.commit()
    except Exception as exc:
        make_log('index_scout_v3', f"marker update failed for node {n.id}: {exc}", level='warning')

    if not items:
        return
    make_log('index_scout_v3', f"Fetched {len(items)} from {base}")
    try:
        await _update_remote_index(n.id, items, incremental=bool(since))
    except Exception as exc:
        make_log('index_scout_v3', f"remote index update failed for node {n.id}: {exc}", level='warning')

    # Check disk watermark
    try:
        from app.core._config import UPLOADS_DIR
        du = shutil.disk_usage(UPLOADS_DIR)
        used_pct = int(100 * (1 - du.free / du.total))
        if used_pct >= disk_pct:
            make_log('index_scout_v3', f"Disk watermark reached ({used_pct}%), skipping pins")
            return
    except Exception as exc:
        # Best effort: if usage cannot be determined, pinning proceeds.
        make_log('index_scout_v3', f"disk watermark check failed: {exc}", level='debug')

    tasks = []
    for it in items:
        await upsert_content(it)
        cid = it.get('encrypted_cid')
        if cid:
            make_log('index_scout_v3', f"queue pin {cid}")
            tasks.append(asyncio.create_task(_pin_one(n, cid, sem)))
    if tasks:
        await asyncio.gather(*tasks)


async def main_fn(memory):
    """Service entry point: run scout passes forever, ``INTERVAL_SEC`` apart.

    Each pass reloads the pin concurrency and disk watermark from
    ``ServiceConfig`` (falling back to the environment defaults), processes
    pending events and then syncs every known node. Any exception aborts the
    current pass only; it is logged and the loop continues. *memory* is not
    used here.
    """
    make_log('index_scout_v3', 'Service started', level='info')
    sem = None
    sem_limit = None
    while True:
        try:
            # Read runtime config from ServiceConfig (fallback to env)
            from app.core.models._config import ServiceConfig
            async with db_session() as session:
                max_pins = int(await ServiceConfig(session).get('SYNC_MAX_CONCURRENT_PINS', ENV_PIN_CONCURRENCY))
                disk_pct = int(await ServiceConfig(session).get('SYNC_DISK_LOW_WATERMARK_PCT', ENV_DISK_WATERMARK_PCT))
            # A limit below 1 would make every pin wait forever.
            max_pins = max(1, max_pins)
            if sem is None or sem_limit != max_pins:
                sem = asyncio.Semaphore(max_pins)
                sem_limit = max_pins

            async with db_session() as session:
                nodes = (await session.execute(select(KnownNode))).scalars().all()
            node_by_pk = {n.public_key: n for n in nodes if n.public_key}

            await _process_pending_events(node_by_pk, sem)
            for n in nodes:
                await _sync_node(n, sem, disk_pct)
        except Exception as e:
            make_log('index_scout_v3', f"loop error: {e}", level='error')
        await asyncio.sleep(INTERVAL_SEC)