71 lines
2.5 KiB
Python
71 lines
2.5 KiB
Python
"""JSON IO Manager for storing asset outputs as JSON files"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Any
|
|
from dagster import IOManager, io_manager, OutputContext, InputContext
|
|
|
|
|
|
class JSONIOManager(IOManager):
    """IO manager that stores asset outputs as JSON files.

    Each asset is written to ``<base_path>/<run_id>/<asset_name>.json``, where
    ``asset_name`` is the last component of the asset key.

    Args:
        base_path: Root directory holding one subdirectory per run. It is
            created on construction if it does not exist yet.
    """

    def __init__(self, base_path: str = "data"):
        self.base_path = Path(base_path)
        self.base_path.mkdir(parents=True, exist_ok=True)

    def _get_path(self, context) -> Path:
        """Return the JSON file path for the asset referenced by ``context``.

        For an output context the path lives under the current run's
        directory, which is created if needed.

        For an input context the upstream output's run id is tried first. If
        that lookup fails, run directories under ``base_path`` are scanned
        from most to least recently modified and the first one containing the
        asset's file wins (this supports partial re-runs). If no directory
        contains the file, a path inside the most recent run directory (or
        ``unknown`` when there are none) is returned; it will not exist.

        Args:
            context: The Dagster ``OutputContext`` or ``InputContext``.

        Returns:
            Path of the asset's JSON file. Not guaranteed to exist for inputs.
        """
        file_name = f"{context.asset_key.path[-1]}.json"

        if not isinstance(context, InputContext):
            # Only the write path creates directories. Creating them while
            # resolving an input would leave empty run directories behind and
            # skew the mtime-based "latest run" fallback below.
            run_dir = self.base_path / context.run_id
            run_dir.mkdir(parents=True, exist_ok=True)
            return run_dir / file_name

        try:
            run_id = context.upstream_output.run_id
        except Exception:
            # Deliberately broad best-effort fallback, but not a bare
            # ``except`` so KeyboardInterrupt/SystemExit still propagate.
            run_dirs = sorted(
                (entry for entry in self.base_path.iterdir() if entry.is_dir()),
                key=lambda entry: entry.stat().st_mtime,
                reverse=True,
            )
            for run_dir in run_dirs:
                candidate = run_dir / file_name
                if candidate.exists():
                    return candidate
            # Nothing found: point at the latest run so the caller reports a
            # meaningful missing path.
            run_id = run_dirs[0].name if run_dirs else "unknown"

        return self.base_path / run_id / file_name

    def handle_output(self, context: OutputContext, obj: Any):
        """Serialize ``obj`` as indented JSON into the current run's directory.

        Args:
            context: Output context identifying the asset and run.
            obj: JSON-serializable value produced by the asset.

        Raises:
            TypeError: If ``obj`` is not JSON-serializable (from ``json.dump``).
        """
        file_path = self._get_path(context)

        # Explicit encoding keeps files portable across platform locales.
        with open(file_path, 'w', encoding="utf-8") as f:
            json.dump(obj, f, indent=2)

        context.log.info(f"Saved {context.asset_key.path[-1]} to {file_path}")

    def load_input(self, context: InputContext) -> Any:
        """Load and return the JSON-decoded value of an upstream asset.

        Args:
            context: Input context identifying the upstream asset.

        Returns:
            The object decoded from the asset's JSON file.

        Raises:
            FileNotFoundError: If no JSON file exists for the asset.
        """
        file_path = self._get_path(context)

        if not file_path.exists():
            raise FileNotFoundError(f"Asset output not found: {file_path}")

        with open(file_path, 'r', encoding="utf-8") as f:
            obj = json.load(f)

        context.log.info(f"Loaded {context.asset_key.path[-1]} from {file_path}")
        return obj
|
|
|
|
|
|
@io_manager
def json_io_manager():
    """Build the JSON IO manager resource rooted at the ``data`` directory."""
    manager = JSONIOManager(base_path="data")
    return manager
|