Chapter 10 Orchestration, Scheduling & Resources


I. Chapter Purpose & Scope

Fix pipeline orchestration, scheduling, and resource specifications: orchestrator backends & DAG submission, priority & preemption, triggers & dependencies, retry & timeouts, SLA/SLO & alerts, resource profiling & quotas, autoscaling & cost metrology; ensure alignment with data contracts, DQ gates, monitoring, and the Metrology chapter.

II. Terminology & Dependencies


III. Fields & Structure (Normative)

# Normative field layout for this chapter.
# Pipe-separated strings (e.g. "airflow|argo|ray|custom") list the allowed
# alternatives for a field; a concrete config picks exactly one of them
# (the ORCH.ORCHESTRATOR_ALLOWED lint rule confirms this for `orchestrator`;
# presumably the same convention applies to `queue`, `qos` and `metric`).
#
# NOTE(review): nesting is reconstructed from the lint-rule paths in this
# chapter ($.orchestration.orchestrator, $.scheduling.sla, and
# value.policy.min_replicas under $.autoscale) and from the drop-in
# fragment, where `dag` is a mapping holding max_concurrency and backfill.
# Placing `dependencies`/`triggers` under `orchestration` and `alert_rules`
# under `scheduling` (as a sibling of `sla`) is the most likely reading;
# confirm against the authoritative schema.
orchestration:
  orchestrator: "airflow|argo|ray|custom"
  dag:
    max_concurrency: 128
    # window looks like an ISO 8601 duration (7 days) - TODO confirm
    backfill: {enabled: true, window: "P7D"}
  dependencies:
    - {from: "validate.schema", to: "transform.normalize"}
    - {from: "transform.normalize", to: "feature.map"}
  triggers:
    cron: "5 * * * *"
    event:  # optional
      source: "kafka"
      topic: "ds.ready"
      group: "pipeline-consumer"

scheduling:
  queue: "high|default|low"
  priority: 5
  preempt: true
  retries: {max: 3, backoff: "expo", jitter_ms: 200}
  timeout_s: 3600
  sla:
    latency_ms: {p50: 5000, p95: 15000, p99: 30000}
    availability: 0.999
    error_rate: 0.01
  alert_rules:
    # Threshold mirrors sla.latency_ms.p99 above; keep the two in sync.
    - name: "sla_breach_p99"
      rule: "latency_ms.p99>30000 for 10m"
      severity: "high"

resources:
  # Lint rule RES.REQUESTS_LIMITS requires requests <= limits for cpu/mem_gb.
  requests: {cpu: 4, mem_gb: 16, gpu: 0}
  limits: {cpu: 8, mem_gb: 32, gpu: 0}
  disk_gb: 200
  net_mbps: 800
  qos: "burstable|guaranteed|best-effort"

autoscale:
  enabled: true
  policy:
    metric: "qps|latency_ms.p95|cpu|custom"
    target: 0.7
    # Lint rule AUTOSCALE.BOUNDS requires 1 <= min_replicas <= max_replicas.
    min_replicas: 2
    max_replicas: 64
    cooldown_s: 120

cost:
  budget:
    currency: "USD"
    monthly_cap: 2000
  pricing_refs:
    compute: "pricing/compute@v1.0"
    storage: "pricing/storage@v1.0"
    egress: "pricing/egress@v1.0"

metrology:
  # Lint rule METROLOGY.SI_AND_CHECKDIM requires exactly these two values.
  units: "SI"
  check_dim: true


IV. Orchestrator Backend & Submission


V. Scheduling Strategy & Failure Semantics


VI. Resource Profiling & Quotas


VII. Autoscaling & Elasticity


VIII. Cost Metrology & Budgeting


IX. Metrology & Units (SI)


X. Machine-Readable Fragment (Drop-in)

# Drop-in example configuration: one concrete value per field.
#
# NOTE(review): nesting is reconstructed from the lint-rule paths in this
# chapter ($.orchestration.orchestrator, $.scheduling.sla, and
# value.policy.min_replicas under $.autoscale). Placing
# `dependencies`/`triggers` under `orchestration` and `alert_rules` under
# `scheduling` (as a sibling of `sla`) is the most likely reading; confirm
# against the authoritative schema before consuming this fragment.
orchestration:
  orchestrator: "argo"
  dag:
    max_concurrency: 256
    backfill: {enabled: true, window: "P3D"}
  dependencies:
    - {from: "validate.schema", to: "transform.normalize"}
    - {from: "transform.normalize", to: "feature.map"}
  triggers:
    cron: "5 * * * *"

scheduling:
  queue: "high"
  priority: 8
  preempt: true
  retries: {max: 3, backoff: "expo", jitter_ms: 200}
  timeout_s: 5400
  sla:
    latency_ms: {p50: 3000, p95: 10000, p99: 20000}
    availability: 0.999
    error_rate: 0.005
  alert_rules:
    # Threshold mirrors sla.latency_ms.p99 above; keep the two in sync.
    - name: "p99_breach"
      rule: "latency_ms.p99>20000 for 10m"
      severity: "high"

resources:
  requests: {cpu: 8, mem_gb: 32, gpu: 0}
  limits: {cpu: 16, mem_gb: 64, gpu: 0}
  disk_gb: 500
  net_mbps: 1200
  qos: "guaranteed"

autoscale:
  enabled: true
  policy:
    metric: "qps"
    target: 0.7
    min_replicas: 4
    max_replicas: 64
    cooldown_s: 120

cost:
  budget: {currency: "USD", monthly_cap: 5000}
  pricing_refs:
    compute: "pricing/compute@v1.0"
    storage: "pricing/storage@v1.0"
    egress: "pricing/egress@v1.0"

metrology: {units: "SI", check_dim: true}


XI. Lint Rules (Excerpt, Normative)

# Excerpt of the normative lint rules for this chapter. Each rule carries an
# `id`, a `when` path naming the config node it targets, an `assert`
# expression, and a `level`.
#
# NOTE(review): the expressions are not uniform. Some address the matched
# node as `value` (value.max, value.enabled) while others use bare child
# names (latency_ms, requests.cpu, units). The expressions are reproduced
# unchanged here; confirm that the lint engine accepts both forms.
lint_rules:
  - id: ORCH.ORCHESTRATOR_ALLOWED
    when: "$.orchestration.orchestrator"
    assert: "value in ['airflow','argo','ray','custom']"
    level: error

  - id: SCHED.TIMEOUT_DEFINED
    when: "$.scheduling.timeout_s"
    assert: "is_number(value) and value > 0"
    level: error

  - id: SCHED.RETRIES_VALID
    when: "$.scheduling.retries"
    assert: "value.max >= 0 and value.backoff in ['expo','linear']"
    level: error

  - id: SLA.METRICS_DEFINED
    when: "$.scheduling.sla"
    assert: "has_keys(latency_ms, availability, error_rate)"
    level: error

  - id: RES.REQUESTS_LIMITS
    when: "$.resources"
    # Folded scalar: the two lines join with a single space into one string.
    assert: >-
      has_keys(requests, limits) and requests.cpu <= limits.cpu
      and requests.mem_gb <= limits.mem_gb
    level: error

  - id: AUTOSCALE.BOUNDS
    when: "$.autoscale"
    # Replica bounds are only enforced when autoscaling is enabled.
    # Folded scalar: the two lines join with a single space into one string.
    assert: >-
      value.enabled == false or (value.policy.min_replicas >= 1
      and value.policy.max_replicas >= value.policy.min_replicas)
    level: error

  - id: METROLOGY.SI_AND_CHECKDIM
    when: "$.metrology"
    assert: "units == 'SI' and check_dim == true"
    level: error


XII. Export Manifest & Audit

# Export manifest listing the artifacts produced for this chapter.
# The "..." sha256 values look like placeholders to be replaced with the
# real digest of each artifact - TODO confirm.
# NOTE(review): `references` is placed under `export_manifest` as the most
# likely reading of the flattened original; confirm against the schema.
export_manifest:
  version: "v1.0"
  artifacts:
    - {path: "orchestration/dag.yaml", sha256: "..."}
    - {path: "scheduling/policies.yaml", sha256: "..."}
    - {path: "resources/usage.report.csv", sha256: "..."}
    - {path: "autoscale/history.csv", sha256: "..."}
    - {path: "cost/monthly_report.csv", sha256: "..."}
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"


XIII. Chapter Compliance Checklist