Chapter 15 Fault Tolerance, Recovery & DR


I. Chapter Purpose & Scope

Fix pipeline specifications for fault tolerance, recovery, and disaster recovery (DR): failure semantics and compensations, retry and timeouts, idempotency and deduplication, checkpoints and snapshots, backups and replay, RTO/RPO and drills, cross-AZ/region failover and fallback, exports and audit; ensure alignment with contracts/scheduling/monitoring and the Metrology chapter.

II. Terminology & Dependencies


III. Fields & Structure (Normative)

# Normative field layout for this chapter.
# Pipe-separated strings ("a|b") enumerate the allowed choices; a concrete
# configuration picks exactly one of them.
# Angle-bracket tokens (<pk>, <stage>, <dataset>) and "..." are placeholders.
fault_tolerance:
  semantics:
    on_fail: "retry|skip|quarantine|block"
    error_classes: ["retryable", "non_retryable", "escalate"]
  retry:
    policy: {max: 3, backoff: "expo", jitter_ms: 200}
    timeout_s: 1800
  idempotency:
    enabled: true
    dedupe_key: ["<pk>", "<offset|ts>"]
    sink_mode: "idempotent-insert|upsert"
  compensation:
    enabled: true
    # One handler per stage that needs a compensating action.
    handlers:
      - stage: "transform.normalize"
        action: "reverse_op"
        spec: "comp/normalize.reverse.yaml"
      - stage: "feature.map"
        action: "delete_artifact"
        spec: "comp/delete.manifest.yaml"

recovery:
  checkpoint:
    mode: "exactly-once|at-least-once"
    store: "s3://.../chk/<stage>"
    cadence: "PT5M"
    contents: ["offset", "cursor", "watermark", "sink_commit"]
  snapshot:
    enabled: true
    store: "s3://.../snap/<dataset>"
    cadence: "P1D"
    retention: "P30D"
  replay:
    enabled: true
    inputs_lock: "locks/inputs.manifest.json"
    policy: "strict|lenient"
  # NOTE(review): the flattened original does not show whether `rollbacks`
  # belongs under `recovery` or at the top level; placed here because it
  # directly follows `replay` -- confirm against the consuming schema.
  rollbacks:
    guardrail: {max_depth: 2, require_approval: true}

dr:
  strategy: "active-active|active-passive"
  topology:
    primary: {region: "eu-west-1", azs: ["a", "b"], quorum: 3}
    standby: {region: "eu-central-1", azs: ["a", "b"], quorum: 3}
  # The lint rule DR.RTO_RPO_DEFINED requires both keys directly under `dr`.
  rto: "PT30M"
  rpo: "PT5M"
  failover:
    trigger: "manual|auto"
    health_checks: ["latency_ms.p99", "error_rate", "heartbeat"]
    dns_ttl_s: 60
  fallback:
    criteria: ["primary_healthy_24h", "replication_lag<PT1M"]

testing:
  chaos:
    enabled: true
    experiments:
      - {name: "kill-worker", scope: "stage", percent: 10}
      - {name: "net-partition", scope: "cluster", duration_s: 300}
      - {name: "disk-throttle", scope: "node", mbps: 50}
  drills:
    schedule: "quarterly"
    playbooks: ["dr/runbook.md", "rollback/runbook.md"]
    success_criteria: ["rto_met", "rpo_met", "no_data_loss", "alerting_ok"]

backups:
  datasets: ["feat_rows", "train_pkg"]
  cadence: "P1D"
  store: "s3://.../backup"
  encryption: "SSE-KMS"
  integrity: {hash: "sha256", manifest: "backup/manifest.json"}


IV. Failure Semantics, Retry & Idempotency


V. Compensation, Rollback & Replay


VI. Checkpoints, Snapshots & Backups


VII. DR Strategy & Switchovers


VIII. Chaos Testing, Drills & Success Criteria


IX. Metrology & Units (SI)

  1. Perf & objectives: RTO, RPO, T_inf (ms), QPS (1/s), ρ (—); net_mbps, size_bytes.
  2. Mandatory: metrology: {units: "SI", check_dim: true}; normalize units before composition/conversion.
  3. Path quantities: if FT/Recovery processes handle T_arr, register delta_form, path="gamma(ell)", measure="d ell", and use:
    • T_arr = ( 1 / c_ref ) * ( ∫ n_eff d ell ), or
    • T_arr = ( ∫ ( n_eff / c_ref ) d ell ),
      with check_dim validation.

X. Machine-Readable Fragment (Drop-in)

# Drop-in configuration fragment: a concrete instance of the chapter's field
# layout, with one value chosen for every enumerated option.
fault_tolerance:
  semantics:
    on_fail: "retry"
    error_classes: ["retryable", "non_retryable", "escalate"]
  retry:
    policy: {max: 3, backoff: "expo", jitter_ms: 200}
    timeout_s: 1800
  idempotency:
    enabled: true
    dedupe_key: ["id", "updated_at"]
    sink_mode: "upsert"
  compensation:
    enabled: true
    handlers:
      - stage: "feature.map"
        action: "delete_artifact"
        spec: "comp/delete.manifest.yaml"

recovery:
  checkpoint:
    mode: "exactly-once"
    store: "s3://meta/chk/feat.map"
    cadence: "PT5M"
    contents: ["offset", "cursor", "watermark", "sink_commit"]
  snapshot:
    enabled: true
    store: "s3://snap/feat_rows"
    cadence: "P1D"
    retention: "P30D"
  replay:
    enabled: true
    inputs_lock: "locks/inputs.manifest.json"
    policy: "strict"
  # NOTE(review): the flattened original does not show whether `rollbacks`
  # belongs under `recovery` or at the top level; placed here because it
  # directly follows `replay` -- confirm against the consuming schema.
  rollbacks:
    guardrail: {max_depth: 2, require_approval: true}

dr:
  strategy: "active-passive"
  topology:
    primary: {region: "eu-west-1", azs: ["a", "b"], quorum: 3}
    standby: {region: "eu-central-1", azs: ["a", "b"], quorum: 3}
  rto: "PT30M"
  rpo: "PT5M"
  failover:
    trigger: "auto"
    health_checks: ["latency_ms.p99", "error_rate", "heartbeat"]
    dns_ttl_s: 60
  fallback:
    criteria: ["primary_healthy_24h", "replication_lag<PT1M"]

testing:
  chaos:
    enabled: true
    experiments:
      - {name: "kill-worker", scope: "stage", percent: 10}
  drills:
    schedule: "quarterly"
    playbooks: ["dr/runbook.md"]
    success_criteria: ["rto_met", "rpo_met", "no_data_loss"]

backups:
  datasets: ["feat_rows", "train_pkg"]
  cadence: "P1D"
  store: "s3://backup"
  encryption: "SSE-KMS"
  integrity: {hash: "sha256", manifest: "backup/manifest.json"}

metrology: {units: "SI", check_dim: true}


XI. Lint Rules (Excerpt, Normative)

# Lint rules for this chapter's configuration (excerpt).
# Each rule carries an `id`, a `when` path naming the node it targets, an
# `assert` expression, and a `level`.
# NOTE(review): evaluation semantics (e.g. whether a rule is skipped when its
# `when` path is absent) are defined by the lint engine, not shown here.
lint_rules:
  - id: FT.IDEMPOTENCY_REQUIRED
    when: "$.fault_tolerance.idempotency.enabled"
    assert: "value == true"
    level: error

  - id: RC.CHECKPOINT_DEFINED
    when: "$.recovery.checkpoint"
    assert: "has_keys(mode, store, cadence)"
    level: error

  - id: DR.RTO_RPO_DEFINED
    when: "$.dr"
    assert: "has_keys(rto, rpo) and duration_valid(rto) and duration_valid(rpo)"
    level: error

  - id: DR.STRATEGY_ALLOWED
    when: "$.dr.strategy"
    assert: "value in ['active-active','active-passive']"
    level: error

  - id: TEST.DRILLS_SCHEDULED
    when: "$.testing.drills.schedule"
    assert: "matches('^(monthly|quarterly|biannual|annual)$') or duration_valid(value)"
    level: error

  - id: BKP.INTEGRITY_MANIFEST
    when: "$.backups"
    assert: "has_keys(store, cadence, integrity)"
    level: error

  - id: METROLOGY.SI_AND_CHECKDIM
    when: "$.metrology"
    assert: "units == 'SI' and check_dim == true"
    level: error


XII. Export Manifest & Audit

# Export manifest for this chapter: the artifacts to ship, each with a
# sha256 digest field ("..." is a placeholder to be filled in at export time).
export_manifest:
  version: "v1.0"
  artifacts:
    - path: "chk/catalog.json"
      sha256: "..."
    - path: "snap/retention.policy"
      sha256: "..."
    - path: "dr/runbook.md"
      sha256: "..."
    - path: "dr/drill_reports/2025Q3.md"
      sha256: "..."
    - path: "backup/manifest.json"
      sha256: "..."
    - path: "comp/normalize.reverse.yaml"
      sha256: "..."
  # NOTE(review): the flattened original does not show whether `references`
  # is nested under `export_manifest` or is a sibling top-level key -- confirm.
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"
    - "EFT.WP.Data.Pipeline v1.0:Ch.11"
    - "EFT.WP.Data.Pipeline v1.0:Ch.12"


XIII. Chapter Compliance Checklist