diff --git a/.agent_state/README.md b/.agent_state/README.md new file mode 100644 index 0000000..1143d2b --- /dev/null +++ b/.agent_state/README.md @@ -0,0 +1,12 @@ +This directory stores persisted runtime state for agent_patterns workbench. + +Recommended to commit: +- registry.json +- budgets.json +- triggers.json +- failure_fingerprints.json +- repo_index.json (optional) + +Recommended to NOT commit: +- runs/ (large logs/artifacts) +- workspaces/ (patch sandboxes) diff --git a/.agent_state/budgets.json b/.agent_state/budgets.json new file mode 100644 index 0000000..e69de29 diff --git a/.agent_state/modules_history.json b/.agent_state/modules_history.json new file mode 100644 index 0000000..e69de29 diff --git a/.agent_state/registry.json b/.agent_state/registry.json new file mode 100644 index 0000000..e69de29 diff --git a/.agent_state/repo_index.json b/.agent_state/repo_index.json new file mode 100644 index 0000000..92064b4 --- /dev/null +++ b/.agent_state/repo_index.json @@ -0,0 +1,1104 @@ +{ + "version": "0.1.0", + "files": { + "config.py": { + "sha256": "c29548d6e01f86030b9546ee420e3a068ab62d3a90d83884ccb99fa0f94fb7b1", + "size": 619, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "docker-compose.yml": { + "sha256": "7c3cb6c491301146ff31ed7cbab11abd7aac3d3de72f1d8d56adb630b480e6bf", + "size": 777, + "mtime": 1772164906.9648027, + "lang": "yaml", + "skipped": false + }, + "pyproject.toml": { + "sha256": "7d0fffedc73bc8afe467188359453eacac9233d010b2613ce55f2dbf5f543f7c", + "size": 1118, + "mtime": 1772164906.9648027, + "lang": "toml", + "skipped": false + }, + "README.md": { + "sha256": "e1a39d5b7bfe4b068da647da81811e701d3c43a70f91eace7fe5484ac537dca8", + "size": 30758, + "mtime": 1772164906.9608028, + "lang": "markdown", + "skipped": false + }, + "run_context.py": { + "sha256": "04552581b504d5814ec35957bdec48c8fe4ede192d395b775c54692f7fa41631", + "size": 2070, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + 
}, + "WORKBENCH.md": { + "sha256": "f86a2b5bdde466460f42a25c25aec02d77e3b7e81aabd5922c040c6a00cd42b5", + "size": 2523, + "mtime": 1772164906.9608028, + "lang": "markdown", + "skipped": false + }, + "__init__.py": { + "sha256": "3bb38a6cbdbba08882ce14fc7c2a29a2c43b64bffd1f221cd1175b7bc2334d20", + "size": 368, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/run_context.py": { + "sha256": "7c7606d95f7b4d0e63126bd9d06d46138b8b4f3b67d6f1a9ecb88f32ea485b0b", + "size": 1072, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/__init__.py": { + "sha256": "35e1de4f0ba8a4bb2274be8fcc5ff9ababed9a2e7c228ad851b0c6285a7d20a9", + "size": 4353, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "evals/search_evals.py": { + "sha256": "7178f0fbf5c05eaa09b8c3bda1eb6244ebe319c03ef77938b1fe1565967f4673", + "size": 338, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "evals/workbench_evals.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "evals/workflow_evals.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "evals/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "agent_ext/agent/base.py": { + "sha256": "d56528d333f0d0b4313230a387739ce54fc8da9e5c648483e702d870496e9648", + "size": 4120, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/agent/memory_adapter.py": { + "sha256": "2439fba99afc33d54cadc341d9a03d87e5770c6bfb4ce6c7bdd4d975386af4eb", + "size": 11355, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + 
"agent_ext/agent/__init__.py": { + "sha256": "7a9d391faeefdf65426ef721194eea60d0407097913e976542deb7a95f097822", + "size": 446, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/backends/base.py": { + "sha256": "9aafcff8587df9cde7c357381a00b26b3663c16cde7bb940eef0a380880d7a3b", + "size": 544, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/backends/local_fs.py": { + "sha256": "c0d897b7f4954c3cb6648df49cdd9100de6f60f89faf695a0622a0172fc5fcd7", + "size": 1371, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/backends/sandbox_exec.py": { + "sha256": "4d3e6818caf695e750196033269ff6b81758c5194deb1defc428973f9cacf8a4", + "size": 884, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/backends/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/docs/architecture.md": { + "sha256": "b13f13ddf3068ecb373d547ba137989589aa9cfef39a58c36033150a3f3388e8", + "size": 644, + "mtime": 1772164906.9648027, + "lang": "markdown", + "skipped": false + }, + "agent_ext/docs/evidence.md": { + "sha256": "77ff6b4999a59afd45506eef0be2c789a1796df528f1be3f42469e1a8a1e9019", + "size": 350, + "mtime": 1772164906.9648027, + "lang": "markdown", + "skipped": false + }, + "agent_ext/docs/integration.md": { + "sha256": "e87682074b29ffdc4cf196ed46d4442cd4071e775d652ac33dc63010a9656b9c", + "size": 1012, + "mtime": 1772164906.9648027, + "lang": "markdown", + "skipped": false + }, + "agent_ext/docs/skill_athoring.md": { + "sha256": "1d10dc5d30fcdefaddb70e1dd83282888b4868768d05d205de249c4b931ca103", + "size": 323, + "mtime": 1772164906.9648027, + "lang": "markdown", + "skipped": false + }, + "agent_ext/evidence/citations.py": { + "sha256": "7bfc39504cfd909e614b560157f3934df557984e2b1669c34d2dbe2e1623689e", + 
"size": 288, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "agent_ext/evidence/models.py": { + "sha256": "1812827c9c2180c262b30c8cc210aa6250d5755c4ba69c52c2f124cd1185da29", + "size": 984, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "agent_ext/evidence/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "agent_ext/examples/ocr_with_agent_demo.py": { + "sha256": "dd39f5dcf4bc7f63ba7e48bc1db6cd806e8fc674b868bdb4e2a11c96f476f8c1", + "size": 5203, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "agent_ext/examples/starter_subagent.py": { + "sha256": "6688e233d3c41b25e49a04364f1d52e426479e7312c480c17bd312d7c6519e15", + "size": 782, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "agent_ext/export/base.py": { + "sha256": "e53f0402889a400d5a0ed01945a0ad938d4444b03b8713b75946721d0d15ae02", + "size": 880, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "agent_ext/export/docx_writer.py": { + "sha256": "5ab2bd36447086a28f22ea03d9516c6c5543b98622c92fe99c5ff2e1f31f0fa7", + "size": 1496, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/export/html_writer.py": { + "sha256": "33fd66c0ede96fef1c9b520e79d5ccee3cf4f2b92eba919e970808be1c5d8ab7", + "size": 1199, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/export/models.py": { + "sha256": "2b4d33a6e9a4a5b210fca47facec3797f9ad46c2278cc7e47b5fcbfb833adfe1", + "size": 562, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/export/pdf_writer.py": { + "sha256": "1ba18c4103eb729eba958f47372993017b5dfc9c78dc165352ae151d2dc609bb", + "size": 1626, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + 
"agent_ext/export/pptx_writer.py": { + "sha256": "7872b58e198239e01d828ad876b19193e9438c5313c0691f947c321462498450", + "size": 1330, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/export/__init__.py": { + "sha256": "e084859639173fe108bb5feebf2273e8c801318e9b908a66799cd8071c9d2590", + "size": 253, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "agent_ext/hooks/base.py": { + "sha256": "873d9b817b455ff8dbaec94fd805514d93b41c267be0b1496e112025c32ede5f", + "size": 1079, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/hooks/builtins.py": { + "sha256": "9aad8b21f6f2004474d5228c763f298346e02bc5bc0a697b91bbe892839b8bf2", + "size": 6849, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/hooks/chain.py": { + "sha256": "a3377170420b68a67bcd4518fa1740ba305b96cc3dd9f8114eb3b8062295123c", + "size": 1539, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/hooks/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/ingest/citations.py": { + "sha256": "4c02243d9ccf025b41f5b2323a52e1c5f1bca1d989a625e08909976dfa05dcbe", + "size": 473, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/ingest/docx_parser.py": { + "sha256": "48fe542061d129b0e0787a7e9e6e775716e129ac5d68d30f897a87480fb23090", + "size": 1987, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/ingest/extractors.py": { + "sha256": "c34dc37a060d05d65d42426ee18d8b6709b97a391d44645bd2a53cf00498bba4", + "size": 2393, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/ingest/llm_ocr_engine.py": { + "sha256": "272601879eb5385647ac8c83de0b284afb50478206eb50666ce41f8aa873f4f5", + "size": 3217, + 
"mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/ingest/models.py": { + "sha256": "3b53b02eed0e19a9a26ed0517d7d4b616ac5c977304192fd5cf8784aea9e8869", + "size": 1946, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/ingest/multi_extractor.py": { + "sha256": "7f247baa1d3afbc80ab2c1cf0f42f7afc5ffdefa39286f2d8aeaf38c2d148245", + "size": 654, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/ingest/ocr_engines.py": { + "sha256": "ba47880fd2377b2944c01d187bc17fec9f737a5b5d4dbfcefbf6f87652aa3974", + "size": 628, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/ingest/pdf2image_renderer.py": { + "sha256": "c5ea85769684a18463de64ef93aae76022956427fd7c6fbb4df0b33441682461", + "size": 1426, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/ingest/pdf_to_images.py": { + "sha256": "874eefb30ad8686f372b6434df9fe03cf610eb5a07607d5ff04042197deee76b", + "size": 1367, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/ingest/pipeline.py": { + "sha256": "8b0da8849f88b692649b7507efbd4b82acabd8a5b595ddf6081e604e4e14bf64", + "size": 2271, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/ingest/retry_planner.py": { + "sha256": "f12ff25a8bdcd2926ab94fe4fb3029025de464a3ffff1dca3a633184cb3f80bd", + "size": 9916, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/ingest/validation.py": { + "sha256": "3ace74b6bfc761587909f7042e108fe022dafb773f691f64431aeffcdc795385", + "size": 7881, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/ingest/validation_evidence.py": { + "sha256": "7319cff75ffd5225e495eda46b584c7b5f424fbcc28346b228cc91bc4732984a", + "size": 4527, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + 
"agent_ext/ingest/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/mcp/client.py": { + "sha256": "efd92d73195f89cc59763c959e342cd19f2a994a3d4974d63d1326b7225bd80a", + "size": 684, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/mcp/registry.py": { + "sha256": "7af137c6cc0378f6ee3f4817718c1039937b104fac97771bed32e4e089739ca8", + "size": 974, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/mcp/server.py": { + "sha256": "3d11498510442e0c8795648bc787daee1e0548a4322b7aa1dc28d3e3c15b4a89", + "size": 752, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/mcp/transport.py": { + "sha256": "b1b44eb876c86a696a57d58bc5ea17ca7dcecac427946fc74e887b891cffe233", + "size": 381, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/mcp/types.py": { + "sha256": "2f1582a914e132721e8fc1e4188202e03306ffef66838691d4524026245e4d2b", + "size": 540, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/mcp/__init__.py": { + "sha256": "297b1b33df21dcea4e76d69e810ec2d0cf24d6b5083cb45adabdc5640e323873", + "size": 186, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/memory/base.py": { + "sha256": "1f91487915d626cedb710c2a29b9007b3c3ac2f4d2e3d171e416b95c554adcc4", + "size": 251, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/memory/summarize.py": { + "sha256": "eb77f0c4e7474965ae03d31a216ef1305a8fde61736ad3106c204786c253edbc", + "size": 5624, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/memory/window.py": { + "sha256": "8e6c3ab9cf02ba23e867246e2efd453395b56b22439fb126646e9ebe37516278", + "size": 585, + "mtime": 1772164906.9608028, + "lang": 
"python", + "skipped": false + }, + "agent_ext/memory/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/modules/loader.py": { + "sha256": "6f02e29cae3d9212a98dbe3d5781258d194728ed3733ec19484ce0e36651e5ad", + "size": 268, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/modules/registry.py": { + "sha256": "d158c1a7ae5f7471893950b2179a6d613ba92b387ab16292745731e3aaedcf47", + "size": 3535, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/modules/spec.py": { + "sha256": "1dea50cce99a12f079b40cd9fd074961dd2d60a9d35957c425aaf2cda97ac5c5", + "size": 1160, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "agent_ext/modules/__init__.py": { + "sha256": "8b30064b73eba111f7fd219f7dbbb83f8159666b683898bd3a1d11d5c8dc7c79", + "size": 95, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/research/controller.py": { + "sha256": "8f16df7754673af29e350cb296a9d1f35177faaccf6a5f16c4356cd1f851de00", + "size": 4048, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/research/evidence_graph.py": { + "sha256": "8b08ba0ef5bc9456b6b3ce007ff84bc03e8d14789147c71a47f30ac70716fe80", + "size": 1454, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/research/executor.py": { + "sha256": "72712b68994d6712fcddd9421a853ae3b2f9dd82bafb20c3bc84c58b0d2e720f", + "size": 2056, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/research/executor_parallel.py": { + "sha256": "5f4a42bc530f91888e0097495ed3506bac1df6732f3d9ecb688fe6dbbcbc61b6", + "size": 464, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/research/gap_analysis.py": { + "sha256": 
"fac570150f16d63fcb04537e61d859e10137eb2dcf815aaa50c7f4741f937eaf", + "size": 2101, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/research/handlers_default.py": { + "sha256": "4349cc3515b89c3b0728e4cdf2814cb0bbfbcfc1a65b204ec8e5e7cdd0dda3b4", + "size": 2965, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/research/ledger.py": { + "sha256": "517e90f35dd012376662420a3c6d3fd664fb86228e46feb10cf0c3d7bd85ca11", + "size": 3181, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/research/models.py": { + "sha256": "26aa43dfc672344ccfc2a77abd9f517d4ef8d8673496ab8f5c531085d8423623", + "size": 2445, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/research/planner.py": { + "sha256": "148995decacfbfc026ff5c75c343fe373fbc42eea738c0ceb8aee1529831753e", + "size": 1837, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/research/synth.py": { + "sha256": "14adbf43906096c4ba65384a040e13811428834fff7c3e295d0460a753db9b0b", + "size": 2770, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/research/__init__.py": { + "sha256": "a714c5714d91984a14b8bdd76a597b6d0f3b609c03f3c5bb6b00b62fa61e6cb5", + "size": 210, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/rlm/policies.py": { + "sha256": "88b583e28bb7f5d133f14c666ea9c28cc906fc6a38d7ea90d32ef6702eaf40dd", + "size": 227, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/rlm/python_runner.py": { + "sha256": "ceb2ae9cfa301e21e4fb985def6894e121124a43328f5801d92e188616b2a280", + "size": 1686, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/rlm/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9568028, + "lang": 
"python", + "skipped": false + }, + "agent_ext/search/bm25.py": { + "sha256": "46b2409a96cf856864eaecc8423d138e4cf54ad433e2f70ab903973a28fd4335", + "size": 5428, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/search/index.py": { + "sha256": "9036aea30a3a72f97b477849d114cd4cb42a3b49c2210cc857817b12fbfa367d", + "size": 4129, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/search/store.py": { + "sha256": "4734b5ee96dfd168d989a79dc2d53c50d787f9442a14cac354d2c199638a25ff", + "size": 1118, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/search/tokenize.py": { + "sha256": "029365fa6e756ded1a4b90371bd75d79d1fd2b05d3c953c9d2ebc50ea362beb7", + "size": 1790, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/search/__init__.py": { + "sha256": "5a9abc624db868a517d952d5c08da05404994ad56b73c05a8e2f60a5b873f5db", + "size": 115, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/self_improve/controller.py": { + "sha256": "fd45c9cb3934c6c77a867d7a13397cda737454f0b511f8c98f1bb8123904193d", + "size": 2311, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/self_improve/gates.py": { + "sha256": "ff3be7c795c389a1dc1ff4038535bffd8e6647fdbd56fb82fc17842b384b6e9f", + "size": 2144, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/self_improve/models.py": { + "sha256": "9d3f75504546fc7e950714f8fad312458b39506933523237c77c4d9a9165cd25", + "size": 994, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/self_improve/patching.py": { + "sha256": "3139471aa6fb98b45851e71d6e7b7c356ed2111c0c3805a8f7f51bb0d8ec1222", + "size": 8333, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/self_improve/triggers.py": { + "sha256": 
"7029b25ca1522c46a736c01df2e2706b4fa886a26683b8ceb8c23bfe6f6c4a56", + "size": 1430, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/self_improve/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/skills/loader.py": { + "sha256": "a2dbafe898b55ced7c5cd180665ed7a50f991dcb2529e3a3a47ba8a2db10ad13", + "size": 697, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/skills/models.py": { + "sha256": "98ca3bf2163a48103059cdb2692c6dd214a7fcf78080cb0f194384fdaab8b0d4", + "size": 641, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/skills/registry.py": { + "sha256": "2ca28801ddedc810b26552a1669d3d3271cfa759b0e1667e4f659559fd4063fb", + "size": 1709, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/skills/selector.py": { + "sha256": "dcd1daa1f2697fceec1428be7bd52602533fd60bd920bf1051898b167b51cd07", + "size": 987, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/skills/toolset.py": { + "sha256": "92ad9fa4b89e3f0880789bac629db542aff53dfb39fe078278751a05a88b5919", + "size": 1032, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/skills/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/subagents/base.py": { + "sha256": "92d3fde3e9a6249632bd7fc338b2a3e203e8c5ce4ad4aa0b76bfd989bc3d33a8", + "size": 389, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/subagents/orchestrator.py": { + "sha256": "8ce674aea22a971ec60b9f86f8dd4583246ceacde8ca14040662bc97a72de23c", + "size": 1530, + "mtime": 1772164906.9608028, + "lang": "python", + 
"skipped": false + }, + "agent_ext/subagents/registry.py": { + "sha256": "676e076f95f2d89f8fb76718ca9630c6c31f7bb913ae4461a846a01aaa7e4bdd", + "size": 429, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/subagents/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/todo/events.py": { + "sha256": "daa00b74bf18ecd453088051ab07fd335657ca27ebb4ef34e31ea833e0b30a5e", + "size": 1891, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/todo/models.py": { + "sha256": "d14cfd2935fb0ce1a59ce1997aba5eee6c8a6dfc57a4d68c4882766e22e5ee69", + "size": 2565, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/todo/store_base.py": { + "sha256": "5d20647c8da484d0495e8e0f3d28aac0972e63d5ed7683ee21585d5752e79143", + "size": 947, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "agent_ext/todo/store_memory.py": { + "sha256": "2ea621de97da000bc2ceebe577feb58efa49d7d81298a44fe24bd45e17dc97ee", + "size": 8253, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "agent_ext/todo/store_postgres.py": { + "sha256": "56b6d11bdbffb539ae1ce80faf836480ed28f7fd0735edd4e84828297c21e119", + "size": 15323, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "agent_ext/todo/toolset.py": { + "sha256": "4f361a6d9623e59039f606b76fa9fb952db2c0c6cb3fdda9e75b884cc7b3c6da", + "size": 2579, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "agent_ext/todo/__init__.py": { + "sha256": "ed809582f2d620b833d786d9151d5142f05ef8e97c974e718b56fceef297bc3f", + "size": 308, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/events.py": { + "sha256": "3b42cc1069f6a063a96887138f5969bdc6ae140f92edb0a182ddafde05bfa9f7", + 
"size": 620, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/gitops.py": { + "sha256": "6961d670b1153650f17b790e31bd3edfc936bff38bd6998783054c2895219efb", + "size": 887, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/jupyter.py": { + "sha256": "749c2fd8cbc4cccb4cc8189f54e01d532b5960f3a534e131b6015ba60a9001dd", + "size": 1762, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/limits.py": { + "sha256": "8cef0904e3729c2771cab89fd49adcfcf8762851c70889ddc8ea0d6a8d0beeab", + "size": 399, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/locks.py": { + "sha256": "e4e387048edd07070837499debeea3b08f341361a17c645034ccb2c3448ca839", + "size": 1694, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/loop.py": { + "sha256": "69a5d3235376d6c46d104b655a60e8ea74f8b01dba9e92313dc6ba9759e9599b", + "size": 16014, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/models.py": { + "sha256": "93d13c4deeec829a013c34b6d64f6bb5e66934c3ed545b3a7e6d48cbf4be9f52", + "size": 1269, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/parallel.py": { + "sha256": "a42c1ee796106ac0d584a373b6559a4837f93d933fd8739a3e237a5387d7a2c6", + "size": 517, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/planner.py": { + "sha256": "bf1eb5085bc7ea9842684760966a65be30939931a1bb82edf5b4aa6b329d4d3d", + "size": 2627, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/runtime.py": { + "sha256": "f77c82dc0e9fa2003a0ad25685ec6bae028b2725b9314351a4069e026665ed0f", + "size": 5213, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/subagents.py": { 
+ "sha256": "c81b3d8e289c21542cf348a61395b36492a95525acc224554e4080d2fc568aff", + "size": 6075, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/tui.py": { + "sha256": "ebd0d10bc963f165ddcd5eca8bd02d5a88b0db8062762e73e523c0c0e34c4b68", + "size": 1132, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/tui_async.py": { + "sha256": "11bef3d4723279a13108407ef24b9c2d0f9643b26f6dd03192605bb9a366159f", + "size": 31105, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/worktrees.py": { + "sha256": "888a10fab42abde1b25d3a398a6a71995d5fbc61eb83d1c445fc93a2bd5bc7c3", + "size": 3067, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/writer_runner.py": { + "sha256": "d2f008b03ddd2ebcb8f047f3d80b48284a8d5d39ee4b93b06f93024dff781954", + "size": 1771, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/__main__.py": { + "sha256": "4eece5efb631d042b87467846a3ff77a47719e94624e69199aa80d1c4242be69", + "size": 1504, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "agent_ext/workflow/bandit.py": { + "sha256": "c04e9159977cf2b55a72bdbb4ac03ea0efd7fceb73a1e26d1e95feee76beed8f", + "size": 1196, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workflow/builtins.py": { + "sha256": "ebe62d6a016acc6ae4550e790da4b1572d0b49558641b84735e3a0c8f043c513", + "size": 3603, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workflow/executor.py": { + "sha256": "4b95fc2d94f5d1e567ce58754990887f6e80f0dc1658385ae3275d6772ab8245", + "size": 1502, + "mtime": 
1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workflow/experience.py": { + "sha256": "984ff9198880f9f3039d9ebb4e5a0ed27624b604dd9002f134c4de84cbe91c6f", + "size": 1655, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workflow/planner.py": { + "sha256": "e30793e1b9fb951ae15e3b3276434ec21532ecf4ab3797fbcd07847b77eebb6a", + "size": 2022, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workflow/registry.py": { + "sha256": "de309c9612b61b0f85df1d38a5a668c27738d9668aedf38bdf3232f5123360cf", + "size": 1392, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "agent_ext/workflow/types.py": { + "sha256": "4fbb19064b253f1c2a1b1007a2b7ac8438eccae6bf09e83ac56ec314872a7e3c", + "size": 1644, + "mtime": 1772164906.9648027, + "lang": "python", + "skipped": false + }, + "agent_ext/workflow/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/modules/builtins/core/module.py": { + "sha256": "331dcc6f2ebf895dea97ea797e95bb03f2fd5d8c28de7a74e610f11dd14ffcd9", + "size": 648, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/modules/builtins/core/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/modules/builtins/self_improve/module.py": { + "sha256": "16f1c37fc9e809a0cabe6b129ed6015983def14900b39ac522b387f82d481975", + "size": 654, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/modules/builtins/self_improve/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + 
"agent_ext/modules/builtins/workflow/module.py": { + "sha256": "2cc8f53f9ee682df28ceb6518fae4963bd8440c0d3fae1e4d51bacf608230700", + "size": 717, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/modules/builtins/workflow/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/subagents_bm25.py": { + "sha256": "a4d7025b830009c485a679f700f8454fd8fb99205d825230f8998d81d3216aed", + "size": 685, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/subagents_patch.py": { + "sha256": "b289904bcefe2163eb49438008aaae4278d7768cd56c001a53fe564260a08292", + "size": 4139, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/cog/daemon.py": { + "sha256": "6e81657e94475df3f2495097eb0d6ef0e41d82d87838c622848ca3961b0ee1d4", + "size": 2687, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/cog/loop_v2.py": { + "sha256": "f3b9499059c5806023e0e33f0d90d152d1cbf4df251a4effb3f0c743563761b4", + "size": 6992, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/cog/modes.py": { + "sha256": "45a387610946cfc5e84973b2fbcff2de186a743b903d639f2f76abaea9283e32", + "size": 892, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/cog/scoring.py": { + "sha256": "7facb76213a4e9d9a9e5806626822efae538b7143dc7811f2af3382d359ee095", + "size": 1228, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/cog/state.py": { + "sha256": "8208eea1d31d2b6fc2278495aff3d3e7a7e3eaf8779d3e8d724ed39237d1428d", + "size": 2757, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/cog/strategy_bank.py": { + "sha256": 
"ec677fd625c04b1d42bd2923d6f4178a37a44a7eaff060f18333afc2ce89db7b", + "size": 539, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/cog/triggers.py": { + "sha256": "22c1d0ad40cd358199c7513d45992512a421e116f816e5749eb92abb5de2e88f", + "size": 1040, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/cog/__init__.py": { + "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "size": 0, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/adopt.py": { + "sha256": "beb25cbc7fbf8fb4275038c34ce076638e5a2c30055673675b07198d91c94795", + "size": 4486, + "mtime": 1772164906.9568028, + "lang": "python", + "skipped": false + }, + "docs/AUTO_AGENT.md": { + "sha256": "7c8c346f073d2e7884326d7a398927acff13e9fc4061a36c6c5dbb728788ba43", + "size": 12861, + "mtime": 1772164906.9648027, + "lang": "markdown", + "skipped": false + }, + "agent_ext/cog/__main__.py": { + "sha256": "693690cba4969995911e233e6dbd4c5d0ff71feee54fac81372573cb81e54e3c", + "size": 2438, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/streaming.py": { + "sha256": "4ce9156d048a136df766d612ae5635a697162f2420d0f69e4a649104dfa073ca", + "size": 3360, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/patch_models.py": { + "sha256": "3de0b7af51e9027288c691adbb550806a658acc6528e7173ab20e528b9bc16d1", + "size": 2555, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + }, + "agent_ext/workbench/plan_models.py": { + "sha256": "1f2bab64f47b53360e12e85122e49a94237d12908c584e83d06948c4d0cb95e5", + "size": 1402, + "mtime": 1772164906.9608028, + "lang": "python", + "skipped": false + } + } +} \ No newline at end of file diff --git a/.agent_state/strategy_scores.json b/.agent_state/strategy_scores.json new file mode 100644 index 0000000..e69de29 diff --git 
a/.agent_state/triggers.json b/.agent_state/triggers.json new file mode 100644 index 0000000..e69de29 diff --git a/.agent_state/workflow_experience.json b/.agent_state/workflow_experience.json new file mode 100644 index 0000000..e69de29 diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..eb7abc9 --- /dev/null +++ b/.env.example @@ -0,0 +1,29 @@ +# Workbench / LLM (agent_ext.workbench with --use-openai-chat-model) +# Copy to .env and set values. load_dotenv() runs at startup. +LLM_BASE_URL=https://api.openai.com/v1 +LLM_API_KEY=your-api-key +LLM_MODEL=gpt-4o + +# Optional: parallel limits (defaults shown) +# MAX_PARALLEL_SUBAGENTS=4 +# MAX_PARALLEL_MODEL_CALLS=2 + +# Daemon (python -m agent_ext.cog): headless self-improving loop +# USE_OPENAI_CHAT_MODEL=1 +# AGENT_DAEMON_GOAL=keep improving the repo safely +# AGENT_LOOP_SLEEP=30 +# AGENT_MAX_IDLE=600 +# COG_MAX_STEPS=10 +# COG_MAX_MODEL_CALLS=6 +# COG_MAX_PARALLEL_WRITERS=3 +# MAX_DIFF_CHARS=60000 +# AUTO_COMMIT_THRESHOLD=80 + +# Keep implement worktree after run (default 0). If 1, worktree stays at .agent_state/worktrees/<run_id>/writer_llm_patch/ +# KEEP_WORKTREE=1 + +# Auto-adopt: apply patch to main repo and push after gates pass +# AUTO_ADOPT=1 +# AUTO_PUSH_BRANCH=dev +# ADOPT_PULL_STRATEGY=merge +# ADOPT_PUSH_RETRIES=2 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..b8f42aa --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +# When merging a PR into main/dev, do not take .agent_state from the PR branch. +# Shared learning state lives on agent-state/main; code branches should not merge state.
+# Requires: git config merge.keep-ours.driver "true" (see docs/AUTO_AGENT.md) +.agent_state/** merge=keep-ours diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..885376e --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,72 @@ +name: CI + +on: + push: + branches: [main, dev, "auto/**"] + pull_request: + branches: [main, dev] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Set up Python ${{ matrix.python-version }} + run: uv python install ${{ matrix.python-version }} + + - name: Install dependencies + run: uv sync --dev + + - name: Run tests + run: uv run python -m pytest tests/ -v --tb=short + + lint: + runs-on: ubuntu-latest + permissions: + contents: write + + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref || github.ref_name }} + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Set up Python + run: uv python install 3.12 + + - name: Install dependencies + run: uv sync --dev + + - name: Run ruff check (lint) + run: uv run ruff check agent_ext/ tests/ --fix + continue-on-error: true + + - name: Run ruff format + run: uv run ruff format agent_ext/ tests/ + + - name: Commit lint fixes (if any) + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add -A + if git diff --cached --quiet; then + echo "No lint changes to commit" + else + git commit -m "style: auto-fix lint (ruff)" + git push + fi diff --git a/.gitignore b/.gitignore index b7faf40..9615155 100644 --- a/.gitignore +++ b/.gitignore @@ -205,3 +205,65 @@ cython_debug/ marimo/_static/ marimo/_lsp/ __marimo__/ + + +# ------------------------- +# Python +# ------------------------- +__pycache__/ +*.pyc 
+*.pyo +*.pyd +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage +htmlcov/ +dist/ +build/ +*.egg-info/ + +# ------------------------- +# OS / editor +# ------------------------- +.DS_Store +.vscode/ +.idea/ + +# ------------------------- +# Agent state: default ignore everything +# ------------------------- +.agent_state/** + +# Keep curated "brain" files +!.agent_state/README.md +!.agent_state/registry.json +!.agent_state/budgets.json +!.agent_state/triggers.json +!.agent_state/failure_fingerprints.json +!.agent_state/repo_index.json +!.agent_state/workflow_experience.json +!.agent_state/strategy_scores.json +!.agent_state/modules_history.json + +# Keep directory placeholders +!.agent_state/runs/.gitkeep +!.agent_state/locks/.gitkeep + +# Ignore heavy/volatile stuff +.agent_state/runs/** +.agent_state/worktrees/** +.agent_state/tmp/** +.agent_state/workspaces/** +.agent_state/*.log +.agent_state/**/*.patch +.agent_state/**/*.diff + +# If you later add embeddings/vector caches locally: +.agent_state/embeddings/** +.agent_state/bm25_cache/** + +# ------------------------- +# Docker +# ------------------------- +*.pid diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..3db6aa4 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,7 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.11.0 + hooks: + - id: ruff + args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..385c19d --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,220 @@ +# AGENTS.md — Developer & AI Agent Guide + +## Overview + +**agent_patterns** is a self-improving, self-assembling agentic system. It provides modular, pluggable subsystems for building AI agents: middleware, subagents, memory, backends, skills, RLM, database, and a workbench TUI for interactive development. All subsystems are at feature parity with their upstream reference implementations. 
+ +--- + +## Repository Structure + +``` +/ # Root package: agent_patterns +├── AGENTS.md # This file +├── README.md # Full API documentation +├── WORKBENCH.md # Workbench usage guide +├── pyproject.toml # Project config +├── .env.example # Environment variables reference +│ +├── agent_ext/ # Main extension package +│ ├── hooks/ # Middleware system (async, context, cost, parallel, permissions) +│ │ ├── README.md +│ │ ├── base.py # AgentMiddleware ABC + legacy Hook Protocol +│ │ ├── chain.py # MiddlewareChain (async) + HookChain (legacy sync) +│ │ ├── context.py # ScopedContext with hook-type access control +│ │ ├── cost_tracking.py # Token + USD cost monitoring with budgets +│ │ ├── parallel.py # Run middleware concurrently (ALL_MUST_PASS, FIRST_WINS, MERGE) +│ │ ├── permissions.py # ALLOW/DENY/ASK tool decisions +│ │ ├── builtins.py # AuditHook, PolicyHook, ContentFilterHook, ConditionalMiddleware +│ │ └── exceptions.py # InputBlocked, ToolBlocked, BudgetExceededError, etc. +│ │ +│ ├── subagents/ # Multi-agent orchestration +│ │ ├── README.md +│ │ ├── base.py # Subagent/SubagentResult protocol +│ │ ├── registry.py # SubagentRegistry (static) + DynamicAgentRegistry +│ │ ├── orchestrator.py # SubagentOrchestrator (bounded parallel) +│ │ ├── types.py # Messages, TaskHandle, SubAgentConfig, auto-mode selection +│ │ └── message_bus.py # InMemoryMessageBus (ask/answer), TaskManager (soft/hard cancel) +│ │ +│ ├── rlm/ # Recursive Language Model — large context analysis +│ │ ├── README.md +│ │ ├── models.py # RLMConfig, REPLResult, GroundedResponse, RLMDependencies +│ │ ├── repl.py # REPLEnvironment (persistent state, llm_query, sandboxed) +│ │ ├── policies.py # RLMPolicy (legacy) +│ │ └── python_runner.py # run_restricted_python (legacy) +│ │ +│ ├── backends/ # File storage, execution, permissions +│ │ ├── README.md +│ │ ├── base.py # FilesystemBackend + ExecBackend protocols +│ │ ├── local_fs.py # LocalFilesystemBackend (sandboxed to root) +│ │ ├── sandbox_exec.py # 
LocalSubprocessExecBackend +│ │ ├── state.py # StateBackend (in-memory, for testing) +│ │ ├── permissions.py # PermissionChecker + presets (READONLY, PERMISSIVE, etc.) +│ │ └── hashline.py # Content-hash line editing for precise AI edits +│ │ +│ ├── memory/ # Context management +│ │ ├── README.md +│ │ ├── base.py # MemoryManager protocol +│ │ ├── window.py # SlidingWindowMemory (message-count + token-aware) +│ │ ├── summarize.py # SummarizingMemory (LLM dossier compression) +│ │ └── cutoff.py # Safe cutoff preserving tool call/response pairs +│ │ +│ ├── skills/ # Progressive-disclosure instruction packs +│ │ ├── README.md +│ │ ├── models.py # SkillSpec, LoadedSkill, create_skill() +│ │ ├── registry.py # SkillRegistry (directory discovery) +│ │ ├── loader.py # SkillLoader +│ │ ├── exceptions.py # SkillNotFoundError, SkillValidationError +│ │ └── registries/ # CombinedRegistry, FilteredRegistry, PrefixedRegistry +│ │ +│ ├── database/ # SQL capabilities for AI agents +│ │ ├── README.md +│ │ ├── types.py # QueryResult, SchemaInfo, TableInfo, DatabaseConfig +│ │ ├── protocol.py # DatabaseBackend protocol +│ │ └── sqlite.py # SQLiteDatabase with security controls +│ │ +│ ├── evidence/ # Evidence + citations +│ ├── todo/ # Task management (CRUD, deps, events, stores) +│ ├── workbench/ # TUI workbench (plan → run → adopt) +│ ├── cog/ # Cognitive daemon (headless self-improvement) +│ ├── self_improve/ # Patching, gates, triggers +│ ├── search/ # BM25 search index +│ ├── modules/ # Plugin module system +│ ├── mcp/ # MCP tool registry +│ ├── ingest/ # Document ingestion +│ ├── export/ # Document export +│ ├── research/ # Deep research controller +│ └── workflow/ # Workflow synthesis + execution +│ +├── tests/ # 158 tests +│ ├── test_hooks.py # Middleware: chain, context, cost, parallel, permissions +│ ├── test_subagents.py # Registries, message bus, execution modes +│ ├── test_rlm.py # REPL, persistent state, grounded response +│ ├── test_backends_new.py # State backend, 
permissions, hashline +│ ├── test_memory_new.py # Window, safe cutoff, token-based trim +│ ├── test_database.py # SQLite queries, security, schemas +│ ├── test_skills_new.py # Programmatic skills, registry composition +│ ├── test_patching.py # Diff sanitization, hunk repair, git apply +│ ├── test_planner.py # TaskQueue operations +│ ├── test_scoring.py # Score properties, score_patch +│ └── test_worktrees.py # Git worktree operations +│ +└── docs/ # Additional documentation +``` + +--- + +## Setup + +```bash +uv sync +cp .env.example .env # configure LLM endpoint +uv run python -m pytest tests/ -v # verify 158 tests pass +``` + +--- + +## Running + +```bash +# TUI workbench +uv run python -m agent_ext.workbench --use-openai-chat-model + +# Cog daemon (headless) +AUTO_ADOPT=1 uv run python -m agent_ext.cog --use-openai-chat-model +``` + +--- + +## Running Tests + +```bash +uv run python -m pytest tests/ -v +``` + +--- + +## Subsystem Quick Reference + +### Middleware (`hooks/`) +Async lifecycle hooks with scoped context, cost tracking, parallel execution, and permissions. +```python +from agent_ext.hooks import MiddlewareChain, AuditHook, PolicyHook, CostTrackingMiddleware +chain = MiddlewareChain([AuditHook(), PolicyHook(), CostTrackingMiddleware(budget_limit_usd=5.0)]) +``` + +### Subagents (`subagents/`) +Multi-agent orchestration with message bus and task management. +```python +from agent_ext.subagents import DynamicAgentRegistry, InMemoryMessageBus, TaskManager +``` + +### RLM (`rlm/`) +Sandboxed REPL for large-context analysis with sub-model delegation. +```python +from agent_ext.rlm import REPLEnvironment, RLMConfig, GroundedResponse +``` + +### Backends (`backends/`) +File storage with permissions, in-memory testing backend, hashline editing. 
+```python +from agent_ext.backends import StateBackend, PermissionChecker, READONLY_RULESET, format_hashline_output +``` + +### Memory (`memory/`) +Token-aware sliding window with safe cutoff preserving tool call pairs. +```python +from agent_ext.memory import SlidingWindowMemory +memory = SlidingWindowMemory(max_tokens=100_000, trigger_tokens=80_000) +``` + +### Skills (`skills/`) +Progressive-disclosure skills with programmatic creation and registry composition. +```python +from agent_ext.skills import create_skill, CombinedRegistry, FilteredRegistry +``` + +### Database (`database/`) +SQL capabilities with security controls. +```python +from agent_ext.database import SQLiteDatabase, DatabaseConfig +``` + +--- + +## Code Patterns + +### Adding a New Subagent +```python +from agent_ext.workbench.subagents import SubagentResult +class MyAgent: + name = "my_agent" + async def run(self, ctx, *, input, meta): + return SubagentResult(ok=True, name=self.name, output="result", meta={}) +``` + +### Adding a New Middleware +```python +from agent_ext.hooks import AgentMiddleware, InputBlocked +class MyFilter(AgentMiddleware): + async def before_run(self, ctx, prompt): + if "blocked" in str(prompt): + raise InputBlocked("Blocked content") + return prompt +``` + +### Adding a New Module +Create `agent_ext/modules/builtins/<name>/module.py` with `module_spec`.
+ +--- + +## Key Design Decisions + +- **Async-first middleware** with backward-compat sync hooks +- **Scoped context** with hook-type access control (earlier hooks only) +- **Structured patches** — LLM returns structured edits, we convert to valid unified diff +- **Worktree isolation** — each implement task in its own git worktree +- **Safe cutoff** — never split tool call/response pairs when trimming history +- **Lazy imports** — heavy deps (pydantic-ai, exporters) loaded on first use +- **Permission presets** — READONLY, DEFAULT, PERMISSIVE, STRICT +- **Hashline editing** — content-hash-tagged lines for precise AI edits diff --git a/README.md b/README.md index 296d0ae..94f6384 100644 --- a/README.md +++ b/README.md @@ -1,742 +1,500 @@ # agent_patterns -Shared patterns and extensions for building AI agents: **RunContext**, **hooks**, **evidence**, **skills**, **backends**, **memory**, **subagents**, **RLM**, **todo (task management)**, **document ingest**, **deep research**, and a [pydantic-ai](https://ai.pydantic.dev) base agent. Use one pattern or combine several; everything is designed to plug into a single `RunContext`. +A self-improving, self-assembling agentic system built on [pydantic-ai](https://ai.pydantic.dev). Modular subsystems that plug together: **middleware**, **subagents**, **RLM code execution**, **backends**, **memory**, **skills**, **database**, **todo**, **evidence**, **document ingest**, **deep research**, and an interactive **workbench TUI** for goal → plan → run → adopt workflows. ---- - -## Setup - -**Optional dependency (pydantic-ai):** The core package does **not** depend on `pydantic-ai`, so you can use agent_patterns (hooks, todo, ingest, evidence, etc.) without pulling in pydantic-ai or its transitive deps (e.g. Starlette). If your app **already has** pydantic-ai installed, **PydanticAIAgentBase** and **LLMVisionOCREngine** will use your version when you import them. 
If you need the agent/vision OCR and don’t have pydantic-ai yet, install the extra: +Use one subsystem or all of them. Everything composes through pydantic-ai's `FunctionToolset` API and our `AgentPatterns` batteries-included agent. ```bash -pip install agent-patterns[agent] -# or -uv add "agent-patterns[agent]" +uv sync && uv run python -m agent_ext.workbench --use-openai-chat-model ``` -Put the **parent** of this repo on `PYTHONPATH`: +--- -```bash -# From the directory that contains agent_patterns (e.g. monorepo root): -export PYTHONPATH="$(pwd):$PYTHONPATH" -uv run python -c "from agent_ext import PydanticAIAgentBase, RunContext; print('OK')" -``` +## Table of Contents + +- [Quick Start](#quick-start) +- [AgentPatterns — Batteries-Included Agent](#agentpatterns--batteries-included-agent) +- [Workbench TUI](#workbench-tui) +- [Cog Daemon (Headless)](#cog-daemon-headless) +- [Subsystems](#subsystems) + - [Middleware](#middleware) + - [Subagents](#subagents) + - [RLM (Code Execution)](#rlm-code-execution) + - [Backends](#backends) + - [Memory](#memory) + - [Skills](#skills) + - [Database](#database) + - [Todo](#todo) + - [Evidence](#evidence) + - [Document Ingest](#document-ingest) + - [Deep Research](#deep-research) +- [Toolset Factories](#toolset-factories) +- [Setup](#setup) +- [Environment Variables](#environment-variables) +- [Running Tests](#running-tests) -From inside `agent_patterns`: +--- + +## Quick Start ```bash -export PYTHONPATH="$(cd .. && pwd):$PYTHONPATH" -uv run python your_script.py +# Install +uv sync + +# Configure +cp .env.example .env +# Edit .env with your LLM_BASE_URL, LLM_API_KEY, LLM_MODEL + +# Run the interactive workbench +uv run python -m agent_ext.workbench --use-openai-chat-model + +# Or use programmatically +uv run python -c " +from agent_ext.agent import AgentPatterns +agent = AgentPatterns('openai:gpt-4o', toolsets=['console', 'todo']) +print(type(agent)) +" ``` --- -## 1. 
RunContext (the core) +## AgentPatterns — Batteries-Included Agent -Every pattern expects a **RunContext**: one object that carries identity, policy, logging, storage, and optional subsystems for the run. +`AgentPatterns` inherits from pydantic-ai's `Agent` and auto-wires all subsystems. Pass toolset names as strings and get a fully-equipped agent: ```python -from agent_patterns.run_context import RunContext, Policy - -# You implement these protocols (or use your own adapters): -# - Cache (get/set) -# - Logger (info/warning/error) -# - ArtifactStore (put_bytes, get_bytes, put_json, get_json) - -ctx = RunContext( - case_id="case-1", - session_id="sess-1", - user_id="user-1", - policy=Policy(allow_tools=True, allow_exec=False, allow_fs_write=False), - cache=my_cache, - logger=my_logger, - artifacts=my_artifact_store, - trace_id="optional-trace-id", +from agent_ext.agent import AgentPatterns +from agent_ext.memory import SlidingWindowMemory + +# Coding assistant with filesystem + task management + memory +agent = AgentPatterns( + "openai:gpt-4o", + instructions="You are a helpful coding assistant.", + toolsets=["console", "todo"], + memory=SlidingWindowMemory(max_tokens=100_000), ) -# Optional: inject subsystems (set by your composition root) -ctx.skills = skill_registry_or_loader -ctx.backends = {"fs": fs_backend, "exec": exec_backend} -ctx.subagents = subagent_registry -ctx.memory = memory_manager -ctx.rlm = rlm_policy -ctx.todo = todo_toolset # TodoToolset(store, events=...) for task CRUD +result = await agent.run("List all Python files and create a review task for each") ``` -- **Policy**: `allow_tools`, `allow_exec`, `allow_fs_write`, `max_tool_calls`, `max_runtime_s`, `redaction_level`. Use with **PolicyHook** to enforce at runtime. -- **ArtifactStore**: store blobs and JSON for auditability; ingest and research write artifacts keyed by `case_id` / `session_id`. 
+### Factory Methods ---- +```python +# Console agent — ls, read, write, edit, grep, execute +agent = AgentPatterns.with_console("openai:gpt-4o") -## 2. Hooks (audit, policy, custom) +# Data analysis — sandboxed Python REPL with sub-model delegation +agent = AgentPatterns.with_rlm("openai:gpt-4o", sub_model="openai:gpt-4o-mini") -**Hooks** run at defined points around a run: before/after run, before/after model request/response, before/after tool call/result, and on error. Implement the `Hook` protocol and chain them. +# Database — SQL queries with read-only protection +agent = AgentPatterns.with_database("openai:gpt-4o") -```python -from agent_ext import HookChain, AuditHook, PolicyHook, ContentFilterHook, BlockedToolCall, BlockedPrompt, make_blocklist_filter - -# Built-in: logging and timing -audit = AuditHook() - -# Built-in: enforce ctx.policy (e.g. block tools if allow_tools=False) -policy = PolicyHook() - -# Block dangerous prompts before they reach the LLM (raises BlockedPrompt on match) -DANGEROUS = ["ignore previous", "jailbreak", "disregard instructions"] # your blocklist -content_filter = ContentFilterHook(filter_fn=make_blocklist_filter(DANGEROUS, reason="Prompt blocked by policy")) -# Or custom filter that can raise BlockedPrompt or redact: -# def my_filter(ctx, payload, phase): ... -# content_filter = ContentFilterHook(filter_fn=my_filter) - -chain = HookChain([audit, content_filter, policy]) - -# In your agent loop (catch BlockedPrompt so dangerous prompts never reach the LLM): -chain.before_run(ctx) -try: - request = chain.before_model_request(ctx, request) - response = get_model_response(request) - response = chain.after_model_response(ctx, response) - # ... tool calls: chain.before_tool_call(ctx, call), chain.after_tool_result(ctx, result) - outcome = ... -except BlockedPrompt as e: - outcome = "I can't process that request." 
# or your safe fallback; no LLM call -except Exception as e: - outcome = chain.on_error(ctx, e) -chain.after_run(ctx, outcome) +# Everything at once +agent = AgentPatterns.with_all("openai:gpt-4o") ``` -- **BlockedToolCall**: raised by PolicyHook when a tool is disallowed; handle it in your runner. -- **BlockedPrompt**: raise from a content filter to **block the request before it reaches the LLM**. The runner should catch it and not call the model (e.g. return a safe message). Use **make_blocklist_filter(patterns, reason=...)** to block requests whose text matches any pattern (substring or regex). -- **ContentFilterHook**: runs your `filter_fn` on every **before_model_request** (so blocking works always) and on **after_model_response** when `ctx.policy.redaction_level` is not `"none"`. Filter can return modified payload or raise **BlockedPrompt** to block the request before it reaches the LLM. Use for blocklists, PII redaction, or moderation APIs. -- **Custom hooks**: implement `Hook` (e.g. rate limiting, metrics) and prepend/append to the chain. - ---- - -## 3. Evidence and citations +### Composing Toolsets -**Evidence** is the common shape for “something produced by a step”: text, entities, findings, doc extracts, etc. It carries **citations** (source_id, locator, quote) and **provenance** (produced_by, artifact_ids). 
+Pass names (auto-created) or `FunctionToolset` instances: ```python -from agent_ext import Evidence, Citation, Provenance - -ev = Evidence( - kind="finding", - content="The report states X.", - citations=[Citation(source_id="doc-1", locator="page:3", quote="...", confidence=0.9)], - provenance=Provenance(produced_by="ingest_pipeline", artifact_ids=["doc-1"]), - confidence=0.8, - tags=["pii:redacted"], +from agent_ext.rlm import create_rlm_toolset +from agent_ext.backends.console import create_console_toolset + +agent = AgentPatterns( + "openai:gpt-4o", + toolsets=[ + "todo", # by name + create_console_toolset(), # pre-configured instance + create_rlm_toolset(code_timeout=120), # custom settings + ], ) ``` -Used by: **ingest** (extractors produce Evidence), **research** (tasks produce Evidence, synth consumes it), and any agent that needs to pass structured findings with sources. +### Available Toolsets ---- - -## 4. Skills (discovery and loading) +| Name | Tools | Use Case | +|------|-------|----------| +| `"console"` | ls, read_file, write_file, edit_file, grep, glob_files, execute | File operations + shell | +| `"rlm"` | execute_code | Sandboxed Python for data analysis | +| `"database"` | list_tables, describe_table, sample_table, query | SQL database access | +| `"subagents"` | task, check_task, list_active_tasks, cancel_task | Multi-agent delegation | +| `"todo"` | create_task, list_tasks, update_task, complete_task | Task management | -**Skills** are discovered from directories (`skills//SKILL.md`) and loaded as markdown. The registry builds a **SkillSpec** per folder; the loader reads the body and returns a **LoadedSkill**. +--- -```python -from agent_ext import SkillRegistry -from agent_ext.skills.loader import SkillLoader +## Workbench TUI -registry = SkillRegistry(roots=["skills", "vendor/skills"]) -registry.discover() -for spec in registry.list(): - print(spec.id, spec.name) +Interactive terminal UI for the self-improving agent loop. 
Think OpenCode / Claude Code style — non-blocking, parallel, streaming. + +```bash +uv run python -m agent_ext.workbench --use-openai-chat-model +``` + +### Workflow + +1. **Type a goal** (or `/plan <goal>`) — planning runs in background, prompt returns immediately +2. **`/run`** (or `/run N` for N parallel workers) — tasks execute, completions stream live +3. **`/watch`** — live-updating view of progress + LLM trace +4. **`/adopt`** — apply the generated patch to your repo +5. **`/diff`** — view the last generated patch with syntax highlighting + +### Commands + +| Command | Description | +|---------|-------------| +| `/plan <goal>` | Queue a plan (background) | +| `/run` or `/run N` | Execute tasks (N parallel workers) | +| `/run N fg` | Execute with live spinner (foreground) | +| `/watch` | Live view of run + LLM trace | +| `/tasks` | Task queue with timing + icons | +| `/diff` | Show last generated patch | +| `/adopt` | Apply last patch to repo | +| `/retry [id]` | Retry failed tasks | +| `/cancel <id>` | Cancel pending task | +| `/ask <question>` | One-off LLM question (background) | +| `/traces [N]` | Last N LLM traces | +| `/trace` | Last trace in full | +| `/status` | Run info + queue counts | +| `/stop` or `/stop all` | Cancel background runs | +| `/parallel <n>` | Set max concurrent subagents | +| `/model` | Model info | +| `/clear` | Clear screen | +| `/help` | Full command reference | + +### How It Works + +The workbench runs a **plan → search → design → implement → gates** pipeline: + +- **Plan**: LLM dynamically chooses task sequence (or fixed fallback without model) +- **Search**: BM25 index + repo grep find relevant code +- **Design**: LLM proposes
approach + file list +- **Implement**: LLM generates structured patch → applied in isolated git worktree → gates run +- **Gates**: Import check + compile check + optional pytest +- **Adopt**: Diff saved to `.agent_state/`; `/adopt` applies to main repo + +Each implement step runs in an **isolated git worktree** — concurrent patches don't interfere. The structured patch system (LLM returns `PatchOutput` JSON, we convert to valid unified diff) avoids raw diff parsing failures. --- -## 5. Backends (filesystem and exec) +## Cog Daemon (Headless) -**Backends** give the run sandboxed filesystem and optional subprocess execution. Attach them to **ctx.backends** so tools (or ingest) can use them under policy. +Fully automated self-improving loop — no TUI, runs forever. -```python -from agent_ext import LocalFilesystemBackend, LocalSubprocessExecBackend - -fs = LocalFilesystemBackend(root="/tmp/sandbox", allow_write=ctx.policy.allow_fs_write) -exec_backend = LocalSubprocessExecBackend() # runs in subprocess, respects timeout - -ctx.backends = {"fs": fs, "exec": exec_backend} - -# In a tool or pipeline: -backend = ctx.backends["fs"] -content = backend.read_text("path/relative/to/root") -# exec_backend.run(["python", "-c", "..."], timeout_s=30) +```bash +export AUTO_ADOPT=1 AUTO_PUSH_BRANCH=dev +uv run python -m agent_ext.cog --use-openai-chat-model ``` -- **FilesystemBackend**: read_text, write_text, list, glob (all scoped to root). -- **ExecBackend**: run(cmd, cwd=..., env=..., timeout_s=...). Use only when **Policy.allow_exec** is True. +The daemon runs cognitive cycles: detect triggers → choose mode (FAST/DEEP/REPAIR/EXPLORE) → parallel writers in worktrees → score patches → auto-adopt if gates pass + score threshold met → commit and push. + +Anti-thrash protection via `RegressionMemory` prevents oscillating edits. Per-runner branches support multiple agents working concurrently. --- -## 6. 
Memory (conversation shape and checkpoint) +## Subsystems + +### Middleware -**Memory** shapes the list of messages before each model request and checkpoints after each run. Two implementations: **SlidingWindowMemory** (last N) and **SummarizingMemory** (dossier + last N). +Async lifecycle hooks with 7 hook points, scoped context, cost tracking, parallel execution, and permissions. ```python -from agent_ext import SlidingWindowMemory, SummarizingMemory, SummarizeConfig -from agent_ext.memory.summarize import Dossier +from agent_ext.hooks import ( + MiddlewareChain, AuditHook, PolicyHook, + CostTrackingMiddleware, ParallelMiddleware, + AsyncGuardrailMiddleware, GuardrailTiming, + ConditionalMiddleware, middleware_from_functions, + make_blocklist_filter, ContentFilterHook, + ToolDecision, ToolPermissionResult, +) -# Option A: sliding window -memory = SlidingWindowMemory(max_messages=20) +# Cost tracking with budget enforcement +cost_mw = CostTrackingMiddleware(budget_limit_usd=5.0, cost_per_1k_input=0.01) -# Option B: summarizing (needs a summarize_fn that returns/updates a Dossier) -def my_summarize(ctx, text: str, base: Dossier) -> Dossier: - base.summary = f"Summary of: {text[:1000]}..." - return base +# Parallel validators — all must pass +parallel = ParallelMiddleware([PIIDetector(), InjectionGuard()]) -memory = SummarizingMemory( - cfg=SummarizeConfig(max_messages=80, keep_last_n=30), - summarize_fn=my_summarize, +# Async guardrail — runs alongside LLM, cancels on failure +guardrail = AsyncGuardrailMiddleware(PolicyCheck(), timing=GuardrailTiming.CONCURRENT) + +# Conditional — only run when condition met +redactor = ConditionalMiddleware( + condition=lambda ctx: ctx.policy.redaction_level != "none", + when_true=RedactionMiddleware(), ) -if hasattr(memory, "bind_ctx"): - memory.bind_ctx(ctx) -``` -- **MemoryManager** protocol: `shape_messages(messages) -> messages`, `checkpoint(messages, outcome=...)`. 
-- Attach to **ctx.memory** and/or pass into **PydanticAIAgentBase(memory=...)** so the pydantic-ai history_processor and post-run checkpoint use it. +chain = MiddlewareChain([AuditHook(), cost_mw, parallel, guardrail, redactor]) +``` ---- +**Features**: scoped context with access control (earlier hooks only), `ToolDecision.ALLOW/DENY/ASK`, per-hook timeouts, tool-name filtering, decorator-based creation via `middleware_from_functions()`. -## 7. Subagents (delegate to specialists) +### Subagents -**Subagents** are async callables that take `input` and `metadata` and return **SubagentResult**. Register them and run via **SubagentOrchestrator**. +Multi-agent orchestration with message bus, dynamic registry, and task management. ```python -from agent_ext import Subagent, SubagentResult, SubagentRegistry, SubagentOrchestrator - -class MySpecialist(Subagent): - name = "kg_proposer" - async def run(self, *, input: Any, metadata: dict) -> SubagentResult: - # e.g. call another agent or service - return SubagentResult(ok=True, output={"schema": "..."}, metadata={}) - -registry = SubagentRegistry() -registry.register(MySpecialist()) -orchestrator = SubagentOrchestrator(registry) - -ctx.subagents = registry - -# Run several in parallel -results = await orchestrator.run_many( - ctx, - [ - ("kg_proposer", evidence_list, {"source": "ingest"}), - ("other_agent", query, {}), - ], - timeout_s=60, +from agent_ext.subagents import ( + SubagentRegistry, DynamicAgentRegistry, + InMemoryMessageBus, TaskManager, + SubAgentConfig, decide_execution_mode, ) -``` -- Use from a **main agent tool**: resolve `ctx.subagents.get(name)` and `await agent.run(...)`, or call **orchestrator.run_many** with a list of (name, input, metadata). 
+# Dynamic creation at runtime with limits +registry = DynamicAgentRegistry(max_agents=10) +config = SubAgentConfig(name="researcher", description="...", instructions="...") +registry.register(config, agent_instance) ---- +# Message bus with ask/answer protocol +bus = InMemoryMessageBus() +queue = bus.register_agent("worker-1") +response = await bus.ask("parent", "worker-1", "Analyze this", task_id="t1") + +# Auto sync/async mode selection +mode = decide_execution_mode(TaskCharacteristics(estimated_complexity="complex"), config) +``` -## 8. RLM (restricted execution) +### RLM (Code Execution) -**RLM** provides a restricted Python runner and policy for executing user or model-generated code safely (e.g. for reasoning or tool-like execution). +Sandboxed REPL for large-context analysis. The LLM writes Python code to explore data, with optional `llm_query()` for sub-model delegation. ```python -from agent_ext import RLMPolicy, run_restricted_python +from agent_ext.rlm import REPLEnvironment, RLMConfig, GroundedResponse -policy = RLMPolicy() # configure allowed builtins, timeouts, etc. -ctx.rlm = policy +repl = REPLEnvironment( + context=massive_document, # str, dict, or list + config=RLMConfig(sub_model="openai:gpt-4o-mini"), +) -# Run untrusted code in a restricted environment -result = run_restricted_python("1 + 1", policy=policy) +# State persists between executions +repl.execute("print(f'Context: {len(context)} chars')") +repl.execute(""" +relevant = [l for l in context.split('\\n') if 'revenue' in l.lower()] +analysis = llm_query(f"Summarize: {relevant[:5]}") +print(analysis) +""") + +# Grounded response with citations +response = GroundedResponse( + info="Revenue grew [1] driven by expansion [2]", + grounding={"1": "increased by 45%", "2": "new markets in Asia"}, +) ``` -- Use when **Policy.allow_exec** (or a dedicated RLM flag) is True; wrap in hooks for audit. - ---- - -## 9. 
Todo (task management) +### Backends -**Todo** provides planning primitives: tasks with subtasks, dependencies, and multi-tenant scoping (case_id, session_id, user_id). Use **TaskStore** (in-memory or Postgres), optional **TaskEventBus** for task_created / task_updated / task_completed, and **TodoToolset** to expose CRUD to agents or services. +File storage with permission presets, in-memory testing, hashline editing, and composite routing. ```python -from agent_ext import ( - Task, - TaskCreate, - TaskPatch, - TaskQuery, - TaskStatus, - TaskStore, - InMemoryTaskStore, - PostgresTaskStore, - TaskEvent, - TaskEventBus, - InProcessEventBus, - WebhookEventBus, - TodoToolset, +from agent_ext.backends import ( + StateBackend, LocalFilesystemBackend, CompositeBackend, + PermissionChecker, READONLY_RULESET, PERMISSIVE_RULESET, + format_hashline_output, apply_hashline_edit, ) -# In-memory store (no deps) -store: TaskStore = InMemoryTaskStore() - -# Or Postgres (requires asyncpg) -# store = await PostgresTaskStore.connect("postgresql://...") - -# Optional: events (in-process handlers or webhooks) -bus = InProcessEventBus() -bus.on("task_completed", lambda e: print("Done:", e.task.id)) - -# Or send events to webhooks -# bus = WebhookEventBus(urls=["https://my.app/webhook"], timeout_s=10.0) - -toolset = TodoToolset(store, events=bus) +# In-memory for tests +backend = StateBackend() +backend.write_text("src/app.py", "print('hello')") -# Create task (e.g. 
from agent or planner) -t = await toolset.create_task( - TaskCreate( - title="Review document", - description="Check section 3", - priority=50, - tags=["review"], - case_id=ctx.case_id, - session_id=ctx.session_id, - user_id=ctx.user_id, - ) +# Composite: route by path prefix +composite = CompositeBackend( + default=StateBackend(), + routes={"/project/": LocalFilesystemBackend(root="/my/project", allow_write=True)}, ) -# List and filter -tasks = await toolset.list_tasks(TaskQuery(case_id=ctx.case_id, status="pending", limit=20)) - -# Update (e.g. mark done) -await toolset.update_task(t.id, TaskPatch(status="done")) - -# Dependencies and subtasks -await toolset.add_dependency(task_id="B", depends_on_task_id="A") -child = await toolset.add_subtask(parent_id="parent-id", data=TaskCreate(title="Substep 1")) +# Hashline: precise edits by line number + hash (no text matching needed) +tagged = format_hashline_output("def hello():\n return 42\n") +# 1:96|def hello(): +# 2:2a| return 42 +new_content, error = apply_hashline_edit(content, start_line=2, start_hash="2a", new_content=" return 99") ``` -- **Task**: id, title, description, status, priority, parent_id, depends_on, tags, case_id/session_id/user_id, artifact_ids, evidence_ids, meta, created_at, updated_at. -- **TaskStore** protocol: create_task, get_task, list_tasks, update_task, delete_task, add_dependency, add_subtask. -- **PostgresTaskStore** creates table `agent_tasks` and indexes on case_id, session_id, user_id, parent_id, status (requires **asyncpg**). -- Attach **TodoToolset** to **ctx** (e.g. `ctx.todo = toolset`) so agent tools can create/list/update tasks scoped to the run. +**Permission presets**: `READONLY_RULESET`, `DEFAULT_RULESET`, `PERMISSIVE_RULESET`, `STRICT_RULESET`. All deny `.env`, `.pem`, `.key`, credentials. 
-### Using Todo in an agent flow +### Memory -Use **TodoToolset** with **PydanticAIAgentBase**: set `ctx.todo = toolset` before runs, then in agent tools call `ctx.deps.todo` so the agent can create, list, update, and manage tasks scoped to the current run (case_id, session_id, user_id). - -**1. Setup: store, optional events, toolset, attach to context** +Token-aware sliding window and auto-triggering LLM summarization. Never splits tool call/response pairs. ```python -from agent_ext import ( - RunContext, - InMemoryTaskStore, - InProcessEventBus, - TodoToolset, - TaskCreate, - TaskPatch, - TaskQuery, +from agent_ext.memory import ( + SlidingWindowMemory, + SummarizationProcessor, create_summarization_processor, ) -store = InMemoryTaskStore() -bus = InProcessEventBus() -toolset = TodoToolset(store, events=bus) +# Sliding window (message or token mode) +memory = SlidingWindowMemory(max_tokens=100_000, trigger_tokens=80_000) -ctx = RunContext(case_id="case-1", session_id="sess-1", user_id="user-1", policy=..., cache=..., logger=..., artifacts=...) -ctx.todo = toolset +# Auto-triggering LLM summarizer +processor = create_summarization_processor( + model="openai:gpt-4o-mini", + trigger=("tokens", 100_000), + keep=("messages", 20), +) +# Use as pydantic-ai history_processor: +# agent = Agent("openai:gpt-4o", history_processors=[processor]) ``` -**2. 
Define an agent with todo tools (use `ctx.deps` = RunContext, `ctx.deps.todo` = TodoToolset)** +### Skills -```python -from pydantic import BaseModel, Field -from agent_ext import PydanticAIAgentBase, RunContext -from agent_ext.todo.models import TaskCreate, TaskPatch, TaskQuery -from pydantic_ai import RunContext as PAIRunContext - -class PlanOutput(BaseModel): - summary: str = Field(description="Brief summary of the plan") - -class PlannerAgent(PydanticAIAgentBase[PlanOutput]): - def __init__(self): - super().__init__( - "openai:gpt-4o-mini", - output_type=PlanOutput, - instructions="You help plan work by creating and updating tasks. Use the task tools to create, list, and update tasks.", - ) - -# Tools receive pydantic-ai RunContext; ctx.deps is our RunContext, ctx.deps.todo is the TodoToolset -agent = PlannerAgent() - -@agent.tool -async def create_task( - ctx: PAIRunContext[RunContext], - title: str, - description: str = "", - priority: int = 50, - tags: str = "", -) -> str: - """Create a task scoped to the current case/session/user.""" - if not ctx.deps.todo: - return "Task system not available." - data = TaskCreate( - title=title, - description=description or None, - priority=priority, - tags=[t.strip() for t in tags.split(",") if t.strip()], - case_id=ctx.deps.case_id, - session_id=ctx.deps.session_id, - user_id=ctx.deps.user_id, - ) - task = await ctx.deps.todo.create_task(data) - return f"Created task {task.id}: {task.title}" - -@agent.tool -async def list_tasks( - ctx: PAIRunContext[RunContext], - status: str = "pending", - limit: int = 20, -) -> str: - """List tasks for the current case/session.""" - if not ctx.deps.todo: - return "Task system not available." 
- q = TaskQuery( - case_id=ctx.deps.case_id, - session_id=ctx.deps.session_id, - user_id=ctx.deps.user_id, - status=status if status in ("pending", "in_progress", "done", "blocked", "canceled", "failed") else None, - limit=limit, - ) - tasks = await ctx.deps.todo.list_tasks(q) - if not tasks: - return "No tasks found." - lines = [f"- [{t.id}] {t.title} (status={t.status}, priority={t.priority})" for t in tasks] - return "\n".join(lines) - -@agent.tool -async def update_task_status( - ctx: PAIRunContext[RunContext], - task_id: str, - status: str, -) -> str: - """Update a task's status (e.g. in_progress, done).""" - if not ctx.deps.todo: - return "Task system not available." - patch = TaskPatch(status=status) - task = await ctx.deps.todo.update_task(task_id, patch) - if not task: - return f"Task {task_id} not found." - return f"Updated {task.id} to status={task.status}" -``` - -**3. Run the agent; it can create and manage tasks via the tools** +Progressive-disclosure instruction packs with directory discovery, programmatic creation, registry composition, and git-backed remote loading. ```python -# User asks for a plan; agent uses create_task / list_tasks / update_task_status -result = agent.run_sync( - ctx, - "Create three tasks: (1) Research competitors, (2) Draft outline, (3) Review. 
Then list pending tasks.", +from agent_ext.skills import ( + SkillRegistry, create_skill, + CombinedRegistry, FilteredRegistry, PrefixedRegistry, RenamedRegistry, ) -# result.output.summary, and tasks were created/listed via tool calls +from agent_ext.skills.registries.git import GitSkillsRegistry + +# Local discovery +local = SkillRegistry(roots=["skills"]) +local.discover() -# Later: "Mark the first task as in progress" -result2 = agent.run_sync( - ctx, - "Set 'Research competitors' to in progress.", - message_history=result.new_messages(), +# Git-backed (clone from any repo) +remote = GitSkillsRegistry( + repo_url="https://github.com/anthropics/skills", + path="skills", + target_dir="./cached-skills", ) -``` -- Always scope **TaskCreate** and **TaskQuery** with `case_id=ctx.deps.case_id`, `session_id=ctx.deps.session_id`, `user_id=ctx.deps.user_id` so tasks belong to the current run. -- Check `ctx.deps.todo` in tools if todo is optional (e.g. return a friendly message when not set). -- For more operations (subtasks, dependencies), add tools that call `ctx.deps.todo.add_subtask(parent_id, TaskCreate(...))` and `ctx.deps.todo.add_dependency(task_id, depends_on_task_id)`. +# Compose registries +combined = CombinedRegistry([local, remote]) +python_only = FilteredRegistry(combined, predicate=lambda s: "python" in s.tags) +namespaced = PrefixedRegistry(remote, prefix="remote_") ---- +# Programmatic creation (no filesystem) +skill = create_skill(id="review", name="Code Review", description="...", body="# Review\n...") +``` -## 10. Document ingest (PDF → OCR → Evidence) +### Database -**Ingest** turns documents into **Evidence** with citations: PDF → page images → OCR → validation → extraction. +SQL capabilities with SQLite and PostgreSQL backends, security controls, and a FunctionToolset. 
```python -from agent_ext import ( - RunContext, - IngestPipeline, - DocumentInput, - IngestResult, - PDFToImages, - OCREngine, - PageExtractor, - OCRValidator, - OCRValidationPolicy, - ValidationEvidenceEmitter, -) +from agent_ext.database import SQLiteDatabase, PostgresDatabase, DatabaseConfig -# Implement or use provided PDFToImages, OCREngine, PageExtractor -pipeline = IngestPipeline( - pdf_to_images=pdf_to_images_impl, - ocr_engine=ocr_engine_impl, - extractor=extractor_impl, - validator=OCRValidator(OCRValidationPolicy()), - validation_evidence_emitter=ValidationEvidenceEmitter(), - fail_fast_on_validation=True, -) +# SQLite (read-only by default) +async with SQLiteDatabase("data.db") as db: + tables = await db.list_tables() + result = await db.execute_query("SELECT * FROM users WHERE age > 25") + +# PostgreSQL +async with PostgresDatabase("postgresql://user:pass@localhost/mydb") as db: + schema = await db.get_schema() + result = await db.execute_query("SELECT COUNT(*) FROM orders") -doc = DocumentInput(artifact_id="doc-123", path="/path/to/file.pdf") -result: IngestResult = pipeline.run(ctx, doc) -# result.ocr_pages, result.evidence_chunks (List[Evidence]) +# Security: read-only, row limits, query length limits +config = DatabaseConfig(read_only=True, max_rows=1000, timeout_s=30) ``` -- **IngestResult**: doc_artifact_id, page_images, ocr_pages, evidence_chunks. Feed evidence into research or an agent context. -- **MultiExtractor**: combine multiple **PageExtractor**s; **OCRRetryAction** / retry planner for validation failures. +### Todo -### OCR with the wrapped agent (vision / LLM) +Task management with subtasks, dependencies, events, and multi-tenant scoping. -You can run **vision OCR** using our **PydanticAIAgentBase** and ingest pipeline: PDF → page images → one LLM call per page (image + prompt) → structured or plain text per page. 
This follows the same pattern as the [pydantic-ai OCR examples](https://github.com/vstorm-co/pydantic-ai-examples/tree/main/ocr_parsing) (PDF→images, send image to the model, structured output with validation), but wired to our **RunContext**, **IngestPipeline**, and **LLMVisionOCREngine**. +```python +from agent_ext.todo import InMemoryTaskStore, TodoToolset, TaskCreate, TaskQuery -1. **Structured output model** (optional): use **PageOCROutput** (and **PageOCRElement**) so the agent returns schema-validated OCR per page (`file_type`, `file_content_md`, `file_elements`). Pydantic validates the LLM response; see [pydantic-ai structured OCR](https://github.com/vstorm-co/pydantic-ai-examples/blob/main/ocr_parsing/2_ocr_with_structured_output.py) and [validation](https://github.com/vstorm-co/pydantic-ai-examples/blob/main/ocr_parsing/3_ocr_validation.py) for the idea. +store = InMemoryTaskStore() +toolset = TodoToolset(store) -2. **OCR agent**: subclass **PydanticAIAgentBase[PageOCROutput]** (or `str` for plain markdown). Use a vision-capable model (e.g. `openai:gpt-4o`) and instructions that describe the OCR task and output shape. +task = await toolset.create_task(TaskCreate(title="Review PR", tags=["review"], case_id="case-1")) +tasks = await toolset.list_tasks(TaskQuery(case_id="case-1", status="pending")) +await toolset.update_task(task.id, TaskPatch(status="done")) +``` -3. **LLMVisionOCREngine**: wraps your agent; for each page image it sends `[prompt, BinaryContent(image)]` to the agent and maps the result to **OCRPage** (e.g. `full_text` from `file_content_md`). Wire it as the pipeline’s `ocr_engine`. +### Evidence -4. **Pipeline**: **PDFToImages** (e.g. with **Pdf2ImageRenderer** from `agent_ext.ingest.pdf2image_renderer`) → **LLMVisionOCREngine** → validator (optional) → **PageExtractor** (e.g. **MarkdownDumpExtractor**). Run with **RunContext** and **DocumentInput**; get **IngestResult.ocr_pages** and **evidence_chunks**. 
+Universal output format for structured findings with citations and provenance. ```python -from agent_ext import ( - RunContext, - PydanticAIAgentBase, - IngestPipeline, - DocumentInput, - IngestResult, - PDFToImages, - LLMVisionOCREngine, - PageOCROutput, -) -from agent_ext.ingest.extractors import MarkdownDumpExtractor -from agent_ext.ingest.pdf2image_renderer import Pdf2ImageRenderer - -# 1) Structured output model (optional; use str for plain text) -# PageOCROutput has file_type, file_content_md, file_elements (list of PageOCRElement) - -# 2) OCR agent: vision model + instructions + output_type -class OCRAgent(PydanticAIAgentBase[PageOCROutput]): - def __init__(self): - super().__init__( - "openai:gpt-4o", - output_type=PageOCROutput, - instructions="You are an OCR expert. Extract text and structure from the document image. Return file_type, file_content_md (Markdown), and file_elements (element_type, element_content).", - ) - -ocr_agent = OCRAgent() -prompt = "Perform OCR on this document page. Return structured output: file_type, file_content_md, file_elements." - -# 3) Vision OCR engine: one agent run per page image -ocr_engine = LLMVisionOCREngine(ocr_agent, prompt, media_type="image/png") - -# 4) Pipeline: PDF → images (Pdf2ImageRenderer) → LLM OCR → evidence -pdf_to_images = PDFToImages(Pdf2ImageRenderer(), dpi=200) -pipeline = IngestPipeline( - pdf_to_images=pdf_to_images, - ocr_engine=ocr_engine, - extractor=MarkdownDumpExtractor(), - validator=None, -) +from agent_ext.evidence import Evidence, Citation, Provenance -ctx = RunContext(...) 
# case_id, policy, cache, logger, artifacts (required for PDFToImages + engine) -doc = DocumentInput(artifact_id="doc-1", path="/path/to/doc.pdf") -result: IngestResult = pipeline.run(ctx, doc) -# result.ocr_pages[i].full_text, result.ocr_pages[i].metadata.get("structured"), result.evidence_chunks +evidence = Evidence( + kind="finding", + content="Revenue grew 45%", + citations=[Citation(source_id="doc-1", locator="page:3", quote="...", confidence=0.9)], + provenance=Provenance(produced_by="ingest_pipeline", artifact_ids=["doc-1"]), +) ``` -- **Artifacts**: **PDFToImages** and **LLMVisionOCREngine** use **ctx.artifacts** (get_bytes/put_bytes for page images). Ensure the document is stored as an artifact or that you load it and put it before calling **pipeline.run**. -- **Concurrency**: the pipeline runs pages sequentially; for parallel page calls you could extend **LLMVisionOCREngine** or run multiple pipelines in parallel with a shared context. -- A minimal runnable demo is in **examples/ocr_with_agent_demo.py** (requires configured RunContext and artifact store). +### Document Ingest ---- +PDF → page images → OCR → validation → Evidence with citations. -## 11. Deep research (plan → execute → gaps → synthesize) +### Deep Research -**Research** runs a loop: plan tasks → execute (with kind-specific handlers) → collect evidence → gap analysis → add tasks → synthesize outcome. - -```python -from agent_ext.research import DeepResearchController -from agent_ext.research.planner import default_plan -from agent_ext.research.executor import ResearchExecutor -from agent_ext.research.handlers_default import handle_analyze, handle_synthesize # and others -from agent_ext.research.models import ResearchBudget - -planner = ResearchPlanner(plan_fn=default_plan) # or your LLM planner -handlers = {"analyze": handle_analyze, "synthesize": handle_synthesize} # add search, ingest_document, etc. 
-executor = ResearchExecutor(handlers=handlers) - -controller = DeepResearchController( - planner=planner, - executor=executor, - budget=ResearchBudget(max_steps=40, max_runtime_s=180), - enable_gap_analysis=True, - max_gap_iterations=3, - persist_snapshots=True, -) - -outcome = await controller.run(ctx, question="What is the impact of X?") -# outcome.answer, outcome.claims, outcome.plan, outcome.steps_taken -``` - -- **ResearchLedger** tracks plan, tasks, evidence, events; **EvidenceGraph** for structure; **propose_gaps** to add tasks from gaps; **build_outcome** to synthesize claims and answer. -- Handlers receive **RunContext**, **ResearchTask**, **ResearchLedger** and return **Sequence[Evidence]**. Wire ingest output or subagents into handlers (e.g. `ingest_document` handler runs **IngestPipeline**). +Plan → execute → gap analysis → synthesize. Pluggable handlers for search, ingest, analyze, synthesize. --- -## 12. Pydantic-AI agent (base + memory + tools) +## Toolset Factories -**PydanticAIAgentBase** is a pydantic-ai **Agent** that uses **RunContext** as deps, optional **memory** (history_processor + checkpoint), and safe tool-call truncation. +Every subsystem provides a `create_*_toolset()` factory that returns a pydantic-ai `FunctionToolset`: ```python -from pydantic import BaseModel, Field -from agent_ext import PydanticAIAgentBase, RunContext, SlidingWindowMemory -from pydantic_ai import RunContext as PAIRunContext - -class MyOutput(BaseModel): - answer: str = Field(description="The agent's answer") - -memory = SlidingWindowMemory(max_messages=20) - -class MyAgent(PydanticAIAgentBase[MyOutput]): - def __init__(self): - super().__init__( - "openai:gpt-4o", - output_type=MyOutput, - instructions="You are a helpful assistant.", - memory=memory, - ) - -@my_agent.tool -async def lookup(ctx: PAIRunContext[RunContext], query: str) -> str: - ctx.deps.logger.info("tool.lookup", query=query) - return "..." 
- -agent = MyAgent() -result1 = agent.run_sync(ctx, "What is 2+2?") -result2 = agent.run_sync(ctx, "And in hex?", message_history=result1.new_messages()) +from agent_ext.rlm import create_rlm_toolset +from agent_ext.database import create_database_toolset +from agent_ext.backends.console import create_console_toolset +from agent_ext.subagents import create_subagent_toolset +from agent_ext.todo import create_todo_toolset +from agent_ext.skills.pai_toolset import create_skills_toolset ``` -- **Tool calls and safe truncation**: history is truncated so **tool call pairs** are never split; use **message_kind**, **has_tool_calls**, **has_tool_returns**, **safe_truncate_messages** from **agent_ext.agent** when inspecting or trimming messages. -- **Memory**: with `memory=` set, a history_processor runs **shape_messages** and **checkpoint** runs after each **run_sync** / **run**. -- **Todo in the agent**: set `ctx.todo = TodoToolset(store, events=...)` and in tools use `ctx.deps.todo` to create/list/update tasks; see **§9 (Todo) → Using Todo in an agent flow** for a full example. - --- -## 13. Combining patterns - -Build one **RunContext** and attach the subsystems you need; then use hooks, ingest, research, and agent together. - -### Example: Agent + hooks + policy +## Setup -```python -ctx = RunContext(case_id=..., session_id=..., user_id=..., policy=Policy(allow_tools=True), ...) -chain = HookChain([AuditHook(), PolicyHook()]) -# Wrap your agent run: chain.before_run(ctx); ... 
run agent ...; chain.after_run(ctx, result) -``` +```bash +# Install all dependencies +uv sync -### Example: Ingest → Evidence → Research +# Configure environment +cp .env.example .env +# Edit .env: LLM_BASE_URL, LLM_API_KEY, LLM_MODEL -```python -ctx.artifacts = my_artifact_store -ingest_result = ingest_pipeline.run(ctx, DocumentInput(artifact_id="doc-1", path="report.pdf")) -evidence_from_doc = ingest_result.evidence_chunks +# Verify +uv run python -c "from agent_ext import AgentPatterns; print('OK')" -# Use evidence in research (e.g. in an ingest_document or analyze handler) -# or pass to a subagent / main agent as context +# Run tests +uv run python -m pytest tests/ -v ``` -### Example: Agent + memory + subagents +--- -```python -ctx.memory = SlidingWindowMemory(max_messages=30) -ctx.subagents = SubagentRegistry() # register specialists -agent = MyAgent() # PydanticAIAgentBase with memory=ctx.memory -# In a tool: result = await ctx.subagents.get("kg_proposer").run(input=ev, metadata={}) -``` +## Environment Variables -### Example: Agent + todo (task toolset) +| Variable | Purpose | Default | +|----------|---------|---------| +| `LLM_BASE_URL` | LLM API endpoint | `http://127.0.0.1:8000/v1` | +| `LLM_API_KEY` | API key | `local` | +| `LLM_MODEL` | Model name | `gpt-oss-120b` | +| `MAX_PARALLEL_SUBAGENTS` | Concurrent subagent calls | `4` | +| `MAX_PARALLEL_MODEL_CALLS` | Concurrent LLM calls | `2` | +| `AUTO_ADOPT` | Auto-commit after gates pass | `0` | +| `AUTO_PUSH_BRANCH` | Branch to push to | `dev` | +| `AUTO_COMMIT_THRESHOLD` | Min score to auto-adopt | `80` | +| `KEEP_WORKTREE` | Keep worktree after implement | `0` | +| `BM25_TOP_K` | Default search results | `20` | +| `GITHUB_TOKEN` | For git skill registry auth | (none) | -```python -store = InMemoryTaskStore() -bus = InProcessEventBus() -toolset = TodoToolset(store, events=bus) -ctx.todo = toolset +See `.env.example` for the full list. 
-# In an agent tool: create/list/update tasks scoped to ctx.case_id / session_id -# t = await ctx.deps.todo.create_task(TaskCreate(title="...", case_id=ctx.deps.case_id, ...)) -# tasks = await ctx.deps.todo.list_tasks(TaskQuery(session_id=ctx.deps.session_id)) -``` +--- -### Example: Research + ingest + orchestrator +## Running Tests -```python -async def handle_ingest_document(ctx: RunContext, task: ResearchTask, ledger: ResearchLedger): - path = task.inputs.get("path") - doc = DocumentInput(artifact_id=task.id, path=path) - result = ingest_pipeline.run(ctx, doc) - return result.evidence_chunks - -executor = ResearchExecutor(handlers={..., "ingest_document": handle_ingest_document}) -controller = DeepResearchController(planner=planner, executor=executor, ...) -outcome = await controller.run(ctx, question="...") -``` +```bash +# All tests (186 passing) +uv run python -m pytest tests/ -v -### Example: Full stack (context, hooks, backends, skills, memory, ingest, research, agent) +# Specific subsystem +uv run python -m pytest tests/test_hooks.py -v +uv run python -m pytest tests/test_database.py -v -```python -# 1. Context with all subsystems -ctx = RunContext(..., policy=policy, cache=cache, logger=logger, artifacts=artifacts) -ctx.backends = {"fs": LocalFilesystemBackend(...), "exec": LocalSubprocessExecBackend()} -ctx.skills = SkillRegistry(roots=["skills"]) -ctx.skills.discover() -ctx.memory = SummarizingMemory(cfg=..., summarize_fn=...) -ctx.subagents = registry # SubagentOrchestrator(registry) for run_many -ctx.todo = TodoToolset(InMemoryTaskStore(), events=InProcessEventBus()) # optional - -# 2. Hooks around runs -chain = HookChain([AuditHook(), PolicyHook()]) - -# 3. Ingest for documents -ingest_result = ingest_pipeline.run(ctx, doc) - -# 4. Research with ingest + custom handlers -executor = ResearchExecutor(handlers={"ingest_document": handle_ingest, "analyze": handle_analyze, ...}) -outcome = await controller.run(ctx, question=question) - -# 5. 
Agent with memory and tools that use ctx.deps (logger, backends, subagents) -agent = MyAgent() -chain.before_run(ctx) -result = agent.run_sync(ctx, user_message, message_history=...) -chain.after_run(ctx, result) +# With coverage +uv run python -m pytest tests/ --cov=agent_ext ``` --- -## Imports reference - -| Area | Import from | Key types | -|------|-------------|-----------| -| Context | `agent_ext` or `agent_patterns.run_context` | RunContext, ToolCall, ToolResult, Policy | -| Hooks | `agent_ext` | Hook, BlockedToolCall, BlockedPrompt, AuditHook, PolicyHook, ContentFilterHook, ContentFilterFn, make_blocklist_filter, HookChain | -| Evidence | `agent_ext` | Citation, Provenance, Evidence | -| Skills | `agent_ext` | SkillSpec, LoadedSkill, SkillRegistry | -| Backends | `agent_ext` | LocalFilesystemBackend, LocalSubprocessExecBackend | -| Memory | `agent_ext` | SlidingWindowMemory, SummarizingMemory | -| Memory config | `agent_ext.memory.summarize` | SummarizeConfig, Dossier | -| Subagents | `agent_ext` | Subagent, SubagentResult, SubagentRegistry, SubagentOrchestrator | -| RLM | `agent_ext` | RLMPolicy, run_restricted_python | -| Todo | `agent_ext` | Task, TaskCreate, TaskPatch, TaskQuery, TaskStatus, TaskStore, InMemoryTaskStore, PostgresTaskStore, TaskEvent, TaskEventBus, InProcessEventBus, WebhookEventBus, TodoToolset | -| Ingest | `agent_ext` | DocumentInput, IngestResult, PageImage, OCRPage, OCRSpan, PageOCROutput, PageOCRElement, IngestPipeline, PDFToImages, OCREngine, LLMVisionOCREngine, PageExtractor, OCRValidator, OCRValidationPolicy, ValidationEvidenceEmitter, MultiExtractor, OCRRetryAction | -| Research | `agent_ext.research` | DeepResearchController; planner, executor, ledger, models in agent_ext.research.* | -| Pydantic-AI agent | `agent_ext` | PydanticAIAgentBase | -| Agent memory / tools | `agent_ext.agent` | build_history_processor, checkpoint_after_run, message_kind, has_tool_calls, has_tool_returns, safe_truncate_messages | - -Root types live in 
**agent_patterns.run_context** (re-exported from **agent_ext**) so the stdlib **types** module is not shadowed. +## License + +MIT diff --git a/WORKBENCH.md b/WORKBENCH.md new file mode 100644 index 0000000..4339492 --- /dev/null +++ b/WORKBENCH.md @@ -0,0 +1,34 @@ +set these: +export LLM_BASE_URL="http://127.0.0.1:8000/v1" +export LLM_API_KEY="local" +export LLM_MODEL="gpt-oss-120b" + + +uv sync --extra docs --extra agent +python -m agent_ext.workbench --use-openai-chat-model --max-parallel-subagents 6 --max-parallel-model-calls 2 + +In the TUI (agentic, non-blocking — like OpenCode / Claude Code) + +- Type a goal or `/plan `: planning runs in background; prompt returns immediately. **Plans are dynamic** when you use `--use-openai-chat-model`: the LLM chooses the task sequence (e.g. skip analyze for small edits, add multiple search steps). Without a model, a fixed plan (analyze → search → design → implement → gates) is used. +- try: 'build me a self improving code agent' +- **Live like Cursor/Claude Code:** When you `/run`, task completions stream in real time. Use **`/watch`** to pop up a live-updating view (recent task output + LLM trace); it refreshes until you press Enter to leave. **`/watch t0003`** opens the same view and highlights that you’re watching for that task. When a run finishes, the panel shows the patch path and `/adopt` if implement ran. +- **OpenCode-style parallel execution:** `/run N` starts N **concurrent workers** that drain the task queue. You can start many runs; task completions stream below. Use `/status`, `/traces`, `/stop` while runs are in progress. +- `/ask `: one-off LLM question in background; answer prints when ready. +- To watch one run block with live trace: `/run N fg` (runs N tasks sequentially with spinner). `/status` shows how many runs are in progress and queue counts. +- `/stop`: cancel the most recent background run. `/stop all`: cancel all background runs. Interrupt at any time. 
+ +- **Concurrency:** `/parallel 8` sets max concurrent subagent calls per task. `/run N` sets N parallel workers per run (queue is drained by N workers; same idea as OpenCode’s “run multiple units of work in parallel”). + +- LLM trace streams during implement; for DAG/streaming hooks use `agent_ext.workbench.streaming` (`run_agent_streaming`, `iter_agent_dag`). + +This will already feel like: goal → plan → concurrent repo scans → iterative execution. + +--- + +**Fully automated (no TUI)** +See `docs/AUTO_AGENT.md` for design. Run the daemon: + + export USE_OPENAI_CHAT_MODEL=1 + python -m agent_ext.cog [--use-openai-chat-model] + +Set `AUTO_ADOPT=1` and `AUTO_PUSH_BRANCH` (e.g. `dev` or `auto/$(hostname)`) to auto-commit and push after gates. Adopt pulls before push and retries on conflict (see `.env.example`). \ No newline at end of file diff --git a/agent_ext/__init__.py b/agent_ext/__init__.py index 438e1d5..57a2eca 100644 --- a/agent_ext/__init__.py +++ b/agent_ext/__init__.py @@ -1,9 +1,11 @@ from __future__ import annotations + # Ensure root package is importable when run from repo root (e.g. 
uv run python main.py) def _ensure_root_importable() -> None: import sys from pathlib import Path + _root = Path(__file__).resolve().parent.parent _parent = _root.parent if _parent not in (Path(p).resolve() for p in sys.path): @@ -12,83 +14,247 @@ def _ensure_root_importable() -> None: _ensure_root_importable() - -from .run_context import RunContext, ToolCall, ToolResult -from .hooks.base import Hook, BlockedToolCall, BlockedPrompt -from .hooks.builtins import AuditHook, PolicyHook, ContentFilterHook, ContentFilterFn, make_blocklist_filter -from .hooks.chain import HookChain -from .evidence.models import Citation, Provenance, Evidence -from .skills.models import SkillSpec, LoadedSkill -from .skills.registry import SkillRegistry +# --------------------------------------------------------------------------- +# Lightweight, always-needed imports (fast: ~10ms total) +# --------------------------------------------------------------------------- from .backends.local_fs import LocalFilesystemBackend from .backends.sandbox_exec import LocalSubprocessExecBackend +from .evidence.models import Citation, Evidence, Provenance +from .export.base import Exporter +from .export.models import ExportRequest, ExportResult +from .hooks.base import AgentMiddleware, BlockedPrompt, BlockedToolCall, Hook +from .hooks.builtins import ( + AuditHook, + ConditionalMiddleware, + ContentFilterFn, + ContentFilterHook, + PolicyHook, + make_blocklist_filter, +) +from .hooks.chain import HookChain, MiddlewareChain +from .hooks.context import ContextAccessError, HookType, MiddlewareContext, ScopedContext +from .hooks.exceptions import ( + BudgetExceededError, + InputBlocked, + MiddlewareError, + MiddlewareTimeout, + OutputBlocked, + ToolBlocked, +) +from .hooks.permissions import PermissionHandler, ToolDecision, ToolPermissionResult +from .hooks.strategies import AggregationStrategy, GuardrailTiming from .memory.summarize import SummarizingMemory from .memory.window import SlidingWindowMemory -from 
.subagents.base import Subagent, SubagentResult -from .subagents.orchestrator import SubagentOrchestrator -from .subagents.registry import SubagentRegistry -from .rlm.policies import RLMPolicy -from .rlm.python_runner import run_restricted_python -from .ingest.models import DocumentInput, IngestResult, PageImage, OCRPage, OCRSpan, PageOCROutput, PageOCRElement -from .ingest.pdf_to_images import PDFToImages -from .ingest.ocr_engines import OCREngine -from .ingest.extractors import PageExtractor -from .ingest.validation import OCRValidator, OCRValidationPolicy -from .ingest.validation_evidence import ValidationEvidenceEmitter -from .ingest.pipeline import IngestPipeline -from .ingest.retry_planner import OCRRetryAction -from .ingest.multi_extractor import MultiExtractor +from .run_context import RunContext, ToolCall, ToolResult +from .skills.models import LoadedSkill, SkillSpec +from .skills.registry import SkillRegistry +from .todo.events import InProcessEventBus, TaskEvent, TaskEventBus, WebhookEventBus from .todo.models import Task, TaskCreate, TaskPatch, TaskQuery, TaskStatus -from .export.models import ExportResult, ExportRequest -from .export.base import Exporter -from .export.html_writer import HtmlExporter -from .export.docx_writer import DocxExporter -from .export.pdf_writer import PdfExporter -from .export.pptx_writer import PptxExporter - - -# Optional: pydantic-ai (agent + vision OCR). Omit from core deps to avoid version/Starlette conflicts. 
-# If your app already has pydantic-ai, these will use it; else install with: pip install agent-patterns[agent] -try: - from .ingest.llm_ocr_engine import LLMVisionOCREngine -except ImportError: - LLMVisionOCREngine = None # type: ignore[misc, assignment] -try: - from .agent.base import PydanticAIAgentBase -except ImportError: - PydanticAIAgentBase = None # type: ignore[misc, assignment] from .todo.store_base import TaskStore from .todo.store_memory import InMemoryTaskStore -from .todo.store_postgres import PostgresTaskStore -from .todo.events import TaskEvent, TaskEventBus, InProcessEventBus, WebhookEventBus from .todo.toolset import TodoToolset +# --------------------------------------------------------------------------- +# Heavy imports deferred via __getattr__ (pydantic-ai ~0.5s, exporters, etc.) +# These are only loaded when explicitly accessed by name. +# --------------------------------------------------------------------------- __all__ = [ - "RunContext", "ToolCall", "ToolResult", - "Hook", "BlockedToolCall", "BlockedPrompt", - "AuditHook", "PolicyHook", "ContentFilterHook", "ContentFilterFn", "make_blocklist_filter", + "RunContext", + "ToolCall", + "ToolResult", + # Hooks / middleware + "AgentMiddleware", + "Hook", + "BlockedToolCall", + "BlockedPrompt", + "AuditHook", + "PolicyHook", + "ContentFilterHook", + "ContentFilterFn", + "make_blocklist_filter", + "ConditionalMiddleware", "HookChain", - "Citation", "Provenance", "Evidence", - "SkillSpec", "LoadedSkill", + "MiddlewareChain", + "MiddlewareContext", + "ScopedContext", + "HookType", + "ContextAccessError", + "InputBlocked", + "ToolBlocked", + "OutputBlocked", + "BudgetExceededError", + "MiddlewareTimeout", + "MiddlewareError", + "ToolDecision", + "ToolPermissionResult", + "PermissionHandler", + "AggregationStrategy", + "GuardrailTiming", + "CostTrackingMiddleware", + "CostInfo", + "ParallelMiddleware", + "AsyncGuardrailMiddleware", + "middleware_from_functions", + "Citation", + "Provenance", + 
"Evidence", + "SkillSpec", + "LoadedSkill", "SkillRegistry", - "LocalFilesystemBackend", "LocalSubprocessExecBackend", - "SummarizingMemory", "SlidingWindowMemory", - "Subagent", "SubagentResult", - "SubagentOrchestrator", "SubagentRegistry", - "RLMPolicy", "run_restricted_python", - "DocumentInput", "IngestResult", "PageImage", "OCRPage", "OCRSpan", "PageOCROutput", "PageOCRElement", - "PDFToImages", "OCREngine", "PageExtractor", - "OCRValidator", "OCRValidationPolicy", "ValidationEvidenceEmitter", - "IngestPipeline", "OCRRetryAction", "MultiExtractor", - "Task", "TaskCreate", "TaskPatch", "TaskQuery", "TaskStatus", - "TaskStore", "InMemoryTaskStore", "PostgresTaskStore", - "TaskEvent", "TaskEventBus", "InProcessEventBus", "WebhookEventBus", + "LocalFilesystemBackend", + "LocalSubprocessExecBackend", + "SummarizingMemory", + "SlidingWindowMemory", + "Subagent", + "SubagentResult", + "SubagentOrchestrator", + "SubagentRegistry", + "RLMPolicy", + "run_restricted_python", + "DocumentInput", + "IngestResult", + "PageImage", + "OCRPage", + "OCRSpan", + "PageOCROutput", + "PageOCRElement", + "PDFToImages", + "OCREngine", + "PageExtractor", + "OCRValidator", + "OCRValidationPolicy", + "ValidationEvidenceEmitter", + "IngestPipeline", + "OCRRetryAction", + "MultiExtractor", + "Task", + "TaskCreate", + "TaskPatch", + "TaskQuery", + "TaskStatus", + "TaskStore", + "InMemoryTaskStore", + "PostgresTaskStore", + "TaskEvent", + "TaskEventBus", + "InProcessEventBus", + "WebhookEventBus", "TodoToolset", - "ExportResult", "ExportRequest", - "Exporter", "HtmlExporter", "DocxExporter", "PdfExporter", "PptxExporter", + "ExportResult", + "ExportRequest", + "Exporter", + "HtmlExporter", + "DocxExporter", + "PdfExporter", + "PptxExporter", + "LLMVisionOCREngine", + "PydanticAIAgentBase", ] -if LLMVisionOCREngine is not None: - __all__.append("LLMVisionOCREngine") -if PydanticAIAgentBase is not None: - __all__.append("PydanticAIAgentBase") \ No newline at end of file + +# Lazy-loaded 
module cache +_lazy_cache: dict[str, object] = {} + +# Map of name → (module_path, attr_name) for heavy imports +_LAZY_IMPORTS: dict[str, tuple[str, str]] = { + # Hooks (deferred heavy) + "CostTrackingMiddleware": ("agent_ext.hooks.cost_tracking", "CostTrackingMiddleware"), + "CostInfo": ("agent_ext.hooks.cost_tracking", "CostInfo"), + "ParallelMiddleware": ("agent_ext.hooks.parallel", "ParallelMiddleware"), + "AsyncGuardrailMiddleware": ("agent_ext.hooks.async_guardrail", "AsyncGuardrailMiddleware"), + "middleware_from_functions": ("agent_ext.hooks.decorators", "middleware_from_functions"), + # Subagents + "Subagent": ("agent_ext.subagents.base", "Subagent"), + "SubagentResult": ("agent_ext.subagents.base", "SubagentResult"), + "SubagentOrchestrator": ("agent_ext.subagents.orchestrator", "SubagentOrchestrator"), + "SubagentRegistry": ("agent_ext.subagents.registry", "SubagentRegistry"), + "DynamicAgentRegistry": ("agent_ext.subagents.registry", "DynamicAgentRegistry"), + "InMemoryMessageBus": ("agent_ext.subagents.message_bus", "InMemoryMessageBus"), + "TaskManager": ("agent_ext.subagents.message_bus", "TaskManager"), + "SubAgentConfig": ("agent_ext.subagents.types", "SubAgentConfig"), + # RLM + "RLMPolicy": ("agent_ext.rlm.policies", "RLMPolicy"), + "run_restricted_python": ("agent_ext.rlm.python_runner", "run_restricted_python"), + "REPLEnvironment": ("agent_ext.rlm.repl", "REPLEnvironment"), + "GroundedResponse": ("agent_ext.rlm.models", "GroundedResponse"), + "RLMConfig": ("agent_ext.rlm.models", "RLMConfig"), + # Backends (new) + "StateBackend": ("agent_ext.backends.state", "StateBackend"), + "PermissionChecker": ("agent_ext.backends.permissions", "PermissionChecker"), + "READONLY_RULESET": ("agent_ext.backends.permissions", "READONLY_RULESET"), + "PERMISSIVE_RULESET": ("agent_ext.backends.permissions", "PERMISSIVE_RULESET"), + "format_hashline_output": ("agent_ext.backends.hashline", "format_hashline_output"), + "apply_hashline_edit": 
("agent_ext.backends.hashline", "apply_hashline_edit"), + # Database + "SQLiteDatabase": ("agent_ext.database.sqlite", "SQLiteDatabase"), + "DatabaseConfig": ("agent_ext.database.types", "DatabaseConfig"), + "create_database_toolset": ("agent_ext.database.toolset", "create_database_toolset"), + "SQLDatabaseDeps": ("agent_ext.database.toolset", "SQLDatabaseDeps"), + # Console toolset + "create_console_toolset": ("agent_ext.backends.console", "create_console_toolset"), + "ConsoleDeps": ("agent_ext.backends.console", "ConsoleDeps"), + # Subagent toolset + "create_subagent_toolset": ("agent_ext.subagents.toolset", "create_subagent_toolset"), + "SubAgentDeps": ("agent_ext.subagents.toolset", "SubAgentDeps"), + # RLM toolset + "create_rlm_toolset": ("agent_ext.rlm.toolset", "create_rlm_toolset"), + # Todo toolset (pydantic-ai) + "create_todo_toolset": ("agent_ext.todo.pai_toolset", "create_todo_toolset"), + "TodoDeps": ("agent_ext.todo.pai_toolset", "TodoDeps"), + # Skills (new registries) + "create_skill": ("agent_ext.skills.models", "create_skill"), + "CombinedRegistry": ("agent_ext.skills.registries.combined", "CombinedRegistry"), + "FilteredRegistry": ("agent_ext.skills.registries.filtered", "FilteredRegistry"), + "PrefixedRegistry": ("agent_ext.skills.registries.prefixed", "PrefixedRegistry"), + # Ingest (pulls in pdf libs) + "DocumentInput": ("agent_ext.ingest.models", "DocumentInput"), + "IngestResult": ("agent_ext.ingest.models", "IngestResult"), + "PageImage": ("agent_ext.ingest.models", "PageImage"), + "OCRPage": ("agent_ext.ingest.models", "OCRPage"), + "OCRSpan": ("agent_ext.ingest.models", "OCRSpan"), + "PageOCROutput": ("agent_ext.ingest.models", "PageOCROutput"), + "PageOCRElement": ("agent_ext.ingest.models", "PageOCRElement"), + "PDFToImages": ("agent_ext.ingest.pdf_to_images", "PDFToImages"), + "OCREngine": ("agent_ext.ingest.ocr_engines", "OCREngine"), + "PageExtractor": ("agent_ext.ingest.extractors", "PageExtractor"), + "OCRValidator": 
("agent_ext.ingest.validation", "OCRValidator"), + "OCRValidationPolicy": ("agent_ext.ingest.validation", "OCRValidationPolicy"), + "ValidationEvidenceEmitter": ("agent_ext.ingest.validation_evidence", "ValidationEvidenceEmitter"), + "IngestPipeline": ("agent_ext.ingest.pipeline", "IngestPipeline"), + "OCRRetryAction": ("agent_ext.ingest.retry_planner", "OCRRetryAction"), + "MultiExtractor": ("agent_ext.ingest.multi_extractor", "MultiExtractor"), + # Exporters (pull in reportlab, docx, pptx) + "HtmlExporter": ("agent_ext.export.html_writer", "HtmlExporter"), + "DocxExporter": ("agent_ext.export.docx_writer", "DocxExporter"), + "PdfExporter": ("agent_ext.export.pdf_writer", "PdfExporter"), + "PptxExporter": ("agent_ext.export.pptx_writer", "PptxExporter"), + # Postgres task store (pulls asyncpg) + "PostgresTaskStore": ("agent_ext.todo.store_postgres", "PostgresTaskStore"), + # Pydantic-AI agent (pulls pydantic-ai ~0.5s) + "PydanticAIAgentBase": ("agent_ext.agent.base", "PydanticAIAgentBase"), + "AgentPatterns": ("agent_ext.agent.agent", "AgentPatterns"), + # LLM Vision OCR (pulls pydantic-ai) + "LLMVisionOCREngine": ("agent_ext.ingest.llm_ocr_engine", "LLMVisionOCREngine"), +} + + +def __getattr__(name: str) -> object: + if name in _lazy_cache: + return _lazy_cache[name] + if name in _LAZY_IMPORTS: + mod_path, attr = _LAZY_IMPORTS[name] + import importlib + + try: + mod = importlib.import_module(mod_path) + obj = getattr(mod, attr) + _lazy_cache[name] = obj + return obj + except (ImportError, AttributeError): + # Optional dependency not installed — return None for back-compat + _lazy_cache[name] = None # type: ignore[assignment] + return None + raise AttributeError(f"module 'agent_ext' has no attribute {name!r}") + + +__version__ = "0.1.0" diff --git a/agent_ext/agent/README.md b/agent_ext/agent/README.md new file mode 100644 index 0000000..36faf49 --- /dev/null +++ b/agent_ext/agent/README.md @@ -0,0 +1,54 @@ +# AgentPatterns — Batteries-Included Pydantic-AI Agent 
+ +`AgentPatterns` inherits from pydantic-ai's `Agent` and auto-wires all agent_patterns subsystems: middleware, memory, console tools, RLM code execution, database queries, subagent delegation, and task management. + +## Quick Start + +```python +from agent_ext.agent import AgentPatterns + +# Minimal: just a model +agent = AgentPatterns("openai:gpt-4o") + +# With toolsets (pass names or FunctionToolset instances) +agent = AgentPatterns( + "openai:gpt-4o", + instructions="You are a coding assistant.", + toolsets=["console", "todo"], +) + +# With memory +from agent_ext.memory import SlidingWindowMemory + +agent = AgentPatterns( + "openai:gpt-4o", + toolsets=["console"], + memory=SlidingWindowMemory(max_messages=50), +) +``` + +## Factory Methods + +```python +# Console agent (ls, read, write, edit, grep, execute) +agent = AgentPatterns.with_console() + +# RLM agent (sandboxed Python execution) +agent = AgentPatterns.with_rlm(sub_model="openai:gpt-4o-mini") + +# Database agent (SQL queries) +agent = AgentPatterns.with_database() + +# Kitchen sink (everything) +agent = AgentPatterns.with_all() +``` + +## Available Toolsets + +| Name | Tools | Use Case | +|------|-------|----------| +| `"console"` | ls, read_file, write_file, edit_file, grep, glob_files, execute | File operations + shell | +| `"rlm"` | execute_code | Sandboxed Python for data analysis | +| `"database"` | list_tables, describe_table, sample_table, query | SQL database access | +| `"subagents"` | task, check_task, list_active_tasks, cancel_task | Multi-agent delegation | +| `"todo"` | create_task, list_tasks, update_task, complete_task | Task management | diff --git a/agent_ext/agent/__init__.py b/agent_ext/agent/__init__.py index 416b81d..0ca16eb 100644 --- a/agent_ext/agent/__init__.py +++ b/agent_ext/agent/__init__.py @@ -1,5 +1,8 @@ +"""Agent classes — PydanticAIAgentBase (low-level) and AgentPatterns (batteries-included).""" + from __future__ import annotations +from .agent import AgentPatterns from 
.base import PydanticAIAgentBase from .memory_adapter import ( build_history_processor, @@ -12,6 +15,7 @@ __all__ = [ "PydanticAIAgentBase", + "AgentPatterns", "build_history_processor", "checkpoint_after_run", "has_tool_calls", diff --git a/agent_ext/agent/agent.py b/agent_ext/agent/agent.py new file mode 100644 index 0000000..a715f9d --- /dev/null +++ b/agent_ext/agent/agent.py @@ -0,0 +1,190 @@ +"""AgentPatterns — fully-wired pydantic-ai Agent with all subsystems. + +Inherits from pydantic-ai ``Agent`` and auto-wires middleware, memory, +and any combination of toolsets (console, RLM, database, subagents, todo). + +Example:: + + from agent_ext.agent import AgentPatterns + from agent_ext.backends import LocalFilesystemBackend + from agent_ext.backends.console import ConsoleDeps + + agent = AgentPatterns( + "openai:gpt-4o", + instructions="You are a helpful coding assistant.", + toolsets=["console", "todo"], + ) + + # Run with deps + result = await agent.run( + "List files in the current directory", + deps=ConsoleDeps(backend=LocalFilesystemBackend(root=".", allow_write=True)), + ) +""" + +from __future__ import annotations + +from typing import Any, TypeVar + +from pydantic import BaseModel +from pydantic_ai import Agent + +from ..memory.base import MemoryManager +from .memory_adapter import build_history_processor + +OutputT = TypeVar("OutputT", str, BaseModel) + + +# --------------------------------------------------------------------------- +# Toolset factory registry +# --------------------------------------------------------------------------- + + +def _get_toolset_factory(name: str) -> Any: + """Lazy-load a toolset factory by name.""" + factories = { + "console": ("agent_ext.backends.console", "create_console_toolset"), + "rlm": ("agent_ext.rlm.toolset", "create_rlm_toolset"), + "database": ("agent_ext.database.toolset", "create_database_toolset"), + "subagents": ("agent_ext.subagents.toolset", "create_subagent_toolset"), + "todo": 
("agent_ext.todo.pai_toolset", "create_todo_toolset"), + } + if name not in factories: + raise ValueError(f"Unknown toolset: {name!r}. Available: {sorted(factories.keys())}") + mod_path, attr = factories[name] + import importlib + + mod = importlib.import_module(mod_path) + return getattr(mod, attr) + + +class AgentPatterns(Agent): + """Pydantic-AI Agent with agent_patterns subsystems wired in. + + Pass toolset names as strings (auto-created) or FunctionToolset instances. + Middleware and memory are auto-integrated. + + Args: + model: Model name (e.g. ``"openai:gpt-4o"``). + instructions: System prompt. + toolsets: List of toolset names (``"console"``, ``"rlm"``, ``"database"``, + ``"subagents"``, ``"todo"``) or FunctionToolset instances. + middleware: List of ``AgentMiddleware`` instances for the hook chain. + memory: ``MemoryManager`` instance (SlidingWindowMemory, SummarizingMemory). + output_type: Pydantic model for structured output (default: ``str``). + **kwargs: Additional args passed to ``pydantic_ai.Agent``. 
+ + Example:: + + # Kitchen-sink agent with everything + agent = AgentPatterns( + "openai:gpt-4o", + instructions="You are a full-stack AI assistant.", + toolsets=["console", "rlm", "database", "todo"], + memory=SlidingWindowMemory(max_messages=50), + ) + + # Minimal agent with just console tools + agent = AgentPatterns( + "openai:gpt-4o", + toolsets=["console"], + ) + """ + + def __init__( + self, + model: str, + *, + instructions: str | None = None, + toolsets: list[str | Any] | None = None, + middleware: list[Any] | None = None, + memory: MemoryManager | None = None, + output_type: type = str, + **kwargs: Any, + ) -> None: + # Resolve toolsets: strings become factory calls, objects pass through + resolved_toolsets: list[Any] = [] + for ts in toolsets or []: + if isinstance(ts, str): + factory = _get_toolset_factory(ts) + resolved_toolsets.append(factory()) + else: + resolved_toolsets.append(ts) + + # Wire memory as history processor + history_processors = list(kwargs.pop("history_processors", [])) + if memory is not None: + history_processors.insert(0, build_history_processor(memory)) + + # Store for post-run checkpoint + self._ap_memory = memory + self._ap_middleware = middleware or [] + + super().__init__( + model, + output_type=output_type, + instructions=instructions or "", + toolsets=resolved_toolsets or None, + history_processors=history_processors or None, + **kwargs, + ) + + # -- Convenience factory methods ---------------------------------------- + + @classmethod + def with_console( + cls, + model: str = "openai:gpt-4o", + *, + instructions: str = "You are a helpful coding assistant with filesystem access.", + memory: MemoryManager | None = None, + **kwargs: Any, + ) -> AgentPatterns: + """Create an agent with console tools (ls, read, write, edit, grep, execute).""" + return cls(model, instructions=instructions, toolsets=["console"], memory=memory, **kwargs) + + @classmethod + def with_rlm( + cls, + model: str = "openai:gpt-4o", + *, + instructions: str 
= "You analyze data by writing Python code. Use execute_code to explore the context variable.", + sub_model: str | None = None, + memory: MemoryManager | None = None, + **kwargs: Any, + ) -> AgentPatterns: + """Create an agent with RLM code execution tools.""" + from ..rlm.toolset import create_rlm_toolset + + rlm_ts = create_rlm_toolset(sub_model=sub_model) + return cls(model, instructions=instructions, toolsets=[rlm_ts], memory=memory, **kwargs) + + @classmethod + def with_database( + cls, + model: str = "openai:gpt-4o", + *, + instructions: str = "You help users query and understand databases. Use the database tools to explore schemas and run queries.", + memory: MemoryManager | None = None, + **kwargs: Any, + ) -> AgentPatterns: + """Create an agent with database query tools.""" + return cls(model, instructions=instructions, toolsets=["database"], memory=memory, **kwargs) + + @classmethod + def with_all( + cls, + model: str = "openai:gpt-4o", + *, + instructions: str = "You are a powerful AI assistant with access to filesystem, code execution, database queries, and task management.", + memory: MemoryManager | None = None, + **kwargs: Any, + ) -> AgentPatterns: + """Create an agent with ALL available toolsets.""" + return cls( + model, + instructions=instructions, + toolsets=["console", "rlm", "database", "todo"], + memory=memory, + **kwargs, + ) diff --git a/agent_ext/agent/base.py b/agent_ext/agent/base.py index 3df4fd9..a78eb0b 100644 --- a/agent_ext/agent/base.py +++ b/agent_ext/agent/base.py @@ -30,9 +30,10 @@ def __init__(self): # Next turn: pass message_history so the agent sees the conversation result2 = agent.run_sync(ctx, "And that in hex?", message_history=result.new_messages()) """ + from __future__ import annotations -from typing import Any, Optional, TypeVar +from typing import Any, TypeVar from pydantic import BaseModel from pydantic_ai import Agent @@ -68,7 +69,7 @@ def __init__( *, output_type: type[OutputT] = str, # type: ignore[assignment] 
instructions: str | None = None, - memory: Optional[Any] = None, + memory: Any | None = None, **kwargs: Any, ) -> None: if memory is not None: diff --git a/agent_ext/agent/memory_adapter.py b/agent_ext/agent/memory_adapter.py index ae2196a..2568e4d 100644 --- a/agent_ext/agent/memory_adapter.py +++ b/agent_ext/agent/memory_adapter.py @@ -9,9 +9,11 @@ with ToolCallPart and the following ModelRequest with ToolReturnPart). - Full round-trip: generic dicts keep _original so tool calls are preserved. """ + from __future__ import annotations -from typing import TYPE_CHECKING, Any, Callable, List, Optional +from collections.abc import Callable +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from agent_ext.memory.base import MemoryManager @@ -20,6 +22,7 @@ # --- Tool call inspection and safe truncation --- + def message_kind(msg: Any) -> str: """ Classify a pydantic-ai message for inspection. @@ -62,11 +65,11 @@ def has_tool_returns(msg: Any) -> bool: def safe_truncate_messages( - messages: List[Any], + messages: list[Any], max_messages: int, *, only_before_request: bool = True, -) -> List[Any]: +) -> list[Any]: """ Truncate from the front so at most max_messages remain, without breaking tool call pairs. 
@@ -105,7 +108,7 @@ def _model_message_to_dict(msg: Any) -> dict[str, Any]: d = msg.model_dump() # ModelRequest has 'parts', ModelResponse has 'parts' parts = d.get("parts", []) - content_parts: List[str] = [] + content_parts: list[str] = [] role = "message" for p in parts: if isinstance(p, dict): @@ -165,6 +168,7 @@ def _dict_to_model_message(d: dict[str, Any]) -> Any: if role == "system": try: from pydantic_ai.messages import SystemPromptPart + return ModelRequest(parts=[SystemPromptPart(content=content)]) except ImportError: return ModelRequest(parts=[UserPromptPart(content=f"[System]\n{content}")]) @@ -198,10 +202,10 @@ def _dict_to_model_message_safe(d: dict[str, Any]) -> Any: def model_messages_to_generic( - messages: List[Any], + messages: list[Any], *, preserve_originals: bool = True, -) -> List[dict[str, Any]]: +) -> list[dict[str, Any]]: """ Convert pydantic-ai ModelMessage list to list of dicts for our memory. When preserve_originals=True (default), each dict has _original so tool calls @@ -212,12 +216,12 @@ def model_messages_to_generic( return [_model_message_to_dict_v2(m) for m in messages] -def generic_to_model_messages(generic: List[Any]) -> List[Any]: +def generic_to_model_messages(generic: list[Any]) -> list[Any]: """ Convert list of dicts (from our memory) back to pydantic-ai ModelMessage list. If a dict has _original, that message is used as-is (preserves tool calls). 
""" - out: List[Any] = [] + out: list[Any] = [] for m in generic: if isinstance(m, dict) and "_original" in m: out.append(m["_original"]) @@ -228,20 +232,20 @@ def generic_to_model_messages(generic: List[Any]) -> List[Any]: return out -def _get_memory_max_messages(memory: Any) -> Optional[int]: +def _get_memory_max_messages(memory: Any) -> int | None: """Get max_messages from SlidingWindowMemory or SummarizingMemory.cfg.""" if hasattr(memory, "max_messages"): - return getattr(memory, "max_messages") + return memory.max_messages if hasattr(memory, "cfg") and memory.cfg is not None: return getattr(memory.cfg, "max_messages", None) return None def build_history_processor( - memory: "MemoryManager", + memory: MemoryManager, *, - max_messages_for_safe_truncate: Optional[int] = None, -) -> Callable[..., List[Any]]: + max_messages_for_safe_truncate: int | None = None, +) -> Callable[..., list[Any]]: """ Build a pydantic-ai history_processor that uses our MemoryManager.shape_messages. @@ -264,8 +268,8 @@ def build_history_processor( def processor( ctx_or_messages: Any, - messages: Optional[List[Any]] = None, - ) -> List[Any]: + messages: list[Any] | None = None, + ) -> list[Any]: if messages is None: messages = ctx_or_messages ctx = None @@ -285,9 +289,9 @@ def processor( def checkpoint_after_run( - memory: "MemoryManager", - ctx: "RunContext", - all_messages: List[Any], + memory: MemoryManager, + ctx: RunContext, + all_messages: list[Any], outcome: Any, ) -> None: """ diff --git a/agent_ext/backends/README.md b/agent_ext/backends/README.md new file mode 100644 index 0000000..628aa74 --- /dev/null +++ b/agent_ext/backends/README.md @@ -0,0 +1,62 @@ +# Backends — File Storage, Execution & Permissions + +Everything an AI agent needs to work with files and execute code safely. 
+ +## Features + +- **Local Filesystem**: Sandboxed read/write/list/glob within a root directory +- **State Backend**: In-memory filesystem for testing (no disk needed) +- **Subprocess Execution**: Run commands with timeout and capture +- **Permission System**: Fine-grained access control with presets +- **Hashline**: Content-hash line editing for precise, low-token edits + +## Backends + +| Backend | Use Case | +|---------|----------| +| `LocalFilesystemBackend` | Real filesystem, sandboxed to root dir | +| `LocalSubprocessExecBackend` | Run shell commands | +| `StateBackend` | In-memory, ephemeral — perfect for tests | + +## Permission Presets + +| Preset | Read | Write | Execute | +|--------|------|-------|---------| +| `READONLY_RULESET` | ✅ | ❌ | ❌ | +| `DEFAULT_RULESET` | ✅ | Ask | Ask | +| `PERMISSIVE_RULESET` | ✅ | ✅ | ✅ | +| `STRICT_RULESET` | Ask | Ask | Ask | + +All presets deny access to `.env`, `.pem`, `.key`, credentials, etc. + +## Hashline Editing + +```python +from agent_ext.backends import format_hashline_output, apply_hashline_edit + +# Format file with hashline tags +tagged = format_hashline_output("def hello():\n return 42\n") +# 1:a3|def hello(): +# 2:f1| return 42 + +# Edit by line number + hash (no text matching needed) +new_content, error = apply_hashline_edit( + "def hello():\n return 42\n", + start_line=2, start_hash="f1", + new_content=" return 99", +) +``` + +## State Backend (Testing) + +```python +from agent_ext.backends import StateBackend + +backend = StateBackend() +backend.write_text("src/app.py", "print('hello')") +content = backend.read_text("src/app.py") + +# Rich operations +matches = backend.grep_raw("print") +result = backend.edit("src/app.py", "hello", "world") +``` diff --git a/agent_ext/backends/__init__.py b/agent_ext/backends/__init__.py index e69de29..188c1d5 100644 --- a/agent_ext/backends/__init__.py +++ b/agent_ext/backends/__init__.py @@ -0,0 +1,55 @@ +"""File storage, execution, and permission backends for AI 
agents.""" + +from .base import ExecBackend, ExecResult, FilesystemBackend +from .composite import CompositeBackend +from .console import CONSOLE_SYSTEM_PROMPT, ConsoleDeps, create_console_toolset +from .hashline import apply_hashline_edit, format_hashline_output, line_hash +from .local_fs import LocalFilesystemBackend +from .permissions import ( + DEFAULT_RULESET, + PERMISSIVE_RULESET, + READONLY_RULESET, + STRICT_RULESET, + OperationPermissions, + PermissionAction, + PermissionChecker, + PermissionOperation, + PermissionRule, + PermissionRuleset, + create_ruleset, +) +from .sandbox_exec import LocalSubprocessExecBackend +from .state import EditResult, FileData, FileInfo, GrepMatch, StateBackend, WriteResult + +__all__ = [ + # Base protocols + "ExecBackend", + "ExecResult", + "FilesystemBackend", + # Backends + "LocalFilesystemBackend", + "LocalSubprocessExecBackend", + "StateBackend", + # State backend types + "FileData", + "FileInfo", + "GrepMatch", + "EditResult", + "WriteResult", + # Permissions + "DEFAULT_RULESET", + "PERMISSIVE_RULESET", + "READONLY_RULESET", + "STRICT_RULESET", + "OperationPermissions", + "PermissionAction", + "PermissionChecker", + "PermissionOperation", + "PermissionRule", + "PermissionRuleset", + "create_ruleset", + # Hashline + "apply_hashline_edit", + "format_hashline_output", + "line_hash", +] diff --git a/agent_ext/backends/base.py b/agent_ext/backends/base.py index cbac3d6..e2820e2 100644 --- a/agent_ext/backends/base.py +++ b/agent_ext/backends/base.py @@ -1,12 +1,14 @@ from __future__ import annotations -from typing import Any, Dict, List, Optional, Protocol + +import builtins +from typing import Protocol class FilesystemBackend(Protocol): def read_text(self, path: str) -> str: ... def write_text(self, path: str, content: str) -> None: ... - def list(self, path: str) -> List[str]: ... - def glob(self, pattern: str) -> List[str]: ... + def list(self, path: str) -> builtins.list[str]: ... 
+ def glob(self, pattern: str) -> builtins.list[str]: ... class ExecResult(dict): @@ -14,4 +16,6 @@ class ExecResult(dict): class ExecBackend(Protocol): - def run(self, cmd: List[str], *, cwd: Optional[str] = None, env: Optional[Dict[str, str]] = None, timeout_s: int = 30) -> ExecResult: ... + def run( + self, cmd: list[str], *, cwd: str | None = None, env: dict[str, str] | None = None, timeout_s: int = 30 + ) -> ExecResult: ... diff --git a/agent_ext/backends/composite.py b/agent_ext/backends/composite.py new file mode 100644 index 0000000..f0511ca --- /dev/null +++ b/agent_ext/backends/composite.py @@ -0,0 +1,61 @@ +"""Composite backend — routes operations to different backends by path prefix. + +Example:: + + from agent_ext.backends import CompositeBackend, StateBackend, LocalFilesystemBackend + + backend = CompositeBackend( + default=StateBackend(), + routes={ + "/project/": LocalFilesystemBackend(root="/my/project", allow_write=True), + "/temp/": StateBackend(), + }, + ) + + backend.write_text("/project/app.py", "...") # → LocalFilesystemBackend + backend.write_text("/scratch.txt", "...") # → StateBackend (default) +""" + +from __future__ import annotations + +from .base import FilesystemBackend + + +class CompositeBackend(FilesystemBackend): + """Backend that routes operations to different backends by path prefix. + + Longest-prefix match is used, falling back to ``default``. 
+ """ + + def __init__( + self, + default: FilesystemBackend, + routes: dict[str, FilesystemBackend] | None = None, + ) -> None: + self._default = default + self._routes = routes or {} + # Sort by length (longest first) for correct matching + self._sorted_prefixes = sorted(self._routes.keys(), key=len, reverse=True) + + def _get_backend(self, path: str) -> FilesystemBackend: + for prefix in self._sorted_prefixes: + if path.startswith(prefix): + return self._routes[prefix] + return self._default + + def read_text(self, path: str) -> str: + return self._get_backend(path).read_text(path) + + def write_text(self, path: str, content: str) -> None: + self._get_backend(path).write_text(path, content) + + def list(self, path: str) -> list[str]: + return self._get_backend(path).list(path) + + def glob(self, pattern: str) -> list[str]: + # Aggregate from all backends + results: set[str] = set() + results.update(self._default.glob(pattern)) + for backend in self._routes.values(): + results.update(backend.glob(pattern)) + return sorted(results) diff --git a/agent_ext/backends/console.py b/agent_ext/backends/console.py new file mode 100644 index 0000000..132054c --- /dev/null +++ b/agent_ext/backends/console.py @@ -0,0 +1,195 @@ +"""Console toolset — gives any pydantic-ai agent file and shell capabilities. + +Tools: ls, read_file, write_file, edit_file, grep, glob_files, execute. 
+ +Example:: + + from pydantic_ai import Agent + from agent_ext.backends import create_console_toolset, ConsoleDeps, LocalFilesystemBackend + + backend = LocalFilesystemBackend(root="/workspace", allow_write=True) + toolset = create_console_toolset() + agent = Agent("openai:gpt-4o", toolsets=[toolset]) + + deps = ConsoleDeps(backend=backend) + result = await agent.run("List files in the src directory", deps=deps) +""" + +from __future__ import annotations + +import subprocess +from typing import Annotated, Any + +from pydantic import BaseModel, ConfigDict, SkipValidation +from pydantic_ai import RunContext +from pydantic_ai.toolsets import FunctionToolset + +from .permissions import PERMISSIVE_RULESET, PermissionChecker, PermissionRuleset + +CONSOLE_SYSTEM_PROMPT = """\ +## Console Tools + +You have access to filesystem tools (ls, read_file, write_file, edit_file, \ +glob, grep) and shell execution (execute). Read each tool's description for \ +detailed usage guidance. +""" + + +class ConsoleDeps(BaseModel): + """Dependencies for the console toolset.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + backend: Annotated[Any, SkipValidation] # FilesystemBackend + permissions: PermissionRuleset = PERMISSIVE_RULESET + exec_enabled: bool = False + exec_timeout: int = 30 + + +def create_console_toolset(*, toolset_id: str | None = None) -> FunctionToolset[ConsoleDeps]: + """Create a console toolset with file operations and optional shell execution. + + Returns: + FunctionToolset with ls, read_file, write_file, edit_file, grep, glob_files, execute. 
+ """ + toolset: FunctionToolset[ConsoleDeps] = FunctionToolset(id=toolset_id) + + def _check(ctx: RunContext[ConsoleDeps], op: str, path: str) -> str | None: + """Check permission; return error string or None.""" + checker = PermissionChecker(ctx.deps.permissions) + action = checker.check(op, path) # type: ignore[arg-type] + if action == "deny": + return f"Permission denied: {op} on {path}" + if action == "ask": + return f"Permission required: {op} on {path} (ask mode not implemented)" + return None + + @toolset.tool(description="List files and directories at the given path.") + async def ls(ctx: RunContext[ConsoleDeps], path: str = ".") -> str: + err = _check(ctx, "ls", path) + if err: + return err + try: + entries = ctx.deps.backend.list(path) + return "\n".join(entries) if entries else "(empty directory)" + except Exception as e: + return f"Error: {e}" + + @toolset.tool(description="Read file content. ALWAYS read a file before editing it.") + async def read_file(ctx: RunContext[ConsoleDeps], path: str, offset: int = 0, limit: int = 2000) -> str: + err = _check(ctx, "read", path) + if err: + return err + try: + content = ctx.deps.backend.read_text(path) + lines = content.split("\n") + end = min(offset + limit, len(lines)) + numbered = [f"{i + 1:>6}\t{lines[i]}" for i in range(offset, end)] + result = "\n".join(numbered) + if end < len(lines): + result += f"\n\n... ({len(lines) - end} more lines)" + return result + except Exception as e: + return f"Error: {e}" + + @toolset.tool( + description="Write content to a file. Creates the file if it doesn't exist. Prefer edit_file for existing files." + ) + async def write_file(ctx: RunContext[ConsoleDeps], path: str, content: str) -> str: + err = _check(ctx, "write", path) + if err: + return err + try: + ctx.deps.backend.write_text(path, content) + return f"Wrote {len(content)} bytes to {path}" + except Exception as e: + return f"Error: {e}" + + @toolset.tool( + description="Edit a file by replacing an exact string. 
ALWAYS read_file first. Use replace_all=True for renaming." + ) + async def edit_file( + ctx: RunContext[ConsoleDeps], + path: str, + old_string: str, + new_string: str, + replace_all: bool = False, + ) -> str: + err = _check(ctx, "edit", path) + if err: + return err + try: + content = ctx.deps.backend.read_text(path) + count = content.count(old_string) + if count == 0: + return f"Error: String not found in {path}" + if count > 1 and not replace_all: + return f"Error: String found {count} times. Use replace_all=True or provide more context." + if replace_all: + new_content = content.replace(old_string, new_string) + else: + new_content = content.replace(old_string, new_string, 1) + ctx.deps.backend.write_text(path, new_content) + return f"Replaced {'all ' + str(count) if replace_all else '1'} occurrence(s) in {path}" + except Exception as e: + return f"Error: {e}" + + @toolset.tool(description="Search for a regex pattern in files.") + async def grep(ctx: RunContext[ConsoleDeps], pattern: str, path: str = ".") -> str: + err = _check(ctx, "grep", path) + if err: + return err + try: + import re + + results: list[str] = [] + for fp in ctx.deps.backend.glob("**/*.py"): + try: + content = ctx.deps.backend.read_text(fp) + for i, line in enumerate(content.split("\n")): + if re.search(pattern, line): + results.append(f"{fp}:{i + 1}: {line.rstrip()}") + except Exception: + continue + if len(results) >= 50: + break + return "\n".join(results) if results else "No matches found." + except Exception as e: + return f"Error: {e}" + + @toolset.tool(description="Find files matching a glob pattern.") + async def glob_files(ctx: RunContext[ConsoleDeps], pattern: str) -> str: + err = _check(ctx, "glob", pattern) + if err: + return err + try: + matches = ctx.deps.backend.glob(pattern) + return "\n".join(matches) if matches else "No files found." + except Exception as e: + return f"Error: {e}" + + @toolset.tool(description="Execute a shell command. 
Only available when exec is enabled.") + async def execute(ctx: RunContext[ConsoleDeps], command: str) -> str: + if not ctx.deps.exec_enabled: + return "Error: Shell execution is disabled." + err = _check(ctx, "execute", command) + if err: + return err + try: + p = subprocess.run( + command, + shell=True, + capture_output=True, + text=True, + timeout=ctx.deps.exec_timeout, + ) + output = (p.stdout or "") + (p.stderr or "") + if len(output) > 10_000: + output = output[:10_000] + "\n... (truncated)" + return f"Exit code: {p.returncode}\n{output}".strip() + except subprocess.TimeoutExpired: + return f"Error: Command timed out after {ctx.deps.exec_timeout}s" + except Exception as e: + return f"Error: {e}" + + return toolset diff --git a/agent_ext/backends/hashline.py b/agent_ext/backends/hashline.py new file mode 100644 index 0000000..a379ac1 --- /dev/null +++ b/agent_ext/backends/hashline.py @@ -0,0 +1,102 @@ +"""Hashline: content-hash-tagged line editing for AI agents. + +Each line is tagged with a 2-character content hash. Models reference +lines by ``number:hash`` pairs instead of reproducing exact text, +eliminating whitespace-matching errors and reducing output tokens. + +Format:: + + 1:a3|function hello() { + 2:f1| return "world"; + 3:0e|} +""" + +from __future__ import annotations + +import hashlib + + +def line_hash(content: str) -> str: + """Generate a 2-char hex content hash for a line.""" + return hashlib.md5(content.encode("utf-8")).hexdigest()[:2] + + +def _split_lines(content: str) -> tuple[list[str], bool]: + has_trailing_nl = content.endswith("\n") + lines = content.split("\n") + if has_trailing_nl and lines and lines[-1] == "": + lines = lines[:-1] + return lines, has_trailing_nl + + +def format_hashline_output(content: str, offset: int = 0, limit: int = 2000) -> str: + """Format file content with hashline tags. + + Each line becomes ``{line_num}:{hash}|{content}``. 
+ """ + lines, _ = _split_lines(content) + total = len(lines) + if total == 0: + return "(empty file)" + if offset >= total: + return f"Error: Offset {offset} exceeds file length ({total} lines)" + end = min(offset + limit, total) + parts = [f"{i + 1}:{line_hash(lines[i])}|{lines[i]}" for i in range(offset, end)] + result = "\n".join(parts) + if end < total: + result += f"\n\n... ({total - end} more lines)" + return result + + +def apply_hashline_edit( + content: str, + start_line: int, + start_hash: str, + new_content: str, + end_line: int | None = None, + end_hash: str | None = None, + insert_after: bool = False, +) -> tuple[str, str | None]: + """Apply a hashline edit. Validates hashes match before applying. + + Returns ``(new_file_content, error)``. *error* is ``None`` on success. + """ + lines, has_trailing_nl = _split_lines(content) + total = len(lines) + + if start_line < 1 or start_line > total: + return content, f"Line {start_line} out of range (file has {total} lines)" + + actual_sh = line_hash(lines[start_line - 1]) + if actual_sh != start_hash: + return content, ( + f"Hash mismatch at line {start_line}: expected '{start_hash}', " + f"got '{actual_sh}'. File may have changed — re-read it first." + ) + + effective_end = start_line + if end_line is not None: + if end_line < start_line: + return content, f"end_line ({end_line}) must be >= start_line ({start_line})" + if end_line > total: + return content, f"End line {end_line} out of range (file has {total} lines)" + if end_hash is not None: + actual_eh = line_hash(lines[end_line - 1]) + if actual_eh != end_hash: + return content, ( + f"Hash mismatch at line {end_line}: expected '{end_hash}', " + f"got '{actual_eh}'. File may have changed — re-read it first." 
+ ) + effective_end = end_line + + new_lines = new_content.split("\n") if new_content else [] + + if insert_after: + result_lines = lines[:start_line] + new_lines + lines[start_line:] + else: + result_lines = lines[: start_line - 1] + new_lines + lines[effective_end:] + + result = "\n".join(result_lines) + if has_trailing_nl: + result += "\n" + return result, None diff --git a/agent_ext/backends/local_fs.py b/agent_ext/backends/local_fs.py index 4251989..aeb5d58 100644 --- a/agent_ext/backends/local_fs.py +++ b/agent_ext/backends/local_fs.py @@ -1,4 +1,5 @@ from __future__ import annotations + import glob import os @@ -18,7 +19,7 @@ def _resolve(self, path: str) -> str: def read_text(self, path: str) -> str: ap = self._resolve(path) - with open(ap, "r", encoding="utf-8") as f: + with open(ap, encoding="utf-8") as f: return f.read() def write_text(self, path: str, content: str) -> None: diff --git a/agent_ext/backends/permissions.py b/agent_ext/backends/permissions.py new file mode 100644 index 0000000..ddd0b96 --- /dev/null +++ b/agent_ext/backends/permissions.py @@ -0,0 +1,183 @@ +"""Permission system for backend operations. + +Fine-grained access control with presets (read-only, full-access, etc.) +and pattern-based rules. +""" + +from __future__ import annotations + +import fnmatch +from dataclasses import dataclass, field +from typing import Literal + +PermissionAction = Literal["allow", "deny", "ask"] +PermissionOperation = Literal["read", "write", "edit", "execute", "glob", "grep", "ls"] + + +@dataclass(frozen=True) +class PermissionRule: + """A rule matching paths/commands to an action. 
First match wins.""" + + pattern: str + action: PermissionAction + description: str = "" + + +@dataclass +class OperationPermissions: + """Permissions for a single operation type.""" + + default: PermissionAction = "allow" + rules: list[PermissionRule] = field(default_factory=list) + + def check(self, path: str) -> PermissionAction: + for rule in self.rules: + if fnmatch.fnmatch(path, rule.pattern): + return rule.action + return self.default + + +@dataclass +class PermissionRuleset: + """Complete permissions configuration for all operations.""" + + default: PermissionAction = "ask" + read: OperationPermissions | None = None + write: OperationPermissions | None = None + edit: OperationPermissions | None = None + execute: OperationPermissions | None = None + glob: OperationPermissions | None = None + grep: OperationPermissions | None = None + ls: OperationPermissions | None = None + + def get_operation_permissions(self, operation: PermissionOperation) -> OperationPermissions: + op_perms = getattr(self, operation, None) + if op_perms is not None: + return op_perms + return OperationPermissions(default=self.default) + + def check(self, operation: PermissionOperation, path: str) -> PermissionAction: + return self.get_operation_permissions(operation).check(path) + + +class PermissionChecker: + """Checks operations against a ``PermissionRuleset``.""" + + def __init__(self, ruleset: PermissionRuleset) -> None: + self.ruleset = ruleset + + def check(self, operation: PermissionOperation, path: str) -> PermissionAction: + return self.ruleset.check(operation, path) + + def is_allowed(self, operation: PermissionOperation, path: str) -> bool: + return self.check(operation, path) == "allow" + + def require(self, operation: PermissionOperation, path: str) -> None: + """Raise ``PermissionError`` if not allowed.""" + action = self.check(operation, path) + if action == "deny": + raise PermissionError(f"Operation '{operation}' denied for path: {path}") + if action == "ask": + raise 
PermissionError(f"Operation '{operation}' requires approval for path: {path}") + + +# --------------------------------------------------------------------------- +# Common sensitive file patterns +# --------------------------------------------------------------------------- + +SECRETS_PATTERNS = [ + "**/.env", + "**/.env.*", + "**/*.pem", + "**/*.key", + "**/*.crt", + "**/credentials*", + "**/secrets*", + "**/*secret*", + "**/*password*", + "**/.aws/**", + "**/.ssh/**", +] + + +def _deny_rules(patterns: list[str], desc: str) -> list[PermissionRule]: + return [PermissionRule(pattern=p, action="deny", description=desc) for p in patterns] + + +# --------------------------------------------------------------------------- +# Preset rulesets +# --------------------------------------------------------------------------- + +READONLY_RULESET = PermissionRuleset( + default="deny", + read=OperationPermissions(default="allow", rules=_deny_rules(SECRETS_PATTERNS, "Protect secrets")), + write=OperationPermissions(default="deny"), + edit=OperationPermissions(default="deny"), + execute=OperationPermissions(default="deny"), + glob=OperationPermissions(default="allow"), + grep=OperationPermissions(default="allow"), + ls=OperationPermissions(default="allow"), +) + +PERMISSIVE_RULESET = PermissionRuleset( + default="allow", + read=OperationPermissions(default="allow", rules=_deny_rules(SECRETS_PATTERNS, "Protect secrets")), + write=OperationPermissions(default="allow", rules=_deny_rules(SECRETS_PATTERNS, "Protect secrets")), + edit=OperationPermissions(default="allow", rules=_deny_rules(SECRETS_PATTERNS, "Protect secrets")), + execute=OperationPermissions(default="allow"), + glob=OperationPermissions(default="allow"), + grep=OperationPermissions(default="allow"), + ls=OperationPermissions(default="allow"), +) + +DEFAULT_RULESET = PermissionRuleset( + default="ask", + read=OperationPermissions(default="allow", rules=_deny_rules(SECRETS_PATTERNS, "Protect secrets")), + 
write=OperationPermissions(default="ask", rules=_deny_rules(SECRETS_PATTERNS, "Protect secrets")), + edit=OperationPermissions(default="ask", rules=_deny_rules(SECRETS_PATTERNS, "Protect secrets")), + execute=OperationPermissions(default="ask"), + glob=OperationPermissions(default="allow"), + grep=OperationPermissions(default="allow"), + ls=OperationPermissions(default="allow"), +) + +STRICT_RULESET = PermissionRuleset( + default="ask", + read=OperationPermissions(default="ask", rules=_deny_rules(SECRETS_PATTERNS, "Protect secrets")), + write=OperationPermissions(default="ask", rules=_deny_rules(SECRETS_PATTERNS, "Protect secrets")), + edit=OperationPermissions(default="ask", rules=_deny_rules(SECRETS_PATTERNS, "Protect secrets")), + execute=OperationPermissions(default="ask"), + glob=OperationPermissions(default="ask"), + grep=OperationPermissions(default="ask"), + ls=OperationPermissions(default="ask"), +) + + +def create_ruleset( + *, + default: PermissionAction = "ask", + allow_read: bool = True, + allow_write: bool = False, + allow_edit: bool = False, + allow_execute: bool = False, + allow_glob: bool = True, + allow_grep: bool = True, + allow_ls: bool = True, + deny_secrets: bool = True, +) -> PermissionRuleset: + """Convenience factory for custom rulesets.""" + + def _act(allowed: bool) -> PermissionAction: + return "allow" if allowed else "ask" + + secret_rules = _deny_rules(SECRETS_PATTERNS, "Protect secrets") if deny_secrets else [] + return PermissionRuleset( + default=default, + read=OperationPermissions(default=_act(allow_read), rules=secret_rules), + write=OperationPermissions(default=_act(allow_write), rules=secret_rules), + edit=OperationPermissions(default=_act(allow_edit), rules=secret_rules), + execute=OperationPermissions(default=_act(allow_execute)), + glob=OperationPermissions(default=_act(allow_glob)), + grep=OperationPermissions(default=_act(allow_grep)), + ls=OperationPermissions(default=_act(allow_ls)), + ) diff --git 
a/agent_ext/backends/sandbox_exec.py b/agent_ext/backends/sandbox_exec.py index 26d95dc..d2b6e75 100644 --- a/agent_ext/backends/sandbox_exec.py +++ b/agent_ext/backends/sandbox_exec.py @@ -1,6 +1,6 @@ from __future__ import annotations + import subprocess -from typing import Dict, List, Optional from .base import ExecBackend, ExecResult @@ -9,7 +9,9 @@ class LocalSubprocessExecBackend(ExecBackend): def __init__(self, *, enabled: bool): self.enabled = enabled - def run(self, cmd: List[str], *, cwd: Optional[str] = None, env: Optional[Dict[str, str]] = None, timeout_s: int = 30) -> ExecResult: + def run( + self, cmd: list[str], *, cwd: str | None = None, env: dict[str, str] | None = None, timeout_s: int = 30 + ) -> ExecResult: if not self.enabled: raise PermissionError("Exec disabled by policy") p = subprocess.run( diff --git a/agent_ext/backends/state.py b/agent_ext/backends/state.py new file mode 100644 index 0000000..0451876 --- /dev/null +++ b/agent_ext/backends/state.py @@ -0,0 +1,203 @@ +"""In-memory file storage backend for testing and sandboxed execution. + +Files are stored in a dictionary and are ephemeral. Useful for tests, +preview environments, and stateless sandboxes. 
+""" + +from __future__ import annotations + +import fnmatch +import re +from dataclasses import dataclass +from datetime import UTC, datetime + +from .base import FilesystemBackend + + +@dataclass +class FileData: + content: list[str] + created_at: str = "" + modified_at: str = "" + + +@dataclass +class FileInfo: + name: str + path: str + is_dir: bool + size: int | None = None + + +@dataclass +class GrepMatch: + path: str + line_number: int + line: str + + +@dataclass +class EditResult: + path: str | None = None + error: str | None = None + occurrences: int = 0 + + +@dataclass +class WriteResult: + path: str | None = None + error: str | None = None + + +def _normalize_path(path: str) -> str: + if not path.startswith("/"): + path = "/" + path + if len(path) > 1 and path.endswith("/"): + path = path.rstrip("/") + return path + + +def _validate_path(path: str) -> str | None: + if ".." in path: + return "Path cannot contain '..'" + if path.startswith("~"): + return "Path cannot start with '~'" + return None + + +class StateBackend(FilesystemBackend): + """In-memory file storage backend. + + Compatible with ``FilesystemBackend`` protocol and also provides + rich operations: ``ls_info``, ``edit``, ``grep_raw``, ``glob_info``. 
+ + Example:: + + backend = StateBackend() + backend.write_text("src/app.py", "print('hello')") + content = backend.read_text("src/app.py") + """ + + def __init__(self, files: dict[str, FileData] | None = None) -> None: + self._files: dict[str, FileData] = files or {} + + @property + def files(self) -> dict[str, FileData]: + return self._files + + def _ts(self) -> str: + return datetime.now(UTC).isoformat() + + # -- FilesystemBackend protocol ----------------------------------------- + + def read_text(self, path: str) -> str: + p = _normalize_path(path) + fd = self._files.get(p) + if fd is None: + raise FileNotFoundError(f"File not found: {p}") + return "\n".join(fd.content) + + def write_text(self, path: str, content: str) -> None: + p = _normalize_path(path) + err = _validate_path(p) + if err: + raise PermissionError(err) + now = self._ts() + lines = content.split("\n") + existing = self._files.get(p) + self._files[p] = FileData( + content=lines, + created_at=existing.created_at if existing else now, + modified_at=now, + ) + + def list(self, path: str) -> list[str]: + p = _normalize_path(path) + prefix = p if p == "/" else p + "/" + entries: set[str] = set() + for fp in self._files: + if fp.startswith(prefix): + rel = fp[len(prefix) :] + top = rel.split("/")[0] + entries.add(top) + return sorted(entries) + + def glob(self, pattern: str) -> list[str]: + results: list[str] = [] + for fp in self._files: + if fnmatch.fnmatch(fp, pattern) or fnmatch.fnmatch(fp, "/" + pattern): + results.append(fp.lstrip("/")) + return sorted(results) + + # -- rich operations ---------------------------------------------------- + + def read_numbered(self, path: str, offset: int = 0, limit: int = 2000) -> str: + """Read file with line numbers (like the upstream ``read``).""" + p = _normalize_path(path) + fd = self._files.get(p) + if fd is None: + return f"Error: File '{p}' not found" + lines = fd.content + total = len(lines) + if offset >= total: + return f"Error: Offset {offset} 
exceeds file length ({total} lines)" + end = min(offset + limit, total) + result_lines = [f"{i + 1:>6}\t{lines[i]}" for i in range(offset, end)] + result = "\n".join(result_lines) + if end < total: + result += f"\n\n... ({total - end} more lines)" + return result + + def edit(self, path: str, old_string: str, new_string: str, replace_all: bool = False) -> EditResult: + """Edit a file by replacing strings.""" + p = _normalize_path(path) + fd = self._files.get(p) + if fd is None: + return EditResult(error=f"File '{p}' not found") + content = "\n".join(fd.content) + count = content.count(old_string) + if count == 0: + return EditResult(error="String not found in file") + if count > 1 and not replace_all: + return EditResult(error=f"String found {count} times. Use replace_all=True.") + new_content = ( + content.replace(old_string, new_string) if replace_all else content.replace(old_string, new_string, 1) + ) + fd.content = new_content.split("\n") + fd.modified_at = self._ts() + return EditResult(path=p, occurrences=count if replace_all else 1) + + def grep_raw(self, pattern: str, path: str | None = None) -> list[GrepMatch] | str: + """Search for regex pattern in files.""" + try: + rx = re.compile(pattern) + except re.error as e: + return f"Error: Invalid regex: {e}" + results: list[GrepMatch] = [] + files_to_search = list(self._files.keys()) + if path: + p = _normalize_path(path) + files_to_search = [f for f in files_to_search if f.startswith(p)] + for fp in files_to_search: + for i, line in enumerate(self._files[fp].content): + if rx.search(line): + results.append(GrepMatch(path=fp, line_number=i + 1, line=line)) + return results + + def ls_info(self, path: str) -> list[FileInfo]: + """List files/dirs at path with metadata.""" + p = _normalize_path(path) + prefix = p if p == "/" else p + "/" + entries: dict[str, FileInfo] = {} + for fp, fd in self._files.items(): + if not fp.startswith(prefix): + continue + rel = fp[len(prefix) :] + parts = rel.split("/") + name = 
parts[0] + if name not in entries: + if len(parts) == 1: + entries[name] = FileInfo(name=name, path=fp, is_dir=False, size=sum(len(l) for l in fd.content)) + else: + entries[name] = FileInfo(name=name, path=prefix + name, is_dir=True) + return sorted(entries.values(), key=lambda x: (not x.is_dir, x.name)) diff --git a/agent_ext/cog/__init__.py b/agent_ext/cog/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent_ext/cog/__main__.py b/agent_ext/cog/__main__.py new file mode 100644 index 0000000..fb765cc --- /dev/null +++ b/agent_ext/cog/__main__.py @@ -0,0 +1,75 @@ +""" +Headless daemon entry point for fully automated self-improving agent. + +Usage: + export LLM_BASE_URL=... LLM_API_KEY=... LLM_MODEL=... + export USE_OPENAI_CHAT_MODEL=1 # or pass --use-openai-chat-model + python -m agent_ext.cog [--use-openai-chat-model] + +Runs the cognitive loop forever: plan → patch → gates → (optional) adopt & push. +See docs/AUTO_AGENT.md and .env.example for env vars. +""" + +from __future__ import annotations + +import argparse +import asyncio +import os + +from dotenv import find_dotenv, load_dotenv + +# Load .env so LLM_*, AUTO_*, COG_*, AGENT_* are set +load_dotenv(find_dotenv()) + +from agent_ext.cog.daemon import run_forever +from agent_ext.workbench.models import build_openai_chat_model, model_from_env +from agent_ext.workbench.runtime import build_ctx + + +def main() -> None: + ap = argparse.ArgumentParser(description="Run self-improving agent daemon (no TUI)") + ap.add_argument("--use-openai-chat-model", action="store_true", help="Use OpenAI-compatible chat model from env") + ap.add_argument( + "--max-parallel-subagents", type=int, default=None, help="Max concurrent subagents (default from env or 4)" + ) + ap.add_argument( + "--max-parallel-model-calls", type=int, default=None, help="Max concurrent model calls (default from env or 2)" + ) + ap.add_argument("--case-id", default=os.getenv("AGENT_CASE_ID", "daemon-1")) + 
ap.add_argument("--session-id", default=os.getenv("AGENT_SESSION_ID", "sess-1")) + ap.add_argument("--user-id", default=os.getenv("AGENT_USER_ID", "user-1")) + args = ap.parse_args() + + use_model = args.use_openai_chat_model or bool( + os.getenv("USE_OPENAI_CHAT_MODEL", "").strip().lower() in ("1", "true", "yes") + ) + model = None + if use_model: + cfg = model_from_env() + model = build_openai_chat_model(cfg) + print(f"[daemon] model={cfg.model} base_url={cfg.base_url}") + + max_sub = ( + args.max_parallel_subagents + if args.max_parallel_subagents is not None + else int(os.getenv("MAX_PARALLEL_SUBAGENTS", "4")) + ) + max_llm = ( + args.max_parallel_model_calls + if args.max_parallel_model_calls is not None + else int(os.getenv("MAX_PARALLEL_MODEL_CALLS", "2")) + ) + ctx = build_ctx( + case_id=args.case_id, + session_id=args.session_id, + user_id=args.user_id, + model=model, + max_parallel_subagents=max_sub, + max_parallel_model_calls=max_llm, + ) + + asyncio.run(run_forever(ctx)) + + +if __name__ == "__main__": + main() diff --git a/agent_ext/cog/daemon.py b/agent_ext/cog/daemon.py new file mode 100644 index 0000000..cb8310e --- /dev/null +++ b/agent_ext/cog/daemon.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import asyncio +import os +import random +from dataclasses import dataclass + +from agent_ext.cog.loop_v2 import run_cognitive_cycle +from agent_ext.cog.state import Budget + + +def _log(ctx, level: str, msg: str) -> None: + if getattr(ctx, "logger", None): + getattr(ctx.logger, level)(msg, **{}) + else: + print(f"[daemon][{level}] {msg}") + + +@dataclass +class DaemonConfig: + sleep_s: int = int(os.getenv("AGENT_LOOP_SLEEP", "30")) + max_idle_s: int = int(os.getenv("AGENT_MAX_IDLE", "600")) + goal: str = os.getenv("AGENT_DAEMON_GOAL", "keep improving the repo safely") + + +async def run_forever(ctx, *, cfg: DaemonConfig | None = None) -> None: + cfg = cfg or DaemonConfig() + + budget = Budget( + max_steps=int(os.getenv("COG_MAX_STEPS", 
"10")), + max_model_calls=int(os.getenv("COG_MAX_MODEL_CALLS", "6")), + max_parallel_writers=int(os.getenv("COG_MAX_PARALLEL_WRITERS", "3")), + max_diff_chars=int(os.getenv("MAX_DIFF_CHARS", "60000")), + auto_commit_threshold=float(os.getenv("AUTO_COMMIT_THRESHOLD", "80")), + ) + + cycle = 0 + while True: + cycle += 1 + try: + _log(ctx, "info", f"cycle {cycle}: goal={cfg.goal[:50]}...") + out = await run_cognitive_cycle(ctx, cfg.goal, budget) + + adopted = bool(out.get("adopted", False)) + ok = bool(out.get("ok", False)) + mode = out.get("mode", "—") + score = out.get("score") + reason = out.get("reason", "") + + # Backoff logic: if nothing adopted, don’t thrash + if adopted: + _log(ctx, "info", f"cycle {cycle}: adopted patch (mode={mode})") + elif not ok: + _log(ctx, "warning", f"cycle {cycle}: not ok reason={reason}") + else: + _log(ctx, "info", f"cycle {cycle}: skipped (score={score}, no adopt)") + + if not ok or not adopted: + delay = min(cfg.max_idle_s, cfg.sleep_s * 2) + else: + delay = cfg.sleep_s + delay = delay + random.randint(0, 5) + _log(ctx, "info", f"cycle {cycle}: sleeping {delay}s") + await asyncio.sleep(delay) + + except Exception as e: + if getattr(ctx, "logger", None): + ctx.logger.error("daemon_error", error=repr(e)) + _log(ctx, "error", f"daemon_error: {e!r}") + await asyncio.sleep(min(cfg.max_idle_s, cfg.sleep_s * 3) + random.randint(0, 10)) + + +if __name__ == "__main__": + # So "python -m agent_ext.cog.daemon" works the same as "python -m agent_ext.cog" + from agent_ext.cog.__main__ import main + + main() diff --git a/agent_ext/cog/loop_v2.py b/agent_ext/cog/loop_v2.py new file mode 100644 index 0000000..1856b14 --- /dev/null +++ b/agent_ext/cog/loop_v2.py @@ -0,0 +1,197 @@ +from __future__ import annotations + +import os +import time +from pathlib import Path +from typing import Any + +from agent_ext.self_improve.gates import run_gates +from agent_ext.self_improve.models import GatePlan +from agent_ext.self_improve.patching import 
apply_unified_diff

# You already have these pieces:
# - create_worktree / cleanup_worktree
# - worktree_diff
# - apply_unified_diff
# - run_gates / GatePlan
# - your llm_patch subagent which accepts strategy hints
from agent_ext.workbench.worktrees import cleanup_worktree, create_worktree, worktree_diff

from .modes import choose_mode
from .scoring import score_patch
from .state import Budget, CogState, RegressionMemory
from .strategy_bank import pick_strategies
from .triggers import detect_triggers, repo_fingerprint


def _diff_touched_files(diff_text: str) -> list[str]:
    """Extract touched file paths from 'diff --git' headers (sorted, deduped).

    NOTE(review): ``.replace("b/", "")`` strips EVERY "b/" substring, so a
    path containing "b/" mid-path (e.g. "src/b/x.py") gets mangled;
    ``removeprefix("b/")`` would be correct — keep in sync with
    scoring.touched_files_from_diff, which shares this logic.
    """
    files = []
    for line in diff_text.splitlines():
        if line.startswith("diff --git "):
            parts = line.split()
            if len(parts) >= 4:
                b = parts[3].replace("b/", "")
                files.append(b)
    return sorted(set(files))


async def run_cognitive_cycle(ctx, goal: str, budget: Budget) -> dict[str, Any]:
    """Run one plan → patch → gate → score → (maybe) adopt cycle.

    Requires ``ctx.cog_state`` / ``ctx.regression_memory`` (as wired by
    build_ctx). Returns a result dict with at least ``ok``; plus ``mode``,
    ``adopted``, ``score``, ``patch``, ``files``, ``reason``/``reasons``
    depending on the path taken. Mutates and saves CogState and, on adopt,
    RegressionMemory.
    """
    state: CogState = getattr(ctx, "cog_state", None)
    reg: RegressionMemory = getattr(ctx, "regression_memory", None)
    if state is None or reg is None:
        raise RuntimeError(
            "RunContext must have cog_state and regression_memory. "
            "Use agent_ext.workbench.runtime.build_ctx() to build ctx for the daemon."
        )

    triggers = detect_triggers(state.last_repo_fingerprint)

    # BM25 confidence: how sharp is the distribution?
    hits = ctx.search.search(goal, top_k=20)
    bm25_conf = 0.0
    if hits:
        top = hits[0][1]
        # Ratio of top score vs. the ~10th hit: near 1.0 means a sharp peak.
        tenth = hits[min(9, len(hits) - 1)][1]
        bm25_conf = float(top / (top + tenth + 1e-9))

    mode = choose_mode(fail_streak=state.fail_streak, triggers=triggers, bm25_confidence=bm25_conf)

    # Pick candidates (paths) and pass snippets to patcher
    candidates = [{"path": p, "score": float(s)} for p, s in hits[: mode.max_files]]

    # Parallel writers (each in its own worktree)
    strategies = pick_strategies(mode.parallel_writers)

    results = []
    for strat in strategies:
        wt = create_worktree(run_id=ctx.session_id, agent_name=f"writer_{strat.name}")
        try:
            patcher = ctx.subagents.get("llm_patch")
            res = await patcher.run(
                ctx,
                input=goal,
                meta={
                    "workdir": str(wt.path),
                    "candidates": candidates,
                    "max_files": mode.max_files,
                    "strategy": strat.prompt_style,  # your patcher should incorporate this into prompt
                },
            )
            if not res.ok:
                results.append({"strategy": strat.name, "ok": False, "err": res.meta})
                continue

            # Patcher output is a unified diff; apply it inside the worktree.
            ok_apply, out_apply = apply_unified_diff(res.output, repo_root=wt.path)
            if not ok_apply:
                results.append({"strategy": strat.name, "ok": False, "err": f"apply_failed: {out_apply[:400]}"})
                continue

            # NOTE(review): GatePlan carries no workdir — confirm run_gates()
            # actually evaluates the worktree and not the daemon's CWD.
            plan = GatePlan(import_check=True, compile_check=True, pytest_paths=(["tests"] if mode.pytest else []))
            gates = run_gates(plan)

            diff = worktree_diff(wt)
            touched = _diff_touched_files(diff)

            # hard cap
            if len(diff) > budget.max_diff_chars:
                results.append({"strategy": strat.name, "ok": False, "err": f"diff_too_large: {len(diff)}"})
                continue

            results.append(
                {
                    "strategy": strat.name,
                    "ok": True,
                    "gates_ok": gates.ok,
                    "diff": diff,
                    "diff_chars": len(diff),
                    "files": touched,
                }
            )

        finally:
            # Always reclaim the worktree, even on failure/exception.
            cleanup_worktree(wt, prune_branch=False)

    # Select winner
    scored = []
    for r in results:
        if not r.get("ok") or "diff" not in r:
            continue
        sc = score_patch(
            gates_ok=bool(r.get("gates_ok")),
            diff_chars=int(r.get("diff_chars", 0)),
            files_touched=len(r.get("files", [])),
            eval_delta=0.0,  # wire evals later
        )
        scored.append((sc.score, sc, r))

    if not scored:
        state.fail_streak += 1
        state.save()
        return {"ok": False, "mode": mode.name, "reason": "no_valid_candidates", "raw": results}

    scored.sort(key=lambda x: x[0], reverse=True)
    best_score, best_sc, best_r = scored[0]

    # Anti-thrash: block if files flip too often
    if reg.is_thrash_risk(best_r["files"]):
        state.fail_streak += 1
        state.save()
        return {"ok": False, "mode": mode.name, "reason": "thrash_risk", "files": best_r["files"], "score": best_score}

    # Decide auto-commit
    auto = bool(int(os.getenv("AUTO_ADOPT", "1")))
    threshold = float(os.getenv("AUTO_COMMIT_THRESHOLD", str(budget.auto_commit_threshold)))

    # Persist patch artifact
    patches_dir = Path(".agent_state/patches") / ctx.session_id
    patches_dir.mkdir(parents=True, exist_ok=True)
    patch_path = patches_dir / f"{best_r['strategy']}.diff"
    patch_path.write_text(best_r["diff"], encoding="utf-8")
    # Breadcrumb so external tooling can find the most recent patch.
    (Path(".agent_state/last_patch_path.txt")).write_text(str(patch_path), encoding="utf-8")

    if not best_sc.ok or best_score < threshold or not auto:
        # Below threshold / gates failed / auto-adopt off: record but don't adopt.
        state.fail_streak += 0 if best_sc.ok else 1
        state.save()
        return {
            "ok": True,
            "adopted": False,
            "mode": mode.name,
            "score": best_score,
            "patch": str(patch_path),
            "files": best_r["files"],
            "reasons": best_sc.reasons,
        }

    # Auto-adopt into current working tree (dev) + commit/push
    from agent_ext.workbench.adopt import apply_diff_to_repo, commit_and_push

    apply_diff_to_repo(best_r["diff"], repo_root=Path("."))

    # Re-run gates against the real tree before committing the adoption.
    main_plan = GatePlan(import_check=True, compile_check=True, pytest_paths=(["tests"] if mode.pytest else []))
    main_gates = run_gates(main_plan)
    if not main_gates.ok:
        state.fail_streak += 1
        state.save()
        return {"ok": False, "mode": mode.name, "reason": "main_gates_failed_after_adopt", "patch": str(patch_path)}

    msg = f"auto[{mode.name}/{best_r['strategy']}]: {goal[:72]}"
    commit_and_push(message=msg, branch=os.getenv("AUTO_PUSH_BRANCH", "dev"), repo_root=Path("."))

    # Record the adoption for anti-thrash tracking and reset failure state.
    reg.note_commit(best_r["files"], msg)
    reg.save()

    state.fail_streak = 0
    state.last_success_ts = time.time()
    state.last_repo_fingerprint = repo_fingerprint()
    state.save()

    return {
        "ok": True,
        "adopted": True,
        "mode": mode.name,
        "score": best_score,
        "patch": str(patch_path),
        "files": best_r["files"],
        "reasons": best_sc.reasons,
        "commit_msg": msg,
    }


# === agent_ext/cog/modes.py (new file) ===
from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class Mode:
    """An operating profile for one cognitive cycle."""

    name: str
    parallel_writers: int  # how many patch writers run concurrently
    max_files: int  # candidate-file budget passed to the patcher
    deep_context: bool  # whether to gather extended context
    pytest: bool  # whether gates should also run the test suite


FAST = Mode("fast", parallel_writers=1, max_files=4, deep_context=False, pytest=False)
DEEP = Mode("deep", parallel_writers=2, max_files=8, deep_context=True, pytest=False)
REPAIR = Mode("repair", parallel_writers=2, max_files=8, deep_context=True, pytest=True)
EXPLORE = Mode("explore", parallel_writers=3, max_files=6, deep_context=False, pytest=False)


def choose_mode(*, fail_streak: int, triggers: list, bm25_confidence: float) -> Mode:
    """Pick a mode: repair after repeated failures; deep on repo drift with
    weak retrieval; fast on confident retrieval; otherwise explore."""
    if fail_streak >= 2:
        return REPAIR
    if any(t.kind == "repo_changed" for t in triggers) and bm25_confidence < 0.25:
        return DEEP
    if bm25_confidence > 0.6:
        return FAST
    return EXPLORE


# === agent_ext/cog/scoring.py (new file) — continues on next span ===
from __future__ import annotations

from dataclasses import dataclass


def touched_files_from_diff(diff_text: str) -> list[str]:
    files = []
    for line in diff_text.splitlines():
        if line.startswith("diff --git "):
            parts = line.split()
            if len(parts) >= 4:
def touched_files_from_diff(diff_text: str) -> list[str]:
    """Return sorted, de-duplicated file paths named in ``diff --git`` headers.

    Parses the fourth whitespace-separated token ("b/<path>") of each
    header line.
    """
    files = []
    for line in diff_text.splitlines():
        if line.startswith("diff --git "):
            parts = line.split()
            if len(parts) >= 4:
                # BUG FIX: str.replace removed EVERY "b/" substring, mangling
                # paths like "src/b/x.py" into "src/x.py". Only the leading
                # "b/" prefix must be stripped.
                files.append(parts[3].removeprefix("b/"))
    return sorted(set(files))


@dataclass(frozen=True)
class Score:
    """Aggregate patch score with a per-component breakdown in *reasons*."""

    total: float
    reasons: dict[str, float]

    @property
    def score(self) -> float:
        """Alias for ``total`` — used by workbench loop and cog loop."""
        return self.total

    @property
    def ok(self) -> bool:
        """True when gates passed (positive gates score)."""
        return self.reasons.get("gates", 0.0) > 0


def score_patch(*, gates_ok: bool, diff_chars: int, files_touched: int, eval_delta: float = 0.0) -> Score:
    """Score a candidate patch; higher is better.

    Starter scoring:
      +100 if gates pass (else -50)
      +50*eval_delta (later)
      - diff_chars/2000 (cap 30)
      - 2*files_touched (cap 20)
    """
    reasons: dict[str, float] = {}
    total = 0.0

    reasons["gates"] = 100.0 if gates_ok else -50.0
    total += reasons["gates"]

    reasons["eval_delta"] = eval_delta * 50.0
    total += reasons["eval_delta"]

    # Size penalties: discourage sprawling patches without hard-blocking them.
    reasons["diff_penalty"] = -min(30.0, diff_chars / 2000.0)
    total += reasons["diff_penalty"]

    reasons["files_penalty"] = -min(20.0, files_touched * 2.0)
    total += reasons["files_penalty"]

    return Score(total=total, reasons=reasons)


# === agent_ext/cog/state.py (new file) — continues on next span ===
# from __future__ import annotations  # (state.py file header; must be first in its own file)

import json
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

STATE_PATH = Path(".agent_state/cog_state.json")
REGRESS_PATH = Path(".agent_state/regression_memory.json")


def _read(path: Path, default: Any):
    """Read JSON from *path*, returning *default* when the file is absent."""
    if not path.exists():
        return default
    return json.loads(path.read_text(encoding="utf-8"))


def _write(path: Path, obj: Any):
    """Write *obj* as pretty JSON, creating parent directories as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(obj, indent=2), encoding="utf-8")


@dataclass
class Budget:
    """Per-cycle resource limits for the cognitive loop."""

    max_steps: int = 10
    max_model_calls: int = 6
    max_parallel_writers: int = 3
    max_diff_chars: int = 60000
    auto_commit_threshold: float = 80.0  # minimum score for auto-adopt/commit


@dataclass
class CogState:
    """Persistent cognitive-loop state, stored at ``.agent_state/cog_state.json``."""

    version: str = "0.2.0"
    last_repo_fingerprint: str = ""  # repo_fingerprint() captured at last success
    last_success_ts: float = 0.0  # epoch seconds of last adopted patch
    fail_streak: int = 0  # consecutive unproductive cycles
    recent_actions: list[dict[str, Any]] = field(default_factory=list)

    def load(self):
        """Hydrate from disk; silently keeps defaults when no state file exists."""
        data = _read(STATE_PATH, None)
        if not data:
            return
        self.version = data.get("version", self.version)
        self.last_repo_fingerprint = data.get("last_repo_fingerprint", "")
        self.last_success_ts = float(data.get("last_success_ts", 0.0))
        self.fail_streak = int(data.get("fail_streak", 0))
        self.recent_actions = data.get("recent_actions", [])

    def save(self):
        """Persist to disk, keeping only the newest 200 recent_actions."""
        _write(
            STATE_PATH,
            {
                "version": self.version,
                "last_repo_fingerprint": self.last_repo_fingerprint,
                "last_success_ts": self.last_success_ts,
                "fail_streak": self.fail_streak,
                "recent_actions": self.recent_actions[-200:],  # cap
            },
        )


@dataclass
class RegressionMemory:
    """
    Prevents oscillation: detects same files flipping back/forth or repeated revert cycles.
+ """ + + flips: dict[str, int] = field(default_factory=dict) # file -> flip count + last_commits: list[dict[str, Any]] = field(default_factory=list) + + def load(self): + data = _read(REGRESS_PATH, None) + if not data: + return + self.flips = data.get("flips", {}) + self.last_commits = data.get("last_commits", []) + + def save(self): + _write( + REGRESS_PATH, + { + "flips": self.flips, + "last_commits": self.last_commits[-200:], + }, + ) + + def note_commit(self, files_touched: list[str], commit_msg: str): + for f in files_touched: + self.flips[f] = int(self.flips.get(f, 0)) + 1 + self.last_commits.append({"ts": time.time(), "files": files_touched, "msg": commit_msg}) + + def is_thrash_risk(self, files_touched: list[str], max_flips: int = 8) -> bool: + return any(int(self.flips.get(f, 0)) >= max_flips for f in files_touched) diff --git a/agent_ext/cog/strategy_bank.py b/agent_ext/cog/strategy_bank.py new file mode 100644 index 0000000..6504b30 --- /dev/null +++ b/agent_ext/cog/strategy_bank.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class Strategy: + name: str + prompt_style: str + + +STRATEGIES = [ + Strategy("minimal_fix", "Make the smallest change that satisfies the goal."), + Strategy("test_first", "Add/adjust tests first, then implement."), + Strategy("refactor_safe", "Prefer safer refactor patterns and clearer structure."), +] + + +def pick_strategies(n: int) -> list[Strategy]: + return STRATEGIES[: max(1, min(n, len(STRATEGIES)))] diff --git a/agent_ext/cog/triggers.py b/agent_ext/cog/triggers.py new file mode 100644 index 0000000..39abd4d --- /dev/null +++ b/agent_ext/cog/triggers.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import hashlib +import os +import subprocess +from dataclasses import dataclass + + +@dataclass(frozen=True) +class Trigger: + kind: str + detail: str + + +def _run(cmd: list[str]) -> tuple[bool, str]: + p = subprocess.run(cmd, 
env=os.environ.copy(), capture_output=True, text=True) + out = (p.stdout or "") + ("\n" if p.stdout and p.stderr else "") + (p.stderr or "") + return p.returncode == 0, out.strip() + + +def repo_fingerprint() -> str: + # cheap-ish: hash of git status + HEAD + ok, head = _run(["git", "rev-parse", "HEAD"]) + if not ok: + return "" + ok, st = _run(["git", "status", "--porcelain"]) + s = head + "\n" + st + return hashlib.sha256(s.encode("utf-8")).hexdigest() + + +def detect_triggers(prev_fp: str) -> list[Trigger]: + tr: list[Trigger] = [] + fp = repo_fingerprint() + if fp and fp != prev_fp: + tr.append(Trigger("repo_changed", "git status/head changed")) + # add more later: failing CI marker, new issues, eval drift, etc. + return tr diff --git a/agent_ext/database/README.md b/agent_ext/database/README.md new file mode 100644 index 0000000..89e9bc4 --- /dev/null +++ b/agent_ext/database/README.md @@ -0,0 +1,58 @@ +# Database — SQL Capabilities for AI Agents + +Empower AI agents to explore schemas, query data, and understand database structures with built-in security controls. 
+ +## Features + +- **SQLite Backend**: Full schema exploration and query execution +- **Security**: Read-only mode, row limits, query length limits, timeouts +- **Schema Discovery**: List tables, describe columns, sample data +- **Write Protection**: Block INSERT/UPDATE/DELETE in read-only mode + +## Quick Start + +```python +from agent_ext.database import SQLiteDatabase, DatabaseConfig + +# Read-only access (default) +db = SQLiteDatabase("my_data.db") +await db.connect() + +# Explore schema +tables = await db.list_tables() +for t in tables: + print(f"{t.name}: {t.row_count} rows") + +# Describe a table +info = await db.describe_table("users") +for col in info.columns: + print(f" {col['name']} ({col['type']})") + +# Query with security controls +result = await db.execute_query("SELECT * FROM users WHERE age > 25") +print(f"Got {result.row_count} rows") + +# Sample data +sample = await db.sample_table("users", limit=5) + +await db.disconnect() +``` + +## Context Manager + +```python +async with SQLiteDatabase("data.db") as db: + result = await db.execute_query("SELECT COUNT(*) FROM orders") +``` + +## Security Configuration + +```python +config = DatabaseConfig( + read_only=True, # Block all writes + max_rows=1000, # Limit result size + timeout_s=30.0, # Query timeout + max_query_length=10000 # Prevent huge queries +) +db = SQLiteDatabase("data.db", config=config) +``` diff --git a/agent_ext/database/__init__.py b/agent_ext/database/__init__.py new file mode 100644 index 0000000..961e481 --- /dev/null +++ b/agent_ext/database/__init__.py @@ -0,0 +1,16 @@ +"""Database toolset — SQL capabilities for AI agents.""" + +from .postgres import PostgresDatabase +from .protocol import DatabaseBackend +from .sqlite import SQLiteDatabase +from .toolset import DATABASE_SYSTEM_PROMPT, SQLDatabaseDeps, create_database_toolset +from .types import DatabaseConfig, QueryResult, SchemaInfo, TableInfo + +__all__ = [ + "QueryResult", + "SchemaInfo", + "TableInfo", + "DatabaseConfig", + 
"DatabaseBackend", + "SQLiteDatabase", +] diff --git a/agent_ext/database/postgres.py b/agent_ext/database/postgres.py new file mode 100644 index 0000000..d72500d --- /dev/null +++ b/agent_ext/database/postgres.py @@ -0,0 +1,149 @@ +"""PostgreSQL database backend with security controls. + +Requires ``asyncpg`` (already in project dependencies). + +Example:: + + from agent_ext.database import PostgresDatabase, DatabaseConfig + + async with PostgresDatabase("postgresql://user:pass@localhost/mydb") as db: + tables = await db.list_tables() + result = await db.execute_query("SELECT * FROM users LIMIT 10") +""" + +from __future__ import annotations + +import time +from typing import Any + +from .types import DatabaseConfig, QueryResult, SchemaInfo, TableInfo + + +class PostgresDatabase: + """PostgreSQL database backend. + + Provides schema exploration, query execution, and security controls. + Uses asyncpg for async Postgres access. + """ + + def __init__( + self, + dsn: str, + config: DatabaseConfig | None = None, + ) -> None: + self.dsn = dsn + self.config = config or DatabaseConfig() + self._pool: Any = None + + async def connect(self) -> None: + import asyncpg + + self._pool = await asyncpg.create_pool(self.dsn, min_size=1, max_size=5) + + async def disconnect(self) -> None: + if self._pool: + await self._pool.close() + self._pool = None + + def _require_pool(self): + if self._pool is None: + raise RuntimeError("Database not connected. 
Call connect() first.") + return self._pool + + async def list_tables(self) -> list[TableInfo]: + pool = self._require_pool() + async with pool.acquire() as conn: + rows = await conn.fetch("SELECT tablename FROM pg_tables WHERE schemaname = 'public' ORDER BY tablename") + tables: list[TableInfo] = [] + for row in rows: + name = row["tablename"] + count_row = await conn.fetchrow(f'SELECT COUNT(*) as cnt FROM "{name}"') + tables.append(TableInfo(name=name, row_count=count_row["cnt"] if count_row else None)) + return tables + + async def describe_table(self, table_name: str) -> TableInfo: + pool = self._require_pool() + async with pool.acquire() as conn: + rows = await conn.fetch( + """ + SELECT column_name, data_type, is_nullable, column_default, + (SELECT COUNT(*) > 0 FROM information_schema.key_column_usage k + WHERE k.table_name = c.table_name AND k.column_name = c.column_name + AND k.constraint_name LIKE '%_pkey') as is_pk + FROM information_schema.columns c + WHERE table_schema = 'public' AND table_name = $1 + ORDER BY ordinal_position + """, + table_name, + ) + columns = [ + { + "name": r["column_name"], + "type": r["data_type"], + "notnull": r["is_nullable"] == "NO", + "default": r["column_default"], + "pk": bool(r["is_pk"]), + } + for r in rows + ] + count_row = await conn.fetchrow(f'SELECT COUNT(*) as cnt FROM "{table_name}"') + return TableInfo( + name=table_name, + columns=columns, + row_count=count_row["cnt"] if count_row else None, + ) + + async def get_schema(self) -> SchemaInfo: + tables = await self.list_tables() + detailed = [await self.describe_table(t.name) for t in tables] + return SchemaInfo( + tables=detailed, + database_type="postgresql", + database_path=self.dsn.split("@")[-1] if "@" in self.dsn else self.dsn, + ) + + async def execute_query(self, sql: str) -> QueryResult: + pool = self._require_pool() + + if len(sql) > self.config.max_query_length: + return QueryResult(error=f"Query too long ({len(sql)} chars, max 
{self.config.max_query_length})") + + if self.config.read_only: + sql_upper = sql.strip().upper() + write_ops = ("INSERT", "UPDATE", "DELETE", "DROP", "ALTER", "CREATE", "REPLACE", "TRUNCATE") + if any(sql_upper.startswith(op) for op in write_ops): + return QueryResult(error="Write operations not allowed in read-only mode") + + t0 = time.time() + try: + async with pool.acquire() as conn: + rows = await conn.fetch(sql) + dt = (time.time() - t0) * 1000 + + if not rows: + return QueryResult(execution_time_ms=dt) + + columns = list(rows[0].keys()) + result_rows = [list(r.values()) for r in rows[: self.config.max_rows + 1]] + truncated = len(result_rows) > self.config.max_rows + result_rows = result_rows[: self.config.max_rows] + + return QueryResult( + columns=columns, + rows=result_rows, + row_count=len(result_rows), + truncated=truncated, + execution_time_ms=dt, + ) + except Exception as e: + return QueryResult(error=str(e), execution_time_ms=(time.time() - t0) * 1000) + + async def sample_table(self, table_name: str, limit: int = 5) -> QueryResult: + return await self.execute_query(f'SELECT * FROM "{table_name}" LIMIT {min(limit, self.config.max_rows)}') + + async def __aenter__(self): + await self.connect() + return self + + async def __aexit__(self, *args): + await self.disconnect() diff --git a/agent_ext/database/protocol.py b/agent_ext/database/protocol.py new file mode 100644 index 0000000..22f9d0b --- /dev/null +++ b/agent_ext/database/protocol.py @@ -0,0 +1,21 @@ +"""Database backend protocol.""" + +from __future__ import annotations + +from typing import Protocol + +from .types import DatabaseConfig, QueryResult, SchemaInfo, TableInfo + + +class DatabaseBackend(Protocol): + """Protocol for database backends (SQLite, PostgreSQL, etc.).""" + + config: DatabaseConfig + + async def connect(self) -> None: ... + async def disconnect(self) -> None: ... + async def list_tables(self) -> list[TableInfo]: ... 
+ async def describe_table(self, table_name: str) -> TableInfo: ... + async def get_schema(self) -> SchemaInfo: ... + async def execute_query(self, sql: str) -> QueryResult: ... + async def sample_table(self, table_name: str, limit: int = 5) -> QueryResult: ... diff --git a/agent_ext/database/sqlite.py b/agent_ext/database/sqlite.py new file mode 100644 index 0000000..0f4bac3 --- /dev/null +++ b/agent_ext/database/sqlite.py @@ -0,0 +1,143 @@ +"""SQLite database backend with security controls.""" + +from __future__ import annotations + +import sqlite3 +import time +from pathlib import Path +from typing import Any + +from .types import DatabaseConfig, QueryResult, SchemaInfo, TableInfo + + +class SQLiteDatabase: + """SQLite database backend. + + Provides schema exploration, query execution, and security controls + (read-only mode, row limits, query timeouts). + + Example:: + + db = SQLiteDatabase("my_data.db") + await db.connect() + tables = await db.list_tables() + result = await db.execute_query("SELECT * FROM users LIMIT 10") + await db.disconnect() + """ + + def __init__( + self, + path: str | Path, + config: DatabaseConfig | None = None, + ) -> None: + self.path = str(path) + self.config = config or DatabaseConfig() + self._conn: sqlite3.Connection | None = None + + async def connect(self) -> None: + uri = f"file:{self.path}" + if self.config.read_only: + uri += "?mode=ro" + self._conn = sqlite3.connect(uri, uri=True, timeout=self.config.timeout_s) + self._conn.row_factory = sqlite3.Row + + async def disconnect(self) -> None: + if self._conn: + self._conn.close() + self._conn = None + + def _require_conn(self) -> sqlite3.Connection: + if self._conn is None: + raise RuntimeError("Database not connected. 
Call connect() first.") + return self._conn + + async def list_tables(self) -> list[TableInfo]: + conn = self._require_conn() + cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name") + tables: list[TableInfo] = [] + for row in cursor.fetchall(): + name = row[0] + count_cursor = conn.execute(f"SELECT COUNT(*) FROM [{name}]") + count = count_cursor.fetchone()[0] + tables.append(TableInfo(name=name, row_count=count)) + return tables + + async def describe_table(self, table_name: str) -> TableInfo: + conn = self._require_conn() + cursor = conn.execute(f"PRAGMA table_info([{table_name}])") + columns: list[dict[str, Any]] = [] + for row in cursor.fetchall(): + columns.append( + { + "name": row[1], + "type": row[2], + "notnull": bool(row[3]), + "default": row[4], + "pk": bool(row[5]), + } + ) + count_cursor = conn.execute(f"SELECT COUNT(*) FROM [{table_name}]") + count = count_cursor.fetchone()[0] + return TableInfo(name=table_name, columns=columns, row_count=count) + + async def get_schema(self) -> SchemaInfo: + tables = await self.list_tables() + detailed: list[TableInfo] = [] + for t in tables: + detailed.append(await self.describe_table(t.name)) + return SchemaInfo( + tables=detailed, + database_type="sqlite", + database_path=self.path, + ) + + async def execute_query(self, sql: str) -> QueryResult: + """Execute a SQL query with security controls.""" + conn = self._require_conn() + + if len(sql) > self.config.max_query_length: + return QueryResult(error=f"Query too long ({len(sql)} chars, max {self.config.max_query_length})") + + # Block write operations in read-only mode + if self.config.read_only: + sql_upper = sql.strip().upper() + write_ops = ("INSERT", "UPDATE", "DELETE", "DROP", "ALTER", "CREATE", "REPLACE", "TRUNCATE") + if any(sql_upper.startswith(op) for op in write_ops): + return QueryResult(error="Write operations not allowed in read-only mode") + + t0 = time.time() + try: + cursor = conn.execute(sql) + if 
cursor.description is None: + return QueryResult(execution_time_ms=(time.time() - t0) * 1000) + + columns = [desc[0] for desc in cursor.description] + rows_raw = cursor.fetchmany(self.config.max_rows + 1) + truncated = len(rows_raw) > self.config.max_rows + rows = [list(r) for r in rows_raw[: self.config.max_rows]] + + return QueryResult( + columns=columns, + rows=rows, + row_count=len(rows), + truncated=truncated, + execution_time_ms=(time.time() - t0) * 1000, + ) + except Exception as e: + return QueryResult( + error=str(e), + execution_time_ms=(time.time() - t0) * 1000, + ) + + async def sample_table(self, table_name: str, limit: int = 5) -> QueryResult: + """Get sample rows from a table.""" + return await self.execute_query(f"SELECT * FROM [{table_name}] LIMIT {min(limit, self.config.max_rows)}") + + # -- context manager support -- + + async def __aenter__(self): + await self.connect() + return self + + async def __aexit__(self, *args): + await self.disconnect() diff --git a/agent_ext/database/toolset.py b/agent_ext/database/toolset.py new file mode 100644 index 0000000..e48118c --- /dev/null +++ b/agent_ext/database/toolset.py @@ -0,0 +1,144 @@ +"""Database toolset — gives any pydantic-ai agent SQL query capabilities. 
+ +Example:: + + from pydantic_ai import Agent + from agent_ext.database import create_database_toolset, SQLDatabaseDeps, SQLiteDatabase + + db = SQLiteDatabase("my_data.db") + await db.connect() + + toolset = create_database_toolset() + agent = Agent("openai:gpt-4o", toolsets=[toolset]) + + deps = SQLDatabaseDeps(database=db) + result = await agent.run("What tables are in the database?", deps=deps) +""" + +from __future__ import annotations + +import asyncio +from typing import Annotated, Any + +from pydantic import BaseModel, ConfigDict, SkipValidation +from pydantic_ai import RunContext +from pydantic_ai.toolsets import FunctionToolset + +DATABASE_SYSTEM_PROMPT = """ +## Database Toolset + +### IMPORTANT +* Database may be running in READ-ONLY mode +* When in read-only mode, only SELECT queries are allowed + +You have access to database tools: +* `list_tables` - list all tables in the database +* `describe_table` - get column info for a table +* `sample_table` - preview rows from a table +* `query` - execute a SQL query + +### Best Practices +* Always sample a table before writing complex queries +* Use LIMIT when querying large tables +* Validate queries against the schema before running +""" + + +class SQLDatabaseDeps(BaseModel): + """Dependencies for the database toolset.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + database: Annotated[Any, SkipValidation] # DatabaseBackend + read_only: bool = True + max_rows: int = 100 + query_timeout: float = 30.0 + + +def create_database_toolset(*, toolset_id: str | None = None) -> FunctionToolset[SQLDatabaseDeps]: + """Create a database toolset for AI agents. + + Returns: + FunctionToolset with list_tables, describe_table, sample_table, query tools. 
+ """ + toolset: FunctionToolset[SQLDatabaseDeps] = FunctionToolset(id=toolset_id) + + @toolset.tool + async def list_tables(ctx: RunContext[SQLDatabaseDeps]) -> list[str]: + """List all tables in the database.""" + tables = await ctx.deps.database.list_tables() + return [t.name for t in tables] + + @toolset.tool + async def describe_table(ctx: RunContext[SQLDatabaseDeps], table_name: str) -> str: + """Get column info for a specific table. + + Args: + table_name: Name of the table to describe. + """ + info = await ctx.deps.database.describe_table(table_name) + lines = [f"Table: {info.name} ({info.row_count} rows)"] + for col in info.columns: + pk = " [PK]" if col.get("pk") else "" + nullable = "" if col.get("notnull") else " NULL" + lines.append(f" {col['name']} {col.get('type', '?')}{pk}{nullable}") + return "\n".join(lines) + + @toolset.tool + async def sample_table(ctx: RunContext[SQLDatabaseDeps], table_name: str, limit: int = 5) -> str: + """Preview rows from a table. + + Args: + table_name: Table to sample. + limit: Number of rows (default 5). + """ + result = await ctx.deps.database.sample_table(table_name, limit=min(limit, ctx.deps.max_rows)) + if result.error: + return f"Error: {result.error}" + if not result.rows: + return "No rows found." + header = " | ".join(result.columns) + rows = [" | ".join(str(v) for v in row) for row in result.rows] + return f"{header}\n{'─' * len(header)}\n" + "\n".join(rows) + + @toolset.tool + async def query(ctx: RunContext[SQLDatabaseDeps], sql_query: str) -> str: + """Execute a SQL query and return results. + + Args: + sql_query: SQL query to execute. 
+ """ + try: + result = await asyncio.wait_for( + ctx.deps.database.execute_query(sql_query), + timeout=ctx.deps.query_timeout, + ) + except TimeoutError: + return f"Error: Query timed out after {ctx.deps.query_timeout}s" + + if result.error: + return f"Error: {result.error}" + + if not result.columns: + return f"Query executed successfully ({result.execution_time_ms:.0f}ms)" + + # Format as table + if len(result.rows) > ctx.deps.max_rows: + rows = result.rows[: ctx.deps.max_rows] + truncated = True + else: + rows = result.rows + truncated = result.truncated + + header = " | ".join(result.columns) + lines = [header, "─" * len(header)] + for row in rows: + lines.append(" | ".join(str(v) for v in row)) + + footer = f"\n({result.row_count} rows, {result.execution_time_ms:.0f}ms)" + if truncated: + footer += f" [truncated to {len(rows)} rows]" + lines.append(footer) + return "\n".join(lines) + + return toolset diff --git a/agent_ext/database/types.py b/agent_ext/database/types.py new file mode 100644 index 0000000..78d1d33 --- /dev/null +++ b/agent_ext/database/types.py @@ -0,0 +1,46 @@ +"""Type definitions for the database system.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + + +@dataclass +class DatabaseConfig: + """Configuration for database access.""" + + read_only: bool = True + max_rows: int = 1000 + timeout_s: float = 30.0 + max_query_length: int = 10_000 + + +@dataclass +class TableInfo: + """Metadata about a database table.""" + + name: str + columns: list[dict[str, Any]] = field(default_factory=list) + row_count: int | None = None + + +@dataclass +class SchemaInfo: + """Full database schema.""" + + tables: list[TableInfo] = field(default_factory=list) + database_type: str = "" + database_path: str = "" + + +@dataclass +class QueryResult: + """Result of a SQL query.""" + + columns: list[str] = field(default_factory=list) + rows: list[list[Any]] = field(default_factory=list) + row_count: int = 0 + 
truncated: bool = False + error: str | None = None + execution_time_ms: float = 0.0 diff --git a/agent_ext/evidence/citations.py b/agent_ext/evidence/citations.py index 4bc9d4f..4f0d487 100644 --- a/agent_ext/evidence/citations.py +++ b/agent_ext/evidence/citations.py @@ -1,4 +1,5 @@ from __future__ import annotations + from .models import Citation diff --git a/agent_ext/evidence/models.py b/agent_ext/evidence/models.py index 3effc6d..032f2fc 100644 --- a/agent_ext/evidence/models.py +++ b/agent_ext/evidence/models.py @@ -1,26 +1,28 @@ from __future__ import annotations -from typing import Any, Dict, List, Optional + +from typing import Any + from pydantic import BaseModel, Field class Citation(BaseModel): - source_id: str # artifact id / uri / file id - locator: str # page:3, line:20-40, bbox:x1,y1,x2,y2, offset:... - quote: Optional[str] = None # optional small excerpt + source_id: str # artifact id / uri / file id + locator: str # page:3, line:20-40, bbox:x1,y1,x2,y2, offset:... + quote: str | None = None # optional small excerpt confidence: float = 0.7 class Provenance(BaseModel): - produced_by: str # tool/subagent name - artifact_ids: List[str] = Field(default_factory=list) - timestamps: Dict[str, str] = Field(default_factory=dict) - metadata: Dict[str, Any] = Field(default_factory=dict) + produced_by: str # tool/subagent name + artifact_ids: list[str] = Field(default_factory=list) + timestamps: dict[str, str] = Field(default_factory=dict) + metadata: dict[str, Any] = Field(default_factory=dict) class Evidence(BaseModel): - kind: str # "text" | "entity" | "relation" | "finding" | ... + kind: str # "text" | "entity" | "relation" | "finding" | ... content: Any - citations: List[Citation] = Field(default_factory=list) + citations: list[Citation] = Field(default_factory=list) provenance: Provenance confidence: float = 0.7 - tags: List[str] = Field(default_factory=list) # pii|sensitive|domain:finance|... 
+ tags: list[str] = Field(default_factory=list) # pii|sensitive|domain:finance|... diff --git a/agent_ext/examples/ocr_with_agent_demo.py b/agent_ext/examples/ocr_with_agent_demo.py index 7df38c0..0fd4549 100644 --- a/agent_ext/examples/ocr_with_agent_demo.py +++ b/agent_ext/examples/ocr_with_agent_demo.py @@ -9,31 +9,33 @@ Set OCR_DEMO_PDF to a PDF path and OPENAI_API_KEY; then from repo root: uv run python -m agent_ext.examples.ocr_with_agent_demo """ + from __future__ import annotations import os -import uuid -from typing import Any, Dict # Add parent so agent_ext is importable when run as script import sys +import uuid from pathlib import Path +from typing import Any + _root = Path(__file__).resolve().parent.parent.parent if _root not in (Path(p).resolve() for p in sys.path): sys.path.insert(0, str(_root)) from agent_ext import ( - PydanticAIAgentBase, - IngestPipeline, DocumentInput, + IngestPipeline, IngestResult, - PDFToImages, LLMVisionOCREngine, PageOCROutput, + PDFToImages, + PydanticAIAgentBase, ) -from agent_ext.run_context import RunContext, Policy from agent_ext.ingest.extractors import MarkdownDumpExtractor from agent_ext.ingest.pdf2image_renderer import Pdf2ImageRenderer +from agent_ext.run_context import Policy, RunContext # ----------------------------------------------------------------------------- @@ -41,10 +43,10 @@ # ----------------------------------------------------------------------------- class InMemoryArtifactStore: def __init__(self) -> None: - self._bytes_store: Dict[str, bytes] = {} - self._metadata: Dict[str, Dict[str, Any]] = {} + self._bytes_store: dict[str, bytes] = {} + self._metadata: dict[str, dict[str, Any]] = {} - def put_bytes(self, content: bytes, *, metadata: Dict[str, Any]) -> str: + def put_bytes(self, content: bytes, *, metadata: dict[str, Any]) -> str: aid = str(uuid.uuid4()) self._bytes_store[aid] = content self._metadata[aid] = metadata @@ -53,15 +55,17 @@ def put_bytes(self, content: bytes, *, metadata: Dict[str, 
Any]) -> str: def get_bytes(self, artifact_id: str) -> bytes: return self._bytes_store[artifact_id] - def put_json(self, obj: Dict[str, Any], *, metadata: Dict[str, Any]) -> str: + def put_json(self, obj: dict[str, Any], *, metadata: dict[str, Any]) -> str: import json + aid = str(uuid.uuid4()) self._bytes_store[aid] = json.dumps(obj).encode("utf-8") self._metadata[aid] = metadata return aid - def get_json(self, artifact_id: str) -> Dict[str, Any]: + def get_json(self, artifact_id: str) -> dict[str, Any]: import json + return json.loads(self._bytes_store[artifact_id].decode("utf-8")) @@ -103,7 +107,9 @@ def __init__(self) -> None: def main() -> None: pdf_path = os.environ.get("OCR_DEMO_PDF") if not pdf_path or not Path(pdf_path).exists(): - print("Set OCR_DEMO_PDF to a PDF path. Example: OCR_DEMO_PDF=./sample.pdf uv run python -m agent_ext.examples.ocr_with_agent_demo") + print( + "Set OCR_DEMO_PDF to a PDF path. Example: OCR_DEMO_PDF=./sample.pdf uv run python -m agent_ext.examples.ocr_with_agent_demo" + ) return artifacts = InMemoryArtifactStore() diff --git a/agent_ext/examples/starter_subagent.py b/agent_ext/examples/starter_subagent.py index df69f98..bda3846 100644 --- a/agent_ext/examples/starter_subagent.py +++ b/agent_ext/examples/starter_subagent.py @@ -1,5 +1,6 @@ from __future__ import annotations -from typing import Any, Dict + +from typing import Any from agent_ext.subagents.base import SubagentResult @@ -7,7 +8,7 @@ class LocalKGShapeProposer: name = "local_kg_shape_proposer" - async def run(self, *, input: Any, metadata: Dict[str, Any]) -> SubagentResult: + async def run(self, *, input: Any, metadata: dict[str, Any]) -> SubagentResult: """ Replace with your local agent bridge. Input could be a list of Evidence chunks or extracted entities. 
diff --git a/agent_ext/export/__init__.py b/agent_ext/export/__init__.py index e8e05c2..c133425 100644 --- a/agent_ext/export/__init__.py +++ b/agent_ext/export/__init__.py @@ -1,5 +1,5 @@ -from .html_writer import HtmlExporter from .docx_writer import DocxExporter +from .html_writer import HtmlExporter from .pdf_writer import PdfExporter EXPORTERS = { diff --git a/agent_ext/export/base.py b/agent_ext/export/base.py index 3ace8a2..b1620a4 100644 --- a/agent_ext/export/base.py +++ b/agent_ext/export/base.py @@ -1,7 +1,10 @@ from __future__ import annotations + from typing import Protocol + from agent_ext.export.models import ExportRequest, ExportResult + class Exporter(Protocol): def render_bytes(self, *, req: ExportRequest, outcome: dict) -> bytes: ... def filename(self, *, req: ExportRequest) -> str: ... diff --git a/agent_ext/export/docx_writer.py b/agent_ext/export/docx_writer.py index cf5a609..b5e59a6 100644 --- a/agent_ext/export/docx_writer.py +++ b/agent_ext/export/docx_writer.py @@ -1,7 +1,10 @@ from __future__ import annotations + import io + from agent_ext.export.models import ExportRequest + class DocxExporter: def mime_type(self) -> str: return "application/vnd.openxmlformats-officedocument.wordprocessingml.document" diff --git a/agent_ext/export/html_writer.py b/agent_ext/export/html_writer.py index 2ba7fdd..7cb6131 100644 --- a/agent_ext/export/html_writer.py +++ b/agent_ext/export/html_writer.py @@ -1,8 +1,10 @@ from __future__ import annotations + from html import escape -from agent_ext.export.base import Exporter + from agent_ext.export.models import ExportRequest + class HtmlExporter: def mime_type(self) -> str: return "text/html; charset=utf-8" @@ -18,7 +20,10 @@ def render_bytes(self, *, req: ExportRequest, outcome: dict) -> bytes: limitations = outcome.get("limitations") or [] def li(items): - return "\n".join(f"
  • {escape(str(x.get('text', x)))}
  • " if isinstance(x, dict) else f"
  • {escape(str(x))}
  • " for x in items) + return "\n".join( + f"
  • {escape(str(x.get('text', x)))}
  • " if isinstance(x, dict) else f"
  • {escape(str(x))}
  • " + for x in items + ) html = f""" diff --git a/agent_ext/export/models.py b/agent_ext/export/models.py index 35c6c16..63ef1d1 100644 --- a/agent_ext/export/models.py +++ b/agent_ext/export/models.py @@ -1,7 +1,10 @@ from __future__ import annotations -from typing import Any, Dict, List, Optional + +from typing import Any + from pydantic import BaseModel, Field + class ExportRequest(BaseModel): title: str = "Investigation Report" format: str # "html" | "pdf" | "docx" | "pptx" @@ -9,10 +12,11 @@ class ExportRequest(BaseModel): include_limitations: bool = True include_evidence_appendix: bool = False # later + class ExportResult(BaseModel): format: str filename: str mime_type: str bytes_len: int - artifact_id: Optional[str] = None - meta: Dict[str, Any] = Field(default_factory=dict) + artifact_id: str | None = None + meta: dict[str, Any] = Field(default_factory=dict) diff --git a/agent_ext/export/pdf_writer.py b/agent_ext/export/pdf_writer.py index 52311a4..684dea1 100644 --- a/agent_ext/export/pdf_writer.py +++ b/agent_ext/export/pdf_writer.py @@ -1,7 +1,10 @@ from __future__ import annotations + import io + from agent_ext.export.models import ExportRequest + class PdfExporter: def mime_type(self) -> str: return "application/pdf" diff --git a/agent_ext/export/pptx_writer.py b/agent_ext/export/pptx_writer.py index d7e36e7..d14083e 100644 --- a/agent_ext/export/pptx_writer.py +++ b/agent_ext/export/pptx_writer.py @@ -1,11 +1,11 @@ from __future__ import annotations import io + from pptx import Presentation class PptxExporter: - def mime_type(self) -> str: return "application/vnd.openxmlformats-officedocument.presentationml.presentation" diff --git a/agent_ext/hooks/README.md b/agent_ext/hooks/README.md new file mode 100644 index 0000000..fcc6748 --- /dev/null +++ b/agent_ext/hooks/README.md @@ -0,0 +1,78 @@ +# Middleware / Hooks System + +Full-featured async middleware for intercepting, transforming, and guarding every step of an AI agent's lifecycle. 
+ +## Features + +- **7 Lifecycle Hooks**: `before_run`, `after_run`, `before_model_request`, `before_tool_call`, `after_tool_call`, `on_tool_error`, `on_error` +- **Scoped Context**: Data sharing between hooks with strict access controls (each hook can only read from earlier hooks) +- **Cost Tracking**: Automatic token counting and USD cost monitoring with budget enforcement +- **Parallel Execution**: Run multiple validators concurrently with aggregation strategies +- **Permissions**: Structured ALLOW/DENY/ASK decisions for tool calls +- **Timeouts**: Per-hook timeout enforcement +- **Tool Filtering**: Apply middleware to specific tools only +- **Conditional Middleware**: Run only when a condition is met + +## Quick Start + +```python +from agent_ext.hooks.base import AgentMiddleware +from agent_ext.hooks.chain import MiddlewareChain +from agent_ext.hooks.exceptions import InputBlocked + +class ContentFilter(AgentMiddleware): + async def before_run(self, ctx, prompt): + if "ignore instructions" in str(prompt).lower(): + raise InputBlocked("Prompt injection blocked") + return prompt + +chain = MiddlewareChain([ContentFilter(), AuditHook()]) +``` + +## Built-in Middleware + +| Middleware | Purpose | +|-----------|---------| +| `AuditHook` | Logs lifecycle events with timing | +| `PolicyHook` | Enforces `ctx.policy` (blocks tools, etc.) 
| +| `ContentFilterHook` | Content filtering with blocklists | +| `CostTrackingMiddleware` | Token + USD cost tracking | +| `ParallelMiddleware` | Run validators concurrently | +| `ConditionalMiddleware` | Run middleware only when condition met | + +## Aggregation Strategies (Parallel) + +| Strategy | Behavior | +|----------|----------| +| `ALL_MUST_PASS` | All must succeed, any failure fails | +| `FIRST_WINS` | First non-exception result used | +| `MERGE` | Combine all dict results | + +## Permissions + +```python +from agent_ext.hooks.permissions import ToolDecision, ToolPermissionResult + +class ApprovalMiddleware(AgentMiddleware): + async def before_tool_call(self, ctx, tool_name, tool_args): + if tool_name == "delete_file": + return ToolPermissionResult( + decision=ToolDecision.ASK, + reason="Destructive operation requires approval", + ) + return tool_args +``` + +## Context System + +```python +from agent_ext.hooks.context import MiddlewareContext, HookType + +ctx = MiddlewareContext(config={"rate_limit": 100}) +scoped = ctx.for_hook(HookType.BEFORE_RUN) +scoped.set("user_intent", "question") + +# Later hook can read earlier data +later = ctx.for_hook(HookType.AFTER_RUN) +intent = later.get_from(HookType.BEFORE_RUN, "user_intent") +``` diff --git a/agent_ext/hooks/__init__.py b/agent_ext/hooks/__init__.py index e69de29..806916c 100644 --- a/agent_ext/hooks/__init__.py +++ b/agent_ext/hooks/__init__.py @@ -0,0 +1,81 @@ +"""Middleware / hooks system — async lifecycle hooks for AI agents.""" + +from .async_guardrail import AsyncGuardrailMiddleware +from .base import AgentMiddleware, Hook +from .builtins import ( + AuditHook, + ConditionalMiddleware, + ContentFilterFn, + ContentFilterHook, + PolicyHook, + make_blocklist_filter, +) +from .chain import HookChain, MiddlewareChain +from .context import ContextAccessError, HookType, MiddlewareContext, ScopedContext +from .cost_tracking import CostInfo, CostTrackingMiddleware, create_cost_tracking_middleware +from 
.decorators import middleware_from_functions +from .exceptions import ( + BlockedPrompt, + BlockedToolCall, + BudgetExceededError, + GuardrailTimeout, + InputBlocked, + MiddlewareConfigError, + MiddlewareError, + MiddlewareTimeout, + OutputBlocked, + ParallelExecutionFailed, + ToolBlocked, +) +from .parallel import ParallelMiddleware +from .permissions import PermissionHandler, ToolDecision, ToolPermissionResult +from .strategies import AggregationStrategy, GuardrailTiming + +__all__ = [ + # Base + "AgentMiddleware", + "Hook", + # Chain + "HookChain", + "MiddlewareChain", + # Context + "ContextAccessError", + "HookType", + "MiddlewareContext", + "ScopedContext", + # Cost tracking + "CostInfo", + "CostTrackingMiddleware", + "create_cost_tracking_middleware", + # Exceptions + "BlockedPrompt", + "BlockedToolCall", + "BudgetExceededError", + "GuardrailTimeout", + "InputBlocked", + "MiddlewareConfigError", + "MiddlewareError", + "MiddlewareTimeout", + "OutputBlocked", + "ParallelExecutionFailed", + "ToolBlocked", + # Parallel + Strategies + "AggregationStrategy", + "GuardrailTiming", + "ParallelMiddleware", + # Async guardrail + "AsyncGuardrailMiddleware", + # Decorators + "middleware_from_functions", + # Permissions + "PermissionHandler", + "ToolDecision", + "ToolPermissionResult", + # Builtins + "AuditHook", + "ConditionalMiddleware", + "ContentFilterFn", + "ContentFilterHook", + "PolicyHook", + "make_blocklist_filter", +] diff --git a/agent_ext/hooks/async_guardrail.py b/agent_ext/hooks/async_guardrail.py new file mode 100644 index 0000000..a42ee31 --- /dev/null +++ b/agent_ext/hooks/async_guardrail.py @@ -0,0 +1,125 @@ +"""Async guardrail middleware — run guardrails concurrently with LLM calls. + +When the guardrail detects a violation while the LLM is still generating, +the request is short-circuited to save time and API costs. 
+ +Timing modes: +- BLOCKING: traditional — guardrail completes before LLM starts +- CONCURRENT: guardrail and LLM run in parallel, fail-fast on violation +- ASYNC_POST: guardrail runs after LLM (monitoring only) +""" + +from __future__ import annotations + +import asyncio +import logging +from collections.abc import Sequence +from typing import Any + +from agent_ext.run_context import RunContext + +from .base import AgentMiddleware +from .exceptions import GuardrailTimeout, InputBlocked +from .strategies import GuardrailTiming + +logger = logging.getLogger(__name__) + + +class AsyncGuardrailMiddleware(AgentMiddleware): + """Run guardrails concurrently with LLM calls for improved latency. + + Example:: + + guardrail = AsyncGuardrailMiddleware( + guardrail=PolicyViolationDetector(), + timing=GuardrailTiming.CONCURRENT, + cancel_on_failure=True, + ) + """ + + def __init__( + self, + guardrail: AgentMiddleware, + timing: GuardrailTiming = GuardrailTiming.CONCURRENT, + cancel_on_failure: bool = True, + timeout: float | None = None, + name: str | None = None, + ) -> None: + self.guardrail = guardrail + self.timing = timing + self.cancel_on_failure = cancel_on_failure + self._timeout = timeout + self._name = name or f"AsyncGuardrail({type(guardrail).__name__})" + # State for concurrent execution + self._guardrail_task: asyncio.Task[Any] | None = None + self._guardrail_error: Exception | None = None + + @property + def name(self) -> str: + return self._name + + async def before_run(self, ctx: RunContext, prompt: str | Sequence[Any]) -> str | Sequence[Any]: + if self.timing == GuardrailTiming.BLOCKING: + # Traditional: guardrail must pass before LLM starts + return await self._run_guardrail_check(ctx, prompt) + elif self.timing == GuardrailTiming.CONCURRENT: + # Launch guardrail in background; it will raise if it fails + self._guardrail_error = None + self._guardrail_task = asyncio.create_task(self._run_guardrail_background(ctx, prompt)) + return prompt + else: + # 
ASYNC_POST: do nothing before run + return prompt + + async def after_run(self, ctx: RunContext, prompt: str | Sequence[Any], output: Any) -> Any: + if self.timing == GuardrailTiming.CONCURRENT: + # Wait for background guardrail to complete + if self._guardrail_task is not None: + try: + if self._timeout: + await asyncio.wait_for(self._guardrail_task, timeout=self._timeout) + else: + await self._guardrail_task + except TimeoutError as exc: + raise GuardrailTimeout(self._name, self._timeout or 0.0) from exc + except InputBlocked as e: + raise e from None # Re-raise — guardrail blocked the input + finally: + self._guardrail_task = None + + if self._guardrail_error is not None: + raise self._guardrail_error + + elif self.timing == GuardrailTiming.ASYNC_POST: + # Run guardrail after LLM, non-blocking (monitoring) + try: + await self._run_guardrail_check(ctx, prompt) + except InputBlocked as e: + logger.warning(f"Post-run guardrail violation: {e}") + # Don't raise — ASYNC_POST is non-blocking + + return output + + async def _run_guardrail_check(self, ctx: RunContext, prompt: str | Sequence[Any]) -> str | Sequence[Any]: + """Run guardrail synchronously (blocking).""" + if self._timeout: + try: + return await asyncio.wait_for( + self.guardrail.before_run(ctx, prompt), + timeout=self._timeout, + ) + except TimeoutError as exc: + raise GuardrailTimeout(self._name, self._timeout) from exc + return await self.guardrail.before_run(ctx, prompt) + + async def _run_guardrail_background(self, ctx: RunContext, prompt: str | Sequence[Any]) -> None: + """Run guardrail in background for concurrent mode.""" + try: + await self.guardrail.before_run(ctx, prompt) + except InputBlocked as e: + self._guardrail_error = e + if self.cancel_on_failure: + raise e from None # Propagate to cancel concurrent operations + except Exception as e: + self._guardrail_error = e + logger.error(f"Guardrail error: {e}") diff --git a/agent_ext/hooks/base.py b/agent_ext/hooks/base.py index 10caf2c..a38a745 
100644 --- a/agent_ext/hooks/base.py +++ b/agent_ext/hooks/base.py @@ -1,26 +1,123 @@ +"""Async middleware base class and legacy sync Protocol. + +The new ``AgentMiddleware`` ABC is the primary base: +- All hooks are async +- ``tool_names`` filter (None = all tools) +- ``timeout`` per hook +- Full lifecycle: before_run, after_run, before_model_request, + before_tool_call, after_tool_call, on_tool_error, on_error + +The old ``Hook`` sync Protocol is kept for backward-compat but users +should migrate to ``AgentMiddleware``. +""" + from __future__ import annotations -from typing import Any, Optional, Protocol + +from abc import ABC +from collections.abc import Sequence +from typing import Any, Protocol from agent_ext.run_context import RunContext, ToolCall, ToolResult +# Re-export exceptions so ``from agent_ext.hooks.base import BlockedToolCall`` +# still works. +from .exceptions import ( # noqa: F401 + BlockedPrompt, + BlockedToolCall, + BudgetExceededError, + GuardrailTimeout, + InputBlocked, + MiddlewareError, + MiddlewareTimeout, + OutputBlocked, + ToolBlocked, +) + +# --------------------------------------------------------------------------- +# New async ABC (parity with pydantic-ai-middleware) +# --------------------------------------------------------------------------- + + +class AgentMiddleware(ABC): # noqa: B024 + """Async middleware base class. + + Override only the hooks you need. ``before_*`` hooks run in order, + ``after_*`` hooks run in reverse order (onion model). + + Attributes: + tool_names: Set of tool names this middleware applies to. + ``None`` (default) means *all* tools. + timeout: Max seconds for any single hook call (``None`` = unlimited). 
+ """ + + tool_names: set[str] | None = None + timeout: float | None = None + + def _should_handle_tool(self, tool_name: str) -> bool: + if self.tool_names is None: + return True + return tool_name in self.tool_names + + # -- lifecycle hooks (all async) ---------------------------------------- + + async def before_run(self, ctx: RunContext, prompt: str | Sequence[Any]) -> str | Sequence[Any]: + """Called before the agent runs. May modify or block the prompt.""" + return prompt + + async def after_run(self, ctx: RunContext, prompt: str | Sequence[Any], output: Any) -> Any: + """Called after the agent finishes. May modify or block the output.""" + return output + + async def before_model_request(self, ctx: RunContext, messages: list[Any]) -> list[Any]: + """Called before each model request.""" + return messages + + async def before_tool_call( + self, + ctx: RunContext, + tool_name: str, + tool_args: dict[str, Any], + ) -> dict[str, Any]: + """Called before a tool is called. Return modified args or raise ``ToolBlocked``.""" + return tool_args + + async def after_tool_call( + self, + ctx: RunContext, + tool_name: str, + tool_args: dict[str, Any], + result: Any, + ) -> Any: + """Called after a tool returns. May modify the result.""" + return result + + async def on_tool_error( + self, + ctx: RunContext, + tool_name: str, + tool_args: dict[str, Any], + error: Exception, + ) -> Exception | None: + """Called when a tool raises. Return a replacement exception or ``None`` to re-raise.""" + return None + + async def on_error(self, ctx: RunContext, error: Exception) -> Exception | None: + """Called on any error. Return replacement or ``None`` to re-raise.""" + return None + + +# --------------------------------------------------------------------------- +# Legacy sync Protocol (backward-compat) +# --------------------------------------------------------------------------- + class Hook(Protocol): + """Sync hook protocol (legacy). 
Prefer ``AgentMiddleware`` for new code.""" + def before_run(self, ctx: RunContext) -> None: ... def after_run(self, ctx: RunContext, outcome: Any) -> Any: ... def before_model_request(self, ctx: RunContext, request: Any) -> Any: ... def after_model_response(self, ctx: RunContext, response: Any) -> Any: ... def before_tool_call(self, ctx: RunContext, call: ToolCall) -> ToolCall: ... def after_tool_result(self, ctx: RunContext, result: ToolResult) -> ToolResult: ... - def on_error(self, ctx: RunContext, err: Exception) -> Optional[Any]: ... - - -class BlockedToolCall(RuntimeError): - pass - - -class BlockedPrompt(RuntimeError): - """Raise from a content filter to block a request before it reaches the LLM.""" - def __init__(self, message: str, *, matched_rule: Optional[str] = None, details: Optional[Any] = None): - super().__init__(message) - self.matched_rule = matched_rule - self.details = details + def on_error(self, ctx: RunContext, err: Exception) -> Any | None: ... diff --git a/agent_ext/hooks/builtins.py b/agent_ext/hooks/builtins.py index a98d22c..cab4abf 100644 --- a/agent_ext/hooks/builtins.py +++ b/agent_ext/hooks/builtins.py @@ -1,84 +1,89 @@ +"""Built-in middleware implementations. + +All middleware are now async ``AgentMiddleware`` subclasses. +Legacy sync imports (``AuditHook``, ``PolicyHook``, ``ContentFilterHook``, +``make_blocklist_filter``) still work — they subclass both ``AgentMiddleware`` +and implement the old sync ``Hook`` interface for backward-compat. 
+""" + from __future__ import annotations + import re import time -from typing import Any, Callable, List, Literal, Optional, Sequence, Union +from collections.abc import Callable, Sequence +from typing import Any, Literal + +from agent_ext.run_context import RunContext -from .base import BlockedPrompt, BlockedToolCall, Hook -from agent_ext.run_context import RunContext, ToolCall, ToolResult +from .base import AgentMiddleware +from .exceptions import InputBlocked, ToolBlocked -# Content filter: (ctx, payload, phase) -> filtered payload. phase is "request" or "response". -# May raise BlockedPrompt to block the request before it reaches the LLM. +# Type alias for content filter functions ContentFilterFn = Callable[[RunContext, Any, Literal["request", "response"]], Any] -class AuditHook(Hook): - def before_run(self, ctx: RunContext) -> None: +# --------------------------------------------------------------------------- +# AuditHook (async middleware + legacy sync interface) +# --------------------------------------------------------------------------- + + +class AuditHook(AgentMiddleware): + """Logs lifecycle events: run start/end, model requests, tool calls.""" + + async def before_run(self, ctx: RunContext, prompt: str | Sequence[Any]) -> str | Sequence[Any]: ctx.tags["t0"] = time.time() ctx.logger.info("agent.run.start", case_id=ctx.case_id, session_id=ctx.session_id, trace_id=ctx.trace_id) + return prompt - def after_run(self, ctx: RunContext, outcome: Any) -> Any: + async def after_run(self, ctx: RunContext, prompt: str | Sequence[Any], output: Any) -> Any: dt = time.time() - float(ctx.tags.get("t0", time.time())) ctx.logger.info("agent.run.end", seconds=dt, trace_id=ctx.trace_id) - return outcome + return output - def before_model_request(self, ctx: RunContext, request: Any) -> Any: + async def before_model_request(self, ctx: RunContext, messages: list[Any]) -> list[Any]: ctx.logger.info("model.request", trace_id=ctx.trace_id) - return request - - def 
after_model_response(self, ctx: RunContext, response: Any) -> Any: - ctx.logger.info("model.response", trace_id=ctx.trace_id) - return response + return messages - def before_tool_call(self, ctx: RunContext, call: ToolCall) -> ToolCall: - ctx.logger.info("tool.call", name=call.name, trace_id=ctx.trace_id) - return call + async def before_tool_call(self, ctx: RunContext, tool_name: str, tool_args: dict[str, Any]) -> dict[str, Any]: + ctx.logger.info("tool.call", name=tool_name, trace_id=ctx.trace_id) + return tool_args - def after_tool_result(self, ctx: RunContext, result: ToolResult) -> ToolResult: - ctx.logger.info("tool.result", name=result.name, ok=result.ok, trace_id=ctx.trace_id) + async def after_tool_call(self, ctx: RunContext, tool_name: str, tool_args: dict[str, Any], result: Any) -> Any: + ctx.logger.info("tool.result", name=tool_name, trace_id=ctx.trace_id) return result - def on_error(self, ctx: RunContext, err: Exception) -> Optional[Any]: - ctx.logger.error("agent.error", error=str(err), trace_id=ctx.trace_id) + async def on_error(self, ctx: RunContext, error: Exception) -> Exception | None: + ctx.logger.error("agent.error", error=str(error), trace_id=ctx.trace_id) return None -class PolicyHook(Hook): - def before_run(self, ctx: RunContext) -> None: - return None - - def after_run(self, ctx: RunContext, outcome: Any) -> Any: - return outcome +# --------------------------------------------------------------------------- +# PolicyHook +# --------------------------------------------------------------------------- - def before_model_request(self, ctx: RunContext, request: Any) -> Any: - return request - def after_model_response(self, ctx: RunContext, response: Any) -> Any: - return response +class PolicyHook(AgentMiddleware): + """Enforces ``ctx.policy`` — blocks tools when ``allow_tools=False``.""" - def before_tool_call(self, ctx: RunContext, call: ToolCall) -> ToolCall: + async def before_tool_call(self, ctx: RunContext, tool_name: str, tool_args: 
dict[str, Any]) -> dict[str, Any]: if not ctx.policy.allow_tools: - raise BlockedToolCall(f"Tools are disabled by policy: {call.name}") - return call - - def after_tool_result(self, ctx: RunContext, result: ToolResult) -> ToolResult: - return result - - def on_error(self, ctx: RunContext, err: Exception) -> Optional[Any]: - return None + raise ToolBlocked(tool_name, "Tools are disabled by policy") + return tool_args -def _identity_filter(ctx: RunContext, payload: Any, phase: Literal["request", "response"]) -> Any: - return payload +# --------------------------------------------------------------------------- +# Content filtering +# --------------------------------------------------------------------------- def _default_extract_text(payload: Any, phase: Literal["request", "response"]) -> str: - """Best-effort extract of text from a request/response payload for blocklist checks.""" + """Best-effort text extraction from a request/response payload.""" if phase != "request": return "" if isinstance(payload, str): return payload if isinstance(payload, list): - parts: List[str] = [] + parts: list[str] = [] for msg in payload: if isinstance(msg, str): parts.append(msg) @@ -104,17 +109,17 @@ def _default_extract_text(payload: Any, phase: Literal["request", "response"]) - def make_blocklist_filter( - patterns: Sequence[Union[str, re.Pattern]], + patterns: Sequence[str | re.Pattern[str]], *, - extract_text: Optional[Callable[[Any, Literal["request", "response"]], str]] = None, + extract_text: Callable[[Any, Literal["request", "response"]], str] | None = None, reason: str = "Request blocked by policy", ) -> ContentFilterFn: - """ - Build a content filter that blocks requests whose text matches any pattern. - Raises BlockedPrompt so the request never reaches the LLM. Use in ContentFilterHook. + """Build a content filter that blocks requests matching any pattern. + + Raises ``InputBlocked`` so the request never reaches the LLM. 
""" extract = extract_text or _default_extract_text - compiled: List[re.Pattern] = [] + compiled: list[re.Pattern[str]] = [] for p in patterns: if isinstance(p, re.Pattern): compiled.append(p) @@ -127,7 +132,7 @@ def filter_fn(ctx: RunContext, payload: Any, phase: Literal["request", "response text = extract(payload, phase) for pat in compiled: if pat.search(text): - raise BlockedPrompt( + raise InputBlocked( reason, matched_rule=pat.pattern if hasattr(pat, "pattern") else str(pat), details={"phase": phase}, @@ -137,39 +142,112 @@ def filter_fn(ctx: RunContext, payload: Any, phase: Literal["request", "response return filter_fn -class ContentFilterHook(Hook): - """ - Middleware hook for content filtering / redaction on model request and response. - Uses ctx.policy.redaction_level: when "none", payloads pass through; otherwise - the filter_fn is applied. Supply your own filter (e.g. PII redaction, topic blocklist, - or moderation API) via the filter_fn constructor argument. - Your filter_fn may raise BlockedPrompt to block the request before it reaches the LLM; - the runner should catch BlockedPrompt and not call the model (e.g. return a safe message). - """ - def __init__(self, filter_fn: Optional[ContentFilterFn] = None) -> None: - self.filter_fn = filter_fn or _identity_filter +class ContentFilterHook(AgentMiddleware): + """Content filtering / redaction middleware. - def before_run(self, ctx: RunContext) -> None: - return None + Runs ``filter_fn`` on every ``before_model_request`` (always) and on + ``after_run`` when ``ctx.policy.redaction_level`` is not ``"none"``. + + ``filter_fn`` may raise ``InputBlocked`` to block the request. 
+ """ - def after_run(self, ctx: RunContext, outcome: Any) -> Any: - return outcome + def __init__(self, filter_fn: ContentFilterFn | None = None) -> None: + self.filter_fn = filter_fn or (lambda ctx, payload, phase: payload) - def before_model_request(self, ctx: RunContext, request: Any) -> Any: - # Always run request filter so blocking (BlockedPrompt) works even when redaction_level is "none" - return self.filter_fn(ctx, request, "request") + async def before_model_request(self, ctx: RunContext, messages: list[Any]) -> list[Any]: + return self.filter_fn(ctx, messages, "request") - def after_model_response(self, ctx: RunContext, response: Any) -> Any: - # Only redact response when policy requests it + async def after_run(self, ctx: RunContext, prompt: str | Sequence[Any], output: Any) -> Any: if ctx.policy.redaction_level == "none": - return response - return self.filter_fn(ctx, response, "response") + return output + return self.filter_fn(ctx, output, "response") + + +# --------------------------------------------------------------------------- +# Conditional middleware +# --------------------------------------------------------------------------- + - def before_tool_call(self, ctx: RunContext, call: ToolCall) -> ToolCall: - return call +class ConditionalMiddleware(AgentMiddleware): + """Route to different middleware based on a runtime condition. - def after_tool_result(self, ctx: RunContext, result: ToolResult) -> ToolResult: + Supports single middleware or when_true/when_false branching. 
+ + Example (simple):: + + cond = ConditionalMiddleware( + PII_Filter(), + condition=lambda ctx: ctx.policy.redaction_level != "none", + ) + + Example (branching):: + + cond = ConditionalMiddleware( + condition=lambda ctx: ctx.policy.allow_exec, + when_true=FullAccessMiddleware(), + when_false=ReadOnlyMiddleware(), + ) + """ + + def __init__( + self, + inner: AgentMiddleware | None = None, + condition: Callable[[RunContext], bool] | None = None, + *, + when_true: AgentMiddleware | list[AgentMiddleware] | None = None, + when_false: AgentMiddleware | list[AgentMiddleware] | None = None, + ) -> None: + if condition is None: + raise ValueError("condition is required") + self.condition = condition + # Normalize: inner → when_true (backward compat) + if inner is not None and when_true is None: + when_true = inner + self.when_true: list[AgentMiddleware] = ( + [when_true] if isinstance(when_true, AgentMiddleware) else list(when_true or []) + ) + self.when_false: list[AgentMiddleware] = ( + [when_false] if isinstance(when_false, AgentMiddleware) else list(when_false or []) + ) + + def _select(self, ctx: RunContext) -> list[AgentMiddleware]: + return self.when_true if self.condition(ctx) else self.when_false + + async def before_run(self, ctx, prompt): + for mw in self._select(ctx): + prompt = await mw.before_run(ctx, prompt) + return prompt + + async def after_run(self, ctx, prompt, output): + for mw in reversed(self._select(ctx)): + output = await mw.after_run(ctx, prompt, output) + return output + + async def before_model_request(self, ctx, messages): + for mw in self._select(ctx): + messages = await mw.before_model_request(ctx, messages) + return messages + + async def before_tool_call(self, ctx, tool_name, tool_args): + for mw in self._select(ctx): + tool_args = await mw.before_tool_call(ctx, tool_name, tool_args) + return tool_args + + async def after_tool_call(self, ctx, tool_name, tool_args, result): + for mw in reversed(self._select(ctx)): + result = await 
mw.after_tool_call(ctx, tool_name, tool_args, result) return result - def on_error(self, ctx: RunContext, err: Exception) -> Optional[Any]: + async def on_tool_error(self, ctx, tool_name, tool_args, error): + for mw in self._select(ctx): + handled = await mw.on_tool_error(ctx, tool_name, tool_args, error) + if handled is not None: + return handled + return None + + async def on_error(self, ctx, error): + for mw in self._select(ctx): + handled = await mw.on_error(ctx, error) + if handled is not None: + return handled return None diff --git a/agent_ext/hooks/chain.py b/agent_ext/hooks/chain.py index fdeb82a..f9b2e5c 100644 --- a/agent_ext/hooks/chain.py +++ b/agent_ext/hooks/chain.py @@ -1,12 +1,284 @@ +"""Composable async middleware chains. + +``MiddlewareChain`` groups multiple ``AgentMiddleware`` instances into a +reusable unit. ``before_*`` hooks run in order, ``after_*`` / ``on_*`` +hooks run in reverse order (onion model). Chains can be nested — +adding a chain flattens it. + +The legacy sync ``HookChain`` is preserved at the bottom of this file. 
+""" + from __future__ import annotations -from typing import Any, List, Optional -from .base import Hook +import asyncio +from collections.abc import Iterator, Sequence +from typing import Any, overload + from agent_ext.run_context import RunContext, ToolCall, ToolResult +from .base import AgentMiddleware, Hook +from .exceptions import MiddlewareTimeout +from .permissions import ToolPermissionResult + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _flatten( + items: Sequence[AgentMiddleware | MiddlewareChain], +) -> list[AgentMiddleware]: + flat: list[AgentMiddleware] = [] + for item in items: + if isinstance(item, MiddlewareChain): + flat.extend(item._middleware) + elif isinstance(item, AgentMiddleware): + flat.append(item) + else: + raise TypeError(f"Expected AgentMiddleware or MiddlewareChain, got {type(item).__name__}") + return flat + + +async def _with_timeout(coro, timeout: float | None, mw_name: str, hook_name: str): + """Run *coro* with optional timeout, raising ``MiddlewareTimeout`` on expiry.""" + if timeout is None: + return await coro + try: + return await asyncio.wait_for(coro, timeout=timeout) + except TimeoutError as e: + raise MiddlewareTimeout(mw_name, timeout, hook_name) from e + + +# --------------------------------------------------------------------------- +# Async MiddlewareChain (parity with pydantic-ai-middleware) +# --------------------------------------------------------------------------- + + +class MiddlewareChain(AgentMiddleware): + """A composable, ordered chain of async middleware. + + Supports ``add``, ``insert``, ``remove``, ``replace``, ``pop``, + ``clear``, ``copy``, ``+``, ``+=``, indexing, iteration, and ``len``. 
+ """ + + def __init__( + self, + middleware: Sequence[AgentMiddleware | MiddlewareChain] | None = None, + *, + name: str | None = None, + ) -> None: + self._middleware: list[AgentMiddleware] = _flatten(middleware or []) + self._name = name or f"MiddlewareChain({len(self._middleware)})" + + @property + def name(self) -> str: + return self._name + + @property + def middleware(self) -> list[AgentMiddleware]: + return list(self._middleware) + + # -- mutators ----------------------------------------------------------- + + def add(self, mw: AgentMiddleware | MiddlewareChain) -> MiddlewareChain: + if isinstance(mw, MiddlewareChain): + self._middleware.extend(mw._middleware) + elif isinstance(mw, AgentMiddleware): + self._middleware.append(mw) + else: + raise TypeError(f"Expected AgentMiddleware or MiddlewareChain, got {type(mw).__name__}") + return self + + def insert(self, index: int, mw: AgentMiddleware | MiddlewareChain) -> MiddlewareChain: + if isinstance(mw, MiddlewareChain): + self._middleware[index:index] = mw._middleware + elif isinstance(mw, AgentMiddleware): + self._middleware.insert(index, mw) + else: + raise TypeError(f"Expected AgentMiddleware or MiddlewareChain, got {type(mw).__name__}") + return self + + def remove(self, mw: AgentMiddleware) -> MiddlewareChain: + self._middleware.remove(mw) + return self + + def pop(self, index: int = -1) -> AgentMiddleware: + return self._middleware.pop(index) + + def replace(self, old: AgentMiddleware, new: AgentMiddleware | MiddlewareChain) -> MiddlewareChain: + idx = self._middleware.index(old) + if isinstance(new, MiddlewareChain): + self._middleware[idx : idx + 1] = new._middleware + elif isinstance(new, AgentMiddleware): + self._middleware[idx] = new + else: + raise TypeError(f"Expected AgentMiddleware or MiddlewareChain, got {type(new).__name__}") + return self + + def clear(self) -> MiddlewareChain: + self._middleware.clear() + return self + + def copy(self) -> MiddlewareChain: + return 
MiddlewareChain(list(self._middleware), name=self._name) + + # -- dunder ------------------------------------------------------------- + + def __add__(self, other: AgentMiddleware | MiddlewareChain) -> MiddlewareChain: + if isinstance(other, MiddlewareChain): + return MiddlewareChain([*self._middleware, *other._middleware]) + if isinstance(other, AgentMiddleware): + return MiddlewareChain([*self._middleware, other]) + return NotImplemented + + def __iadd__(self, other: AgentMiddleware | MiddlewareChain) -> MiddlewareChain: + return self.add(other) + + def __len__(self) -> int: + return len(self._middleware) + + def __bool__(self) -> bool: + return bool(self._middleware) + + @overload + def __getitem__(self, index: int) -> AgentMiddleware: ... + @overload + def __getitem__(self, index: slice) -> list[AgentMiddleware]: ... + def __getitem__(self, index): + return self._middleware[index] + + def __iter__(self) -> Iterator[AgentMiddleware]: + return iter(self._middleware) + + def __contains__(self, item: object) -> bool: + return item in self._middleware + + def __repr__(self) -> str: + return f"MiddlewareChain({self._middleware!r})" + + def __str__(self) -> str: + if not self._middleware: + return f"{self.name} (empty)" + flow = " → ".join(type(mw).__name__ for mw in self._middleware) + return f"{self.name}: {flow}" + + # -- hook dispatch (async) ---------------------------------------------- + + async def before_run(self, ctx: RunContext, prompt: str | Sequence[Any]) -> str | Sequence[Any]: + current = prompt + for mw in self._middleware: + current = await _with_timeout( + mw.before_run(ctx, current), + mw.timeout, + type(mw).__name__, + "before_run", + ) + return current + + async def after_run(self, ctx: RunContext, prompt: str | Sequence[Any], output: Any) -> Any: + current = output + for mw in reversed(self._middleware): + current = await _with_timeout( + mw.after_run(ctx, prompt, current), + mw.timeout, + type(mw).__name__, + "after_run", + ) + return current + 
+ async def before_model_request(self, ctx: RunContext, messages: list[Any]) -> list[Any]: + current = messages + for mw in self._middleware: + current = await _with_timeout( + mw.before_model_request(ctx, current), + mw.timeout, + type(mw).__name__, + "before_model_request", + ) + return current + + async def before_tool_call( + self, + ctx: RunContext, + tool_name: str, + tool_args: dict[str, Any], + ) -> dict[str, Any] | ToolPermissionResult: + current_args = tool_args + for mw in self._middleware: + if not mw._should_handle_tool(tool_name): + continue + result = await _with_timeout( + mw.before_tool_call(ctx, tool_name, current_args), + mw.timeout, + type(mw).__name__, + "before_tool_call", + ) + if isinstance(result, ToolPermissionResult): + return result # short-circuit + current_args = result + return current_args + + async def after_tool_call( + self, + ctx: RunContext, + tool_name: str, + tool_args: dict[str, Any], + result: Any, + ) -> Any: + current = result + for mw in reversed(self._middleware): + if not mw._should_handle_tool(tool_name): + continue + current = await _with_timeout( + mw.after_tool_call(ctx, tool_name, tool_args, current), + mw.timeout, + type(mw).__name__, + "after_tool_call", + ) + return current + + async def on_tool_error( + self, + ctx: RunContext, + tool_name: str, + tool_args: dict[str, Any], + error: Exception, + ) -> Exception | None: + for mw in self._middleware: + if not mw._should_handle_tool(tool_name): + continue + handled = await _with_timeout( + mw.on_tool_error(ctx, tool_name, tool_args, error), + mw.timeout, + type(mw).__name__, + "on_tool_error", + ) + if handled is not None: + return handled + return None + + async def on_error(self, ctx: RunContext, error: Exception) -> Exception | None: + for mw in self._middleware: + handled = await _with_timeout( + mw.on_error(ctx, error), + mw.timeout, + type(mw).__name__, + "on_error", + ) + if handled is not None: + return handled + return None + + +# 
--------------------------------------------------------------------------- +# Legacy sync HookChain (backward-compat with the old Hook Protocol) +# --------------------------------------------------------------------------- + class HookChain: - def __init__(self, hooks: List[Hook]): + """Sync hook chain (legacy). Prefer ``MiddlewareChain`` for new code.""" + + def __init__(self, hooks: list[Hook]): self.hooks = hooks def before_run(self, ctx: RunContext) -> None: @@ -38,7 +310,7 @@ def after_tool_result(self, ctx: RunContext, result: ToolResult) -> ToolResult: result = h.after_tool_result(ctx, result) return result - def on_error(self, ctx: RunContext, err: Exception) -> Optional[Any]: + def on_error(self, ctx: RunContext, err: Exception) -> Any | None: for h in reversed(self.hooks): maybe = h.on_error(ctx, err) if maybe is not None: diff --git a/agent_ext/hooks/context.py b/agent_ext/hooks/context.py new file mode 100644 index 0000000..8ce725e --- /dev/null +++ b/agent_ext/hooks/context.py @@ -0,0 +1,188 @@ +"""Middleware context for sharing data across the middleware execution chain. + +Provides a context system with strict access controls: +- Each hook can only *write* to its own namespace +- Each hook can only *read* from earlier hooks in the execution chain +- ``on_error`` / ``on_tool_error`` can read everything +""" + +from __future__ import annotations + +from collections.abc import Mapping +from enum import IntEnum +from typing import Any + + +class HookType(IntEnum): + """Execution order of middleware hooks. + + The integer value represents execution order. A hook can only read + data from hooks with *lower* values (earlier in the chain). 
+ """ + + BEFORE_RUN = 1 + BEFORE_MODEL_REQUEST = 2 + BEFORE_TOOL_CALL = 3 + ON_TOOL_ERROR = 4 + AFTER_TOOL_CALL = 5 + AFTER_RUN = 6 + ON_ERROR = 7 # special: can read all + + +class ContextAccessError(Exception): + """Raised when middleware attempts unauthorized context access.""" + + +class ScopedContext: + """A scoped view of ``MiddlewareContext`` for a specific hook. + + Enforces access control: + - Can only *write* to the current hook's namespace + - Can only *read* from the current and earlier hooks' namespaces + """ + + def __init__(self, parent: MiddlewareContext, current_hook: HookType) -> None: + self._parent = parent + self._current_hook = current_hook + + # -- read-only global state --------------------------------------------- + + @property + def config(self) -> Mapping[str, Any]: + """Read-only access to global configuration.""" + return self._parent.config + + @property + def metadata(self) -> Mapping[str, Any]: + """Read-only access to execution metadata.""" + return self._parent.metadata + + @property + def current_hook(self) -> HookType: + return self._current_hook + + # -- access control ----------------------------------------------------- + + def _can_read(self, hook: HookType) -> bool: + if self._current_hook in (HookType.ON_ERROR, HookType.ON_TOOL_ERROR): + return True + return hook <= self._current_hook + + # -- write (only to own namespace) -------------------------------------- + + def set(self, key: str, value: Any) -> None: + """Store *key* → *value* in the current hook's namespace.""" + self._parent._set_hook_data(self._current_hook, key, value) + + # -- read --------------------------------------------------------------- + + def get(self, key: str, default: Any = None) -> Any: + """Get a value from the current hook's namespace.""" + return self.get_from(self._current_hook, key, default) + + def get_from(self, hook: HookType, key: str, default: Any = None) -> Any: + """Get a value from another hook's namespace (respecting access 
control).""" + if not self._can_read(hook): + raise ContextAccessError( + f"Hook '{self._current_hook.name}' cannot read from '{hook.name}' (later in execution chain)" + ) + return self._parent._get_hook_data(hook, key, default) + + def get_all_from(self, hook: HookType) -> Mapping[str, Any]: + """Get all data from a hook's namespace.""" + if not self._can_read(hook): + raise ContextAccessError( + f"Hook '{self._current_hook.name}' cannot read from '{hook.name}' (later in execution chain)" + ) + return self._parent._get_all_hook_data(hook) + + def has_key(self, key: str) -> bool: + return self.has_key_in(self._current_hook, key) + + def has_key_in(self, hook: HookType, key: str) -> bool: + if not self._can_read(hook): + raise ContextAccessError( + f"Hook '{self._current_hook.name}' cannot read from '{hook.name}' (later in execution chain)" + ) + return self._parent._has_hook_key(hook, key) + + +class MiddlewareContext: + """Context object for sharing data across the middleware chain. + + Provides: + - Immutable global ``config`` + - Mutable ``metadata`` (timestamps, usage, etc.) 
+ - Per-hook namespaced storage with access control via ``ScopedContext`` + + Example:: + + ctx = MiddlewareContext(config={"rate_limit": 100}) + scoped = ctx.for_hook(HookType.BEFORE_RUN) + scoped.set("user_intent", "question") + + later = ctx.for_hook(HookType.AFTER_RUN) + intent = later.get_from(HookType.BEFORE_RUN, "user_intent") # OK + """ + + def __init__( + self, + config: dict[str, Any] | None = None, + metadata: dict[str, Any] | None = None, + ) -> None: + self._config: dict[str, Any] = dict(config) if config else {} + self._metadata: dict[str, Any] = dict(metadata) if metadata else {} + self._hook_data: dict[HookType, dict[str, Any]] = {h: {} for h in HookType} + + # -- public read-only --------------------------------------------------- + + @property + def config(self) -> Mapping[str, Any]: + return self._config + + @property + def metadata(self) -> Mapping[str, Any]: + return self._metadata + + def set_metadata(self, key: str, value: Any) -> None: + """Set metadata (internal use, e.g. 
by MiddlewareAgent).""" + self._metadata[key] = value + + # -- scoped access ------------------------------------------------------ + + def for_hook(self, hook: HookType) -> ScopedContext: + """Get a ``ScopedContext`` for the given hook.""" + return ScopedContext(self, hook) + + # -- internals ---------------------------------------------------------- + + def _set_hook_data(self, hook: HookType, key: str, value: Any) -> None: + self._hook_data[hook][key] = value + + def _get_hook_data(self, hook: HookType, key: str, default: Any = None) -> Any: + return self._hook_data[hook].get(key, default) + + def _get_all_hook_data(self, hook: HookType) -> Mapping[str, Any]: + return self._hook_data[hook] + + def _has_hook_key(self, hook: HookType, key: str) -> bool: + return key in self._hook_data[hook] + + # -- cloning for parallel execution ------------------------------------- + + def clone(self) -> MiddlewareContext: + """Shallow clone for parallel middleware (prevents race conditions).""" + new = MiddlewareContext(config=dict(self._config), metadata=dict(self._metadata)) + for hook, data in self._hook_data.items(): + new._hook_data[hook] = dict(data) + return new + + def merge_from(self, other: MiddlewareContext, hook: HookType) -> None: + """Merge data from *other*'s hook namespace into ours.""" + self._hook_data[hook].update(other._hook_data[hook]) + + def reset(self) -> None: + """Clear per-run state (metadata + hook data), keep config.""" + self._metadata.clear() + for hook in HookType: + self._hook_data[hook].clear() diff --git a/agent_ext/hooks/cost_tracking.py b/agent_ext/hooks/cost_tracking.py new file mode 100644 index 0000000..13277e5 --- /dev/null +++ b/agent_ext/hooks/cost_tracking.py @@ -0,0 +1,180 @@ +"""Cost tracking middleware — automatic token usage and USD cost monitoring. + +Tracks token usage across agent runs, calculates costs, supports callbacks +for real-time UI updates, and enforces budget limits. 
"""Cost tracking middleware — automatic token usage and USD cost monitoring.

Tracks token usage across agent runs, calculates costs, supports callbacks
for real-time UI updates, and enforces budget limits.
"""

from __future__ import annotations

import inspect
from collections.abc import Callable, Sequence
from dataclasses import dataclass
from typing import Any, Union

from agent_ext.run_context import RunContext

from .base import AgentMiddleware
from .exceptions import BudgetExceededError

# Callback fired after each run with the latest CostInfo; sync or async.
CostCallback = Union[Callable[["CostInfo"], Any], None]


@dataclass
class CostInfo:
    """Per-run and cumulative cost information.

    Attributes:
        run_cost_usd: USD cost of this run (``None`` if costing is not configured).
        total_cost_usd: Cumulative USD cost (``None`` if costing is not configured).
        run_request_tokens: Input tokens this run.
        run_response_tokens: Output tokens this run.
        total_request_tokens: Cumulative input tokens.
        total_response_tokens: Cumulative output tokens.
        run_count: Number of completed runs.
    """

    run_cost_usd: float | None
    total_cost_usd: float | None
    run_request_tokens: int
    run_response_tokens: int
    total_request_tokens: int
    total_response_tokens: int
    run_count: int


class CostTrackingMiddleware(AgentMiddleware):
    """Middleware that accumulates token counts and USD cost.

    Args:
        model_name: Model id for cost calculation (e.g. ``"openai:gpt-4o"``).
            ``None`` disables price-list costing (tokens still tracked).
        budget_limit_usd: Max cumulative USD (``None`` = unlimited).
        on_cost_update: Callback after each run with a ``CostInfo``.
            Accepts sync or async callables.
        cost_per_1k_input: Manual $/1k input tokens (used when ``model_name``
            is None or the price lookup fails).
        cost_per_1k_output: Manual $/1k output tokens.
    """

    def __init__(
        self,
        model_name: str | None = None,
        budget_limit_usd: float | None = None,
        on_cost_update: CostCallback = None,
        cost_per_1k_input: float = 0.0,
        cost_per_1k_output: float = 0.0,
    ) -> None:
        self.model_name = model_name
        self.budget_limit_usd = budget_limit_usd
        self.on_cost_update = on_cost_update
        self.cost_per_1k_input = cost_per_1k_input
        self.cost_per_1k_output = cost_per_1k_output

        # Cumulative state across runs (reset via reset()).
        self._total_request_tokens: int = 0
        self._total_response_tokens: int = 0
        self._total_cost_usd: float = 0.0
        self._run_count: int = 0

    @property
    def total_cost(self) -> float:
        return self._total_cost_usd

    @property
    def total_request_tokens(self) -> int:
        return self._total_request_tokens

    @property
    def total_response_tokens(self) -> int:
        return self._total_response_tokens

    @property
    def run_count(self) -> int:
        return self._run_count

    def _costing_configured(self) -> bool:
        """True when any pricing source (model id or manual rate) is set."""
        return bool(self.model_name or self.cost_per_1k_input or self.cost_per_1k_output)

    def reset(self) -> None:
        """Clear all accumulated totals."""
        self._total_request_tokens = 0
        self._total_response_tokens = 0
        self._total_cost_usd = 0.0
        self._run_count = 0

    async def before_run(self, ctx: RunContext, prompt: str | Sequence[Any]) -> str | Sequence[Any]:
        """Refuse to start a run once the cumulative cost reaches the budget."""
        if self.budget_limit_usd is not None and self._total_cost_usd >= self.budget_limit_usd:
            raise BudgetExceededError(self._total_cost_usd, self.budget_limit_usd)
        return prompt

    async def after_run(self, ctx: RunContext, prompt: str | Sequence[Any], output: Any) -> Any:
        """Accumulate this run's usage and notify the callback."""
        # Extract usage from ctx.tags (set by the agent runner).
        run_req = int(ctx.tags.get("run_request_tokens", 0))
        run_resp = int(ctx.tags.get("run_response_tokens", 0))

        self._total_request_tokens += run_req
        self._total_response_tokens += run_resp
        self._run_count += 1

        run_cost = self._calc_cost(run_req, run_resp)
        if run_cost is not None:
            self._total_cost_usd += run_cost

        # BUGFIX: previously only model_name / cost_per_1k_input enabled the
        # total; configuring ONLY cost_per_1k_output accumulated cost but
        # always reported total_cost_usd=None.
        total_cost = self._total_cost_usd if self._costing_configured() else None

        info = CostInfo(
            run_cost_usd=run_cost,
            total_cost_usd=total_cost,
            run_request_tokens=run_req,
            run_response_tokens=run_resp,
            total_request_tokens=self._total_request_tokens,
            total_response_tokens=self._total_response_tokens,
            run_count=self._run_count,
        )
        await self._notify(info)
        return output

    def _calc_cost(self, input_tokens: int, output_tokens: int) -> float | None:
        """Calculate USD cost. Uses genai-prices when available, else manual rates.

        Returns ``None`` when no pricing source applies.
        """
        if self.model_name:
            try:
                from genai_prices import calc_price  # type: ignore[import-untyped]

                provider_id: str | None = None
                model_ref = self.model_name
                if ":" in self.model_name:
                    parts = self.model_name.split(":", 1)
                    provider_id, model_ref = parts[0], parts[1]

                # Build a minimal usage-like object for calc_price.
                class _Usage:
                    def __init__(self, inp, out):
                        self.input_tokens = inp
                        self.output_tokens = out

                result = calc_price(_Usage(input_tokens, output_tokens), model_ref, provider_id=provider_id)
                return float(result.total_price)
            except Exception:
                # Best-effort: genai-prices missing or unknown model —
                # fall through to the manual per-1k rates.
                pass

        if self.cost_per_1k_input or self.cost_per_1k_output:
            return (input_tokens / 1000.0) * self.cost_per_1k_input + (output_tokens / 1000.0) * self.cost_per_1k_output
        return None

    async def _notify(self, info: CostInfo) -> None:
        """Invoke the cost callback, awaiting it when it returns an awaitable."""
        if self.on_cost_update is None:
            return
        result = self.on_cost_update(info)
        if inspect.isawaitable(result):
            await result


def create_cost_tracking_middleware(
    model_name: str | None = None,
    budget_limit_usd: float | None = None,
    on_cost_update: CostCallback = None,
    cost_per_1k_input: float = 0.0,
    cost_per_1k_output: float = 0.0,
) -> CostTrackingMiddleware:
    """Convenience factory."""
    return CostTrackingMiddleware(
        model_name=model_name,
        budget_limit_usd=budget_limit_usd,
        on_cost_update=on_cost_update,
        cost_per_1k_input=cost_per_1k_input,
        cost_per_1k_output=cost_per_1k_output,
    )
"""Decorator-based middleware for simple use cases.

Build middleware from plain async functions instead of subclassing.

Example::

    from agent_ext.hooks.decorators import middleware_from_functions

    async def log_prompt(ctx, prompt):
        print(f"Prompt: {prompt}")
        return prompt

    async def sanitize_output(ctx, prompt, output):
        return output.replace("SSN:", "[REDACTED]")

    mw = middleware_from_functions(before_run=log_prompt, after_run=sanitize_output)
"""

from __future__ import annotations

from collections.abc import Callable

from .base import AgentMiddleware


class _FunctionMiddleware(AgentMiddleware):
    """Adapter that forwards each hook to an optional user-supplied function.

    Hooks with no function configured behave as pass-throughs (or return
    ``None`` for the error hooks).
    """

    def __init__(
        self,
        *,
        before_run_fn: Callable | None = None,
        after_run_fn: Callable | None = None,
        before_model_request_fn: Callable | None = None,
        before_tool_call_fn: Callable | None = None,
        after_tool_call_fn: Callable | None = None,
        on_tool_error_fn: Callable | None = None,
        on_error_fn: Callable | None = None,
        tool_names: set[str] | None = None,
    ):
        self._before_run_fn = before_run_fn
        self._after_run_fn = after_run_fn
        self._before_model_request_fn = before_model_request_fn
        self._before_tool_call_fn = before_tool_call_fn
        self._after_tool_call_fn = after_tool_call_fn
        self._on_tool_error_fn = on_tool_error_fn
        self._on_error_fn = on_error_fn
        # Only override the class-level default when a scope was given.
        if tool_names is not None:
            self.tool_names = tool_names

    async def before_run(self, ctx, prompt):
        fn = self._before_run_fn
        if not fn:
            return prompt
        return await fn(ctx, prompt)

    async def after_run(self, ctx, prompt, output):
        fn = self._after_run_fn
        if not fn:
            return output
        return await fn(ctx, prompt, output)

    async def before_model_request(self, ctx, messages):
        fn = self._before_model_request_fn
        if not fn:
            return messages
        return await fn(ctx, messages)

    async def before_tool_call(self, ctx, tool_name, tool_args):
        fn = self._before_tool_call_fn
        if not fn:
            return tool_args
        return await fn(ctx, tool_name, tool_args)

    async def after_tool_call(self, ctx, tool_name, tool_args, result):
        fn = self._after_tool_call_fn
        if not fn:
            return result
        return await fn(ctx, tool_name, tool_args, result)

    async def on_tool_error(self, ctx, tool_name, tool_args, error):
        fn = self._on_tool_error_fn
        if not fn:
            return None
        return await fn(ctx, tool_name, tool_args, error)

    async def on_error(self, ctx, error):
        fn = self._on_error_fn
        if not fn:
            return None
        return await fn(ctx, error)


def middleware_from_functions(
    *,
    before_run: Callable | None = None,
    after_run: Callable | None = None,
    before_model_request: Callable | None = None,
    before_tool_call: Callable | None = None,
    after_tool_call: Callable | None = None,
    on_tool_error: Callable | None = None,
    on_error: Callable | None = None,
    tool_names: set[str] | None = None,
) -> AgentMiddleware:
    """Create middleware from individual async functions.

    Each function receives the same args as the corresponding
    ``AgentMiddleware`` hook method.
    """
    return _FunctionMiddleware(
        before_run_fn=before_run,
        after_run_fn=after_run,
        before_model_request_fn=before_model_request,
        before_tool_call_fn=before_tool_call,
        after_tool_call_fn=after_tool_call,
        on_tool_error_fn=on_tool_error,
        on_error_fn=on_error,
        tool_names=tool_names,
    )
+""" + +from __future__ import annotations + +from typing import Any + + +class MiddlewareError(Exception): + """Base exception for all middleware errors.""" + + +class MiddlewareConfigError(MiddlewareError): + """Raised when middleware configuration is invalid.""" + + +# --------------------------------------------------------------------------- +# Blocking +# --------------------------------------------------------------------------- + + +class InputBlocked(MiddlewareError): + """Raised by *before_run* or *before_model_request* to block a prompt.""" + + def __init__(self, reason: str = "Input blocked", *, matched_rule: str | None = None, details: Any = None): + self.reason = reason + self.matched_rule = matched_rule + self.details = details + super().__init__(reason) + + +class ToolBlocked(MiddlewareError): + """Raised when a tool call is blocked by middleware.""" + + def __init__(self, tool_name: str, reason: str = "Tool blocked"): + self.tool_name = tool_name + self.reason = reason + super().__init__(f"Tool '{tool_name}' blocked: {reason}") + + +class OutputBlocked(MiddlewareError): + """Raised by *after_run* to block an agent output.""" + + def __init__(self, reason: str = "Output blocked"): + self.reason = reason + super().__init__(reason) + + +# --------------------------------------------------------------------------- +# Budget / cost +# --------------------------------------------------------------------------- + + +class BudgetExceededError(MiddlewareError): + """Raised when accumulated cost exceeds the configured budget limit.""" + + def __init__(self, cost: float, budget: float): + self.cost = cost + self.budget = budget + super().__init__(f"Budget exceeded: ${cost:.4f} >= ${budget:.4f} limit") + + +# --------------------------------------------------------------------------- +# Timeouts +# --------------------------------------------------------------------------- + + +class MiddlewareTimeout(MiddlewareError): + """Raised when a middleware hook 
exceeds its configured timeout.""" + + def __init__(self, middleware_name: str, timeout: float, hook_name: str = ""): + self.middleware_name = middleware_name + self.timeout = timeout + self.hook_name = hook_name + detail = f" in {hook_name}" if hook_name else "" + super().__init__(f"Middleware '{middleware_name}' timed out{detail} after {timeout:.2f}s") + + +class GuardrailTimeout(MiddlewareError): + """Raised when an async guardrail times out.""" + + def __init__(self, guardrail_name: str, timeout: float): + self.guardrail_name = guardrail_name + self.timeout = timeout + super().__init__(f"Guardrail '{guardrail_name}' timed out after {timeout:.2f}s") + + +# --------------------------------------------------------------------------- +# Parallel execution +# --------------------------------------------------------------------------- + + +class ParallelExecutionFailed(MiddlewareError): + """Raised when parallel middleware execution fails.""" + + def __init__( + self, + errors: list[Exception], + results: list[Any] | None = None, + message: str = "Parallel middleware execution failed", + ): + self.errors = errors + self.results = results or [] + self.failed_count = len(errors) + self.success_count = len(self.results) + super().__init__(f"{message}: {self.failed_count} failed, {self.success_count} succeeded") + + +# --------------------------------------------------------------------------- +# Backward-compat aliases (old names used in codebase) +# --------------------------------------------------------------------------- + +# These map to the old names so existing code doesn't break. +BlockedToolCall = ToolBlocked +BlockedPrompt = InputBlocked diff --git a/agent_ext/hooks/parallel.py b/agent_ext/hooks/parallel.py new file mode 100644 index 0000000..c63dde4 --- /dev/null +++ b/agent_ext/hooks/parallel.py @@ -0,0 +1,129 @@ +"""Parallel middleware execution — run multiple middleware concurrently. + +Useful when you have several independent checks (e.g. 
"""Parallel middleware execution — run multiple middleware concurrently.

Useful when you have several independent checks (e.g. PII detection,
profanity filter, injection guard) that can all run at the same time.

Aggregation strategies control how results are combined:
- ALL_MUST_PASS: all must succeed (any failure fails the whole check)
- FIRST_SUCCESS: first non-exception result is used
- MERGE: combine all results (for dict-like outputs)

Strategies without an implementation here (RACE, COLLECT_ALL) currently
pass the original value through unchanged.
"""

from __future__ import annotations

import asyncio
from collections.abc import Sequence
from typing import Any

from agent_ext.run_context import RunContext

from .base import AgentMiddleware
from .exceptions import ParallelExecutionFailed
from .strategies import AggregationStrategy


class ParallelMiddleware(AgentMiddleware):
    """Execute multiple middleware concurrently.

    Example::

        parallel = ParallelMiddleware(
            middleware=[PIIDetector(), ProfanityFilter(), InjectionGuard()],
            strategy=AggregationStrategy.ALL_MUST_PASS,
        )
        chain = MiddlewareChain([parallel, LoggingMiddleware()])

    Args:
        middleware: The middleware to run concurrently.
        strategy: How to aggregate the individual results.
        name: Optional display name (defaults to ``Parallel(<count>)``).
    """

    def __init__(
        self,
        middleware: Sequence[AgentMiddleware],
        strategy: AggregationStrategy = AggregationStrategy.ALL_MUST_PASS,
        *,
        name: str | None = None,
    ) -> None:
        self._middleware = list(middleware)
        self.strategy = strategy
        self._name = name or f"Parallel({len(self._middleware)})"

    @property
    def name(self) -> str:
        return self._name

    # -- generic parallel runner --------------------------------------------

    async def _run_parallel(
        self,
        hook_name: str,
        coros: list,
        passthrough: Any,
    ) -> Any:
        """Run coroutines in parallel and aggregate per ``self.strategy``.

        Args:
            hook_name: Hook being dispatched (kept for diagnostics).
            coros: One coroutine per applicable middleware.
            passthrough: The original value, returned when the strategy keeps it.

        Raises:
            ParallelExecutionFailed: When the strategy requires success and
                one or more middleware raised.
        """
        # return_exceptions=True: a failing middleware yields its exception
        # in the result list instead of cancelling its siblings.
        results = await asyncio.gather(*coros, return_exceptions=True)

        errors = [r for r in results if isinstance(r, Exception)]
        successes = [r for r in results if not isinstance(r, Exception)]

        if self.strategy == AggregationStrategy.ALL_MUST_PASS:
            if errors:
                raise ParallelExecutionFailed(errors, successes)
            return passthrough  # all passed → use original

        # BUGFIX: this previously compared against AggregationStrategy.FIRST_WINS
        # and AggregationStrategy.MERGE, which strategies.py does not declare —
        # every non-ALL_MUST_PASS strategy raised AttributeError here.
        # FIRST_SUCCESS is the declared member with the intended
        # "first non-error result" semantics.
        if self.strategy == AggregationStrategy.FIRST_SUCCESS:
            if successes:
                return successes[0]
            if errors:
                raise ParallelExecutionFailed(errors)
            return passthrough

        # MERGE is matched by value so this module also works with enum
        # versions that declare (or omit) the member.
        if getattr(self.strategy, "value", None) == "merge":
            # For dict-like outputs, merge all successes.
            if not successes:
                if errors:
                    raise ParallelExecutionFailed(errors)
                return passthrough
            merged = passthrough
            for s in successes:
                if isinstance(s, dict) and isinstance(merged, dict):
                    merged = {**merged, **s}
                else:
                    merged = s  # last wins for non-dict
            return merged

        # RACE / COLLECT_ALL are not implemented here yet: keep the original.
        return passthrough

    # -- hooks (parallel dispatch) ------------------------------------------

    async def before_run(self, ctx: RunContext, prompt: str | Sequence[Any]) -> str | Sequence[Any]:
        coros = [mw.before_run(ctx, prompt) for mw in self._middleware]
        return await self._run_parallel("before_run", coros, prompt)

    async def after_run(self, ctx: RunContext, prompt: str | Sequence[Any], output: Any) -> Any:
        coros = [mw.after_run(ctx, prompt, output) for mw in self._middleware]
        return await self._run_parallel("after_run", coros, output)

    async def before_model_request(self, ctx: RunContext, messages: list[Any]) -> list[Any]:
        coros = [mw.before_model_request(ctx, messages) for mw in self._middleware]
        return await self._run_parallel("before_model_request", coros, messages)

    async def before_tool_call(self, ctx: RunContext, tool_name: str, tool_args: dict[str, Any]) -> dict[str, Any]:
        # Only dispatch to middleware scoped to this tool.
        applicable = [mw for mw in self._middleware if mw._should_handle_tool(tool_name)]
        if not applicable:
            return tool_args
        coros = [mw.before_tool_call(ctx, tool_name, tool_args) for mw in applicable]
        return await self._run_parallel("before_tool_call", coros, tool_args)

    async def after_tool_call(self, ctx: RunContext, tool_name: str, tool_args: dict[str, Any], result: Any) -> Any:
        applicable = [mw for mw in self._middleware if mw._should_handle_tool(tool_name)]
        if not applicable:
            return result
        coros = [mw.after_tool_call(ctx, tool_name, tool_args, result) for mw in applicable]
        return await self._run_parallel("after_tool_call", coros, result)

    async def on_error(self, ctx: RunContext, error: Exception) -> Exception | None:
        coros = [mw.on_error(ctx, error) for mw in self._middleware]
        results = await asyncio.gather(*coros, return_exceptions=True)
        # Return first non-None, non-exception result.
        for r in results:
            if r is not None and not isinstance(r, Exception):
                return r
        return None


# ---------------------------------------------------------------------------
# agent_ext/hooks/permissions.py
# ---------------------------------------------------------------------------

"""Structured permission decisions for tool calls.

Instead of raising ``ToolBlocked`` or returning modified args,
middleware can return a ``ToolPermissionResult`` with a structured decision:
ALLOW, DENY, or ASK (defers to a ``PermissionHandler`` callback).
"""

from collections.abc import Awaitable
from dataclasses import dataclass, field
from enum import Enum
from typing import Protocol, runtime_checkable


class ToolDecision(Enum):
    """Decision for a tool-call permission check."""

    ALLOW = "allow"
    DENY = "deny"
    ASK = "ask"  # defer to a PermissionHandler callback


@dataclass
class ToolPermissionResult:
    """Structured result from ``before_tool_call``.

    Examples::

        # Allow with modified args
        ToolPermissionResult(decision=ToolDecision.ALLOW,
                             modified_args={**tool_args, "sanitized": True})

        # Deny
        ToolPermissionResult(decision=ToolDecision.DENY,
                             reason="Not authorized")

        # Ask a human / system
        ToolPermissionResult(decision=ToolDecision.ASK,
                             reason="Requires explicit approval")
    """

    decision: ToolDecision
    reason: str = ""
    modified_args: dict[str, Any] | None = field(default=None)


@runtime_checkable
class PermissionHandler(Protocol):
    """Callback protocol for handling ASK decisions.

    Implement to decide whether to allow or deny a tool call when
    middleware returns ``ToolDecision.ASK``.
    """

    def __call__(
        self,
        tool_name: str,
        tool_args: dict[str, Any],
        reason: str,
    ) -> Awaitable[bool]:
        """Return ``True`` to allow, ``False`` to deny."""
        ...
"""Execution strategies for parallel middleware and guardrails."""

from enum import Enum


class AggregationStrategy(Enum):
    """How to aggregate results from parallel middleware.

    - ALL_MUST_PASS: all must succeed
    - FIRST_SUCCESS: first non-error result (``FIRST_WINS`` is an alias)
    - RACE: first to complete (even if error)
    - COLLECT_ALL: return all results as a list
    - MERGE: combine dict-like results from all middleware
    """

    ALL_MUST_PASS = "all_must_pass"
    FIRST_SUCCESS = "first_success"
    # BUGFIX: parallel.py refers to FIRST_WINS and MERGE, which were never
    # declared here and raised AttributeError. FIRST_WINS shares the value of
    # FIRST_SUCCESS, so enum aliasing makes them the same member; MERGE is a
    # genuinely new strategy member.
    FIRST_WINS = "first_success"
    RACE = "race"
    COLLECT_ALL = "collect_all"
    MERGE = "merge"


class GuardrailTiming(Enum):
    """When guardrails execute relative to the agent/LLM call.

    - BLOCKING: guardrail completes before agent starts (traditional)
    - CONCURRENT: guardrail runs alongside LLM, fail-fast on violation
    - ASYNC_POST: guardrail runs after LLM (monitoring only, non-blocking)
    """

    BLOCKING = "blocking"
    CONCURRENT = "concurrent"
    ASYNC_POST = "async_post"
Install with: pip install python-docx") from e import io doc = Document(io.BytesIO(doc_bytes)) - pages: List[OCRPage] = [] - buf: List[str] = [] + pages: list[OCRPage] = [] + buf: list[str] = [] page_index = 0 def flush(): diff --git a/agent_ext/ingest/extractors.py b/agent_ext/ingest/extractors.py index f7cac77..87042e7 100644 --- a/agent_ext/ingest/extractors.py +++ b/agent_ext/ingest/extractors.py @@ -1,26 +1,30 @@ from __future__ import annotations -from typing import Any, Dict, List, Protocol, Type + +from typing import Protocol from pydantic import BaseModel +from agent_ext.evidence.models import Citation, Evidence, Provenance from agent_ext.run_context import RunContext + from .models import OCRPage -from agent_ext.evidence.models import Evidence, Provenance, Citation class PageExtractor(Protocol): name: str - def extract(self, ctx: RunContext, *, doc_artifact_id: str, pages: List[OCRPage]) -> List[Evidence]: ... + + def extract(self, ctx: RunContext, *, doc_artifact_id: str, pages: list[OCRPage]) -> list[Evidence]: ... class MarkdownDumpExtractor: """ Produces a straightforward per-page markdown Evidence chunk. """ + name = "markdown_dump" - def extract(self, ctx: RunContext, *, doc_artifact_id: str, pages: List[OCRPage]) -> List[Evidence]: - out: List[Evidence] = [] + def extract(self, ctx: RunContext, *, doc_artifact_id: str, pages: list[OCRPage]) -> list[Evidence]: + out: list[Evidence] = [] for p in pages: cit = Citation(source_id=doc_artifact_id, locator=f"page:{p.page_index}", confidence=0.7) out.append( @@ -40,12 +44,13 @@ class StructuredModelExtractor: """ Adapter: your team wires this to an LLM call (PydanticAI agent) that returns a specific BaseModel. 
""" - def __init__(self, *, model_type: Type[BaseModel], llm_fn): + + def __init__(self, *, model_type: type[BaseModel], llm_fn): self.model_type = model_type self.llm_fn = llm_fn self.name = f"structured:{model_type.__name__}" - def extract(self, ctx: RunContext, *, doc_artifact_id: str, pages: List[OCRPage]) -> List[Evidence]: + def extract(self, ctx: RunContext, *, doc_artifact_id: str, pages: list[OCRPage]) -> list[Evidence]: # join text for now; you can do page-aware prompting later text = "\n\n".join([f"[page {p.page_index}]\n{p.full_text}" for p in pages if p.full_text.strip()]) obj = self.llm_fn(ctx, text, self.model_type) # must return an instance of model_type diff --git a/agent_ext/ingest/llm_ocr_engine.py b/agent_ext/ingest/llm_ocr_engine.py index 49280fa..66314e9 100644 --- a/agent_ext/ingest/llm_ocr_engine.py +++ b/agent_ext/ingest/llm_ocr_engine.py @@ -4,12 +4,13 @@ Pattern aligned with vision OCR (e.g. PDF → images → LLM per page → structured output); see README §10 and pydantic-ai OCR examples for the idea. """ + from __future__ import annotations -from typing import Any, List, Optional, Protocol +from typing import Any, Protocol +from agent_ext.ingest.models import OCRPage, PageImage from agent_ext.run_context import RunContext -from agent_ext.ingest.models import PageImage, OCRPage try: from pydantic_ai import BinaryContent @@ -19,6 +20,7 @@ class _AgentLike(Protocol): """Agent that accepts run_sync(ctx, message) with message as list (e.g. prompt + BinaryContent).""" + def run_sync(self, ctx: RunContext, message: Any, **kwargs: Any) -> Any: ... @@ -43,6 +45,7 @@ class LLMVisionOCREngine: Use with our wrapped agent and a structured output type (e.g. PageOCROutput) for schema-validated OCR; see README §10 and pydantic-ai OCR examples for the pattern. 
""" + name = "llm_vision" def __init__( @@ -58,8 +61,8 @@ def __init__( self.prompt = prompt self.media_type = media_type - def ocr_pages(self, ctx: RunContext, pages: List[PageImage]) -> List[OCRPage]: - out: List[OCRPage] = [] + def ocr_pages(self, ctx: RunContext, pages: list[PageImage]) -> list[OCRPage]: + out: list[OCRPage] = [] for page in pages: image_bytes = ctx.artifacts.get_bytes(page.image_artifact_id) message = [ diff --git a/agent_ext/ingest/models.py b/agent_ext/ingest/models.py index f2642c1..1120808 100644 --- a/agent_ext/ingest/models.py +++ b/agent_ext/ingest/models.py @@ -1,8 +1,10 @@ from __future__ import annotations -from typing import Any, Dict, List, Optional, Tuple + +from typing import Any + from pydantic import BaseModel, Field -from agent_ext.evidence.models import Citation, Evidence, Provenance +from agent_ext.evidence.models import Evidence class DocumentInput(BaseModel): @@ -10,36 +12,38 @@ class DocumentInput(BaseModel): A single item to ingest. Backed by an artifact id or an accessible path. Prefer artifact ids for auditability. 
""" - artifact_id: Optional[str] = None - path: Optional[str] = None - filename: Optional[str] = None - mime_type: Optional[str] = None - metadata: Dict[str, Any] = Field(default_factory=dict) + + artifact_id: str | None = None + path: str | None = None + filename: str | None = None + mime_type: str | None = None + metadata: dict[str, Any] = Field(default_factory=dict) class PageImage(BaseModel): page_index: int image_artifact_id: str - width: Optional[int] = None - height: Optional[int] = None + width: int | None = None + height: int | None = None class OCRSpan(BaseModel): text: str - bbox: Optional[Tuple[int, int, int, int]] = None # x1,y1,x2,y2 + bbox: tuple[int, int, int, int] | None = None # x1,y1,x2,y2 confidence: float = 0.7 class OCRPage(BaseModel): page_index: int - spans: List[OCRSpan] = Field(default_factory=list) + spans: list[OCRSpan] = Field(default_factory=list) full_text: str = "" engine: str = "unknown" - metadata: Dict[str, Any] = Field(default_factory=dict) + metadata: dict[str, Any] = Field(default_factory=dict) class PageOCRElement(BaseModel): """Single element on a page (table, paragraph, image description, etc.) for structured vision OCR.""" + element_type: str = "" element_content: str = "" @@ -49,9 +53,10 @@ class PageOCROutput(BaseModel): Structured output from a vision/LLM OCR agent per page. Use with PydanticAIAgentBase[PageOCROutput] for schema-validated OCR (see README §10). 
""" + file_type: str = "" file_content_md: str = "" - file_elements: List[PageOCRElement] = Field(default_factory=list) + file_elements: list[PageOCRElement] = Field(default_factory=list) class IngestResult(BaseModel): @@ -60,7 +65,8 @@ class IngestResult(BaseModel): - per-page OCR - Evidence chunks (normalized) """ + doc_artifact_id: str - page_images: List[PageImage] = Field(default_factory=list) - ocr_pages: List[OCRPage] = Field(default_factory=list) - evidence_chunks: List[Evidence] = Field(default_factory=list) + page_images: list[PageImage] = Field(default_factory=list) + ocr_pages: list[OCRPage] = Field(default_factory=list) + evidence_chunks: list[Evidence] = Field(default_factory=list) diff --git a/agent_ext/ingest/multi_extractor.py b/agent_ext/ingest/multi_extractor.py index d9c9c03..a709cda 100644 --- a/agent_ext/ingest/multi_extractor.py +++ b/agent_ext/ingest/multi_extractor.py @@ -1,20 +1,19 @@ from __future__ import annotations -from typing import List +from agent_ext.evidence.models import Evidence +from agent_ext.run_context import RunContext from .extractors import PageExtractor from .models import OCRPage -from agent_ext.evidence.models import Evidence -from agent_ext.run_context import RunContext class MultiExtractor: - def __init__(self, extractors: List[PageExtractor]): + def __init__(self, extractors: list[PageExtractor]): self.extractors = extractors self.name = "multi_extractor" - def extract(self, ctx: RunContext, *, doc_artifact_id: str, pages: List[OCRPage]) -> List[Evidence]: - out: List[Evidence] = [] + def extract(self, ctx: RunContext, *, doc_artifact_id: str, pages: list[OCRPage]) -> list[Evidence]: + out: list[Evidence] = [] for ex in self.extractors: out.extend(ex.extract(ctx, doc_artifact_id=doc_artifact_id, pages=pages)) return out diff --git a/agent_ext/ingest/ocr_engines.py b/agent_ext/ingest/ocr_engines.py index 7f717dc..2de2c04 100644 --- a/agent_ext/ingest/ocr_engines.py +++ b/agent_ext/ingest/ocr_engines.py @@ -1,21 
+1,24 @@ from __future__ import annotations -from typing import List, Protocol + +from typing import Protocol from agent_ext.run_context import RunContext + from .models import OCRPage, PageImage class OCREngine(Protocol): name: str - def ocr_pages(self, ctx: RunContext, pages: List[PageImage]) -> List[OCRPage]: ... + + def ocr_pages(self, ctx: RunContext, pages: list[PageImage]) -> list[OCRPage]: ... class NullOCREngine: name = "null" - def ocr_pages(self, ctx: RunContext, pages: List[PageImage]) -> List[OCRPage]: + def ocr_pages(self, ctx: RunContext, pages: list[PageImage]) -> list[OCRPage]: # Useful for testing pipeline wiring - out: List[OCRPage] = [] + out: list[OCRPage] = [] for p in pages: out.append(OCRPage(page_index=p.page_index, spans=[], full_text="", engine=self.name)) return out diff --git a/agent_ext/ingest/pdf2image_renderer.py b/agent_ext/ingest/pdf2image_renderer.py index 92cc45c..e0d6c3d 100644 --- a/agent_ext/ingest/pdf2image_renderer.py +++ b/agent_ext/ingest/pdf2image_renderer.py @@ -2,7 +2,6 @@ import io from dataclasses import dataclass -from typing import Optional from pdf2image import convert_from_bytes, pdfinfo_from_bytes @@ -17,8 +16,9 @@ class Pdf2ImageRenderer: which we use to avoid loading all pages into memory at once. - pdfinfo_from_bytes gives page count quickly. 
""" + fmt: str = "png" - poppler_path: Optional[str] = None # set if poppler isn't on PATH + poppler_path: str | None = None # set if poppler isn't on PATH def page_count(self, *, pdf_bytes: bytes) -> int: info = pdfinfo_from_bytes(pdf_bytes, poppler_path=self.poppler_path) diff --git a/agent_ext/ingest/pdf_to_images.py b/agent_ext/ingest/pdf_to_images.py index 475d2e4..f5e5ad5 100644 --- a/agent_ext/ingest/pdf_to_images.py +++ b/agent_ext/ingest/pdf_to_images.py @@ -1,7 +1,9 @@ from __future__ import annotations -from typing import List, Protocol + +from typing import Protocol from agent_ext.run_context import RunContext + from .models import DocumentInput, PageImage @@ -15,14 +17,14 @@ def __init__(self, renderer: PDFRenderer, *, dpi: int = 200): self.renderer = renderer self.dpi = dpi - def run(self, ctx: RunContext, doc: DocumentInput) -> List[PageImage]: + def run(self, ctx: RunContext, doc: DocumentInput) -> list[PageImage]: if not doc.artifact_id: raise ValueError("PDFToImages currently expects doc.artifact_id for auditability") pdf_bytes = ctx.artifacts.get_bytes(doc.artifact_id) n = self.renderer.page_count(pdf_bytes=pdf_bytes) - pages: List[PageImage] = [] + pages: list[PageImage] = [] for i in range(n): png = self.renderer.render_to_png_bytes(pdf_bytes=pdf_bytes, page_index=i, dpi=self.dpi) img_id = ctx.artifacts.put_bytes( diff --git a/agent_ext/ingest/pipeline.py b/agent_ext/ingest/pipeline.py index d3a5599..03445c4 100644 --- a/agent_ext/ingest/pipeline.py +++ b/agent_ext/ingest/pipeline.py @@ -1,19 +1,20 @@ from __future__ import annotations -from typing import List, Optional from agent_ext.run_context import RunContext + +from .extractors import PageExtractor from .models import DocumentInput, IngestResult -from .pdf_to_images import PDFToImages from .ocr_engines import OCREngine -from .extractors import PageExtractor -from .validation import OCRValidator, OCRValidationPolicy +from .pdf_to_images import PDFToImages +from .validation import 
OCRValidator from .validation_evidence import ValidationEvidenceEmitter + class IngestPipeline: def __init__( self, *, - pdf_to_images: Optional[PDFToImages], + pdf_to_images: PDFToImages | None, ocr_engine: OCREngine, extractor: PageExtractor, validator: OCRValidator | None = None, @@ -45,11 +46,7 @@ def run(self, ctx: RunContext, doc: DocumentInput) -> IngestResult: if self.fail_fast_on_validation: report.raise_if_failed() # 3) Extract → Evidence - evidence = self.extractor.extract( - ctx, - doc_artifact_id=doc_id, - pages=ocr_pages - ) + evidence = self.extractor.extract(ctx, doc_artifact_id=doc_id, pages=ocr_pages) return IngestResult( doc_artifact_id=doc_id, diff --git a/agent_ext/ingest/retry_planner.py b/agent_ext/ingest/retry_planner.py index d5f5e59..bcfa7e7 100644 --- a/agent_ext/ingest/retry_planner.py +++ b/agent_ext/ingest/retry_planner.py @@ -1,7 +1,8 @@ from __future__ import annotations +from collections.abc import Sequence from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Sequence, Set, Tuple +from typing import Any from agent_ext.evidence.models import Evidence @@ -11,8 +12,9 @@ class OCRRetryAction: """ One recommended action. Your router can pick one or sequence them. 
""" + kind: str # "rerender_pages" | "rerun_ocr" | "rerun_ocr_pages" | "switch_engine" | "increase_dpi" - params: Dict[str, Any] + params: dict[str, Any] reason: str priority: int = 50 # smaller = earlier @@ -20,19 +22,19 @@ class OCRRetryAction: @dataclass class OCRRetryPlan: ok: bool - actions: List[OCRRetryAction] + actions: list[OCRRetryAction] summary: str - failed_pages: List[int] - warn_pages: List[int] - metrics: Dict[str, Any] + failed_pages: list[int] + warn_pages: list[int] + metrics: dict[str, Any] -def _extract_validation_evidence(evidence_chunks: Sequence[Evidence]) -> Tuple[List[Evidence], List[Evidence]]: +def _extract_validation_evidence(evidence_chunks: Sequence[Evidence]) -> tuple[list[Evidence], list[Evidence]]: """ Returns (doc_level, page_level) validation evidences. """ - doc_level: List[Evidence] = [] - page_level: List[Evidence] = [] + doc_level: list[Evidence] = [] + page_level: list[Evidence] = [] for e in evidence_chunks: if e.kind != "validation": continue @@ -48,9 +50,9 @@ def _extract_validation_evidence(evidence_chunks: Sequence[Evidence]) -> Tuple[L return doc_level, page_level -def _pages_from_page_evidence(page_level: Sequence[Evidence]) -> Tuple[Set[int], Set[int]]: - failed: Set[int] = set() - warned: Set[int] = set() +def _pages_from_page_evidence(page_level: Sequence[Evidence]) -> tuple[set[int], set[int]]: + failed: set[int] = set() + warned: set[int] = set() for e in page_level: c = e.content if isinstance(e.content, dict) else {} page = c.get("page_index") @@ -64,7 +66,7 @@ def _pages_from_page_evidence(page_level: Sequence[Evidence]) -> Tuple[Set[int], return failed, warned -def _doc_failure(doc_level: Sequence[Evidence]) -> Tuple[bool, Dict[str, Any], List[Dict[str, Any]]]: +def _doc_failure(doc_level: Sequence[Evidence]) -> tuple[bool, dict[str, Any], list[dict[str, Any]]]: """ Returns (ok, metrics, issues) """ @@ -88,7 +90,7 @@ def build_ocr_retry_plan( max_dpi: int = 350, dpi_step: int = 100, current_engine: str = 
"primary", - alternate_engines: Optional[List[str]] = None, + alternate_engines: list[str] | None = None, allow_llm_vision_fallback: bool = True, prefer_page_subset_retry: bool = True, ) -> OCRRetryPlan: @@ -105,7 +107,7 @@ def build_ocr_retry_plan( ok, metrics, issues = _doc_failure(doc_level) failed_pages, warn_pages = _pages_from_page_evidence(page_level) - actions: List[OCRRetryAction] = [] + actions: list[OCRRetryAction] = [] # If no failures, nothing to do if ok and not failed_pages: @@ -230,7 +232,12 @@ def build_ocr_retry_plan( actions.append( OCRRetryAction( kind="switch_engine", - params={"from": current_engine, "to": "llm_vision" if allow_llm_vision_fallback else (alternate_engines[0] if alternate_engines else "secondary")}, + params={ + "from": current_engine, + "to": "llm_vision" + if allow_llm_vision_fallback + else (alternate_engines[0] if alternate_engines else "secondary"), + }, reason="Many pages are empty/near-empty; this often indicates OCR engine mismatch with scan/layout. 
Consider LLM vision fallback.", priority=25, ) diff --git a/agent_ext/ingest/validation.py b/agent_ext/ingest/validation.py index f1ef7e2..1af07cf 100644 --- a/agent_ext/ingest/validation.py +++ b/agent_ext/ingest/validation.py @@ -1,29 +1,29 @@ from __future__ import annotations import re -from typing import Any, Dict, List, Optional, Protocol, Tuple +from typing import Any from pydantic import BaseModel, Field, ValidationError -from .models import OCRPage, OCRSpan, PageImage - +from .models import OCRPage, PageImage # ---------------------------- # Validation outputs # ---------------------------- + class ValidationIssue(BaseModel): code: str severity: str = "warn" # info|warn|error message: str - page_index: Optional[int] = None - evidence: Dict[str, Any] = Field(default_factory=dict) + page_index: int | None = None + evidence: dict[str, Any] = Field(default_factory=dict) class OCRValidationReport(BaseModel): ok: bool - issues: List[ValidationIssue] = Field(default_factory=list) - metrics: Dict[str, Any] = Field(default_factory=dict) + issues: list[ValidationIssue] = Field(default_factory=list) + metrics: dict[str, Any] = Field(default_factory=dict) def raise_if_failed(self) -> None: if not self.ok: @@ -35,16 +35,17 @@ def raise_if_failed(self) -> None: # Policies # ---------------------------- + class OCRValidationPolicy(BaseModel): min_pages: int = 1 min_chars_per_page: int = 40 min_total_chars: int = 200 - min_alpha_ratio: float = 0.20 # alpha chars / total chars - max_garbage_ratio: float = 0.35 # weird chars ratio + min_alpha_ratio: float = 0.20 # alpha chars / total chars + max_garbage_ratio: float = 0.35 # weird chars ratio max_empty_page_fraction: float = 0.40 # allow some blank pages - require_monotonic_pages: bool = True # page indexes unique/increasing - min_span_confidence: float = 0.40 # if spans exist - allow_no_spans: bool = True # engines may not produce spans + require_monotonic_pages: bool = True # page indexes unique/increasing + 
min_span_confidence: float = 0.40 # if spans exist + allow_no_spans: bool = True # engines may not produce spans # ---------------------------- @@ -53,18 +54,21 @@ class OCRValidationPolicy(BaseModel): _WEIRD = re.compile(r"[^\w\s\.,;:\-–—\(\)\[\]{}'\"/\\@#%&\+\*=<>!?$]") + def _alpha_ratio(s: str) -> float: if not s: return 0.0 alpha = sum(c.isalpha() for c in s) return alpha / max(1, len(s)) + def _garbage_ratio(s: str) -> float: if not s: return 0.0 weird = len(_WEIRD.findall(s)) return weird / max(1, len(s)) + def _count_chars(p: OCRPage) -> int: return len(p.full_text or "") @@ -73,6 +77,7 @@ def _count_chars(p: OCRPage) -> int: # Validator # ---------------------------- + class OCRValidator: def __init__(self, policy: OCRValidationPolicy = OCRValidationPolicy()): self.policy = policy @@ -80,18 +85,20 @@ def __init__(self, policy: OCRValidationPolicy = OCRValidationPolicy()): def validate_pages( self, *, - page_images: List[PageImage], - ocr_pages: List[OCRPage], + page_images: list[PageImage], + ocr_pages: list[OCRPage], ) -> OCRValidationReport: - issues: List[ValidationIssue] = [] - metrics: Dict[str, Any] = {} + issues: list[ValidationIssue] = [] + metrics: dict[str, Any] = {} if len(page_images) < self.policy.min_pages: - issues.append(ValidationIssue( - code="too_few_pages", - severity="error", - message=f"Expected >= {self.policy.min_pages} pages, got {len(page_images)}", - )) + issues.append( + ValidationIssue( + code="too_few_pages", + severity="error", + message=f"Expected >= {self.policy.min_pages} pages, got {len(page_images)}", + ) + ) # page index monotonic + coverage checks img_idx = [p.page_index for p in page_images] @@ -99,29 +106,35 @@ def validate_pages( if self.policy.require_monotonic_pages: if img_idx != sorted(img_idx) or len(set(img_idx)) != len(img_idx): - issues.append(ValidationIssue( - code="non_monotonic_page_images", - severity="error", - message="PageImage.page_index must be unique and increasing", - evidence={"indexes": 
img_idx}, - )) + issues.append( + ValidationIssue( + code="non_monotonic_page_images", + severity="error", + message="PageImage.page_index must be unique and increasing", + evidence={"indexes": img_idx}, + ) + ) if ocr_idx != sorted(ocr_idx) or len(set(ocr_idx)) != len(ocr_idx): - issues.append(ValidationIssue( - code="non_monotonic_ocr_pages", - severity="error", - message="OCRPage.page_index must be unique and increasing", - evidence={"indexes": ocr_idx}, - )) + issues.append( + ValidationIssue( + code="non_monotonic_ocr_pages", + severity="error", + message="OCRPage.page_index must be unique and increasing", + evidence={"indexes": ocr_idx}, + ) + ) # coverage missing = sorted(set(img_idx) - set(ocr_idx)) if missing: - issues.append(ValidationIssue( - code="missing_ocr_pages", - severity="error", - message=f"OCR missing pages: {missing[:20]}{'...' if len(missing) > 20 else ''}", - evidence={"missing": missing}, - )) + issues.append( + ValidationIssue( + code="missing_ocr_pages", + severity="error", + message=f"OCR missing pages: {missing[:20]}{'...' 
if len(missing) > 20 else ''}", + evidence={"missing": missing}, + ) + ) # text quality metrics total_chars = sum(_count_chars(p) for p in ocr_pages) @@ -129,28 +142,32 @@ def validate_pages( metrics["pages"] = len(ocr_pages) if total_chars < self.policy.min_total_chars: - issues.append(ValidationIssue( - code="too_little_text", - severity="error", - message=f"Total OCR chars {total_chars} < {self.policy.min_total_chars}", - evidence={"total_chars": total_chars}, - )) + issues.append( + ValidationIssue( + code="too_little_text", + severity="error", + message=f"Total OCR chars {total_chars} < {self.policy.min_total_chars}", + evidence={"total_chars": total_chars}, + ) + ) empty_pages = [p.page_index for p in ocr_pages if _count_chars(p) < self.policy.min_chars_per_page] - empty_frac = (len(empty_pages) / max(1, len(ocr_pages))) + empty_frac = len(empty_pages) / max(1, len(ocr_pages)) metrics["empty_page_fraction"] = empty_frac if empty_frac > self.policy.max_empty_page_fraction: - issues.append(ValidationIssue( - code="too_many_empty_pages", - severity="error", - message=f"Empty page fraction {empty_frac:.2f} > {self.policy.max_empty_page_fraction:.2f}", - evidence={"empty_pages": empty_pages[:50]}, - )) + issues.append( + ValidationIssue( + code="too_many_empty_pages", + severity="error", + message=f"Empty page fraction {empty_frac:.2f} > {self.policy.max_empty_page_fraction:.2f}", + evidence={"empty_pages": empty_pages[:50]}, + ) + ) # per-page ratios - bad_alpha: List[int] = [] - bad_garbage: List[int] = [] + bad_alpha: list[int] = [] + bad_garbage: list[int] = [] for p in ocr_pages: txt = p.full_text or "" if _alpha_ratio(txt) < self.policy.min_alpha_ratio: @@ -162,39 +179,47 @@ def validate_pages( if p.spans: low = [s.confidence for s in p.spans if s.confidence < self.policy.min_span_confidence] if low and len(low) / max(1, len(p.spans)) > 0.5: - issues.append(ValidationIssue( - code="low_span_confidence", - severity="warn", - page_index=p.page_index, - 
message="Many spans have low confidence", - evidence={"min_span_confidence": self.policy.min_span_confidence}, - )) + issues.append( + ValidationIssue( + code="low_span_confidence", + severity="warn", + page_index=p.page_index, + message="Many spans have low confidence", + evidence={"min_span_confidence": self.policy.min_span_confidence}, + ) + ) else: if not self.policy.allow_no_spans: - issues.append(ValidationIssue( - code="missing_spans", - severity="warn", - page_index=p.page_index, - message="OCR engine produced no spans", - )) + issues.append( + ValidationIssue( + code="missing_spans", + severity="warn", + page_index=p.page_index, + message="OCR engine produced no spans", + ) + ) metrics["bad_alpha_pages"] = len(bad_alpha) metrics["bad_garbage_pages"] = len(bad_garbage) if bad_alpha: - issues.append(ValidationIssue( - code="low_alpha_ratio_pages", - severity="warn", - message="Some pages have unusually low alphabetic content", - evidence={"pages": bad_alpha[:50]}, - )) + issues.append( + ValidationIssue( + code="low_alpha_ratio_pages", + severity="warn", + message="Some pages have unusually low alphabetic content", + evidence={"pages": bad_alpha[:50]}, + ) + ) if bad_garbage: - issues.append(ValidationIssue( - code="high_garbage_ratio_pages", - severity="warn", - message="Some pages contain many unusual characters (possible OCR failure)", - evidence={"pages": bad_garbage[:50]}, - )) + issues.append( + ValidationIssue( + code="high_garbage_ratio_pages", + severity="warn", + message="Some pages contain many unusual characters (possible OCR failure)", + evidence={"pages": bad_garbage[:50]}, + ) + ) ok = not any(i.severity == "error" for i in issues) return OCRValidationReport(ok=ok, issues=issues, metrics=metrics) @@ -204,12 +229,14 @@ def validate_pages( # Structured output validation # ---------------------------- + class StructuredOutputValidator: """ Validates the structured output conforms to a specific Pydantic model. 
Useful when you do LLM-based extraction. """ - def validate(self, *, model_type: type[BaseModel], obj: Any) -> Tuple[bool, Optional[str]]: + + def validate(self, *, model_type: type[BaseModel], obj: Any) -> tuple[bool, str | None]: try: model_type.model_validate(obj) return True, None diff --git a/agent_ext/ingest/validation_evidence.py b/agent_ext/ingest/validation_evidence.py index 3e1558f..5c8e04c 100644 --- a/agent_ext/ingest/validation_evidence.py +++ b/agent_ext/ingest/validation_evidence.py @@ -1,10 +1,9 @@ from __future__ import annotations -from typing import List, Optional +from agent_ext.evidence.models import Citation, Evidence, Provenance +from agent_ext.run_context import RunContext -from agent_ext.evidence.models import Evidence, Provenance, Citation from .validation import OCRValidationReport, ValidationIssue -from agent_ext.run_context import RunContext class ValidationEvidenceEmitter: @@ -31,10 +30,10 @@ def emit_ocr_validation( *, doc_artifact_id: str, report: OCRValidationReport, - ) -> List[Evidence]: - evidences: List[Evidence] = [] + ) -> list[Evidence]: + evidences: list[Evidence] = [] - report_artifact_id: Optional[str] = None + report_artifact_id: str | None = None if self.store_full_report_artifact: report_artifact_id = ctx.artifacts.put_json( report.model_dump(), @@ -89,9 +88,9 @@ def _emit_page_level( ctx: RunContext, doc_artifact_id: str, report: OCRValidationReport, - report_artifact_id: Optional[str], - ) -> List[Evidence]: - out: List[Evidence] = [] + report_artifact_id: str | None, + ) -> list[Evidence]: + out: list[Evidence] = [] by_page: dict[int, list[ValidationIssue]] = {} for issue in report.issues: if issue.page_index is None: diff --git a/agent_ext/mcp/__init__.py b/agent_ext/mcp/__init__.py new file mode 100644 index 0000000..30b773d --- /dev/null +++ b/agent_ext/mcp/__init__.py @@ -0,0 +1,5 @@ +from .client import MCPClient +from .registry import MCPToolRegistry +from .server import MCPServer +from .transport import 
LocalTransport +from .types import ToolCall, ToolResult, ToolSpec diff --git a/agent_ext/mcp/client.py b/agent_ext/mcp/client.py new file mode 100644 index 0000000..138a46c --- /dev/null +++ b/agent_ext/mcp/client.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +import uuid +from typing import Any + +from .transport import LocalTransport +from .types import ToolCall, ToolResult + + +class MCPClient: + def __init__(self, transport: LocalTransport): + self.transport = transport + + async def call(self, tool: str, args: dict[str, Any]) -> ToolResult: + call_id = uuid.uuid4().hex + await self.transport.server_in.put(ToolCall(tool=tool, args=args, call_id=call_id)) + # naive: wait for matching call_id + while True: + res: ToolResult = await self.transport.server_out.get() + if res.call_id == call_id: + return res diff --git a/agent_ext/mcp/registry.py b/agent_ext/mcp/registry.py new file mode 100644 index 0000000..842ccff --- /dev/null +++ b/agent_ext/mcp/registry.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from collections.abc import Callable +from typing import Any + +from .types import ToolResult, ToolSpec + +ToolFn = Callable[[dict], Any] + + +class MCPToolRegistry: + def __init__(self): + self._specs: dict[str, ToolSpec] = {} + self._fns: dict[str, ToolFn] = {} + + def register(self, spec: ToolSpec, fn: ToolFn) -> None: + self._specs[spec.name] = spec + self._fns[spec.name] = fn + + def list_specs(self) -> list[ToolSpec]: + return [self._specs[k] for k in sorted(self._specs)] + + def call(self, tool: str, args: dict, call_id: str) -> ToolResult: + fn = self._fns.get(tool) + if not fn: + return ToolResult(call_id=call_id, ok=False, error=f"unknown tool: {tool}") + try: + out = fn(args) + return ToolResult(call_id=call_id, ok=True, result=out) + except Exception as e: + return ToolResult(call_id=call_id, ok=False, error=repr(e)) diff --git a/agent_ext/mcp/server.py b/agent_ext/mcp/server.py new file mode 100644 index 0000000..f17ec2e --- 
/dev/null +++ b/agent_ext/mcp/server.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +import asyncio + +from .registry import MCPToolRegistry +from .transport import LocalTransport +from .types import ToolCall + + +class MCPServer: + def __init__(self, registry: MCPToolRegistry, transport: LocalTransport): + self.registry = registry + self.transport = transport + self._task: asyncio.Task | None = None + + async def serve_forever(self) -> None: + while True: + call: ToolCall = await self.transport.server_in.get() + res = self.registry.call(call.tool, call.args, call.call_id) + await self.transport.server_out.put(res) + + def start(self) -> None: + if self._task is None: + self._task = asyncio.create_task(self.serve_forever()) diff --git a/agent_ext/mcp/transport.py b/agent_ext/mcp/transport.py new file mode 100644 index 0000000..f98dbda --- /dev/null +++ b/agent_ext/mcp/transport.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +import asyncio +from dataclasses import dataclass + +from .types import ToolCall, ToolResult + + +@dataclass +class LocalTransport: + """ + Simple in-process transport: client pushes ToolCall to server queue, gets ToolResult back. 
+ """ + + server_in: asyncio.Queue[ToolCall] + server_out: asyncio.Queue[ToolResult] diff --git a/agent_ext/mcp/types.py b/agent_ext/mcp/types.py new file mode 100644 index 0000000..0e125ef --- /dev/null +++ b/agent_ext/mcp/types.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +Json = dict[str, Any] + + +@dataclass(frozen=True) +class ToolSpec: + name: str + description: str + input_schema: Json = field(default_factory=dict) + output_schema: Json = field(default_factory=dict) + + +@dataclass(frozen=True) +class ToolCall: + tool: str + args: Json + call_id: str + + +@dataclass(frozen=True) +class ToolResult: + call_id: str + ok: bool + result: Any = None + error: str | None = None diff --git a/agent_ext/memory/README.md b/agent_ext/memory/README.md new file mode 100644 index 0000000..d11b52c --- /dev/null +++ b/agent_ext/memory/README.md @@ -0,0 +1,63 @@ +# Memory — Context Management + +Automatic conversation history management with safe cutoff and optional LLM summarization. 
+ +## Features + +- **Sliding Window**: Message-count or token-aware trimming (zero LLM cost) +- **Safe Cutoff**: Never splits tool call/response pairs +- **Token Counting**: Approximate or custom (tiktoken) +- **Summarization**: LLM-powered dossier compression (pluggable summarize_fn) +- **Triggers**: Trim only when thresholds are hit + +## Quick Start + +```python +from agent_ext.memory import SlidingWindowMemory + +# Message-count mode (default) +memory = SlidingWindowMemory(max_messages=50) + +# Token-aware mode +memory = SlidingWindowMemory( + max_tokens=100_000, + trigger_tokens=80_000, # only trim when over 80k tokens +) + +# With custom token counter +import tiktoken +enc = tiktoken.get_encoding("o200k_base") + +def count_tokens(messages): + return sum(len(enc.encode(str(m))) for m in messages) + +memory = SlidingWindowMemory(max_tokens=50_000, token_counter=count_tokens) +``` + +## Safe Cutoff + +The system never splits tool call/response pairs when trimming: + +```python +from agent_ext.memory import find_safe_cutoff, is_safe_cutoff_point + +# Find safe place to cut keeping last 20 messages +cutoff_index = find_safe_cutoff(messages, messages_to_keep=20) +trimmed = messages[cutoff_index:] +``` + +## Summarization + +```python +from agent_ext.memory import SummarizingMemory, SummarizeConfig + +def my_summarize_fn(ctx, text, base_dossier): + # Use LLM to update the dossier + base_dossier.summary = f"Updated summary of: {text[:500]}..." 
+ return base_dossier + +memory = SummarizingMemory( + cfg=SummarizeConfig(max_messages=80, keep_last_n=30), + summarize_fn=my_summarize_fn, +) +``` diff --git a/agent_ext/memory/__init__.py b/agent_ext/memory/__init__.py index e69de29..93c0641 100644 --- a/agent_ext/memory/__init__.py +++ b/agent_ext/memory/__init__.py @@ -0,0 +1,32 @@ +"""Memory management — sliding window, summarization, and safe cutoff.""" + +from .base import MemoryManager +from .cutoff import ( + TokenCounter, + approximate_token_count, + find_safe_cutoff, + find_token_based_cutoff, + is_safe_cutoff_point, +) +from .processor import ( + DEFAULT_SUMMARY_PROMPT, + ContextSize, + SummarizationProcessor, + create_summarization_processor, + format_messages_for_summary, +) +from .summarize import Dossier, SummarizeConfig, SummarizingMemory +from .window import SlidingWindowMemory + +__all__ = [ + "MemoryManager", + "SlidingWindowMemory", + "SummarizingMemory", + "SummarizeConfig", + "Dossier", + "TokenCounter", + "approximate_token_count", + "find_safe_cutoff", + "find_token_based_cutoff", + "is_safe_cutoff_point", +] diff --git a/agent_ext/memory/base.py b/agent_ext/memory/base.py index c099fc0..52bc8c2 100644 --- a/agent_ext/memory/base.py +++ b/agent_ext/memory/base.py @@ -1,7 +1,8 @@ from __future__ import annotations -from typing import Any, List, Protocol + +from typing import Any, Protocol class MemoryManager(Protocol): - def shape_messages(self, messages: List[Any]) -> List[Any]: ... - def checkpoint(self, messages: List[Any], *, outcome: Any) -> None: ... + def shape_messages(self, messages: list[Any]) -> list[Any]: ... + def checkpoint(self, messages: list[Any], *, outcome: Any) -> None: ... diff --git a/agent_ext/memory/cutoff.py b/agent_ext/memory/cutoff.py new file mode 100644 index 0000000..91ed245 --- /dev/null +++ b/agent_ext/memory/cutoff.py @@ -0,0 +1,121 @@ +"""Safe cutoff algorithms for history processors. 
+ +Provides: +- Tool-call/response pair preservation when trimming +- Token-based cutoff via binary search +- Message-count cutoff with safety adjustment +""" + +from __future__ import annotations + +from collections.abc import Callable, Sequence +from typing import Any + +# Type for token counters: (messages) -> int +TokenCounter = Callable[[Sequence[Any]], int] + + +def approximate_token_count(messages: Sequence[Any]) -> int: + """Quick approximation: ~4 chars per token.""" + total = 0 + for m in messages: + total += len(str(m)) + return total // 4 + + +def _has_tool_call(msg: Any) -> bool: + """Check if a message contains tool calls.""" + if isinstance(msg, dict): + return bool(msg.get("tool_calls") or msg.get("tool_call_id")) + if hasattr(msg, "parts"): + for part in msg.parts: + cls_name = type(part).__name__ + if "ToolCall" in cls_name: + return True + if hasattr(msg, "tool_calls"): + return bool(msg.tool_calls) + return False + + +def _has_tool_return(msg: Any) -> bool: + """Check if a message contains tool returns.""" + if isinstance(msg, dict): + role = msg.get("role", "") + return role == "tool" or bool(msg.get("tool_call_id")) + if hasattr(msg, "parts"): + for part in msg.parts: + cls_name = type(part).__name__ + if "ToolReturn" in cls_name: + return True + return False + + +def is_safe_cutoff_point(messages: list[Any], cutoff_index: int, search_range: int = 5) -> bool: + """Check if cutting at *cutoff_index* would split a tool call/response pair. + + Returns ``True`` when the cutoff is safe (no pairs are split). + """ + if cutoff_index >= len(messages) or cutoff_index <= 0: + return True + + start = max(0, cutoff_index - search_range) + end = min(len(messages), cutoff_index + search_range) + + # If the message right before cutoff has tool calls but the message + # right after has tool returns — we'd split a pair. 
+ for i in range(start, min(cutoff_index, end)): + if _has_tool_call(messages[i]): + # Check if any message after cutoff is the return + for j in range(cutoff_index, end): + if _has_tool_return(messages[j]): + return False + return True + + +def find_safe_cutoff(messages: list[Any], messages_to_keep: int) -> int: + """Find a safe cutoff index preserving tool call/response pairs. + + Returns the index from which to slice: ``messages[cutoff:]``. + """ + if messages_to_keep == 0: + return len(messages) + if len(messages) <= messages_to_keep: + return 0 + + target = len(messages) - messages_to_keep + for i in range(target, -1, -1): + if is_safe_cutoff_point(messages, i): + return i + return 0 + + +def find_token_based_cutoff( + messages: list[Any], + target_tokens: int, + token_counter: TokenCounter, +) -> int: + """Binary search for cutoff index to retain ≤ *target_tokens*.""" + if not messages or token_counter(messages) <= target_tokens: + return 0 + + left, right = 0, len(messages) + best = len(messages) + + for _ in range(len(messages).bit_length() + 2): + if left >= right: + break + mid = (left + right) // 2 + if token_counter(messages[mid:]) <= target_tokens: + best = mid + right = mid + else: + left = mid + 1 + + if best >= len(messages): + best = max(0, len(messages) - 1) + + # Adjust for safety + for i in range(best, -1, -1): + if is_safe_cutoff_point(messages, i): + return i + return 0 diff --git a/agent_ext/memory/processor.py b/agent_ext/memory/processor.py new file mode 100644 index 0000000..eeffc13 --- /dev/null +++ b/agent_ext/memory/processor.py @@ -0,0 +1,266 @@ +"""Auto-triggering LLM summarization processor. + +Monitors token/message counts and automatically summarizes older messages +when thresholds are reached. Works as a pydantic-ai ``history_processor`` +or standalone via ``shape_messages``. 
+

Example::

    from agent_ext.memory.processor import create_summarization_processor

    processor = create_summarization_processor(
        model="openai:gpt-4o",
        trigger=("tokens", 100_000),
        keep=("messages", 20),
    )

    agent = Agent("openai:gpt-4o", history_processors=[processor])
"""

from __future__ import annotations

from collections.abc import Callable, Sequence
from dataclasses import dataclass, field
from typing import Any, Literal

from .cutoff import (
    TokenCounter,
    approximate_token_count,
    find_safe_cutoff,
    find_token_based_cutoff,
    is_safe_cutoff_point,
)

# ---------------------------------------------------------------------------
# Context size types (parity with summarization-pydantic-ai)
# ---------------------------------------------------------------------------

ContextSize = tuple[Literal["messages"], int] | tuple[Literal["tokens"], int] | tuple[Literal["fraction"], float]
"""Specify a context size as messages, tokens, or fraction of max."""

# ---------------------------------------------------------------------------
# Default prompt
# ---------------------------------------------------------------------------

# NOTE(review): the blank-looking segments below appear to be where XML-style
# delimiter tags were stripped by a lossy rendering — confirm against the
# original file before relying on this prompt's exact wording.
DEFAULT_SUMMARY_PROMPT = (
    "\nContext Extraction Assistant\n\n\n"
    "\n"
    "Extract the most relevant context from the conversation history below.\n"
    "\n\n"
    "\n"
    "The conversation history will be replaced with your extracted context. "
    "Extract and record the most important context. Focus on information "
    "relevant to the overall goal. Avoid repeating completed actions.\n"
    "\n\n"
    "Respond ONLY with the extracted context. No additional information.\n\n"
    "\n{messages}\n"
)

# Fallback "keep" size (messages) and default token trigger threshold.
_DEFAULT_KEEP = 20
_DEFAULT_TRIGGER_TOKENS = 170_000

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _validate_context_size(cs: ContextSize, name: str) -> ContextSize:
    """Validate a (kind, value) context size; returns it unchanged or raises ValueError."""
    kind, value = cs
    if kind == "fraction":
        # Fractions are of max_input_tokens; 0 is useless, 1 means the full window.
        if not 0 < value <= 1:
            raise ValueError(f"{name} fraction must be in (0, 1], got {value}")
    elif kind in ("tokens", "messages"):
        if value < 0:
            raise ValueError(f"{name} must be non-negative, got {value}")
    else:
        raise ValueError(f"Unsupported context size type: {kind}")
    return cs


def _should_trigger(
    conditions: list[ContextSize],
    messages: list[Any],
    total_tokens: int,
    max_input_tokens: int | None,
) -> bool:
    """OR logic: any condition met → trigger."""
    for kind, value in conditions:
        # "fraction" is a no-op when max_input_tokens is unset (guarded at
        # construction time by SummarizationProcessor.__post_init__).
        if (
            (kind == "messages" and len(messages) >= value)
            or (kind == "tokens" and total_tokens >= value)
            or (kind == "fraction" and max_input_tokens and total_tokens >= int(max_input_tokens * value))
        ):
            return True
    return False


def _determine_cutoff(
    messages: list[Any],
    keep: ContextSize,
    token_counter: TokenCounter,
    max_input_tokens: int | None,
    default_keep: int,
) -> int:
    """Translate the *keep* spec into a safe slice index (messages[cutoff:] is kept)."""
    kind, value = keep
    if kind == "messages":
        return find_safe_cutoff(messages, int(value))
    elif kind == "tokens":
        return find_token_based_cutoff(messages, int(value), token_counter)
    elif kind == "fraction" and max_input_tokens:
        return find_token_based_cutoff(messages, int(max_input_tokens * value), token_counter)
    # Unreachable in practice (keep is validated), but fall back to a
    # message-count cutoff rather than failing.
    return find_safe_cutoff(messages, default_keep)


def format_messages_for_summary(messages: Sequence[Any]) -> str:
    """Format messages into a readable string for summarization.

    Works with pydantic-ai ModelMessages, dicts, or plain strings.
    """
    lines: list[str] = []
    for msg in messages:
        if isinstance(msg, dict):
            role = msg.get("role", "message")
            content = msg.get("content", "")
            lines.append(f"{role.title()}: {content}")
        elif isinstance(msg, str):
            lines.append(msg)
        elif hasattr(msg, "parts"):
            # pydantic-ai ModelRequest / ModelResponse — role is inferred
            # from the class name, parts are flattened to text.
            cls_name = type(msg).__name__
            role = "User" if "Request" in cls_name else "Assistant"
            parts_text = []
            for part in msg.parts:
                if hasattr(part, "content"):
                    parts_text.append(str(part.content))
                elif hasattr(part, "tool_name"):
                    parts_text.append(f"[Tool: {part.tool_name}]")
            lines.append(f"{role}: {' '.join(parts_text)}")
        else:
            lines.append(str(msg))
    return "\n".join(lines)


# ---------------------------------------------------------------------------
# SummarizationProcessor
# ---------------------------------------------------------------------------


@dataclass
class SummarizationProcessor:
    """Auto-triggering LLM summarization processor.

    Monitors token/message counts and automatically summarizes older messages
    when thresholds are reached. Injects a summary as a system message at
    the front of the history.

    Can be used as:
    - pydantic-ai ``history_processor`` (``__call__``)
    - standalone via ``process(messages)``
    """

    model: str | Any
    """Model for generating summaries (string name or pydantic-ai Model)."""

    trigger: ContextSize | list[ContextSize] | None = None
    """Threshold(s) that trigger summarization (OR logic)."""

    keep: ContextSize = ("messages", _DEFAULT_KEEP)
    """How much recent context to keep after summarization."""

    # Token estimator used for triggers and token-based cutoffs.
    token_counter: TokenCounter = field(default=approximate_token_count)
    # Template with a ``{messages}`` placeholder; see DEFAULT_SUMMARY_PROMPT.
    summary_prompt: str = DEFAULT_SUMMARY_PROMPT
    # Required only when any trigger/keep uses the "fraction" kind.
    max_input_tokens: int | None = None
    # Cap on the summarizer's own input, in tokens (~4 chars/token heuristic).
    trim_tokens_to_summarize: int | None = 4000

    # Normalized list form of ``trigger``, built in __post_init__.
    _trigger_conditions: list[ContextSize] = field(default_factory=list, init=False)
    # Last generated summary. NOTE(review): written but never read back in
    # the visible code — confirm whether it is meant for external inspection.
    _summary_cache: str | None = field(default=None, init=False)

    def __post_init__(self) -> None:
        # Normalize ``trigger`` (None / single / list) into a validated list.
        if self.trigger is None:
            self._trigger_conditions = []
        elif isinstance(self.trigger, list):
            self._trigger_conditions = [_validate_context_size(t, "trigger") for t in self.trigger]
        else:
            self._trigger_conditions = [_validate_context_size(self.trigger, "trigger")]
        self.keep = _validate_context_size(self.keep, "keep")
        # Fraction-based sizes are meaningless without a known window size.
        if (
            any(t[0] == "fraction" for t in self._trigger_conditions) or self.keep[0] == "fraction"
        ) and self.max_input_tokens is None:
            raise ValueError("max_input_tokens required for fraction-based trigger/keep")

    def _should_summarize(self, messages: list[Any], total_tokens: int) -> bool:
        # Thin wrapper over the module-level trigger check.
        return _should_trigger(self._trigger_conditions, messages, total_tokens, self.max_input_tokens)

    def _determine_cutoff(self, messages: list[Any]) -> int:
        # Thin wrapper over the module-level cutoff resolution.
        return _determine_cutoff(messages, self.keep, self.token_counter, self.max_input_tokens, _DEFAULT_KEEP)

    async def _create_summary(self, messages_to_summarize: list[Any]) -> str:
        """Generate summary using the configured LLM."""
        if not messages_to_summarize:
            return "No previous conversation history."

        formatted = format_messages_for_summary(messages_to_summarize)
        # Keep only the most recent tail of the transcript if it exceeds the
        # summarizer's input budget (~4 chars per token heuristic).
        if self.trim_tokens_to_summarize and len(formatted) > self.trim_tokens_to_summarize * 4:
            formatted = formatted[-(self.trim_tokens_to_summarize * 4) :]

        prompt = self.summary_prompt.format(messages=formatted)

        try:
            # Imported lazily so the module loads without pydantic-ai installed.
            from pydantic_ai import Agent

            agent = Agent(self.model, instructions="You summarize conversations concisely.")
            result = await agent.run(prompt)
            return (getattr(result, "output", None) or str(result)).strip()
        except Exception as e:
            # Deliberate best-effort: a failed summary degrades to an error
            # note in the history rather than aborting the run.
            return f"Error generating summary: {e!s}"

    async def process(self, messages: list[Any]) -> list[Any]:
        """Process messages: summarize if thresholds are exceeded."""
        total_tokens = self.token_counter(messages)
        if not self._should_summarize(messages, total_tokens):
            return messages

        cutoff = self._determine_cutoff(messages)
        if cutoff <= 0:
            # No safe place to cut (or everything should be kept) — pass through.
            return messages

        to_summarize = messages[:cutoff]
        preserved = messages[cutoff:]
        summary = await self._create_summary(to_summarize)
        self._summary_cache = summary

        # Inject summary as a system-like message at front
        summary_msg = {"role": "system", "content": f"Summary of previous conversation:\n\n{summary}"}
        return [summary_msg, *preserved]

    async def __call__(self, *args: Any) -> list[Any]:
        """pydantic-ai history_processor interface.

        Accepts either (messages,) or (ctx, messages).
        """
        if len(args) == 1:
            messages = args[0]
        elif len(args) == 2:
            messages = args[1]
        else:
            # Unexpected arity: pass the first positional through untouched
            # (or an empty list) rather than raising inside the framework.
            return list(args[0]) if args else []
        return await self.process(messages)


def create_summarization_processor(
    model: str | Any = "openai:gpt-4o",
    trigger: ContextSize | list[ContextSize] | None = ("tokens", _DEFAULT_TRIGGER_TOKENS),
    keep: ContextSize = ("messages", _DEFAULT_KEEP),
    max_input_tokens: int | None = None,
    token_counter: TokenCounter | None = None,
    summary_prompt: str | None = None,
) -> SummarizationProcessor:
    """Factory for SummarizationProcessor with sensible defaults.

    Optional arguments are only forwarded when explicitly provided, so the
    dataclass defaults remain the single source of truth.
    """
    kwargs: dict[str, Any] = {"model": model, "trigger": trigger, "keep": keep}
    if max_input_tokens is not None:
        kwargs["max_input_tokens"] = max_input_tokens
    if token_counter is not None:
        kwargs["token_counter"] = token_counter
    if summary_prompt is not None:
        kwargs["summary_prompt"] = summary_prompt
    return SummarizationProcessor(**kwargs)
diff --git a/agent_ext/memory/summarize.py b/agent_ext/memory/summarize.py
index b590aea..5cd629b 100644
--- a/agent_ext/memory/summarize.py
+++ b/agent_ext/memory/summarize.py
@@ -1,23 +1,26 @@
 from __future__ import annotations
 
 import hashlib
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from collections.abc import Callable
+from typing import Any
 
 from pydantic import BaseModel, Field
 
-from .base import MemoryManager
 from agent_ext.run_context import RunContext
+from .base import MemoryManager
+
 
 class Dossier(BaseModel):
     """
     Long-lived compressed state for a case/session.
""" - pinned_facts: List[str] = Field(default_factory=list) - timeline: List[str] = Field(default_factory=list) - entities: List[str] = Field(default_factory=list) - decisions: List[str] = Field(default_factory=list) - open_questions: List[str] = Field(default_factory=list) + + pinned_facts: list[str] = Field(default_factory=list) + timeline: list[str] = Field(default_factory=list) + entities: list[str] = Field(default_factory=list) + decisions: list[str] = Field(default_factory=list) + open_questions: list[str] = Field(default_factory=list) summary: str = "" @@ -70,11 +73,11 @@ def __init__( self.summarize_fn = summarize_fn self.message_to_text = message_to_text - self._dossier: Optional[Dossier] = None - self._dossier_artifact_id: Optional[str] = None - self._last_input_hash: Optional[str] = None + self._dossier: Dossier | None = None + self._dossier_artifact_id: str | None = None + self._last_input_hash: str | None = None - def shape_messages(self, messages: List[Any]) -> List[Any]: + def shape_messages(self, messages: list[Any]) -> list[Any]: # If we already have a dossier, prepend it as a synthetic system message. if not self._dossier: # Window only @@ -89,7 +92,7 @@ def shape_messages(self, messages: List[Any]) -> List[Any]: shaped = [dossier_msg, *tail] return shaped[-self.cfg.max_messages :] - def checkpoint(self, messages: List[Any], *, outcome: Any) -> None: + def checkpoint(self, messages: list[Any], *, outcome: Any) -> None: # Only summarize if we have enough history. 
if len(messages) < self.cfg.min_messages_before_summarize: return @@ -128,7 +131,7 @@ def checkpoint(self, messages: List[Any], *, outcome: Any) -> None: self._last_input_hash = input_hash # --- wiring: ctx is set by composition root after instantiation - _ctx: Optional[RunContext] = None + _ctx: RunContext | None = None def bind_ctx(self, ctx: RunContext) -> None: self._ctx = ctx @@ -141,7 +144,7 @@ def _ctx_required(self) -> RunContext: @staticmethod def _render_dossier(d: Dossier) -> str: # Keep it compact; model should treat as authoritative "case memory". - lines: List[str] = [] + lines: list[str] = [] if d.summary: lines.append(f"CASE DOSSIER SUMMARY:\n{d.summary}\n") if d.pinned_facts: diff --git a/agent_ext/memory/window.py b/agent_ext/memory/window.py index 73cecb9..26316f7 100644 --- a/agent_ext/memory/window.py +++ b/agent_ext/memory/window.py @@ -1,18 +1,75 @@ +"""Sliding window memory — message-count or token-aware trimming. + +Zero LLM cost, near-zero latency. Preserves tool call/response pairs. +""" + from __future__ import annotations -from typing import Any, List + +from typing import Any from .base import MemoryManager +from .cutoff import ( + TokenCounter, + approximate_token_count, + find_safe_cutoff, + find_token_based_cutoff, +) class SlidingWindowMemory(MemoryManager): - def __init__(self, max_messages: int): + """Keeps the most recent messages, discarding older ones. + + Supports both message-count and token-count modes. + Preserves tool call/response pairs (never splits a pair). + + Args: + max_messages: Max messages to keep (message-count mode). + max_tokens: Max tokens to keep (token mode, overrides max_messages). + token_counter: Custom token counting function. + trigger_messages: Only trim when this many messages are reached. + ``None`` means always trim when over *max_messages*. + trigger_tokens: Only trim when this many tokens are reached. 
+ """ + + def __init__( + self, + max_messages: int = 50, + *, + max_tokens: int | None = None, + token_counter: TokenCounter | None = None, + trigger_messages: int | None = None, + trigger_tokens: int | None = None, + ): self.max_messages = max_messages + self.max_tokens = max_tokens + self.token_counter = token_counter or approximate_token_count + self.trigger_messages = trigger_messages + self.trigger_tokens = trigger_tokens + + def _should_trim(self, messages: list[Any]) -> bool: + """Check if trimming should happen.""" + if self.trigger_tokens is not None and self.token_counter(messages) >= self.trigger_tokens: + return True + if self.trigger_messages is not None and len(messages) >= self.trigger_messages: + return True + # Default: trim when exceeding max + if self.trigger_messages is None and self.trigger_tokens is None: + if self.max_tokens is not None: + return self.token_counter(messages) > self.max_tokens + return len(messages) > self.max_messages + return False - def shape_messages(self, messages: List[Any]) -> List[Any]: - if len(messages) <= self.max_messages: + def shape_messages(self, messages: list[Any]) -> list[Any]: + if not self._should_trim(messages): return messages - return messages[-self.max_messages :] - def checkpoint(self, messages: List[Any], *, outcome: Any) -> None: - # No-op baseline; you can persist summaries/dossiers via artifacts here. 
+ if self.max_tokens is not None: + cutoff = find_token_based_cutoff(messages, self.max_tokens, self.token_counter) + else: + cutoff = find_safe_cutoff(messages, self.max_messages) + + return messages[cutoff:] if cutoff > 0 else messages + + def checkpoint(self, messages: list[Any], *, outcome: Any) -> None: + # Sliding window doesn't checkpoint — no persistent state needed return None diff --git a/agent_ext/modules/__init__.py b/agent_ext/modules/__init__.py new file mode 100644 index 0000000..7b4bc8b --- /dev/null +++ b/agent_ext/modules/__init__.py @@ -0,0 +1,2 @@ +from .registry import ModuleRegistry +from .spec import ModuleProvides, ModuleSpec, ModuleState diff --git a/agent_ext/modules/builtins/core/__init__.py b/agent_ext/modules/builtins/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent_ext/modules/builtins/core/module.py b/agent_ext/modules/builtins/core/module.py new file mode 100644 index 0000000..e76d31d --- /dev/null +++ b/agent_ext/modules/builtins/core/module.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from agent_ext.modules.spec import ModuleProvides, ModuleSpec + + +def init(ctx) -> None: + # Keep core tiny: just ensure these dicts exist so modules can register into them. + if getattr(ctx, "commands", None) is None: + ctx.commands = {} + if getattr(ctx, "events", None) is None: + ctx.events = {} + # A place to register interactive commands (like /status, /modules, etc.) 
+ + +module_spec = ModuleSpec( + name="core", + version="0.1.0", + description="Core runtime scaffolding for the workbench.", + provides=ModuleProvides(commands=["/status", "/modules", "/help"]), + init=init, +) diff --git a/agent_ext/modules/builtins/self_improve/__init__.py b/agent_ext/modules/builtins/self_improve/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent_ext/modules/builtins/self_improve/module.py b/agent_ext/modules/builtins/self_improve/module.py new file mode 100644 index 0000000..459a99a --- /dev/null +++ b/agent_ext/modules/builtins/self_improve/module.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from agent_ext.modules.spec import ModuleProvides, ModuleSpec +from agent_ext.self_improve.controller import SelfImproveController + + +def init(ctx) -> None: + ctx.self_improve = SelfImproveController() + + # optional commands for the TUI + def cmd_improve_status() -> str: + return "Self-improve enabled. Records in .agent_state/runs/" + + ctx.commands["/improve_status"] = cmd_improve_status + + +module_spec = ModuleSpec( + name="self_improve", + version="0.1.0", + description="Trigger-driven self-improvement loop (gated).", + provides=ModuleProvides(commands=["/improve_status"]), + init=init, +) diff --git a/agent_ext/modules/builtins/workflow/__init__.py b/agent_ext/modules/builtins/workflow/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent_ext/modules/builtins/workflow/module.py b/agent_ext/modules/builtins/workflow/module.py new file mode 100644 index 0000000..e8e01a6 --- /dev/null +++ b/agent_ext/modules/builtins/workflow/module.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +from agent_ext.modules.spec import ModuleProvides, ModuleSpec + + +def init(ctx) -> None: + # TUI already handles /workflows /assemble /exec; this is just a marker for module registry + ctx.commands.setdefault("/workflows", lambda: "Use the TUI command /workflows") + ctx.commands.setdefault("/assemble", 
lambda: "Use: /assemble ") + ctx.commands.setdefault("/exec", lambda: "Use: /exec ") + + +module_spec = ModuleSpec( + name="workflow", + version="0.1.0", + description="Workflow synthesis + execution + learning (bandit over assemblies).", + provides=ModuleProvides(commands=["/workflows", "/assemble", "/exec"]), + init=init, +) diff --git a/agent_ext/modules/loader.py b/agent_ext/modules/loader.py new file mode 100644 index 0000000..3ac187e --- /dev/null +++ b/agent_ext/modules/loader.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +import importlib +from types import ModuleType + + +def import_module(import_path: str) -> ModuleType: + return importlib.import_module(import_path) + + +def reload_module(mod: ModuleType) -> ModuleType: + return importlib.reload(mod) diff --git a/agent_ext/modules/registry.py b/agent_ext/modules/registry.py new file mode 100644 index 0000000..b633fe5 --- /dev/null +++ b/agent_ext/modules/registry.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import json +from collections.abc import Iterable +from pathlib import Path + +from .loader import import_module +from .spec import ModuleSpec, ModuleState + +STATE_FILE_DEFAULT = Path(".agent_state/registry.json") + + +class ModuleRegistry: + """ + Discovers module.py files under agent_ext/modules/builtins/*/module.py, + imports them, and enables their init(ctx) hooks. 
+ """ + + def __init__(self, *, state_file: Path = STATE_FILE_DEFAULT): + self.state_file = state_file + self.modules: dict[str, ModuleState] = {} + + def discover_builtin_import_paths(self) -> list[str]: + root = Path(__file__).resolve().parent / "builtins" + paths: list[str] = [] + if not root.exists(): + return paths + for mod_dir in sorted(p for p in root.iterdir() if p.is_dir()): + candidate = mod_dir / "module.py" + if candidate.exists(): + # agent_ext.modules.builtins..module + paths.append(f"agent_ext.modules.builtins.{mod_dir.name}.module") + return paths + + def load_from_import_path(self, import_path: str) -> ModuleSpec: + mod = import_module(import_path) + spec: ModuleSpec | None = getattr(mod, "module_spec", None) + if spec is None: + raise RuntimeError(f"{import_path} must define `module_spec: ModuleSpec`") + return spec + + def enable(self, spec: ModuleSpec, *, import_path: str, ctx) -> None: + state = ModuleState(spec=spec, enabled=True, loaded_from=import_path) + self.modules[spec.name] = state + if spec.init is not None: + spec.init(ctx) + + def disable(self, name: str) -> None: + if name in self.modules: + self.modules[name].enabled = False + + def enabled_specs(self) -> Iterable[ModuleSpec]: + for st in self.modules.values(): + if st.enabled: + yield st.spec + + def save(self) -> None: + self.state_file.parent.mkdir(parents=True, exist_ok=True) + data = { + "modules": [ + { + "name": st.spec.name, + "version": st.spec.version, + "description": st.spec.description, + "enabled": st.enabled, + "loaded_from": st.loaded_from, + } + for st in self.modules.values() + ] + } + self.state_file.write_text(json.dumps(data, indent=2), encoding="utf-8") + + def load_saved(self) -> dict[str, bool]: + if not self.state_file.exists(): + return {} + try: + raw = self.state_file.read_text(encoding="utf-8").strip() + if not raw: + return {} + data = json.loads(raw) + except (json.JSONDecodeError, OSError): + return {} + out: dict[str, bool] = {} + for item in 
data.get("modules", []): + out[item["name"]] = bool(item.get("enabled", True)) + return out + + def load_all_builtins(self, ctx) -> None: + enabled_map = self.load_saved() + for import_path in self.discover_builtin_import_paths(): + spec = self.load_from_import_path(import_path) + is_enabled = enabled_map.get(spec.name, True) + if is_enabled: + self.enable(spec, import_path=import_path, ctx=ctx) + else: + self.modules[spec.name] = ModuleState(spec=spec, enabled=False, loaded_from=import_path) diff --git a/agent_ext/modules/spec.py b/agent_ext/modules/spec.py new file mode 100644 index 0000000..96e1a63 --- /dev/null +++ b/agent_ext/modules/spec.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from collections.abc import Callable +from dataclasses import dataclass, field +from typing import Any + +InitFn = Callable[[Any], None] # ctx-like (RunContext) + + +@dataclass(frozen=True) +class ModuleProvides: + # Names only; actual registrations happen in init() + tools: list[str] = field(default_factory=list) + subagents: list[str] = field(default_factory=list) + hooks: list[str] = field(default_factory=list) + skills: list[str] = field(default_factory=list) + commands: list[str] = field(default_factory=list) + + +@dataclass(frozen=True) +class ModuleSpec: + name: str + version: str = "0.1.0" + description: str = "" + provides: ModuleProvides = field(default_factory=ModuleProvides) + + # Optional hard requirements (kept simple for now) + requires_exec: bool = False + requires_tools: bool = True + + # Called when module is enabled; should mutate ctx by attaching tools/subagents/hooks/etc. 
+ init: InitFn | None = None + + +@dataclass +class ModuleState: + spec: ModuleSpec + enabled: bool = True + loaded_from: str = "" # import path + meta: dict[str, Any] = field(default_factory=dict) diff --git a/agent_ext/research/__init__.py b/agent_ext/research/__init__.py index d44ec4c..81d470b 100644 --- a/agent_ext/research/__init__.py +++ b/agent_ext/research/__init__.py @@ -1,4 +1,4 @@ -from .models import ResearchPlan, ResearchTask, ResearchOutcome, ResearchBudget, Claim -from .planner import ResearchPlanner -from .executor import ResearchExecutor from .controller import DeepResearchController +from .executor import ResearchExecutor +from .models import Claim, ResearchBudget, ResearchOutcome, ResearchPlan, ResearchTask +from .planner import ResearchPlanner diff --git a/agent_ext/research/controller.py b/agent_ext/research/controller.py index 0059e93..9f648f6 100644 --- a/agent_ext/research/controller.py +++ b/agent_ext/research/controller.py @@ -1,16 +1,15 @@ from __future__ import annotations import time -from typing import Dict, List, Optional -from agent_ext.run_context import RunContext -from agent_ext.research.models import ResearchBudget, ResearchOutcome, ResearchPlan, ResearchTask -from agent_ext.research.planner import ResearchPlanner -from agent_ext.research.executor import ResearchExecutor -from agent_ext.research.ledger import ResearchLedger from agent_ext.research.evidence_graph import EvidenceGraph +from agent_ext.research.executor import ResearchExecutor from agent_ext.research.gap_analysis import propose_gaps +from agent_ext.research.ledger import ResearchLedger +from agent_ext.research.models import ResearchBudget, ResearchOutcome +from agent_ext.research.planner import ResearchPlanner from agent_ext.research.synth import build_outcome +from agent_ext.run_context import RunContext class DeepResearchController: @@ -105,5 +104,8 @@ async def run(self, ctx: RunContext, *, question: str) -> ResearchOutcome: ledger.add_event("research_end", 
{"steps": steps}) # store final report artifact - ctx.artifacts.put_json(outcome.model_dump(), metadata={"kind": "research_outcome", "case_id": ctx.case_id, "session_id": ctx.session_id}) + ctx.artifacts.put_json( + outcome.model_dump(), + metadata={"kind": "research_outcome", "case_id": ctx.case_id, "session_id": ctx.session_id}, + ) return outcome diff --git a/agent_ext/research/evidence_graph.py b/agent_ext/research/evidence_graph.py index 890dc2b..b25f635 100644 --- a/agent_ext/research/evidence_graph.py +++ b/agent_ext/research/evidence_graph.py @@ -1,7 +1,6 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import Dict, List, Set, Tuple from agent_ext.evidence.models import Evidence @@ -13,12 +12,13 @@ class EvidenceGraph: - evidence nodes - links to sources (artifact ids, URLs, doc ids) """ - evidence_nodes: Dict[str, Evidence] = field(default_factory=dict) - sources_by_evidence: Dict[str, Set[str]] = field(default_factory=dict) + + evidence_nodes: dict[str, Evidence] = field(default_factory=dict) + sources_by_evidence: dict[str, set[str]] = field(default_factory=dict) def add(self, evidence_id: str, ev: Evidence) -> None: self.evidence_nodes[evidence_id] = ev - srcs: Set[str] = set() + srcs: set[str] = set() for c in ev.citations: if c.source_id: srcs.add(c.source_id) @@ -27,17 +27,17 @@ def add(self, evidence_id: str, ev: Evidence) -> None: srcs.add(aid) self.sources_by_evidence[evidence_id] = srcs - def all_sources(self) -> Set[str]: - out: Set[str] = set() + def all_sources(self) -> set[str]: + out: set[str] = set() for s in self.sources_by_evidence.values(): out |= s return out - def evidence_without_citations(self) -> List[str]: + def evidence_without_citations(self) -> list[str]: return [eid for eid, ev in self.evidence_nodes.items() if not ev.citations] - def validation_failures(self) -> List[str]: - bad: List[str] = [] + def validation_failures(self) -> list[str]: + bad: list[str] = [] for eid, ev in 
self.evidence_nodes.items(): if ev.kind == "validation" and any(t == "validation:fail" for t in (ev.tags or [])): bad.append(eid) diff --git a/agent_ext/research/executor.py b/agent_ext/research/executor.py index 4a74e49..ee23299 100644 --- a/agent_ext/research/executor.py +++ b/agent_ext/research/executor.py @@ -1,13 +1,12 @@ from __future__ import annotations import time -from typing import Awaitable, Callable, Dict, List, Optional, Sequence +from collections.abc import Awaitable, Callable, Sequence from agent_ext.evidence.models import Evidence, Provenance -from agent_ext.run_context import RunContext -from agent_ext.research.models import ResearchTask from agent_ext.research.ledger import ResearchLedger - +from agent_ext.research.models import ResearchTask +from agent_ext.run_context import RunContext TaskHandler = Callable[[RunContext, ResearchTask, ResearchLedger], Awaitable[Sequence[Evidence]]] @@ -17,10 +16,10 @@ class ResearchExecutor: Executes ResearchTask via handlers (kind-based). 
import asyncio


async def execute_tasks_parallel(ctx, ledger, tasks, handler_map, max_concurrency: int = 4):
    """Run research tasks concurrently with bounded parallelism.

    Args:
        ctx: Run context forwarded to each task handler.
        ledger: ResearchLedger-like object; each task's evidence batch is
            recorded via ``ledger.add_evidence(evidence_list)``.
        tasks: Tasks to execute; each must expose ``kind`` and ``id``.
        handler_map: Mapping of task kind -> ``async handler(ctx, task, ledger)``.
            A missing kind raises KeyError (caller is expected to pre-filter).
        max_concurrency: Maximum number of handlers running at once.
    """
    sem = asyncio.Semaphore(max_concurrency)

    async def run_one(task):
        # gather() schedules everything up front; the semaphore is what
        # actually bounds how many handlers run concurrently.
        async with sem:
            handler = handler_map[task.kind]
            return task, await handler(ctx, task, ledger)

    pairs = await asyncio.gather(*[run_one(t) for t in tasks])
    for task, evidence_list in pairs:
        # BUG FIX: ResearchLedger.add_evidence(ev) takes a single evidence
        # sequence (see ledger.py). The previous two-argument call
        # ``ledger.add_evidence(task.id, evidence_list)`` raised TypeError
        # on every run.
        ledger.add_evidence(evidence_list)
new_tasks: List[ResearchTask] = [] + new_tasks: list[ResearchTask] = [] # 1) Uncited evidence (often model notes) -> request citations uncited = graph.evidence_without_citations() if uncited: - new_tasks.append(ResearchTask( - id=f"gap_citations_{len(ledger.tasks)+1}", - kind="analyze", - goal="Review findings that lack citations and either add citations or mark as inference/uncertain.", - inputs={"evidence_ids": uncited[:20]}, - priority=12, - tags=["gap", "citations"], - )) + new_tasks.append( + ResearchTask( + id=f"gap_citations_{len(ledger.tasks) + 1}", + kind="analyze", + goal="Review findings that lack citations and either add citations or mark as inference/uncertain.", + inputs={"evidence_ids": uncited[:20]}, + priority=12, + tags=["gap", "citations"], + ) + ) # 2) OCR validation failures -> schedule retry task val_fails = graph.validation_failures() if val_fails: - new_tasks.append(ResearchTask( - id=f"gap_ocr_retry_{len(ledger.tasks)+1}", - kind="analyze", - goal="OCR validation failed. Build and execute an OCR retry plan (higher DPI / alternate engine / rerun bad pages).", - inputs={"validation_evidence_ids": val_fails}, - priority=10, - tags=["gap", "ocr"], - )) + new_tasks.append( + ResearchTask( + id=f"gap_ocr_retry_{len(ledger.tasks) + 1}", + kind="analyze", + goal="OCR validation failed. 
Build and execute an OCR retry plan (higher DPI / alternate engine / rerun bad pages).", + inputs={"validation_evidence_ids": val_fails}, + priority=10, + tags=["gap", "ocr"], + ) + ) # 3) If too little evidence, add a broader search/browse task (if you support it) if len(graph.evidence_nodes) < 3: - new_tasks.append(ResearchTask( - id=f"gap_broaden_{len(ledger.tasks)+1}", - kind="search", - goal="Gather more sources relevant to the question; broaden query and collect citations.", - query=ledger.plan.question, - priority=20, - tags=["gap", "coverage"], - )) + new_tasks.append( + ResearchTask( + id=f"gap_broaden_{len(ledger.tasks) + 1}", + kind="search", + goal="Gather more sources relevant to the question; broaden query and collect citations.", + query=ledger.plan.question, + priority=20, + tags=["gap", "coverage"], + ) + ) return new_tasks[:max_new_tasks] diff --git a/agent_ext/research/handlers_default.py b/agent_ext/research/handlers_default.py index 118a940..3ab67b2 100644 --- a/agent_ext/research/handlers_default.py +++ b/agent_ext/research/handlers_default.py @@ -1,12 +1,12 @@ from __future__ import annotations -from typing import Sequence +from collections.abc import Sequence -from agent_ext.evidence.models import Evidence, Provenance, Citation -from agent_ext.run_context import RunContext -from agent_ext.research.models import ResearchTask +from agent_ext.evidence.models import Evidence, Provenance from agent_ext.research.ledger import ResearchLedger +from agent_ext.research.models import ResearchTask from agent_ext.research.synth import build_outcome +from agent_ext.run_context import RunContext async def handle_analyze(ctx: RunContext, task: ResearchTask, ledger: ResearchLedger) -> Sequence[Evidence]: @@ -29,7 +29,11 @@ async def handle_search(ctx: RunContext, task: ResearchTask, ledger: ResearchLed return [ Evidence( kind="note", - content={"goal": task.goal, "query": task.query, "note": "Search handler placeholder (wire to retrieval/web tool)."}, + 
content={ + "goal": task.goal, + "query": task.query, + "note": "Search handler placeholder (wire to retrieval/web tool).", + }, citations=[], provenance=Provenance(produced_by="handle_search", artifact_ids=[]), confidence=0.3, @@ -43,7 +47,9 @@ async def handle_subagent(ctx: RunContext, task: ResearchTask, ledger: ResearchL orch = ctx.subagents["orchestrator"] name = task.inputs.get("subagent_name") payload = task.inputs.get("payload", {}) - results = await orch.run_many(ctx, [(name, payload, {"task_id": task.id})], timeout_s=task.inputs.get("timeout_s", 60)) + results = await orch.run_many( + ctx, [(name, payload, {"task_id": task.id})], timeout_s=task.inputs.get("timeout_s", 60) + ) res = results.get(name) return [ Evidence( @@ -63,7 +69,11 @@ async def handle_synthesize(ctx: RunContext, task: ResearchTask, ledger: Researc return [ Evidence( kind="finding", - content={"final_answer": out.answer, "limitations": out.limitations, "claims": [c.model_dump() for c in out.claims]}, + content={ + "final_answer": out.answer, + "limitations": out.limitations, + "claims": [c.model_dump() for c in out.claims], + }, citations=[], provenance=Provenance(produced_by="handle_synthesize", artifact_ids=[]), confidence=0.75, diff --git a/agent_ext/research/ledger.py b/agent_ext/research/ledger.py index 04deb09..0498831 100644 --- a/agent_ext/research/ledger.py +++ b/agent_ext/research/ledger.py @@ -2,12 +2,13 @@ import hashlib import time +from collections.abc import Sequence from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Sequence +from typing import Any from agent_ext.evidence.models import Evidence -from agent_ext.run_context import RunContext from agent_ext.research.models import ResearchPlan, ResearchTask +from agent_ext.run_context import RunContext def _hash_jsonable(obj: Any) -> str: @@ -26,18 +27,19 @@ class ResearchLedger: - evidence batches - final report """ + plan: ResearchPlan - tasks: Dict[str, ResearchTask] = 
field(default_factory=dict) - evidence: List[Evidence] = field(default_factory=list) - events: List[Dict[str, Any]] = field(default_factory=list) + tasks: dict[str, ResearchTask] = field(default_factory=dict) + evidence: list[Evidence] = field(default_factory=list) + events: list[dict[str, Any]] = field(default_factory=list) def __post_init__(self) -> None: self.tasks = {t.id: t for t in self.plan.tasks} - def add_event(self, kind: str, payload: Dict[str, Any]) -> None: + def add_event(self, kind: str, payload: dict[str, Any]) -> None: self.events.append({"t": time.time(), "kind": kind, "payload": payload}) - def add_evidence(self, ev: Sequence[Evidence]) -> List[str]: + def add_evidence(self, ev: Sequence[Evidence]) -> list[str]: before = len(self.evidence) self.evidence.extend(list(ev)) after = len(self.evidence) @@ -47,25 +49,27 @@ def add_evidence(self, ev: Sequence[Evidence]) -> List[str]: def evidence_id(self, e: Evidence) -> str: # stable id for this run, based on content/provenance/citations - return _hash_jsonable({ - "kind": e.kind, - "content": e.content, - "prov": e.provenance.model_dump(), - "cits": [c.model_dump() for c in e.citations], - "tags": e.tags, - }) + return _hash_jsonable( + { + "kind": e.kind, + "content": e.content, + "prov": e.provenance.model_dump(), + "cits": [c.model_dump() for c in e.citations], + "tags": e.tags, + } + ) def get_task(self, task_id: str) -> ResearchTask: return self.tasks[task_id] - def list_tasks(self) -> List[ResearchTask]: + def list_tasks(self) -> list[ResearchTask]: return list(self.tasks.values()) - def pending_tasks(self) -> List[ResearchTask]: + def pending_tasks(self) -> list[ResearchTask]: return [t for t in self.tasks.values() if t.status == "pending"] - def runnable_tasks(self) -> List[ResearchTask]: - runnable: List[ResearchTask] = [] + def runnable_tasks(self) -> list[ResearchTask]: + runnable: list[ResearchTask] = [] for t in self.tasks.values(): if t.status != "pending": continue diff --git 
a/agent_ext/research/models.py b/agent_ext/research/models.py index f02d842..6979419 100644 --- a/agent_ext/research/models.py +++ b/agent_ext/research/models.py @@ -1,19 +1,19 @@ from __future__ import annotations -from typing import Any, Dict, List, Literal, Optional -from pydantic import BaseModel, Field +from typing import Any, Literal +from pydantic import BaseModel, Field TaskStatus = Literal["pending", "running", "done", "failed", "skipped"] TaskKind = Literal[ - "search", # web/retrieval (if allowed) - "browse", # computer-use (playwright/cdp) - "ingest_document", # OCR pipeline - "analyze", # reasoning over evidence / RLM - "extract", # structured extraction - "subagent", # delegate to local/server specialist - "tool", # direct tool call - "synthesize", # generate narrative/structured report + "search", # web/retrieval (if allowed) + "browse", # computer-use (playwright/cdp) + "ingest_document", # OCR pipeline + "analyze", # reasoning over evidence / RLM + "extract", # structured extraction + "subagent", # delegate to local/server specialist + "tool", # direct tool call + "synthesize", # generate narrative/structured report ] EvidenceKind = Literal[ @@ -32,49 +32,50 @@ class ResearchBudget(BaseModel): max_steps: int = 40 max_tool_calls: int = 60 max_runtime_s: int = 180 - max_cost_usd: Optional[float] = None # optional if you track cost + max_cost_usd: float | None = None # optional if you track cost class ResearchTask(BaseModel): id: str kind: TaskKind goal: str - query: Optional[str] = None # for search/browse - inputs: Dict[str, Any] = Field(default_factory=dict) - depends_on: List[str] = Field(default_factory=list) + query: str | None = None # for search/browse + inputs: dict[str, Any] = Field(default_factory=dict) + depends_on: list[str] = Field(default_factory=list) status: TaskStatus = "pending" attempts: int = 0 max_attempts: int = 2 - priority: int = 50 # lower = earlier - tags: List[str] = Field(default_factory=list) - error: Optional[str] = None 
+ priority: int = 50 # lower = earlier + tags: list[str] = Field(default_factory=list) + error: str | None = None class ResearchPlan(BaseModel): question: str - tasks: List[ResearchTask] = Field(default_factory=list) - assumptions: List[str] = Field(default_factory=list) - stop_conditions: List[str] = Field(default_factory=list) + tasks: list[ResearchTask] = Field(default_factory=list) + assumptions: list[str] = Field(default_factory=list) + stop_conditions: list[str] = Field(default_factory=list) class Claim(BaseModel): """ Claim ledger entry (atomic statement). """ + id: str text: str confidence: float = 0.7 - citations: List[Dict[str, Any]] = Field(default_factory=list) # store as dicts to avoid tight coupling - tags: List[str] = Field(default_factory=list) - derived_from_evidence_ids: List[str] = Field(default_factory=list) + citations: list[dict[str, Any]] = Field(default_factory=list) # store as dicts to avoid tight coupling + tags: list[str] = Field(default_factory=list) + derived_from_evidence_ids: list[str] = Field(default_factory=list) class ResearchOutcome(BaseModel): question: str answer: str - structured: Dict[str, Any] = Field(default_factory=dict) - claims: List[Claim] = Field(default_factory=list) - evidence_ids: List[str] = Field(default_factory=list) - limitations: List[str] = Field(default_factory=list) + structured: dict[str, Any] = Field(default_factory=dict) + claims: list[Claim] = Field(default_factory=list) + evidence_ids: list[str] = Field(default_factory=list) + limitations: list[str] = Field(default_factory=list) steps_taken: int = 0 - plan: Optional[ResearchPlan] = None + plan: ResearchPlan | None = None diff --git a/agent_ext/research/planner.py b/agent_ext/research/planner.py index 021ac1c..02fabc1 100644 --- a/agent_ext/research/planner.py +++ b/agent_ext/research/planner.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Callable, List, Optional +from collections.abc import Callable from 
agent_ext.research.models import ResearchPlan, ResearchTask @@ -10,7 +10,7 @@ def default_plan(question: str) -> ResearchPlan: Deterministic starter plan. Good enough to run without an LLM. You can replace with an LLM planner later. """ - tasks: List[ResearchTask] = [ + tasks: list[ResearchTask] = [ ResearchTask( id="t1_scope", kind="analyze", @@ -53,7 +53,8 @@ class ResearchPlanner: Planner with an optional LLM-based planning seam: plan_fn(question) -> ResearchPlan """ - def __init__(self, plan_fn: Optional[Callable[[str], ResearchPlan]] = None): + + def __init__(self, plan_fn: Callable[[str], ResearchPlan] | None = None): self.plan_fn = plan_fn or default_plan def make_plan(self, question: str) -> ResearchPlan: diff --git a/agent_ext/research/synth.py b/agent_ext/research/synth.py index 3003a7e..121783e 100644 --- a/agent_ext/research/synth.py +++ b/agent_ext/research/synth.py @@ -1,18 +1,18 @@ from __future__ import annotations -from typing import Any, Dict, List, Sequence +from collections.abc import Sequence from agent_ext.evidence.models import Evidence from agent_ext.research.models import Claim, ResearchOutcome -def build_claims_from_evidence(evidence: Sequence[Evidence], *, max_claims: int = 12) -> List[Claim]: +def build_claims_from_evidence(evidence: Sequence[Evidence], *, max_claims: int = 12) -> list[Claim]: """ Deterministic baseline claim builder: - turns 'finding' / 'structured' / 'web_capture' / 'doc_extract' into claims. In real use, you’ll replace this with an LLM-based claim extractor + validator. 
""" - claims: List[Claim] = [] + claims: list[Claim] = [] i = 0 for ev in evidence: if ev.kind not in {"finding", "structured", "web_capture", "doc_extract", "text"}: @@ -25,20 +25,22 @@ def build_claims_from_evidence(evidence: Sequence[Evidence], *, max_claims: int txt = str(ev.content)[:500] cits = [c.model_dump() for c in (ev.citations or [])] - claims.append(Claim( - id=f"c{i}", - text=txt if txt else f"Claim derived from evidence {i}", - confidence=min(0.9, max(0.4, ev.confidence or 0.7)), - citations=cits, - tags=list(ev.tags or []), - derived_from_evidence_ids=[], - )) + claims.append( + Claim( + id=f"c{i}", + text=txt if txt else f"Claim derived from evidence {i}", + confidence=min(0.9, max(0.4, ev.confidence or 0.7)), + citations=cits, + tags=list(ev.tags or []), + derived_from_evidence_ids=[], + ) + ) if len(claims) >= max_claims: break return claims -def synthesize_answer(question: str, claims: List[Claim]) -> str: +def synthesize_answer(question: str, claims: list[Claim]) -> str: """ Deterministic baseline synthesis. Replace with a PydanticAI synth agent for better narrative. @@ -52,7 +54,9 @@ def synthesize_answer(question: str, claims: List[Claim]) -> str: for c in claims[:10]: lines.append(f"- {c.text}") lines.append("") - lines.append("Limitations: This answer is based on the collected evidence; some claims may be incomplete or require additional sources.") + lines.append( + "Limitations: This answer is based on the collected evidence; some claims may be incomplete or require additional sources." + ) return "\n".join(lines) diff --git a/agent_ext/rlm/README.md b/agent_ext/rlm/README.md new file mode 100644 index 0000000..e6a29cb --- /dev/null +++ b/agent_ext/rlm/README.md @@ -0,0 +1,61 @@ +# RLM — Recursive Language Model + +Handle extremely large contexts with any LLM provider. The LLM writes Python code to programmatically explore and analyze data, optionally delegating semantic analysis to a sub-model via `llm_query()`. 
+ +## Features + +- **REPL Environment**: Persistent Python sandbox with state between executions +- **Sub-Model Delegation**: `llm_query()` within the sandbox for semantic analysis +- **Grounded Citations**: `GroundedResponse` model with citation markers +- **Safety**: Restricted built-ins, controlled imports, output truncation +- **Sandboxed File Access**: Temp directory for intermediate results + +## Quick Start + +```python +from agent_ext.rlm import REPLEnvironment, RLMConfig + +# Create REPL with context data +repl = REPLEnvironment( + context=massive_document, # str, dict, or list + config=RLMConfig( + sub_model="openai:gpt-4o-mini", # for llm_query() + code_timeout=60.0, + ), +) + +# LLM writes code to explore the data +result = repl.execute(""" +print(f"Context length: {len(context)}") +print(f"First 200 chars: {context[:200]}") +""") + +# State persists between executions +result2 = repl.execute(""" +# Find specific information +lines = context.split('\\n') +relevant = [l for l in lines if 'revenue' in l.lower()] +print(f"Found {len(relevant)} revenue lines") + +# Delegate semantic analysis to sub-model +if relevant: + analysis = llm_query(f"Summarize these revenue figures: {relevant[:5]}") + print(analysis) +""") + +repl.cleanup() +``` + +## Grounded Response + +```python +from agent_ext.rlm import GroundedResponse + +response = GroundedResponse( + info="Revenue grew [1] driven by expansion [2]", + grounding={ + "1": "increased by 45% year-over-year", + "2": "new markets in Asia-Pacific region", + }, +) +``` diff --git a/agent_ext/rlm/__init__.py b/agent_ext/rlm/__init__.py index e69de29..685d24f 100644 --- a/agent_ext/rlm/__init__.py +++ b/agent_ext/rlm/__init__.py @@ -0,0 +1,25 @@ +"""RLM — Recursive Language Model pattern for large-context analysis. + +Provides a sandboxed REPL where an LLM can write Python code to explore +data, with optional ``llm_query()`` for sub-model delegation and +``GroundedResponse`` for citation-grounded output. 
+""" + +from .models import ContextType, GroundedResponse, REPLResult, RLMConfig, RLMDependencies +from .policies import RLMPolicy +from .python_runner import RLMRunError, run_restricted_python +from .repl import REPLEnvironment, format_repl_result +from .toolset import cleanup_repl_environments, create_rlm_toolset + +__all__ = [ + "RLMPolicy", + "RLMRunError", + "run_restricted_python", + "ContextType", + "GroundedResponse", + "REPLResult", + "RLMConfig", + "RLMDependencies", + "REPLEnvironment", + "format_repl_result", +] diff --git a/agent_ext/rlm/logging.py b/agent_ext/rlm/logging.py new file mode 100644 index 0000000..c3d275e --- /dev/null +++ b/agent_ext/rlm/logging.py @@ -0,0 +1,101 @@ +"""Pretty logging for RLM code execution. + +Uses Rich for styled terminal output with syntax highlighting. +Falls back to plain text if Rich is not available. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .models import REPLResult + +try: + from rich.console import Console + from rich.panel import Panel + from rich.syntax import Syntax + from rich.text import Text + + RICH_AVAILABLE = True +except ImportError: + RICH_AVAILABLE = False + + +class RLMLogger: + """Pretty logger for RLM code execution.""" + + def __init__(self, enabled: bool = True): + self.enabled = enabled + self.console = Console() if RICH_AVAILABLE else None + + def log_code_execution(self, code: str) -> None: + """Log the code being executed.""" + if not self.enabled: + return + if RICH_AVAILABLE and self.console: + syntax = Syntax(code, "python", theme="monokai", line_numbers=True) + self.console.print(Panel(syntax, title="[bold cyan]Code Execution[/]", border_style="cyan")) + else: + print(f"\n{'=' * 50}\nCODE EXECUTION\n{'=' * 50}\n{code}\n{'=' * 50}") + + def log_result(self, result: REPLResult) -> None: + """Log the execution result.""" + if not self.enabled: + return + if RICH_AVAILABLE and self.console: + status = "[bold green]SUCCESS[/]" 
if result.success else "[bold red]ERROR[/]" + parts = [f"Executed in {result.execution_time:.3f}s"] + if result.stdout.strip(): + parts.append(f"\n[bold yellow]Output:[/]\n{result.stdout.strip()[:2000]}") + if result.stderr.strip(): + parts.append(f"\n[bold red]Errors:[/]\n{result.stderr.strip()[:1000]}") + body = "\n".join(parts) + border = "green" if result.success else "red" + self.console.print(Panel(body, title=f"Result: {status}", border_style=border)) + else: + status = "SUCCESS" if result.success else "ERROR" + print(f"\n{'=' * 50}\nRESULT: {status} ({result.execution_time:.3f}s)\n{'=' * 50}") + if result.stdout.strip(): + print(f"Output:\n{result.stdout.strip()[:2000]}") + if result.stderr.strip(): + print(f"Errors:\n{result.stderr.strip()[:1000]}") + + def log_llm_query(self, prompt: str) -> None: + """Log an llm_query call.""" + if not self.enabled: + return + display = prompt[:500] + "..." if len(prompt) > 500 else prompt + if RICH_AVAILABLE and self.console: + self.console.print(Panel(display, title="[bold blue]LLM Query[/]", border_style="blue")) + else: + print(f"\n{'=' * 50}\nLLM QUERY\n{'=' * 50}\n{display}") + + def log_llm_response(self, response: str) -> None: + """Log an llm_query response.""" + if not self.enabled: + return + display = response[:500] + "..." 
if len(response) > 500 else response + if RICH_AVAILABLE and self.console: + self.console.print(Panel(display, title="[bold blue]LLM Response[/]", border_style="blue")) + else: + print(f"\nLLM RESPONSE:\n{display}") + + +# Global logger instance +_logger: RLMLogger | None = None + + +def get_logger() -> RLMLogger: + """Get the global RLM logger (disabled by default).""" + global _logger + if _logger is None: + _logger = RLMLogger(enabled=False) + return _logger + + +def configure_logging(enabled: bool = True) -> RLMLogger: + """Configure RLM logging.""" + global _logger + _logger = RLMLogger(enabled=enabled) + return _logger diff --git a/agent_ext/rlm/models.py b/agent_ext/rlm/models.py new file mode 100644 index 0000000..d2424f5 --- /dev/null +++ b/agent_ext/rlm/models.py @@ -0,0 +1,110 @@ +"""Pydantic models and data types for the RLM system.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +from pydantic import BaseModel, Field + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +ContextType = str | dict[str, Any] | list[Any] + + +@dataclass +class RLMConfig: + """Configuration for RLM behavior.""" + + code_timeout: float = 60.0 + """Timeout in seconds for code execution.""" + + truncate_output_chars: int = 50_000 + """Maximum characters to return from code execution output.""" + + sub_model: str | None = None + """Model for llm_query() within the REPL environment.""" + + allow_imports: list[str] = field( + default_factory=lambda: [ + "math", + "json", + "re", + "statistics", + "collections", + "itertools", + "functools", + "operator", + "string", + "textwrap", + "datetime", + "hashlib", + "csv", + ] + ) + """Modules the REPL is allowed to import.""" + + +@dataclass +class RLMDependencies: + """Dependencies injected into RLM tools via RunContext.""" + + context: ContextType + 
"""The context to analyze (string, dict, or list).""" + + config: RLMConfig = field(default_factory=RLMConfig) + """RLM configuration options.""" + + def __post_init__(self): + if self.context is None: + raise ValueError("context cannot be None") + + +# --------------------------------------------------------------------------- +# Results +# --------------------------------------------------------------------------- + + +@dataclass +class REPLResult: + """Result from REPL code execution.""" + + stdout: str + """Standard output from execution.""" + + stderr: str + """Standard error from execution.""" + + locals: dict[str, Any] + """Local variables after execution.""" + + execution_time: float + """Time taken to execute in seconds.""" + + success: bool = True + """Whether execution completed without errors.""" + + +# --------------------------------------------------------------------------- +# Grounded response (citations) +# --------------------------------------------------------------------------- + + +class GroundedResponse(BaseModel): + """A response with citation markers mapping to exact quotes from source documents. + + Example:: + + GroundedResponse( + info="Revenue grew [1] driven by expansion [2]", + grounding={"1": "increased by 45%", "2": "new markets in Asia"}, + ) + """ + + info: str = Field(description="Response text with citation markers like [1]") + grounding: dict[str, str] = Field( + default_factory=dict, + description="Mapping from citation markers to exact quotes from the source", + ) diff --git a/agent_ext/rlm/policies.py b/agent_ext/rlm/policies.py index 164ff8c..05ed379 100644 --- a/agent_ext/rlm/policies.py +++ b/agent_ext/rlm/policies.py @@ -1,8 +1,17 @@ +"""RLM policies and the legacy restricted runner. + +For new code, use ``REPLEnvironment`` from ``rlm.repl`` instead of +``run_restricted_python``. 
+""" + from __future__ import annotations + from pydantic import BaseModel class RLMPolicy(BaseModel): - allow_imports: list[str] = ["math", "json", "re", "statistics"] + """Policy for restricted Python execution (legacy).""" + + allow_imports: list[str] = ["math", "json", "re", "statistics", "collections"] max_stdout_chars: int = 50_000 max_runtime_s: int = 10 diff --git a/agent_ext/rlm/prompts.py b/agent_ext/rlm/prompts.py new file mode 100644 index 0000000..0db27a1 --- /dev/null +++ b/agent_ext/rlm/prompts.py @@ -0,0 +1,121 @@ +"""Instruction templates for RLM agents. + +Provides detailed prompts for code execution strategy, grounding/citations, +and llm_query usage. +""" + +from __future__ import annotations + +RLM_INSTRUCTIONS = """You are an AI assistant that analyzes data using Python code execution. You have access to a REPL environment where code persists between executions. + +## REPL Environment + +The REPL environment provides: +1. A `context` variable containing your data (string, dict, or list) +2. Common modules available via import: `re`, `json`, `collections`, etc. +3. Variables persist between code executions + +## Strategy for Large Contexts + +### Step 1: Explore the Context Structure +```python +print(f"Context type: {type(context)}") +print(f"Context length: {len(context)}") +if isinstance(context, str): + print(f"First 500 chars: {context[:500]}") +``` + +### Step 2: Process the Data +For structured data: +```python +import re +sections = re.split(r'### (.+)', context) +for i in range(1, len(sections), 2): + header = sections[i] + content = sections[i+1][:200] + print(f"{header}: {content}...") +``` + +### Step 3: Build Your Answer +```python +results = [] +# ... process data ... +print(f"Final answer: {results}") +``` + +## Guidelines + +1. **Always explore first** — Check context type and size before processing +2. **Use print() liberally** — See intermediate results +3. 
**Store results in variables** — Build up your answer incrementally +4. **Be thorough** — For needle-in-haystack, search the entire context +""" + +GROUNDING_INSTRUCTIONS = """ + +## Grounding Requirements + +Your response MUST include grounded citations: + +1. **Citation Format**: Use markers like `[1]`, `[2]`, etc. in your response text +2. **Exact Quotes**: Each marker must map to an EXACT quote from the source context +3. **Quote Length**: Each quote should be 10-200 characters +4. **Consecutive Numbering**: Number citations consecutively starting from 1 + +### Output Format + +```json +{ + "info": "The document states that X [1]. Additionally, Y [2]", + "grounding": { + "1": "exact quote from source", + "2": "another exact quote" + } +} +``` +""" + +LLM_QUERY_INSTRUCTIONS = """ + +## Sub-LLM Queries + +You have access to `llm_query(prompt: str) -> str` for: +- **Semantic analysis** — Understanding meaning, not just text patterns +- **Summarization** — Condensing large sections of context +- **Chunked processing** — Analyzing context in manageable pieces + +### Example: Chunked Analysis +```python +chunk_size = 50000 +chunks = [context[i:i+chunk_size] for i in range(0, len(context), chunk_size)] + +summaries = [] +for i, chunk in enumerate(chunks): + summary = llm_query(f"Summarize this section:\\n{chunk}") + summaries.append(f"Chunk {i+1}: {summary}") + print(f"Processed chunk {i+1}/{len(chunks)}") + +final = llm_query(f"Based on these summaries, answer: ...\\n" + "\\n".join(summaries)) +print(final) +``` + +**Tips:** +- Use llm_query for semantic analysis that regex/string operations can't do +- Store results in variables to build up your answer +""" + + +def build_rlm_instructions( + include_llm_query: bool = False, + include_grounding: bool = False, + custom_suffix: str | None = None, +) -> str: + """Build RLM instructions with optional customization.""" + base = RLM_INSTRUCTIONS + if include_llm_query: + base += LLM_QUERY_INSTRUCTIONS + if 
include_grounding: + base += GROUNDING_INSTRUCTIONS + if custom_suffix: + base += f"\n\n## Additional Instructions\n\n{custom_suffix}" + return base diff --git a/agent_ext/rlm/python_runner.py b/agent_ext/rlm/python_runner.py index 9ac3a76..6734368 100644 --- a/agent_ext/rlm/python_runner.py +++ b/agent_ext/rlm/python_runner.py @@ -1,9 +1,9 @@ from __future__ import annotations + import io -import sys import time -from contextlib import redirect_stdout, redirect_stderr -from typing import Any, Dict +from contextlib import redirect_stderr, redirect_stdout +from typing import Any from .policies import RLMPolicy @@ -12,7 +12,7 @@ class RLMRunError(RuntimeError): pass -def run_restricted_python(code: str, *, policy: RLMPolicy, globals_in: Dict[str, Any] | None = None) -> Dict[str, Any]: +def run_restricted_python(code: str, *, policy: RLMPolicy, globals_in: dict[str, Any] | None = None) -> dict[str, Any]: """ Minimal safe-ish runner: - allows only a controlled import set via custom __import__ @@ -27,7 +27,7 @@ def limited_import(name, globals=None, locals=None, fromlist=(), level=0): raise ImportError(f"Import not allowed: {name}") return __import__(name, globals, locals, fromlist, level) - g: Dict[str, Any] = dict(globals_in or {}) + g: dict[str, Any] = dict(globals_in or {}) g["__builtins__"] = dict(__builtins__) g["__builtins__"]["__import__"] = limited_import diff --git a/agent_ext/rlm/repl.py b/agent_ext/rlm/repl.py new file mode 100644 index 0000000..85b5c88 --- /dev/null +++ b/agent_ext/rlm/repl.py @@ -0,0 +1,288 @@ +"""Sandboxed Python REPL environment for RLM. + +Provides a safe execution environment where an LLM can run Python code +to programmatically explore and analyze large contexts. State persists +between executions within a session. 
+ +Key features: +- Restricted built-ins (no eval/exec/compile/globals/input) +- Controlled imports via allow-list +- ``context`` variable pre-loaded with data to analyze +- ``llm_query()`` function for sub-model delegation (when configured) +- Persistent local state across executions +- stdout/stderr capture with truncation +""" + +from __future__ import annotations + +import io +import json +import os +import shutil +import sys +import tempfile +import textwrap +import threading +import time +from contextlib import contextmanager, suppress +from typing import Any, ClassVar + +from .models import ContextType, REPLResult, RLMConfig + + +class REPLEnvironment: + """Sandboxed Python execution environment for RLM.""" + + SAFE_BUILTINS: ClassVar[dict[str, Any]] = { + # Core types + "print": print, + "len": len, + "str": str, + "int": int, + "float": float, + "bool": bool, + "list": list, + "dict": dict, + "set": set, + "tuple": tuple, + "type": type, + "isinstance": isinstance, + "issubclass": issubclass, + # Iteration + "range": range, + "enumerate": enumerate, + "zip": zip, + "map": map, + "filter": filter, + "sorted": sorted, + "reversed": reversed, + "iter": iter, + "next": next, + # Math + "min": min, + "max": max, + "sum": sum, + "abs": abs, + "round": round, + "pow": pow, + "divmod": divmod, + # String / char + "chr": chr, + "ord": ord, + "hex": hex, + "bin": bin, + "oct": oct, + "repr": repr, + "ascii": ascii, + "format": format, + # Collections + "any": any, + "all": all, + "slice": slice, + "hash": hash, + "id": id, + "callable": callable, + # Attribute access + "hasattr": hasattr, + "getattr": getattr, + "setattr": setattr, + "delattr": delattr, + "dir": dir, + "vars": vars, + # Binary + "bytes": bytes, + "bytearray": bytearray, + "memoryview": memoryview, + "complex": complex, + # OOP + "super": super, + "property": property, + "staticmethod": staticmethod, + "classmethod": classmethod, + "object": object, + # Exceptions + "Exception": Exception, + 
"ValueError": ValueError, + "TypeError": TypeError, + "KeyError": KeyError, + "IndexError": IndexError, + "AttributeError": AttributeError, + "RuntimeError": RuntimeError, + "StopIteration": StopIteration, + "NotImplementedError": NotImplementedError, + # File access (sandboxed to temp dir) + "open": open, + "FileNotFoundError": FileNotFoundError, + "OSError": OSError, + } + + BLOCKED_BUILTINS: ClassVar[dict[str, None]] = { + "eval": None, + "exec": None, + "compile": None, + "globals": None, + "locals": None, + "input": None, + "__builtins__": None, + } + + def __init__(self, context: ContextType, config: RLMConfig | None = None) -> None: + self.config = config or RLMConfig() + self.temp_dir = tempfile.mkdtemp(prefix="rlm_repl_") + self._lock = threading.Lock() + self.locals: dict[str, Any] = {} + + # Set up globals with safe built-ins and controlled __import__ + builtins = {**self.SAFE_BUILTINS, **self.BLOCKED_BUILTINS} + allowed_set = set(self.config.allow_imports) + + def controlled_import(name, globs=None, locs=None, fromlist=(), level=0): + base = name.split(".")[0] + if base not in allowed_set: + raise ImportError(f"Import not allowed: {name}") + return __import__(name, globs, locs, fromlist, level) + + builtins["__import__"] = controlled_import + + self.globals: dict[str, Any] = {"__builtins__": builtins} + + if self.config.sub_model: + self._setup_llm_query() + + self._load_context(context) + + def _setup_llm_query(self) -> None: + """Set up llm_query() for sub-model delegation inside REPL.""" + + def llm_query(prompt: str) -> str: + """Query a sub-LLM. 
Useful for analyzing chunks of large context.""" + try: + if not self.config.sub_model: + return "Error: No sub-model configured" + from pydantic_ai import ModelRequest + from pydantic_ai.direct import model_request_sync + from pydantic_ai.messages import TextPart + + result = model_request_sync( + self.config.sub_model, + [ModelRequest.user_text_prompt(prompt)], + ) + text_parts = [p.content for p in result.parts if isinstance(p, TextPart)] + return "".join(text_parts) if text_parts else "" + except Exception as e: + return f"Error querying sub-LLM: {e!s}" + + self.globals["llm_query"] = llm_query + + def _load_context(self, context: ContextType) -> None: + """Load context into the REPL as the ``context`` variable.""" + if isinstance(context, str): + ctx_path = os.path.join(self.temp_dir, "context.txt") + with open(ctx_path, "w", encoding="utf-8") as f: + f.write(context) + load_code = f"with open(r'{ctx_path}', 'r', encoding='utf-8') as f:\n context = f.read()\n" + else: + ctx_path = os.path.join(self.temp_dir, "context.json") + with open(ctx_path, "w", encoding="utf-8") as f: + json.dump(context, f, indent=2, default=str) + load_code = ( + f"import json\nwith open(r'{ctx_path}', 'r', encoding='utf-8') as f:\n context = json.load(f)\n" + ) + self._execute_internal(load_code) + + def _execute_internal(self, code: str) -> None: + combined = {**self.globals, **self.locals} + exec(code, combined, combined) + for key, value in combined.items(): + if key not in self.globals and not key.startswith("_"): + self.locals[key] = value + + @contextmanager + def _capture_output(self): + old_stdout, old_stderr = sys.stdout, sys.stderr + stdout_buf, stderr_buf = io.StringIO(), io.StringIO() + try: + sys.stdout, sys.stderr = stdout_buf, stderr_buf + yield stdout_buf, stderr_buf + finally: + sys.stdout, sys.stderr = old_stdout, old_stderr + + def execute(self, code: str) -> REPLResult: + """Execute Python code in the REPL. 
State persists between calls.""" + code = textwrap.dedent(code).strip() + t0 = time.time() + success = True + stdout_content = stderr_content = "" + + with self._lock, self._capture_output() as (stdout_buf, stderr_buf): + try: + # Split imports from other code + lines = code.split("\n") + import_lines = [ + l for l in lines if l.strip().startswith(("import ", "from ")) and not l.strip().startswith("#") + ] + other_lines = [l for l in lines if l not in import_lines] + + if import_lines: + exec("\n".join(import_lines), self.globals, self.globals) + + if other_lines: + other_code = "\n".join(other_lines) + combined = {**self.globals, **self.locals} + exec(other_code, combined, combined) + for key, value in combined.items(): + if key not in self.globals: + self.locals[key] = value + + stdout_content = stdout_buf.getvalue() + stderr_content = stderr_buf.getvalue() + except Exception as e: + success = False + stderr_content = stderr_buf.getvalue() + f"\nError: {e!s}" + stdout_content = stdout_buf.getvalue() + + dt = time.time() - t0 + max_chars = self.config.truncate_output_chars + if len(stdout_content) > max_chars: + stdout_content = stdout_content[:max_chars] + "\n… (truncated)" + if len(stderr_content) > max_chars: + stderr_content = stderr_content[:max_chars] + "\n… (truncated)" + + return REPLResult( + stdout=stdout_content, + stderr=stderr_content, + locals=dict(self.locals), + execution_time=dt, + success=success, + ) + + def cleanup(self) -> None: + """Clean up temporary directory.""" + with suppress(Exception): + shutil.rmtree(self.temp_dir, ignore_errors=True) + + +def format_repl_result(result: REPLResult, max_var_display: int = 200) -> str: + """Format a REPL result for display to the LLM.""" + parts: list[str] = [] + if result.stdout.strip(): + parts.append(f"Output:\n{result.stdout}") + if result.stderr.strip(): + parts.append(f"Errors:\n{result.stderr}") + user_vars = { + k: v for k, v in result.locals.items() if not k.startswith("_") and k not in 
("context", "json", "re", "os") + } + if user_vars: + var_lines = [] + for name, value in user_vars.items(): + try: + vs = repr(value) + if len(vs) > max_var_display: + vs = vs[:max_var_display] + "..." + var_lines.append(f" {name} = {vs}") + except Exception: + var_lines.append(f" {name} = <{type(value).__name__}>") + if var_lines: + parts.append("Variables:\n" + "\n".join(var_lines)) + parts.append(f"Execution time: {result.execution_time:.3f}s") + return "\n\n".join(parts) if parts else "Code executed successfully (no output)" diff --git a/agent_ext/rlm/toolset.py b/agent_ext/rlm/toolset.py new file mode 100644 index 0000000..5968a7a --- /dev/null +++ b/agent_ext/rlm/toolset.py @@ -0,0 +1,108 @@ +"""RLM toolset — gives any pydantic-ai agent a sandboxed code execution tool. + +Example:: + + from pydantic_ai import Agent + from agent_ext.rlm import create_rlm_toolset, RLMDependencies + + toolset = create_rlm_toolset() + agent = Agent("openai:gpt-4o", toolsets=[toolset]) + + deps = RLMDependencies(context={"users": [...]}) + result = await agent.run("Analyze the user data", deps=deps) +""" + +from __future__ import annotations + +import asyncio + +from pydantic_ai import RunContext +from pydantic_ai.toolsets import FunctionToolset + +from .models import REPLResult, RLMConfig, RLMDependencies +from .repl import REPLEnvironment, format_repl_result + +EXECUTE_CODE_DESCRIPTION = """ +Execute Python code in a sandboxed REPL environment. + +## Environment +- A `context` variable is pre-loaded with the data to analyze +- Variables persist between executions within the same session +- Standard library modules are available (json, re, collections, etc.) +- Use print() to display output + +## When to Use +- Analyzing or processing structured data (JSON, dicts, lists) +- Performing calculations or data transformations +- Extracting specific information from large datasets +- Testing hypotheses about the data structure + +## Best Practices +1. 
# Global registry of live REPL environments, keyed by id(deps) so each
# agent run (deps object) reuses one persistent REPL session.
# NOTE(review): id() values can be reused after the deps object is
# garbage-collected; entries live until cleanup_repl_environments().
_repl_registry: dict[int, REPLEnvironment] = {}


def create_rlm_toolset(
    *,
    code_timeout: float = 60.0,
    sub_model: str | None = None,
    toolset_id: str | None = None,
) -> FunctionToolset[RLMDependencies]:
    """Create an RLM toolset for code execution in a sandboxed REPL.

    Args:
        code_timeout: Timeout in seconds for a single code execution.
        sub_model: Model to use for llm_query() within the REPL.
        toolset_id: Optional unique identifier for the toolset.

    Returns:
        FunctionToolset compatible with any pydantic-ai agent.
    """
    toolset: FunctionToolset[RLMDependencies] = FunctionToolset(id=toolset_id)

    def _get_or_create_repl(ctx: RunContext[RLMDependencies]) -> REPLEnvironment:
        deps_id = id(ctx.deps)
        if deps_id not in _repl_registry:
            config = ctx.deps.config or RLMConfig()
            if sub_model and not config.sub_model:
                # Inject the toolset-level sub_model WITHOUT discarding the
                # caller's other config settings — the previous code replaced
                # the whole config with a fresh RLMConfig(sub_model=...),
                # silently losing every customized field.
                if hasattr(config, "model_copy"):  # pydantic v2 model
                    config = config.model_copy(update={"sub_model": sub_model})
                else:  # assumes RLMConfig is a dataclass — TODO confirm
                    import dataclasses

                    config = dataclasses.replace(config, sub_model=sub_model)
            _repl_registry[deps_id] = REPLEnvironment(
                context=ctx.deps.context,
                config=config,
            )
        return _repl_registry[deps_id]

    @toolset.tool(description=EXECUTE_CODE_DESCRIPTION)
    async def execute_code(ctx: RunContext[RLMDependencies], code: str) -> str:
        repl_env = _get_or_create_repl(ctx)
        try:
            loop = asyncio.get_running_loop()
            # Execution happens in a worker thread; wait_for enforces the
            # wall-clock budget.
            result: REPLResult = await asyncio.wait_for(
                loop.run_in_executor(None, repl_env.execute, code),
                timeout=code_timeout,
            )
            return format_repl_result(result)
        except (TimeoutError, asyncio.TimeoutError):
            # asyncio.TimeoutError only became an alias of the builtin
            # TimeoutError in Python 3.11; catching both keeps 3.10 correct.
            return f"Error: Code execution timed out after {code_timeout} seconds."
        except Exception as e:
            return f"Error executing code: {e!s}"

    return toolset


def cleanup_repl_environments() -> None:
    """Tear down every registered REPL (removes temp dirs) and clear the registry."""
    for repl_env in _repl_registry.values():
        repl_env.cleanup()
    _repl_registry.clear()
@dataclass
class BM25Config:
    # k1: term-frequency saturation; b: document-length normalization strength.
    k1: float = 1.2
    b: float = 0.75
    # Default number of results returned by search().
    top_k: int = 20


class BM25Index:
    """
    In-memory BM25 with incremental rebuild for changed files.

    Persisted to .agent_state/bm25_index.json and bm25_meta.json
    """

    def __init__(self, *, bm25_cfg: BM25Config, tok_cfg: TokenizerConfig, indexer_cfg: RepoIndexerConfig):
        self.bm25_cfg = bm25_cfg
        self.tokenizer = Tokenizer(tok_cfg)
        self.indexer = RepoIndexer(indexer_cfg)

        # postings: token -> {doc_id: tf}
        self.postings: dict[str, dict[str, int]] = {}
        # doc_id -> token count of the document (BM25 length normalization).
        self.doc_len: dict[str, int] = {}
        # doc_id -> sha256 of the content that was indexed (change detection).
        self.doc_sha: dict[str, str] = {}
        # Corpus statistics: document count and average document length.
        self.N: int = 0
        self.avgdl: float = 0.0
        self._index_ready: bool = False  # rebuild deferred until first search

    def ensure_index(self) -> None:
        """Build or refresh index on first use (keeps startup fast)."""
        if self._index_ready:
            return
        self.rebuild_incremental()
        self._index_ready = True

    def load(self) -> None:
        """Load persisted postings and corpus statistics from .agent_state."""
        # NOTE(review): the meta file is read but its result is discarded —
        # presumably intended for a compatibility check; confirm intent.
        read_json(BM25_META_FILE, {})
        data = read_json(BM25_INDEX_FILE, {})
        self.postings = data.get("postings", {})
        self.doc_len = data.get("doc_len", {})
        self.doc_sha = data.get("doc_sha", {})
        self.N = int(data.get("N", 0))
        self.avgdl = float(data.get("avgdl", 0.0))

    def save(self) -> None:
        """Persist BM25 parameters (meta) and the full index to .agent_state."""
        write_json(
            BM25_META_FILE,
            {
                "k1": self.bm25_cfg.k1,
                "b": self.bm25_cfg.b,
                "tokenizer": {
                    "use_tiktoken": bool(self.tokenizer.cfg.use_tiktoken),
                    "tiktoken_encoding": self.tokenizer.cfg.tiktoken_encoding,
                },
            },
        )
        write_json(
            BM25_INDEX_FILE,
            {
                "postings": self.postings,
                "doc_len": self.doc_len,
                "doc_sha": self.doc_sha,
                "N": self.N,
                "avgdl": self.avgdl,
            },
        )

    def _remove_doc(self, doc_id: str) -> None:
        # remove doc from postings; drop a token entirely once its posting
        # list becomes empty. Scans the whole vocabulary per document.
        for tok, plist in list(self.postings.items()):
            if doc_id in plist:
                del plist[doc_id]
                if not plist:
                    del self.postings[tok]
        self.doc_len.pop(doc_id, None)
        self.doc_sha.pop(doc_id, None)

    def _add_doc(self, doc_id: str, text: str, sha: str) -> None:
        # Tokenize and build this document's term-frequency map.
        toks = self.tokenizer.tokenize(text)
        tf: dict[str, int] = {}
        for t in toks:
            tf[t] = tf.get(t, 0) + 1

        for tok, cnt in tf.items():
            self.postings.setdefault(tok, {})[doc_id] = cnt

        self.doc_len[doc_id] = len(toks)
        self.doc_sha[doc_id] = sha

    def rebuild_incremental(self) -> tuple[int, int]:
        """
        Updates repo index, then updates BM25 index only for changed/removed files.
        Returns (num_changed, num_removed).
        """
        repo_state, changed, removed = self.indexer.update_incremental()

        # load existing index if not loaded
        if not self.postings:
            self.load()

        # apply removals
        for doc_id in removed:
            self._remove_doc(doc_id)

        # apply changes
        for doc_id in changed:
            meta = repo_state.files.get(doc_id)
            if not meta:
                continue
            sha = meta.get("sha256", "")
            skipped = bool(meta.get("skipped", False))
            if skipped:
                # treat as removed to avoid stale
                self._remove_doc(doc_id)
                continue

            # if sha unchanged, skip
            if self.doc_sha.get(doc_id) == sha:
                continue

            text = self.indexer.read_text(doc_id)
            if text is None:
                continue
            # remove old then add
            self._remove_doc(doc_id)
            self._add_doc(doc_id, text, sha)

        # recompute corpus stats
        self.N = len(self.doc_len)
        self.avgdl = (sum(self.doc_len.values()) / self.N) if self.N else 0.0
        self.save()
        return len(changed), len(removed)

    def search(self, query: str, *, top_k: int | None = None) -> list[tuple[str, float]]:
        """Score documents against *query* with BM25 and return (doc_id, score)
        pairs sorted by descending score, truncated to top_k.

        NOTE(review): a caller-supplied top_k of 0 falls back to the config
        default because of the `or` — confirm whether that is intended.
        """
        self.ensure_index()
        q_toks = self.tokenizer.tokenize(query)
        if not q_toks or self.N == 0:
            return []

        k1 = self.bm25_cfg.k1
        b = self.bm25_cfg.b
        top_k = top_k or self.bm25_cfg.top_k

        scores: dict[str, float] = {}
        # De-duplicate query tokens: each unique term contributes once.
        seen_tokens = set(q_toks)

        for tok in seen_tokens:
            plist = self.postings.get(tok)
            if not plist:
                continue
            df = len(plist)
            # idf with BM25+ style smoothing
            idf = math.log(1.0 + (self.N - df + 0.5) / (df + 0.5))

            for doc_id, tf in plist.items():
                dl = self.doc_len.get(doc_id, 0)
                # 1e-9 epsilons guard against a zero avgdl / denominator.
                denom = tf + k1 * (1.0 - b + b * (dl / (self.avgdl + 1e-9)))
                score = idf * (tf * (k1 + 1.0) / (denom + 1e-9))
                scores[doc_id] = scores.get(doc_id, 0.0) + score

        return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_k]
class RepoIndexer:
    """Incrementally tracks size/mtime/sha256 metadata for text files under cfg.root."""

    def __init__(self, cfg: RepoIndexerConfig):
        self.cfg = cfg
        self.root = Path(cfg.root)

    def load_state(self) -> RepoIndexState:
        """Read the persisted index state (fresh empty state when missing/corrupt)."""
        data = read_json(REPO_INDEX_FILE, {"version": "0.1.0", "files": {}})
        st = RepoIndexState(version=data.get("version", "0.1.0"), files=data.get("files", {}))
        return st

    def save_state(self, st: RepoIndexState) -> None:
        """Persist the index state to .agent_state/repo_index.json."""
        write_json(REPO_INDEX_FILE, {"version": st.version, "files": st.files})

    def _should_exclude(self, p: Path) -> bool:
        # Exclude when ANY path component matches an excluded directory name.
        parts = set(p.parts)
        return any(ed in parts for ed in self.cfg.exclude_dirs)

    def scan(self) -> list[Path]:
        """Collect indexable files under root (bounded by cfg.max_files)."""
        out: list[Path] = []
        for p in self.root.rglob("*"):
            if len(out) >= self.cfg.max_files:
                break
            if p.is_dir():
                continue
            if self._should_exclude(p):
                continue
            if p.suffix.lower() not in self.cfg.exts:
                continue
            out.append(p)
        return out

    def update_incremental(self) -> tuple[RepoIndexState, list[str], list[str]]:
        """
        Returns (state, changed_paths, removed_paths)
        """
        st = self.load_state()
        existing = set(st.files.keys())
        seen = set()
        changed: list[str] = []

        for p in self.scan():
            rel = str(p.relative_to(self.root))
            seen.add(rel)

            try:
                stat = p.stat()
            except Exception:
                # File vanished between scan() and stat(); skip it this pass.
                continue

            # quick skip by mtime/size
            prev = st.files.get(rel)
            if (
                prev
                and int(prev.get("size", -1)) == int(stat.st_size)
                and float(prev.get("mtime", -1)) == float(stat.st_mtime)
            ):
                continue

            if stat.st_size > self.cfg.max_file_bytes:
                # record meta but skip hashing huge text
                st.files[rel] = {
                    "sha256": prev.get("sha256", "") if prev else "",
                    "size": int(stat.st_size),
                    "mtime": float(stat.st_mtime),
                    "lang": _file_lang(p),
                    "skipped": True,
                }
                changed.append(rel)
                continue

            try:
                b = p.read_bytes()
            except Exception:
                # Unreadable (permissions, race) — keep the previous record.
                continue

            st.files[rel] = {
                "sha256": _sha256_bytes(b),
                "size": int(stat.st_size),
                "mtime": float(stat.st_mtime),
                "lang": _file_lang(p),
                "skipped": False,
            }
            changed.append(rel)

        # Anything previously indexed but not seen this scan was deleted.
        removed = sorted(list(existing - seen))
        for rp in removed:
            st.files.pop(rp, None)

        self.save_state(st)
        return st, changed, removed

    def read_text(self, rel_path: str) -> str | None:
        """Best-effort UTF-8 read of an indexed file; None when unreadable."""
        p = self.root / rel_path
        try:
            return p.read_text(encoding="utf-8", errors="ignore")
        except Exception:
            return None
re.compile(r"[A-Za-z_][A-Za-z_0-9]{1,}|[0-9]+") + + +@dataclass +class TokenizerConfig: + use_tiktoken: bool = False + tiktoken_encoding: str = "o200k_base" # override if needed + max_tokens_per_doc: int = 20000 # prevent indexing huge blobs + + +def _regex_tokens(text: str) -> list[str]: + return [m.group(0).lower() for m in _WORD_RE.finditer(text)] + + +class Tokenizer: + """ + Tokenizer with optional tiktoken backing. For BM25 you usually want word-ish tokens; + tiktoken can help for code identifiers + punctuation-y stuff, but word tokens often win. + + Strategy: + - Default to regex word tokens (fast, good for BM25) + - Optionally augment with tiktoken token strings (configurable) + """ + + def __init__(self, cfg: TokenizerConfig): + self.cfg = cfg + self._enc = None + if cfg.use_tiktoken: + try: + import tiktoken # type: ignore + + self._enc = tiktoken.get_encoding(cfg.tiktoken_encoding) + except Exception: + self._enc = None # silently fall back + + def tokenize(self, text: str) -> list[str]: + toks = _regex_tokens(text) + + if self._enc is not None: + # Augment with token ids mapped to strings to capture code-ish fragments + # Keep bounded so we don't explode postings. 
+ ids = self._enc.encode(text) + if len(ids) > self.cfg.max_tokens_per_doc: + ids = ids[: self.cfg.max_tokens_per_doc] + # Represent token ids as "t" (stable, compact, not leaking raw bytes) + toks.extend([f"t{tid}" for tid in ids]) + + return toks diff --git a/agent_ext/self_improve/__init__.py b/agent_ext/self_improve/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent_ext/self_improve/controller.py b/agent_ext/self_improve/controller.py new file mode 100644 index 0000000..df52bc5 --- /dev/null +++ b/agent_ext/self_improve/controller.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from .gates import run_gates +from .models import ImprovementRunRecord, PatchProposal, TriggerEvent +from .patching import apply_unified_diff + +RUNS_DIR = Path(".agent_state/runs") + + +class SelfImproveController: + """ + Trigger-driven: does nothing unless you call run_once(trigger, proposal). + In the next iteration, we’ll have the agent generate the proposal/diff. + For now, it’s the plumbing for: apply diff -> run gates -> record result. 
+ """ + + def __init__(self): + RUNS_DIR.mkdir(parents=True, exist_ok=True) + + def run_once(self, trigger: TriggerEvent, proposal: PatchProposal, *, adopt: bool = False) -> ImprovementRunRecord: + # 1) apply patch if present + if proposal.unified_diff: + ok, out = apply_unified_diff(proposal.unified_diff) + if not ok: + rec = ImprovementRunRecord( + trigger=trigger, proposal=proposal, gates=run_gates(proposal.gate_plan), adopted=False + ) + self._write_record(rec, extra={"patch_apply_error": out}) + return rec + + # 2) run gates + gates = run_gates(proposal.gate_plan) + + # 3) decide adoption (for now, adoption = “keep the changes”) + adopted = bool(adopt and gates.ok) + + rec = ImprovementRunRecord(trigger=trigger, proposal=proposal, gates=gates, adopted=adopted) + self._write_record(rec) + return rec + + def _write_record(self, rec: ImprovementRunRecord, extra: dict | None = None) -> None: + payload = { + "trigger": rec.trigger.__dict__, + "proposal": { + "title": rec.proposal.title, + "rationale": rec.proposal.rationale, + "files_to_edit": rec.proposal.files_to_edit, + "gate_plan": rec.proposal.gate_plan.__dict__, + "has_diff": bool(rec.proposal.unified_diff), + }, + "gates": {"ok": rec.gates.ok, "details": rec.gates.details}, + "adopted": rec.adopted, + } + if extra: + payload["extra"] = extra + out = RUNS_DIR / f"run_{rec.trigger.kind}_{abs(hash(rec.trigger.signature))}.json" + out.write_text(json.dumps(payload, indent=2), encoding="utf-8") diff --git a/agent_ext/self_improve/gates.py b/agent_ext/self_improve/gates.py new file mode 100644 index 0000000..a31a010 --- /dev/null +++ b/agent_ext/self_improve/gates.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +import subprocess +import sys +from pathlib import Path + +from .models import GatePlan, GateResults + + +def run_import_check(*, cwd: Path | None = None) -> tuple[bool, str]: + # Try importing the main packages as a sanity check. 
+ code = "import agent_ext\nimport agent_patterns\nprint('imports_ok')\n" + kw = {"capture_output": True, "text": True} + if cwd is not None: + kw["cwd"] = str(cwd) + p = subprocess.run([sys.executable, "-c", code], **kw) + ok = p.returncode == 0 and "imports_ok" in (p.stdout or "") + return ok, (p.stdout + "\n" + p.stderr).strip() + + +def run_compile_check(*, cwd: Path | None = None) -> tuple[bool, str]: + # Compile agent_ext and repo root (agent_patterns package lives at root, no agent_patterns/ dir) + kw = {"capture_output": True, "text": True} + if cwd is not None: + kw["cwd"] = str(cwd) + p = subprocess.run([sys.executable, "-m", "compileall", "-q", "."], **kw) + ok = p.returncode == 0 + return ok, (p.stdout + "\n" + p.stderr).strip() + + +def run_pytest(paths: list[str], *, cwd: Path | None = None) -> tuple[bool, str]: + if not paths: + return True, "pytest skipped (no paths)" + kw = {"capture_output": True, "text": True} + if cwd is not None: + kw["cwd"] = str(cwd) + p = subprocess.run([sys.executable, "-m", "pytest", *paths], **kw) + ok = p.returncode == 0 + return ok, (p.stdout + "\n" + p.stderr).strip() + + +def run_gates(plan: GatePlan, *, repo_root: Path | None = None) -> GateResults: + cwd = Path(repo_root) if repo_root is not None else None + details = {} + ok = True + + if plan.import_check: + iok, out = run_import_check(cwd=cwd) + details["import_check"] = out + ok = ok and iok + + if plan.compile_check: + cok, out = run_compile_check(cwd=cwd) + details["compile_check"] = out + ok = ok and cok + + if plan.pytest_paths: + pok, out = run_pytest(plan.pytest_paths, cwd=cwd) + details["pytest"] = out + ok = ok and pok + + return GateResults(ok=ok, details=details) diff --git a/agent_ext/self_improve/models.py b/agent_ext/self_improve/models.py new file mode 100644 index 0000000..c28a6bb --- /dev/null +++ b/agent_ext/self_improve/models.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass 
+class TriggerEvent: + kind: str # "exception", "test_fail", "user_feedback" + signature: str # stable fingerprint (e.g., exception type + message) + detail: str # human readable + count: int = 1 + + +@dataclass +class GatePlan: + import_check: bool = True + compile_check: bool = True + pytest_paths: list[str] = field(default_factory=list) # optional + + +@dataclass +class GateResults: + ok: bool + details: dict[str, str] = field(default_factory=dict) + + +@dataclass +class PatchProposal: + title: str + rationale: str + files_to_edit: list[str] + gate_plan: GatePlan + # Minimal diff representation; keep simple for now + unified_diff: str | None = None + + +@dataclass +class ImprovementRunRecord: + trigger: TriggerEvent + proposal: PatchProposal + gates: GateResults + adopted: bool diff --git a/agent_ext/self_improve/patching.py b/agent_ext/self_improve/patching.py new file mode 100644 index 0000000..e0616d7 --- /dev/null +++ b/agent_ext/self_improve/patching.py @@ -0,0 +1,215 @@ +from __future__ import annotations + +import re +import subprocess +from pathlib import Path + + +def _extract_diff_from_lines(lines: list[str]) -> str: + """From a list of lines, find a contiguous unified diff and return it (with trailing newline).""" + start = None + for i, line in enumerate(lines): + if line.startswith("--- ") or line.startswith("diff --git "): + start = i + break + if start is None: + return "" + + def is_diff_line(ln: str) -> bool: + return ( + ln.startswith("--- ") + or ln.startswith("+++ ") + or ln.startswith("diff --git ") + or ln.startswith("@@") # Valid: @@ -1,3 +1,4 @@ ; LLM sometimes outputs bare @@ (we repair later) + or (len(ln) >= 1 and ln[0] == " ") # context line (must have leading space) + or (len(ln) >= 1 and ln[0] == "+" and not ln.startswith("+++ ")) # added line + or (len(ln) >= 1 and ln[0] == "-" and not ln.startswith("--- ")) # removed line + or ln.startswith("index ") + or ln.startswith("new file mode ") + or ln.startswith("old mode ") + or 
ln.startswith("deleted file mode ") + ) + + end = start + for i in range(start, len(lines)): + if is_diff_line(lines[i]): + end = i + 1 + else: + break + return "\n".join(lines[start:end]) + "\n" if end > start else "" + + +# Valid hunk header: @@ -L1,N1 +L2,N2 @@ (optional trailing text) +_HUNK_HEADER_RE = re.compile(r"^@@ -\d+,\d+ \+\d+,\d+ @@") + + +def _repair_hunk_headers(diff: str) -> str: + """ + Repair malformed @@ hunk headers (e.g. LLM outputs bare @@ with no line numbers). + Counts old/new lines in each hunk body and writes valid @@ -N1,C1 +N2,C2 @@. + """ + if not diff or "@@" not in diff: + return diff + lines = diff.split("\n") + out: list[str] = [] + i = 0 + in_new_file = False + while i < len(lines): + line = lines[i] + if line.startswith("--- ") and "/dev/null" in line: + in_new_file = True + out.append(line) + i += 1 + continue + if line.startswith("--- ") or line.startswith("diff --git "): + in_new_file = False + if not line.startswith("@@"): + out.append(line) + i += 1 + continue + # This line is a hunk header (maybe malformed) + if _HUNK_HEADER_RE.match(line.strip()): + out.append(line) + i += 1 + continue + # Malformed: collect hunk body and build valid header + n_old = n_new = 0 + j = i + 1 + while j < len(lines) and not ( + lines[j].startswith("@@") or lines[j].startswith("diff --git") or lines[j].startswith("--- ") + ): + ln = lines[j] + if ln.startswith("-") and not ln.startswith("--- "): + n_old += 1 + elif ln.startswith("+") and not ln.startswith("+++ "): + n_new += 1 + elif ln.startswith(" ") and len(ln) >= 1: + # Context line (leading space) + n_old += 1 + n_new += 1 + j += 1 + if in_new_file: + # New file: @@ -0,0 +1,N @@ + out.append(f"@@ -0,0 +1,{max(1, n_new)} @@") + else: + # Modified file: @@ -1,N_old +1,N_new @@ (single-hunk heuristic) + out.append(f"@@ -1,{max(1, n_old)} +1,{max(1, n_new)} @@") + i += 1 + # Append the hunk body (lines between this @@ and the next @@ or end) + while i < j: + out.append(lines[i]) + i += 1 + 
return "\n".join(out) + ("\n" if out else "") + + +def _normalize_path_in_line(line: str) -> str: + """Strip leading slashes and normalize backslashes in path part so git apply accepts paths.""" + if line.startswith("diff --git "): + rest = line[len("diff --git ") :].strip() + parts = rest.split(None, 1) + if len(parts) >= 2: + a_path = parts[0].replace("\\", "/").lstrip("/") + b_path = parts[1].replace("\\", "/").lstrip("/") + a_path = a_path if a_path.startswith("a/") else "a/" + a_path.lstrip("a/") + b_path = b_path if b_path.startswith("b/") else "b/" + b_path.lstrip("b/") + return f"diff --git {a_path} {b_path}" + if line.startswith("--- ") or line.startswith("+++ "): + path_part = line[4:].strip().replace("\\", "/") + if "/dev/null" not in path_part: + path_part = path_part.lstrip("/") + return line[:4] + path_part + return line + + +def _normalize_diff_paths(diff: str) -> str: + """Apply path normalization to header lines so Windows/absolute paths work.""" + lines = diff.split("\n") + out = [] + for ln in lines: + if ln.startswith("diff --git ") or ln.startswith("--- ") or ln.startswith("+++ "): + out.append(_normalize_path_in_line(ln)) + else: + out.append(ln) + return "\n".join(out) + ("\n" if out else "") + + +def _extract_diff_anywhere(text: str) -> str: + """Find a block that looks like a unified diff (has ---/+++ and @@) anywhere in text.""" + lines = text.split("\n") + best = "" + for i, line in enumerate(lines): + if not (line.startswith("--- ") or line.startswith("diff --git ")): + continue + out = _extract_diff_from_lines(lines[i:]) + if out and "@@" in out and len(out) > len(best): + best = out + return best + + +def sanitize_diff_for_apply(diff_text: str) -> str: + """ + Extract a single valid unified diff from LLM output (may contain markdown, commentary, trailing text). + - Strips markdown code fences (```diff, ```patch, ```); also looks inside response for fenced blocks. 
+ - Keeps only lines that look like a unified diff (---/+++, diff --git, @@ hunks, context). + - Normalizes line endings to LF and path separators so git apply accepts the patch. + """ + if not diff_text or not diff_text.strip(): + return "" + raw = diff_text.strip() + raw = raw.replace("\r\n", "\n").replace("\r", "\n") + for marker in ("```diff", "```patch", "```"): + if raw.startswith(marker): + raw = raw[len(marker) :].lstrip("\n") + if raw.endswith("```"): + raw = raw[: raw.rfind("```")].rstrip("\n") + lines = raw.split("\n") + out = _extract_diff_from_lines(lines) + if not out and ("--- " in raw or "diff --git " in raw) and "@@" in raw: + out = _extract_diff_anywhere(raw) + if out: + out = _repair_hunk_headers(out) + out = _normalize_diff_paths(out) + return out + parts = re.split(r"```(?:\w*)\s*\n?", raw) + for block in parts: + block = block.strip() + if "@@" not in block or ("--- " not in block and "+++ " not in block and "diff --git " not in block): + continue + out = _extract_diff_from_lines(block.split("\n")) + if not out: + out = _extract_diff_anywhere(block) + if out and "@@" in out: + out = _repair_hunk_headers(out) + out = _normalize_diff_paths(out) + return out + return "" + + +def apply_unified_diff(diff_text: str, repo_root: Path = Path(".")) -> tuple[bool, str]: + """ + Applies a unified diff using `git apply` if available. + Sanitizes LLM output (markdown, trailing text, line endings) before applying. 
+ """ + cleaned = sanitize_diff_for_apply(diff_text) + if not cleaned.strip(): + return ( + False, + "sanitize_diff: no unified diff found in output (LLM must output raw diff: ---/+++ headers, @@ hunks, no markdown)", + ) + if "@@" not in cleaned: + return ( + False, + "sanitize_diff: no valid hunks (unified diff must contain @@ hunk headers; LLM may have returned prose instead of a diff)", + ) + p = subprocess.run( + ["git", "apply", "--whitespace=nowarn", "-"], + input=cleaned, + text=True, + cwd=str(repo_root), + capture_output=True, + ) + ok = p.returncode == 0 + err = (p.stdout + "\n" + p.stderr).strip() + if not ok and "No valid patches" in err: + err = "LLM did not produce a valid unified diff (git apply: no valid patches). Output must be raw diff only: diff --git or ---/+++, then @@ hunks with +/− lines. No markdown or commentary." + return ok, err diff --git a/agent_ext/self_improve/triggers.py b/agent_ext/self_improve/triggers.py new file mode 100644 index 0000000..2cb9168 --- /dev/null +++ b/agent_ext/self_improve/triggers.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from .models import TriggerEvent + +TRIGGERS_FILE = Path(".agent_state/triggers.json") + + +class TriggerStore: + def __init__(self, path: Path = TRIGGERS_FILE): + self.path = path + self.path.parent.mkdir(parents=True, exist_ok=True) + self._data: dict[str, int] = {} + self._load() + + def _load(self) -> None: + if not self.path.exists(): + self._data = {} + return + try: + raw = self.path.read_text(encoding="utf-8").strip() + if not raw: + self._data = {} + return + self._data = json.loads(raw) + except (json.JSONDecodeError, OSError): + self._data = {} + + def save(self) -> None: + self.path.write_text(json.dumps(self._data, indent=2), encoding="utf-8") + + def bump(self, signature: str) -> int: + self._data[signature] = int(self._data.get(signature, 0)) + 1 + self.save() + return self._data[signature] + + def get_count(self, 
signature: str) -> int: + return int(self._data.get(signature, 0)) + + def make_exception_trigger(self, exc: BaseException) -> TriggerEvent: + sig = f"{type(exc).__name__}:{str(exc)[:200]}" + count = self.bump(sig) + return TriggerEvent(kind="exception", signature=sig, detail=repr(exc), count=count) diff --git a/agent_ext/skills/README.md b/agent_ext/skills/README.md new file mode 100644 index 0000000..4833c58 --- /dev/null +++ b/agent_ext/skills/README.md @@ -0,0 +1,60 @@ +# Skills — Progressive-Disclosure Instruction Packs + +Modular skill system for AI agents with discovery, loading, validation, and composable registries. + +## Features + +- **Directory Discovery**: `skills//SKILL.md` convention +- **Programmatic Creation**: Define skills in code (no filesystem) +- **Registry Composition**: Combine, filter, prefix registries +- **Validation**: Metadata and structure validation +- **Progressive Disclosure**: List → get → load (minimal tokens) + +## Quick Start + +```python +from agent_ext.skills import SkillRegistry, SkillLoader, create_skill + +# Discover from directories +registry = SkillRegistry(roots=["skills", "vendor/skills"]) +registry.discover() + +# Load a skill +loader = SkillLoader(max_bytes=256_000) +loaded = loader.load(registry.get("my_skill")) +print(loaded.body_markdown) +``` + +## Programmatic Skills + +```python +from agent_ext.skills import create_skill + +skill = create_skill( + id="code_review", + name="Code Review", + description="Review code for quality and bugs", + body="# Code Review\n\nReview the code for...", + tags=["code", "review"], +) +``` + +## Registry Composition + +```python +from agent_ext.skills import ( + SkillRegistry, CombinedRegistry, FilteredRegistry, PrefixedRegistry, +) + +# Combine multiple sources +local = SkillRegistry(roots=["skills"]) +local.discover() + +vendor = SkillRegistry(roots=["vendor/skills"]) +vendor.discover() + +# Merge, filter, or prefix +combined = CombinedRegistry([local, vendor]) +python_only = 
"""Skills — progressive-disclosure instruction packs for AI agents."""

from .exceptions import SkillError, SkillLoadError, SkillNotFoundError, SkillValidationError
from .loader import SkillLoader
from .models import LoadedSkill, SkillSpec, create_skill
from .registries import CombinedRegistry, FilteredRegistry, PrefixedRegistry, RenamedRegistry, WrapperRegistry
from .registry import SkillRegistry

# Bug fix: RenamedRegistry and WrapperRegistry were imported above (and
# documented in the package README) but missing from __all__, so
# `from agent_ext.skills import *` silently dropped them and linters
# flagged the imports as unused.
__all__ = [
    "LoadedSkill",
    "SkillSpec",
    "create_skill",
    "SkillRegistry",
    "SkillLoader",
    "SkillError",
    "SkillNotFoundError",
    "SkillValidationError",
    "SkillLoadError",
    "CombinedRegistry",
    "FilteredRegistry",
    "PrefixedRegistry",
    "RenamedRegistry",
    "WrapperRegistry",
]
def create_skill(
    *,
    id: str,
    name: str,
    description: str,
    body: str,
    tags: list[str] | None = None,
    version: str = "0.1.0",
    required_perms: list[str] | None = None,
    tool_bundle: str | None = None,
    metadata: dict[str, Any] | None = None,
) -> LoadedSkill:
    """Build a fully in-memory skill — no SKILL.md on disk required.

    Example::

        skill = create_skill(
            id="code_review",
            name="Code Review",
            description="Review code for quality and bugs",
            body="# Code Review\\n\\nReview the code for...\\n",
            tags=["code", "review"],
        )
    """
    # Hash the body up front; the hash is part of the LoadedSkill contract.
    digest = hashlib.sha256(body.encode("utf-8")).hexdigest()
    return LoadedSkill(
        spec=SkillSpec(
            id=id,
            name=name,
            description=description,
            tags=tags or [],
            version=version,
            required_perms=required_perms or [],
            tool_bundle=tool_bundle,
            metadata=metadata or {},
        ),
        body_markdown=body,
        body_hash=digest,
    )
def create_skills_toolset(*, toolset_id: str | None = None) -> FunctionToolset[SkillsDeps]:
    """Create a skills toolset for progressive-disclosure skill usage.

    Args:
        toolset_id: Optional stable identifier for the toolset.

    Returns:
        FunctionToolset with list_skills and load_skill tools.
    """
    toolset: FunctionToolset[SkillsDeps] = FunctionToolset(id=toolset_id)

    @toolset.tool
    async def list_skills(ctx: RunContext[SkillsDeps]) -> str:
        """List all available skills with their IDs and descriptions.

        Use this to discover what skills are available before loading one.
        """
        skills = ctx.deps.registry.list()
        if not skills:
            return "No skills available."
        lines = []
        for s in skills:
            tags = f" [{', '.join(s.tags)}]" if s.tags else ""
            lines.append(f"- **{s.id}**: {s.description}{tags}")
        return "\n".join(lines)

    @toolset.tool
    async def load_skill(ctx: RunContext[SkillsDeps], skill_id: str) -> str:
        """Load the full instructions for a specific skill.

        Args:
            skill_id: The skill ID to load (from list_skills).

        Returns:
            The skill's markdown instructions.
        """
        # Bug fix: `except (KeyError, Exception)` was redundant — Exception
        # already subsumes KeyError. The broad catch itself is deliberate:
        # tool failures must come back to the model as text, never raise.
        try:
            spec = ctx.deps.registry.get(skill_id)
        except Exception:
            return f"Error: Skill '{skill_id}' not found. Use list_skills to see available skills."

        loader = ctx.deps.loader or SkillLoader()
        try:
            loaded = loader.load(spec)
        except Exception as e:
            return f"Error loading skill '{skill_id}': {e}"
        # Truncate outside the try: only loader.load() is expected to fail.
        body = loaded.body_markdown
        if len(body) > ctx.deps.max_body_chars:
            body = body[: ctx.deps.max_body_chars] + "\n\n... (truncated)"
        return f"# Skill: {spec.name}\n\n{body}"

    return toolset
class FilteredRegistry:
    """Wraps a registry, only exposing skills that pass *predicate*.

    Example::

        # Only Python skills
        filtered = FilteredRegistry(
            base_registry,
            predicate=lambda spec: "python" in spec.tags,
        )
    """

    def __init__(self, inner, *, predicate: Callable[[SkillSpec], bool]) -> None:
        self._inner = inner
        self._predicate = predicate

    def list(self) -> builtins.list[SkillSpec]:
        """All inner skills that satisfy the predicate."""
        passes = self._predicate
        return [spec for spec in self._inner.list() if passes(spec)]

    def get(self, skill_id: str) -> SkillSpec:
        """Fetch a skill; filtered-out skills behave as if absent."""
        candidate = self._inner.get(skill_id)
        if self._predicate(candidate):
            return candidate
        raise SkillNotFoundError(skill_id)

    def has(self, skill_id: str) -> bool:
        """True when the skill exists in the inner registry AND passes the filter."""
        try:
            self.get(skill_id)
        except (KeyError, SkillNotFoundError):
            return False
        return True
+ +Example:: + + from agent_ext.skills.registries.git import GitSkillsRegistry + + registry = GitSkillsRegistry( + repo_url="https://github.com/anthropics/skills", + path="skills", + target_dir="./cached-skills", + ) + registry.clone_or_pull() + + for spec in registry.list(): + print(spec.id, spec.name) +""" + +from __future__ import annotations + +import hashlib +import os +import shutil +import subprocess +import tempfile +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any +from urllib.parse import urlparse, urlunparse + +from ..exceptions import SkillNotFoundError +from ..models import SkillSpec + + +@dataclass +class GitCloneOptions: + """Options for git clone/pull operations. + + Args: + depth: Shallow clone depth (``None`` for full clone). + branch: Branch, tag, or ref to check out. + single_branch: Only clone the specified branch. + sparse_paths: Paths for sparse checkout (empty = full tree). + env: Extra environment variables for git commands. 
+ """ + + depth: int | None = 1 + branch: str | None = None + single_branch: bool = True + sparse_paths: list[str] = field(default_factory=list) + env: dict[str, str] = field(default_factory=dict) + + +def _inject_token(url: str, token: str) -> str: + """Embed a token into an HTTPS URL.""" + parsed = urlparse(url) + if parsed.scheme in ("http", "https"): + netloc = f"oauth2:{token}@{parsed.hostname}" + if parsed.port: + netloc += f":{parsed.port}" + return urlunparse(parsed._replace(netloc=netloc)) + return url + + +def _sanitize_url(url: str) -> str: + """Strip credentials from a URL for display.""" + parsed = urlparse(url) + if parsed.password: + netloc = parsed.hostname or "" + if parsed.port: + netloc += f":{parsed.port}" + return urlunparse(parsed._replace(netloc=netloc)) + return url + + +def _run_git(cmd: list[str], *, cwd: Path | None = None, env: dict[str, str] | None = None) -> tuple[bool, str]: + """Run a git command, return (ok, output).""" + full_env = {**os.environ, **(env or {})} + p = subprocess.run(cmd, cwd=str(cwd) if cwd else None, env=full_env, capture_output=True, text=True) + out = (p.stdout or "") + ("\n" if p.stdout and p.stderr else "") + (p.stderr or "") + return p.returncode == 0, out.strip() + + +class GitSkillsRegistry: + """Skills registry backed by a cloned git repository. + + Clones on first use (or when ``clone_or_pull()`` is called), then discovers + skills from ``SKILL.md`` files within the specified ``path``. + + Args: + repo_url: Repository URL (HTTPS or SSH). + target_dir: Local directory for the clone (temp dir if None). + path: Sub-path inside the repo where skills live (default: root). + token: Personal access token for HTTPS auth (falls back to ``GITHUB_TOKEN`` env). + ssh_key_file: Path to SSH key for SSH auth. + clone_options: Fine-grained clone configuration. + auto_clone: Clone immediately on construction (default True). 
+ """ + + def __init__( + self, + repo_url: str, + *, + target_dir: str | Path | None = None, + path: str = "", + token: str | None = None, + ssh_key_file: str | Path | None = None, + clone_options: GitCloneOptions | None = None, + auto_clone: bool = True, + ) -> None: + self._repo_url = repo_url + self._path = path.strip("/") + self._options = clone_options or GitCloneOptions() + self._clean_url = _sanitize_url(repo_url) + + # Auth + effective_token = token or os.environ.get("GITHUB_TOKEN") + self._clone_url = _inject_token(repo_url, effective_token) if effective_token else repo_url + + # SSH key + if ssh_key_file: + key_path = Path(ssh_key_file).expanduser().resolve() + self._options.env["GIT_SSH_COMMAND"] = f"ssh -i {key_path} -o StrictHostKeyChecking=accept-new" + + # Target directory + self._tmp_dir: tempfile.TemporaryDirectory | None = None + if target_dir is None: + self._tmp_dir = tempfile.TemporaryDirectory(prefix="skills_git_") + self._target_dir = Path(self._tmp_dir.name) + else: + self._target_dir = Path(target_dir).expanduser().resolve() + + self._skills: dict[str, SkillSpec] = {} + + if auto_clone: + self.clone_or_pull() + + def __repr__(self) -> str: + return f"GitSkillsRegistry(repo={self._clean_url!r}, path={self._path!r})" + + @property + def skills_root(self) -> Path: + if self._path: + return self._target_dir / self._path + return self._target_dir + + def _is_cloned(self) -> bool: + return (self._target_dir / ".git").exists() + + def clone_or_pull(self) -> None: + """Clone the repo (or pull if already cloned), then discover skills.""" + opts = self._options + env = opts.env + + if self._is_cloned(): + # Pull + _run_git(["git", "pull"], cwd=self._target_dir, env=env) + else: + # Clone + self._target_dir.mkdir(parents=True, exist_ok=True) + cmd = ["git", "clone"] + if opts.depth is not None: + cmd += ["--depth", str(opts.depth)] + if opts.branch: + cmd += ["--branch", opts.branch] + if opts.single_branch: + cmd.append("--single-branch") + cmd += 
[self._clone_url, str(self._target_dir)] + + ok, out = _run_git(cmd, env=env) + if not ok: + # Sanitize error (don't leak tokens) + clean_out = out.replace(self._clone_url, self._clean_url) + raise RuntimeError(f"git clone failed: {clean_out}") + + # Sparse checkout if requested + if opts.sparse_paths: + _run_git(["git", "sparse-checkout", "init"], cwd=self._target_dir, env=env) + _run_git(["git", "sparse-checkout", "set", *opts.sparse_paths], cwd=self._target_dir, env=env) + + self._discover() + + def _discover(self) -> None: + """Discover skills from SKILL.md files in the skills root.""" + self._skills.clear() + root = self.skills_root + if not root.exists(): + return + + for entry in sorted(root.iterdir()): + if not entry.is_dir(): + continue + md_path = entry / "SKILL.md" + if not md_path.exists(): + continue + + body = md_path.read_text(encoding="utf-8", errors="replace") + first_line = next((ln.strip("# ").strip() for ln in body.splitlines() if ln.strip()), entry.name) + body_hash = hashlib.sha256(body.encode("utf-8")).hexdigest() + + spec = SkillSpec( + id=entry.name, + name=first_line or entry.name, + description=f"Skill from {self._clean_url}: {entry.name}", + path=str(md_path), + metadata={ + "body_hash": body_hash, + "repo": self._clean_url, + "registry": "git", + }, + ) + self._skills[spec.id] = spec + + def list(self) -> list[SkillSpec]: + return list(self._skills.values()) + + def get(self, skill_id: str) -> SkillSpec: + if skill_id not in self._skills: + raise SkillNotFoundError(skill_id) + return self._skills[skill_id] + + def has(self, skill_id: str) -> bool: + return skill_id in self._skills + + def refresh(self) -> None: + """Pull latest and re-discover skills.""" + self.clone_or_pull() + + def cleanup(self) -> None: + """Clean up temporary directory.""" + if self._tmp_dir: + self._tmp_dir.cleanup() + self._tmp_dir = None diff --git a/agent_ext/skills/registries/prefixed.py b/agent_ext/skills/registries/prefixed.py new file mode 100644 index 
class PrefixedRegistry:
    """Wraps a registry, prefixing all skill IDs.

    Example::

        prefixed = PrefixedRegistry(base, prefix="vendor_")
        # Skill "search" becomes "vendor_search"
    """

    def __init__(self, inner, *, prefix: str) -> None:
        self._inner = inner
        self._prefix = prefix

    def _inner_id(self, skill_id: str) -> str | None:
        """Strip our prefix; None when *skill_id* is outside our namespace."""
        if skill_id.startswith(self._prefix):
            return skill_id[len(self._prefix):]
        return None

    def _prefixed(self, spec: SkillSpec) -> SkillSpec:
        """Rebuild *spec* under the namespaced ID; all other fields pass through."""
        return SkillSpec(
            id=self._prefix + spec.id,
            name=spec.name,
            description=spec.description,
            tags=spec.tags,
            version=spec.version,
            path=spec.path,
            required_perms=spec.required_perms,
            tool_bundle=spec.tool_bundle,
            metadata=spec.metadata,
        )

    def list(self) -> builtins.list[SkillSpec]:
        return [self._prefixed(spec) for spec in self._inner.list()]

    def get(self, skill_id: str) -> SkillSpec:
        inner_id = self._inner_id(skill_id)
        if inner_id is None:
            raise SkillNotFoundError(skill_id)
        return self._prefixed(self._inner.get(inner_id))

    def has(self, skill_id: str) -> bool:
        inner_id = self._inner_id(skill_id)
        if inner_id is None:
            return False
        try:
            self._inner.get(inner_id)
        except (KeyError, SkillNotFoundError):
            return False
        return True
class RenamedRegistry(WrapperRegistry):
    """A registry that renames skills using a name map.

    ``name_map`` maps **new names to original names**:
    ``{'new-name': 'original-name'}``. Skills absent from the map keep
    their original names.
    """

    def __init__(self, inner, *, name_map: dict[str, str]) -> None:
        super().__init__(inner)
        self.name_map = name_map  # new_name → original_name

    @property
    def _reverse_map(self) -> dict[str, str]:
        """original_name → new_name."""
        return {original: new for new, original in self.name_map.items()}

    def _to_new(self, spec: SkillSpec) -> SkillSpec:
        """Re-ID *spec* under its alias, when one is mapped."""
        alias = self._reverse_map.get(spec.id)
        if not alias:
            return spec
        return SkillSpec(
            id=alias,
            name=spec.name,
            description=spec.description,
            tags=spec.tags,
            version=spec.version,
            path=spec.path,
            required_perms=spec.required_perms,
            tool_bundle=spec.tool_bundle,
            metadata=spec.metadata,
        )

    def list(self) -> list[SkillSpec]:
        return [self._to_new(spec) for spec in self._inner.list()]

    def get(self, skill_id: str) -> SkillSpec:
        source_id = self.name_map.get(skill_id, skill_id)
        try:
            spec = self._inner.get(source_id)
        except (KeyError, SkillNotFoundError):
            # Surface the caller-facing (renamed) ID, not the inner one.
            raise SkillNotFoundError(skill_id) from None
        return self._to_new(spec)

    def has(self, skill_id: str) -> bool:
        source_id = self.name_map.get(skill_id, skill_id)
        try:
            self._inner.get(source_id)
        except (KeyError, SkillNotFoundError):
            return False
        return True
+""" + +from __future__ import annotations + +import builtins + +from ..models import SkillSpec + + +class WrapperRegistry: + """A registry that wraps another and delegates all operations. + + Override only the methods you need to customize. + """ + + def __init__(self, inner) -> None: + self._inner = inner + + @property + def wrapped(self): + return self._inner + + def list(self) -> builtins.list[SkillSpec]: + return self._inner.list() + + def get(self, skill_id: str) -> SkillSpec: + return self._inner.get(skill_id) + + def has(self, skill_id: str) -> bool: + try: + self._inner.get(skill_id) + return True + except (KeyError, Exception): + return False diff --git a/agent_ext/skills/registry.py b/agent_ext/skills/registry.py index d0474bb..306fdd6 100644 --- a/agent_ext/skills/registry.py +++ b/agent_ext/skills/registry.py @@ -1,7 +1,8 @@ from __future__ import annotations + +import builtins import hashlib import os -from typing import Dict, List from .models import SkillSpec @@ -16,9 +17,10 @@ class SkillRegistry: skills//SKILL.md skills//spec.json (optional) """ - def __init__(self, roots: List[str]): + + def __init__(self, roots: builtins.list[str]): self.roots = roots - self._skills: Dict[str, SkillSpec] = {} + self._skills: dict[str, SkillSpec] = {} def discover(self) -> None: for root in self.roots: @@ -32,7 +34,7 @@ def discover(self) -> None: if not os.path.exists(md_path): continue # Minimal spec derived from folder + first heading line - with open(md_path, "r", encoding="utf-8") as f: + with open(md_path, encoding="utf-8") as f: body = f.read() first_line = next((ln.strip("# ").strip() for ln in body.splitlines() if ln.strip()), entry) spec = SkillSpec( @@ -44,7 +46,7 @@ def discover(self) -> None: ) self._skills[spec.id] = spec - def list(self) -> List[SkillSpec]: + def list(self) -> builtins.list[SkillSpec]: return list(self._skills.values()) def get(self, skill_id: str) -> SkillSpec: diff --git a/agent_ext/skills/selector.py 
b/agent_ext/skills/selector.py index 8ba3df6..91697e7 100644 --- a/agent_ext/skills/selector.py +++ b/agent_ext/skills/selector.py @@ -1,11 +1,12 @@ from __future__ import annotations -from typing import List, Sequence + +from collections.abc import Sequence from .models import SkillSpec class SkillSelection: - def __init__(self, *, include_catalog: bool, load_full: List[str]): + def __init__(self, *, include_catalog: bool, load_full: list[str]): self.include_catalog = include_catalog self.load_full = load_full @@ -16,6 +17,7 @@ class SkillSelector: - always include catalog summaries - load full bodies only for selected skill IDs """ + def select(self, intent: str, *, catalog: Sequence[SkillSpec]) -> SkillSelection: # Replace with your router’s intent logic. This is a safe baseline. if intent in {"ingest_doc", "ocr", "parse_document"}: diff --git a/agent_ext/skills/toolset.py b/agent_ext/skills/toolset.py index 9ac66cd..0666673 100644 --- a/agent_ext/skills/toolset.py +++ b/agent_ext/skills/toolset.py @@ -1,5 +1,4 @@ from __future__ import annotations -from typing import Dict, List, Optional from .models import LoadedSkill, SkillSpec @@ -8,7 +7,8 @@ class SkillContextPack: """ What you inject into the model context. 
""" - def __init__(self, *, catalog_text: str, loaded_skills: List[LoadedSkill]): + + def __init__(self, *, catalog_text: str, loaded_skills: list[LoadedSkill]): self.catalog_text = catalog_text self.loaded_skills = loaded_skills @@ -21,7 +21,7 @@ def as_instructions(self) -> str: return "".join(out) -def build_skill_catalog(catalog: List[SkillSpec]) -> str: +def build_skill_catalog(catalog: list[SkillSpec]) -> str: lines = ["You have access to the following skills (load full instructions only when needed):\n"] for s in sorted(catalog, key=lambda x: x.id): tag_str = f" [{', '.join(s.tags)}]" if s.tags else "" diff --git a/agent_ext/subagents/README.md b/agent_ext/subagents/README.md new file mode 100644 index 0000000..21372d7 --- /dev/null +++ b/agent_ext/subagents/README.md @@ -0,0 +1,45 @@ +# Subagents — Multi-Agent Orchestration + +Spawn specialized subagents that run synchronously, asynchronously, or auto-select the best mode. Includes inter-agent communication via message bus. + +## Features + +- **Static + Dynamic Registries**: Register agents at setup or create them at runtime +- **Message Bus**: In-memory async message passing with ask/answer protocol +- **Task Manager**: Background task lifecycle with soft/hard cancellation +- **Auto-Mode Selection**: Intelligent sync/async decision based on task characteristics +- **Nested Subagents**: Subagents can spawn their own subagents + +## Quick Start + +```python +from agent_ext.subagents import ( + SubagentRegistry, DynamicAgentRegistry, + InMemoryMessageBus, TaskManager, + SubAgentConfig, TaskCharacteristics, decide_execution_mode, +) + +# Static registry (simple) +reg = SubagentRegistry() +reg.register(my_agent) +result = await reg.get("my_agent").run(ctx, input="hello", meta={}) + +# Dynamic registry (runtime creation) +dyn = DynamicAgentRegistry(max_agents=10) +config = SubAgentConfig(name="researcher", description="...", instructions="...") +dyn.register(config, agent_instance) + +# Message bus +bus = 
InMemoryMessageBus() +queue = bus.register_agent("worker-1") +await bus.send(AgentMessage(type=MessageType.TASK_ASSIGNED, ...)) +response = await bus.ask("parent", "worker-1", question="help?", task_id="t1") +``` + +## Execution Modes + +| Mode | When | +|------|------| +| `sync` | Simple tasks, needs user context | +| `async` | Complex independent tasks | +| `auto` | System decides based on `TaskCharacteristics` | diff --git a/agent_ext/subagents/__init__.py b/agent_ext/subagents/__init__.py index e69de29..93ea596 100644 --- a/agent_ext/subagents/__init__.py +++ b/agent_ext/subagents/__init__.py @@ -0,0 +1,47 @@ +"""Multi-agent orchestration — registries, message bus, task manager, prompts.""" + +from .base import Subagent, SubagentResult +from .message_bus import InMemoryMessageBus, TaskManager, create_message_bus +from .orchestrator import SubagentOrchestrator +from .prompts import ( + SUBAGENT_SYSTEM_PROMPT, + TASK_TOOL_DESCRIPTION, + get_subagent_system_prompt, + get_task_instructions_prompt, +) +from .protocols import SubAgentDepsProtocol +from .registry import DynamicAgentRegistry, SubagentRegistry +from .toolset import SubAgentDeps, create_subagent_toolset +from .types import ( + AgentMessage, + CompiledSubAgent, + ExecutionMode, + MessageType, + SubAgentConfig, + TaskCharacteristics, + TaskHandle, + TaskPriority, + TaskStatus, + decide_execution_mode, +) + +__all__ = [ + "Subagent", + "SubagentResult", + "SubagentRegistry", + "DynamicAgentRegistry", + "SubagentOrchestrator", + "AgentMessage", + "CompiledSubAgent", + "ExecutionMode", + "MessageType", + "SubAgentConfig", + "TaskCharacteristics", + "TaskHandle", + "TaskPriority", + "TaskStatus", + "decide_execution_mode", + "InMemoryMessageBus", + "TaskManager", + "create_message_bus", +] diff --git a/agent_ext/subagents/base.py b/agent_ext/subagents/base.py index 3bdabad..318eb4f 100644 --- a/agent_ext/subagents/base.py +++ b/agent_ext/subagents/base.py @@ -1,15 +1,18 @@ from __future__ import annotations 
-from typing import Any, Dict, Optional, Protocol + +from typing import Any, Protocol + from pydantic import BaseModel class SubagentResult(BaseModel): ok: bool = True output: Any = None - error: Optional[str] = None - metadata: Dict[str, Any] = {} + error: str | None = None + metadata: dict[str, Any] = {} class Subagent(Protocol): name: str - async def run(self, *, input: Any, metadata: Dict[str, Any]) -> SubagentResult: ... + + async def run(self, *, input: Any, metadata: dict[str, Any]) -> SubagentResult: ... diff --git a/agent_ext/subagents/message_bus.py b/agent_ext/subagents/message_bus.py new file mode 100644 index 0000000..1106b17 --- /dev/null +++ b/agent_ext/subagents/message_bus.py @@ -0,0 +1,224 @@ +"""Message bus for inter-agent communication. + +Provides in-memory message passing between parent agents and subagents +with request-response correlation (ask/answer pattern). +""" + +from __future__ import annotations + +import asyncio +import contextlib +import uuid +from collections.abc import Awaitable, Callable +from dataclasses import dataclass, field +from datetime import datetime +from typing import Any + +from .types import AgentMessage, MessageType, TaskHandle, TaskStatus + + +@dataclass +class InMemoryMessageBus: + """In-memory message bus using asyncio queues. + + Suitable for single-process applications. For distributed systems, + swap with a Redis-based implementation. 
+ """ + + _queues: dict[str, asyncio.Queue[AgentMessage]] = field(default_factory=dict) + _pending_questions: dict[str, asyncio.Future[AgentMessage]] = field(default_factory=dict) + _handlers: list[Callable[[AgentMessage], Awaitable[None]]] = field(default_factory=list) + + # -- send / receive ----------------------------------------------------- + + async def send(self, message: AgentMessage) -> None: + """Send a message to a specific agent.""" + if message.receiver not in self._queues: + raise KeyError(f"Agent '{message.receiver}' is not registered") + await self._queues[message.receiver].put(message) + for handler in self._handlers: + with contextlib.suppress(Exception): + await handler(message) + + async def ask( + self, + sender: str, + receiver: str, + question: Any, + task_id: str, + timeout: float = 30.0, + ) -> AgentMessage: + """Send a question and wait for a response (request-response pattern).""" + if receiver not in self._queues: + raise KeyError(f"Agent '{receiver}' is not registered") + + correlation_id = uuid.uuid4().hex + loop = asyncio.get_running_loop() + response_future: asyncio.Future[AgentMessage] = loop.create_future() + self._pending_questions[correlation_id] = response_future + + try: + msg = AgentMessage( + type=MessageType.QUESTION, + sender=sender, + receiver=receiver, + payload=question, + task_id=task_id, + correlation_id=correlation_id, + ) + await self.send(msg) + return await asyncio.wait_for(response_future, timeout=timeout) + finally: + self._pending_questions.pop(correlation_id, None) + + async def answer(self, original: AgentMessage, answer_payload: Any) -> None: + """Answer a previously received question.""" + if original.sender not in self._queues: + raise KeyError(f"Agent '{original.sender}' is not registered") + + response = AgentMessage( + type=MessageType.ANSWER, + sender=original.receiver, + receiver=original.sender, + payload=answer_payload, + task_id=original.task_id, + correlation_id=original.correlation_id, + ) + + if 
original.correlation_id and original.correlation_id in self._pending_questions: + future = self._pending_questions[original.correlation_id] + if not future.done(): + future.set_result(response) + else: + await self.send(response) + + # -- registration ------------------------------------------------------- + + def register_agent(self, agent_id: str) -> asyncio.Queue[AgentMessage]: + """Register an agent to receive messages.""" + if agent_id in self._queues: + raise ValueError(f"Agent '{agent_id}' is already registered") + queue: asyncio.Queue[AgentMessage] = asyncio.Queue() + self._queues[agent_id] = queue + return queue + + def unregister_agent(self, agent_id: str) -> None: + self._queues.pop(agent_id, None) + + def is_registered(self, agent_id: str) -> bool: + return agent_id in self._queues + + def registered_agents(self) -> list[str]: + return list(self._queues.keys()) + + # -- handlers ----------------------------------------------------------- + + def add_handler(self, handler: Callable[[AgentMessage], Awaitable[None]]) -> None: + self._handlers.append(handler) + + def remove_handler(self, handler: Callable[[AgentMessage], Awaitable[None]]) -> None: + if handler in self._handlers: + self._handlers.remove(handler) + + # -- drain -------------------------------------------------------------- + + async def get_messages(self, agent_id: str, timeout: float = 0.0) -> list[AgentMessage]: + """Get pending messages for an agent (non-blocking).""" + if agent_id not in self._queues: + raise KeyError(f"Agent '{agent_id}' is not registered") + queue = self._queues[agent_id] + messages: list[AgentMessage] = [] + if timeout > 0 and queue.empty(): + try: + msg = await asyncio.wait_for(queue.get(), timeout=timeout) + messages.append(msg) + except TimeoutError: + return messages + while not queue.empty(): + try: + messages.append(queue.get_nowait()) + except asyncio.QueueEmpty: + break + return messages + + +def create_message_bus(backend: str = "memory", **kwargs: Any) -> 
def create_message_bus(backend: str = "memory", **kwargs: Any) -> InMemoryMessageBus:
    """Factory for message bus implementations.

    Args:
        backend: Backend name; only ``"memory"`` is currently supported.
        **kwargs: Reserved for backend-specific options (unused for memory).

    Raises:
        ValueError: for an unknown backend name.
    """
    if backend == "memory":
        return InMemoryMessageBus()
    raise ValueError(f"Unknown message bus backend: {backend}")


# ---------------------------------------------------------------------------
# Task manager
# ---------------------------------------------------------------------------


@dataclass
class TaskManager:
    """Manages background tasks: creation, status, soft/hard cancellation."""

    # task_id -> running asyncio task
    tasks: dict[str, asyncio.Task[Any]] = field(default_factory=dict)
    # task_id -> caller-facing handle; kept after cleanup for status queries
    handles: dict[str, TaskHandle] = field(default_factory=dict)
    message_bus: InMemoryMessageBus = field(default_factory=InMemoryMessageBus)
    # task_id -> cooperative-cancellation flag polled by the task itself
    _cancel_events: dict[str, asyncio.Event] = field(default_factory=dict)

    def create_task(
        self,
        task_id: str,
        coro: Any,
        handle: TaskHandle,
    ) -> asyncio.Task[Any]:
        """Create and track a new background task.

        Marks the handle RUNNING and records its start time.
        """
        task = asyncio.create_task(coro)
        self.tasks[task_id] = task
        self.handles[task_id] = handle
        self._cancel_events[task_id] = asyncio.Event()
        handle.status = TaskStatus.RUNNING
        handle.started_at = datetime.now()
        return task

    def get_handle(self, task_id: str) -> TaskHandle | None:
        return self.handles.get(task_id)

    def get_cancel_event(self, task_id: str) -> asyncio.Event | None:
        return self._cancel_events.get(task_id)

    async def soft_cancel(self, task_id: str) -> bool:
        """Cooperative cancellation via event flag.

        Sets the task's cancel event and, when possible, notifies the
        subagent over the message bus. The task must poll the event itself.
        """
        if task_id not in self._cancel_events:
            return False
        self._cancel_events[task_id].set()
        handle = self.handles.get(task_id)
        if handle and self.message_bus.is_registered(handle.subagent_name):
            # Best-effort notification; the flag above is the real signal.
            with contextlib.suppress(KeyError):
                await self.message_bus.send(
                    AgentMessage(
                        type=MessageType.CANCEL_REQUEST,
                        sender="task_manager",
                        receiver=handle.subagent_name,
                        payload={"reason": "soft_cancel"},
                        task_id=task_id,
                    )
                )
        return True

    async def hard_cancel(self, task_id: str) -> bool:
        """Immediately cancel a task.

        Returns False when the task is unknown or already finished, so a
        completed task's status is never overwritten with CANCELLED.
        (Bug fix: previously a finished task was still marked CANCELLED and
        True was returned.)
        """
        if task_id not in self.tasks:
            return False
        task = self.tasks[task_id]
        if task.done():
            return False
        task.cancel()
        handle = self.handles.get(task_id)
        if handle:
            handle.status = TaskStatus.CANCELLED
            handle.completed_at = datetime.now()
        return True

    def cleanup_task(self, task_id: str) -> None:
        """Drop runtime bookkeeping; the handle is kept for status queries."""
        self.tasks.pop(task_id, None)
        self._cancel_events.pop(task_id, None)

    def list_active_tasks(self) -> list[str]:
        """Return ids of tasks that have not finished yet."""
        return [tid for tid, t in self.tasks.items() if not t.done()]
returns dict keyed by subagent_name (last write wins) """ - async def _one(name: str, inp: Any, meta: Dict[str, Any]) -> tuple[str, SubagentResult]: + + async def _one(name: str, inp: Any, meta: dict[str, Any]) -> tuple[str, SubagentResult]: agent = self.registry.get(name) ctx.logger.info("subagent.start", name=name, trace_id=ctx.trace_id) try: @@ -37,7 +40,7 @@ async def _one(name: str, inp: Any, meta: Dict[str, Any]) -> tuple[str, Subagent for p in pending: p.cancel() - out: Dict[str, SubagentResult] = {} + out: dict[str, SubagentResult] = {} for d in done: k, v = await d out[k] = v diff --git a/agent_ext/subagents/prompts.py b/agent_ext/subagents/prompts.py new file mode 100644 index 0000000..883aec2 --- /dev/null +++ b/agent_ext/subagents/prompts.py @@ -0,0 +1,100 @@ +"""System prompts for subagent communication. + +Contains prompts that configure subagents and explain task delegation +to the parent agent. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .types import SubAgentConfig + +SUBAGENT_SYSTEM_PROMPT = """You are a specialized subagent working on a delegated task. + +## Your Role +You have been spawned by a parent agent to handle a specific task. Focus entirely +on completing the assigned task to the best of your ability. + +## Communication +- If you need clarification, use the `ask_parent` tool to ask the parent agent +- Keep questions specific and actionable +- Do not ask unnecessary questions - use your judgment when possible +- If you cannot complete a task, explain why clearly + +## Task Completion +- Complete the task thoroughly before returning +- Provide clear, structured results +- If the task cannot be completed, explain what was attempted and why it failed +""" + +TASK_TOOL_DESCRIPTION = """\ +Delegate a task to a specialized subagent. The subagent runs independently \ +with its own context and tools, and returns a result when done. 
+ +## When to use +- Complex multi-step tasks that can run independently +- Research or exploration tasks +- Multiple independent subtasks that can run in parallel +- Tasks that require deep focus on a single area + +## When NOT to use +- Trivial tasks you can do faster yourself +- Tasks that require your full conversation context +- Tasks that need back-and-forth with the user +""" + +CHECK_TASK_DESCRIPTION = """\ +Check the status of a background (async) task and get its result if completed.""" + +ANSWER_SUBAGENT_DESCRIPTION = """\ +Answer a question from a background subagent that is waiting for clarification.""" + +LIST_ACTIVE_TASKS_DESCRIPTION = """\ +List all currently active background tasks with their status.""" + +WAIT_TASKS_DESCRIPTION = """\ +Wait for multiple background tasks to complete before continuing.""" + +SOFT_CANCEL_TASK_DESCRIPTION = """\ +Request cooperative cancellation of a background task.""" + +HARD_CANCEL_TASK_DESCRIPTION = """\ +Immediately cancel a background task.""" + +DEFAULT_GENERAL_PURPOSE_DESCRIPTION = """A general-purpose agent for a wide variety of tasks. 
+Use this when no specialized subagent matches the task requirements.""" + + +def get_subagent_system_prompt(configs: list[SubAgentConfig], include_dual_mode: bool = True) -> str: + """Generate system prompt section describing available subagents.""" + lines = ["## Available Subagents", ""] + lines.append("Use the `task` tool to delegate work to these subagents:") + lines.append("") + for config in configs: + name = config["name"] + desc = config["description"] + lines.append(f"- **{name}**: {desc}") + if config.get("can_ask_questions") is False: + lines[-1] += " *(cannot ask clarifying questions)*" + return "\n".join(lines) + + +def get_task_instructions_prompt( + task_description: str, + can_ask_questions: bool = True, + max_questions: int | None = None, +) -> str: + """Generate task instructions for a subagent.""" + lines = ["## Your Task", "", task_description, ""] + if can_ask_questions: + lines.append("## Asking Questions") + lines.append("If you need clarification, use the `ask_parent` tool.") + if max_questions is not None: + lines.append(f"You may ask up to {max_questions} questions.") + else: + lines.append("## Note") + lines.append("Complete this task using your best judgment.") + lines.append("You cannot ask the parent for clarification.") + return "\n".join(lines) diff --git a/agent_ext/subagents/protocols.py b/agent_ext/subagents/protocols.py new file mode 100644 index 0000000..dfeef3a --- /dev/null +++ b/agent_ext/subagents/protocols.py @@ -0,0 +1,47 @@ +"""Protocols for subagent dependencies. + +Define the interface that dependencies must implement to work with +the subagent toolset. 
+""" + +from __future__ import annotations + +from typing import Protocol, runtime_checkable + +from .message_bus import InMemoryMessageBus, TaskManager +from .registry import DynamicAgentRegistry +from .types import CompiledSubAgent, SubAgentConfig + + +@runtime_checkable +class SubAgentDepsProtocol(Protocol): + """Protocol for dependencies that support subagent operations. + + Any deps object passed to a subagent-aware agent must implement + this protocol (or a subset of it via duck typing). + """ + + @property + def subagent_configs(self) -> list[SubAgentConfig]: + """List of available subagent configurations.""" + ... + + @property + def compiled_agents(self) -> dict[str, CompiledSubAgent]: + """Pre-compiled subagent instances.""" + ... + + @property + def message_bus(self) -> InMemoryMessageBus: + """Message bus for inter-agent communication.""" + ... + + @property + def task_manager(self) -> TaskManager: + """Task manager for background task lifecycle.""" + ... + + @property + def dynamic_registry(self) -> DynamicAgentRegistry | None: + """Optional dynamic agent registry for runtime agent creation.""" + ... diff --git a/agent_ext/subagents/registry.py b/agent_ext/subagents/registry.py index 93285b1..27505b4 100644 --- a/agent_ext/subagents/registry.py +++ b/agent_ext/subagents/registry.py @@ -1,18 +1,138 @@ +"""Subagent registries — static and dynamic. + +``SubagentRegistry`` is the simple name→agent map (existing, preserved). +``DynamicAgentRegistry`` adds runtime creation limits, compiled agents, and summaries. 
+""" + from __future__ import annotations -from typing import Dict -from .base import Subagent +import builtins +from dataclasses import dataclass, field +from typing import Any, Protocol + +from .types import CompiledSubAgent, SubAgentConfig + +# --------------------------------------------------------------------------- +# Protocol (what a subagent must look like) +# --------------------------------------------------------------------------- + + +class SubagentProtocol(Protocol): + """Minimal subagent interface.""" + + name: str + + async def run(self, ctx: Any, *, input: Any, meta: dict[str, Any]) -> Any: ... + + +# --------------------------------------------------------------------------- +# Static registry (backward-compat) +# --------------------------------------------------------------------------- class SubagentRegistry: - def __init__(self): - self._agents: Dict[str, Subagent] = {} + """Simple static registry: name → subagent. + + This is the original registry used by the workbench. + """ + + def __init__(self) -> None: + self._agents: dict[str, Any] = {} - def register(self, agent: Subagent) -> None: + def register(self, agent: Any) -> None: self._agents[agent.name] = agent - def get(self, name: str) -> Subagent: + def get(self, name: str) -> Any: + if name not in self._agents: + raise KeyError(f"Unknown subagent: {name}") return self._agents[name] - def list(self) -> list[str]: + def list(self) -> builtins.list[str]: return sorted(self._agents.keys()) + + def exists(self, name: str) -> bool: + return name in self._agents + + def count(self) -> int: + return len(self._agents) + + +# --------------------------------------------------------------------------- +# Dynamic registry (parity with subagents-pydantic-ai) +# --------------------------------------------------------------------------- + + +@dataclass +class DynamicAgentRegistry: + """Registry for dynamically created agents with limits and compiled agents. 
+ + Supports runtime agent creation, removal, and introspection. + """ + + agents: dict[str, Any] = field(default_factory=dict) + configs: dict[str, SubAgentConfig] = field(default_factory=dict) + _compiled: dict[str, CompiledSubAgent] = field(default_factory=dict) + max_agents: int | None = None + + def register(self, config: SubAgentConfig, agent: Any) -> None: + name = config["name"] + if name in self.agents: + raise ValueError(f"Agent '{name}' already exists") + if self.max_agents and len(self.agents) >= self.max_agents: + raise ValueError( + f"Maximum number of agents ({self.max_agents}) reached. Remove an agent before creating a new one." + ) + self.agents[name] = agent + self.configs[name] = config + self._compiled[name] = CompiledSubAgent( + name=name, + description=config["description"], + agent=agent, + config=config, + ) + + def get(self, name: str) -> Any | None: + return self.agents.get(name) + + def get_config(self, name: str) -> SubAgentConfig | None: + return self.configs.get(name) + + def get_compiled(self, name: str) -> CompiledSubAgent | None: + return self._compiled.get(name) + + def remove(self, name: str) -> bool: + if name not in self.agents: + return False + del self.agents[name] + del self.configs[name] + del self._compiled[name] + return True + + def list_agents(self) -> list[str]: + return list(self.agents.keys()) + + def list_configs(self) -> list[SubAgentConfig]: + return list(self.configs.values()) + + def list_compiled(self) -> list[CompiledSubAgent]: + return list(self._compiled.values()) + + def exists(self, name: str) -> bool: + return name in self.agents + + def count(self) -> int: + return len(self.agents) + + def clear(self) -> None: + self.agents.clear() + self.configs.clear() + self._compiled.clear() + + def get_summary(self) -> str: + if not self.agents: + return "No dynamically created agents." 
+ lines = [f"Dynamic Agents ({len(self.agents)}):"] + for name, config in self.configs.items(): + model = config.get("model", "default") + lines.append(f"- {name} [{model}]: {config['description']}") + return "\n".join(lines) diff --git a/agent_ext/subagents/toolset.py b/agent_ext/subagents/toolset.py new file mode 100644 index 0000000..5b33ad4 --- /dev/null +++ b/agent_ext/subagents/toolset.py @@ -0,0 +1,196 @@ +"""Subagent toolset — delegate tasks to specialized subagents via tool calls. + +Supports sync (blocking), async (background), and auto execution modes. + +Example:: + + from pydantic_ai import Agent + from agent_ext.subagents import create_subagent_toolset, SubAgentDeps + + toolset = create_subagent_toolset(configs=[...]) + agent = Agent("openai:gpt-4o", toolsets=[toolset]) +""" + +from __future__ import annotations + +import asyncio +import uuid +from datetime import datetime +from typing import Annotated, Any + +from pydantic import BaseModel, ConfigDict, SkipValidation +from pydantic_ai import Agent, RunContext +from pydantic_ai.toolsets import FunctionToolset + +from .prompts import ( + CHECK_TASK_DESCRIPTION, + LIST_ACTIVE_TASKS_DESCRIPTION, + SOFT_CANCEL_TASK_DESCRIPTION, + TASK_TOOL_DESCRIPTION, + get_task_instructions_prompt, +) +from .types import ( + CompiledSubAgent, + SubAgentConfig, + TaskCharacteristics, + TaskHandle, + TaskStatus, +) + + +class SubAgentDeps(BaseModel): + """Dependencies for the subagent toolset.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + configs: list[Annotated[Any, SkipValidation]] = [] + compiled_agents: dict[str, Annotated[Any, SkipValidation]] = {} + message_bus: Annotated[Any, SkipValidation] = None + task_manager: Annotated[Any, SkipValidation] = None + default_model: str = "openai:gpt-4o" + + +def _compile_subagent(config: SubAgentConfig, default_model: str) -> CompiledSubAgent: + """Compile a subagent config into a ready-to-use agent.""" + model = config.get("model", default_model) + 
agent_kwargs = config.get("agent_kwargs", {}) + agent: Agent[Any, str] = Agent( + model, + system_prompt=config["instructions"], + **agent_kwargs, + ) + return CompiledSubAgent( + name=config["name"], + description=config["description"], + agent=agent, + config=config, + ) + + +def create_subagent_toolset( + configs: list[SubAgentConfig] | None = None, + *, + default_model: str = "openai:gpt-4o", + toolset_id: str | None = None, +) -> FunctionToolset[SubAgentDeps]: + """Create a subagent toolset for task delegation. + + Args: + configs: List of subagent configurations. + default_model: Default model for subagents. + toolset_id: Optional toolset ID. + + Returns: + FunctionToolset with task, check_task, list_tasks, cancel_task tools. + """ + toolset: FunctionToolset[SubAgentDeps] = FunctionToolset(id=toolset_id) + configs = configs or [] + + # Pre-compile agents + _compiled: dict[str, CompiledSubAgent] = {} + for cfg in configs: + compiled = _compile_subagent(cfg, default_model) + _compiled[cfg["name"]] = compiled + + @toolset.tool(description=TASK_TOOL_DESCRIPTION) + async def task( + ctx: RunContext[SubAgentDeps], + subagent_type: str, + description: str, + mode: str = "sync", + ) -> str: + """Delegate a task to a subagent. + + Args: + subagent_type: Name of the subagent to use. + description: Task description with all necessary context. + mode: Execution mode: "sync", "async", or "auto". + """ + compiled = _compiled.get(subagent_type) or (ctx.deps.compiled_agents.get(subagent_type)) + if not compiled or not compiled.agent: + available = list(_compiled.keys()) + return f"Error: Unknown subagent '{subagent_type}'. 
Available: {available}" + + instructions = get_task_instructions_prompt( + description, + can_ask_questions=compiled.config.get("can_ask_questions", False), + max_questions=compiled.config.get("max_questions"), + ) + + if mode == "sync" or (mode == "auto" and not TaskCharacteristics().can_run_independently): + # Synchronous execution + try: + result = await compiled.agent.run(instructions) + output = getattr(result, "output", None) or str(result) + return f"[{subagent_type}] {output}" + except Exception as e: + return f"[{subagent_type}] Error: {e!s}" + else: + # Async execution — return task handle + task_id = f"task_{uuid.uuid4().hex[:8]}" + handle = TaskHandle( + task_id=task_id, + subagent_name=subagent_type, + description=description[:200], + ) + + async def _run_async(): + try: + result = await compiled.agent.run(instructions) + output = getattr(result, "output", None) or str(result) + handle.result = str(output) + handle.status = TaskStatus.COMPLETED + except Exception as e: + handle.error = str(e) + handle.status = TaskStatus.FAILED + finally: + handle.completed_at = datetime.now() + + if ctx.deps.task_manager: + ctx.deps.task_manager.create_task(task_id, _run_async(), handle) + else: + asyncio.create_task(_run_async()) + + return f"Task '{task_id}' started in background. Use check_task('{task_id}') to get results." + + @toolset.tool(description=CHECK_TASK_DESCRIPTION) + async def check_task(ctx: RunContext[SubAgentDeps], task_id: str) -> str: + """Check status of a background task.""" + if not ctx.deps.task_manager: + return "Error: Task manager not available." + handle = ctx.deps.task_manager.get_handle(task_id) + if not handle: + return f"Error: Task '{task_id}' not found." 
+ status = f"Task: {task_id}\nAgent: {handle.subagent_name}\nStatus: {handle.status.value}" + if handle.result: + status += f"\nResult: {handle.result}" + if handle.error: + status += f"\nError: {handle.error}" + return status + + @toolset.tool(description=LIST_ACTIVE_TASKS_DESCRIPTION) + async def list_active_tasks(ctx: RunContext[SubAgentDeps]) -> str: + """List all active background tasks.""" + if not ctx.deps.task_manager: + return "No task manager available." + active = ctx.deps.task_manager.list_active_tasks() + if not active: + return "No active tasks." + lines = [] + for tid in active: + handle = ctx.deps.task_manager.get_handle(tid) + if handle: + lines.append(f"- {tid}: {handle.subagent_name} ({handle.status.value})") + return "\n".join(lines) if lines else "No active tasks." + + @toolset.tool(description=SOFT_CANCEL_TASK_DESCRIPTION) + async def cancel_task(ctx: RunContext[SubAgentDeps], task_id: str) -> str: + """Cancel a background task.""" + if not ctx.deps.task_manager: + return "Error: Task manager not available." + success = await ctx.deps.task_manager.hard_cancel(task_id) + if success: + return f"Task '{task_id}' cancelled." + return f"Error: Task '{task_id}' not found or already completed." + + return toolset diff --git a/agent_ext/subagents/types.py b/agent_ext/subagents/types.py new file mode 100644 index 0000000..b37b495 --- /dev/null +++ b/agent_ext/subagents/types.py @@ -0,0 +1,184 @@ +"""Type definitions for the subagent system. + +Covers messages, task handles, execution modes, and configuration. 
+""" + +from __future__ import annotations + +import uuid +from dataclasses import dataclass, field +from datetime import datetime +from enum import Enum, StrEnum +from typing import Any, Literal, NotRequired + +from typing_extensions import TypedDict + +# --------------------------------------------------------------------------- +# Message types +# --------------------------------------------------------------------------- + + +class MessageType(StrEnum): + """Types of messages between agents.""" + + TASK_ASSIGNED = "task_assigned" + TASK_UPDATE = "task_update" + TASK_COMPLETED = "task_completed" + TASK_FAILED = "task_failed" + QUESTION = "question" + ANSWER = "answer" + CANCEL_REQUEST = "cancel_request" + CANCEL_FORCED = "cancel_forced" + + +class TaskStatus(StrEnum): + """Status of a background task.""" + + PENDING = "pending" + RUNNING = "running" + WAITING_FOR_ANSWER = "waiting_for_answer" + COMPLETED = "completed" + FAILED = "failed" + CANCELLED = "cancelled" + + +class TaskPriority(StrEnum): + """Priority levels for background tasks.""" + + LOW = "low" + NORMAL = "normal" + HIGH = "high" + CRITICAL = "critical" + + +ExecutionMode = Literal["sync", "async", "auto"] + + +# --------------------------------------------------------------------------- +# Task characteristics (for auto-mode selection) +# --------------------------------------------------------------------------- + + +@dataclass +class TaskCharacteristics: + """Characteristics that help decide sync vs async execution. + + Used by ``decide_execution_mode`` to auto-select. 
+ """ + + estimated_complexity: Literal["simple", "moderate", "complex"] = "moderate" + requires_user_context: bool = False + is_time_sensitive: bool = False + can_run_independently: bool = True + may_need_clarification: bool = False + + +def decide_execution_mode( + characteristics: TaskCharacteristics, + config: SubAgentConfig, + force_mode: ExecutionMode | None = None, +) -> Literal["sync", "async"]: + """Decide whether to run sync or async based on task characteristics.""" + if force_mode and force_mode != "auto": + return force_mode + + config_pref = config.get("preferred_mode", "auto") + if config_pref != "auto": + return config_pref # type: ignore[return-value] + + if characteristics.requires_user_context: + return "sync" + if characteristics.may_need_clarification and characteristics.is_time_sensitive: + return "sync" + if characteristics.estimated_complexity == "complex" and characteristics.can_run_independently: + return "async" + if characteristics.estimated_complexity == "simple": + return "sync" + if characteristics.can_run_independently: + return "async" + return "sync" + + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + + +class SubAgentConfig(TypedDict, total=False): + """Configuration for a subagent. + + Required: name, description, instructions. + Optional: model, toolsets, execution preferences, etc. 
+ """ + + name: str + description: str + instructions: str + model: NotRequired[str] + can_ask_questions: NotRequired[bool] + max_questions: NotRequired[int] + preferred_mode: NotRequired[ExecutionMode] + typical_complexity: NotRequired[Literal["simple", "moderate", "complex"]] + typically_needs_context: NotRequired[bool] + toolsets: NotRequired[list[Any]] + agent_kwargs: NotRequired[dict[str, Any]] + context_files: NotRequired[list[str]] + extra: NotRequired[dict[str, Any]] + + +# --------------------------------------------------------------------------- +# Messages +# --------------------------------------------------------------------------- + + +def _generate_message_id() -> str: + return uuid.uuid4().hex + + +@dataclass +class AgentMessage: + """Message passed between agents via the message bus.""" + + type: MessageType + sender: str + receiver: str + payload: Any + task_id: str + id: str = field(default_factory=_generate_message_id) + timestamp: datetime = field(default_factory=datetime.now) + correlation_id: str | None = None + + +# --------------------------------------------------------------------------- +# Task handle +# --------------------------------------------------------------------------- + + +@dataclass +class TaskHandle: + """Handle for managing a background task. + + Returned when a task is started in async mode. 
+ """ + + task_id: str + subagent_name: str + description: str + status: TaskStatus = TaskStatus.PENDING + priority: TaskPriority = TaskPriority.NORMAL + created_at: datetime = field(default_factory=datetime.now) + started_at: datetime | None = None + completed_at: datetime | None = None + result: str | None = None + error: str | None = None + pending_question: str | None = None + + +@dataclass +class CompiledSubAgent: + """A pre-compiled subagent ready for use.""" + + name: str + description: str + config: SubAgentConfig + agent: object | None = None diff --git a/agent_ext/todo/__init__.py b/agent_ext/todo/__init__.py index fadfa44..db7cf41 100644 --- a/agent_ext/todo/__init__.py +++ b/agent_ext/todo/__init__.py @@ -1,6 +1,6 @@ +from .events import InProcessEventBus, TaskEvent, TaskEventBus, WebhookEventBus from .models import Task, TaskCreate, TaskPatch, TaskQuery, TaskStatus from .store_base import TaskStore from .store_memory import InMemoryTaskStore from .store_postgres import PostgresTaskStore -from .events import TaskEvent, TaskEventBus, InProcessEventBus, WebhookEventBus from .toolset import TodoToolset diff --git a/agent_ext/todo/events.py b/agent_ext/todo/events.py index 9806f32..be22b4c 100644 --- a/agent_ext/todo/events.py +++ b/agent_ext/todo/events.py @@ -1,14 +1,14 @@ from __future__ import annotations import asyncio +from collections.abc import Awaitable, Callable from dataclasses import dataclass -from typing import Any, Awaitable, Callable, Dict, List, Optional, Protocol +from typing import Any, Protocol import httpx from agent_ext.todo.models import Task - TaskEventName = str # e.g., "task_created", "task_updated", "task_completed" @@ -16,7 +16,7 @@ class TaskEvent: name: TaskEventName task: Task - payload: Dict[str, Any] + payload: dict[str, Any] class TaskEventBus(Protocol): @@ -25,7 +25,7 @@ async def emit(self, event: TaskEvent) -> None: ... 
class InProcessEventBus: def __init__(self) -> None: - self._handlers: Dict[TaskEventName, List[Callable[[TaskEvent], Awaitable[None]]]] = {} + self._handlers: dict[TaskEventName, list[Callable[[TaskEvent], Awaitable[None]]]] = {} def on(self, name: TaskEventName, handler: Callable[[TaskEvent], Awaitable[None]]) -> None: self._handlers.setdefault(name, []).append(handler) @@ -40,7 +40,8 @@ class WebhookEventBus: """ Sends task events to one or more webhook URLs. """ - def __init__(self, urls: List[str], *, timeout_s: float = 10.0, headers: Optional[Dict[str, str]] = None) -> None: + + def __init__(self, urls: list[str], *, timeout_s: float = 10.0, headers: dict[str, str] | None = None) -> None: self.urls = urls self.timeout_s = timeout_s self.headers = headers or {} diff --git a/agent_ext/todo/models.py b/agent_ext/todo/models.py index d120a5d..c5b0688 100644 --- a/agent_ext/todo/models.py +++ b/agent_ext/todo/models.py @@ -1,15 +1,15 @@ from __future__ import annotations -from datetime import datetime, timezone -from typing import Any, Dict, List, Literal, Optional -from pydantic import BaseModel, Field +from datetime import UTC, datetime +from typing import Any, Literal +from pydantic import BaseModel, Field TaskStatus = Literal["pending", "in_progress", "blocked", "done", "canceled", "failed"] def now_utc() -> datetime: - return datetime.now(timezone.utc) + return datetime.now(UTC) class Task(BaseModel): @@ -19,28 +19,29 @@ class Task(BaseModel): - supports dependencies (depends_on) - supports multi-tenant scoping (case_id/session_id/user_id) """ + id: str title: str - description: Optional[str] = None + description: str | None = None status: TaskStatus = "pending" priority: int = 50 - parent_id: Optional[str] = None - depends_on: List[str] = Field(default_factory=list) - tags: List[str] = Field(default_factory=list) + parent_id: str | None = None + depends_on: list[str] = Field(default_factory=list) + tags: list[str] = Field(default_factory=list) # Multi-tenant 
/ scoping - case_id: Optional[str] = None - session_id: Optional[str] = None - user_id: Optional[str] = None + case_id: str | None = None + session_id: str | None = None + user_id: str | None = None # Links to your audit/evidence world - artifact_ids: List[str] = Field(default_factory=list) - evidence_ids: List[str] = Field(default_factory=list) + artifact_ids: list[str] = Field(default_factory=list) + evidence_ids: list[str] = Field(default_factory=list) # Generic metadata for planner/router/judge notes - meta: Dict[str, Any] = Field(default_factory=dict) + meta: dict[str, Any] = Field(default_factory=dict) created_at: datetime = Field(default_factory=now_utc) updated_at: datetime = Field(default_factory=now_utc) @@ -48,45 +49,46 @@ class Task(BaseModel): class TaskCreate(BaseModel): title: str - description: Optional[str] = None + description: str | None = None priority: int = 50 - parent_id: Optional[str] = None - depends_on: List[str] = Field(default_factory=list) - tags: List[str] = Field(default_factory=list) - meta: Dict[str, Any] = Field(default_factory=dict) + parent_id: str | None = None + depends_on: list[str] = Field(default_factory=list) + tags: list[str] = Field(default_factory=list) + meta: dict[str, Any] = Field(default_factory=dict) - case_id: Optional[str] = None - session_id: Optional[str] = None - user_id: Optional[str] = None + case_id: str | None = None + session_id: str | None = None + user_id: str | None = None class TaskPatch(BaseModel): - title: Optional[str] = None - description: Optional[str] = None - status: Optional[TaskStatus] = None - priority: Optional[int] = None - parent_id: Optional[str] = None + title: str | None = None + description: str | None = None + status: TaskStatus | None = None + priority: int | None = None + parent_id: str | None = None - depends_on: Optional[List[str]] = None - tags: Optional[List[str]] = None + depends_on: list[str] | None = None + tags: list[str] | None = None - artifact_ids: Optional[List[str]] = 
None - evidence_ids: Optional[List[str]] = None - meta: Optional[Dict[str, Any]] = None + artifact_ids: list[str] | None = None + evidence_ids: list[str] | None = None + meta: dict[str, Any] | None = None class TaskQuery(BaseModel): """ Filter used by list/search. """ - case_id: Optional[str] = None - session_id: Optional[str] = None - user_id: Optional[str] = None - status: Optional[TaskStatus] = None - parent_id: Optional[str] = None - tag: Optional[str] = None + case_id: str | None = None + session_id: str | None = None + user_id: str | None = None + + status: TaskStatus | None = None + parent_id: str | None = None + tag: str | None = None - text: Optional[str] = None # simple substring match for title/description + text: str | None = None # simple substring match for title/description limit: int = 200 offset: int = 0 diff --git a/agent_ext/todo/pai_toolset.py b/agent_ext/todo/pai_toolset.py new file mode 100644 index 0000000..5504b20 --- /dev/null +++ b/agent_ext/todo/pai_toolset.py @@ -0,0 +1,149 @@ +"""Todo FunctionToolset factory for pydantic-ai agents. 
+ +Example:: + + from pydantic_ai import Agent + from agent_ext.todo import create_todo_toolset, TodoDeps, InMemoryTaskStore + + store = InMemoryTaskStore() + toolset = create_todo_toolset() + agent = Agent("openai:gpt-4o", toolsets=[toolset]) + + deps = TodoDeps(store=store, case_id="case-1") + result = await agent.run("Create a task to review the PR", deps=deps) +""" + +from __future__ import annotations + +from typing import Annotated, Any + +from pydantic import BaseModel, ConfigDict, SkipValidation +from pydantic_ai import RunContext +from pydantic_ai.toolsets import FunctionToolset + +from .models import TaskCreate, TaskPatch, TaskQuery + +TODO_SYSTEM_PROMPT = """ +## Todo Tools + +You can manage tasks using the following tools: +* `create_task` — create a new task +* `list_tasks` — list tasks with optional filtering +* `update_task` — update a task's status or details +* `complete_task` — mark a task as done +""" + + +class TodoDeps(BaseModel): + """Dependencies for the todo toolset.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + store: Annotated[Any, SkipValidation] # TaskStore + case_id: str | None = None + session_id: str | None = None + user_id: str | None = None + + +def create_todo_toolset(*, toolset_id: str | None = None) -> FunctionToolset[TodoDeps]: + """Create a todo toolset for AI agents. + + Returns: + FunctionToolset with create_task, list_tasks, update_task, complete_task. + """ + toolset: FunctionToolset[TodoDeps] = FunctionToolset(id=toolset_id) + + @toolset.tool + async def create_task( + ctx: RunContext[TodoDeps], + title: str, + description: str = "", + priority: int = 50, + tags: str = "", + ) -> str: + """Create a new task. + + Args: + title: Task title. + description: Optional description. + priority: Priority (0-100, lower = higher priority). + tags: Comma-separated tags. 
+ """ + data = TaskCreate( + title=title, + description=description or None, + priority=priority, + tags=[t.strip() for t in tags.split(",") if t.strip()], + case_id=ctx.deps.case_id, + session_id=ctx.deps.session_id, + user_id=ctx.deps.user_id, + ) + task = await ctx.deps.store.create_task(data) + return f"Created task '{task.title}' (id: {task.id})" + + @toolset.tool + async def list_tasks( + ctx: RunContext[TodoDeps], + status: str | None = None, + limit: int = 20, + ) -> str: + """List tasks with optional status filter. + + Args: + status: Filter by status (pending, in_progress, done, blocked, failed). + limit: Max results. + """ + q = TaskQuery( + case_id=ctx.deps.case_id, + session_id=ctx.deps.session_id, + user_id=ctx.deps.user_id, + status=status if status in ("pending", "in_progress", "done", "blocked", "canceled", "failed") else None, + limit=limit, + ) + tasks = await ctx.deps.store.list_tasks(q) + if not tasks: + return "No tasks found." + lines = [] + for t in tasks: + lines.append(f"[{t.id[:8]}] {t.title} (status={t.status}, priority={t.priority})") + return "\n".join(lines) + + @toolset.tool + async def update_task( + ctx: RunContext[TodoDeps], + task_id: str, + status: str | None = None, + title: str | None = None, + description: str | None = None, + ) -> str: + """Update a task's status or details. + + Args: + task_id: The task ID. + status: New status. + title: New title. + description: New description. + """ + patch = TaskPatch( + status=status if status else None, # type: ignore[arg-type] + title=title, + description=description, + ) + task = await ctx.deps.store.update_task(task_id, patch) + if not task: + return f"Task '{task_id}' not found." + return f"Updated task '{task.title}' to status={task.status}" + + @toolset.tool + async def complete_task(ctx: RunContext[TodoDeps], task_id: str) -> str: + """Mark a task as done. + + Args: + task_id: The task ID to complete. 
+ """ + task = await ctx.deps.store.update_task(task_id, TaskPatch(status="done")) + if not task: + return f"Task '{task_id}' not found." + return f"Completed task '{task.title}'" + + return toolset diff --git a/agent_ext/todo/store_base.py b/agent_ext/todo/store_base.py index 68a090a..c2d7c56 100644 --- a/agent_ext/todo/store_base.py +++ b/agent_ext/todo/store_base.py @@ -1,19 +1,21 @@ from __future__ import annotations -from typing import List, Optional, Protocol +from typing import Protocol + from agent_ext.todo.models import Task, TaskCreate, TaskPatch, TaskQuery + class TaskStore(Protocol): async def create_task(self, data: TaskCreate) -> Task: ... - async def get_task(self, task_id: str) -> Optional[Task]: ... - async def list_tasks(self, q: TaskQuery) -> List[Task]: ... - async def update_task(self, task_id: str, patch: TaskPatch) -> Optional[Task]: ... + async def get_task(self, task_id: str) -> Task | None: ... + async def list_tasks(self, q: TaskQuery) -> list[Task]: ... + async def update_task(self, task_id: str, patch: TaskPatch) -> Task | None: ... async def delete_task(self, task_id: str) -> bool: ... - async def add_dependency(self, task_id: str, depends_on_task_id: str) -> Optional[Task]: ... + async def add_dependency(self, task_id: str, depends_on_task_id: str) -> Task | None: ... async def add_subtask(self, parent_id: str, data: TaskCreate) -> Task: ... - async def next_runnable_tasks(self, q: TaskQuery) -> List[Task]: ... + async def next_runnable_tasks(self, q: TaskQuery) -> list[Task]: ... async def refresh_blocked_status(self, q: TaskQuery) -> int: ... - async def get_task_tree(self, root_task_id: str, include_rollup: bool = False) -> Optional[dict]: ... + async def get_task_tree(self, root_task_id: str, include_rollup: bool = False) -> dict | None: ... 
diff --git a/agent_ext/todo/store_memory.py b/agent_ext/todo/store_memory.py index fd1bfc1..b1d96c7 100644 --- a/agent_ext/todo/store_memory.py +++ b/agent_ext/todo/store_memory.py @@ -1,14 +1,13 @@ from __future__ import annotations import uuid -from typing import Dict, List, Optional from agent_ext.todo.models import Task, TaskCreate, TaskPatch, TaskQuery, now_utc class InMemoryTaskStore: def __init__(self) -> None: - self._tasks: Dict[str, Task] = {} + self._tasks: dict[str, Task] = {} async def create_task(self, data: TaskCreate) -> Task: tid = uuid.uuid4().hex @@ -28,10 +27,10 @@ async def create_task(self, data: TaskCreate) -> Task: self._tasks[tid] = t return t - async def get_task(self, task_id: str) -> Optional[Task]: + async def get_task(self, task_id: str) -> Task | None: return self._tasks.get(task_id) - async def list_tasks(self, q: TaskQuery) -> List[Task]: + async def list_tasks(self, q: TaskQuery) -> list[Task]: items = list(self._tasks.values()) def match(t: Task) -> bool: @@ -57,7 +56,7 @@ def match(t: Task) -> bool: filtered.sort(key=lambda x: (x.priority, x.created_at)) return filtered[q.offset : q.offset + q.limit] - async def update_task(self, task_id: str, patch: TaskPatch) -> Optional[Task]: + async def update_task(self, task_id: str, patch: TaskPatch) -> Task | None: t = self._tasks.get(task_id) if not t: return None @@ -80,7 +79,7 @@ async def update_task(self, task_id: str, patch: TaskPatch) -> Optional[Task]: async def delete_task(self, task_id: str) -> bool: return self._tasks.pop(task_id, None) is not None - async def add_dependency(self, task_id: str, depends_on_task_id: str) -> Optional[Task]: + async def add_dependency(self, task_id: str, depends_on_task_id: str) -> Task | None: t = self._tasks.get(task_id) if not t: return None @@ -101,7 +100,8 @@ async def add_subtask(self, parent_id: str, data: TaskCreate) -> Task: user_id=data.user_id or parent.user_id, ) return await self.create_task(merged) - async def 
next_runnable_tasks(self, q: TaskQuery) -> List[Task]: + + async def next_runnable_tasks(self, q: TaskQuery) -> list[Task]: """ Runnable = status in {pending, in_progress} AND all dependencies are done. (in_progress included so you can resume partially-run tasks) @@ -158,7 +158,7 @@ def deps_done(t: Task) -> bool: return updated - async def get_task_tree(self, root_task_id: str, include_rollup: bool = False) -> Optional[dict]: + async def get_task_tree(self, root_task_id: str, include_rollup: bool = False) -> dict | None: root = await self.get_task(root_task_id) if not root: return None @@ -192,10 +192,13 @@ def build(node: Task) -> dict: is_runnable = (node.status in {"pending", "in_progress"}) and not blocked_by # subtree stats - totals = {"total": 1, "done": 1 if node.status == "done" else 0, - "blocked": 1 if (node.status == "blocked" or blocked_by) else 0, - "failed": 1 if node.status == "failed" else 0, - "open": 0 if is_terminal else 1} + totals = { + "total": 1, + "done": 1 if node.status == "done" else 0, + "blocked": 1 if (node.status == "blocked" or blocked_by) else 0, + "failed": 1 if node.status == "failed" else 0, + "open": 0 if is_terminal else 1, + } for ch in out["children"]: r = ch.get("rollup") or {} diff --git a/agent_ext/todo/store_postgres.py b/agent_ext/todo/store_postgres.py index 3f1bc02..4365bf7 100644 --- a/agent_ext/todo/store_postgres.py +++ b/agent_ext/todo/store_postgres.py @@ -1,13 +1,11 @@ from __future__ import annotations import uuid -from typing import List, Optional import asyncpg from agent_ext.todo.models import Task, TaskCreate, TaskPatch, TaskQuery, now_utc - CREATE_SQL = """ CREATE TABLE IF NOT EXISTS agent_tasks ( id TEXT PRIMARY KEY, @@ -67,7 +65,7 @@ def __init__(self, pool: asyncpg.Pool) -> None: self.pool = pool @classmethod - async def connect(cls, dsn: str) -> "PostgresTaskStore": + async def connect(cls, dsn: str) -> PostgresTaskStore: pool = await asyncpg.create_pool(dsn) async with pool.acquire() as conn: await 
conn.execute(CREATE_SQL) @@ -108,12 +106,12 @@ async def create_task(self, data: TaskCreate) -> Task: row = await conn.fetchrow("SELECT * FROM agent_tasks WHERE id=$1", tid) return _row_to_task(row) - async def get_task(self, task_id: str) -> Optional[Task]: + async def get_task(self, task_id: str) -> Task | None: async with self.pool.acquire() as conn: row = await conn.fetchrow("SELECT * FROM agent_tasks WHERE id=$1", task_id) return _row_to_task(row) if row else None - async def list_tasks(self, q: TaskQuery) -> List[Task]: + async def list_tasks(self, q: TaskQuery) -> list[Task]: clauses = [] args = [] i = 1 @@ -154,7 +152,7 @@ def add(cond: str, val): rows = await conn.fetch(sql, *args) return [_row_to_task(r) for r in rows] - async def update_task(self, task_id: str, patch: TaskPatch) -> Optional[Task]: + async def update_task(self, task_id: str, patch: TaskPatch) -> Task | None: existing = await self.get_task(task_id) if not existing: return None @@ -205,7 +203,7 @@ async def delete_task(self, task_id: str) -> bool: # asyncpg returns "DELETE " return r.split()[-1] != "0" - async def add_dependency(self, task_id: str, depends_on_task_id: str) -> Optional[Task]: + async def add_dependency(self, task_id: str, depends_on_task_id: str) -> Task | None: t = await self.get_task(task_id) if not t: return None @@ -226,7 +224,7 @@ async def add_subtask(self, parent_id: str, data: TaskCreate) -> Task: ) return await self.create_task(merged) - async def next_runnable_tasks(self, q: TaskQuery) -> List[Task]: + async def next_runnable_tasks(self, q: TaskQuery) -> list[Task]: """ Postgres-side filter for runnable tasks: - within tenant scope filters @@ -363,7 +361,7 @@ def _count(r: str) -> int: return _count(r1) + _count(r2) - async def get_task_tree(self, root_task_id: str, include_rollup: bool = False) -> Optional[dict]: + async def get_task_tree(self, root_task_id: str, include_rollup: bool = False) -> dict | None: async with self.pool.acquire() as conn: rows = await 
conn.fetch( """ @@ -410,10 +408,13 @@ def build(task_id: str) -> dict: is_terminal = node.status in {"done", "canceled", "failed"} is_runnable = (node.status in {"pending", "in_progress"}) and not blocked_by - totals = {"total": 1, "done": 1 if node.status == "done" else 0, - "blocked": 1 if (node.status == "blocked" or blocked_by) else 0, - "failed": 1 if node.status == "failed" else 0, - "open": 0 if is_terminal else 1} + totals = { + "total": 1, + "done": 1 if node.status == "done" else 0, + "blocked": 1 if (node.status == "blocked" or blocked_by) else 0, + "failed": 1 if node.status == "failed" else 0, + "open": 0 if is_terminal else 1, + } for ch in out["children"]: r = ch.get("rollup") or {} diff --git a/agent_ext/todo/toolset.py b/agent_ext/todo/toolset.py index 07770c5..e209ec4 100644 --- a/agent_ext/todo/toolset.py +++ b/agent_ext/todo/toolset.py @@ -1,17 +1,18 @@ from __future__ import annotations -from typing import Any, Dict, List, Optional +from typing import Any +from agent_ext.todo.events import TaskEvent, TaskEventBus from agent_ext.todo.models import Task, TaskCreate, TaskPatch, TaskQuery from agent_ext.todo.store_base import TaskStore -from agent_ext.todo.events import TaskEvent, TaskEventBus class TodoToolset: """ Provide CRUD + dependency/subtask helpers. 
""" - def __init__(self, store: TaskStore, *, events: Optional[TaskEventBus] = None) -> None: + + def __init__(self, store: TaskStore, *, events: TaskEventBus | None = None) -> None: self.store = store self.events = events @@ -21,16 +22,16 @@ async def create_task(self, data: TaskCreate) -> Task: await self.events.emit(TaskEvent(name="task_created", task=t, payload={})) return t - async def get_task(self, task_id: str) -> Optional[Task]: + async def get_task(self, task_id: str) -> Task | None: return await self.store.get_task(task_id) - async def list_tasks(self, q: TaskQuery) -> List[Task]: + async def list_tasks(self, q: TaskQuery) -> list[Task]: return await self.store.list_tasks(q) - async def update_task(self, task_id: str, patch: TaskPatch) -> Optional[Task]: + async def update_task(self, task_id: str, patch: TaskPatch) -> Task | None: t = await self.store.update_task(task_id, patch) if t and self.events: - payload: Dict[str, Any] = {"patch": patch.model_dump(exclude_unset=True)} + payload: dict[str, Any] = {"patch": patch.model_dump(exclude_unset=True)} name = "task_updated" if patch.status == "done": name = "task_completed" @@ -39,10 +40,12 @@ async def update_task(self, task_id: str, patch: TaskPatch) -> Optional[Task]: await self.events.emit(TaskEvent(name=name, task=t, payload=payload)) return t - async def add_dependency(self, task_id: str, depends_on_task_id: str) -> Optional[Task]: + async def add_dependency(self, task_id: str, depends_on_task_id: str) -> Task | None: t = await self.store.add_dependency(task_id, depends_on_task_id) if t and self.events: - await self.events.emit(TaskEvent(name="task_updated", task=t, payload={"depends_on_added": depends_on_task_id})) + await self.events.emit( + TaskEvent(name="task_updated", task=t, payload={"depends_on_added": depends_on_task_id}) + ) return t async def add_subtask(self, parent_id: str, data: TaskCreate) -> Task: @@ -51,11 +54,11 @@ async def add_subtask(self, parent_id: str, data: TaskCreate) -> 
Task: await self.events.emit(TaskEvent(name="task_created", task=t, payload={"parent_id": parent_id})) return t - async def next_runnable_tasks(self, q: TaskQuery) -> List[Task]: + async def next_runnable_tasks(self, q: TaskQuery) -> list[Task]: return await self.store.next_runnable_tasks(q) async def refresh_blocked_status(self, q: TaskQuery) -> int: return await self.store.refresh_blocked_status(q) - async def get_task_tree(self, root_task_id: str, include_rollup: bool = False) -> Optional[dict]: + async def get_task_tree(self, root_task_id: str, include_rollup: bool = False) -> dict | None: return await self.store.get_task_tree(root_task_id, include_rollup=include_rollup) diff --git a/agent_ext/workbench/__init__.py b/agent_ext/workbench/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent_ext/workbench/__main__.py b/agent_ext/workbench/__main__.py new file mode 100644 index 0000000..bdee00d --- /dev/null +++ b/agent_ext/workbench/__main__.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import argparse +import asyncio +import os + +from dotenv import find_dotenv, load_dotenv + +from agent_ext.workbench.models import build_openai_chat_model, model_from_env +from agent_ext.workbench.runtime import build_ctx +from agent_ext.workbench.tui_async import run_tui + +# Load .env from cwd or any parent (so running from repo root finds .env) +load_dotenv(find_dotenv()) + +max_sub = int(os.getenv("MAX_PARALLEL_SUBAGENTS", "4")) +max_llm = int(os.getenv("MAX_PARALLEL_MODEL_CALLS", "2")) + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--case-id", default="case-1") + ap.add_argument("--session-id", default="sess-1") + ap.add_argument("--user-id", default="user-1") + ap.add_argument("--use-openai-chat-model", action="store_true") + ap.add_argument("--max-parallel-subagents", type=int, default=4) + ap.add_argument("--max-parallel-model-calls", type=int, default=2) + args = ap.parse_args() + + model = None + if 
args.use_openai_chat_model: + cfg = model_from_env() + model = build_openai_chat_model(cfg) + print(f"[model] base_url={cfg.base_url} model={cfg.model}") + + ctx = build_ctx( + case_id=args.case_id, + session_id=args.session_id, + user_id=args.user_id, + model=model, + max_parallel_subagents=args.max_parallel_subagents, + max_parallel_model_calls=args.max_parallel_model_calls, + ) + + asyncio.run(run_tui(ctx)) + + +if __name__ == "__main__": + main() diff --git a/agent_ext/workbench/adopt.py b/agent_ext/workbench/adopt.py new file mode 100644 index 0000000..bc34e3c --- /dev/null +++ b/agent_ext/workbench/adopt.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +import os +import subprocess +from pathlib import Path + +# Pull strategy: "merge" (default) or "rebase". Merge is safer for automation. +ADOPT_PULL_STRATEGY = os.getenv("ADOPT_PULL_STRATEGY", "merge").strip().lower() +# Max push retries after pull on non-fast-forward +ADOPT_PUSH_RETRIES = int(os.getenv("ADOPT_PUSH_RETRIES", "2")) + + +def _run(cmd: list[str], *, cwd: Path | None = None) -> tuple[bool, str]: + env = os.environ.copy() # inherits HTTP_PROXY/HTTPS_PROXY/etc. + p = subprocess.run(cmd, cwd=str(cwd) if cwd else None, env=env, capture_output=True, text=True) + out = (p.stdout or "") + ("\n" if p.stdout and p.stderr else "") + (p.stderr or "") + return (p.returncode == 0), out.strip() + + +def ensure_branch(branch: str, *, repo_root: Path = Path(".")) -> None: + ok, out = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=repo_root) + if not ok: + raise RuntimeError(out) + cur = out.strip() + if cur != branch: + ok, out = _run(["git", "checkout", branch], cwd=repo_root) + if not ok: + raise RuntimeError(out) + + +def fetch_and_merge_origin(branch: str, *, repo_root: Path = Path(".")) -> None: + """Fetch origin and integrate origin/ into current branch. Use before commit_and_push to deconflict with other runners. 
No-op if origin/branch does not exist yet.""" + ok, _ = _run(["git", "fetch", "origin"], cwd=repo_root) + if not ok: + raise RuntimeError("git fetch origin failed") + ok, _ = _run(["git", "rev-parse", f"origin/{branch}"], cwd=repo_root) + if not ok: + return # remote branch doesn't exist yet (e.g. first push) + if ADOPT_PULL_STRATEGY == "rebase": + ok, out = _run(["git", "rebase", f"origin/{branch}"], cwd=repo_root) + if not ok: + _run(["git", "rebase", "--abort"], cwd=repo_root) + raise RuntimeError(f"git rebase origin/{branch} failed (aborted): {out}") + else: + ok, out = _run(["git", "merge", "--no-edit", f"origin/{branch}"], cwd=repo_root) + if not ok: + _run(["git", "merge", "--abort"], cwd=repo_root) + raise RuntimeError(f"git merge origin/{branch} failed (aborted): {out}") + + +def apply_diff_to_repo(diff_text: str, *, repo_root: Path = Path(".")) -> None: + env = os.environ.copy() + p = subprocess.run( + ["git", "apply", "--whitespace=nowarn", "-"], + cwd=str(repo_root), + env=env, + input=diff_text, + capture_output=True, + text=True, + ) + out = (p.stdout or "") + ("\n" if p.stdout and p.stderr else "") + (p.stderr or "") + if p.returncode == 0: + return + p2 = subprocess.run( + ["git", "apply", "--3way", "--whitespace=nowarn", "-"], + cwd=str(repo_root), + env=env, + input=diff_text, + capture_output=True, + text=True, + ) + out2 = (p2.stdout or "") + ("\n" if p2.stdout and p2.stderr else "") + (p2.stderr or "") + if p2.returncode != 0: + raise RuntimeError(f"git apply failed:\n{out}\n\n3way failed:\n{out2}") + + +def commit_and_push(*, message: str, branch: str = "dev", repo_root: Path = Path(".")) -> None: + ensure_branch(branch, repo_root=repo_root) + ok, out = _run(["git", "status", "--porcelain"], cwd=repo_root) + if not ok: + raise RuntimeError(out) + had_changes = bool(out.strip()) + if had_changes: + ok, out = _run(["git", "stash", "push", "-m", "adopt-pre-pull"], cwd=repo_root) + if not ok: + raise RuntimeError(f"git stash failed: {out}") + try: 
+ fetch_and_merge_origin(branch, repo_root=repo_root) + finally: + if had_changes: + _run(["git", "stash", "pop"], cwd=repo_root) + ok, out = _run(["git", "status", "--porcelain"], cwd=repo_root) + if not ok: + raise RuntimeError(out) + if not out.strip(): + return # nothing to commit + + ok, out = _run(["git", "add", "-A"], cwd=repo_root) + if not ok: + raise RuntimeError(out) + + ok, out = _run(["git", "commit", "-m", message], cwd=repo_root) + if not ok: + raise RuntimeError(out) + + last_err = out + for attempt in range(ADOPT_PUSH_RETRIES + 1): + ok, out = _run(["git", "push", "origin", branch], cwd=repo_root) + if ok: + return + last_err = out + if attempt < ADOPT_PUSH_RETRIES: + fetch_and_merge_origin(branch, repo_root=repo_root) + raise RuntimeError(f"git push origin {branch} failed after {ADOPT_PUSH_RETRIES + 1} attempt(s): {last_err}") diff --git a/agent_ext/workbench/events.py b/agent_ext/workbench/events.py new file mode 100644 index 0000000..4f8fa5a --- /dev/null +++ b/agent_ext/workbench/events.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +import asyncio +from dataclasses import dataclass +from typing import Any + + +@dataclass(frozen=True) +class Event: + kind: str + who: str + msg: str + data: dict[str, Any] + + +class EventBus: + def __init__(self): + self.q: asyncio.Queue[Event] = asyncio.Queue() + + async def emit(self, e: Event) -> None: + await self.q.put(e) + + async def drain(self, limit: int = 50) -> list[Event]: + out = [] + for _ in range(limit): + if self.q.empty(): + break + out.append(self.q.get_nowait()) + return out diff --git a/agent_ext/workbench/gitops.py b/agent_ext/workbench/gitops.py new file mode 100644 index 0000000..e0cb6be --- /dev/null +++ b/agent_ext/workbench/gitops.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +import subprocess +from datetime import datetime + + +def run(cmd: list[str]) -> tuple[bool, str]: + p = subprocess.run(cmd, capture_output=True, text=True) + ok = p.returncode == 0 + 
return ok, (p.stdout + "\n" + p.stderr).strip() + + +def ensure_branch(prefix: str = "auto") -> str: + slug = datetime.now().strftime("%Y%m%d_%H%M%S") + branch = f"{prefix}/{slug}" + ok, out = run(["git", "checkout", "-b", branch]) + if not ok: + raise RuntimeError(out) + return branch + + +def commit_all(message: str) -> None: + ok, out = run(["git", "add", "-A"]) + if not ok: + raise RuntimeError(out) + ok, out = run(["git", "commit", "-m", message]) + if not ok: + raise RuntimeError(out) + + +def push(branch: str) -> None: + ok, out = run(["git", "push", "-u", "origin", branch]) + if not ok: + raise RuntimeError(out) diff --git a/agent_ext/workbench/jupyter.py b/agent_ext/workbench/jupyter.py new file mode 100644 index 0000000..7a64625 --- /dev/null +++ b/agent_ext/workbench/jupyter.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from .models import build_openai_chat_model, model_from_env +from .runtime import build_ctx + + +@dataclass +class Workbench: + """ + Notebook-friendly wrapper. 
+ + Usage in Jupyter: + wb = Workbench.from_env() + await wb.plan("add bm25 search tool") + await wb.exec("general", "find where RunContext is defined") + """ + + ctx: any + + @classmethod + def from_env(cls, *, use_openai_chat_model: bool = True): + model = None + if use_openai_chat_model: + cfg = model_from_env() + model = build_openai_chat_model(cfg) + ctx = build_ctx(model=model) + return cls(ctx=ctx) + + async def refresh_index(self): + # incremental rebuild + changed, removed = self.ctx.search.rebuild_incremental() + return {"changed": changed, "removed": removed} + + async def bm25(self, query: str, k: int = 20): + return self.ctx.search.search(query, top_k=k) + + async def mcp_call(self, tool: str, args: dict): + return await self.ctx.mcp_client.call(tool, args) + + # If you have the workflow planner/executor wired: + async def exec(self, task_type: str, text: str, hints=()): + from agent_ext.workflow.types import TaskRequest + + req = TaskRequest(text=text, task_type=task_type, hints=tuple(hints)) + wf = self.ctx.workflow_planner.choose(self.ctx, req) + result = await self.ctx.workflow_executor.execute(self.ctx, wf, req) + # simple reward + reward = 1.0 if result.ok else 0.0 + self.ctx.workflow_experience.record(req, result, reward) + self.ctx.workflow_planner.observe(req, wf.name, reward) + return result diff --git a/agent_ext/workbench/limits.py b/agent_ext/workbench/limits.py new file mode 100644 index 0000000..992f64f --- /dev/null +++ b/agent_ext/workbench/limits.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +import asyncio + + +class ModelLimiter: + """ + Shared limiter for *all* model calls across subagents. 
+ """ + + def __init__(self, max_concurrency: int = 2): + self._sem = asyncio.Semaphore(max_concurrency) + + async def __aenter__(self): + await self._sem.acquire() + + async def __aexit__(self, exc_type, exc, tb): + self._sem.release() diff --git a/agent_ext/workbench/locks.py b/agent_ext/workbench/locks.py new file mode 100644 index 0000000..c8af01b --- /dev/null +++ b/agent_ext/workbench/locks.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +import json +import time +from dataclasses import dataclass +from pathlib import Path + +LOCKS_DIR = Path(".agent_state/locks") + + +@dataclass(frozen=True) +class Lease: + key: str + owner: str + expires_at: float + + +class LeaseLockStore: + def __init__(self, root: Path = LOCKS_DIR): + self.root = root + self.root.mkdir(parents=True, exist_ok=True) + + def _path(self, key: str) -> Path: + safe = "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in key) + return self.root / f"{safe}.json" + + def try_acquire(self, *, key: str, owner: str, ttl_s: int = 900) -> Lease | None: + """ + Best-effort lock. If expired, we steal it. 
+ """ + p = self._path(key) + now = time.time() + + if p.exists(): + try: + data = json.loads(p.read_text(encoding="utf-8")) + if float(data.get("expires_at", 0)) > now: + return None # still held + except Exception: + pass # treat as expired/corrupt + + lease = Lease(key=key, owner=owner, expires_at=now + ttl_s) + p.write_text( + json.dumps({"key": key, "owner": owner, "expires_at": lease.expires_at}, indent=2), encoding="utf-8" + ) + return lease + + def release(self, lease: Lease) -> None: + p = self._path(lease.key) + if p.exists(): + try: + data = json.loads(p.read_text(encoding="utf-8")) + if data.get("owner") == lease.owner: + p.unlink() + except Exception: + # if corrupt, just remove + p.unlink(missing_ok=True) diff --git a/agent_ext/workbench/loop.py b/agent_ext/workbench/loop.py new file mode 100644 index 0000000..ae8791d --- /dev/null +++ b/agent_ext/workbench/loop.py @@ -0,0 +1,391 @@ +from __future__ import annotations + +import asyncio +import json +import os +import time +from pathlib import Path +from typing import Any + +from .subagents import SubagentResult + +# Optional: pydantic-ai for design/implement LLM steps +try: + from pydantic_ai import Agent +except ImportError: + Agent = None # type: ignore[misc, assignment] + +import contextlib + +from agent_ext.cog.scoring import score_patch, touched_files_from_diff +from agent_ext.self_improve.gates import run_gates +from agent_ext.self_improve.models import GatePlan +from agent_ext.self_improve.patching import apply_unified_diff +from agent_ext.workbench.adopt import apply_diff_to_repo, commit_and_push +from agent_ext.workbench.worktrees import cleanup_worktree, create_worktree, worktree_diff + +LLM_TRACE_MAX = 30 +# Store enough for debugging; trace is appended after full response (not streamed) +LLM_TRACE_PROMPT_LEN = 8_000 +LLM_TRACE_RESPONSE_LEN = 12_000 + + +def _append_llm_trace(ctx, kind: str, prompt: str, response: str) -> None: + traces = getattr(ctx, "llm_traces", None) + if traces is 
None: + return + if len(traces) >= LLM_TRACE_MAX: + traces.pop(0) + traces.append( + { + "kind": kind, + "prompt": (prompt or "")[:LLM_TRACE_PROMPT_LEN], + "response": (response or "")[:LLM_TRACE_RESPONSE_LEN], + } + ) + + +async def _implement_in_worktree( + ctx, + goal: str, + candidates: list[dict], + strategy: str | None = None, + task_id: str | None = None, +) -> str: + """ + Lifecycle: (1) Create a temporary worktree (sandbox). (2) Apply LLM diff there, run gates. + (3) Capture diff from worktree and save to .agent_state/patch_.diff. (4) Optionally + AUTO_ADOPT: apply that file to main repo and commit+push. (5) finally: remove worktree. + Uses task_id in run_id when provided so concurrent implement tasks don't collide on the same worktree. + """ + run_id = f"{ctx.session_id}_{task_id}" if task_id else ctx.session_id + wt = create_worktree(run_id=run_id, agent_name="writer_llm_patch") + + try: + # 1) generate diff (inside the worktree context) + patcher = ctx.subagents.get("llm_patch") + meta = {"workdir": str(wt.path), "candidates": candidates, "max_files": 6} + if strategy: + meta["strategy"] = strategy + res = await patcher.run(ctx, input=goal, meta=meta) + raw_output = (res.output or "").strip() + # Even if LLM didn't return something that starts with diff --git/---, try sanitizer (e.g. extract from markdown) + ok_apply = False + out_apply = "" + if raw_output: + ok_apply, out_apply = apply_unified_diff(raw_output, repo_root=wt.path) + if not ok_apply: + snippet = (raw_output[:800] + ("..." 
if len(raw_output) > 800 else "")) if raw_output else "(empty)" + reason = ( + "model output not a raw diff (no diff --git or --- at start)" + if not res.ok and not raw_output + else out_apply or "no unified diff found in output" + ) + return f"implement: create patch failed.\nReason: {reason}\nModel output (first 800 chars):\n{snippet}" + + # 3) gates in worktree (compile/import; pytest optional) + plan = GatePlan(import_check=True, compile_check=True, pytest_paths=[]) + gates = run_gates(plan, repo_root=wt.path) + + # 4) produce final diff + diff = worktree_diff(wt) + + # ---- NEW: persist diff into agent state ---- + state_dir = Path(".agent_state") + state_dir.mkdir(parents=True, exist_ok=True) + + diff_path = state_dir / f"patch_{run_id}.diff" + diff_path.write_text(diff, encoding="utf-8") + + # pointer to latest patch (optional but nice) + (state_dir / "last_patch_path.txt").write_text(str(diff_path), encoding="utf-8") + + # ---- append modules history (learning memory) ---- + hist = state_dir / "modules_history.json" + + data = {"patches": []} + if hist.exists(): + data = json.loads(hist.read_text(encoding="utf-8")) + + data["patches"].append( + { + "run_id": run_id, + "path": str(diff_path), + "gates_ok": gates.ok, + "diff_chars": len(diff), + } + ) + + hist.write_text(json.dumps(data, indent=2), encoding="utf-8") + # -------------------------------------------- + touched = touched_files_from_diff(diff) + sc = score_patch(gates_ok=gates.ok, diff_chars=len(diff), files_touched=len(touched), eval_delta=0.0) + # ---- AUTO-ADOPT (optional) ---- + AUTO_ADOPT = bool(int(os.getenv("AUTO_ADOPT", "0"))) # default off until you're confident + AUTO_PUSH_BRANCH = os.getenv("AUTO_PUSH_BRANCH", "dev") + threshold = float(os.getenv("AUTO_COMMIT_THRESHOLD", "80")) + + if gates.ok and AUTO_ADOPT and sc.score >= threshold: + # 1) apply to main working tree (outside worktree) + apply_diff_to_repo(diff, repo_root=Path(".")) + + # 2) rerun gates on main tree (highly 
recommended) + main_plan = GatePlan(import_check=True, compile_check=True, pytest_paths=[]) + main_gates = run_gates(main_plan, repo_root=Path(".")) + + if not main_gates.ok: + return ( + "implement: worktree gates passed, BUT main-tree gates FAILED after auto-adopt.\n" + f"diff_saved={diff_path}\n" + f"main_gates={list(main_gates.details.keys())}\n" + "Patch is applied in your working tree; revert or fix-forward.\n" + ) + + # 3) commit + push + msg = f"auto: {goal[:72]}" + commit_and_push(message=msg, branch=AUTO_PUSH_BRANCH, repo_root=Path(".")) + + return ( + f"implement: ok_apply={ok_apply} gates_ok={gates.ok}\n" + f"AUTO_ADOPT: patch applied to main repo and pushed to {AUTO_PUSH_BRANCH}\n" + f"diff_saved={diff_path}\n" + ) + # ------------------------------ + else: + keep_msg = f"worktree_kept={wt.path}\n" if bool(int(os.getenv("KEEP_WORKTREE", "0"))) else "" + return ( + f"implement: ok_apply={ok_apply} gates_ok={gates.ok}\n" + f"diff_saved={diff_path}\n" + f"Patch is on disk; worktree will be removed (use KEEP_WORKTREE=1 to keep). Use /adopt to apply to main repo.\n" + f"score={sc.score}\n" + f"threshold={threshold}\n" + f"AUTO_ADOPT={AUTO_ADOPT}\n" + f"AUTO_PUSH_BRANCH={AUTO_PUSH_BRANCH}\n" + f"{keep_msg}" + ) + + finally: + # Patch already saved to diff_path above; safe to remove worktree (sandbox only—we never commit in it). + keep = bool(int(os.getenv("KEEP_WORKTREE", "0"))) + state_dir = Path(".agent_state") + state_dir.mkdir(parents=True, exist_ok=True) + (state_dir / "last_worktree_path.txt").write_text(str(wt.path), encoding="utf-8") + if not keep: + cleanup_worktree(wt, prune_branch=False) + else: + # Leave worktree in place; user can inspect or remove manually + pass + + +async def plan_and_queue(ctx, user_goal: str) -> list[str]: + """ + Uses planner subagent to generate tasks, then enqueues them. 
+ """ + planner = ctx.subagents.get("planner") + res = await planner.run(ctx, input=user_goal, meta={}) + if not res.ok: + return [f"planner failed: {res.output}"] + + tasks = res.output + lines = [] + state = getattr(ctx, "workbench_run_state", None) + if state is not None: + state["goal"] = user_goal + for t in tasks: + ctx.task_queue.add(t["kind"], t["title"], t["input"]) + lines.append(f"queued: {t['kind']} - {t['title']}") + return lines + + +async def run_next_task(ctx) -> str: + """ + Executes a single pending task. For now, only implements: + - search: uses repo_grep + - analyze/design: placeholder + - implement: placeholder (tomorrow we wire LLM patch gen + self_improve controller) + - gates: placeholder + """ + t = await ctx.task_queue.claim_next_pending() + if not t: + return "no pending tasks" + + try: + if t.kind == "search": + query = str(t.input).strip() + # Notify if BM25 index will be built on first search + if not ctx.search._index_ready: + ctx.logger.info("Building search index (first search)…") + # Run repo_grep (literal substring) and BM25 (index) in parallel + calls = [ + ("repo_grep", query, {"root": ".", "limit": 25, "regex": False}), + ] + await ctx.orchestrator.run_many( + ctx, + calls, + max_concurrency=ctx.max_parallel_subagents, + ) + bm = await ctx.subagents.get("bm25").run(ctx, input=str(t.input), meta={"k": 20}) + t.meta["bm25_candidates"] = bm.output + state = getattr(ctx, "workbench_run_state", None) + if state is not None: + state["search_bm25_hits"] = bm.output + t.status = "done" + t.finished_at = time.time() + return f"{t.id} done: search\n- bm25: {len(bm.output)} hits\n top: " + ", ".join( + [x["path"] for x in bm.output[:5]] + ) + + if t.kind == "analyze": + goal = str(t.input).strip() + state = getattr(ctx, "workbench_run_state", None) or {} + state["goal"] = goal + if hasattr(ctx, "workbench_run_state"): + ctx.workbench_run_state.update(state) + if ctx.model and Agent is not None: + analyze_prompt = f"Clarify this goal into 
a short, concrete one-paragraph spec (what to build, what success looks like). Goal: {goal}" + async with ctx.model_limiter: + agent = Agent(model=ctx.model) + result = await agent.run(analyze_prompt) + spec = result.output if hasattr(result, "output") else str(result) + _append_llm_trace(ctx, "analyze", analyze_prompt, spec) + state["analyze_spec"] = spec + if hasattr(ctx, "workbench_run_state"): + ctx.workbench_run_state.update(state) + t.status = "done" + t.finished_at = time.time() + return f"{t.id} done: analyze\n{spec[:400]}..." + t.status = "done" + t.finished_at = time.time() + return f"{t.id} done: analyze (no model; goal stored)" + + if t.kind == "design": + state = getattr(ctx, "workbench_run_state", None) or {} + goal = state.get("goal", str(t.input)) + spec = state.get("analyze_spec", "") + bm25 = state.get("search_bm25_hits", [])[:10] + repo_hits = state.get("search_repo_hits", []) + files_ctx = "" + if bm25: + paths = [x["path"] if isinstance(x, dict) else x[0] for x in bm25] + files_ctx = "Relevant paths (from search): " + ", ".join(paths) + if repo_hits and isinstance(repo_hits, list): + flat = [] + for r in repo_hits: + if isinstance(r, list): + flat.extend([x.get("file", x) if isinstance(x, dict) else str(x) for x in r]) + else: + flat.append(str(r)) + if flat: + files_ctx += "\nRepo grep files: " + ", ".join(flat[:15]) + if ctx.model and Agent is not None: + design_prompt = ( + f"Goal: {goal}\n{spec}\n{files_ctx}\n\n" + "Output a short approach (2-3 sentences) then a JSON array of file changes. " + 'Format: {"approach": "...", "changes": [{"path": "rel/path", "action": "edit" or "create", "description": "what to do"}]}. ' + "Only include the JSON object, no markdown." 
+ ) + async with ctx.model_limiter: + agent = Agent(model=ctx.model) + result = await agent.run(design_prompt) + raw = result.output if hasattr(result, "output") else str(result) + _append_llm_trace(ctx, "design", design_prompt, raw) + raw = raw.strip() + if "```" in raw: + raw = raw.split("```")[1].replace("json", "").strip() + try: + design = json.loads(raw) + state["design"] = design + if hasattr(ctx, "workbench_run_state"): + ctx.workbench_run_state.update(state) + except json.JSONDecodeError: + state["design"] = {"approach": raw[:500], "changes": []} + if hasattr(ctx, "workbench_run_state"): + ctx.workbench_run_state.update(state) + approach = design.get("approach", "")[:300] + changes = design.get("changes", []) + t.status = "done" + t.finished_at = time.time() + return f"{t.id} done: design\n{approach}\nchanges: {len(changes)} files" + t.status = "done" + t.finished_at = time.time() + return f"{t.id} done: design (no model)" + + if t.kind == "implement": + candidates = [] + for prev in ctx.task_queue.list(): + if prev.kind == "search" and prev.meta.get("bm25_candidates"): + candidates = prev.meta["bm25_candidates"] + break + state = getattr(ctx, "workbench_run_state", None) or {} + if not candidates: + candidates = state.get("search_bm25_hits") or [] + if not candidates: + bm = await ctx.subagents.get("bm25").run(ctx, input=str(t.input), meta={"k": 20}) + candidates = bm.output or [] + design = state.get("design") or {} + strategy = (design.get("approach") or "").strip() if isinstance(design, dict) else None + out = await _implement_in_worktree(ctx, str(t.input), candidates, strategy=strategy, task_id=t.id) + t.status = "done" + t.finished_at = time.time() + return f"{t.id} done: implement\n{out}" + + if t.kind == "gates": + try: + from agent_ext.self_improve.gates import run_gates + from agent_ext.self_improve.models import GatePlan + except ImportError: + t.status = "done" + t.finished_at = time.time() + return f"{t.id} done: gates (self_improve not 
available)" + gates = run_gates(GatePlan(import_check=True, compile_check=True, pytest_paths=[])) + t.status = "done" + t.finished_at = time.time() + return f"{t.id} done: gates\nok={gates.ok}\n{gates.details}" + + # Unknown task kind + t.status = "done" + t.finished_at = time.time() + return f"{t.id} done: {t.kind} (stub)" + + except Exception as e: + t.status = "failed" + t.finished_at = time.time() + return f"{t.id} failed: {e!r}" + + +def _run_worker( + ctx, + results: list[str], + results_lock: asyncio.Lock, + progress_callback: Any | None = None, +) -> None: + """Single worker: repeatedly claim and run next task until queue empty (OpenCode-style parallel units of work).""" + + async def _run() -> None: + while True: + out = await run_next_task(ctx) + if out == "no pending tasks": + break + async with results_lock: + results.append(out) + if progress_callback is not None and callable(progress_callback): + with contextlib.suppress(Exception): + progress_callback(out) + + return _run + + +async def run_n_tasks( + ctx, + n: int, + *, + progress_callback: Any | None = None, +) -> list[str]: + """Run with n concurrent workers until queue is empty. Each worker claims and runs tasks atomically (OpenCode-style). 
+ If progress_callback is set, it is called with each task output string as tasks complete (live stream, Cursor/Claude Code style).""" + workers = max(1, n) + results: list[str] = [] + results_lock = asyncio.Lock() + worker_factory = _run_worker(ctx, results, results_lock, progress_callback) + await asyncio.gather(*[worker_factory() for _ in range(workers)]) + return results diff --git a/agent_ext/workbench/models.py b/agent_ext/workbench/models.py new file mode 100644 index 0000000..504c21a --- /dev/null +++ b/agent_ext/workbench/models.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +import os +from dataclasses import dataclass +from typing import Any + + +@dataclass +class ModelConfig: + base_url: str + api_key: str + model: str + + +def model_from_env(prefix: str = "LLM_") -> ModelConfig: + """ + LLM_BASE_URL=http://127.0.0.1:8000/v1 + LLM_API_KEY=local + LLM_MODEL=gpt-oss-120b + """ + return ModelConfig( + base_url=os.environ.get(prefix + "BASE_URL", "http://127.0.0.1:8000/v1"), + api_key=os.environ.get(prefix + "API_KEY", "local"), + model=os.environ.get(prefix + "MODEL", "gpt-oss-120b"), + ) + + +def build_openai_chat_model(cfg: ModelConfig) -> Any: + """ + Returns OpenAIChatModel(model_name, provider=OpenAIProvider(base_url=..., api_key=...)). + Imports pydantic-ai lazily so startup is fast when no model is requested. + """ + try: + from pydantic_ai.models.openai import OpenAIChatModel + from pydantic_ai.providers.openai import OpenAIProvider + except ImportError as e: + raise RuntimeError("pydantic-ai not installed. 
Install agent-patterns[agent] or pydantic-ai.") from e + provider = OpenAIProvider(base_url=cfg.base_url, api_key=cfg.api_key) + return OpenAIChatModel(cfg.model, provider=provider) diff --git a/agent_ext/workbench/parallel.py b/agent_ext/workbench/parallel.py new file mode 100644 index 0000000..313c4d4 --- /dev/null +++ b/agent_ext/workbench/parallel.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +import asyncio +from collections.abc import Awaitable, Iterable +from typing import Any + + +async def gather_bounded( + coros: Iterable[Awaitable[Any]], + *, + max_concurrency: int = 4, +) -> list[Any]: + """ + Bounded asyncio.gather: prevents saturating your local LLM server. + """ + sem = asyncio.Semaphore(max_concurrency) + + async def _wrap(coro: Awaitable[Any]) -> Any: + async with sem: + return await coro + + return await asyncio.gather(*[_wrap(c) for c in coros]) diff --git a/agent_ext/workbench/patch_models.py b/agent_ext/workbench/patch_models.py new file mode 100644 index 0000000..1363f55 --- /dev/null +++ b/agent_ext/workbench/patch_models.py @@ -0,0 +1,73 @@ +""" +Structured patch output for the LLM: Pydantic models that the model returns +so we can convert to a valid unified diff ourselves (no raw diff parsing). +""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, Field + + +class LineChange(BaseModel): + """One line in a file patch: unchanged context, added, or removed.""" + + kind: Literal["context", "add", "remove"] = Field( + description="context = unchanged line, add = new line, remove = deleted line" + ) + content: str = Field(description="The line content without any + - or space prefix") + + +class FilePatch(BaseModel): + """Edits to a single file: path (relative to repo) and list of line changes.""" + + path: str = Field(description="Relative path from repo root, e.g. 
agent_ext/foo.py") + is_new_file: bool = Field(default=False, description="True if this file is being created") + lines: list[LineChange] = Field( + default_factory=list, description="Ordered list of line changes (context/add/remove)" + ) + + +class PatchOutput(BaseModel): + """Structured patch: list of file edits. Convert to unified diff with structured_to_unified_diff().""" + + files: list[FilePatch] = Field(default_factory=list, description="List of file patches to apply") + + +def structured_to_unified_diff(patch: PatchOutput) -> str: + """ + Convert structured PatchOutput to a valid unified diff string for git apply. + We control the format so it is always valid; no LLM diff parsing needed. + """ + out: list[str] = [] + for fp in patch.files: + path = fp.path.replace("\\", "/").lstrip("/") + if fp.is_new_file: + out.append(f"diff --git a/{path} b/{path}") + out.append("new file mode 100644") + out.append("--- /dev/null") + out.append(f"+++ b/{path}") + else: + out.append(f"diff --git a/{path} b/{path}") + out.append(f"--- a/{path}") + out.append(f"+++ b/{path}") + # Single hunk: compute line counts + old_count = sum(1 for lc in fp.lines if lc.kind in ("context", "remove")) + new_count = sum(1 for lc in fp.lines if lc.kind in ("context", "add")) + if fp.is_new_file: + out.append(f"@@ -0,0 +1,{max(1, new_count)} @@") + else: + out.append(f"@@ -1,{max(1, old_count)} +1,{max(1, new_count)} @@") + for lc in fp.lines: + if lc.kind == "context": + prefix = " " + elif lc.kind == "add": + prefix = "+" + else: + prefix = "-" + line = (lc.content or "").rstrip("\n") + out.append(prefix + line) + if not out: + return "" + return "\n".join(out) + "\n" diff --git a/agent_ext/workbench/plan_models.py b/agent_ext/workbench/plan_models.py new file mode 100644 index 0000000..8a7ae0c --- /dev/null +++ b/agent_ext/workbench/plan_models.py @@ -0,0 +1,39 @@ +""" +Structured plan output: LLM returns a list of tasks (kind, title, input) +so planning is dynamic (e.g. 
skip analyze for small changes, add multiple searches). +""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, Field + + +class TaskSpec(BaseModel): + """One task in the plan. Kinds are executed by the workbench loop.""" + + kind: Literal["analyze", "search", "design", "implement", "gates"] = Field( + description="analyze=clarify goal, search=find relevant code, design=approach+file list, implement=create patch, gates=run tests" + ) + title: str = Field(description="Short human-readable title for this step") + input: str = Field( + default="", description="Input for the task: usually the goal, or a specific search query for search tasks" + ) + + +class PlanOutput(BaseModel): + """Dynamic plan: ordered list of tasks. Convert to queue with plan_and_queue.""" + + tasks: list[TaskSpec] = Field(default_factory=list, description="Ordered list of tasks to run") + + +def plan_output_to_tasks(plan: PlanOutput) -> list[dict]: + """Convert PlanOutput to the list of dicts expected by plan_and_queue (kind, title, input).""" + out = [] + for t in plan.tasks: + inp: str | dict = t.input + if t.kind == "gates": + inp = {"pytest": []} + out.append({"kind": t.kind, "title": t.title, "input": inp}) + return out diff --git a/agent_ext/workbench/planner.py b/agent_ext/workbench/planner.py new file mode 100644 index 0000000..54d2d4f --- /dev/null +++ b/agent_ext/workbench/planner.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +import asyncio +import builtins +import time +from dataclasses import dataclass, field +from typing import Any + + +@dataclass +class Task: + id: str + kind: str # analyze/search/design/implement/gates/improve + title: str + input: Any + status: str = "pending" # pending/in_progress/done/failed/cancelled + meta: dict[str, Any] = field(default_factory=dict) + created_at: float = field(default_factory=time.time) + started_at: float | None = None + finished_at: float | None = None + + @property + def 
elapsed_s(self) -> float | None: + """Seconds between start and finish (or now if in progress).""" + if self.started_at is None: + return None + end = self.finished_at if self.finished_at is not None else time.time() + return end - self.started_at + + +class TaskQueue: + """Queue of tasks; safe for many concurrent run loops (claim_next_pending is atomic).""" + + def __init__(self) -> None: + self._tasks: list[Task] = [] + self._seq = 0 + self._lock = asyncio.Lock() + + def add(self, kind: str, title: str, input: Any, meta: dict[str, Any] | None = None) -> Task: + self._seq += 1 + t = Task(id=f"t{self._seq:04d}", kind=kind, title=title, input=input, meta=meta or {}) + self._tasks.append(t) + return t + + def list(self) -> builtins.list[Task]: + return list(self._tasks) + + def next_pending(self) -> Task | None: + """First pending task (read-only; for display). Use claim_next_pending in run loops.""" + for t in self._tasks: + if t.status == "pending": + return t + return None + + async def claim_next_pending(self) -> Task | None: + """Atomically take the first pending task and mark in_progress. Safe for many concurrent run loops.""" + async with self._lock: + for t in self._tasks: + if t.status == "pending": + t.status = "in_progress" + t.started_at = time.time() + return t + return None + + def normalize_id(self, task_id: str) -> str: + """Allow 't0004' or '0004' -> 't0004'.""" + s = (task_id or "").strip() + if s.isdigit(): + return f"t{s}" + return s if s.startswith("t") else f"t{s}" + + def get_by_id(self, task_id: str) -> Task | None: + """Return task by id (accepts t0004 or 0004), or None.""" + tid = self.normalize_id(task_id) + return next((t for t in self._tasks if t.id == tid), None) + + async def cancel_by_id(self, task_id: str) -> bool | None: + """Cancel a task by id. Returns True if it was pending and is now cancelled, False if found but not pending, None if not found. 
Thread-safe.""" + tid = self.normalize_id(task_id) + async with self._lock: + for t in self._tasks: + if t.id == tid: + if t.status == "pending": + t.status = "cancelled" + t.finished_at = time.time() + return True + return False + return None + + async def retry_by_id(self, task_id: str) -> bool | None: + """Reset a failed/cancelled task to pending. Returns True if reset, False if not in retryable state, None if not found.""" + tid = self.normalize_id(task_id) + async with self._lock: + for t in self._tasks: + if t.id == tid: + if t.status in ("failed", "cancelled"): + t.status = "pending" + t.started_at = None + t.finished_at = None + return True + return False + return None + + async def retry_all_failed(self) -> int: + """Reset all failed tasks to pending. Returns count.""" + count = 0 + async with self._lock: + for t in self._tasks: + if t.status == "failed": + t.status = "pending" + t.started_at = None + t.finished_at = None + count += 1 + return count diff --git a/agent_ext/workbench/runtime.py b/agent_ext/workbench/runtime.py new file mode 100644 index 0000000..383478a --- /dev/null +++ b/agent_ext/workbench/runtime.py @@ -0,0 +1,172 @@ +from __future__ import annotations + +import asyncio +import contextlib +import os +from pathlib import Path +from typing import Any + +from agent_ext.cog.state import CogState, RegressionMemory +from agent_ext.hooks.builtins import AuditHook, PolicyHook +from agent_ext.hooks.chain import MiddlewareChain +from agent_ext.hooks.context import MiddlewareContext +from agent_ext.mcp import LocalTransport, MCPClient, MCPServer, MCPToolRegistry, ToolSpec +from agent_ext.modules.registry import ModuleRegistry +from agent_ext.run_context import Policy, RunContext +from agent_ext.search import BM25Config, BM25Index, RepoIndexerConfig, TokenizerConfig +from agent_ext.subagents.message_bus import InMemoryMessageBus, TaskManager +from agent_ext.workbench.subagents_bm25 import BM25SearchSubagent +from agent_ext.workflow.builtins import 
register_builtins as register_workflow_builtins +from agent_ext.workflow.executor import WorkflowExecutor +from agent_ext.workflow.experience import ExperienceStore +from agent_ext.workflow.planner import WorkflowPlanner +from agent_ext.workflow.registry import Registry as WorkflowRegistry + +from .limits import ModelLimiter +from .planner import TaskQueue +from .subagents import PlannerSubagent, RepoGrepSubagent, SubagentOrchestrator, SubagentRegistry +from .subagents_patch import LLMPatchSubagent + +try: + from agent_ext.self_improve.controller import SelfImproveController +except Exception: + SelfImproveController = None # type: ignore[misc, assignment] + +use_tiktoken = bool(int(os.getenv("USE_TIKTOKEN", "0"))) +tok_enc = os.getenv("TIKTOKEN_ENCODING", "o200k_base") + + +class _Logger: + def info(self, msg: str, **kw): + print(f"[info] {msg} {kw}") + + def warning(self, msg: str, **kw): + print(f"[warn] {msg} {kw}") + + def error(self, msg: str, **kw): + print(f"[error] {msg} {kw}") + + +class _Cache(dict): + def get(self, k, default=None): + return super().get(k, default) + + def set(self, k, v): + super().__setitem__(k, v) + + +class _Artifacts: + root = Path(".agent_state/runs") + + def put_json(self, key: str, obj): + self.root.mkdir(parents=True, exist_ok=True) + import json + + p = self.root / f"{key}.json" + p.write_text(json.dumps(obj, indent=2), encoding="utf-8") + return str(p) + + +def build_ctx( + *, + case_id: str = "case-1", + session_id: str = "sess-1", + user_id: str = "user-1", + model: Any | None = None, + max_parallel_subagents: int = 4, + max_parallel_model_calls: int = 2, +) -> RunContext: + cog_state = CogState() + cog_state.load() + regression_memory = RegressionMemory() + regression_memory.load() + + ctx = RunContext( + case_id=case_id, + session_id=session_id, + user_id=user_id, + policy=Policy(allow_tools=True, allow_exec=False, allow_fs_write=True), + cache=_Cache(), + logger=_Logger(), + artifacts=_Artifacts(), + trace_id=None, + 
cog_state=cog_state, + regression_memory=regression_memory, + ) + + # Workbench attachments + ctx.model = model + ctx.model_limiter = ModelLimiter(max_concurrency=max_parallel_model_calls) + ctx.max_parallel_subagents = max_parallel_subagents + + ctx.task_queue = TaskQueue() + + # Subagents + reg = SubagentRegistry() + reg.register(PlannerSubagent()) + reg.register(RepoGrepSubagent()) + reg.register(BM25SearchSubagent()) + reg.register(LLMPatchSubagent()) + ctx.subagents = reg + ctx.orchestrator = SubagentOrchestrator(reg) + # Middleware chain (async hooks) + ctx.middleware_chain = MiddlewareChain([AuditHook(), PolicyHook()]) + ctx.middleware_context = MiddlewareContext( + config={ + "case_id": case_id, + "session_id": session_id, + "max_parallel_subagents": max_parallel_subagents, + } + ) + # Message bus for inter-agent communication + ctx.message_bus = InMemoryMessageBus() + ctx.task_manager = TaskManager(message_bus=ctx.message_bus) + # Module registry (load builtins) + ctx.module_registry = ModuleRegistry() + with contextlib.suppress(Exception): + ctx.module_registry.load_all_builtins(ctx) # non-fatal if modules fail to load + # Commands map (TUI) + ctx.commands = {} + # Run state for plan → design → implement (search results, design output, etc.) 
+ ctx.workbench_run_state = {} + # Recent LLM call traces for TUI (prompt/response previews; capped at 30) + ctx.llm_traces = [] + # Background runs: list of asyncio.Task (many parallel runs; queue claims atomically; /stop or /stop all) + ctx.background_run_tasks: list = [] + # Recent task outputs for /watch (progress_callback appends; capped in TUI) + ctx.watch_outputs: list = [] + # Self-improve: apply patches and run gates (optional) + ctx.self_improve = SelfImproveController() if SelfImproveController else None + # Workflow synthesis + learning + ctx.workflow_registry = WorkflowRegistry() + register_workflow_builtins(ctx.workflow_registry) + + ctx.workflow_experience = ExperienceStore() + ctx.workflow_planner = WorkflowPlanner(ctx.workflow_experience) + ctx.workflow_executor = WorkflowExecutor() + + ctx.search = BM25Index( + bm25_cfg=BM25Config(top_k=int(os.getenv("BM25_TOP_K", "20"))), + tok_cfg=TokenizerConfig(use_tiktoken=use_tiktoken, tiktoken_encoding=tok_enc), + indexer_cfg=RepoIndexerConfig(), + ) + # Index built on first search (keeps startup fast) + + ctx.mcp_registry = MCPToolRegistry() + ctx.mcp_transport = LocalTransport(server_in=asyncio.Queue(), server_out=asyncio.Queue()) + ctx.mcp_server = MCPServer(ctx.mcp_registry, ctx.mcp_transport) + ctx.mcp_client = MCPClient(ctx.mcp_transport) + # MCP server started in run_tui() when event loop is running + + # example MCP tool: bm25_search + ctx.mcp_registry.register( + ToolSpec( + name="bm25_search", + description="Search repo via BM25 index", + input_schema={"type": "object", "properties": {"query": {"type": "string"}, "k": {"type": "integer"}}}, + output_schema={"type": "array", "items": {"type": "object"}}, + ), + lambda a: ctx.search.search(a.get("query", ""), top_k=int(a.get("k", 20))), + ) + + return ctx diff --git a/agent_ext/workbench/streaming.py b/agent_ext/workbench/streaming.py new file mode 100644 index 0000000..5888052 --- /dev/null +++ b/agent_ext/workbench/streaming.py @@ -0,0 +1,94 @@ 
+""" +Streaming and agent DAG hooks for the workbench. + +Uses pydantic-ai's run_stream() for token-level streaming and agent.iter() for +node-by-node access to the execution graph (UserPromptNode, ModelRequestNode, +CallToolsNode, End). Lets you tap into the agent DAG for observability, custom +logging, or self-replicating agent frameworks. + +See: https://ai.pydantic.dev (run_stream, stream_text, agent.iter, graph nodes) +""" + +from __future__ import annotations + +from collections.abc import AsyncIterator +from typing import Any + +# Node type names matching pydantic-ai graph (agent.iter()) +NODE_USER_PROMPT = "user_prompt" +NODE_MODEL_REQUEST = "model_request" +NODE_CALL_TOOLS = "call_tools" +NODE_END = "end" + + +async def run_agent_streaming( + ctx: Any, + agent: Any, + prompt: str, + kind: str = "llm", + *, + prompt_max_len: int = 8000, + response_max_len: int = 12000, +) -> str: + """ + Run a pydantic-ai agent with streaming; append a trace to ctx.llm_traces + and update trace["response"] as tokens arrive. Returns the full text output. + + Use this when you want a single LLM call to stream into the workbench trace + (e.g. for custom subagents or tools that use Agent). 
+ """ + traces = getattr(ctx, "llm_traces", None) + trace_entry: dict[str, Any] | None = None + if traces is not None: + max_traces = 30 + if len(traces) >= max_traces: + traces.pop(0) + trace_entry = { + "kind": kind, + "prompt": (prompt or "")[:prompt_max_len], + "response": "", + } + traces.append(trace_entry) + + text_out = "" + async with agent.run_stream(prompt) as result: + async for text in result.stream_text(): + text_out = text + if trace_entry is not None: + trace_entry["response"] = (text or "")[:response_max_len] + if trace_entry is not None and not trace_entry["response"] and text_out: + trace_entry["response"] = (text_out or "")[:response_max_len] + return text_out + + +async def iter_agent_dag( + agent: Any, + prompt: str, + **kwargs: Any, +) -> AsyncIterator[tuple[str, Any]]: + """ + Iterate over the agent execution graph (DAG) node by node. Yields + (node_type, node) for each step: user_prompt, model_request, call_tools, end. + + Lets you tap into the agent DAG for observability, metrics, or custom + handling (e.g. streaming from ModelRequestNode via node.stream(run.ctx)). + + Example: + async for node_type, node in iter_agent_dag(agent, "What is 2+2?"): + if node_type == NODE_MODEL_REQUEST: + async with node.stream(run.ctx) as stream: + async for event in stream: + ... 
+ """ + from pydantic_ai import Agent + + async with agent.iter(prompt, **kwargs) as run: + async for node in run: + if getattr(Agent, "is_user_prompt_node", lambda n: False)(node): + yield (NODE_USER_PROMPT, node) + elif getattr(Agent, "is_model_request_node", lambda n: False)(node): + yield (NODE_MODEL_REQUEST, node) + elif getattr(Agent, "is_call_tools_node", lambda n: False)(node): + yield (NODE_CALL_TOOLS, node) + elif getattr(Agent, "is_end_node", lambda n: False)(node) or type(node).__name__ == "End": + yield (NODE_END, node) diff --git a/agent_ext/workbench/subagents.py b/agent_ext/workbench/subagents.py new file mode 100644 index 0000000..54da173 --- /dev/null +++ b/agent_ext/workbench/subagents.py @@ -0,0 +1,176 @@ +from __future__ import annotations + +import asyncio +import builtins +import re +from collections.abc import Awaitable +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Protocol + +from .parallel import gather_bounded + + +@dataclass +class SubagentResult: + ok: bool + name: str + output: Any + meta: dict[str, Any] + + +class Subagent(Protocol): + name: str + + async def run(self, ctx, *, input: Any, meta: dict[str, Any]) -> SubagentResult: ... 
+ + +class SubagentRegistry: + def __init__(self): + self._agents: dict[str, Subagent] = {} + + def register(self, agent: Subagent) -> None: + self._agents[agent.name] = agent + + def get(self, name: str) -> Subagent: + if name not in self._agents: + raise KeyError(f"Unknown subagent: {name}") + return self._agents[name] + + def list(self) -> builtins.list[str]: + return sorted(self._agents.keys()) + + +class SubagentOrchestrator: + def __init__(self, registry: SubagentRegistry): + self.registry = registry + + async def run_many( + self, + ctx, + calls: list[tuple[str, Any, dict[str, Any]]], + *, + max_concurrency: int = 4, + ) -> list[SubagentResult]: + coros: list[Awaitable[SubagentResult]] = [] + for name, inp, meta in calls: + agent = self.registry.get(name) + coros.append(agent.run(ctx, input=inp, meta=meta)) + return await gather_bounded(coros, max_concurrency=max_concurrency) + + +# ---------------------------- +# Built-in starter subagents +# ---------------------------- + + +class RepoGrepSubagent: + """ + Deterministic: searches repo for keywords/regex. Cheap. Great parallel companion. 
+ """ + + name = "repo_grep" + + async def run(self, ctx, *, input: Any, meta: dict[str, Any]) -> SubagentResult: + query = str(input).strip() + root = Path(meta.get("root", ".")) + pattern = meta.get("regex", False) + + hits: list[dict] = [] + rx = re.compile(query) if pattern else None + + # small async yield to keep loop responsive + await asyncio.sleep(0) + + for path in root.rglob("*.py"): + try: + text = path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + + if rx: + if rx.search(text): + hits.append({"file": str(path)}) + else: + if query in text: + hits.append({"file": str(path)}) + + if len(hits) >= int(meta.get("limit", 25)): + break + + return SubagentResult(ok=True, name=self.name, output=hits, meta={"query": query, "count": len(hits)}) + + +def _default_plan(goal: str) -> list[dict[str, Any]]: + """Fallback when no model or LLM plan fails: fixed sequence.""" + return [ + {"kind": "analyze", "title": "Clarify goal", "input": goal}, + {"kind": "search", "title": "Search repo for relevant modules", "input": goal}, + {"kind": "design", "title": "Propose approach + file changes", "input": goal}, + {"kind": "implement", "title": "Create patch", "input": goal}, + {"kind": "gates", "title": "Run gates/tests", "input": {"pytest": []}}, + ] + + +class PlannerSubagent: + """ + Dynamic planner: when ctx.model is set, uses LLM with structured output to + choose task sequence (e.g. skip analyze for small edits, add multiple searches). + Falls back to a fixed plan when no model or validation fails. 
+ """ + + name = "planner" + + async def run(self, ctx, *, input: Any, meta: dict[str, Any]) -> SubagentResult: + goal = str(input).strip() + if not goal: + return SubagentResult(ok=True, name=self.name, output=[], meta={"goal": goal, "count": 0}) + + if getattr(ctx, "model", None) is not None: + try: + from pydantic_ai import Agent + + from .plan_models import PlanOutput, plan_output_to_tasks + + prompt = f"""Given this development goal, output a minimal ordered plan of tasks. + +GOAL: {goal} + +Available task kinds (use only these): +- analyze: clarify the goal into a short spec (use when the goal is vague or large). +- search: find relevant code; input = search query (can be more specific than the goal). +- design: propose approach and which files to change (use when multiple files or non-obvious approach). +- implement: create the code patch (always include if the goal involves code changes). +- gates: run compile/import checks and optional tests (include at the end when code was changed). + +Rules: +- Prefer fewer tasks when the goal is small (e.g. "fix typo in README" → search + implement + gates). +- For large or vague goals start with analyze, then search, then design, then implement, then gates. +- You may use multiple search tasks with different queries if needed. +- Every plan that changes code should end with implement and then gates. 
+- Return JSON only: {{"tasks": [{{"kind": "...", "title": "...", "input": "..."}}, ...]}}.""" + + async with ctx.model_limiter: + agent = Agent(model=ctx.model, output_type=PlanOutput) + result = await agent.run(prompt) + plan: PlanOutput = result.output + tasks = plan_output_to_tasks(plan) + if not tasks: + tasks = _default_plan(goal) + return SubagentResult( + ok=True, + name=self.name, + output=tasks, + meta={"goal": goal, "count": len(tasks), "dynamic": True}, + ) + except Exception: + tasks = _default_plan(goal) + return SubagentResult( + ok=True, + name=self.name, + output=tasks, + meta={"goal": goal, "count": len(tasks), "dynamic": False, "fallback": True}, + ) + + tasks = _default_plan(goal) + return SubagentResult(ok=True, name=self.name, output=tasks, meta={"goal": goal, "count": len(tasks)}) diff --git a/agent_ext/workbench/subagents_bm25.py b/agent_ext/workbench/subagents_bm25.py new file mode 100644 index 0000000..177a2d7 --- /dev/null +++ b/agent_ext/workbench/subagents_bm25.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +from typing import Any + +from .subagents import SubagentResult + + +class BM25SearchSubagent: + name = "bm25" + + async def run(self, ctx, *, input: Any, meta: dict[str, Any]) -> SubagentResult: + query = str(input).strip() + k = int(meta.get("k", 20)) + hits = ctx.search.search(query, top_k=k) # [(path, score)] + # return top paths only (keep it small) + out = [{"path": p, "score": float(s)} for p, s in hits] + return SubagentResult(ok=True, name=self.name, output=out, meta={"query": query, "k": k}) diff --git a/agent_ext/workbench/subagents_patch.py b/agent_ext/workbench/subagents_patch.py new file mode 100644 index 0000000..6eea1cc --- /dev/null +++ b/agent_ext/workbench/subagents_patch.py @@ -0,0 +1,105 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from .loop import LLM_TRACE_MAX, LLM_TRACE_PROMPT_LEN, LLM_TRACE_RESPONSE_LEN +from .patch_models import PatchOutput, 
structured_to_unified_diff +from .subagents import SubagentResult + + +def _read_snippet(root: Path, rel_path: str, max_chars: int = 6000) -> str: + p = root / rel_path + try: + txt = p.read_text(encoding="utf-8", errors="ignore") + except Exception: + return "" + if len(txt) > max_chars: + return txt[:max_chars] + "\n...\n" + return txt + + +class LLMPatchSubagent: + """ + Produces a unified diff via structured output: LLM returns PatchOutput (list of + file edits with context/add/remove lines), we convert to valid unified diff. + Avoids raw diff parsing and format failures. + """ + + name = "llm_patch" + + async def run(self, ctx, *, input: Any, meta: dict[str, Any]) -> SubagentResult: + if ctx.model is None: + return SubagentResult(ok=False, name=self.name, output="", meta={"error": "ctx.model is None"}) + + workdir = Path(meta.get("workdir", ".")) + goal = str(input) + + candidates: list[dict[str, Any]] = meta.get("candidates", [])[: int(meta.get("max_files", 6))] + + snippets = [] + for c in candidates: + rp = c["path"] + s = _read_snippet(workdir, rp) + if s: + snippets.append(f"FILE: {rp}\n---\n{s}\n") + + strategy = meta.get("strategy") + strategy_block = f"\nSTRATEGY (follow this approach):\n{strategy}\n" if strategy else "" + + prompt = f"""You are editing a git repository. Return a structured patch (JSON) describing the minimal code changes. +{strategy_block} +GOAL: +{goal} + +RULES: +- Only change files needed for the goal. Prefer editing existing files over creating new ones. +- For each file: path (relative, e.g. agent_ext/foo.py), is_new_file (true only for new files), and lines: list of {{"kind": "context"|"add"|"remove", "content": "line text"}}. +- context = unchanged line, add = new line, remove = deleted line. Order matters; keep context lines around edits so the patch is readable. +- Keep changes minimal. Ensure code would still compile. 
+ +CONTEXT SNIPPETS: +{chr(10).join(snippets) if snippets else "(no snippets)"} + +Return only the structured patch: {{"files": [{{"path": "...", "is_new_file": false, "lines": [{{"kind": "context", "content": "..."}}, ...]}}, ...]}}.""" + + traces = getattr(ctx, "llm_traces", None) + trace_entry: dict[str, Any] | None = None + if traces is not None: + if len(traces) >= LLM_TRACE_MAX: + traces.pop(0) + trace_entry = { + "kind": "llm_patch", + "prompt": (prompt or "")[:LLM_TRACE_PROMPT_LEN], + "response": "", + } + traces.append(trace_entry) + + try: + async with ctx.model_limiter: + from pydantic_ai import Agent + + agent = Agent(model=ctx.model, output_type=PatchOutput) + result = await agent.run(prompt) + structured: PatchOutput = result.output + + diff = structured_to_unified_diff(structured) + if trace_entry is not None: + trace_entry["response"] = (diff or "")[:LLM_TRACE_RESPONSE_LEN] + + ok = bool(diff.strip()) and ("--- " in diff or "diff --git" in diff) + return SubagentResult( + ok=ok, + name=self.name, + output=diff, + meta={"files_considered": [c["path"] for c in candidates], "structured": True}, + ) + except Exception as e: + if trace_entry is not None: + trace_entry["response"] = f"Structured output failed: {e!s}" + return SubagentResult( + ok=False, + name=self.name, + output="", + meta={"error": str(e), "files_considered": [c["path"] for c in candidates]}, + ) diff --git a/agent_ext/workbench/tui.py b/agent_ext/workbench/tui.py new file mode 100644 index 0000000..2296a1f --- /dev/null +++ b/agent_ext/workbench/tui.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from collections.abc import Callable + +from rich.console import Console +from rich.markdown import Markdown +from rich.panel import Panel + +console = Console() + + +def run_tui( + *, + on_user_message: Callable[[str], str], + on_command: Callable[[str], str | None], +) -> None: + console.print(Panel.fit("agent_patterns workbench (type /help for commands, /quit to exit)")) + + 
while True: + try: + msg = console.input("[bold cyan]you> [/bold cyan]").strip() + except (EOFError, KeyboardInterrupt): + console.print("\nbye") + return + + if not msg: + continue + + if msg.startswith("/"): + if msg in ("/quit", "/exit"): + console.print("bye") + return + out = on_command(msg) + if out: + console.print(Panel(out, title=msg)) + else: + console.print(Panel("unknown command", title=msg)) + continue + + out = on_user_message(msg) + # render markdown if it looks like markdown + console.print(Markdown(out)) diff --git a/agent_ext/workbench/tui_async.py b/agent_ext/workbench/tui_async.py new file mode 100644 index 0000000..c6eafc5 --- /dev/null +++ b/agent_ext/workbench/tui_async.py @@ -0,0 +1,889 @@ +from __future__ import annotations + +import asyncio +import contextlib +import re +import time + +from rich.console import Console, Group +from rich.live import Live +from rich.panel import Panel +from rich.rule import Rule +from rich.spinner import Spinner +from rich.table import Table +from rich.text import Text +from rich.theme import Theme + +from .loop import plan_and_queue, run_n_tasks, run_next_task + +# Slightly custom theme: keep default but ensure status colors pop +console = Console(theme=Theme({"info": "cyan", "success": "green", "warn": "yellow", "error": "red", "dim": "dim"})) + +BANNER = """[bold cyan] ╭─────────────────────────────────────╮ + │ [bold white]agent_patterns[/bold white] [dim]workbench[/dim] │ + ╰─────────────────────────────────────╯[/bold cyan] + [dim]Quick start:[/] + [cyan]•[/] Type a goal or [bold]/plan [/] to plan + [cyan]•[/] [bold]/run[/] to execute · [bold]/watch[/] for live view + [cyan]•[/] [bold]/tasks[/] to see queue · [bold]/help[/] for all commands +""" +# Animated-looking rule (static; use Live elsewhere for motion) +BANNER_RULE_STYLE = "cyan dim" + +# Kind-specific icons for task display +KIND_ICON = { + "analyze": "🧠", + "search": "🔍", + "design": "📐", + "implement": "🔨", + "gates": "🧪", + "improve": "✨", +} 
+ +KIND_STYLE = { + "analyze": "bright_blue", + "search": "yellow", + "design": "magenta", + "implement": "green", + "gates": "cyan", + "improve": "bright_magenta", +} + +# Spinner names: dots, dots12, line, aesthetic, runner, arc, etc. Run: python -m rich.spinner +RUN_SPINNER = "dots12" +LIVE_REFRESH_PER_SECOND = 10 + + +class _LiveSpinner: + """Renderable that shows an animated spinner; message_ref is a list so the caller can update the text. Implements __rich_console__ for Rich.""" + + def __init__( + self, + message_ref: list[str], + spinner_name: str = "dots12", + style: str = "bold cyan", + use_markup: bool = True, + ): + self._message_ref = message_ref + self._spinner = Spinner(spinner_name, style=style) + self._use_markup = use_markup + + def __rich_console__(self, console: Console, options): + t = time.time() + msg = self._message_ref[0] if self._message_ref else "Running..." + text = Text.from_markup(" " + msg) if self._use_markup else Text(" " + msg) + yield Group(self._spinner.render(t), text) + + +class _LiveTraceView: + """Shows the most recent LLM trace. 
Implement (llm_patch) streams into the trace; analyze/design append after the full response.""" + + def __init__(self, ctx, max_lines: int = 22): + self._ctx = ctx + self._max_lines = max_lines + + def __rich_console__(self, console: Console, options): + traces = getattr(self._ctx, "llm_traces", []) + if not traces: + yield Panel("[dim]Waiting for LLM…[/]", title="[bold]LLM trace[/]", border_style="dim", padding=(0, 1)) + return + entry = traces[-1] + kind = entry.get("kind", "?") + prompt = (entry.get("prompt") or "").strip() + response = (entry.get("response") or "").strip() + half = self._max_lines // 2 + prompt_lines = prompt.splitlines()[:half] + response_lines = response.splitlines()[:half] + prompt_preview = "\n".join(prompt_lines) + ("\n…" if len(prompt.splitlines()) > half else "") + response_preview = "\n".join(response_lines) + ("\n…" if len(response.splitlines()) > half else "") + body = f"[bold magenta]{kind}[/]\n[dim]in:[/] {prompt_preview}\n[dim]out:[/] {response_preview}" + yield Panel(body, title="[bold]LLM trace[/]", border_style="magenta", padding=(0, 1)) + + +# Which subagents run for each task kind (for display) +TASK_SUBAGENTS = { + "analyze": "LLM (clarify goal)", + "search": "repo_grep, bm25", + "design": "LLM (approach + file list)", + "implement": "llm_patch (worktree)", + "gates": "import_check, compile_check", +} + + +def _status_style(status: str) -> str: + if status == "done": + return "green" + if status == "in_progress": + return "yellow" + if status == "failed": + return "red" + if status == "cancelled": + return "dim strike" + return "dim" + + +def _watch_renderable(ctx, task_id: str | None = None) -> Group: + """Build the live-updating view for /watch: recent task outputs + last LLM trace.""" + watch_out = getattr(ctx, "watch_outputs", []) + lines = [] + for out in watch_out[-25:]: + first = (out.split("\n")[0] or "").strip() + if "done" in out and "failed" not in out: + lines.append(f" [green]✓[/] {first}") + elif "failed" in 
out: + lines.append(f" [red]✗[/] {first}") + else: + lines.append(f" [dim]{first}[/]") + task_block = "\n".join(lines) if lines else "[dim]No task output yet. Run /run to see completions here.[/]" + task_panel = Panel( + task_block, + title="[bold]Recent task output[/] [dim](streams as run progresses)[/dim]", + border_style="cyan", + padding=(0, 1), + ) + trace_panel = _LiveTraceView(ctx, max_lines=14) + if task_id: + # Highlight that we're watching for this task + watching = f"[yellow]Watching for {task_id}[/] — output will appear above when it completes." + header = Panel(watching, title="[bold]watch[/]", border_style="yellow", padding=(0, 1)) + return Group(header, task_panel, trace_panel) + return Group(task_panel, trace_panel) + + +def _format_elapsed(elapsed_s: float | None) -> str: + """Format elapsed seconds into human-readable string.""" + if elapsed_s is None: + return "[dim]—[/]" + if elapsed_s < 1.0: + return f"[dim]{elapsed_s * 1000:.0f}ms[/]" + if elapsed_s < 60: + return f"[dim]{elapsed_s:.1f}s[/]" + mins = int(elapsed_s // 60) + secs = int(elapsed_s % 60) + return f"[dim]{mins}m{secs:02d}s[/]" + + +def _tasks_table(ctx) -> Table: + t = Table(title="[bold]Task Queue[/bold]", title_style="bold white", border_style="dim") + t.add_column("id", style="bold cyan", no_wrap=True) + t.add_column("kind", style=None, no_wrap=True) + t.add_column("status", style=None, no_wrap=True) + t.add_column("time", style=None, no_wrap=True, justify="right") + t.add_column("title", style="white") + for task in ctx.task_queue.list(): + icon = KIND_ICON.get(task.kind, "·") + kstyle = KIND_STYLE.get(task.kind, "magenta") + t.add_row( + task.id, + f"{icon} [{kstyle}]{task.kind}[/]", + f"[{_status_style(task.status)}]{task.status}[/]", + _format_elapsed(task.elapsed_s), + task.title, + ) + return t + + +# Max chars per trace when listing (avoid hanging on /traces with huge prompts/responses) +TRACE_PREVIEW_PROMPT = 800 +TRACE_PREVIEW_RESPONSE = 1200 + + +def 
_format_llm_trace(entry: dict, truncate: bool = True) -> str: + kind = entry.get("kind", "?") + prompt = entry.get("prompt", "") or "" + response = entry.get("response", "") or "" + if truncate: + if len(prompt) > TRACE_PREVIEW_PROMPT: + prompt = prompt[:TRACE_PREVIEW_PROMPT] + "\n… [truncated]" + if len(response) > TRACE_PREVIEW_RESPONSE: + response = response[:TRACE_PREVIEW_RESPONSE] + "\n… [truncated]" + return f"[bold magenta]{kind}[/]\n[dim]prompt:[/]\n{prompt}\n[dim]response:[/]\n{response}" + + +async def _ainput(prompt: str) -> str: + # Rich input is blocking; run it in a thread so we can keep an async loop. + return await asyncio.to_thread(console.input, prompt) + + +async def run_tui(ctx) -> None: + # Start MCP server now that the event loop is running (cannot start in build_ctx) + ctx.mcp_server.start() + console.print(BANNER) + console.print(Rule(style=BANNER_RULE_STYLE)) + console.print( + Panel.fit( + "[bold]/help[/] commands [bold]/plan [/] [bold]/run[/] [bold]/quit[/]", + border_style="cyan", + padding=(0, 1), + ) + ) + + while True: + msg = (await _ainput("[bold cyan]you> [/bold cyan]")).strip() + if not msg: + continue + + if msg in ("/quit", "/exit"): + console.print("bye") + return + + if msg == "/help": + console.print( + Panel( + "\n".join( + [ + "[bold white]Planning & Execution[/]", + " [cyan]/plan [/] queue plan (background)", + " [cyan]/run[/] [dim]or[/] [cyan]/run N[/] run in background [cyan]/run N fg[/] wait & watch", + " [cyan]/stop[/] [dim]or[/] [cyan]/stop all[/] cancel run(s)", + "", + "[bold white]Inspection[/]", + " [cyan]/tasks[/] task queue with timing", + " [cyan]/status[/] case/session + run info", + " [cyan]/agents[/] list subagents", + " [cyan]/watch[/] [dim][id][/] live view of run + trace", + " [cyan]/diff[/] show last generated patch", + " [cyan]/trace[/] last LLM trace (full)", + " [cyan]/traces[/] [dim][N][/] last N traces (preview)", + "", + "[bold white]Actions[/]", + " [cyan]/adopt[/] apply last patch to repo", + " 
[cyan]/retry[/] [dim][id][/] retry failed task(s)", + " [cyan]/cancel [/] cancel pending task", + " [cyan]/ask [/] one-off LLM question", + "", + "[bold white]Config & Misc[/]", + " [cyan]/parallel [/] max subagents", + " [cyan]/model[/] model info", + " [cyan]/workflows[/] list workflows", + " [cyan]/clear[/] clear screen", + " [cyan]/quit[/] exit", + ] + ), + title="[bold]commands[/bold]", + border_style="cyan", + ) + ) + continue + + if msg == "/status": + tasks_list = getattr(ctx, "background_run_tasks", []) + active = [t for t in tasks_list if not t.done()] + done_q = sum(1 for t in ctx.task_queue.list() if t.status == "done") + pending_q = sum(1 for t in ctx.task_queue.list() if t.status == "pending") + bg_line = "" + if active: + bg_line = f"\n[yellow]{len(active)} run(s) in progress[/] [dim](queue: {done_q} done, {pending_q} pending — /stop or /stop all)[/]" + else: + bg_line = f"\n[dim]queue: {done_q} done, {pending_q} pending[/]" + console.print( + Panel( + f"[bold]case[/]={ctx.case_id} [bold]session[/]={ctx.session_id} [bold]user[/]={ctx.user_id}{bg_line}", + title="[bold]status[/bold]", + border_style="dim", + ) + ) + continue + + if msg.startswith("/stop"): + tasks_list = getattr(ctx, "background_run_tasks", []) + active = [t for t in tasks_list if not t.done()] + if not active: + console.print(Panel("[dim]No background runs in progress.[/]", title="stop", border_style="dim")) + continue + stop_all = msg.strip().lower().endswith("all") + if stop_all: + for t in active: + t.cancel() + for t in active: + with contextlib.suppress(asyncio.CancelledError): + await t + ctx.background_run_tasks.clear() + console.print(Panel(f"[yellow]Stopped {len(active)} run(s).[/]", title="stop", border_style="yellow")) + else: + t = active[-1] + t.cancel() + with contextlib.suppress(asyncio.CancelledError): + await t + with contextlib.suppress(ValueError): + ctx.background_run_tasks.remove(t) + console.print(Panel("[yellow]Most recent run stopped.[/]", title="stop", 
border_style="yellow")) + continue + + if msg == "/agents": + agents = ctx.subagents.list() + console.print( + Panel( + "\n".join(f" [cyan]•[/] {a}" for a in agents), + title="[bold]subagents[/bold]", + border_style="cyan", + ) + ) + continue + + if msg == "/adopt": + from pathlib import Path + + from agent_ext.self_improve.patching import apply_unified_diff + + state_dir = Path(".agent_state") + path_file = state_dir / "last_patch_path.txt" + if path_file.exists(): + diff_path = Path(path_file.read_text(encoding="utf-8").strip()) + else: + diff_path = state_dir / "last_patch.diff" + if not diff_path.exists(): + console.print( + Panel( + "No saved patch. Run /run and complete an implement step first (patch is saved to .agent_state/patch_.diff).", + title="adopt", + border_style="yellow", + ) + ) + continue + diff = diff_path.read_text(encoding="utf-8") + ok, out = apply_unified_diff(diff, repo_root=Path(".")) + if not ok: + console.print(Panel(f"adopt failed: {out}", title="adopt")) + continue + console.print(Panel(f"adopted patch from {diff_path}", title="adopt")) + continue + if msg == "/tasks": + console.print(_tasks_table(ctx)) + console.print("[dim] ───[/dim]") + continue + + if msg.startswith("/cancel "): + task_id = msg.split(maxsplit=1)[1].strip() if len(msg.split()) > 1 else "" + if not task_id: + console.print( + Panel( + "[dim]Usage: /cancel e.g. /cancel t0004 or /cancel 0004[/]", + title="cancel", + border_style="yellow", + ) + ) + continue + result = await ctx.task_queue.cancel_by_id(task_id) + if result is True: + console.print( + Panel( + "[green]Task cancelled.[/] Use [cyan]/tasks[/] to see queue.", + title="cancel", + border_style="green", + ) + ) + elif result is False: + t = ctx.task_queue.get_by_id(task_id) + status = t.status if t else "?" 
+ console.print( + Panel( + f"[yellow]Task not pending (status: {status}).[/] Only pending tasks can be cancelled.", + title="cancel", + border_style="yellow", + ) + ) + else: + console.print( + Panel( + f"[red]No task with id '{task_id}'.[/] Use [cyan]/tasks[/] for ids.", + title="cancel", + border_style="red", + ) + ) + continue + + if msg == "/clear": + console.clear() + console.print(BANNER) + console.print(Rule(style=BANNER_RULE_STYLE)) + continue + + if msg == "/diff": + from pathlib import Path as _DiffPath + + from rich.syntax import Syntax + + state_dir = _DiffPath(".agent_state") + path_file = state_dir / "last_patch_path.txt" + diff_path = None + if path_file.exists(): + diff_path = _DiffPath(path_file.read_text(encoding="utf-8").strip()) + if diff_path is None or not diff_path.exists(): + console.print( + Panel( + "[dim]No saved patch yet. Run [cyan]/run[/] to generate one.[/]", + title="diff", + border_style="dim", + ) + ) + continue + diff_content = diff_path.read_text(encoding="utf-8") + if not diff_content.strip(): + console.print(Panel("[dim]Patch file is empty.[/]", title="diff", border_style="dim")) + continue + syntax = Syntax(diff_content, "diff", theme="monokai", line_numbers=True, word_wrap=True) + console.print( + Panel(syntax, title=f"[bold]Last patch[/] [dim]({diff_path})[/]", border_style="green", padding=(0, 1)) + ) + continue + + if msg == "/retry" or msg.startswith("/retry "): + parts = msg.split(maxsplit=1) + if len(parts) >= 2 and parts[1].strip(): + task_id = parts[1].strip() + result = await ctx.task_queue.retry_by_id(task_id) + if result is True: + console.print( + Panel( + "[green]Task reset to pending.[/] Use [cyan]/run[/] to execute.", + title="retry", + border_style="green", + ) + ) + elif result is False: + t = ctx.task_queue.get_by_id(task_id) + status = t.status if t else "?" 
+ console.print( + Panel( + f"[yellow]Task not retryable (status: {status}).[/] Only failed/cancelled tasks can be retried.", + title="retry", + border_style="yellow", + ) + ) + else: + console.print(Panel(f"[red]No task with id '{task_id}'.[/]", title="retry", border_style="red")) + else: + count = await ctx.task_queue.retry_all_failed() + if count > 0: + console.print( + Panel( + f"[green]Reset {count} failed task(s) to pending.[/] Use [cyan]/run[/] to execute.", + title="retry", + border_style="green", + ) + ) + else: + console.print(Panel("[dim]No failed tasks to retry.[/]", title="retry", border_style="dim")) + continue + + if msg == "/watch" or msg.startswith("/watch "): + task_id = msg.split(maxsplit=1)[1].strip() if msg.startswith("/watch ") else None + if task_id and not task_id.startswith("t"): + task_id = f"t{task_id}" if task_id.isdigit() else task_id + initial = _watch_renderable(ctx, task_id) + with Live(initial, refresh_per_second=4, console=console) as live: + + async def _watch_update_loop(_task_id: str | None = task_id) -> None: + while True: + live.update(_watch_renderable(ctx, _task_id)) + await asyncio.sleep(0.25) + + update_task = asyncio.create_task(_watch_update_loop()) + try: + await _ainput("\n[dim]Press Enter to close watch...[/] ") + finally: + update_task.cancel() + with contextlib.suppress(asyncio.CancelledError): + await update_task + continue + + if msg.strip() == "/trace": + traces = getattr(ctx, "llm_traces", []) + if not traces: + console.print( + Panel( + "[dim]No LLM traces yet. 
Run /run to generate.[/]", + title="[bold]trace[/bold]", + border_style="dim", + ) + ) + else: + entry = traces[-1] + kind = entry.get("kind", "?") + body = _format_llm_trace(entry, truncate=False) + console.print( + Panel( + body, + title=f"[bold]last trace (full)[/] [magenta]{kind}[/]", + border_style="magenta", + padding=(0, 1), + ) + ) + from pathlib import Path + + trace_file = Path(".agent_state/last_trace.txt") + trace_file.parent.mkdir(parents=True, exist_ok=True) + trace_file.write_text( + f"kind: {kind}\n\n--- prompt ---\n{entry.get('prompt', '')}\n\n--- response ---\n{entry.get('response', '')}", + encoding="utf-8", + ) + console.print(f"[dim]Also saved to {trace_file} (open in editor to search)[/]") + continue + + if msg == "/traces" or msg.startswith("/traces "): + traces = getattr(ctx, "llm_traces", []) + n = 5 + parts = msg.split(maxsplit=1) + if len(parts) >= 2 and parts[1].strip().isdigit(): + n = max(1, min(30, int(parts[1].strip()))) + show = traces[-n:] if traces else [] + if not show: + console.print( + Panel( + "[dim]No LLM traces yet. 
Run /run (analyze, design, or implement) to generate.[/]", + title="[bold]traces[/bold]", + border_style="dim", + ) + ) + else: + for i, entry in enumerate(reversed(show)): + body = _format_llm_trace(entry, truncate=True) + console.print( + Panel( + body, + title=f"[bold]trace[/] {len(show) - i} ([magenta]{entry.get('kind', '?')}[/]) [dim]preview[/dim]", + border_style="dim", + padding=(0, 1), + ) + ) + console.print( + "[dim]Use [cyan]/trace[/] for the last trace in full (and saved to .agent_state/last_trace.txt)[/]" + ) + continue + + if msg.startswith("/parallel "): + try: + n = int(msg.split(" ", 1)[1].strip()) + ctx.max_parallel_subagents = max(1, n) + console.print(Panel(f"max_parallel_subagents={ctx.max_parallel_subagents}")) + except Exception: + console.print(Panel("usage: /parallel 4")) + continue + + if msg == "/model": + model_status = "[green]set[/]" if ctx.model else "[red]none[/]" + limiter = ctx.model_limiter._sem._value if hasattr(ctx.model_limiter, "_sem") else "n/a" + console.print( + Panel( + f"model={model_status} [dim]parallel slots=[/]{limiter}", + title="[bold]model[/bold]", + border_style="dim", + ) + ) + continue + + if msg.startswith("/plan "): + goal = msg.split(" ", 1)[1].strip() + if not goal: + console.print(Panel("usage: /plan ", title="plan", border_style="dim")) + continue + + def _on_plan_done_slash(t: asyncio.Task) -> None: + try: + if t.cancelled(): + return + exc = t.exception() + if exc is not None: + console.print(Panel(f"[red]Plan failed: {exc}[/]", title="plan", border_style="red")) + return + lines = t.result() + if lines and "planner failed" not in (lines[0] or ""): + console.print(_tasks_table(ctx)) + console.print( + Panel( + "[green]Plan ready.[/] Use [cyan]/run[/] to start.", title="plan", border_style="green" + ) + ) + else: + console.print( + Panel( + "\n".join(lines) if lines else "[dim]No tasks.[/]", title="plan", border_style="yellow" + ) + ) + except Exception as e: + console.print(Panel(f"[red]{e}[/]", 
title="plan", border_style="red")) + + asyncio.create_task(plan_and_queue(ctx, goal)).add_done_callback(_on_plan_done_slash) + console.print( + Panel( + f"[dim]Planning in background:[/] {goal[:80]}{'…' if len(goal) > 80 else ''}\n[dim]/tasks when ready, then /run.[/]", + title="plan", + border_style="green", + ) + ) + continue + + if msg.startswith("/ask "): + question = msg.split(" ", 1)[1].strip() + if not question: + console.print(Panel("usage: /ask ", title="ask", border_style="dim")) + continue + if not ctx.model: + console.print( + Panel("[red]No model set. Use --use-openai-chat-model.[/]", title="ask", border_style="red") + ) + continue + + async def _ask_background(q: str) -> None: + try: + from pydantic_ai import Agent + + async with ctx.model_limiter: + agent = Agent(model=ctx.model) + result = await agent.run(q) + out = getattr(result, "output", None) or str(result) + console.print( + Panel( + f"[dim]Q:[/] {q[:120]}{'…' if len(q) > 120 else ''}\n\n[green]{out}[/]", + title="ask", + border_style="cyan", + ) + ) + except asyncio.CancelledError: + console.print(Panel("[yellow]Ask cancelled.[/]", title="ask", border_style="yellow")) + except Exception as e: + console.print(Panel(f"[red]{e}[/]", title="ask", border_style="red")) + + asyncio.create_task(_ask_background(question)) + console.print( + Panel( + "[dim]Asking in background.[/] Answer will appear when ready. 
Keep typing — /tasks, /run, etc.[/]", + title="ask", + border_style="cyan", + ) + ) + continue + + if msg.startswith("/run"): + parts = msg.split() + count = 1 + # Default: run in background so TUI stays responsive; use fg/wait to block and watch + background = True + if len(parts) >= 2: + if parts[-1] in ("fg", "wait", "foreground"): + background = False + parts = parts[:-1] + elif parts[-1] in ("&", "bg"): + parts = parts[:-1] + if parts and len(parts) >= 2: + try: + count = max(1, int(parts[1])) + except Exception: + count = 1 + + if background: + # Non-blocking: run in background; stream task completions live (Cursor/Claude Code style) + tasks_list = getattr(ctx, "background_run_tasks", []) + + def _on_task_complete(out: str) -> None: + first = (out.split("\n")[0] or "").strip() + if "done" in out and "failed" not in out: + console.print(f" [green]✓[/] [dim]{first}[/]") + elif "failed" in out: + console.print(f" [red]✗[/] [dim]{first}[/]") + watch_out = getattr(ctx, "watch_outputs", None) + if watch_out is not None: + watch_out.append(out) + if len(watch_out) > 100: + watch_out.pop(0) + + def _on_background_done(t: asyncio.Task) -> None: + with contextlib.suppress(ValueError): + ctx.background_run_tasks.remove(t) + if t.cancelled(): + console.print(Panel("[yellow]Background run stopped.[/]", title="run", border_style="yellow")) + return + exc = t.exception() + if exc is not None: + console.print(Panel(f"[red]Background run error: {exc}[/]", title="run", border_style="red")) + return + outs = t.result() + if not outs: + console.print(Panel("[dim]No tasks run (queue empty).[/]", title="run", border_style="dim")) + return + n_done = sum(1 for o in outs if "done" in o and "failed" not in o) + failed_outs = [o for o in outs if "failed" in o] + n_fail = len(failed_outs) + body = f"[green]Background run finished.[/] {n_done} done, {n_fail} failed." 
+ if failed_outs: + body += "\n\n[red]Failed task(s):[/]" + for o in failed_outs: + lines = o.strip().split("\n") + # For implement/patch failures show more context (reason + model snippet) + if "create patch failed" in o or "implement:" in o and "failed" in o: + excerpt = "\n ".join(lines[:8]) if len(lines) > 1 else lines[0] + if len(excerpt) > 700: + excerpt = excerpt[:697] + "..." + body += f"\n [red]•[/] {excerpt}" + else: + first_line = (lines[0] or "").strip() + if len(first_line) > 120: + first_line = first_line[:117] + "..." + body += f"\n [red]•[/] {first_line}" + body += "\n\n[dim]Use /tasks and /traces for full details.[/]" + else: + body += "\n[dim]Use /tasks and /traces to inspect.[/]" + if n_fail == 0 and outs: + for o in outs: + if "diff_saved=" in o: + m = re.search(r"diff_saved=(\S+)", o) + if m: + body += f"\n[cyan]Patch:[/] [dim]{m.group(1)}[/] [dim](/adopt to apply)[/]" + break + console.print(Panel(body, title="run", border_style="green" if n_fail == 0 else "red")) + + task = asyncio.create_task(run_n_tasks(ctx, count, progress_callback=_on_task_complete)) + task.add_done_callback(_on_background_done) + ctx.background_run_tasks.append(task) + len([x for x in ctx.background_run_tasks if not x.done()]) + console.print( + Panel( + f"[green]Run started[/] ({count} worker(s)) — task completions will stream below. 
[cyan]/status[/] [cyan]/traces[/] [cyan]/stop[/]", + title="run", + border_style="green", + ) + ) + continue + + # Foreground (blocking): show Live spinner + trace + outs: list[str] = [] + run_message: list[str] = ["Starting…"] + run_spinner_panel = Panel( + _LiveSpinner(run_message, spinner_name=RUN_SPINNER), + title="[bold yellow] run [/]", + border_style="yellow", + padding=(0, 1), + ) + live_renderable = Group(run_spinner_panel, _LiveTraceView(ctx)) + + with Live(live_renderable, refresh_per_second=LIVE_REFRESH_PER_SECOND, console=console): + for _i in range(max(1, count)): + next_t = ctx.task_queue.next_pending() + if next_t: + subagents_desc = TASK_SUBAGENTS.get(next_t.kind, "—") + run_message[0] = f"[bold]{next_t.id}[/] [cyan]({next_t.kind})[/] [dim]→ {subagents_desc}[/]" + else: + run_message[0] = "[dim]No pending tasks[/]" + out = await run_next_task(ctx) + outs.append(out) + if next_t and out.startswith(f"{next_t.id} done"): + run_message[0] = f"[green]✓ {next_t.id} done[/] — next…" + elif next_t and out.startswith(f"{next_t.id} failed"): + run_message[0] = f"[red]✗ {next_t.id} failed[/] — next…" + + # After Live stops, show completion and result panel + if outs: + for o in outs: + if "done:" in o: + console.print(f" [green]✓[/] [dim]{o.split(chr(10))[0]}[/]") + elif "failed" in o: + console.print(f" [red]✗[/] [dim]{o.split(chr(10))[0]}[/]") + # Show last LLM trace briefly so user can see what the model saw/returned + traces = getattr(ctx, "llm_traces", []) + if traces: + last = traces[-1] + console.print( + Panel( + _format_llm_trace(last), + title="[bold]last LLM trace[/] [dim](/traces for more)[/]", + border_style="magenta", + padding=(0, 1), + ) + ) + console.print() + console.print(Panel("\n\n".join(outs), title="[bold]run[/bold]", border_style="yellow")) + continue + + if msg == "/workflows": + names = ctx.workflow_registry.list_workflows() + body = "\n".join(f" [cyan]•[/] {n}" for n in names) if names else "[dim]none[/]" + console.print(Panel(body, 
title="[bold]workflows[/bold]", border_style="dim")) + continue + + if msg.startswith("/assemble "): + # /assemble ocr extract text from this pdf + rest = msg.split(" ", 2) + if len(rest) < 3: + console.print(Panel("usage: /assemble ")) + continue + task_type, text = rest[1], rest[2] + req = __import__("agent_ext.workflow.types", fromlist=["TaskRequest"]).TaskRequest( + text=text, + task_type=task_type, + hints=("needs_planning", "needs_memory") if task_type == "ocr" else ("needs_planning",), + ) + wf = ctx.workflow_planner.choose(ctx, req) + console.print(Panel(f"chosen workflow: {wf.name}\nsteps: {[s.component_name for s in wf.steps]}")) + continue + + if msg.startswith("/exec "): + rest = msg.split(" ", 2) + if len(rest) < 3: + console.print(Panel("usage: /exec ")) + continue + task_type, text = rest[1], rest[2] + from agent_ext.workflow.types import TaskRequest + + req = TaskRequest( + text=text, + task_type=task_type, + hints=("needs_planning", "needs_memory") if task_type == "ocr" else ("needs_planning",), + ) + wf = ctx.workflow_planner.choose(ctx, req) + result = await ctx.workflow_executor.execute(ctx, wf, req) + + # reward: success and speed (simple starter) + # reward in [0,1]: ok=1 else 0; subtract small penalty for slow + reward = (1.0 if result.ok else 0.0) - min(0.5, result.metrics.get("dt_ms", 0) / 60_000.0) + ctx.workflow_experience.record(req, result, reward) + ctx.workflow_planner.observe(req, wf.name, reward) + + console.print( + Panel( + f"ok={result.ok}\nworkflow={result.workflow_name}\nreward={reward:.3f}\noutputs={result.outputs}\ntrace={result.trace}", + title="execution", + ) + ) + continue + + # Plain chat message = queue plan in background (never block TUI; OpenCode/Claude Code style) + goal = msg.strip()[:200] + if not goal: + continue + + def _on_plan_done(t: asyncio.Task) -> None: + try: + if t.cancelled(): + return + exc = t.exception() + if exc is not None: + console.print(Panel(f"[red]Plan failed: {exc}[/]", title="plan", 
border_style="red")) + return + lines = t.result() + if lines and "planner failed" not in (lines[0] or ""): + console.print(_tasks_table(ctx)) + console.print( + Panel( + "[green]Plan ready.[/] Use [cyan]/run[/] to start, or [cyan]/plan [/] for another.", + title="plan", + border_style="green", + ) + ) + else: + console.print( + Panel("\n".join(lines) if lines else "[dim]No tasks.[/]", title="plan", border_style="yellow") + ) + except Exception as e: + console.print(Panel(f"[red]{e}[/]", title="plan", border_style="red")) + + plan_task = asyncio.create_task(plan_and_queue(ctx, goal)) + plan_task.add_done_callback(_on_plan_done) + console.print( + Panel( + f"[dim]Planning in background:[/] {goal}\n[dim]Keep typing — [/][cyan]/tasks[/] [dim]when ready, then [/][cyan]/run[/] [dim]. Run continues in parallel.[/]", + title="plan", + border_style="cyan", + ) + ) + continue diff --git a/agent_ext/workbench/worktrees.py b/agent_ext/workbench/worktrees.py new file mode 100644 index 0000000..df42feb --- /dev/null +++ b/agent_ext/workbench/worktrees.py @@ -0,0 +1,108 @@ +from __future__ import annotations + +import os +import shutil +import subprocess +from dataclasses import dataclass +from pathlib import Path + +WORKTREES_ROOT = Path(".agent_state/worktrees") + + +@dataclass(frozen=True) +class WorktreeHandle: + run_id: str + agent_name: str + branch: str + path: Path + + +def _run(cmd: list[str], *, cwd: Path | None = None) -> tuple[bool, str]: + env = os.environ.copy() # includes HTTP_PROXY/HTTPS_PROXY/etc. + p = subprocess.run(cmd, cwd=str(cwd) if cwd else None, env=env, capture_output=True, text=True) + ok = p.returncode == 0 + out = (p.stdout or "") + ("\n" if p.stdout and p.stderr else "") + (p.stderr or "") + return ok, out.strip() + + +def ensure_git_repo() -> None: + ok, _ = _run(["git", "rev-parse", "--is-inside-work-tree"]) + if not ok: + raise RuntimeError("Not inside a git repo. 
Worktrees require git.") + + +def create_worktree( + *, + run_id: str, + agent_name: str, + base_ref: str = "HEAD", + branch_prefix: str = "auto", +) -> WorktreeHandle: + """ + Creates a new branch + worktree at: + .agent_state/worktrees/// + """ + ensure_git_repo() + + wt_path = WORKTREES_ROOT / run_id / agent_name + wt_path.parent.mkdir(parents=True, exist_ok=True) + + # Unique branch name + safe_agent = "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in agent_name) + branch = f"{branch_prefix}/{run_id}/{safe_agent}" + + # Create branch (fails if exists; that’s fine—use a unique run_id) + ok, out = _run(["git", "branch", branch, base_ref]) + if not ok and "already exists" not in out.lower(): + raise RuntimeError(f"git branch failed: {out}") + + # Add worktree + # NOTE: --force is dangerous; avoid. If directory exists, wipe it and re-add. + if wt_path.exists(): + shutil.rmtree(wt_path) + + ok, out = _run(["git", "worktree", "add", str(wt_path), branch]) + if not ok: + raise RuntimeError(f"git worktree add failed: {out}") + + return WorktreeHandle(run_id=run_id, agent_name=agent_name, branch=branch, path=wt_path) + + +def worktree_diff(wt: WorktreeHandle) -> str: + """ + Unified diff of ALL changes in the worktree (edits + new files). + Stages everything first so new (untracked) files are captured. 
+ """ + # Stage all changes (including new/untracked files) + ok, out = _run(["git", "add", "-A"], cwd=wt.path) + if not ok: + raise RuntimeError(f"git add -A failed: {out}") + # Diff staged changes against HEAD to capture everything + ok, out = _run(["git", "diff", "--cached", "HEAD"], cwd=wt.path) + if not ok: + raise RuntimeError(out) + return out + + +def worktree_status(wt: WorktreeHandle) -> str: + ok, out = _run(["git", "status", "--porcelain"], cwd=wt.path) + if not ok: + raise RuntimeError(out) + return out + + +def cleanup_worktree(wt: WorktreeHandle, *, prune_branch: bool = False) -> None: + """ + Removes worktree directory and optionally deletes its branch. + """ + ensure_git_repo() + # Remove worktree + ok, out = _run(["git", "worktree", "remove", "--force", str(wt.path)]) + if not ok: + raise RuntimeError(f"git worktree remove failed: {out}") + + # Optionally delete branch + if prune_branch: + ok, out = _run(["git", "branch", "-D", wt.branch]) + if not ok: + raise RuntimeError(f"git branch -D failed: {out}") diff --git a/agent_ext/workbench/writer_runner.py b/agent_ext/workbench/writer_runner.py new file mode 100644 index 0000000..4470879 --- /dev/null +++ b/agent_ext/workbench/writer_runner.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +from .locks import LeaseLockStore +from .worktrees import WorktreeHandle, cleanup_worktree, create_worktree, worktree_diff + + +@dataclass(frozen=True) +class WriterResult: + ok: bool + diff: str + meta: dict[str, Any] + + +class WriterCoordinator: + def __init__(self): + self.locks = LeaseLockStore() + + async def run_writer( + self, + ctx, + *, + run_id: str, + agent_name: str, + write_key: str, + subagent, # must have async run(ctx, input, meta) + input: Any, + meta: dict[str, Any], + ttl_s: int = 900, + prune_branch: bool = False, + ) -> WriterResult: + owner = f"{run_id}:{agent_name}" + lease = self.locks.try_acquire(key=write_key, 
owner=owner, ttl_s=ttl_s) + if not lease: + return WriterResult(ok=False, diff="", meta={"error": f"lock busy: {write_key}"}) + + wt: WorktreeHandle | None = None + try: + wt = create_worktree(run_id=run_id, agent_name=agent_name) + # Tell the subagent to operate inside the worktree path + meta2 = dict(meta) + meta2["workdir"] = str(wt.path) + + # Important: subagent should ONLY write inside meta["workdir"] + res = await subagent.run(ctx, input=input, meta=meta2) + + diff = worktree_diff(wt) + return WriterResult(ok=res.ok, diff=diff, meta={"subagent_meta": res.meta, "worktree": str(wt.path)}) + + finally: + # Release lock and cleanup worktree + self.locks.release(lease) + if wt is not None: + cleanup_worktree(wt, prune_branch=prune_branch) diff --git a/agent_ext/workflow/__init__.py b/agent_ext/workflow/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent_ext/workflow/bandit.py b/agent_ext/workflow/bandit.py new file mode 100644 index 0000000..cacc079 --- /dev/null +++ b/agent_ext/workflow/bandit.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +import math +from collections import defaultdict + + +class UCB1Bandit: + """ + Simple, deterministic-ish, works well. + Chooses workflow with best upper confidence bound. 
+ """ + + def __init__(self): + self.counts: dict[str, int] = defaultdict(int) + self.values: dict[str, float] = defaultdict(float) + self.total: int = 0 + + def observe(self, arm: str, reward: float) -> None: + self.total += 1 + self.counts[arm] += 1 + n = self.counts[arm] + # incremental mean + self.values[arm] += (reward - self.values[arm]) / float(n) + + def choose(self, arms: list[str]) -> str: + # cold-start: pick untried first + for a in arms: + if self.counts[a] == 0: + return a + # UCB1 + best_arm = arms[0] + best_score = -1e9 + for a in arms: + avg = self.values[a] + bonus = math.sqrt(2.0 * math.log(max(1, self.total)) / float(self.counts[a])) + score = avg + bonus + if score > best_score: + best_score = score + best_arm = a + return best_arm diff --git a/agent_ext/workflow/builtins.py b/agent_ext/workflow/builtins.py new file mode 100644 index 0000000..107507a --- /dev/null +++ b/agent_ext/workflow/builtins.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +import asyncio +from dataclasses import dataclass +from typing import Any + +from .types import Capability, StepSpec, WorkflowSpec + + +@dataclass +class PlannerComponent: + capability = Capability( + name="planner_component", tags=("plan",), cost_hint=1, quality_hint=0.5, requires_model=False + ) + + async def run(self, ctx, state: dict[str, Any]) -> dict[str, Any]: + # quick heuristic “plan” into scratch + state["task"]["text"] + state["scratch"]["plan"] = [ + "analyze intent", + "search repo", + "execute relevant workflow steps", + "summarize output", + ] + await asyncio.sleep(0) + return state + + +@dataclass +class MemoryComponent: + capability = Capability( + name="memory_component", tags=("memory",), cost_hint=1, quality_hint=0.5, requires_model=False + ) + + async def run(self, ctx, state: dict[str, Any]) -> dict[str, Any]: + # placeholder: later wire SummarizingMemory / SlidingWindowMemory + state["scratch"]["memory_used"] = True + await asyncio.sleep(0) + return state + + 
+@dataclass +class OcrComponent: + capability = Capability(name="ocr_component", tags=("ocr",), cost_hint=3, quality_hint=0.5, requires_model=True) + + async def run(self, ctx, state: dict[str, Any]) -> dict[str, Any]: + # stub: later wire actual ingest/vision OCR pipeline + async with ctx.model_limiter: + # don’t call model yet; just show wiring + await asyncio.sleep(0) + state.setdefault("outputs", {}) + state["outputs"]["ocr_text"] = "(stub ocr output)" + return state + + +@dataclass +class RepoSearchComponent: + capability = Capability( + name="repo_search_component", tags=("search_repo",), cost_hint=1, quality_hint=0.5, requires_model=False + ) + + async def run(self, ctx, state: dict[str, Any]) -> dict[str, Any]: + query = state["task"]["text"] + res = await ctx.subagents.get("repo_grep").run(ctx, input=query, meta={"root": ".", "limit": 10}) + state.setdefault("outputs", {}) + state["outputs"]["repo_hits"] = res.output + return state + + +@dataclass +class SummarizeComponent: + capability = Capability( + name="summarize_component", tags=("summarize",), cost_hint=1, quality_hint=0.5, requires_model=False + ) + + async def run(self, ctx, state: dict[str, Any]) -> dict[str, Any]: + # cheap summary; later can be LLM + outs = state.get("outputs", {}) + state["outputs"]["summary"] = f"done. 
outputs keys={list(outs.keys())}" + await asyncio.sleep(0) + return state + + +def register_builtins(registry) -> None: + registry.register_component("plan", PlannerComponent()) + registry.register_component("memory", MemoryComponent()) + registry.register_component("ocr", OcrComponent()) + registry.register_component("repo_search", RepoSearchComponent()) + registry.register_component("summarize", SummarizeComponent()) + + # Workflow A: generic plan + search + summarize + registry.register_workflow( + WorkflowSpec( + name="wf_general", + steps=( + StepSpec("plan"), + StepSpec("repo_search"), + StepSpec("summarize"), + ), + meta={"task_type": "general"}, + ) + ) + + # Workflow B: OCR-ish (includes memory) + registry.register_workflow( + WorkflowSpec( + name="wf_ocr_with_memory", + steps=( + StepSpec("plan"), + StepSpec("memory"), + StepSpec("ocr"), + StepSpec("summarize"), + ), + meta={"task_type": "ocr"}, + ) + ) diff --git a/agent_ext/workflow/executor.py b/agent_ext/workflow/executor.py new file mode 100644 index 0000000..df16b3d --- /dev/null +++ b/agent_ext/workflow/executor.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import time +from typing import Any + +from .types import ExecutionResult, TaskRequest, WorkflowSpec + + +class WorkflowExecutor: + async def execute(self, ctx, wf: WorkflowSpec, req: TaskRequest) -> ExecutionResult: + state: dict[str, Any] = { + "task": { + "text": req.text, + "task_type": req.task_type, + "hints": list(req.hints), + "constraints": req.constraints, + }, + "scratch": {}, + } + trace: list[dict[str, Any]] = [] + t0 = time.time() + + ok = True + for step in wf.steps: + comp = ctx.workflow_registry.get_component(step.component_name) + step_t0 = time.time() + try: + state = await comp.run(ctx, state) + trace.append( + { + "step": step.component_name, + "ok": True, + "dt_ms": int((time.time() - step_t0) * 1000), + } + ) + except Exception as e: + ok = False + trace.append( + { + "step": step.component_name, + "ok": 
False, + "error": repr(e), + "dt_ms": int((time.time() - step_t0) * 1000), + } + ) + break + + dt_ms = int((time.time() - t0) * 1000) + metrics = {"dt_ms": dt_ms} + outputs = state.get("outputs", {}) + return ExecutionResult(ok=ok, workflow_name=wf.name, outputs=outputs, metrics=metrics, trace=trace) diff --git a/agent_ext/workflow/experience.py b/agent_ext/workflow/experience.py new file mode 100644 index 0000000..8cc58a3 --- /dev/null +++ b/agent_ext/workflow/experience.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path + +from .types import ExecutionResult, TaskRequest + +EXP_FILE = Path(".agent_state/workflow_experience.json") + + +def _bucket(req: TaskRequest) -> str: + hints = ",".join(sorted(req.hints)) if req.hints else "" + return f"{req.task_type}|{hints}" + + +@dataclass +class ExperienceStore: + path: Path = EXP_FILE + + def _read_data(self) -> dict: + if not self.path.exists(): + return {"buckets": {}} + try: + raw = self.path.read_text(encoding="utf-8").strip() + if not raw: + return {"buckets": {}} + return json.loads(raw) + except (json.JSONDecodeError, OSError): + return {"buckets": {}} + + def __post_init__(self): + self.path.parent.mkdir(parents=True, exist_ok=True) + if not self.path.exists(): + self.path.write_text(json.dumps({"buckets": {}}, indent=2), encoding="utf-8") + + def record(self, req: TaskRequest, result: ExecutionResult, reward: float) -> None: + data = self._read_data() + b = _bucket(req) + data["buckets"].setdefault(b, []) + data["buckets"][b].append( + { + "workflow": result.workflow_name, + "ok": result.ok, + "reward": reward, + "dt_ms": result.metrics.get("dt_ms"), + } + ) + self.path.write_text(json.dumps(data, indent=2), encoding="utf-8") + + def get_bucket_stats(self, req: TaskRequest) -> list[dict]: + data = self._read_data() + return data.get("buckets", {}).get(_bucket(req), []) diff --git a/agent_ext/workflow/planner.py 
b/agent_ext/workflow/planner.py new file mode 100644 index 0000000..4bfb0df --- /dev/null +++ b/agent_ext/workflow/planner.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from .bandit import UCB1Bandit +from .experience import ExperienceStore +from .types import TaskRequest, WorkflowSpec + + +class WorkflowPlanner: + def __init__(self, exp: ExperienceStore): + self.exp = exp + self.bandits = {} # bucket -> bandit + + def _bucket(self, req: TaskRequest) -> str: + hints = ",".join(sorted(req.hints)) if req.hints else "" + return f"{req.task_type}|{hints}" + + def candidates(self, ctx, req: TaskRequest) -> list[str]: + # Very simple matching rules to start; extend as needed + names = ctx.workflow_registry.list_workflows() + out = [] + for n in names: + wf = ctx.workflow_registry.workflows[n] + tags = ctx.workflow_registry.workflow_capability_signature(wf) + + if req.task_type == "ocr" and "ocr" not in tags: + continue + if "needs_memory" in req.hints and "memory" not in tags: + continue + if "needs_planning" in req.hints and "plan" not in tags: + continue + + out.append(n) + + return out or names # fallback + + def choose(self, ctx, req: TaskRequest) -> WorkflowSpec: + cands = self.candidates(ctx, req) + bucket = self._bucket(req) + + bandit = self.bandits.get(bucket) + if bandit is None: + bandit = UCB1Bandit() + # warm-start from experience + for row in self.exp.get_bucket_stats(req): + bandit.observe(row["workflow"], float(row["reward"])) + self.bandits[bucket] = bandit + + chosen = bandit.choose(cands) + return ctx.workflow_registry.workflows[chosen] + + def observe(self, req: TaskRequest, workflow_name: str, reward: float) -> None: + bucket = self._bucket(req) + bandit = self.bandits.get(bucket) + if bandit is None: + bandit = UCB1Bandit() + self.bandits[bucket] = bandit + bandit.observe(workflow_name, reward) diff --git a/agent_ext/workflow/registry.py b/agent_ext/workflow/registry.py new file mode 100644 index 0000000..1fb6d20 --- /dev/null +++ 
b/agent_ext/workflow/registry.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from .types import Component, WorkflowSpec + + +@dataclass +class Registry: + components: dict[str, Component] + workflows: dict[str, WorkflowSpec] + + def __init__(self): + self.components = {} + self.workflows = {} + + def register_component(self, name: str, component: Component) -> None: + self.components[name] = component + + def register_workflow(self, wf: WorkflowSpec) -> None: + self.workflows[wf.name] = wf + + def list_components(self) -> list[str]: + return sorted(self.components.keys()) + + def list_workflows(self) -> list[str]: + return sorted(self.workflows.keys()) + + def get_component(self, name: str) -> Component: + return self.components[name] + + def find_components_by_tag(self, tag: str) -> list[str]: + out = [] + for name, c in self.components.items(): + if tag in c.capability.tags: + out.append(name) + return sorted(out) + + def workflow_capability_signature(self, wf: WorkflowSpec) -> tuple[str, ...]: + tags = [] + for step in wf.steps: + comp = self.components.get(step.component_name) + if comp: + tags.extend(list(comp.capability.tags)) + return tuple(sorted(set(tags))) diff --git a/agent_ext/workflow/types.py b/agent_ext/workflow/types.py new file mode 100644 index 0000000..b9be0a3 --- /dev/null +++ b/agent_ext/workflow/types.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Protocol + +Json = dict[str, Any] + + +@dataclass(frozen=True) +class Capability: + """ + Declares what a component can do. + Example tags: "plan", "ocr", "memory", "search_repo", "patch", "gates", "summarize" + """ + + name: str + tags: tuple[str, ...] + cost_hint: int = 1 # rough relative cost + quality_hint: float = 0.5 # rough prior + requires_model: bool = False + + +class Component(Protocol): + """ + A runnable building block: takes ctx + state; returns updated state. 
+ """ + + capability: Capability + + async def run(self, ctx, state: Json) -> Json: ... + + +@dataclass(frozen=True) +class StepSpec: + component_name: str + input_keys: tuple[str, ...] = () + output_key: str | None = None + meta: Json = field(default_factory=dict) + + +@dataclass(frozen=True) +class WorkflowSpec: + """ + A workflow is a DAG in disguise (for now sequence; can grow to DAG later). + """ + + name: str + steps: tuple[StepSpec, ...] + meta: Json = field(default_factory=dict) + + +@dataclass(frozen=True) +class TaskRequest: + text: str + task_type: str = "general" # e.g. "ocr", "code_change", "research" + hints: tuple[str, ...] = () # e.g. ("needs_ocr", "needs_memory") + constraints: Json = field(default_factory=dict) + + +@dataclass(frozen=True) +class ExecutionResult: + ok: bool + workflow_name: str + outputs: Json + metrics: Json + trace: list[Json] # per-step trace for learning/debug diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..a76ae42 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,25 @@ +services: + workbench: + build: + context: . 
+ dockerfile: docker/Dockerfile + working_dir: /repo + volumes: + - ./:/repo + - ./.agent_state:/repo/.agent_state + environment: + + - HTTP_PROXY=${HTTP_PROXY} + - http_proxy=${http_proxy} + - HTTPS_PROXY=${HTTPS_PROXY} + - https_proxy=${https_proxy} + - NO_PROXY=${NO_PROXY} + - no_proxy=${no_proxy} + - REQUESTS_CA_BUNDLE=${REQUESTS_CA_BUNDLE} + - SSL_CERT_FILE=${SSL_CERT_FILE} + + # point to your local OpenAI-compatible server + - LLM_BASE_URL=${LLM_BASE_URL:-http://host.docker.internal:8000/v1} + - LLM_API_KEY=${LLM_API_KEY:-local} + - LLM_MODEL=${LLM_MODEL:-gpt-oss-120b} + command: ["python", "-m", "agent_ext.workbench", "--use-openai-chat-model"] diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..0969302 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,22 @@ +FROM python:3.11-slim + +WORKDIR /repo + +# optional but helpful +RUN apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/* + +# If you use uv in this repo already, swap to uv here. +# Keeping pip-simple so it works everywhere tomorrow. +COPY pyproject.toml /repo/pyproject.toml +COPY README.md /repo/README.md + +# Install runtime deps only (you can refine tomorrow) +RUN pip install --no-cache-dir rich + +# If you want pydantic-ai here, add: +# RUN pip install --no-cache-dir "agent-patterns[agent]" +# (or pip install pydantic-ai) + +COPY . /repo + +CMD ["python", "-m", "agent_ext.workbench"] diff --git a/docs/AUTO_AGENT.md b/docs/AUTO_AGENT.md new file mode 100644 index 0000000..32e2e89 --- /dev/null +++ b/docs/AUTO_AGENT.md @@ -0,0 +1,179 @@ +# Fully automated self-improving code agent + +Goal: run with **no intervention** — test, push to a branch, store state (optionally on GitHub), and **deconflict** when multiple people or runners submit PRs. + +--- + +## What you already have + +- **Auto-adopt**: Workbench loop and cog `loop_v2` apply patch → run gates → commit & push when `AUTO_ADOPT=1` and score ≥ threshold. 
+- **Worktrees**: Isolated branches per run (`auto/<run_id>/<agent_name>`); patches applied in worktree, then diff applied to main tree. +- **Gates**: Import check, compile check, optional pytest (set `pytest_paths` in GatePlan). +- **Push**: `adopt.commit_and_push(message, branch=AUTO_PUSH_BRANCH)` pushes to `origin/<branch>`. +- **Cog state**: `CogState` + `RegressionMemory` in `.agent_state/` (local); anti-thrash and fail_streak. +- **Locks**: `LeaseLockStore` in `.agent_state/locks/` (local, single-machine). + +--- + +## What’s needed (checklist) + +### 1. Headless daemon entry point (no TUI) + +- **Run the cog loop** (`loop_v2.run_cognitive_cycle`) in a loop with `run_forever` in `agent_ext.cog.daemon`. +- **Same ctx as workbench**: Use `build_ctx()` so the daemon has model, subagents, search, `cog_state`, `regression_memory`. +- **Load .env** at daemon startup (e.g. in a `__main__` for `agent_ext.cog.daemon` or a single `agent_ext.run` entry). +- **Env vars**: `AGENT_LOOP_SLEEP`, `AGENT_DAEMON_GOAL`, `COG_*`, `AUTO_ADOPT`, `AUTO_PUSH_BRANCH`, `AUTO_COMMIT_THRESHOLD`, `LLM_*`. + +### 2. Test before push (already there, tighten) + +- In `loop_v2` and workbench implement path, gates already run (import + compile + optional pytest). +- Ensure **pytest** runs when you have `tests/`: set `GatePlan(pytest_paths=["tests"])` when appropriate (e.g. from env `RUN_PYTEST=1` or if `tests/` exists). + +### 3. Push to “its” branch + +- **Single shared branch**: `AUTO_PUSH_BRANCH=dev` — all runners push to `dev`; need pull-before-push and conflict handling. +- **Per-runner branch** (recommended for multi-actor): e.g. `AUTO_PUSH_BRANCH=auto/$(hostname)` or `auto/$RUNNER_ID`. Each runner pushes to its own branch; humans or a bot open PRs from these branches. No direct overwrites; conflicts only at PR merge. + +### 4. Pull before push (deconflict with others) + +- Before `commit_and_push`: **fetch** and **merge** (or **rebase**) `origin/<branch>` into current branch so you push on top of latest. 
+- If merge/rebase has **conflicts**: abort commit, leave working tree clean (e.g. `git merge --abort` / `git rebase --abort`), optionally back off and retry later. +- If **push** fails (e.g. non–fast-forward): pull again (merge/rebase), retry push once or twice; then back off and retry next cycle. + +### 5. Store state on GitHub + +- **Option A – Same repo, state branch**: Push `.agent_state` (or a subset: `cog_state.json`, `regression_memory.json`, `patches/`) to a branch like `agent-state/main` or `agent-state/<runner>`. Other runners pull this branch to sync state (with the same conflict rules as code). +- **Option B – State in working branch**: Commit `.agent_state` on the same branch you push code to (e.g. `dev`). Simpler; state and code evolve together; merge conflicts can include state files. +- **Option C – External store**: GitHub API, separate repo, or key-value store. More work; use if you need stronger consistency or multi-repo. + +### 6. Multi-actor deconflict (multiple people + PRs) + +- **Pull before push** (above) so each runner integrates others’ commits before pushing. +- **Per-runner branches** so each agent has its own ref; no direct conflict on the same branch. +- **Distributed lock** (optional): To allow only one runner to commit at a time, use a lock that all runners see (e.g. a file in the repo or a branch like `lock/agent` that you create/delete via Git, or an external lock service). Current `LeaseLockStore` is local only. +- **PR workflow**: Humans (or a bot) merge PRs from `auto/<runner>` into `main`/`dev`. Branch protection and CI on the target branch enforce tests and review; the agent only pushes to its own branch. + +### 7. No intervention + +- Daemon runs **forever** (`run_forever`), loads **.env** once at startup, uses **same build_ctx** (model, search, subagents, gates). +- On errors: log and **back off** (e.g. longer sleep), don’t crash; next cycle will fetch latest and try again. 
+ +--- + +## Env vars (summary) + +| Var | Purpose | +|-----|--------| +| `LLM_BASE_URL`, `LLM_API_KEY`, `LLM_MODEL` | Model for llm_patch / analyze / design | +| `AUTO_ADOPT` | 1 = auto commit & push after gates pass | +| `AUTO_PUSH_BRANCH` | Branch to push to (e.g. `dev` or `auto/$(hostname)`) | +| `AUTO_COMMIT_THRESHOLD` | Min score (0–100) to auto-adopt | +| `AGENT_LOOP_SLEEP` | Seconds between daemon cycles | +| `AGENT_DAEMON_GOAL` | Default goal for cog loop | +| `COG_MAX_STEPS`, `COG_MAX_MODEL_CALLS`, `COG_MAX_PARALLEL_WRITERS` | Cog budget | +| `MAX_DIFF_CHARS` | Max diff size to consider | +| `RUNNER_ID` or hostname | Optional: for per-runner branch names | + +--- + +## Suggested order of implementation + +1. **Daemon entry point**: `python -m agent_ext.cog.daemon` (or similar) that loads `.env`, builds ctx with model, runs `run_forever(ctx)`. +2. **adopt.py**: Add `fetch_and_merge_origin(branch)` (or rebase) before commit; on merge conflict, abort and return error; in `commit_and_push` catch push failure, pull again, retry. +3. **Per-runner branch**: Set `AUTO_PUSH_BRANCH=auto/$(RUNNER_ID)` or `auto/$(hostname)` when running multiple agents. +4. **State on GitHub**: Option B (commit `.agent_state` on same branch) or Option A (dedicated `agent-state` branch) and pull/push it in the daemon. + +After that, add optional distributed lock and/or PR-bot integration if you need stricter single-writer or automated PR creation. + +--- + +## Recommendations: multiple people running at once + +To **ensure correct merging of state, code, and branches** when many people (or machines) run the agent concurrently: + +### 1. **Per-runner branches (strongly recommended)** + +- **Code:** Each runner uses a **unique branch** for its commits, e.g. `auto/$RUNNER_ID` or `auto/$(hostname)`. +- **Why:** No two runners push to the same ref → no direct push conflicts. 
Conflicts only happen when a human/bot merges a PR from `auto/alice` into `main`/`dev`, and that’s a single place to resolve. +- **How:** Set `RUNNER_ID` (or derive from hostname) and `AUTO_PUSH_BRANCH=auto/$RUNNER_ID` in each runner’s env. Document in `.env.example`. + +### 2. **State: keep it per-runner (simplest and safe)** + +- **Option A – Per-runner state branch:** e.g. `agent-state/$RUNNER_ID`. Each runner pushes/pulls only its own state. No state merge conflicts between people. +- **Option B – State on same branch as code:** Commit `.agent_state` on `auto/$RUNNER_ID` along with code. Again, no cross-runner state conflict. +- **Recommendation:** Start with **per-runner state** (A or B). No need to merge state files between runners; each agent has its own cog_state and regression_memory. + +### 3. **If you want shared state (one learning corpus)** + +- Use a **single state branch** (e.g. `agent-state/main`) that all runners push to and pull from. +- **Merge strategy:** + - **Pull-before-push:** Before writing state, fetch and merge `origin/agent-state/main` into your local state branch (same as code in `adopt.py`). Then commit and push. + - **Conflict resolution:** State files (JSON) can conflict. Options: + - **Last-write-wins:** On pull, if remote state is newer (by commit timestamp or a field in the file), overwrite local and push. Simple but can drop one runner’s updates. + - **Key-wise merge:** Implement a merge for `regression_memory.json` (e.g. merge by file path / commit) and for `cog_state.json` (e.g. take max fail_streak, merge timestamps). More work, preserves more information. + - **Short-lived lock:** A file or branch that acts as a lock (e.g. `lock/agent` or a blob on GitHub); only one runner writes state at a time. Easiest to reason about; serializes state updates. + +### 4. **Code merge (already in place)** + +- **Pull before push** in `adopt.py` (fetch + merge/rebase) ensures each runner’s branch is based on latest `origin/auto/$RUNNER_ID`. 
For per-runner branches, the only new commits on that branch are from that runner, so merge/rebase is usually trivial. +- When **merging a PR** from `auto/alice` into `dev`, the merge is done by a human or bot; conflict resolution is once per PR. + +### 5. **Concrete steps to support multiple people** + +| Step | Action | +|------|--------| +| 1 | **Enforce RUNNER_ID:** In daemon/adopt, set `AUTO_PUSH_BRANCH=auto/${RUNNER_ID:-$(hostname)}` so every runner has a unique branch. | +| 2 | **Ensure branch exists:** Before first push, create `auto/$RUNNER_ID` from current `main`/`dev` if it doesn’t exist (e.g. `git push -u origin HEAD:auto/$RUNNER_ID`). | +| 3 | **State (per-runner):** Optional env `AGENT_STATE_BRANCH=agent-state/$RUNNER_ID`. At daemon start: pull that branch (or create it), restore `.agent_state` from it if present. After successful adopt: commit `.agent_state` and push to that branch. | +| 4 | **State (shared):** If you later add `agent-state/main`, add a small “state sync” that pull/merges state (last-write-wins or key-merge), then push; optionally use a lock to avoid conflicting state pushes. | + +### 6. **Single shared learning state, but don’t merge state with code** + +You want one shared learning state, and when a PR from `auto/alice` is merged into `main` or `dev`, you **do not** want Alice’s local `.agent_state` to be merged in — only her code. + +**Recommended: state never lives on the code branch** + +- **Code branch** (`auto/$RUNNER_ID`): Runners commit and push **only code**. Do **not** commit `.agent_state` on this branch. +- **State branch** (`agent-state/main`): All runners pull/push **only state** (e.g. `cog_state.json`, `regression_memory.json`, and any other shared state files) to this branch. +- When you merge `auto/alice` → `main`/`dev`, the PR has no state files, so nothing to exclude. Main/dev never get runner-specific state. 
+ +**How to enforce “no state on code branch”** + +- In the agent: when adopting, **commit only code** to the code branch (no `git add .agent_state` there). Separately, sync state to `agent-state/main` (e.g. clone/fetch that branch, copy `.agent_state` into it, commit, push). So code and state are always pushed to different branches. +- Optionally keep **`.agent_state/` in `.gitignore`** on the code branch so a mistaken `git add -A` doesn’t add state. (Your repo already ignores most of `.agent_state`; you can keep or tighten that so no state is tracked on code branches.) + +**If state ever gets committed on a runner’s branch: merge rule so main/dev don’t take it** + +If a runner has already committed `.agent_state` on `auto/alice` and you merge that PR into `main`/`dev`, Git would normally merge those files. To **discard** incoming state and keep main’s version (or leave main without those files), use a **custom merge driver** on `main`/`dev`: + +1. **Define a “keep ours” merge driver** (run once per repo, or document for maintainers): + + ```bash + git config merge.keep-ours.name "keep our version (ignore incoming)" + git config merge.keep-ours.driver "true" + ``` + + (`true` exits 0 and keeps the current branch’s version; the incoming changes are not applied.) + +2. **In the repo root, `.gitattributes`** (already in repo) contains: + + ``` + .agent_state/** merge=keep-ours + ``` + Commit this on `main`/`dev` so all merges into those branches use it. + +3. When you merge `auto/alice` into `main`, Git will use `keep-ours` for any path under `.agent_state/`: it keeps main’s version and ignores Alice’s. So her local state is not merged in. + +**Summary** + +| Goal | Approach | +|------|----------| +| Shared learning state | All runners push/pull state to `agent-state/main` only. | +| Code PRs don’t merge state | Don’t commit `.agent_state` on code branches; state lives only on `agent-state/main`. 
| +| Safety net if state was committed on a branch | On main/dev, set `merge=keep-ours` for `.agent_state/**` in `.gitattributes` and define the `keep-ours` merge driver so merges into main/dev never take state from the PR. | + +### 7. **Summary** + +- **Multiple people, no shared state:** Per-runner code branch + per-runner state (branch or on same branch). No state merge logic; no code conflict between runners. +- **Multiple people, shared state:** Per-runner code branch + shared state branch with pull-before-push and a clear merge/lock strategy (last-write-wins, key-merge, or lock). **Do not commit state on the code branch** so merging PRs into main/dev never merges in local state. +- **Merge rule:** Use `.gitattributes` with `merge=keep-ours` for `.agent_state/**` on main/dev so that if state was ever committed on a runner branch, it is not merged into main/dev. diff --git a/evals/__init__.py b/evals/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/evals/cases/search.jsonl b/evals/cases/search.jsonl new file mode 100644 index 0000000..af11643 --- /dev/null +++ b/evals/cases/search.jsonl @@ -0,0 +1,3 @@ +{"query":"RunContext", "expect_any":["run_context.py","context"]} +{"query":"worktree", "expect_any":["worktrees.py"]} +{"query":"bm25", "expect_any":["bm25.py","search"]} diff --git a/evals/search_evals.py b/evals/search_evals.py new file mode 100644 index 0000000..133bb44 --- /dev/null +++ b/evals/search_evals.py @@ -0,0 +1,10 @@ +from __future__ import annotations + +def run_search_smoke(ctx) -> dict: + """ + Deterministic smoke test: ensures BM25 index returns something for common terms. + Expand later using pydantic-evals. 
+ """ + q = "RunContext" + hits = ctx.search.search(q, top_k=10) + return {"query": q, "num_hits": len(hits), "top": hits[:3]} diff --git a/evals/workbench_evals.py b/evals/workbench_evals.py new file mode 100644 index 0000000..e69de29 diff --git a/evals/workflow_evals.py b/evals/workflow_evals.py new file mode 100644 index 0000000..e69de29 diff --git a/main.py b/main.py deleted file mode 100644 index da316c1..0000000 --- a/main.py +++ /dev/null @@ -1,6 +0,0 @@ -def main(): - print("Hello from agent-patterns!") - - -if __name__ == "__main__": - main() diff --git a/pyproject.toml b/pyproject.toml index 04f8170..b181d78 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,26 +1,76 @@ [project] name = "agent-patterns" -version = "0.1.0" -description = "Add your description here" +dynamic = ["version"] +description = "Modular, pluggable subsystems for building AI agents with pydantic-ai" readme = "README.md" requires-python = ">=3.12" dependencies = [ "asyncpg>=0.31.0", + "python-dotenv>=1.0.0", "httpx>=0.28.1", "pdf2img>=0.1.2", "pydantic>=2.12.5", -] -[project.optional-dependencies] -# Install with: pip install agent-patterns[agent] or uv add "agent-patterns[agent]" -# Use this extra when you want PydanticAIAgentBase and LLMVisionOCREngine. -# If your app already has pydantic-ai (any version), you can omit [agent] and -# the package will use your installed pydantic-ai when you use the agent/vision OCR. 
-agent = [ + "python-dotenv>=1.2.1", + "python-pptx>=1.0.2", + "rich>=14.3.2", + "python-docx>=1.2.0", + "reportlab>=4.4.10", "pydantic-ai>=1.60.0", ] -docs = [ - "python-docx>=1.2.0", - "python-pptx>=1.0.2", - "reportlab>=4.4.10", -] \ No newline at end of file +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.version] +path = "agent_ext/__init__.py" + +[tool.hatch.build.targets.wheel] +only-include = ["agent_ext"] + +[tool.hatch.build.targets.wheel.force-include] +"__init__.py" = "agent_patterns/__init__.py" +"run_context.py" = "agent_patterns/run_context.py" +"config.py" = "agent_patterns/config.py" + +[dependency-groups] +dev = [ + "pytest>=9.0.2", + "pytest-asyncio>=1.3.0", + "ruff>=0.11.0", +] + +[project.scripts] +workbench = "agent_ext.workbench.__main__:main" +agent-cog-daemon = "agent_ext.cog.__main__:main" + +# --- Ruff (linter + formatter) --- + +[tool.ruff] +target-version = "py312" +line-length = 120 + +[tool.ruff.lint] +select = ["E", "F", "W", "I", "UP", "B", "SIM"] +ignore = [ + "E501", # line too long (handled by formatter) + "E402", # module-level import not at top (intentional: _ensure_root_importable) + "E731", # lambda assignment (used in tests and callbacks) + "E741", # ambiguous variable name (l, O, I — common in math/loop code) + "B008", # function call in default arg (pydantic pattern) + "B905", # zip strict (py310+) + "UP007", # X | Y union syntax (some places use Optional) + "SIM108", # ternary instead of if-else (sometimes less readable) + "F401", # unused import (many re-exports in __init__.py files) +] + +[tool.ruff.lint.isort] +known-first-party = ["agent_ext", "agent_patterns"] + +[tool.ruff.format] +quote-style = "double" + +# --- Pytest --- + +[tool.pytest.ini_options] +asyncio_mode = "auto" diff --git a/run_context.py b/run_context.py index 3d71ea8..a2d8afe 100644 --- a/run_context.py +++ b/run_context.py @@ -58,6 +58,9 @@ class RunContext: memory: Any = None rlm: Any = None todo: Any = 
None # TodoToolset for task CRUD + # cog daemon / self-improve (workbench build_ctx sets these) + cog_state: Any = None + regression_memory: Any = None @dataclass(frozen=True) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_audit_gaps.py b/tests/test_audit_gaps.py new file mode 100644 index 0000000..1917e0a --- /dev/null +++ b/tests/test_audit_gaps.py @@ -0,0 +1,185 @@ +"""Tests for gap-fill code: strategies, async guardrail, decorators, prompts, wiring.""" + +from __future__ import annotations + +import pytest + +from agent_ext.hooks import ( + AgentMiddleware, + AggregationStrategy, + AsyncGuardrailMiddleware, + GuardrailTiming, + InputBlocked, + middleware_from_functions, +) +from agent_ext.hooks.strategies import GuardrailTiming as GT +from agent_ext.run_context import Policy, RunContext +from agent_ext.subagents.prompts import ( + SUBAGENT_SYSTEM_PROMPT, + get_subagent_system_prompt, + get_task_instructions_prompt, +) +from agent_ext.subagents.types import SubAgentConfig + + +def _make_ctx(): + class _C(dict): + def get(self, k, d=None): + return super().get(k, d) + + def set(self, k, v): + super().__setitem__(k, v) + + class _L: + def info(self, msg, **k): + pass + + def warning(self, msg, **k): + pass + + def error(self, msg, **k): + pass + + class _A: + def put_json(self, k, o): + return k + + return RunContext( + case_id="c1", + session_id="s1", + user_id="u1", + policy=Policy(allow_tools=True), + cache=_C(), + logger=_L(), + artifacts=_A(), + ) + + +class TestGuardrailTiming: + def test_enum_values(self): + assert GT.BLOCKING.value == "blocking" + assert GT.CONCURRENT.value == "concurrent" + assert GT.ASYNC_POST.value == "async_post" + + +class TestAggregationStrategy: + def test_new_values_exist(self): + assert AggregationStrategy.ALL_MUST_PASS + assert AggregationStrategy.FIRST_SUCCESS + assert AggregationStrategy.RACE + assert AggregationStrategy.COLLECT_ALL + + +class 
TestAsyncGuardrail: + @pytest.mark.asyncio + async def test_blocking_mode_passes(self): + class PassGuardrail(AgentMiddleware): + async def before_run(self, ctx, prompt): + return prompt + + grd = AsyncGuardrailMiddleware(PassGuardrail(), timing=GuardrailTiming.BLOCKING) + result = await grd.before_run(_make_ctx(), "hello") + assert result == "hello" + + @pytest.mark.asyncio + async def test_blocking_mode_blocks(self): + class BlockGuardrail(AgentMiddleware): + async def before_run(self, ctx, prompt): + raise InputBlocked("blocked!") + + grd = AsyncGuardrailMiddleware(BlockGuardrail(), timing=GuardrailTiming.BLOCKING) + with pytest.raises(InputBlocked): + await grd.before_run(_make_ctx(), "hello") + + @pytest.mark.asyncio + async def test_async_post_logs_but_passes(self): + class BlockGuardrail(AgentMiddleware): + async def before_run(self, ctx, prompt): + raise InputBlocked("blocked!") + + grd = AsyncGuardrailMiddleware(BlockGuardrail(), timing=GuardrailTiming.ASYNC_POST) + # before_run should pass (ASYNC_POST doesn't check before) + result = await grd.before_run(_make_ctx(), "hello") + assert result == "hello" + # after_run should log but not raise + output = await grd.after_run(_make_ctx(), "hello", "output") + assert output == "output" + + +class TestDecoratorMiddleware: + @pytest.mark.asyncio + async def test_before_run_decorator(self): + async def log_prompt(ctx, prompt): + return f"[logged] {prompt}" + + mw = middleware_from_functions(before_run=log_prompt) + result = await mw.before_run(_make_ctx(), "hello") + assert result == "[logged] hello" + + @pytest.mark.asyncio + async def test_multiple_hooks(self): + calls = [] + + async def br(ctx, prompt): + calls.append("before_run") + return prompt + + async def ar(ctx, prompt, output): + calls.append("after_run") + return output + + mw = middleware_from_functions(before_run=br, after_run=ar) + await mw.before_run(_make_ctx(), "p") + await mw.after_run(_make_ctx(), "p", "o") + assert calls == ["before_run", 
"after_run"] + + @pytest.mark.asyncio + async def test_noop_when_no_functions(self): + mw = middleware_from_functions() + result = await mw.before_run(_make_ctx(), "hello") + assert result == "hello" + + +class TestSubagentPrompts: + def test_system_prompt_not_empty(self): + assert len(SUBAGENT_SYSTEM_PROMPT) > 50 + + def test_get_subagent_system_prompt(self): + configs = [ + SubAgentConfig(name="researcher", description="Researches topics", instructions="..."), + SubAgentConfig(name="writer", description="Writes content", instructions="...", can_ask_questions=False), + ] + prompt = get_subagent_system_prompt(configs) + assert "researcher" in prompt + assert "writer" in prompt + assert "cannot ask" in prompt + + def test_task_instructions(self): + instructions = get_task_instructions_prompt("Fix the bug", can_ask_questions=True, max_questions=3) + assert "Fix the bug" in instructions + assert "3 questions" in instructions + + def test_task_instructions_no_questions(self): + instructions = get_task_instructions_prompt("Do it", can_ask_questions=False) + assert "best judgment" in instructions + + +class TestRuntimeWiring: + def test_build_ctx_has_middleware(self): + from agent_ext.workbench.runtime import build_ctx + + ctx = build_ctx() + assert hasattr(ctx, "middleware_chain") + assert len(ctx.middleware_chain) == 2 + assert hasattr(ctx, "middleware_context") + assert hasattr(ctx, "message_bus") + assert hasattr(ctx, "task_manager") + assert hasattr(ctx, "module_registry") + + def test_modules_loaded(self): + from agent_ext.workbench.runtime import build_ctx + + ctx = build_ctx() + module_names = list(ctx.module_registry.modules.keys()) + assert "core" in module_names + assert "self_improve" in module_names diff --git a/tests/test_backends_new.py b/tests/test_backends_new.py new file mode 100644 index 0000000..0b32cd9 --- /dev/null +++ b/tests/test_backends_new.py @@ -0,0 +1,155 @@ +"""Tests for the overhauled backends system.""" + +from __future__ import 
annotations + +import pytest + +from agent_ext.backends import ( + DEFAULT_RULESET, + PERMISSIVE_RULESET, + READONLY_RULESET, + PermissionChecker, + StateBackend, + apply_hashline_edit, + create_ruleset, + format_hashline_output, + line_hash, +) + + +class TestStateBackend: + def test_write_and_read(self): + sb = StateBackend() + sb.write_text("src/app.py", "print('hello')") + assert sb.read_text("src/app.py") == "print('hello')" + + def test_read_missing_raises(self): + sb = StateBackend() + with pytest.raises(FileNotFoundError): + sb.read_text("nonexistent.py") + + def test_list_files(self): + sb = StateBackend() + sb.write_text("src/a.py", "a") + sb.write_text("src/b.py", "b") + assert sb.list("/src") == ["a.py", "b.py"] + + def test_glob(self): + sb = StateBackend() + sb.write_text("src/a.py", "a") + sb.write_text("src/b.txt", "b") + result = sb.glob("**/*.py") + assert any("a.py" in r for r in result) + + def test_edit(self): + sb = StateBackend() + sb.write_text("f.py", "x = 1\ny = 2") + result = sb.edit("/f.py", "x = 1", "x = 99") + assert result.error is None + assert "x = 99" in sb.read_text("f.py") + + def test_edit_not_found(self): + sb = StateBackend() + sb.write_text("f.py", "hello") + result = sb.edit("/f.py", "nonexistent", "replacement") + assert result.error is not None + + def test_grep(self): + sb = StateBackend() + sb.write_text("a.py", "def hello():\n pass") + sb.write_text("b.py", "x = 1") + matches = sb.grep_raw("hello") + assert isinstance(matches, list) + assert len(matches) == 1 + assert matches[0].path == "/a.py" + + def test_read_numbered(self): + sb = StateBackend() + sb.write_text("f.py", "line1\nline2\nline3") + output = sb.read_numbered("/f.py") + assert "1\tline1" in output + assert "3\tline3" in output + + def test_path_traversal_blocked(self): + sb = StateBackend() + with pytest.raises(PermissionError): + sb.write_text("../../etc/passwd", "hacked") + + +class TestPermissions: + def test_readonly_allows_read(self): + checker = 
PermissionChecker(READONLY_RULESET) + assert checker.is_allowed("read", "/src/app.py") + + def test_readonly_blocks_write(self): + checker = PermissionChecker(READONLY_RULESET) + assert not checker.is_allowed("write", "/src/app.py") + + def test_permissive_allows_write(self): + checker = PermissionChecker(PERMISSIVE_RULESET) + assert checker.is_allowed("write", "/src/app.py") + + def test_secrets_always_denied(self): + for ruleset in [READONLY_RULESET, PERMISSIVE_RULESET, DEFAULT_RULESET]: + checker = PermissionChecker(ruleset) + assert not checker.is_allowed("read", "**/.env") + + def test_custom_ruleset(self): + ruleset = create_ruleset(allow_read=True, allow_write=True, allow_execute=False) + checker = PermissionChecker(ruleset) + assert checker.is_allowed("read", "/f.py") + assert checker.is_allowed("write", "/f.py") + assert not checker.is_allowed("execute", "ls") + + def test_require_raises_on_deny(self): + checker = PermissionChecker(READONLY_RULESET) + with pytest.raises(PermissionError): + checker.require("write", "/f.py") + + +class TestHashline: + def test_line_hash_deterministic(self): + h1 = line_hash("hello world") + h2 = line_hash("hello world") + assert h1 == h2 + assert len(h1) == 2 + + def test_different_content_different_hash(self): + h1 = line_hash("hello") + h2 = line_hash("world") + assert h1 != h2 + + def test_format_output(self): + content = "line one\nline two\nline three\n" + output = format_hashline_output(content) + assert "1:" in output + assert "2:" in output + assert "3:" in output + assert "|line one" in output + + def test_apply_edit_success(self): + content = "first\nsecond\nthird\n" + h = line_hash("second") + new_content, error = apply_hashline_edit(content, start_line=2, start_hash=h, new_content="replaced") + assert error is None + assert "replaced" in new_content + assert "second" not in new_content + + def test_apply_edit_hash_mismatch(self): + content = "first\nsecond\nthird\n" + new_content, error = 
apply_hashline_edit(content, start_line=2, start_hash="xx", new_content="replaced") + assert error is not None + assert "mismatch" in error.lower() + assert "second" in new_content # unchanged + + def test_apply_edit_insert_after(self): + content = "first\nsecond\nthird\n" + h = line_hash("first") + new_content, error = apply_hashline_edit( + content, start_line=1, start_hash=h, new_content="inserted", insert_after=True + ) + assert error is None + lines = new_content.strip().split("\n") + assert lines[0] == "first" + assert lines[1] == "inserted" + assert lines[2] == "second" diff --git a/tests/test_database.py b/tests/test_database.py new file mode 100644 index 0000000..d23cf21 --- /dev/null +++ b/tests/test_database.py @@ -0,0 +1,112 @@ +"""Tests for the new database system.""" + +from __future__ import annotations + +import sqlite3 +import tempfile +from pathlib import Path + +import pytest + +from agent_ext.database import DatabaseConfig, SQLiteDatabase + + +@pytest.fixture +def test_db_path(): + """Create a temp SQLite database with test data.""" + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: + path = f.name + conn = sqlite3.connect(path) + conn.execute("CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT, age INTEGER)") + conn.execute("INSERT INTO users VALUES (1, 'Alice', 30)") + conn.execute("INSERT INTO users VALUES (2, 'Bob', 25)") + conn.execute("INSERT INTO users VALUES (3, 'Charlie', 35)") + conn.execute("CREATE TABLE orders (id INTEGER PRIMARY KEY, user_id INTEGER, total REAL)") + conn.execute("INSERT INTO orders VALUES (1, 1, 99.99)") + conn.execute("INSERT INTO orders VALUES (2, 2, 49.50)") + conn.commit() + conn.close() + yield path + Path(path).unlink(missing_ok=True) + + +class TestSQLiteDatabase: + @pytest.mark.asyncio + async def test_connect_disconnect(self, test_db_path): + db = SQLiteDatabase(test_db_path) + await db.connect() + await db.disconnect() + + @pytest.mark.asyncio + async def test_list_tables(self, 
test_db_path): + async with SQLiteDatabase(test_db_path) as db: + tables = await db.list_tables() + names = [t.name for t in tables] + assert "users" in names + assert "orders" in names + + @pytest.mark.asyncio + async def test_describe_table(self, test_db_path): + async with SQLiteDatabase(test_db_path) as db: + info = await db.describe_table("users") + assert info.name == "users" + col_names = [c["name"] for c in info.columns] + assert "id" in col_names + assert "name" in col_names + assert "age" in col_names + assert info.row_count == 3 + + @pytest.mark.asyncio + async def test_get_schema(self, test_db_path): + async with SQLiteDatabase(test_db_path) as db: + schema = await db.get_schema() + assert schema.database_type == "sqlite" + assert len(schema.tables) == 2 + + @pytest.mark.asyncio + async def test_execute_query(self, test_db_path): + async with SQLiteDatabase(test_db_path) as db: + result = await db.execute_query("SELECT * FROM users WHERE age > 25") + assert result.error is None + assert result.row_count == 2 + assert result.columns == ["id", "name", "age"] + + @pytest.mark.asyncio + async def test_read_only_blocks_writes(self, test_db_path): + async with SQLiteDatabase(test_db_path, config=DatabaseConfig(read_only=True)) as db: + result = await db.execute_query("DELETE FROM users WHERE id = 1") + assert result.error is not None + assert "not allowed" in result.error.lower() + + @pytest.mark.asyncio + async def test_read_only_blocks_insert(self, test_db_path): + async with SQLiteDatabase(test_db_path, config=DatabaseConfig(read_only=True)) as db: + result = await db.execute_query("INSERT INTO users VALUES (4, 'Dave', 40)") + assert result.error is not None + + @pytest.mark.asyncio + async def test_row_limit(self, test_db_path): + async with SQLiteDatabase(test_db_path, config=DatabaseConfig(max_rows=1)) as db: + result = await db.execute_query("SELECT * FROM users") + assert result.row_count == 1 + assert result.truncated is True + + @pytest.mark.asyncio 
+ async def test_sample_table(self, test_db_path): + async with SQLiteDatabase(test_db_path) as db: + result = await db.sample_table("users", limit=2) + assert result.error is None + assert result.row_count == 2 + + @pytest.mark.asyncio + async def test_query_length_limit(self, test_db_path): + async with SQLiteDatabase(test_db_path, config=DatabaseConfig(max_query_length=5)) as db: + result = await db.execute_query("SELECT * FROM users") + assert result.error is not None + assert "too long" in result.error.lower() + + @pytest.mark.asyncio + async def test_invalid_query(self, test_db_path): + async with SQLiteDatabase(test_db_path) as db: + result = await db.execute_query("SELECT * FROM nonexistent_table") + assert result.error is not None diff --git a/tests/test_hooks.py b/tests/test_hooks.py new file mode 100644 index 0000000..e50fb73 --- /dev/null +++ b/tests/test_hooks.py @@ -0,0 +1,290 @@ +"""Tests for the overhauled hooks/middleware system.""" + +from __future__ import annotations + +import pytest + +from agent_ext.hooks import ( + AgentMiddleware, + AggregationStrategy, + AuditHook, + BlockedPrompt, + BlockedToolCall, + BudgetExceededError, + ConditionalMiddleware, + ContentFilterHook, + ContextAccessError, + CostTrackingMiddleware, + HookType, + InputBlocked, + MiddlewareChain, + MiddlewareContext, + ParallelMiddleware, + PolicyHook, + ToolBlocked, + make_blocklist_filter, +) +from agent_ext.run_context import Policy, RunContext + + +def _make_ctx(**kw): + class _C(dict): + def get(self, k, d=None): + return super().get(k, d) + + def set(self, k, v): + super().__setitem__(k, v) + + class _L: + def info(self, msg, **k): + pass + + def warning(self, msg, **k): + pass + + def error(self, msg, **k): + pass + + class _A: + def put_json(self, k, o): + return k + + return RunContext( + case_id="c1", + session_id="s1", + user_id="u1", + policy=kw.get("policy", Policy(allow_tools=True)), + cache=_C(), + logger=_L(), + artifacts=_A(), + ) + + +class 
TestMiddlewareContext: + def test_scoped_write_read(self): + ctx = MiddlewareContext(config={"key": "value"}) + scoped = ctx.for_hook(HookType.BEFORE_RUN) + scoped.set("user_intent", "question") + assert scoped.get("user_intent") == "question" + + def test_later_hook_can_read_earlier(self): + ctx = MiddlewareContext() + before = ctx.for_hook(HookType.BEFORE_RUN) + before.set("x", 42) + after = ctx.for_hook(HookType.AFTER_RUN) + assert after.get_from(HookType.BEFORE_RUN, "x") == 42 + + def test_earlier_hook_cannot_read_later(self): + ctx = MiddlewareContext() + before = ctx.for_hook(HookType.BEFORE_RUN) + with pytest.raises(ContextAccessError): + before.get_from(HookType.AFTER_RUN, "x") + + def test_on_error_can_read_all(self): + ctx = MiddlewareContext() + before = ctx.for_hook(HookType.BEFORE_RUN) + before.set("data", "hello") + error_scope = ctx.for_hook(HookType.ON_ERROR) + assert error_scope.get_from(HookType.BEFORE_RUN, "data") == "hello" + + def test_clone_and_merge(self): + ctx = MiddlewareContext(config={"a": 1}) + before = ctx.for_hook(HookType.BEFORE_RUN) + before.set("x", 1) + clone = ctx.clone() + clone_before = clone.for_hook(HookType.BEFORE_RUN) + clone_before.set("y", 2) + ctx.merge_from(clone, HookType.BEFORE_RUN) + assert ctx.for_hook(HookType.BEFORE_RUN).get("y") == 2 + + def test_reset(self): + ctx = MiddlewareContext(config={"a": 1}) + ctx.set_metadata("k", "v") + ctx.for_hook(HookType.BEFORE_RUN).set("x", 1) + ctx.reset() + assert ctx.for_hook(HookType.BEFORE_RUN).get("x") is None + assert ctx.config.get("a") == 1 # config preserved + + +class TestMiddlewareChain: + @pytest.mark.asyncio + async def test_before_run_order(self): + order = [] + + class M1(AgentMiddleware): + async def before_run(self, ctx, prompt): + order.append("M1") + return prompt + + class M2(AgentMiddleware): + async def before_run(self, ctx, prompt): + order.append("M2") + return prompt + + chain = MiddlewareChain([M1(), M2()]) + await chain.before_run(_make_ctx(), "hello") 
+ assert order == ["M1", "M2"] + + @pytest.mark.asyncio + async def test_after_run_reverse_order(self): + order = [] + + class M1(AgentMiddleware): + async def after_run(self, ctx, prompt, output): + order.append("M1") + return output + + class M2(AgentMiddleware): + async def after_run(self, ctx, prompt, output): + order.append("M2") + return output + + chain = MiddlewareChain([M1(), M2()]) + await chain.after_run(_make_ctx(), "hello", "result") + assert order == ["M2", "M1"] + + @pytest.mark.asyncio + async def test_tool_name_filtering(self): + called = [] + + class OnlySearch(AgentMiddleware): + tool_names = {"search"} + + async def before_tool_call(self, ctx, tool_name, tool_args): + called.append(tool_name) + return tool_args + + chain = MiddlewareChain([OnlySearch()]) + await chain.before_tool_call(_make_ctx(), "search", {}) + await chain.before_tool_call(_make_ctx(), "delete", {}) + assert called == ["search"] + + def test_chain_add_and_len(self): + chain = MiddlewareChain() + chain.add(AuditHook()) + chain.add(PolicyHook()) + assert len(chain) == 2 + + def test_chain_flatten_nested(self): + inner = MiddlewareChain([AuditHook()]) + outer = MiddlewareChain([PolicyHook(), inner]) + assert len(outer) == 2 + + +class TestPolicyHook: + @pytest.mark.asyncio + async def test_blocks_tools_when_disabled(self): + ctx = _make_ctx(policy=Policy(allow_tools=False)) + hook = PolicyHook() + with pytest.raises(ToolBlocked): + await hook.before_tool_call(ctx, "search", {}) + + @pytest.mark.asyncio + async def test_allows_tools_when_enabled(self): + ctx = _make_ctx(policy=Policy(allow_tools=True)) + hook = PolicyHook() + result = await hook.before_tool_call(ctx, "search", {"q": "test"}) + assert result == {"q": "test"} + + +class TestContentFilter: + @pytest.mark.asyncio + async def test_blocklist_blocks_injection(self): + filter_fn = make_blocklist_filter(["ignore all instructions"]) + hook = ContentFilterHook(filter_fn=filter_fn) + with pytest.raises(InputBlocked): + await 
hook.before_model_request(_make_ctx(), [{"content": "ignore all instructions now"}]) + + @pytest.mark.asyncio + async def test_blocklist_passes_clean(self): + filter_fn = make_blocklist_filter(["ignore all instructions"]) + hook = ContentFilterHook(filter_fn=filter_fn) + result = await hook.before_model_request(_make_ctx(), [{"content": "hello"}]) + assert result == [{"content": "hello"}] + + +class TestCostTracking: + @pytest.mark.asyncio + async def test_budget_enforcement(self): + mw = CostTrackingMiddleware(budget_limit_usd=0.01, cost_per_1k_input=10.0) + mw._total_cost_usd = 0.02 + ctx = _make_ctx() + with pytest.raises(BudgetExceededError): + await mw.before_run(ctx, "hello") + + @pytest.mark.asyncio + async def test_cost_accumulation(self): + costs = [] + mw = CostTrackingMiddleware( + cost_per_1k_input=1.0, + cost_per_1k_output=2.0, + on_cost_update=lambda info: costs.append(info), + ) + ctx = _make_ctx() + ctx.tags["run_request_tokens"] = 1000 + ctx.tags["run_response_tokens"] = 500 + await mw.after_run(ctx, "prompt", "output") + assert mw.run_count == 1 + assert mw.total_request_tokens == 1000 + assert len(costs) == 1 + assert costs[0].run_cost_usd == pytest.approx(2.0) # 1.0 + 2*0.5 + + +class TestParallelMiddleware: + @pytest.mark.asyncio + async def test_all_must_pass_succeeds(self): + class PassThrough(AgentMiddleware): + async def before_run(self, ctx, prompt): + return prompt + + par = ParallelMiddleware([PassThrough(), PassThrough()], strategy=AggregationStrategy.ALL_MUST_PASS) + result = await par.before_run(_make_ctx(), "hello") + assert result == "hello" + + @pytest.mark.asyncio + async def test_all_must_pass_fails_on_error(self): + class Failer(AgentMiddleware): + async def before_run(self, ctx, prompt): + raise InputBlocked("bad") + + from agent_ext.hooks.exceptions import ParallelExecutionFailed + + par = ParallelMiddleware([Failer()], strategy=AggregationStrategy.ALL_MUST_PASS) + with pytest.raises(ParallelExecutionFailed): + await 
par.before_run(_make_ctx(), "hello") + + +class TestConditionalMiddleware: + @pytest.mark.asyncio + async def test_runs_when_condition_true(self): + called = [] + + class Logger(AgentMiddleware): + async def before_run(self, ctx, prompt): + called.append(True) + return prompt + + cond = ConditionalMiddleware(Logger(), condition=lambda ctx: True) + await cond.before_run(_make_ctx(), "hello") + assert called == [True] + + @pytest.mark.asyncio + async def test_skips_when_condition_false(self): + called = [] + + class Logger(AgentMiddleware): + async def before_run(self, ctx, prompt): + called.append(True) + return prompt + + cond = ConditionalMiddleware(Logger(), condition=lambda ctx: False) + await cond.before_run(_make_ctx(), "hello") + assert called == [] + + +class TestBackwardCompat: + def test_blocked_tool_call_alias(self): + assert BlockedToolCall is ToolBlocked + + def test_blocked_prompt_alias(self): + assert BlockedPrompt is InputBlocked diff --git a/tests/test_memory_new.py b/tests/test_memory_new.py new file mode 100644 index 0000000..1921d53 --- /dev/null +++ b/tests/test_memory_new.py @@ -0,0 +1,85 @@ +"""Tests for the overhauled memory system.""" + +from __future__ import annotations + +from agent_ext.memory import ( + SlidingWindowMemory, + approximate_token_count, + find_safe_cutoff, + find_token_based_cutoff, + is_safe_cutoff_point, +) + + +class TestSlidingWindowMemory: + def test_message_count_mode(self): + mem = SlidingWindowMemory(max_messages=3) + result = mem.shape_messages(["a", "b", "c", "d", "e"]) + assert len(result) == 3 + assert result == ["c", "d", "e"] + + def test_under_limit_unchanged(self): + mem = SlidingWindowMemory(max_messages=10) + msgs = ["a", "b", "c"] + result = mem.shape_messages(msgs) + assert result == msgs + + def test_token_mode(self): + counter = lambda msgs: sum(len(str(m)) for m in msgs) + mem = SlidingWindowMemory(max_tokens=5, token_counter=counter) + result = mem.shape_messages(["aaa", "bb", "c"]) # 3+2+1 = 6 > 5 + 
assert len(result) < 3 + + def test_trigger_messages(self): + mem = SlidingWindowMemory(max_messages=3, trigger_messages=5) + # Under trigger: no trim + result = mem.shape_messages(["a", "b", "c", "d"]) + assert len(result) == 4 + # At trigger: trim + result2 = mem.shape_messages(["a", "b", "c", "d", "e"]) + assert len(result2) == 3 + + def test_checkpoint_is_noop(self): + mem = SlidingWindowMemory(max_messages=10) + mem.checkpoint(["a"], outcome="done") # should not raise + + +class TestSafeCutoff: + def test_preserves_messages_when_under(self): + assert find_safe_cutoff(["a", "b", "c"], messages_to_keep=5) == 0 + + def test_trims_when_over(self): + msgs = list(range(10)) + cutoff = find_safe_cutoff(msgs, messages_to_keep=3) + assert cutoff >= 7 + + def test_keep_zero_returns_full_length(self): + assert find_safe_cutoff(["a", "b", "c"], messages_to_keep=0) == 3 + + def test_safe_cutoff_point_at_start(self): + assert is_safe_cutoff_point(["a", "b"], 0) is True + + def test_safe_cutoff_point_at_end(self): + assert is_safe_cutoff_point(["a", "b"], 5) is True + + +class TestTokenBasedCutoff: + def test_under_budget_returns_zero(self): + counter = lambda msgs: len(msgs) + assert find_token_based_cutoff(["a", "b", "c"], target_tokens=5, token_counter=counter) == 0 + + def test_over_budget_trims(self): + counter = lambda msgs: len(msgs) * 100 + cutoff = find_token_based_cutoff(list(range(10)), target_tokens=300, token_counter=counter) + assert cutoff >= 7 + + def test_empty_messages(self): + counter = lambda msgs: 0 + assert find_token_based_cutoff([], target_tokens=100, token_counter=counter) == 0 + + +class TestApproximateTokenCount: + def test_basic(self): + count = approximate_token_count(["hello world", "test"]) + assert count > 0 + assert isinstance(count, int) diff --git a/tests/test_patching.py b/tests/test_patching.py new file mode 100644 index 0000000..7987fa0 --- /dev/null +++ b/tests/test_patching.py @@ -0,0 +1,317 @@ +"""Tests for 
agent_ext.self_improve.patching — diff sanitization, hunk repair, and apply.""" + +from __future__ import annotations + +import subprocess +import tempfile +from pathlib import Path + +from agent_ext.self_improve.patching import ( + _HUNK_HEADER_RE, + _repair_hunk_headers, + apply_unified_diff, + sanitize_diff_for_apply, +) +from agent_ext.workbench.patch_models import ( + FilePatch, + LineChange, + PatchOutput, + structured_to_unified_diff, +) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_git_repo(tmp: Path, files: dict[str, str] | None = None) -> Path: + """Create a temp git repo with initial files and return its path.""" + subprocess.run(["git", "init", str(tmp)], capture_output=True, check=True) + subprocess.run(["git", "-C", str(tmp), "config", "user.email", "t@t"], capture_output=True, check=True) + subprocess.run(["git", "-C", str(tmp), "config", "user.name", "t"], capture_output=True, check=True) + if files: + for rel_path, content in files.items(): + p = tmp / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content, encoding="utf-8") + subprocess.run(["git", "-C", str(tmp), "add", "-A"], capture_output=True, check=True) + subprocess.run(["git", "-C", str(tmp), "commit", "-m", "init"], capture_output=True, check=True) + return tmp + + +# --------------------------------------------------------------------------- +# 1. 
Hunk header regex +# --------------------------------------------------------------------------- + + +class TestHunkHeaderRegex: + def test_matches_standard_headers(self): + """Regex must match standard @@ -L,N +L,N @@ headers.""" + assert _HUNK_HEADER_RE.match("@@ -1,3 +1,3 @@") + assert _HUNK_HEADER_RE.match("@@ -0,0 +1,5 @@") + assert _HUNK_HEADER_RE.match("@@ -10,20 +15,25 @@") + + def test_matches_with_trailing_text(self): + """Git often appends function context after @@.""" + assert _HUNK_HEADER_RE.match("@@ -1,3 +1,3 @@ def hello") + + def test_rejects_bare_hunk(self): + """Bare @@ with no line numbers must not match.""" + assert not _HUNK_HEADER_RE.match("@@") + assert not _HUNK_HEADER_RE.match("@@ malformed @@") + + +# --------------------------------------------------------------------------- +# 2. _repair_hunk_headers +# --------------------------------------------------------------------------- + + +class TestRepairHunkHeaders: + def test_valid_headers_pass_through(self): + """Already-valid hunk headers should not be altered.""" + diff = "--- a/foo.py\n+++ b/foo.py\n@@ -1,3 +1,3 @@\n line1\n-old\n+new\n" + repaired = _repair_hunk_headers(diff) + assert "@@ -1,3 +1,3 @@" in repaired + + def test_bare_hunk_repaired(self): + """Bare @@ gets rewritten with correct counts.""" + diff = "--- a/foo.py\n+++ b/foo.py\n@@\n context\n-old\n+new\n" + repaired = _repair_hunk_headers(diff) + assert "@@ -1,2 +1,2 @@" in repaired + + def test_new_file_bare_hunk_repaired(self): + """Bare @@ after --- /dev/null gets @@ -0,0 +1,N @@.""" + diff = "--- /dev/null\n+++ b/new.py\n@@\n+line1\n+line2\n" + repaired = _repair_hunk_headers(diff) + assert "@@ -0,0 +1,2 @@" in repaired + + +# --------------------------------------------------------------------------- +# 3. 
sanitize_diff_for_apply +# --------------------------------------------------------------------------- + + +class TestSanitizeDiff: + def test_well_formed_diff_unchanged(self): + diff = '--- a/foo.py\n+++ b/foo.py\n@@ -1,3 +1,3 @@\n def hello():\n- return "old"\n+ return "new"\n' + sanitized = sanitize_diff_for_apply(diff) + assert "--- a/foo.py" in sanitized + assert "+++ b/foo.py" in sanitized + assert "@@ -1,3 +1,3 @@" in sanitized + + def test_markdown_wrapped_diff(self): + md = ( + "Here is the change:\n" + "```diff\n" + "--- a/foo.py\n" + "+++ b/foo.py\n" + "@@ -1,2 +1,2 @@\n" + " def f():\n" + "- pass\n" + "+ return 1\n" + "```\n" + "That's it!\n" + ) + sanitized = sanitize_diff_for_apply(md) + assert "--- a/foo.py" in sanitized + assert "+ return 1" in sanitized + + def test_new_file_diff_preserved(self): + diff = ( + "diff --git a/new.py b/new.py\n" + "new file mode 100644\n" + "--- /dev/null\n" + "+++ b/new.py\n" + "@@ -0,0 +1,3 @@\n" + "+def new_function():\n" + "+ return True\n" + "+\n" + ) + sanitized = sanitize_diff_for_apply(diff) + assert "--- /dev/null" in sanitized + assert "@@ -0,0 +1,3 @@" in sanitized + + def test_empty_input_returns_empty(self): + assert sanitize_diff_for_apply("") == "" + assert sanitize_diff_for_apply(" \n ") == "" + + def test_empty_lines_dont_extend_diff(self): + """Blank lines after the diff block should not be captured as diff content.""" + text = ( + "--- a/foo.py\n" + "+++ b/foo.py\n" + "@@ -1,2 +1,2 @@\n" + " def f():\n" + "- pass\n" + "+ return 1\n" + "\n" + "This is NOT part of the diff.\n" + "Neither is this.\n" + ) + sanitized = sanitize_diff_for_apply(text) + assert "NOT part of the diff" not in sanitized + assert "Neither is this" not in sanitized + + +# --------------------------------------------------------------------------- +# 4. 
structured_to_unified_diff +# --------------------------------------------------------------------------- + + +class TestStructuredToUnifiedDiff: + def test_basic_edit(self): + patch = PatchOutput( + files=[ + FilePatch( + path="src/foo.py", + is_new_file=False, + lines=[ + LineChange(kind="context", content="def hello():"), + LineChange(kind="remove", content=' return "old"'), + LineChange(kind="add", content=' return "new"'), + ], + ) + ] + ) + diff = structured_to_unified_diff(patch) + assert "diff --git a/src/foo.py b/src/foo.py" in diff + assert "--- a/src/foo.py" in diff + assert "+++ b/src/foo.py" in diff + assert "@@ -1,2 +1,2 @@" in diff + assert '- return "old"' in diff + assert '+ return "new"' in diff + + def test_new_file(self): + patch = PatchOutput( + files=[ + FilePatch( + path="src/new.py", + is_new_file=True, + lines=[ + LineChange(kind="add", content="# new module"), + LineChange(kind="add", content="def fn(): pass"), + ], + ) + ] + ) + diff = structured_to_unified_diff(patch) + assert "diff --git a/src/new.py b/src/new.py" in diff + assert "new file mode 100644" in diff + assert "--- /dev/null" in diff + assert "+++ b/src/new.py" in diff + assert "@@ -0,0 +1,2 @@" in diff + + def test_multi_file(self): + patch = PatchOutput( + files=[ + FilePatch( + path="a.py", + is_new_file=False, + lines=[ + LineChange(kind="context", content="x = 1"), + LineChange(kind="remove", content="y = 2"), + LineChange(kind="add", content="y = 3"), + ], + ), + FilePatch( + path="b.py", + is_new_file=True, + lines=[ + LineChange(kind="add", content="z = 42"), + ], + ), + ] + ) + diff = structured_to_unified_diff(patch) + assert "diff --git a/a.py b/a.py" in diff + assert "diff --git a/b.py b/b.py" in diff + assert diff.count("diff --git") == 2 + + def test_empty_patch(self): + patch = PatchOutput(files=[]) + assert structured_to_unified_diff(patch) == "" + + +# --------------------------------------------------------------------------- +# 5. 
End-to-end: structured → unified → git apply +# --------------------------------------------------------------------------- + + +class TestApplyStructuredDiff: + def test_edit_existing_file(self): + with tempfile.TemporaryDirectory() as td: + repo = _make_git_repo(Path(td), {"src/foo.py": 'def hello():\n return "old"\n'}) + patch = PatchOutput( + files=[ + FilePatch( + path="src/foo.py", + is_new_file=False, + lines=[ + LineChange(kind="context", content="def hello():"), + LineChange(kind="remove", content=' return "old"'), + LineChange(kind="add", content=' return "new"'), + ], + ), + ] + ) + diff = structured_to_unified_diff(patch) + ok, err = apply_unified_diff(diff, repo_root=repo) + assert ok, f"git apply failed: {err}" + content = (repo / "src" / "foo.py").read_text() + assert 'return "new"' in content + + def test_create_new_file(self): + with tempfile.TemporaryDirectory() as td: + repo = _make_git_repo(Path(td), {"src/existing.py": "x = 1\n"}) + patch = PatchOutput( + files=[ + FilePatch( + path="src/brand_new.py", + is_new_file=True, + lines=[ + LineChange(kind="add", content="def new_fn():"), + LineChange(kind="add", content=" return 42"), + ], + ), + ] + ) + diff = structured_to_unified_diff(patch) + ok, err = apply_unified_diff(diff, repo_root=repo) + assert ok, f"git apply failed: {err}" + content = (repo / "src" / "brand_new.py").read_text() + assert "def new_fn():" in content + assert "return 42" in content + + def test_multi_file_edit_and_create(self): + with tempfile.TemporaryDirectory() as td: + repo = _make_git_repo( + Path(td), + { + "src/a.py": "x = 1\ny = 2\n", + }, + ) + patch = PatchOutput( + files=[ + FilePatch( + path="src/a.py", + is_new_file=False, + lines=[ + LineChange(kind="context", content="x = 1"), + LineChange(kind="remove", content="y = 2"), + LineChange(kind="add", content="y = 3"), + ], + ), + FilePatch( + path="src/b.py", + is_new_file=True, + lines=[ + LineChange(kind="add", content="z = 42"), + ], + ), + ] + ) + diff = 
structured_to_unified_diff(patch) + ok, err = apply_unified_diff(diff, repo_root=repo) + assert ok, f"git apply failed: {err}" + assert "y = 3" in (repo / "src" / "a.py").read_text() + assert "z = 42" in (repo / "src" / "b.py").read_text() diff --git a/tests/test_planner.py b/tests/test_planner.py new file mode 100644 index 0000000..c41136a --- /dev/null +++ b/tests/test_planner.py @@ -0,0 +1,125 @@ +"""Tests for agent_ext.workbench.planner — TaskQueue.""" + +from __future__ import annotations + +import pytest + +from agent_ext.workbench.planner import TaskQueue + + +class TestTaskQueue: + def test_add_and_list(self): + q = TaskQueue() + q.add("search", "Find modules", "query") + q.add("implement", "Create patch", "goal") + tasks = q.list() + assert len(tasks) == 2 + assert tasks[0].kind == "search" + assert tasks[1].kind == "implement" + + def test_ids_are_sequential(self): + q = TaskQueue() + t1 = q.add("search", "A", "q1") + t2 = q.add("search", "B", "q2") + assert t1.id == "t0001" + assert t2.id == "t0002" + + def test_next_pending(self): + q = TaskQueue() + q.add("search", "A", "q1") + t = q.next_pending() + assert t is not None + assert t.status == "pending" + + def test_next_pending_empty(self): + q = TaskQueue() + assert q.next_pending() is None + + @pytest.mark.asyncio + async def test_claim_next_pending(self): + q = TaskQueue() + q.add("search", "A", "q1") + t = await q.claim_next_pending() + assert t is not None + assert t.status == "in_progress" + # Second claim returns None (no more pending) + t2 = await q.claim_next_pending() + assert t2 is None + + @pytest.mark.asyncio + async def test_cancel_by_id(self): + q = TaskQueue() + t = q.add("search", "A", "q1") + result = await q.cancel_by_id(t.id) + assert result is True + assert t.status == "cancelled" + + @pytest.mark.asyncio + async def test_cancel_non_pending(self): + q = TaskQueue() + t = q.add("search", "A", "q1") + await q.claim_next_pending() # now in_progress + result = await q.cancel_by_id(t.id) 
+ assert result is False # can't cancel in_progress + + @pytest.mark.asyncio + async def test_cancel_unknown_id(self): + q = TaskQueue() + result = await q.cancel_by_id("t9999") + assert result is None + + def test_get_by_id(self): + q = TaskQueue() + t = q.add("search", "A", "q1") + found = q.get_by_id("t0001") + assert found is t + assert q.get_by_id("0001") is t # numeric shorthand + assert q.get_by_id("t9999") is None + + def test_normalize_id(self): + q = TaskQueue() + assert q.normalize_id("0001") == "t0001" + assert q.normalize_id("t0001") == "t0001" + + @pytest.mark.asyncio + async def test_retry_failed_task(self): + q = TaskQueue() + t = q.add("implement", "Create patch", "goal") + await q.claim_next_pending() # in_progress + t.status = "failed" + result = await q.retry_by_id(t.id) + assert result is True + assert t.status == "pending" + assert t.started_at is None + assert t.finished_at is None + + @pytest.mark.asyncio + async def test_retry_non_failed_task(self): + q = TaskQueue() + t = q.add("search", "A", "q1") + result = await q.retry_by_id(t.id) + assert result is False # pending, not retryable + + @pytest.mark.asyncio + async def test_retry_all_failed(self): + q = TaskQueue() + t1 = q.add("search", "A", "q1") + t2 = q.add("implement", "B", "q2") + t1.status = "failed" + t2.status = "failed" + count = await q.retry_all_failed() + assert count == 2 + assert t1.status == "pending" + assert t2.status == "pending" + + def test_elapsed_time(self): + import time + + q = TaskQueue() + t = q.add("search", "A", "q1") + assert t.elapsed_s is None # not started + t.started_at = time.time() - 5.0 + assert t.elapsed_s is not None + assert t.elapsed_s >= 4.9 # at least ~5s + t.finished_at = t.started_at + 5.0 + assert abs(t.elapsed_s - 5.0) < 0.1 diff --git a/tests/test_rlm.py b/tests/test_rlm.py new file mode 100644 index 0000000..4b7c50f --- /dev/null +++ b/tests/test_rlm.py @@ -0,0 +1,112 @@ +"""Tests for the overhauled RLM system.""" + +from __future__ import 
annotations + +import pytest + +from agent_ext.rlm import ( + GroundedResponse, + REPLEnvironment, + REPLResult, + RLMConfig, + RLMPolicy, + RLMRunError, + format_repl_result, + run_restricted_python, +) + + +class TestREPLEnvironment: + def test_basic_execution(self): + repl = REPLEnvironment(context="Hello World", config=RLMConfig()) + result = repl.execute("print(len(context))") + assert result.success + assert "11" in result.stdout + repl.cleanup() + + def test_persistent_state(self): + repl = REPLEnvironment(context="test data", config=RLMConfig()) + repl.execute("x = len(context)") + result = repl.execute("print(x)") + assert result.success + assert "9" in result.stdout + repl.cleanup() + + def test_dict_context(self): + repl = REPLEnvironment(context={"key": "value", "num": 42}) + result = repl.execute("print(context['key'])") + assert result.success + assert "value" in result.stdout + repl.cleanup() + + def test_list_context(self): + repl = REPLEnvironment(context=[1, 2, 3]) + result = repl.execute("print(sum(context))") + assert result.success + assert "6" in result.stdout + repl.cleanup() + + def test_allowed_import(self): + repl = REPLEnvironment(context="test", config=RLMConfig(allow_imports=["math"])) + result = repl.execute("import math\nprint(math.pi)") + assert result.success + assert "3.14" in result.stdout + repl.cleanup() + + def test_blocked_import(self): + repl = REPLEnvironment(context="test", config=RLMConfig(allow_imports=[])) + result = repl.execute("import os") + assert not result.success + assert "not allowed" in result.stderr.lower() or "import" in result.stderr.lower() + repl.cleanup() + + def test_error_handling(self): + repl = REPLEnvironment(context="test") + result = repl.execute("1/0") + assert not result.success + assert "ZeroDivision" in result.stderr or "division" in result.stderr + repl.cleanup() + + def test_output_truncation(self): + repl = REPLEnvironment(context="test", config=RLMConfig(truncate_output_chars=50)) + result = 
repl.execute("print('a' * 200)") + assert len(result.stdout) <= 70 # 50 chars + truncation marker + assert "truncated" in result.stdout + repl.cleanup() + + +class TestFormatREPLResult: + def test_formats_stdout(self): + result = REPLResult(stdout="Hello", stderr="", locals={}, execution_time=0.1, success=True) + formatted = format_repl_result(result) + assert "Hello" in formatted + assert "0.100s" in formatted + + def test_formats_variables(self): + result = REPLResult(stdout="", stderr="", locals={"x": 42, "context": "skip"}, execution_time=0.01) + formatted = format_repl_result(result) + assert "x = 42" in formatted + assert "context" not in formatted # filtered out + + +class TestGroundedResponse: + def test_construction(self): + gr = GroundedResponse(info="Revenue grew [1]", grounding={"1": "by 45%"}) + assert "[1]" in gr.info + assert gr.grounding["1"] == "by 45%" + + def test_serialization(self): + gr = GroundedResponse(info="test [1]", grounding={"1": "quote"}) + data = gr.model_dump() + gr2 = GroundedResponse.model_validate(data) + assert gr2.info == gr.info + + +class TestLegacyRLM: + def test_run_restricted_python(self): + result = run_restricted_python("x = 1 + 1", policy=RLMPolicy()) + assert result["globals"]["x"] == 2 + + def test_disallowed_import(self): + with pytest.raises(RLMRunError): + run_restricted_python("import subprocess", policy=RLMPolicy()) diff --git a/tests/test_scoring.py b/tests/test_scoring.py new file mode 100644 index 0000000..6651a5b --- /dev/null +++ b/tests/test_scoring.py @@ -0,0 +1,67 @@ +"""Tests for agent_ext.cog.scoring — Score properties and score_patch.""" + +from __future__ import annotations + +from agent_ext.cog.scoring import Score, score_patch, touched_files_from_diff + + +class TestScore: + def test_score_property_alias(self): + sc = Score(total=42.0, reasons={"gates": 100.0}) + assert sc.score == 42.0 + assert sc.score == sc.total + + def test_ok_when_gates_pass(self): + sc = Score(total=90.0, reasons={"gates": 
100.0}) + assert sc.ok is True + + def test_not_ok_when_gates_fail(self): + sc = Score(total=-50.0, reasons={"gates": -50.0}) + assert sc.ok is False + + def test_not_ok_when_gates_zero(self): + sc = Score(total=0.0, reasons={"gates": 0.0}) + assert sc.ok is False + + +class TestScorePatch: + def test_gates_pass_positive_score(self): + sc = score_patch(gates_ok=True, diff_chars=100, files_touched=1, eval_delta=0.0) + assert sc.score > 0 + assert sc.ok is True + + def test_gates_fail_negative_score(self): + sc = score_patch(gates_ok=False, diff_chars=100, files_touched=1, eval_delta=0.0) + assert sc.score < 0 + assert sc.ok is False + + def test_large_diff_penalized(self): + small = score_patch(gates_ok=True, diff_chars=100, files_touched=1) + large = score_patch(gates_ok=True, diff_chars=60000, files_touched=1) + assert small.score > large.score + + def test_many_files_penalized(self): + few = score_patch(gates_ok=True, diff_chars=100, files_touched=1) + many = score_patch(gates_ok=True, diff_chars=100, files_touched=10) + assert few.score > many.score + + +class TestTouchedFiles: + def test_extracts_paths_from_diff_git(self): + diff = ( + "diff --git a/foo.py b/foo.py\n" + "--- a/foo.py\n" + "+++ b/foo.py\n" + "@@ -1,1 +1,1 @@\n" + "+x\n" + "diff --git a/bar.py b/bar.py\n" + "--- a/bar.py\n" + "+++ b/bar.py\n" + "@@ -1,1 +1,1 @@\n" + "+y\n" + ) + files = touched_files_from_diff(diff) + assert files == ["bar.py", "foo.py"] + + def test_empty_diff(self): + assert touched_files_from_diff("") == [] diff --git a/tests/test_skills_new.py b/tests/test_skills_new.py new file mode 100644 index 0000000..aea9dfb --- /dev/null +++ b/tests/test_skills_new.py @@ -0,0 +1,108 @@ +"""Tests for the overhauled skills system.""" + +from __future__ import annotations + +import pytest + +from agent_ext.skills import ( + CombinedRegistry, + FilteredRegistry, + PrefixedRegistry, + SkillNotFoundError, + SkillRegistry, + SkillSpec, + create_skill, +) + + +class TestCreateSkill: + def 
test_basic_creation(self): + skill = create_skill( + id="test", + name="Test", + description="A test", + body="# Test\n\nBody text", + ) + assert skill.spec.id == "test" + assert skill.spec.name == "Test" + assert "# Test" in skill.body_markdown + assert len(skill.body_hash) == 64 # sha256 hex + + def test_with_tags(self): + skill = create_skill( + id="t", + name="T", + description="d", + body="body", + tags=["python", "code"], + ) + assert skill.spec.tags == ["python", "code"] + + +class TestCombinedRegistry: + def _make_reg(self, skills: list[SkillSpec]) -> SkillRegistry: + """Helper to make a registry with pre-loaded skills.""" + reg = SkillRegistry(roots=[]) + for s in skills: + reg._skills[s.id] = s + return reg + + def test_merge_two(self): + r1 = self._make_reg([SkillSpec(id="a", name="A", description="d")]) + r2 = self._make_reg([SkillSpec(id="b", name="B", description="d")]) + combined = CombinedRegistry([r1, r2]) + ids = [s.id for s in combined.list()] + assert "a" in ids + assert "b" in ids + + def test_first_wins_on_conflict(self): + r1 = self._make_reg([SkillSpec(id="x", name="First", description="d")]) + r2 = self._make_reg([SkillSpec(id="x", name="Second", description="d")]) + combined = CombinedRegistry([r1, r2]) + assert combined.get("x").name == "First" + + def test_get_missing_raises(self): + combined = CombinedRegistry([]) + with pytest.raises(SkillNotFoundError): + combined.get("nonexistent") + + +class TestFilteredRegistry: + def test_filters_by_tag(self): + reg = SkillRegistry(roots=[]) + reg._skills["a"] = SkillSpec(id="a", name="A", description="d", tags=["python"]) + reg._skills["b"] = SkillSpec(id="b", name="B", description="d", tags=["rust"]) + filtered = FilteredRegistry(reg, predicate=lambda s: "python" in s.tags) + ids = [s.id for s in filtered.list()] + assert ids == ["a"] + + def test_get_filtered_out_raises(self): + reg = SkillRegistry(roots=[]) + reg._skills["a"] = SkillSpec(id="a", name="A", description="d", tags=["rust"]) + 
filtered = FilteredRegistry(reg, predicate=lambda s: "python" in s.tags) + with pytest.raises(SkillNotFoundError): + filtered.get("a") + + +class TestPrefixedRegistry: + def test_prefix_applied(self): + reg = SkillRegistry(roots=[]) + reg._skills["search"] = SkillSpec(id="search", name="Search", description="d") + prefixed = PrefixedRegistry(reg, prefix="vendor_") + ids = [s.id for s in prefixed.list()] + assert ids == ["vendor_search"] + + def test_get_with_prefix(self): + reg = SkillRegistry(roots=[]) + reg._skills["search"] = SkillSpec(id="search", name="Search", description="d") + prefixed = PrefixedRegistry(reg, prefix="vendor_") + spec = prefixed.get("vendor_search") + assert spec.id == "vendor_search" + assert spec.name == "Search" + + def test_get_without_prefix_raises(self): + reg = SkillRegistry(roots=[]) + reg._skills["s"] = SkillSpec(id="s", name="S", description="d") + prefixed = PrefixedRegistry(reg, prefix="v_") + with pytest.raises(SkillNotFoundError): + prefixed.get("s") diff --git a/tests/test_subagents.py b/tests/test_subagents.py new file mode 100644 index 0000000..e82cd6f --- /dev/null +++ b/tests/test_subagents.py @@ -0,0 +1,163 @@ +"""Tests for the overhauled subagents system.""" + +from __future__ import annotations + +import pytest + +from agent_ext.subagents import ( + AgentMessage, + DynamicAgentRegistry, + InMemoryMessageBus, + MessageType, + SubAgentConfig, + SubagentRegistry, + TaskCharacteristics, + decide_execution_mode, +) + + +class TestSubagentRegistry: + def test_register_and_get(self): + class FakeAgent: + name = "test" + + reg = SubagentRegistry() + reg.register(FakeAgent()) + assert reg.get("test").name == "test" + + def test_get_unknown_raises(self): + reg = SubagentRegistry() + with pytest.raises(KeyError): + reg.get("nonexistent") + + def test_list_and_count(self): + class A: + name = "a" + + class B: + name = "b" + + reg = SubagentRegistry() + reg.register(A()) + reg.register(B()) + assert reg.list() == ["a", "b"] + assert 
reg.count() == 2 + + +class TestDynamicRegistry: + def test_register_and_get(self): + dyn = DynamicAgentRegistry() + config = SubAgentConfig(name="worker", description="does work", instructions="work hard") + dyn.register(config, "agent_obj") + assert dyn.get("worker") == "agent_obj" + assert dyn.exists("worker") + assert dyn.count() == 1 + + def test_max_agents_limit(self): + dyn = DynamicAgentRegistry(max_agents=1) + config1 = SubAgentConfig(name="a", description="a", instructions="a") + dyn.register(config1, "obj1") + config2 = SubAgentConfig(name="b", description="b", instructions="b") + with pytest.raises(ValueError, match="Maximum"): + dyn.register(config2, "obj2") + + def test_duplicate_name_raises(self): + dyn = DynamicAgentRegistry() + config = SubAgentConfig(name="x", description="x", instructions="x") + dyn.register(config, "obj") + with pytest.raises(ValueError, match="already exists"): + dyn.register(config, "obj2") + + def test_remove(self): + dyn = DynamicAgentRegistry() + config = SubAgentConfig(name="x", description="x", instructions="x") + dyn.register(config, "obj") + assert dyn.remove("x") is True + assert dyn.remove("x") is False + assert dyn.count() == 0 + + def test_clear(self): + dyn = DynamicAgentRegistry() + for i in range(3): + config = SubAgentConfig(name=f"a{i}", description="d", instructions="i") + dyn.register(config, f"obj{i}") + dyn.clear() + assert dyn.count() == 0 + + def test_get_summary(self): + dyn = DynamicAgentRegistry() + assert "No dynamically" in dyn.get_summary() + config = SubAgentConfig(name="w", description="worker", instructions="work") + dyn.register(config, "obj") + summary = dyn.get_summary() + assert "w" in summary + assert "worker" in summary + + +class TestMessageBus: + @pytest.mark.asyncio + async def test_send_and_receive(self): + bus = InMemoryMessageBus() + bus.register_agent("worker") + msg = AgentMessage( + type=MessageType.TASK_ASSIGNED, + sender="parent", + receiver="worker", + payload="do something", + 
task_id="t1", + ) + await bus.send(msg) + messages = await bus.get_messages("worker") + assert len(messages) == 1 + assert messages[0].payload == "do something" + + @pytest.mark.asyncio + async def test_send_to_unregistered_raises(self): + bus = InMemoryMessageBus() + msg = AgentMessage( + type=MessageType.TASK_ASSIGNED, + sender="parent", + receiver="nobody", + payload="x", + task_id="t1", + ) + with pytest.raises(KeyError): + await bus.send(msg) + + @pytest.mark.asyncio + async def test_register_duplicate_raises(self): + bus = InMemoryMessageBus() + bus.register_agent("a") + with pytest.raises(ValueError): + bus.register_agent("a") + + def test_registered_agents(self): + bus = InMemoryMessageBus() + bus.register_agent("a") + bus.register_agent("b") + assert sorted(bus.registered_agents()) == ["a", "b"] + assert bus.is_registered("a") + assert not bus.is_registered("c") + + +class TestDecideExecutionMode: + def test_force_mode(self): + chars = TaskCharacteristics() + config = SubAgentConfig(name="x", description="x", instructions="x") + assert decide_execution_mode(chars, config, force_mode="sync") == "sync" + assert decide_execution_mode(chars, config, force_mode="async") == "async" + + def test_complex_independent_is_async(self): + chars = TaskCharacteristics(estimated_complexity="complex", can_run_independently=True) + config = SubAgentConfig(name="x", description="x", instructions="x") + assert decide_execution_mode(chars, config) == "async" + + def test_simple_is_sync(self): + chars = TaskCharacteristics(estimated_complexity="simple") + config = SubAgentConfig(name="x", description="x", instructions="x") + assert decide_execution_mode(chars, config) == "sync" + + def test_needs_context_is_sync(self): + chars = TaskCharacteristics(requires_user_context=True, estimated_complexity="complex") + config = SubAgentConfig(name="x", description="x", instructions="x") + assert decide_execution_mode(chars, config) == "sync" diff --git a/tests/test_toolsets.py 
b/tests/test_toolsets.py new file mode 100644 index 0000000..e580d4e --- /dev/null +++ b/tests/test_toolsets.py @@ -0,0 +1,93 @@ +"""Tests for all FunctionToolset factories.""" + +from __future__ import annotations + +from pydantic_ai.toolsets import FunctionToolset + +from agent_ext.backends.console import ConsoleDeps, create_console_toolset +from agent_ext.database.toolset import SQLDatabaseDeps, create_database_toolset +from agent_ext.rlm.toolset import cleanup_repl_environments, create_rlm_toolset +from agent_ext.subagents.toolset import SubAgentDeps, create_subagent_toolset +from agent_ext.todo.pai_toolset import TodoDeps, create_todo_toolset + + +class TestRLMToolset: + def test_creates_function_toolset(self): + ts = create_rlm_toolset() + assert isinstance(ts, FunctionToolset) + + def test_custom_timeout(self): + ts = create_rlm_toolset(code_timeout=120.0) + assert isinstance(ts, FunctionToolset) + + def test_with_sub_model(self): + ts = create_rlm_toolset(sub_model="openai:gpt-4o-mini") + assert isinstance(ts, FunctionToolset) + + def test_cleanup(self): + cleanup_repl_environments() # Should not raise + + +class TestDatabaseToolset: + def test_creates_function_toolset(self): + ts = create_database_toolset() + assert isinstance(ts, FunctionToolset) + + def test_with_id(self): + ts = create_database_toolset(toolset_id="my_db") + assert isinstance(ts, FunctionToolset) + + def test_deps_model(self): + deps = SQLDatabaseDeps(database=None, read_only=True, max_rows=50) + assert deps.read_only is True + assert deps.max_rows == 50 + + +class TestConsoleToolset: + def test_creates_function_toolset(self): + ts = create_console_toolset() + assert isinstance(ts, FunctionToolset) + + def test_deps_model(self): + from agent_ext.backends import StateBackend + + backend = StateBackend() + deps = ConsoleDeps(backend=backend, exec_enabled=False) + assert deps.exec_enabled is False + + +class TestSubagentToolset: + def test_creates_function_toolset(self): + ts = 
create_subagent_toolset() + assert isinstance(ts, FunctionToolset) + + def test_with_configs(self): + """Config compilation requires valid model — test with env var or skip.""" + import os + + from agent_ext.subagents import SubAgentConfig + + configs = [ + SubAgentConfig(name="helper", description="Helps", instructions="Be helpful"), + ] + if not os.environ.get("OPENAI_API_KEY"): + # Can't compile agents without API key; test factory without configs + ts = create_subagent_toolset(configs=[]) + assert isinstance(ts, FunctionToolset) + else: + ts = create_subagent_toolset(configs=configs) + assert isinstance(ts, FunctionToolset) + + def test_deps_model(self): + deps = SubAgentDeps(default_model="openai:gpt-4o-mini") + assert deps.default_model == "openai:gpt-4o-mini" + + +class TestTodoToolset: + def test_creates_function_toolset(self): + ts = create_todo_toolset() + assert isinstance(ts, FunctionToolset) + + def test_deps_model(self): + deps = TodoDeps(store=None, case_id="case-1") + assert deps.case_id == "case-1" diff --git a/tests/test_worktrees.py b/tests/test_worktrees.py new file mode 100644 index 0000000..d360bf8 --- /dev/null +++ b/tests/test_worktrees.py @@ -0,0 +1,116 @@ +"""Tests for agent_ext.workbench.worktrees — worktree create, diff, cleanup.""" + +from __future__ import annotations + +import pytest + +from agent_ext.workbench.worktrees import ( + _run, + cleanup_worktree, + create_worktree, + worktree_diff, +) + + +def _ensure_main_repo_clean(): + """Check we're in a git repo (the workspace itself).""" + ok, _ = _run(["git", "rev-parse", "--is-inside-work-tree"]) + return ok + + +@pytest.fixture(autouse=True) +def _skip_if_not_git(): + """Skip worktree tests if we're not inside a git repo.""" + if not _ensure_main_repo_clean(): + pytest.skip("Not inside a git repo — worktree tests require git") + + +class TestWorktreeCreateAndCleanup: + def test_create_worktree(self): + """Creating a worktree should produce a valid WorktreeHandle with an existing 
directory.""" + wt = create_worktree(run_id="test_create_001", agent_name="test_agent") + try: + assert wt.path.exists() + assert wt.path.is_dir() + assert wt.run_id == "test_create_001" + assert wt.agent_name == "test_agent" + # Verify it's actually a git worktree + ok, out = _run(["git", "rev-parse", "--is-inside-work-tree"], cwd=wt.path) + assert ok + finally: + cleanup_worktree(wt, prune_branch=True) + + def test_cleanup_removes_directory(self): + """Cleanup should remove the worktree directory.""" + wt = create_worktree(run_id="test_cleanup_001", agent_name="test_agent") + path = wt.path + assert path.exists() + cleanup_worktree(wt, prune_branch=True) + assert not path.exists() + + +class TestWorktreeDiff: + def test_diff_captures_edits(self): + """Editing an existing file in the worktree should appear in the diff.""" + wt = create_worktree(run_id="test_diff_edit_001", agent_name="test_agent") + try: + # Find an existing python file to modify + py_files = list(wt.path.rglob("*.py")) + if not py_files: + pytest.skip("No .py files in worktree") + target = py_files[0] + original = target.read_text(encoding="utf-8") + target.write_text(original + "\n# test edit marker\n", encoding="utf-8") + + diff = worktree_diff(wt) + assert "# test edit marker" in diff + assert "+" in diff # Should have added lines + finally: + cleanup_worktree(wt, prune_branch=True) + + def test_diff_captures_new_files(self): + """New files created in the worktree should appear in the diff.""" + wt = create_worktree(run_id="test_diff_new_001", agent_name="test_agent") + try: + new_file = wt.path / "test_brand_new_file.py" + new_file.write_text("# Brand new file\ndef hello():\n return 42\n", encoding="utf-8") + + diff = worktree_diff(wt) + assert "test_brand_new_file.py" in diff + assert "+# Brand new file" in diff + assert "+def hello():" in diff + assert "new file" in diff # git diff should mark it as new + finally: + cleanup_worktree(wt, prune_branch=True) + + def 
test_diff_empty_when_no_changes(self): + """No changes should produce an empty diff.""" + wt = create_worktree(run_id="test_diff_empty_001", agent_name="test_agent") + try: + diff = worktree_diff(wt) + assert diff.strip() == "" + finally: + cleanup_worktree(wt, prune_branch=True) + + def test_diff_captures_both_edits_and_new_files(self): + """Mixed changes (edits + new files) should all appear in the diff.""" + wt = create_worktree(run_id="test_diff_mixed_001", agent_name="test_agent") + try: + # Edit existing + py_files = list(wt.path.rglob("*.py")) + if not py_files: + pytest.skip("No .py files in worktree") + target = py_files[0] + original = target.read_text(encoding="utf-8") + target.write_text(original + "\n# mixed edit marker\n", encoding="utf-8") + + # Create new + new_file = wt.path / "test_mixed_new.py" + new_file.write_text("# mixed new file\n", encoding="utf-8") + + diff = worktree_diff(wt) + assert "# mixed edit marker" in diff + assert "test_mixed_new.py" in diff + assert "+# mixed new file" in diff + finally: + cleanup_worktree(wt, prune_branch=True) diff --git a/uv.lock b/uv.lock index ef0cdb4..8419add 100644 --- a/uv.lock +++ b/uv.lock @@ -16,21 +16,25 @@ wheels = [ [[package]] name = "agent-patterns" -version = "0.1.0" -source = { virtual = "." } +source = { editable = "." 
} dependencies = [ { name = "asyncpg" }, { name = "httpx" }, { name = "pdf2img" }, { name = "pydantic" }, + { name = "pydantic-ai" }, { name = "python-docx" }, + { name = "python-dotenv" }, { name = "python-pptx" }, { name = "reportlab" }, + { name = "rich" }, ] -[package.optional-dependencies] -agent = [ - { name = "pydantic-ai" }, +[package.dev-dependencies] +dev = [ + { name = "pytest" }, + { name = "pytest-asyncio" }, + { name = "ruff" }, ] [package.metadata] @@ -39,12 +43,21 @@ requires-dist = [ { name = "httpx", specifier = ">=0.28.1" }, { name = "pdf2img", specifier = ">=0.1.2" }, { name = "pydantic", specifier = ">=2.12.5" }, - { name = "pydantic-ai", marker = "extra == 'agent'", specifier = ">=1.60.0" }, + { name = "pydantic-ai", specifier = ">=1.60.0" }, { name = "python-docx", specifier = ">=1.2.0" }, + { name = "python-dotenv", specifier = ">=1.0.0" }, + { name = "python-dotenv", specifier = ">=1.2.1" }, { name = "python-pptx", specifier = ">=1.0.2" }, { name = "reportlab", specifier = ">=4.4.10" }, + { name = "rich", specifier = ">=14.3.2" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "pytest", specifier = ">=9.0.2" }, + { name = "pytest-asyncio", specifier = ">=1.3.0" }, + { name = "ruff", specifier = ">=0.11.0" }, ] -provides-extras = ["agent"] [[package]] name = "aiohappyeyeballs" @@ -1124,6 +1137,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, ] +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = 
"sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + [[package]] name = "invoke" version = "2.2.1" @@ -1938,6 +1960,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/48/31/05e764397056194206169869b50cf2fee4dbbbc71b344705b9c0d878d4d8/platformdirs-4.9.2-py3-none-any.whl", hash = "sha256:9170634f126f8efdae22fb58ae8a0eaa86f38365bc57897a6c4f781d1f5875bd", size = 21168, upload-time = "2026-02-16T03:56:08.891Z" }, ] +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + [[package]] name = "prometheus-client" version = "0.24.1" @@ -2415,6 +2446,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/df/80/fc9d01d5ed37ba4c42ca2b55b4339ae6e200b456be3a1aaddf4a9fa99b8c/pyperclip-1.11.0-py3-none-any.whl", hash = "sha256:299403e9ff44581cb9ba2ffeed69c7aa96a008622ad0c46cb575ca75b5b84273", size = 11063, upload-time = "2025-09-26T14:40:36.069Z" }, ] +[[package]] +name = "pytest" +version = "9.0.2" +source = { registry = "https://pypi.org/simple" } 
+dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, +] + +[[package]] +name = "pytest-asyncio" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/90/2c/8af215c0f776415f3590cac4f9086ccefd6fd463befeae41cd4d3f193e5a/pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5", size = 50087, upload-time = "2025-11-10T16:07:47.256Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5", size = 15075, upload-time = "2025-11-10T16:07:45.537Z" }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -2820,6 +2880,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" }, ] 
+[[package]] +name = "ruff" +version = "0.15.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/da/31/d6e536cdebb6568ae75a7f00e4b4819ae0ad2640c3604c305a0428680b0c/ruff-0.15.4.tar.gz", hash = "sha256:3412195319e42d634470cc97aa9803d07e9d5c9223b99bcb1518f0c725f26ae1", size = 4569550, upload-time = "2026-02-26T20:04:14.959Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f2/82/c11a03cfec3a4d26a0ea1e571f0f44be5993b923f905eeddfc397c13d360/ruff-0.15.4-py3-none-linux_armv6l.whl", hash = "sha256:a1810931c41606c686bae8b5b9a8072adac2f611bb433c0ba476acba17a332e0", size = 10453333, upload-time = "2026-02-26T20:04:20.093Z" }, + { url = "https://files.pythonhosted.org/packages/ce/5d/6a1f271f6e31dffb31855996493641edc3eef8077b883eaf007a2f1c2976/ruff-0.15.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:5a1632c66672b8b4d3e1d1782859e98d6e0b4e70829530666644286600a33992", size = 10853356, upload-time = "2026-02-26T20:04:05.808Z" }, + { url = "https://files.pythonhosted.org/packages/b1/d8/0fab9f8842b83b1a9c2bf81b85063f65e93fb512e60effa95b0be49bfc54/ruff-0.15.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a4386ba2cd6c0f4ff75252845906acc7c7c8e1ac567b7bc3d373686ac8c222ba", size = 10187434, upload-time = "2026-02-26T20:03:54.656Z" }, + { url = "https://files.pythonhosted.org/packages/85/cc/cc220fd9394eff5db8d94dec199eec56dd6c9f3651d8869d024867a91030/ruff-0.15.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2496488bdfd3732747558b6f95ae427ff066d1fcd054daf75f5a50674411e75", size = 10535456, upload-time = "2026-02-26T20:03:52.738Z" }, + { url = "https://files.pythonhosted.org/packages/fa/0f/bced38fa5cf24373ec767713c8e4cadc90247f3863605fb030e597878661/ruff-0.15.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3f1c4893841ff2d54cbda1b2860fa3260173df5ddd7b95d370186f8a5e66a4ac", size = 10287772, upload-time = "2026-02-26T20:04:08.138Z" }, + { url = 
"https://files.pythonhosted.org/packages/2b/90/58a1802d84fed15f8f281925b21ab3cecd813bde52a8ca033a4de8ab0e7a/ruff-0.15.4-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:820b8766bd65503b6c30aaa6331e8ef3a6e564f7999c844e9a547c40179e440a", size = 11049051, upload-time = "2026-02-26T20:04:03.53Z" }, + { url = "https://files.pythonhosted.org/packages/d2/ac/b7ad36703c35f3866584564dc15f12f91cb1a26a897dc2fd13d7cb3ae1af/ruff-0.15.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9fb74bab47139c1751f900f857fa503987253c3ef89129b24ed375e72873e85", size = 11890494, upload-time = "2026-02-26T20:04:10.497Z" }, + { url = "https://files.pythonhosted.org/packages/93/3d/3eb2f47a39a8b0da99faf9c54d3eb24720add1e886a5309d4d1be73a6380/ruff-0.15.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f80c98765949c518142b3a50a5db89343aa90f2c2bf7799de9986498ae6176db", size = 11326221, upload-time = "2026-02-26T20:04:12.84Z" }, + { url = "https://files.pythonhosted.org/packages/ff/90/bf134f4c1e5243e62690e09d63c55df948a74084c8ac3e48a88468314da6/ruff-0.15.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:451a2e224151729b3b6c9ffb36aed9091b2996fe4bdbd11f47e27d8f2e8888ec", size = 11168459, upload-time = "2026-02-26T20:04:00.969Z" }, + { url = "https://files.pythonhosted.org/packages/b5/e5/a64d27688789b06b5d55162aafc32059bb8c989c61a5139a36e1368285eb/ruff-0.15.4-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:a8f157f2e583c513c4f5f896163a93198297371f34c04220daf40d133fdd4f7f", size = 11104366, upload-time = "2026-02-26T20:03:48.099Z" }, + { url = "https://files.pythonhosted.org/packages/f1/f6/32d1dcb66a2559763fc3027bdd65836cad9eb09d90f2ed6a63d8e9252b02/ruff-0.15.4-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:917cc68503357021f541e69b35361c99387cdbbf99bd0ea4aa6f28ca99ff5338", size = 10510887, upload-time = "2026-02-26T20:03:45.771Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/92/22d1ced50971c5b6433aed166fcef8c9343f567a94cf2b9d9089f6aa80fe/ruff-0.15.4-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:e9737c8161da79fd7cfec19f1e35620375bd8b2a50c3e77fa3d2c16f574105cc", size = 10285939, upload-time = "2026-02-26T20:04:22.42Z" }, + { url = "https://files.pythonhosted.org/packages/e6/f4/7c20aec3143837641a02509a4668fb146a642fd1211846634edc17eb5563/ruff-0.15.4-py3-none-musllinux_1_2_i686.whl", hash = "sha256:291258c917539e18f6ba40482fe31d6f5ac023994ee11d7bdafd716f2aab8a68", size = 10765471, upload-time = "2026-02-26T20:03:58.924Z" }, + { url = "https://files.pythonhosted.org/packages/d0/09/6d2f7586f09a16120aebdff8f64d962d7c4348313c77ebb29c566cefc357/ruff-0.15.4-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:3f83c45911da6f2cd5936c436cf86b9f09f09165f033a99dcf7477e34041cbc3", size = 11263382, upload-time = "2026-02-26T20:04:24.424Z" }, + { url = "https://files.pythonhosted.org/packages/1b/fa/2ef715a1cd329ef47c1a050e10dee91a9054b7ce2fcfdd6a06d139afb7ec/ruff-0.15.4-py3-none-win32.whl", hash = "sha256:65594a2d557d4ee9f02834fcdf0a28daa8b3b9f6cb2cb93846025a36db47ef22", size = 10506664, upload-time = "2026-02-26T20:03:50.56Z" }, + { url = "https://files.pythonhosted.org/packages/d0/a8/c688ef7e29983976820d18710f955751d9f4d4eb69df658af3d006e2ba3e/ruff-0.15.4-py3-none-win_amd64.whl", hash = "sha256:04196ad44f0df220c2ece5b0e959c2f37c777375ec744397d21d15b50a75264f", size = 11651048, upload-time = "2026-02-26T20:04:17.191Z" }, + { url = "https://files.pythonhosted.org/packages/3e/0a/9e1be9035b37448ce2e68c978f0591da94389ade5a5abafa4cf99985d1b2/ruff-0.15.4-py3-none-win_arm64.whl", hash = "sha256:60d5177e8cfc70e51b9c5fad936c634872a74209f934c1e79107d11787ad5453", size = 10966776, upload-time = "2026-02-26T20:03:56.908Z" }, +] + [[package]] name = "s3transfer" version = "0.16.0"