Changes from all commits (21 commits)
99057fd
feat!: improve consistency of post-training API endpoints
stainless-app[bot] Feb 5, 2026
612291e
chore(internal): bump dependencies
stainless-app[bot] Feb 10, 2026
a712f33
codegen metadata
stainless-app[bot] Feb 11, 2026
17a2705
chore(api): minor updates
stainless-app[bot] Feb 11, 2026
ea58fd8
fix(vector_io): align Protocol signatures with request models
stainless-app[bot] Feb 11, 2026
a0f6975
chore(internal): fix lint error on Python 3.14
stainless-app[bot] Feb 12, 2026
7501365
feat: Add truncation parameter support
stainless-app[bot] Feb 12, 2026
6b45699
feat: Add prompt_cache_key parameter support
stainless-app[bot] Feb 12, 2026
0e3e262
chore: format all `api.md` files
stainless-app[bot] Feb 13, 2026
e58e2e4
fix(inference): use flat response message model for chat/completions
stainless-app[bot] Feb 13, 2026
784dfa2
codegen metadata
stainless-app[bot] Feb 13, 2026
3974d5d
fix: align chat completion usage schema with OpenAI spec
stainless-app[bot] Feb 18, 2026
c0bea05
codegen metadata
stainless-app[bot] Feb 19, 2026
4f8bf45
feat: add support for /responses background parameter
stainless-app[bot] Feb 19, 2026
89ec5a7
feat(vector_io): Implement Contextual Retrieval for improved RAG search quality
stainless-app[bot] Feb 19, 2026
92cb087
chore: update mock server docs
stainless-app[bot] Feb 19, 2026
2ecc682
chore(internal): add request options to SSE classes
stainless-app[bot] Feb 23, 2026
6bc2bb4
chore(internal): make `test_proxy_environment_variables` more resilient
stainless-app[bot] Feb 23, 2026
23e3b9f
feat: add top_p parameter support to responses API
stainless-app[bot] Feb 24, 2026
44bbae1
chore(internal): make `test_proxy_environment_variables` more resilient to env
stainless-app[bot] Feb 24, 2026
ecbdfe1
release: 0.5.0-alpha.3
stainless-app[bot] Feb 24, 2026
2 changes: 1 addition & 1 deletion .release-please-manifest.json
@@ -1,3 +1,3 @@
{
".": "0.5.0-alpha.2"
".": "0.5.0-alpha.3"
}
6 changes: 3 additions & 3 deletions .stats.yml
@@ -1,4 +1,4 @@
configured_endpoints: 108
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-958e990011d6b4c27513743a151ec4c80c3103650a80027380d15f1d6b108e32.yml
openapi_spec_hash: 5b49d825dbc2a26726ca752914a65114
config_hash: 19b84a0a93d566334ae134dafc71991f
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-41c9203afe93dd495edeaad0f9494065f2ae7de5d7148207531eeddf9ed4f11e.yml
openapi_spec_hash: c47e69115bbf13bdde86a076088fdad9
config_hash: 6aa61d4143c3e3df785972c0287d1370
37 changes: 37 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,42 @@
# Changelog

## 0.5.0-alpha.3 (2026-02-24)

Full Changelog: [v0.5.0-alpha.2...v0.5.0-alpha.3](https://github.com/llamastack/llama-stack-client-python/compare/v0.5.0-alpha.2...v0.5.0-alpha.3)

### ⚠ BREAKING CHANGES

* improve consistency of post-training API endpoints

### Features

* Add prompt_cache_key parameter support ([6b45699](https://github.com/llamastack/llama-stack-client-python/commit/6b45699185d934a5f8395c5cc3046f6c5aceb770))
* add support for /responses background parameter ([4f8bf45](https://github.com/llamastack/llama-stack-client-python/commit/4f8bf4526e529a74b9c53cac6df8e4beb2808d60))
* add top_p parameter support to responses API ([23e3b9f](https://github.com/llamastack/llama-stack-client-python/commit/23e3b9fcf7a23378c200604d0f57dc5a9e6a8527))
* Add truncation parameter support ([7501365](https://github.com/llamastack/llama-stack-client-python/commit/7501365fe89795e87accfb6b1f2329da25d0efeb))
* improve consistency of post-training API endpoints ([99057fd](https://github.com/llamastack/llama-stack-client-python/commit/99057fdc74bafdf54479674ba75b447cd4681cb6))
* **vector_io:** Implement Contextual Retrieval for improved RAG search quality ([89ec5a7](https://github.com/llamastack/llama-stack-client-python/commit/89ec5a7bf405e688bd404877e49ab1ee9b49bf7e))


### Bug Fixes

* align chat completion usage schema with OpenAI spec ([3974d5d](https://github.com/llamastack/llama-stack-client-python/commit/3974d5db8270e2548d0cdd54204c1603ca7a84a8))
* Enabled models list works ([#314](https://github.com/llamastack/llama-stack-client-python/issues/314)) ([acd5e64](https://github.com/llamastack/llama-stack-client-python/commit/acd5e64a9e82083192a31f85f9c810291cabcadb))
* **inference:** use flat response message model for chat/completions ([e58e2e4](https://github.com/llamastack/llama-stack-client-python/commit/e58e2e4dee9c9bbb72e4903e30f169991d10e545))
* **vector_io:** align Protocol signatures with request models ([ea58fd8](https://github.com/llamastack/llama-stack-client-python/commit/ea58fd88201ef59e580443688100cafe45f305c0))


### Chores

* **api:** minor updates ([17a2705](https://github.com/llamastack/llama-stack-client-python/commit/17a270528b503591de15f9e9fcbc378007b75eda))
* format all `api.md` files ([0e3e262](https://github.com/llamastack/llama-stack-client-python/commit/0e3e2626081ca9268297742990368c7ed6493b40))
* **internal:** add request options to SSE classes ([2ecc682](https://github.com/llamastack/llama-stack-client-python/commit/2ecc682c1fccc86c643ad3da40e5134352745525))
* **internal:** bump dependencies ([612291e](https://github.com/llamastack/llama-stack-client-python/commit/612291e2142b710cdd643af16bbe83e514f7a44e))
* **internal:** fix lint error on Python 3.14 ([a0f6975](https://github.com/llamastack/llama-stack-client-python/commit/a0f69750827b016bb27a52bdd77fcbbacd311020))
* **internal:** make `test_proxy_environment_variables` more resilient ([6bc2bb4](https://github.com/llamastack/llama-stack-client-python/commit/6bc2bb4e81b16d23e20090f45dbd8a53a63c158d))
* **internal:** make `test_proxy_environment_variables` more resilient to env ([44bbae1](https://github.com/llamastack/llama-stack-client-python/commit/44bbae12bb8b4f72d1fb50db29bedd69f30340b7))
* update mock server docs ([92cb087](https://github.com/llamastack/llama-stack-client-python/commit/92cb087355ffa1fd50e3a35b8e888853784c9fe9))

## 0.5.0-alpha.2 (2026-02-05)

Full Changelog: [v0.5.0-alpha.1...v0.5.0-alpha.2](https://github.com/llamastack/llama-stack-client-python/compare/v0.5.0-alpha.1...v0.5.0-alpha.2)
3 changes: 1 addition & 2 deletions CONTRIBUTING.md
@@ -88,8 +88,7 @@ $ pip install ./path-to-wheel-file.whl
Most tests require you to [set up a mock server](https://github.com/stoplightio/prism) against the OpenAPI spec to run the tests.

```sh
# you will need npm installed
$ npx prism mock path/to/your/openapi.yml
$ ./scripts/mock
```
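
(`./scripts/mock` presumably wraps the same Prism invocation against the repository's pinned OpenAPI spec, so the spec path no longer needs to be passed by hand; npm is still assumed to be available.)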

```sh
6 changes: 3 additions & 3 deletions api.md
@@ -474,9 +474,9 @@ from llama_stack_client.types.alpha.post_training import (
Methods:

- <code title="get /v1alpha/post-training/jobs">client.alpha.post_training.job.<a href="./src/llama_stack_client/resources/alpha/post_training/job.py">list</a>() -> <a href="./src/llama_stack_client/types/alpha/post_training/job_list_response.py">JobListResponse</a></code>
- <code title="get /v1alpha/post-training/job/artifacts">client.alpha.post_training.job.<a href="./src/llama_stack_client/resources/alpha/post_training/job.py">artifacts</a>() -> <a href="./src/llama_stack_client/types/alpha/post_training/job_artifacts_response.py">JobArtifactsResponse</a></code>
- <code title="post /v1alpha/post-training/job/cancel">client.alpha.post_training.job.<a href="./src/llama_stack_client/resources/alpha/post_training/job.py">cancel</a>() -> None</code>
- <code title="get /v1alpha/post-training/job/status">client.alpha.post_training.job.<a href="./src/llama_stack_client/resources/alpha/post_training/job.py">status</a>() -> <a href="./src/llama_stack_client/types/alpha/post_training/job_status_response.py">JobStatusResponse</a></code>
- <code title="get /v1alpha/post-training/jobs/{job_uuid}/artifacts">client.alpha.post_training.job.<a href="./src/llama_stack_client/resources/alpha/post_training/job.py">artifacts</a>(job_uuid) -> <a href="./src/llama_stack_client/types/alpha/post_training/job_artifacts_response.py">JobArtifactsResponse</a></code>
- <code title="post /v1alpha/post-training/jobs/{job_uuid}/cancel">client.alpha.post_training.job.<a href="./src/llama_stack_client/resources/alpha/post_training/job.py">cancel</a>(job_uuid) -> None</code>
- <code title="get /v1alpha/post-training/jobs/{job_uuid}/status">client.alpha.post_training.job.<a href="./src/llama_stack_client/resources/alpha/post_training/job.py">status</a>(job_uuid) -> <a href="./src/llama_stack_client/types/alpha/post_training/job_status_response.py">JobStatusResponse</a></code>

## Benchmarks

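
With this change the job endpoints take the job UUID as a path parameter, so each call requires it explicitly. A sketch of the updated call shape — the UUID is a placeholder, and only the method signatures come from the diff above:

```python
# Sketch of the job_uuid-based post-training job methods shown above.
# The UUID value is a placeholder; response field access is omitted
# since this diff only shows the signatures.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

jobs = client.alpha.post_training.job.list()  # GET /v1alpha/post-training/jobs
job_uuid = "my-job-uuid"                      # placeholder

status = client.alpha.post_training.job.status(job_uuid=job_uuid)
artifacts = client.alpha.post_training.job.artifacts(job_uuid=job_uuid)
client.alpha.post_training.job.cancel(job_uuid=job_uuid)  # returns None
```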
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "llama_stack_client"
version = "0.5.0-alpha.2"
version = "0.5.0-alpha.3"
description = "The official Python library for the llama-stack-client API"
dynamic = ["readme"]
license = "MIT"
24 changes: 12 additions & 12 deletions requirements-dev.lock
@@ -3,12 +3,12 @@
-e .
annotated-types==0.7.0
# via pydantic
anyio==4.12.0
anyio==4.12.1
# via
# httpx
# llama-stack-client
black==26.1.0
certifi==2025.11.12
certifi==2026.1.4
# via
# httpcore
# httpx
@@ -52,7 +52,7 @@ idna==3.11
# anyio
# httpx
# requests
importlib-metadata==8.7.0
importlib-metadata==8.7.1
iniconfig==2.3.0
# via pytest
markdown-it-py==4.0.0
@@ -64,11 +64,11 @@ mypy-extensions==1.1.0
# via
# black
# mypy
nodeenv==1.9.1
nodeenv==1.10.0
# via
# pre-commit
# pyright
numpy==2.4.1
numpy==2.4.2
# via pandas
packaging==25.0
# via
@@ -89,7 +89,7 @@ pluggy==1.6.0
pre-commit==4.5.1
prompt-toolkit==3.0.52
# via llama-stack-client
pyaml==25.7.0
pyaml==26.2.1
# via llama-stack-client
pydantic==2.12.5
# via llama-stack-client
@@ -100,15 +100,15 @@ pygments==2.19.2
# pytest
# rich
pyright==1.1.399
pytest==9.0.1
pytest==9.0.2
# via
# pytest-asyncio
# pytest-xdist
pytest-asyncio==1.3.0
pytest-xdist==3.8.0
python-dateutil==2.9.0.post0
# via pandas
pytokens==0.4.0
pytokens==0.4.1
# via black
pyyaml==6.0.3
# via
@@ -119,7 +119,7 @@ requests==2.32.5
respx==0.22.0
rich==14.2.0
# via llama-stack-client
ruff==0.14.7
ruff==0.14.13
six==1.17.0
# via python-dateutil
sniffio==1.3.1
@@ -128,8 +128,8 @@ termcolor==3.3.0
# via
# fire
# llama-stack-client
time-machine==3.1.0
tqdm==4.67.1
time-machine==3.2.0
tqdm==4.67.3
# via llama-stack-client
typing-extensions==4.15.0
# via
@@ -149,7 +149,7 @@ urllib3==2.6.3
# via requests
virtualenv==20.36.1
# via pre-commit
wcwidth==0.3.1
wcwidth==0.6.0
# via prompt-toolkit
zipp==3.23.0
# via importlib-metadata
2 changes: 1 addition & 1 deletion scripts/format
@@ -11,4 +11,4 @@ uv run ruff check --fix .
uv run ruff format

echo "==> Formatting docs"
uv run python scripts/utils/ruffen-docs.py README.md api.md
uv run python scripts/utils/ruffen-docs.py README.md $(find . -type f -name api.md)
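
Net effect of the `find` substitution: the docs-formatting step now runs `ruffen-docs` over every `api.md` in the repository rather than only the root copy, which is what the "format all `api.md` files" chore in this release refers to.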
3 changes: 3 additions & 0 deletions src/llama_stack_client/_response.py
@@ -158,6 +158,7 @@ def _parse(self, *, to: type[_T] | None = None) -> R | _T:
),
response=self.http_response,
client=cast(Any, self._client),
options=self._options,
),
)

@@ -168,6 +169,7 @@ def _parse(self, *, to: type[_T] | None = None) -> R | _T:
cast_to=extract_stream_chunk_type(self._stream_cls),
response=self.http_response,
client=cast(Any, self._client),
options=self._options,
),
)

@@ -181,6 +183,7 @@ def _parse(self, *, to: type[_T] | None = None) -> R | _T:
cast_to=cast_to,
response=self.http_response,
client=cast(Any, self._client),
options=self._options,
),
)

11 changes: 8 additions & 3 deletions src/llama_stack_client/_streaming.py
@@ -10,7 +10,7 @@
import json
import inspect
from types import TracebackType
from typing import TYPE_CHECKING, Any, Generic, TypeVar, Iterator, AsyncIterator, cast
from typing import TYPE_CHECKING, Any, Generic, TypeVar, Iterator, Optional, AsyncIterator, cast
from typing_extensions import Self, Protocol, TypeGuard, override, get_origin, runtime_checkable

import httpx
Expand All @@ -19,6 +19,7 @@

if TYPE_CHECKING:
from ._client import LlamaStackClient, AsyncLlamaStackClient
from ._models import FinalRequestOptions


_T = TypeVar("_T")
@@ -28,7 +29,7 @@ class Stream(Generic[_T]):
"""Provides the core interface to iterate over a synchronous stream response."""

response: httpx.Response

_options: Optional[FinalRequestOptions] = None
_decoder: SSEBytesDecoder

def __init__(
@@ -37,10 +38,12 @@ def __init__(
cast_to: type[_T],
response: httpx.Response,
client: LlamaStackClient,
options: Optional[FinalRequestOptions] = None,
) -> None:
self.response = response
self._cast_to = cast_to
self._client = client
self._options = options
self._decoder = client._make_sse_decoder()
self._iterator = self.__stream__()

@@ -91,7 +94,7 @@ class AsyncStream(Generic[_T]):
"""Provides the core interface to iterate over an asynchronous stream response."""

response: httpx.Response

_options: Optional[FinalRequestOptions] = None
_decoder: SSEDecoder | SSEBytesDecoder

def __init__(
Expand All @@ -100,10 +103,12 @@ def __init__(
cast_to: type[_T],
response: httpx.Response,
client: AsyncLlamaStackClient,
options: Optional[FinalRequestOptions] = None,
) -> None:
self.response = response
self._cast_to = cast_to
self._client = client
self._options = options
self._decoder = client._make_sse_decoder()
self._iterator = self.__stream__()

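
For orientation, `Stream` and `AsyncStream` are the objects the client returns for SSE responses, and the new `options` field carries the originating `FinalRequestOptions` alongside the response. A consumption sketch, assuming the OpenAI-compatible chat/completions surface referenced elsewhere in this release — model and prompt are placeholders:

```python
# Consuming a synchronous Stream as returned for SSE endpoints.
# The endpoint, model, and chunk handling are illustrative assumptions.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

stream = client.chat.completions.create(
    model="llama3.2:3b",
    messages=[{"role": "user", "content": "Stream a haiku."}],
    stream=True,  # the SDK returns Stream[...] instead of a parsed response
)
for chunk in stream:  # iteration drives __stream__(), decoding SSE events
    print(chunk)
```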
2 changes: 1 addition & 1 deletion src/llama_stack_client/_utils/_compat.py
@@ -32,7 +32,7 @@ def is_union(tp: Optional[Type[Any]]) -> bool:
else:
import types

return tp is Union or tp is types.UnionType
return tp is Union or tp is types.UnionType # type: ignore[comparison-overlap]


def is_typeddict(tp: Type[Any]) -> bool:
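
The added `# type: ignore[comparison-overlap]` only silences the lint error that the Python 3.14 chore above addresses; runtime behavior is unchanged. For reference, a self-contained check of what `is_union` distinguishes, mirroring the function body with `get_origin` as callers typically use it:

```python
# Standalone check of the union-origin test (Python 3.10+ for PEP 604).
import types
from typing import Union, get_origin

def is_union(tp) -> bool:
    return tp is Union or tp is types.UnionType

assert is_union(get_origin(Union[int, str]))  # typing.Union origin
assert is_union(get_origin(int | str))        # PEP 604 -> types.UnionType
assert not is_union(get_origin(list[int]))    # origin is list, not a union
```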
2 changes: 1 addition & 1 deletion src/llama_stack_client/_version.py
@@ -7,4 +7,4 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

__title__ = "llama_stack_client"
__version__ = "0.5.0-alpha.2" # x-release-please-version
__version__ = "0.5.0-alpha.3" # x-release-please-version