diff --git a/docs.json b/docs.json index 27930c2..a224d34 100644 --- a/docs.json +++ b/docs.json @@ -433,6 +433,17 @@ "pages": [ "/phala-cloud/confidential-ai/confidential-model/confidential-ai-api", "/phala-cloud/confidential-ai/confidential-gpu/model-template", + { + "group": "API Reference", + "pages": [ + "/phala-cloud/confidential-ai/confidential-model/api-reference/chat-completions", + "/phala-cloud/confidential-ai/confidential-model/api-reference/models", + "/phala-cloud/confidential-ai/confidential-model/api-reference/attestation", + "/phala-cloud/confidential-ai/confidential-model/api-reference/signature", + "/phala-cloud/confidential-ai/confidential-model/api-reference/embeddings", + "/phala-cloud/confidential-ai/confidential-model/api-reference/embedding-models" + ] + }, "/phala-cloud/confidential-ai/confidential-model/tool-calling", "/phala-cloud/confidential-ai/confidential-model/images-and-vision", "/phala-cloud/confidential-ai/confidential-model/structured-output", @@ -2122,4 +2133,4 @@ "thumbnails": { "background": "/images/phala-docs-og.png" } -} \ No newline at end of file +} diff --git a/phala-cloud/confidential-ai/confidential-model/api-reference/attestation.mdx b/phala-cloud/confidential-ai/confidential-model/api-reference/attestation.mdx new file mode 100644 index 0000000..618dfde --- /dev/null +++ b/phala-cloud/confidential-ai/confidential-model/api-reference/attestation.mdx @@ -0,0 +1,165 @@ +--- +title: Attestation Report +description: Fetch TEE attestation evidence for a Confidential AI model. +--- + +## Endpoint + +```bash +GET https://api.redpill.ai/v1/attestation/report?model={model_id}&nonce={nonce}&signing_address={address} +``` + +The attestation report proves a model endpoint is backed by TEE hardware and provides the evidence needed for hardware, software, and signer binding checks. + + +Always include a fresh random `nonce` when fetching attestations for security-sensitive verification. 
A nonce prevents replay of an older valid attestation. + + +## Parameters + + + Model ID to attest. + + Examples: `phala/qwen3.5-27b`, `phala/qwen-2.5-7b-instruct`, `openai/gpt-oss-120b`, `z-ai/glm-5`. + + + + Fresh 32-byte random value encoded as 64 hex characters. The nonce is embedded in the TEE report data. + + + + Ethereum address or public key used to filter attestations in multi-server deployments. Use this when binding a response signature to a specific TEE signer. + + +## Examples + + +```bash cURL +NONCE=$(openssl rand -hex 32) + +curl "https://api.redpill.ai/v1/attestation/report?model=phala/qwen3.5-27b&nonce=$NONCE" \ + -H "Authorization: Bearer " +``` + +```python Python +import secrets +import requests + +nonce = secrets.token_hex(32) + +response = requests.get( + "https://api.redpill.ai/v1/attestation/report", + params={ + "model": "phala/qwen3.5-27b", + "nonce": nonce, + }, + headers={"Authorization": "Bearer "}, +) + +attestation = response.json() +``` + + +## Response Formats + +The response format depends on the provider behind the model. + +### Phala / NearAI Two-Layer Format + +Models may return separate gateway and model attestations: + +```json +{ + "gateway_attestation": { + "signing_address": "0x...", + "signing_algo": "ecdsa", + "intel_quote": "hex-encoded-tdx-quote", + "event_log": [], + "report_data": "...", + "request_nonce": "...", + "info": { + "vm_config": "..." + } + }, + "model_attestations": [ + { + "model_name": "phala/qwen3.5-27b", + "signing_address": "0x...", + "signing_algo": "ecdsa", + "intel_quote": "hex-encoded-tdx-quote", + "nvidia_payload": "{...json gpu attestation...}", + "event_log": [], + "info": { + "tcb_info": "{...app_compose...}", + "vm_config": "..." 
+ } + } + ] +} +``` + +### Chutes Format + +Some models return Chutes-style instance attestations: + +```json +{ + "attestation_type": "chutes", + "nonce": "...", + "all_attestations": [ + { + "instance_id": "uuid", + "nonce": "...", + "intel_quote": "base64-encoded-tdx-quote", + "gpu_evidence": [ + { "certificate": "...", "evidence": "...", "arch": "HOPPER" } + ], + "e2e_pubkey": "..." + } + ] +} +``` + +### Flat Format + +Older Phala-native responses may expose fields at the top level: + +```json +{ + "signing_address": "0x...", + "signing_algo": "ecdsa", + "request_nonce": "...", + "intel_quote": "hex-encoded-tdx-quote", + "nvidia_payload": "{...}", + "info": { + "tcb_info": "{\"app_compose\":\"...\"}" + } +} +``` + +## Important Fields + +| Field | Description | +|-------|-------------| +| `signing_address` | Address or key used by the TEE to sign responses | +| `signing_algo` | Signature algorithm, commonly `ecdsa` | +| `request_nonce` / `nonce` | Nonce included in the attestation | +| `intel_quote` | Intel TDX quote for CPU TEE verification | +| `nvidia_payload` | NVIDIA GPU attestation payload | +| `event_log` | Boot event log for software stack verification | +| `info.vm_config` | VM configuration evidence | +| `info.tcb_info.app_compose` | Docker Compose application evidence | +| `gateway_attestation` | Gateway TEE attestation | +| `model_attestations` | One or more model runtime attestations | +| `all_attestations` | Provider-specific list of model instance attestations | + +## Verification Flow + +1. Generate a fresh nonce. +2. Fetch an attestation report for the exact model. +3. Verify the Intel TDX quote. +4. Verify GPU evidence when `nvidia_payload` or `gpu_evidence` is present. +5. Confirm the report data binds the nonce and expected signing address. +6. Verify application measurements such as compose hash and image provenance when available. + +For a walkthrough, see [Verify Attestation](/phala-cloud/confidential-ai/verify/verify-attestation). 
diff --git a/phala-cloud/confidential-ai/confidential-model/api-reference/chat-completions.mdx b/phala-cloud/confidential-ai/confidential-model/api-reference/chat-completions.mdx new file mode 100644 index 0000000..c910d00 --- /dev/null +++ b/phala-cloud/confidential-ai/confidential-model/api-reference/chat-completions.mdx @@ -0,0 +1,157 @@ +--- +title: Chat Completions +description: Create OpenAI-compatible chat completion responses with Confidential AI models. +--- + +## Endpoint + +```bash +POST https://api.redpill.ai/v1/chat/completions +``` + +Creates a response for a chat conversation. Use the same OpenAI-compatible request shape you already use with the OpenAI SDK, then set the base URL to `https://api.redpill.ai/v1`. + +## Request Body + + + Model ID to use for completion. + + Examples: `phala/qwen3.5-27b`, `phala/gemma-3-27b-it`, `z-ai/glm-5`, `openai/gpt-oss-120b`. + + + + Conversation messages. Each message includes `role` and `content`. + + ```json + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Explain GPU TEE in one paragraph."} + ] + ``` + + + + Sampling temperature. Typical range is `0` to `2`. + + + + Maximum number of output tokens for most open models and GPU TEE models. + + + + Maximum output tokens for newer OpenAI reasoning models that do not accept `max_tokens`. + + + + Set to `true` to receive server-sent event chunks. + + + + Function/tool definitions that supported models can call. + + + + Controls whether the model may call tools. Common values are `auto`, `none`, or a specific tool selection object. + + + + Requests structured output from supported models, including JSON schema mode. 
+ + +## Examples + + +```bash cURL +curl https://api.redpill.ai/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer " \ + -d '{ + "model": "phala/qwen3.5-27b", + "messages": [ + {"role": "user", "content": "What privacy guarantees does GPU TEE provide?"} + ] + }' +``` + +```python Python +from openai import OpenAI + +client = OpenAI( + api_key="", + base_url="https://api.redpill.ai/v1", +) + +response = client.chat.completions.create( + model="phala/qwen3.5-27b", + messages=[ + {"role": "user", "content": "What privacy guarantees does GPU TEE provide?"} + ], +) + +print(response.choices[0].message.content) +``` + +```typescript TypeScript +import OpenAI from "openai"; + +const client = new OpenAI({ + apiKey: "", + baseURL: "https://api.redpill.ai/v1", +}); + +const response = await client.chat.completions.create({ + model: "phala/qwen3.5-27b", + messages: [ + { role: "user", content: "What privacy guarantees does GPU TEE provide?" }, + ], +}); + +console.log(response.choices[0].message.content); +``` + + +## Response + +```json +{ + "id": "chatcmpl-123", + "object": "chat.completion", + "created": 1677652288, + "model": "phala/qwen3.5-27b", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "GPU TEE protects inference by..." + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 16, + "completion_tokens": 48, + "total_tokens": 64 + } +} +``` + +The `id` field is the request ID. Use it with [Request Signature](/phala-cloud/confidential-ai/confidential-model/api-reference/signature) when you need cryptographic proof for this specific response. + +## Feature Notes + +- Streaming uses the same `stream: true` option as the OpenAI API. +- Vision models accept multimodal `content` arrays with `image_url` entries. +- Tool calling uses OpenAI-compatible `tools`, `tool_choice`, assistant `tool_calls`, and tool response messages. 
+- Structured output uses `response_format` on supported models. + +## Next Steps + + + + Discover available Confidential AI models and capabilities + + + Fetch the signature for a chat completion response + + diff --git a/phala-cloud/confidential-ai/confidential-model/api-reference/embedding-models.mdx b/phala-cloud/confidential-ai/confidential-model/api-reference/embedding-models.mdx new file mode 100644 index 0000000..e6bc521 --- /dev/null +++ b/phala-cloud/confidential-ai/confidential-model/api-reference/embedding-models.mdx @@ -0,0 +1,83 @@ +--- +title: List Embedding Models +description: List embedding-capable models available through the Confidential AI API. +--- + +## Endpoint + +```bash +GET https://api.redpill.ai/v1/embeddings/models +``` + +Returns models designed for vector embeddings. Use this endpoint when selecting embedding models for retrieval, RAG, clustering, or similarity search. + +## Examples + + +```bash cURL +curl https://api.redpill.ai/v1/embeddings/models \ + -H "Authorization: Bearer " +``` + +```python Python +import requests + +response = requests.get( + "https://api.redpill.ai/v1/embeddings/models", + headers={"Authorization": "Bearer "}, +) + +for model in response.json()["data"]: + print(model["id"]) +``` + +```typescript TypeScript +const response = await fetch("https://api.redpill.ai/v1/embeddings/models", { + headers: { + Authorization: "Bearer ", + }, +}); + +const models = await response.json(); +for (const model of models.data) { + console.log(model.id); +} +``` + + +## Response + +```json +{ + "object": "list", + "data": [ + { + "id": "qwen/qwen3-embedding-8b", + "name": "Qwen3 Embedding 8B", + "created": 1704067200, + "input_modalities": ["text"], + "output_modalities": ["embeddings"], + "context_length": 32768, + "max_output_length": 4096, + "pricing": { + "prompt": "0.00000001", + "completion": "0" + }, + "description": "Embedding model for semantic search and retrieval" + } + ] +} +``` + +## Response Fields + +| Field | 
Description | +|-------|-------------| +| `id` | Model identifier for `POST /v1/embeddings` | +| `name` | Human-readable model name | +| `input_modalities` | Input types accepted by the model | +| `output_modalities` | Output types produced by the model | +| `context_length` | Maximum input context length | +| `max_output_length` | Embedding dimensions or max output size | +| `pricing.prompt` | Input price per token | +| `description` | Model description and use case | diff --git a/phala-cloud/confidential-ai/confidential-model/api-reference/embeddings.mdx b/phala-cloud/confidential-ai/confidential-model/api-reference/embeddings.mdx new file mode 100644 index 0000000..9ae06fe --- /dev/null +++ b/phala-cloud/confidential-ai/confidential-model/api-reference/embeddings.mdx @@ -0,0 +1,108 @@ +--- +title: Embeddings +description: Create vector embeddings with OpenAI-compatible embedding models. +--- + +## Endpoint + +```bash +POST https://api.redpill.ai/v1/embeddings +``` + +Generate vector embeddings for retrieval, semantic search, clustering, and similarity workloads. + +## Request Body + + + Embedding model ID. + + Examples: `qwen/qwen3-embedding-8b`, `sentence-transformers/all-minilm-l6-v2`. + + + + Input text or list of inputs to embed. + + + + Embedding encoding format. Common values are `float` and `base64`. + + + + Requested output dimensions, when supported by the selected model. + + +## Examples + + +```bash cURL +curl https://api.redpill.ai/v1/embeddings \ + -H "Authorization: Bearer " \ + -H "Content-Type: application/json" \ + -d '{ + "model": "qwen/qwen3-embedding-8b", + "input": "Confidential AI keeps inference data private." 
+ }' +``` + +```python Python +from openai import OpenAI + +client = OpenAI( + api_key="", + base_url="https://api.redpill.ai/v1", +) + +response = client.embeddings.create( + model="qwen/qwen3-embedding-8b", + input="Confidential AI keeps inference data private.", +) + +vector = response.data[0].embedding +print(len(vector)) +``` + +```typescript TypeScript +import OpenAI from "openai"; + +const client = new OpenAI({ + apiKey: "", + baseURL: "https://api.redpill.ai/v1", +}); + +const response = await client.embeddings.create({ + model: "qwen/qwen3-embedding-8b", + input: "Confidential AI keeps inference data private.", +}); + +console.log(response.data[0].embedding.length); +``` + + +## Response + +```json +{ + "object": "list", + "data": [ + { + "object": "embedding", + "index": 0, + "embedding": [0.0023, -0.0015, 0.0042] + } + ], + "model": "qwen/qwen3-embedding-8b", + "usage": { + "prompt_tokens": 8, + "total_tokens": 8 + } +} +``` + +## Common Models + +| Model | Dimensions | Context | Notes | +|-------|------------|---------|-------| +| `qwen/qwen3-embedding-8b` | 4096 | 32K | Large confidential embedding model | +| `sentence-transformers/all-minilm-l6-v2` | 384 | 512 | Low-cost compact embedding model | + +Use [List Embedding Models](/phala-cloud/confidential-ai/confidential-model/api-reference/embedding-models) for the live embedding catalog. diff --git a/phala-cloud/confidential-ai/confidential-model/api-reference/models.mdx b/phala-cloud/confidential-ai/confidential-model/api-reference/models.mdx new file mode 100644 index 0000000..a5b75f6 --- /dev/null +++ b/phala-cloud/confidential-ai/confidential-model/api-reference/models.mdx @@ -0,0 +1,98 @@ +--- +title: List Models +description: List available Confidential AI models, providers, modalities, context windows, and pricing metadata. 
+--- + +## Endpoints + +```bash +GET https://api.redpill.ai/v1/models +GET https://api.redpill.ai/v1/models/phala +``` + +Use the live model catalog before hardcoding model IDs. The catalog returns model IDs, context windows, pricing, providers, modalities, and TEE metadata when available. + +## Examples + + +```bash All Models +curl https://api.redpill.ai/v1/models \ + -H "Authorization: Bearer " +``` + +```bash Phala Models +curl https://api.redpill.ai/v1/models/phala \ + -H "Authorization: Bearer " +``` + +```python Python +from openai import OpenAI + +client = OpenAI( + api_key="", + base_url="https://api.redpill.ai/v1", +) + +models = client.models.list() +for model in models.data: + print(model.id) +``` + + +## Response + +```json +{ + "data": [ + { + "id": "phala/qwen3.5-27b", + "name": "Qwen3.5 27B", + "created": 1677652288, + "description": "Qwen model running through Phala GPU TEE infrastructure", + "context_length": 262144, + "pricing": { + "prompt": "0.00000030", + "completion": "0.00000240" + }, + "providers": ["phala"], + "metadata": { + "tee": true, + "appid": "..." 
+ }, + "architecture": { + "modality": "text->text", + "input_modalities": ["text"], + "output_modalities": ["text"] + } + } + ] +} +``` + +## Model Object Fields + +| Field | Description | +|-------|-------------| +| `id` | Model identifier for API calls | +| `name` | Human-readable model name | +| `description` | Model or provider description | +| `context_length` | Maximum context window | +| `pricing.prompt` | Input token price per token; multiply by 1,000,000 for per-million-token pricing | +| `pricing.completion` | Output token price per token; multiply by 1,000,000 for per-million-token pricing | +| `providers` | Infrastructure providers such as `phala`, `near-ai`, `tinfoil`, or `chutes` | +| `metadata.tee` | Whether the model is marked as a TEE model | +| `metadata.appid` | Present when the model supports the attestation flow | +| `architecture.input_modalities` | Supported input types, such as `text` or `image` | +| `architecture.output_modalities` | Supported output types, such as `text` or `embeddings` | + +## Find Verifiable TEE Models + +Filter for models that expose TEE provider metadata: + +```bash +curl https://api.redpill.ai/v1/models \ + -H "Authorization: Bearer " | \ + jq '.data[] | select(.metadata.tee == true or any(.providers[]?; test("phala|near-ai|tinfoil|chutes"))) | {id, providers, appid: .metadata.appid}' +``` + +For production verification, test [Attestation Report](/phala-cloud/confidential-ai/confidential-model/api-reference/attestation) with the exact model ID before relying on it in your application. diff --git a/phala-cloud/confidential-ai/confidential-model/api-reference/signature.mdx b/phala-cloud/confidential-ai/confidential-model/api-reference/signature.mdx new file mode 100644 index 0000000..c02bbb0 --- /dev/null +++ b/phala-cloud/confidential-ai/confidential-model/api-reference/signature.mdx @@ -0,0 +1,109 @@ +--- +title: Request Signature +description: Fetch a cryptographic signature for a Confidential AI response. 
+--- + +## Endpoint + +```bash +GET https://api.redpill.ai/v1/signature/{request_id}?model={model}&signing_algo={algo} +``` + +Use this endpoint after a chat completion request. The signature proves a specific response was signed by a TEE key. Bind that key to fresh attestation evidence before treating the response as fully verified. + +## Parameters + + + The `id` returned by `POST /v1/chat/completions`. + + + + The model ID used for the original request. + + + + Signature algorithm. Common values include `ecdsa`, `ecdsa-p256`, and `rsa`; use the algorithm supported by the model response. + + +## Examples + + +```bash cURL +RESPONSE=$(curl -s https://api.redpill.ai/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer " \ + -d '{"model":"phala/qwen3.5-27b","messages":[{"role":"user","content":"hello"}]}') + +REQUEST_ID=$(echo "$RESPONSE" | jq -r '.id') + +curl "https://api.redpill.ai/v1/signature/$REQUEST_ID?model=phala/qwen3.5-27b" \ + -H "Authorization: Bearer " +``` + +```python Python +import requests + +chat_response = requests.post( + "https://api.redpill.ai/v1/chat/completions", + headers={ + "Authorization": "Bearer ", + "Content-Type": "application/json", + }, + json={ + "model": "phala/qwen3.5-27b", + "messages": [{"role": "user", "content": "hello"}], + }, +) + +request_id = chat_response.json()["id"] + +signature_response = requests.get( + f"https://api.redpill.ai/v1/signature/{request_id}", + params={"model": "phala/qwen3.5-27b"}, + headers={"Authorization": "Bearer "}, +) + +signature_data = signature_response.json() +``` + + +## Response + +```json +{ + "text": "phala/qwen3.5-27b:116478638341bd2b...:3d0b2a2df73dc93a...", + "signature": "0xee817b30e13ec3c320997ec37076a600e194dc64...", + "signing_address": "0x56d070df1c6be444b007839ef9cf67cec7c12b8b", + "signing_algo": "ecdsa" +} +``` + +## Response Fields + +| Field | Description | +|-------|-------------| +| `text` | Signed text. 
Format is either `request_hash:response_hash` or `model:request_hash:response_hash` | +| `signature` | Signature over `text` | +| `signing_address` | TEE signing address or public key | +| `signing_algo` | Signature algorithm used | + + +When `text` has three colon-separated parts, the first part is the model name used inside the signing path. It may differ from the alias you sent if the gateway rewrote the model ID internally. + + +## Bind to Attestation + +For production verification, set `SIGNING_ADDRESS` to the returned `signing_address`, then fetch fresh attestation evidence for that signer: + +```bash +NONCE=$(openssl rand -hex 32) + +curl "https://api.redpill.ai/v1/attestation/report?model=phala/qwen3.5-27b&nonce=$NONCE&signing_address=$SIGNING_ADDRESS" \ + -H "Authorization: Bearer " +``` + +The response is verified only when: + +1. The request and response hashes in `text` match the bytes you sent and received. +2. The signature is valid for `text`. +3. The attestation report binds the same `signing_address` to genuine TEE evidence and your fresh nonce. diff --git a/phala-cloud/confidential-ai/confidential-model/confidential-ai-api.mdx b/phala-cloud/confidential-ai/confidential-model/confidential-ai-api.mdx index aaf6f38..c1c0459 100644 --- a/phala-cloud/confidential-ai/confidential-model/confidential-ai-api.mdx +++ b/phala-cloud/confidential-ai/confidential-model/confidential-ai-api.mdx @@ -24,24 +24,27 @@ Once you get the API Key, you can start making requests to the Confidential AI A ## Make Your Secure Request -Replace `` with your actual API key in the examples below. We use DeepSeek V3 0324 model as an example, but you can choose any other available models. +Replace `` with your actual API key. The examples below use `phala/qwen3.5-27b`; use [List Models](/phala-cloud/confidential-ai/confidential-model/api-reference/models) to choose a model for your workload.
-```bash Python +```python Python # Install OpenAI SDK: `pip3 install openai` from openai import OpenAI -client = OpenAI(api_key="", base_url="https://api.redpill.ai/v1") +client = OpenAI( + api_key="", + base_url="https://api.redpill.ai/v1", +) response = client.chat.completions.create( - model="phala/deepseek-chat-v3-0324", + model="phala/qwen3.5-27b", messages=[ {"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": "What is your model name?"}, ], - stream=True ) + print(response.choices[0].message.content) ``` @@ -49,14 +52,13 @@ print(response.choices[0].message.content) import OpenAI from 'openai'; const client = new OpenAI({ - baseURL: 'https://api.redpill.ai/v1', - apiKey: '', - }, + baseURL: 'https://api.redpill.ai/v1', + apiKey: '', }); async function main() { const completion = await client.chat.completions.create({ - model: 'phala/deepseek-chat-v3-0324', + model: 'phala/qwen3.5-27b', messages: [ { role: 'user', @@ -87,59 +89,97 @@ curl -X 'POST' \ "role": "user" } ], - "stream": true, - "model": "phala/deepseek-chat-v3-0324" + "model": "phala/qwen3.5-27b" }' ``` ### Available Models -We support [14+ models](https://redpill.ai/models) running in GPU TEE from multiple providers. Click the **GPU TEE** checkbox to see all options. +Confidential AI models are available through several GPU TEE providers. The live catalog is authoritative; query it before hardcoding model IDs: + +```bash +curl https://api.redpill.ai/v1/models \ + -H "Authorization: Bearer " +``` + +To list Phala-backed models only: + +```bash +curl https://api.redpill.ai/v1/models/phala \ + -H "Authorization: Bearer " +``` + +The following table reflects the current model families added in the RedPill model catalog update. Pricing and availability can change; use the API response for production routing. 
#### Phala Provider -| Model | Model ID | Context | Pricing (per 1M tokens) | -|-------|----------|---------|-------------------------| -| DeepSeek V3 0324 | `deepseek/deepseek-chat-v3-0324` | 163K | $0.28 / $1.14 | -| Qwen2.5 VL 72B Instruct | `qwen/qwen2.5-vl-72b-instruct` | 65K | $0.59 / $0.59 | -| Google Gemma 3 27B | `google/gemma-3-27b-it` | 53K | $0.11 / $0.40 | -| OpenAI GPT OSS 120B | `openai/gpt-oss-120b` | 131K | $0.10 / $0.49 | -| OpenAI GPT OSS 20B | `openai/gpt-oss-20b` | 131K | $0.04 / $0.15 | -| Qwen2.5 7B Instruct | `qwen/qwen-2.5-7b-instruct` | 32K | $0.04 / $0.10 | -| Sentence Transformers all-MiniLM-L6-v2 | `sentence-transformers/all-minilm-l6-v2` | 512 | $0.000005 | +| Model ID | Context | Modality | Pricing (input/output per 1M tokens) | +|----------|---------|----------|--------------------------------------| +| `phala/qwen3.5-27b` | 262K | Text | $0.30 / $2.40 | +| `phala/qwen3-vl-30b-a3b-instruct` | 128K | Vision + Text | $0.20 / $0.70 | +| `qwen/qwen3-embedding-8b` | 32K | Embeddings | $0.01 / $0 | +| `phala/gemma-3-27b-it` | 53K | Vision + Text | $0.11 / $0.40 | +| `phala/glm-4.7-flash` | 202K | Text | $0.10 / $0.43 | +| `phala/gpt-oss-20b` | 131K | Text | $0.04 / $0.15 | +| `phala/qwen-2.5-7b-instruct` | 32K | Text | $0.04 / $0.10 | +| `phala/qwen2.5-vl-72b-instruct` | 128K | Vision + Text | $0.40 / $1.20 | +| `phala/uncensored-24b` | 32K | Text | $0.20 / $0.90 | +| `sentence-transformers/all-minilm-l6-v2` | 512 | Embeddings | $0.005 / $0 | + + +`phala/qwen2.5-vl-72b-instruct` is a legacy alias that may route to `phala/qwen3-vl-30b-a3b-instruct`. Prefer the canonical ID returned by `/v1/models`. 
+ #### NearAI Provider -| Model | Model ID | Context | Pricing (per 1M tokens) | -|-------|----------|---------|-------------------------| -| DeepSeek V3.1 | `deepseek/deepseek-chat-v3.1` | 163K | $1.00 / $2.50 | -| Qwen3 30B A3B Instruct | `qwen/qwen3-30b-a3b-instruct-2507` | 262K | $0.15 / $0.45 | -| Z.AI GLM 4.6 | `z-ai/glm-4.6` | 202K | $0.75 / $2.00 | +| Model ID | Context | Modality | Pricing (input/output per 1M tokens) | +|----------|---------|----------|--------------------------------------| +| `z-ai/glm-5` | 203K | Text | $1.20 / $3.50 | +| `deepseek/deepseek-chat-v3.1` | 164K | Text | $1.05 / $3.10 | +| `openai/gpt-oss-120b` | 131K | Text | $0.10 / $0.49 | +| `qwen/qwen3-30b-a3b-instruct-2507` | 262K | Text | $0.15 / $0.55 | +| `z-ai/glm-4.7` | 131K | Text | $0.85 / $3.30 | + +#### Chutes Provider + +| Model ID | Context | Modality | Pricing (input/output per 1M tokens) | +|----------|---------|----------|--------------------------------------| +| `z-ai/glm-5.1` | 203K | Text | $1.21 / $4.20 | +| `moonshotai/kimi-k2.6` | 262K | Text + Image | $1.09 / $4.60 | +| `qwen/qwen3.5-397b-a17b` | 262K | Text | $0.55 / $3.50 | +| `qwen/qwen3-coder-next` | 262K | Text | $0.18 / $1.20 | +| `minimax/minimax-m2.5` | 197K | Text | $0.20 / $1.38 | +| `xiaomi/mimo-v2-flash` | 262K | Text | $0.10 / $0.30 | +| `deepseek/deepseek-v3.2` | 164K | Text | $0.32 / $0.48 | +| `moonshotai/kimi-k2.5` | 262K | Text + Image | $0.60 / $3.00 | #### Tinfoil Provider -| Model | Model ID | Context | Pricing (per 1M tokens) | -|-------|----------|---------|-------------------------| -| DeepSeek R1 0528 | `deepseek/deepseek-r1-0528` | 163K | $2.00 / $2.00 | -| Qwen3 Coder 480B A35B | `qwen/qwen3-coder-480b-a35b-instruct` | 262K | $2.00 / $2.00 | -| Qwen3 VL 30B A3B | `qwen/qwen3-vl-30b-a3b-instruct` | 262K | $2.00 / $2.00 | -| Meta Llama 3.3 70B Instruct | `meta-llama/llama-3.3-70b-instruct` | 131K | $2.00 / $2.00 | +| Model ID | Context | Modality | Pricing (input/output per 1M tokens) | 
+|----------|---------|----------|--------------------------------------| +| `qwen/qwen3-coder-480b-a35b-instruct` | 262K | Text | $2.00 / $2.00 | +| `moonshotai/kimi-k2-thinking` | 262K | Text | $2.00 / $2.00 | +| `deepseek/deepseek-r1-0528` | 163K | Text | $2.00 / $2.00 | +| `meta-llama/llama-3.3-70b-instruct` | 131K | Text | $2.00 / $2.00 | -All models run in GPU TEEs with hardware attestation. Pricing shows input/output token costs. Browse the full list at [redpill.ai/models](https://redpill.ai/models). +TEE provider presence and attestation support are not identical for every provider and model. For production verification, test [Attestation Report](/phala-cloud/confidential-ai/confidential-model/api-reference/attestation) with the exact model ID you plan to use. ## Verify Your AI is Running Securely -Once you finished your secure request, every response comes with cryptographic proof that it ran in a secure TEE. This proof is generated by the TEE. ensures the response is secure and trustworthy. Click [Verify](/phala-cloud/confidential-ai/verify/overview) to learn how to verify your AI is running securely. +After you make a request, use [Request Signature](/phala-cloud/confidential-ai/confidential-model/api-reference/signature) to fetch the signature for that response. Then fetch a fresh [Attestation Report](/phala-cloud/confidential-ai/confidential-model/api-reference/attestation) with the returned `signing_address` to bind the response to TEE evidence. ## Next Steps -There are some advanced features you could use with Confidential AI API. +Use the API reference and feature guides for the next step: -- [Tool Calling](/phala-cloud/confidential-ai/confidential-model/tool-calling) help you call tools from your AI models. -- [Images and Vision](/phala-cloud/confidential-ai/confidential-model/images-and-vision) help you use images and vision models in Confidential AI. 
-- [Structured Output](/phala-cloud/confidential-ai/confidential-model/structured-output) help you get structured output from your AI models. -- [Streaming](/phala-cloud/confidential-ai/confidential-model/streaming) help you get streaming response from your AI models. -- [Playground](/phala-cloud/confidential-ai/confidential-model/playground) help you play with Confidential AI models in a private environment. +- [Chat Completions](/phala-cloud/confidential-ai/confidential-model/api-reference/chat-completions) documents the core request and response shape. +- [List Models](/phala-cloud/confidential-ai/confidential-model/api-reference/models) shows how to discover models programmatically. +- [Embeddings](/phala-cloud/confidential-ai/confidential-model/api-reference/embeddings) covers embedding model calls. +- [Tool Calling](/phala-cloud/confidential-ai/confidential-model/tool-calling) helps you call tools from your AI models. +- [Images and Vision](/phala-cloud/confidential-ai/confidential-model/images-and-vision) helps you use image-capable models. +- [Structured Output](/phala-cloud/confidential-ai/confidential-model/structured-output) helps you get JSON responses. +- [Streaming](/phala-cloud/confidential-ai/confidential-model/streaming) helps you consume streaming responses. +- [Playground](/phala-cloud/confidential-ai/confidential-model/playground) helps you test models in a private environment. 
diff --git a/phala-cloud/confidential-ai/confidential-model/images-and-vision.mdx b/phala-cloud/confidential-ai/confidential-model/images-and-vision.mdx index 26ec1cf..116c8ab 100644 --- a/phala-cloud/confidential-ai/confidential-model/images-and-vision.mdx +++ b/phala-cloud/confidential-ai/confidential-model/images-and-vision.mdx @@ -24,7 +24,7 @@ client = OpenAI( api_key="", ) response = client.chat.completions.create( - model="phala/gemma-3-27b-it", + model="phala/qwen3-vl-30b-a3b-instruct", messages=[{ "role": "user", "content": [ @@ -60,4 +60,5 @@ The overall impression is of a cute and peaceful scene with baby pandas enjoying ### Supported Models for Image Analysis - `phala/gemma-3-27b-it` -- `phala/qwen2.5-vl-72b-instruct` +- `phala/qwen3-vl-30b-a3b-instruct` +- `phala/qwen2.5-vl-72b-instruct` (legacy alias) diff --git a/phala-cloud/confidential-ai/confidential-model/streaming.mdx b/phala-cloud/confidential-ai/confidential-model/streaming.mdx index 4e8a9a7..49a93e0 100644 --- a/phala-cloud/confidential-ai/confidential-model/streaming.mdx +++ b/phala-cloud/confidential-ai/confidential-model/streaming.mdx @@ -15,16 +15,17 @@ Confidential AI API supports streaming, enabling you to receive responses in a s Replace `` with your actual API key in the examples below. 
-```python -import OpenAI from 'openai'; -const client = new OpenAI({ - baseURL: 'https://api.redpill.ai/api/v1', - apiKey: '', - }, -}); + +```python Python +from openai import OpenAI + +client = OpenAI( + api_key="", + base_url="https://api.redpill.ai/v1", +) stream = client.chat.completions.create( - model="phala/deepseek-chat-v3-0324", + model="phala/qwen3.5-27b", messages=[ { "role": "user", @@ -34,21 +35,36 @@ stream = client.chat.completions.create( stream=True, ) for chunk in stream: - if chunk.choices: - print(chunk.choices[0].delta.content) - print("---") + content = chunk.choices[0].delta.content if chunk.choices else None + if content: + print(content, end="") ``` +```typescript TypeScript +import OpenAI from "openai"; - -```json ---- -Hello ---- -Hello ---- +const client = new OpenAI({ + apiKey: "", + baseURL: "https://api.redpill.ai/v1", +}); ---- +const stream = await client.chat.completions.create({ + model: "phala/qwen3.5-27b", + messages: [ + { role: "user", content: "say `Hello` 2 times fast, no other output" }, + ], + stream: true, +}); + +for await (const chunk of stream) { + process.stdout.write(chunk.choices[0]?.delta?.content || ""); +} +``` + + + +``` +HelloHello ``` diff --git a/phala-cloud/confidential-ai/confidential-model/structured-output.mdx b/phala-cloud/confidential-ai/confidential-model/structured-output.mdx index c2fa2b2..3535365 100644 --- a/phala-cloud/confidential-ai/confidential-model/structured-output.mdx +++ b/phala-cloud/confidential-ai/confidential-model/structured-output.mdx @@ -25,7 +25,7 @@ response = requests.post( "Content-Type": "application/json", }, json={ - "model": "phala/deepseek-chat-v3-0324", + "model": "phala/gpt-oss-20b", "messages": [ {"role": "user", "content": "What is the weather like in Los Angeles?"}, ], @@ -79,8 +79,8 @@ print(info) Confidential AI supports structured output for the following models: -- `phala/deepseek-chat-v3-0324` - `phala/gemma-3-27b-it` - `phala/gpt-oss-20b` - `phala/gpt-oss-120b` -- 
`phala/qwen2.5-vl-72b-instruct` +- `phala/qwen3.5-27b` +- `phala/qwen3-vl-30b-a3b-instruct` diff --git a/phala-cloud/confidential-ai/confidential-model/tool-calling.mdx b/phala-cloud/confidential-ai/confidential-model/tool-calling.mdx index 2ee98a5..1cc971f 100644 --- a/phala-cloud/confidential-ai/confidential-model/tool-calling.mdx +++ b/phala-cloud/confidential-ai/confidential-model/tool-calling.mdx @@ -47,7 +47,7 @@ curl -s -X POST 'https://api.redpill.ai/v1/chat/completions' \ ], "tool_choice": "auto", "stream": false, - "model": "phala/qwen3-coder" + "model": "phala/gpt-oss-20b" }' ``` @@ -57,7 +57,7 @@ curl -s -X POST 'https://api.redpill.ai/v1/chat/completions' \ "id": "chatcmpl-28f745c2b7ee44f2ba36a8b4b409c74a", "object": "chat.completion", "created": 1754381277, - "model": "qwen/qwen3-coder", + "model": "phala/gpt-oss-20b", "choices": [ { "index": 0, @@ -152,7 +152,7 @@ curl -s -X POST 'https://api.redpill.ai/v1/chat/completions' \ ], "tool_choice": "auto", "stream": false, - "model": "phala/qwen3-coder" + "model": "phala/gpt-oss-20b" }' ``` @@ -163,7 +163,7 @@ curl -s -X POST 'https://api.redpill.ai/v1/chat/completions' \ "id": "chatcmpl-a46eff3d335c42c39bbe4ea69fc97462", "object": "chat.completion", "created": 1754381325, - "model": "qwen/qwen3-coder", + "model": "phala/gpt-oss-20b", "choices": [ { "index": 0, @@ -198,6 +198,7 @@ curl -s -X POST 'https://api.redpill.ai/v1/chat/completions' \ ## Supported Models -- phala/deepseek-chat-v3-0324 -- phala/qwen3-coder -- phala/llama-3.3-70b-instruct +- `phala/gpt-oss-20b` +- `phala/qwen3.5-27b` +- `qwen/qwen3-coder-next` +- `qwen/qwen3-coder-480b-a35b-instruct` diff --git a/phala-cloud/confidential-ai/verify/verify-attestation.mdx b/phala-cloud/confidential-ai/verify/verify-attestation.mdx index ecd5037..7a5fb5f 100644 --- a/phala-cloud/confidential-ai/verify/verify-attestation.mdx +++ b/phala-cloud/confidential-ai/verify/verify-attestation.mdx @@ -42,41 +42,73 @@ response = requests.get( ) report = 
response.json() -# You get key pieces: -# - nvidia_payload: GPU verification data -# - intel_quote: CPU verification data -# - signing_address: For signature verification -# - signing_algo: "ecdsa" or "ed25519" +# Response shape depends on the provider backing the model. ``` -The report gives you NVIDIA's hardware verification data for each GPU, Intel's TEE verification data for the CPU, a signing address you'll use later to verify signatures, and the signing algorithm used by this TEE instance. +The report gives you Intel TDX evidence, optional NVIDIA GPU evidence, signing key information, and software measurement data. The exact shape depends on the provider: + +- **Phala / NearAI two-layer format**: `gateway_attestation` plus `model_attestations`. +- **Chutes format**: `attestation_type: "chutes"` plus `all_attestations`. +- **Flat format**: older Phala-native responses expose fields such as `intel_quote`, `nvidia_payload`, and `signing_address` at the top level. + +See [Attestation Report](/phala-cloud/confidential-ai/confidential-model/api-reference/attestation) for the endpoint schema. + +### Select the attestation to verify + +For Phala and NearAI two-layer responses, verify both the gateway attestation and the model attestation when both are present. The gateway attestation covers routing and request handling; the model attestation covers the inference runtime.
+ +```python +def get_attestations(report): + attestations = [] + + if "gateway_attestation" in report: + attestations.append(("gateway", report["gateway_attestation"])) + + for item in report.get("model_attestations", []): + attestations.append(("model", item)) + + for item in report.get("all_attestations", []): + attestations.append(("model", item)) + + if "intel_quote" in report: + attestations.append(("model", report)) + + return attestations + +attestations = get_attestations(report) +assert attestations, "No attestation evidence found" +``` ### Verify NVIDIA GPU attestation -Now let's verify your NVIDIA GPUs are genuine. You'll send the `nvidia_payload` from your report to NVIDIA's own attestation service. Why NVIDIA's service? Because only NVIDIA can confirm their hardware is authentic - they built secret keys into each chip during manufacturing. +Now verify NVIDIA GPUs when GPU evidence is present. Phala-style responses expose `nvidia_payload`; Chutes-style responses expose `gpu_evidence`. The example below sends `nvidia_payload` to NVIDIA's attestation service and skips any attestation that does not include that field.
```python import json import base64 -# Parse and verify GPU payload nonce -gpu_payload = json.loads(report["nvidia_payload"]) -assert gpu_payload["nonce"].lower() == request_nonce.lower() +for name, attestation in attestations: + if "nvidia_payload" not in attestation: + continue -# Send to NVIDIA's Remote Attestation Service -response = requests.post( - "https://nras.attestation.nvidia.com/v3/attest/gpu", - json=gpu_payload -) -result = response.json() + # Parse and verify GPU payload nonce + gpu_payload = json.loads(attestation["nvidia_payload"]) + assert gpu_payload["nonce"].lower() == request_nonce.lower() -# Decode the JWT verdict -jwt_token = result[0][1] -payload_b64 = jwt_token.split(".")[1] -padded = payload_b64 + "=" * ((4 - len(payload_b64) % 4) % 4) -verdict_data = json.loads(base64.urlsafe_b64decode(padded)) + # Send to NVIDIA's Remote Attestation Service + response = requests.post( + "https://nras.attestation.nvidia.com/v3/attest/gpu", + json=gpu_payload + ) + result = response.json() -assert verdict_data["x-nvidia-overall-att-result"] == True + # Decode the JWT verdict + jwt_token = result[0][1] + payload_b64 = jwt_token.split(".")[1] + padded = payload_b64 + "=" * ((4 - len(payload_b64) % 4) % 4) + verdict_data = json.loads(base64.urlsafe_b64decode(padded)) + + assert verdict_data["x-nvidia-overall-att-result"] == True ``` The GPU payload must use the same nonce you generated. NVIDIA returns a JWT with `x-nvidia-overall-att-result: True` for verified authentic hardware. @@ -86,14 +118,26 @@ The GPU payload must use the same nonce you generated. NVIDIA returns a JWT with For Intel CPUs, you'll verify the TDX quote using Phala's verification service. This service decodes and validates Intel's cryptographic proof. 
```python -# Verify Intel TDX quote -response = requests.post( - "https://cloud-api.phala.com/api/v1/attestations/verify", - json={"hex": report["intel_quote"]} -) -intel_result = response.json() +import base64 +import re + +def quote_to_hex(quote): + value = quote.removeprefix("0x") + if re.fullmatch(r"[0-9a-fA-F]+", value): + return value + return base64.b64decode(value).hex() + +for name, attestation in attestations: + if "intel_quote" not in attestation: + continue + + response = requests.post( + "https://cloud-api.phala.com/api/v1/attestations/verify", + json={"hex": quote_to_hex(attestation["intel_quote"])} + ) + intel_result = response.json() -assert intel_result["quote"]["verified"] == True + assert intel_result["quote"]["verified"] == True ``` This confirms the CPU is genuine Intel hardware running in TDX mode. The `intel_result` contains the decoded quote data we'll use next, including `reportdata` and `mrconfig` fields. @@ -112,8 +156,8 @@ report_data_hex = intel_result["quote"]["body"]["reportdata"] report_data = bytes.fromhex(report_data_hex.removeprefix("0x")) # Parse signing address based on algorithm -signing_address = report["signing_address"] -signing_algo = report.get("signing_algo", "ecdsa") +signing_address = attestation["signing_address"] +signing_algo = attestation.get("signing_algo", "ecdsa") if signing_algo == "ecdsa": # ECDSA: 20-byte Ethereum address @@ -155,7 +199,7 @@ Next, verify your application code hasn't been modified. 
The TEE measures the en from hashlib import sha256 # Extract compose manifest from attestation -tcb_info = report["info"]["tcb_info"] +tcb_info = attestation["info"]["tcb_info"] if isinstance(tcb_info, str): tcb_info = json.loads(tcb_info) diff --git a/phala-cloud/confidential-ai/verify/verify-signature.mdx b/phala-cloud/confidential-ai/verify/verify-signature.mdx index d64ced5..8f52326 100644 --- a/phala-cloud/confidential-ai/verify/verify-signature.mdx +++ b/phala-cloud/confidential-ai/verify/verify-signature.mdx @@ -18,7 +18,7 @@ import requests # After getting AI response with chat_id chat_id = ai_response["id"] -model = "phala/deepseek-chat-v3-0324" # or your model +model = "phala/qwen3.5-27b" # or your model # Fetch the signature sig_response = requests.get( @@ -28,16 +28,17 @@ sig_response = requests.get( signature_data = sig_response.json() # signature_data contains: -# - text: "request_hash:response_hash" +# - text: "request_hash:response_hash" or "model:request_hash:response_hash" # - signature: The ECDSA or Ed25519 signature # - signing_address: The address that signed this response +# - signing_algo: The signature algorithm ``` -The response gives you everything needed for verification. The `text` field contains hashes of your request and the AI's response, separated by a colon. The `signature` is the cryptographic proof from the TEE. The `signing_address` identifies which TEE instance signed this response. +The response gives you everything needed for verification. The `text` field contains hashes of your request and the AI's response. Some responses include the model name first, so the format is either `request_hash:response_hash` or `model:request_hash:response_hash`. The `signature` is the cryptographic proof from the TEE. The `signing_address` identifies which TEE instance signed this response. ## Verify request and response hashes -Confirm the hashes in the `text` field match your actual request and response. 
The `text` field format is `request_hash:response_hash`. +Confirm the hashes in the `text` field match your actual request and response. Hashes are byte-sensitive, so production verifiers should hash the exact serialized request body and response body sent over the wire. ```python from hashlib import sha256 @@ -53,9 +54,13 @@ response_body = '{"id": "...", "choices": [...], ...}' # Full response JSON request_hash = sha256_text(request_body_json) response_hash = sha256_text(response_body) -# Parse the signed hashes -hashed_text = signature_data["text"] -request_hash_server, response_hash_server = hashed_text.split(":") +# Parse the signed hashes from the right, so a model name containing ":" cannot shift them. +# Format can be either request_hash:response_hash or model:request_hash:response_hash. +parts = signature_data["text"].rsplit(":", 2) +if len(parts) == 3: + signed_model, request_hash_server, response_hash_server = parts +else: + request_hash_server, response_hash_server = parts # Verify they match assert request_hash == request_hash_server @@ -109,19 +114,27 @@ attestation_response = requests.get( ) attestation_report = attestation_response.json() -# If using multi-server deployment, filter for matching signing address -if "all_attestations" in attestation_report: - attestation = next( - item for item in attestation_report["all_attestations"] - if item["signing_address"].lower() == signing_address.lower() - ) -else: - attestation = attestation_report +def find_attestation_for_signer(report, signing_address): + candidates = [] + if "gateway_attestation" in report: + candidates.append(report["gateway_attestation"]) + candidates.extend(report.get("model_attestations", [])) + candidates.extend(report.get("all_attestations", [])) + if "signing_address" in report: + candidates.append(report) + + for item in candidates: + if item.get("signing_address", "").lower() == signing_address.lower(): + return item + + raise ValueError("No attestation found for signing address") + +attestation = find_attestation_for_signer(attestation_report,
signing_address) print(f"Found attestation for: {attestation['signing_address']}") ``` -In multi-server deployments, the response may include `all_attestations` array containing attestations from multiple backend servers. You filter by `signing_address` to find the one matching your signature. +In multi-server or two-layer deployments, the response may include several attestations. Filter by `signing_address` to find the one matching your response signature. ### Verify the attestation @@ -142,7 +155,7 @@ This gives you an independent third-party verification that the signature is val ## Complete example -For a full implementation that verifies both attestation and signatures, see the [signature verifier example](https://github.com/Phala-Network/private-ml-sdk/blob/main/vllm-proxy/verifiers/signature_verifier.py). +For a full raw Python implementation that verifies both attestation and signatures, see the [signature verifier example](https://github.com/Phala-Network/private-ml-sdk/blob/main/vllm-proxy/verifiers/signature_verifier.py). This script demonstrates the complete flow: 1. Send chat completion request (streaming or non-streaming)