## OpenAI compatible parameters
To use [Respan parameters](/api-reference/develop/gateway/create-chat-completion#respan-parameters), you can pass them in the `extra_body` parameter if you're using the OpenAI SDK.
<Note>
**Environment Switching**: Respan doesn't support an `env` parameter in API calls. To switch between environments (test/production), use different API keys - one for your test environment and another for production. You can manage these keys in your [API Keys settings](https://platform.respan.ai/platform/api/api-keys).
</Note>
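As a sketch of what `extra_body` does (the merge itself is standard OpenAI SDK behavior): the SDK folds the extra fields into the JSON request body alongside the standard OpenAI parameters before posting it to the gateway.

```python
# Sketch: extra_body fields are merged into the JSON request body
# that the OpenAI SDK posts to the Respan gateway.
base_request = {
    "model": "gpt-4o-mini",
    "messages": [{"role": "user", "content": "Tell me a long story"}],
}
extra_body = {"metadata": {"session_id": "session_11"}}

# Roughly equivalent to the payload sent to /api/chat/completions:
payload = {**base_request, **extra_body}
```

This is why any Respan parameter documented on this page can be passed through `extra_body` without the SDK rejecting it as an unknown argument.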
{/* ```Python OpenAI Python
from openai import OpenAI
client = OpenAI(
base_url="https://api.respan.ai/api/",
api_key="YOUR_RESPAN_API_KEY",
)
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[{"role":"user", "content":"Tell me a long story"}],
stream=True,
extra_body={"metadata": {"session_id": "session_11"}}
)
```
```TypeScript OpenAI TypeScript
const client = new OpenAI({
baseURL: "https://api.respan.ai/api",
apiKey: process.env.RESPAN_API_KEY,
});
const response = await client.chat.completions
.create({
messages: [{ role: "user", content: "Say this is a test" }],
model: "gpt-3.5-turbo",
// @ts-expect-error
metadata: {"session_id": "session_11"}
})
.asResponse();
console.log(await response.json());
``` */}
```python Python
import requests

def demo_call(prompt,
              model="gpt-4o-mini",
              token="YOUR_RESPAN_API_KEY"
              ):
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {token}',
    }
    data = {
        'model': model,
        'messages': [{'role': 'user', 'content': prompt}],
    }
    response = requests.post('https://api.respan.ai/api/chat/completions', headers=headers, json=data)
    return response

messages = "Say 'Hello World'"
print(demo_call(messages).json())
```
```TypeScript TypeScript
fetch('https://api.respan.ai/api/chat/completions', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': 'Bearer YOUR_RESPAN_API_KEY'
},
body: JSON.stringify({
model: 'gpt-4o-mini',
messages: [{role: 'user', content: "Say 'Hello World'"}]
})
})
.then(response => response.json())
.then(data => console.log(data));
```
```bash Bash
curl -X POST "https://api.respan.ai/api/chat/completions" \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer YOUR_RESPAN_API_KEY" \
  -d '{
    "model": "gpt-4o-mini",
    "messages": [{"role": "user", "content": "Hello"}]
  }'
```
```PHP PHP
<?php
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, "https://api.respan.ai/api/chat/completions");
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the response body instead of printing it
curl_setopt($ch, CURLOPT_HTTPHEADER, array(
    "Content-Type: application/json",
    "Authorization: Bearer YOUR_RESPAN_API_KEY",
));
curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode(array(
    "model" => "gpt-4o-mini",
    "messages" => array(["role" => "user", "content" => "Hello"]),
)));
$response = curl_exec($ch);
curl_close($ch);
echo $response;
?>
```
```Go Go
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	url := "https://api.respan.ai/api/chat/completions"
	payload := []byte(`{
		"model": "gpt-4o-mini",
		"messages": [{"role": "user", "content": "Hello"}]
	}`)
	req, err := http.NewRequest("POST", url, bytes.NewBuffer(payload))
	if err != nil {
		panic(err)
	}
	req.Header.Add("Content-Type", "application/json")
	req.Header.Add("Authorization", "Bearer YOUR_RESPAN_API_KEY")
	client := &http.Client{}
	res, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()
	body, _ := io.ReadAll(res.Body)
	fmt.Println(string(body))
}
```
{/* ```python Python
response = respan_generate(**params)
def respan_generate(
messages,
api_key=os.getenv("RESPAN_API_KEY"),
url="https://api.respan.ai/api/chat/completions",
**kwargs
):
print("Calling: ",url)
headers = {
'Content-Type': 'application/json',
'Authorization': f'Bearer {api_key}',
}
data = {
"messages": messages,
**kwargs
}
response = requests.post(url=url, headers=headers, json=data)
return response
params ={"messages": [{"role": "user", "content": "Hello"}],
"model": "gpt-3.5-turbo",
"stream": False,
"max_tokens": 100
}
```
```TypeScript TypeScript
// Define the function with TypeScript
fetch('https://api.respan.ai/api/chat/completions', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': 'Bearer {YOUR_RESPAN_API_KEY}'
}
body: JSON.stringify({
model: 'gpt-3.5-turbo',
messages: [{role: 'user', content: "Say 'Hello World'"}]
})
})
.then(response => response.json())
.then(data => console.log(data));
```
```bash cURL
curl -X POST "https://api.respan.ai/api/chat/completions" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer {YOUR_RESPAN_API_KEY}" \
-d '{
"messages": [
{
"role": "user",
"content": "Hello"
}
],
"model": "gpt-3.5-turbo",
"stream": false,
"max_tokens": 100
}'
``` */}
- `messages` *array* **required**: List of messages to send to the endpoint in the [OpenAI style](https://platform.openai.com/docs/api-reference/chat/create#chat-create-messages), each of them following this format:
```json
"messages": [
  {
    "role": "system", // Available choices are user, system or assistant
    "content": "You are a helpful assistant."
  },
  {"role": "user", "content": "Hello!"}
]
```
<strong>Image processing:</strong> To use the image processing feature, upload the image in the following format.
**Example**
```json
{
"role": "user",
"content": [
{
"type": "text",
"text": "What's in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "https://as1.ftcdn.net/v2/jpg/01/34/53/74/1000_F_134537443_VendrqyXIWyHrZgxdIsfyKUost734JDP.jpg"
}
}
]
}
```
- `model` *string* **required**: Specify which model to use. See the list of models
[here](/integrations/overview/overview).
<Note> This parameter will be overridden by the `loadbalance_models` parameter.</Note>
- `stream` *boolean*: Whether to stream back partial progress token by token
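With `stream` enabled, the response arrives as OpenAI-style chunks whose `delta` fields carry partial content. A minimal sketch of reassembling the streamed text (chunk shapes assumed OpenAI-compatible; a real client would iterate over the SSE stream rather than a list):

```python
# Hypothetical chunks, in the order a client would receive them.
chunks = [
    {"choices": [{"delta": {"role": "assistant"}}]},
    {"choices": [{"delta": {"content": "Hello"}}]},
    {"choices": [{"delta": {"content": " World"}}]},
    {"choices": [{"delta": {}, "finish_reason": "stop"}]},
]

# Concatenate the content deltas to rebuild the full message.
text = "".join(
    chunk["choices"][0]["delta"].get("content", "")
    for chunk in chunks
)
```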
- `tools` *array[dict]*: A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide an array of functions the model may generate JSON inputs for.
**Example**
```json
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"]
}
},
"required": ["location"]
}
}
}
```
- `tool_choice` *dict*: Controls which (if any) tool is called by the model. `none` means the model will not call any tool and instead generates a message. `auto` means the model can pick between generating a message or calling one or more tools. `required` means the model must call one or more tools.
`none` is the default when no tools are present; `auto` is the default when tools are present.
Specifying a particular tool via the code below forces the model to call that tool.
```json
{
  "type": "function",
  "function": {"name": "name_of_the_function"}
}
```
- `frequency_penalty` *number*: Specify how much to penalize new tokens based on their existing frequency in
the text so far. Decreases the model's likelihood of repeating the same line
verbatim
- `max_tokens` *number*: Maximum number of tokens to generate in the response
- `temperature` *number*: Controls randomness in the output, in the range 0-2; a higher temperature
produces a more random response.
- `n` *number*: How many chat completion choices are generated for each input message.
<strong>Caveat!</strong> While this can help improve generation quality by picking the optimal choice, this could also lead to more token usage.
{/* Add this to the concept page */}
- `logprobs` *boolean*: Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the `content` of `message`.
- `echo` *boolean*: Echo back the prompt in addition to the completion
{/* Add this to the concept page */}
- `stop` *array[string]*: Stop sequence
{/* Add this to the concept page */}
- `presence_penalty` *number*: Specify how much to penalize new tokens based on whether they appear in the
text so far. Increases the model's likelihood of talking about new topics
{/* Add this to the concept page */}
{/* Add this to the concept page */}
- `logit_bias` *dict*: Used to modify the probability of tokens appearing in the response
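**Example** (the token ID below is illustrative; real IDs are tokenizer-specific)
```json
{
  "logit_bias": {
    "1234": -100 // from -100 (effectively ban the token) to 100 (strongly favor it)
  }
}
```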
- `response_format` *object*: An object specifying the format that the model must output. Compatible with GPT-4 Turbo and all GPT-3.5 Turbo models newer than gpt-3.5-turbo-1106.
Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the message the model generates is valid JSON.
If you want to specify your own output structure, use `{ "type": "json_schema", "json_schema": {...your schema}}`. For more reference, please check OpenAI's guide on [structured output](https://platform.openai.com/docs/guides/structured-outputs?api-mode=responses).
Your prompt must contain the word "json" to use this feature.
**Properties**
- `type` *string* **required**: The type of response format. Options: `json_object`, `json_schema`, or `text`
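**Example** (enabling JSON mode)
```json
{
  "response_format": { "type": "json_object" }
}
```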
**Vertex AI example (special case)**
If you are using Vertex AI and want to use JSON mode, you should specify a `response_schema` in the `response_format` parameter. Check the [details of response schema here.](https://json-schema.org/understanding-json-schema/reference/string)
```python Example.py
response_schema = {
    "type": "array",  # or "string", "number", "object", "boolean"...
    "items": {  # "items" applies only to the array type
        "type": "object",
        "properties": {  # "properties" applies only to the object type
            "number": {"type": "number"},
            "street_name": {"type": "string"},
            "street_type": {"enum": ["Street", "Avenue", "Boulevard"]}
        }
    }
}
response_format = {
    "type": "json_object",
    "response_schema": response_schema
}
```
- `parallel_tool_calls` *boolean*: Whether to enable parallel function calling during tool use.
## Respan parameters
See how to make a standard Respan API call in the [Gateway Quickstart](/documentation/features/gateway/gateway-quickstart) guide.
### Generation parameters
- `load_balance_group` *object*: Balance the load of your requests between different models. See the [details of load balancing
here.](/documentation/features/gateway/advanced-configuration#load-balancing)
<Note> The proxy will pick one model from the group and override the `model` parameter </Note>
**Example**
```json
{
// do not specify the "model" parameter here; if you do, it will overwrite the load balance group
"messages": [
{
"role": "user",
"content": "Hi, how are you?"
}
],
"load_balance_group": {
"group_id":"THE_GROUP_ID" // from Load balancing page
}
}
```
**Example code with adding credentials**
The `models` field will overwrite the `load_balance_group` you specified in the UI.
```json
{
"load_balance_group": {
"group_id":"THE_GROUP_ID", // from Load balancing page
"models": [
{
"model": "azure/gpt-35-turbo",
"weight": 1
},
{
"model": "azure/gpt-4",
"credentials": { // add your own credentials if you want to use your own Azure credentials or custom model name
"api_base": "Your own Azure api_base",
"api_version": "Your own Azure api_version",
"api_key": "Your own Azure api_key"
},
"weight": 1
}
]
}
}
```
- `fallback_models` *array*: Specify the list of backup models (ranked by priority) to respond in case of a
failure in the primary model. See the [details of fallback models
here.](/documentation/features/gateway/advanced-configuration#fallback-models)
**Example**
```json
{
// ...other parameters...
"fallback_models": [
"gemini/gemini-pro",
"mistral/mistral-small",
"gpt-4o"
]
}
```
- `customer_credentials` *object*: You can pass in your customer's credentials for [supported providers](/documentation/admin/llm-provider-keys) and use their credits when our proxy is calling models from those providers. <br/>See details [here](/documentation/admin/llm-provider-keys)
**Example**
```json
"customer_credentials": {
"openai": {
"api_key": "YOUR_OPENAI_API_KEY",
}
}
```
- `credential_override` *object*: One-off credential overrides. Instead of using what is uploaded for each provider, this targets credentials for individual models.
Go to [provider page](/integrations/overview/overview) to see how to add your own credentials and override them for a specific model.
**Example**
```json
"credential_override": {
"azure/gpt-4o":{ // override for a specific model.
"api_key": "your-api-key",
"api_base": "your-api-base-url",
"api_version": "your-api-version",
}
}
```
- `cache_enabled` *boolean*: Enable or disable caches. Check the [details of caches here.](/documentation/features/gateway/advanced-configuration#caches)
**Example**
```json
{
"cache_enabled": true
}
```
- `cache_ttl` *number*: This parameter specifies the time-to-live (TTL) for the cache in seconds.
<Note>Optional; the default value is 30 **days**.</Note>
**Example**
```json
{
"cache_ttl": 3600 // in seconds
}
```
- `cache_options` *object*: Specifies the cache options. Currently we support the `cache_by_customer` option, which can be set to `true` or `false`. If `cache_by_customer` is set to `true`, the cache will be stored by the customer identifier.
<Note>It's an optional parameter</Note>
```json
{
"cache_options": { // optional
"cache_by_customer": true // or false
}
}
```
<Note>
- OpenAI-style responses surface cached prompt tokens in `usage.prompt_tokens_details.cached_tokens`. When cache entries are created, `usage.prompt_tokens_details.cache_creation_tokens` may be present.
- Anthropic-style responses also include `usage.cache_read_input_tokens` (tokens read from cache) and `usage.cache_creation_input_tokens` (tokens added to cache). Depending on the model/provider, you may see both the `prompt_tokens_details` and `cache_*` fields.
- If caching didn’t occur for a call, these values are `0` or may be omitted by the provider.
</Note>
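As a sketch of reading these counters (the numbers mirror the example response at the end of this page), the freshly processed portion of the prompt is the total minus the cached tokens:

```python
# usage object as returned in an OpenAI-style response (values from
# the example response payload below on this page).
usage = {
    "prompt_tokens": 2619,
    "prompt_tokens_details": {"cached_tokens": 2601, "cache_creation_tokens": 0},
}

# Tokens actually processed fresh on this call; guard against the
# provider omitting the details object entirely.
details = usage.get("prompt_tokens_details") or {}
uncached_prompt_tokens = usage["prompt_tokens"] - details.get("cached_tokens", 0)
```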
- `prompt` *object*: The prompt template to use for the completion. You can build and deploy prompts in the [Prompt Management](/documentation/features/prompt-management/manage-prompts).
**Properties**
- `prompt_id` *string* **required**: The ID of the prompt to use. You can find this on the Prompts page.
- `variables` *object*: The variables to replace in the prompt template. Values can be plain strings or **typed prompt objects** for [prompt composition](/documentation/features/prompt-management/manage-prompts#prompt-composition).
To reference another prompt as a variable, pass a typed object instead of a string:
```json
{
"variables": {
"regular_var": "plain string value",
"composed_var": {
"_type": "prompt",
"prompt_id": "CHILD_PROMPT_ID",
"version": 1,
"variables": {
"child_var": "value"
}
}
}
}
```
Typed prompt variable fields:
- `_type` — must be `"prompt"`
- `prompt_id` — the child prompt ID
- `version` — integer or `"latest"` (optional, defaults to deployed version)
- `variables` — nested variables for the child prompt (optional)
The child prompt is rendered first, flattened to plain text, and injected into the parent. Max depth is **2**; circular references return HTTP 400. Resolved values appear in logs as `_rendered_result`.
- `version` *number or string*: The prompt version to use.
- Omit to use the deployed live version.
- Set to a specific number (e.g., 3) to pin that version.
- Use the reserved keyword "latest" to use the most recent draft version (not deployed). Useful for testing.
- `echo` *boolean*: With echo on, the response body will have an extra field. This is an optional parameter.
```json
"prompt_message": [] // an array of messages
```
- `override` *boolean*: Turn on override to use params in `override_params` instead of the params in the prompt.
```json
{
  "override": true
}
```
- `override_params` *object*: You can put any OpenAI chat/completions parameters here to override the prompt's parameters. This will only work if `override` is set to `true`.
```json
{
"override_params": {
"temperature": 0.5,
"max_tokens": 100
}
}
```
- `override_config` *object*: This parameter allows you to control how you can override the parameters in the prompt.
**Properties**
- `messages_override_mode` *string*: `append` appends the new messages to the existing messages;
`override` replaces the existing messages.
**Example**
```python {4-5}
request_body = {
"prompt": {
"prompt_id": "xxxxxx",
"override_config": {"messages_override_mode": "append"}, # append or override
"override_params": {"messages": [{"role": "user", "content": "5"}]},
}
}
```
- `schema_version` *integer*: Controls the prompt merge strategy.
- `1` (default, legacy): uses `override` flag logic — when `override=true`, prompt config wins; when `override=false`, request body wins for conflicts.
- `2` (recommended): prompt config always wins. Uses prepend/instructions-style merging. Supports the `patch` field. Requires raw HTTP requests — OpenAI SDKs strip v2 fields. See [Prompt schema](/documentation/features/prompt-management/manage-prompts#prompt-schema).
- `patch` *object*: Additional parameter overrides applied in v2 mode (`schema_version=2`). Must **not** contain `messages` or `input`. Useful for overriding fields like `temperature` or `max_tokens` while letting the prompt config control messages and model.
```json
{
"patch": {
"temperature": 0.9,
"max_tokens": 500
}
}
```
**Example**
```json
{
"prompt": {
"prompt_id": "prompt_id", //paste this from the prompt management page
"variables": {
"variable_name": "variable_value"
},
// "echo": true //optional parameter
}
}
```
**Prompt composition example**
```json
{
"prompt": {
"prompt_id": "PARENT_PROMPT_ID",
"override": true,
"variables": {
"request": "dispute a charge from last month",
"conversation": {
"_type": "prompt",
"prompt_id": "CHILD_PROMPT_ID",
"version": 2,
"variables": {
"customer_name": "Sarah",
"department": "billing"
}
}
}
}
}
```
**v2 merge mode example**
```json
{
"prompt": {
"prompt_id": "YOUR_PROMPT_ID",
"schema_version": 2,
"variables": {
"task_description": "Square a number"
},
"patch": {
"temperature": 0.9,
"max_tokens": 500
}
}
}
```
With `schema_version=2`, the prompt config always wins for conflicting fields. The `patch` object lets you override non-message parameters like `temperature` and `max_tokens`. The `patch` must **not** contain `messages` or `input`.
- `retry_params` *object*: Enable or disable retries and set the number of retries and the time to wait before retrying. Check the [details of retries here.](/documentation/features/gateway/advanced-configuration#retries)
**Properties**
- `retry_enabled` *boolean* **required**: Enable or disable retries.
- `num_retries` *number*: The number of retries to attempt.
- `retry_after` *number*: The time to wait before retrying in seconds.
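**Example** (values illustrative)
```json
{
  "retry_params": {
    "retry_enabled": true,
    "num_retries": 3,
    "retry_after": 0.2 // seconds
  }
}
```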
- `disable_log` *boolean*: When set to true, only the request and [performance
metrics](/documentation/features/monitoring/metrics) will be recorded; input
and output messages will be omitted from the log.
- `model_name_map` *object*: <Note>This parameter is for Azure deployments only.</Note>
We understand that you may have a custom name for your Azure deployment. Respan uses the model's original name, which may not match your deployment name. You can use this parameter to map the default name to your custom name.
**Example**
```json
{
"model": "azure/gpt-4o",
"model_name_map": {
"original_model_name": "azure/your_custom_model_name"
// e.g, "azure/gpt-4o": "azure/{your gpt-4o's deployment name}"
}
}
```
- `models` *array*: Specify the list of models for the Respan LLM router to choose between.
If not specified, <strong>all models</strong> will be used. See the list of models
[here](/integrations/overview/overview).
If only one model is specified, it is treated as if the `model` parameter were used and the router will not trigger.
When the `model` parameter is used, the router will not trigger, and this parameter behaves as `fallback_models`.
- `exclude_providers` *array*: The list of providers to exclude from the LLM router's selection. All models under the provider will be excluded. See the list of providers [here](/integrations/overview/overview).
This only excludes providers from the LLM router's selection; the `model` parameter takes precedence over this parameter, and `fallback_models` and the [safety net](/documentation/features/monitoring/notifications/subscribe-alerts) will still use the excluded models to catch failures.
- `exclude_models` *array*: The list of models to exclude from the LLM router's selection. See the list of models [here](/integrations/overview/overview).
This only excludes models from the LLM router's selection; the `model` parameter takes precedence over this parameter, and `fallback_models` and the [safety net](/documentation/features/monitoring/notifications/subscribe-alerts) will still use the excluded models to catch failures.
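**Example** (model and provider names illustrative): restrict the router to a candidate list while excluding a provider and a specific model.
```json
{
  "models": ["gpt-4o", "gpt-4o-mini", "claude-3-5-sonnet-20240620"],
  "exclude_providers": ["cohere"],
  "exclude_models": ["gpt-4o-mini"]
}
```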
### Observability parameters
- `metadata` *dict*: You can add any key-value pair to this metadata field for your reference. Check the [details of metadata here.](/documentation/features/tracing/spans/span-fields-parameters#custom-properties)
Contact team@respan.ai if you need extra parameter support for your use case.
**Example**
```json
{
"metadata": {
"my_key": "my_value"
// Add any key-value pair here
}
}
```
- `custom_identifier` *string*: Use this parameter to send an extra custom tag with your request. It helps you identify LLM spans faster than the `metadata` parameter because it is indexed. It appears in Spans as the `Custom ID` field.
**Example**
```json
{
"custom_identifier": "my_value"
}
```
- `customer_identifier` *string*: Use this as a tag to identify the user associated with the API call. See the [details of customer identifier here.](/documentation/features/user-analytics/customer-identifier)
**Example**
```json
{
//...other_params,
"customer_identifier": "user_123"
}
```
- `customer_params` *object*: Pass the customer's parameters in the API call to monitor the user's data in the Respan platform. See how to get insights into your users' data [here](/api-reference/observe/users/update-user)
**Properties**
- `customer_identifier` *string* **required**: The unique identifier for the customer. It can be any string.
- `group_identifier` *string*: Group identifier. Use group identifier to group spans together.
- `name` *string*: The name of the customer. It can be any string.
- `email` *string*: The email of the customer. It should be a valid email.
- `period_start` *string*: The start date of the period. It should be in the format `YYYY-MM-DD`.
- `period_end` *string*: The end date of the period. It should be in the format `YYYY-MM-DD`.
- `budget_duration` *string*: Choices are `yearly`, `monthly`, `weekly`, and `daily`
- `period_budget` *float*: The budget for the period. It should be a float.
- `markup_percentage` *float*: The markup percentage for the period. Usage reported for your customers through this key will be increased by this percentage.
- `total_budget` *float*: The total budget for a user.
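**Example** (values illustrative)
```json
{
  "customer_params": {
    "customer_identifier": "user_123",
    "name": "Sarah",
    "email": "sarah@example.com",
    "budget_duration": "monthly",
    "period_budget": 10.0,
    "markup_percentage": 20.0
  }
}
```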
- `request_breakdown` *boolean*: Adding this returns a summary of the request in the response body. If streaming is on, the metrics will be streamed as the last chunk.
**Example**
```json Regular Response
{
"id": "chatcmpl-7476cf3f-fcc9-4902-a548-a12489856d8a",
//... main part of the response body ...
"request_breakdown": {
"prompt_tokens": 6,
"completion_tokens": 9,
"cost": 4.8e-5,
"prompt_messages": [
{
"role": "user",
"content": "How are you doing today?"
}
],
"completion_message": {
"content": " I'm doing well, thanks for asking!",
"role": "assistant"
},
"model": "claude-2",
"cached": false,
"timestamp": "2024-02-20T01:23:39.329729Z",
"status_code": 200,
"stream": false,
"latency": 1.8415491580963135,
"scores": {},
"category": "Questions",
"metadata": {},
"routing_time": 0.18612787732854486,
"full_request": {
"messages": [
{
"role": "user",
"content": "How are you doing today?"
}
],
"model": "claude-2",
"logprobs": true
},
"sentiment_score": 0
}
}
```
```json Streaming Response
//... other chunks ...
// The following is the last chunk
{
"id": "request_breakdown",
"choices": [
{
"delta": { "content": null, "role": "assistant" },
"finish_reason": "stop",
"request_breakdown": {
"prompt_tokens": 6,
"completion_tokens": 9,
"cost": 4.8e-5, // In usd
"prompt_messages": [
{
"role": "user",
"content": "How are you doing today?"
}
],
"completion_message": {
"content": " I'm doing well, thanks for asking!",
"role": "assistant"
},
"model": "claude-2",
"cached": false,
"timestamp": "2024-02-20T01:23:39.329729Z",
"status_code": 200,
"stream": false,
"latency": 1.8415491580963135, // in seconds
"scores": {},
"category": "Questions",
"metadata": {},
"routing_time": 0.18612787732854486, // in seconds
"full_request": {
"messages": [
{
"role": "user",
"content": "How are you doing today?"
}
],
"model": "claude-2",
"logprobs": true
},
"sentiment_score": 0
},
"index": 0,
"message": { "content": null, "role": "assistant" }
}
],
"created": 1706100589,
"model": "extra_parameter",
"object": "chat.completion.chunk",
"system_fingerprint": null,
"usage": {}
}
```
## Evals parameters
- `positive_feedback` *boolean*: Whether the user liked the output. `True` means the user liked the output.
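**Example**
```json
{
  "positive_feedback": true
}
```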
## Deprecated parameters
- `customer_api_keys` *object*: You can pass in a dictionary of your customer's API keys for specific models. If the router selects a model that is in the dictionary, it will attempt to use the customer's API key for calling the model before using your [integration API key](/documentation/admin/llm-provider-keys) or Respan's default API key.
```json
{
"gpt-3.5-turbo": "your_customer_api_key",
"gpt-4": "your_customer_api_key"
}
```
- `loadbalance_models` *array*: Balance the load of your requests between different models. See the [details of load balancing
here.](/documentation/features/gateway/advanced-configuration#load-balancing)
<Note> This parameter will override the `model` parameter. </Note>
**Example**
```json
{
// ...other parameters...
"loadbalance_models": [
{
"model": "claude-3-5-sonnet-20240620",
"weight": 34,
"credentials": { // Your own Anthropic API key, optional for team plan and above
"api_key": "Your own Anthropic API key"
}
},
{
"model": "azure/gpt-35-turbo",
"weight": 34,
"credentials": { // Your own Azure credentials, optional for team plan and above
"api_base": "Your own Azure api_base",
"api_version": "Your own Azure api_version",
"api_key": "Your own Azure api_key"
}
}
]
}
```
## Response
Below is an example response payload with the `usage` object. This helps you reconcile token accounting across providers and caching scenarios.
```json
{
"id": "chatcmpl-e1b9665b-c354-41c5-bbe5-178bd0b69773",
"object": "chat.completion",
"created": 1761546960,
"model": "claude-sonnet-4-5-20250929",
"choices": [
{
"index": 0,
"finish_reason": "stop",
"message": {
"role": "assistant",
"content": "I'm doing well, thank you for asking! How can I help you today?"
}
}
],
"usage": {
"completion_tokens": 20,
"prompt_tokens": 2619,
"total_tokens": 2639,
// Details mirrors OpenAI-style fields
"completion_tokens_details": {
"accepted_prediction_tokens": 0,
"audio_tokens": 0,
"reasoning_tokens": 0,
"rejected_prediction_tokens": 0
},
// Where cached prompt tokens are reported
"prompt_tokens_details": {
"audio_tokens": 0,
"cached_tokens": 2601,
"cache_creation_tokens": 0
},
// Anthropic-style cache counters (present when applicable)
"cache_creation_input_tokens": 0,
"cache_read_input_tokens": 2601
}
}
```
Use this as a tag to identify the user associated with the API call. See the details of customer identifier here. Max 254 characters; auto-truncated if exceeded.
customer_paramsobjectOptional
Pass the customer's parameters in the API call to monitor the user's data in the Respan platform. See how to get insights into your users' data here
request_breakdownbooleanOptional
Adding this returns the summarization of the response in the response body. If streaming is on, the metrics will be streamed as the last chunk.
positive_feedbackbooleanOptional
Whether the user liked the output. True means the user liked the output.
customer_api_keysobjectOptional
You can pass in a dictionary of your customer's API keys for specific models. If the router selects a model that is in the dictionary, it will attempt to use the customer's API key for calling the model before using your integration API key or Respan's default API key.
loadbalance_modelslist of stringsOptional
Balance the load of your requests between different models. See the details of load balancing here.
`model` string, required: Specify which model to use. See the list of models here. This parameter will be overridden by the loadbalance_models parameter.
`stream` boolean: Whether to stream back partial progress token by token.
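Because the gateway is OpenAI-compatible, a streamed response can be consumed as server-sent events. The sketch below is a minimal, hedged example: it assumes the standard `chat.completion.chunk` event format, reuses the endpoint and headers from the Quickstart example, and uses `YOUR_RESPAN_API_KEY` as a placeholder.

```python
import json
import requests

def stream_chat(prompt, token="YOUR_RESPAN_API_KEY"):
    """Stream a chat completion token by token (assumes OpenAI-style SSE)."""
    response = requests.post(
        "https://api.respan.ai/api/chat/completions",
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {token}",
        },
        json={
            "model": "gpt-4o-mini",
            "messages": [{"role": "user", "content": prompt}],
            "stream": True,
        },
        stream=True,  # tell requests not to buffer the whole body
    )
    for line in response.iter_lines():
        if not line:
            continue
        # each SSE event line looks like: data: {...chunk json...}
        payload = line.decode("utf-8").removeprefix("data: ")
        if payload.strip() == "[DONE]":
            break
        delta = json.loads(payload)["choices"][0].get("delta", {})
        if delta.get("content"):
            print(delta["content"], end="", flush=True)
```

For example, `stream_chat("Tell me a long story")` prints the story as it arrives instead of waiting for the full completion.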
`tools` array[dict]: A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide an array of functions the model may generate JSON inputs for.
Example

```json
{
  "type": "function",
  "function": {
    "name": "get_current_weather",
    "description": "Get the current weather in a given location",
    "parameters": {
      "type": "object",
      "properties": {
        "location": {
          "type": "string",
          "description": "The city and state, e.g. San Francisco, CA"
        },
        "unit": {
          "type": "string",
          "enum": ["celsius", "fahrenheit"]
        }
      },
      "required": ["location"]
    }
  }
}
```
`tool_choice` dict: Controls which (if any) tool is called by the model. `none` means the model will not call any tool and instead generates a message. `auto` means the model can pick between generating a message or calling one or more tools. `required` means the model must call one or more tools. `none` is the default when no tools are present; `auto` is the default if tools are present. Specifying a particular tool via the code below forces the model to call that tool.
```json
{
  "type": "function",
  "function": { "name": "name_of_the_function" }
}
```
`frequency_penalty` number: Specify how much to penalize new tokens based on their existing frequency in the text so far. Decreases the model's likelihood of repeating the same line verbatim.

`max_tokens` number: Maximum number of tokens to generate in the response.

`temperature` number: Controls randomness of the output, in the range 0-2; a higher temperature will lead to a more random response.

`n` number: How many chat completion choices are generated for each input message. Caveat: while this can help improve generation quality by picking the optimal choice, it can also lead to more token usage.

`logprobs` boolean: Whether to return log probabilities of the output tokens. If true, returns the log probabilities of each output token in the content of message.

`echo` boolean: Echo back the prompt in addition to the completion.

`stop` array[string]: Stop sequence.

`presence_penalty` number: Specify how much to penalize new tokens based on whether they appear in the text so far. Increases the model's likelihood of talking about new topics.
`logit_bias` dict: Used to modify the probability of tokens appearing in the response.
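For illustration only (token IDs are tokenizer-specific, so the ID below is a placeholder rather than a real recommendation), a bias of -100 effectively bans a token, while values near +100 effectively force it:

```json
{
  "logit_bias": {
    "50256": -100
  }
}
```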
`response_format` object: An object specifying the format that the model must output. Compatible with GPT-4 Turbo and all GPT-3.5 Turbo models newer than gpt-3.5-turbo-1106.
Setting it to `{ "type": "json_object" }` enables JSON mode, which guarantees the message the model generates is valid JSON.
If you want to specify your own output structure, use `{ "type": "json_schema", "json_schema": {...your schema} }`. For more detail, see OpenAI's guide on structured output.
You must include "json" as a keyword in the prompt to use this feature.
Properties
`type` string, required: The type of response format. Options: json_object, json_schema, and text.
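As a hedged sketch of a json_schema request fragment, mirroring OpenAI's structured-output shape (the schema name and fields below are illustrative, not part of the Respan API):

```json
{
  "response_format": {
    "type": "json_schema",
    "json_schema": {
      "name": "weather_report",
      "schema": {
        "type": "object",
        "properties": {
          "city": { "type": "string" },
          "temperature_c": { "type": "number" }
        },
        "required": ["city", "temperature_c"]
      }
    }
  }
}
```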
Vertex AI example (special case)
If you are using Vertex AI and want to use JSON mode, you should specify a response_schema in the response_format parameter. Check the details of response schema here.

```python Example.py
response_schema = {
    "type": "array",  # or "string", "number", "object", "boolean"...
}
```
`parallel_tool_calls` boolean: Whether to enable parallel function calling during tool use.
## Respan parameters
See how to make a standard Respan API call in the Gateway Quickstart guide.
### Generation parameters
`load_balance_group` object: Balance the load of your requests between different models. See the details of load balancing here. The proxy will pick one model from the group and override the model parameter.
Example

```json
{
  // you don't need to specify the model parameter; otherwise, the model parameter will overwrite the load balance group
  "messages": [
    {
      "role": "user",
      "content": "Hi, how are you?"
    }
  ],
  "load_balance_group": {
    "group_id": "THE_GROUP_ID" // from the Load balancing page
  }
}
```
Example code with adding credentials
The models field will overwrite the load_balance_group you specified in the UI.

```json
{
  "load_balance_group": {
    "group_id": "THE_GROUP_ID", // from the Load balancing page
    "models": [
      {
        "model": "azure/gpt-35-turbo",
        "weight": 1
      },
      {
        "model": "azure/gpt-4",
        "credentials": { // add your own credentials if you want to use your own Azure credentials or custom model name
          "api_base": "Your own Azure api_base",
          "api_version": "Your own Azure api_version",
          "api_key": "Your own Azure api_key"
        },
        "weight": 1
      }
    ]
  }
}
```
`fallback_models` array: Specify the list of backup models (ranked by priority) to respond in case of a failure in the primary model. See the details of fallback models here.
Example

```json
{
  // ...other parameters...
  "fallback_models": [
    "gemini/gemini-pro",
    "mistral/mistral-small",
    "gpt-4o"
  ]
}
```
`customer_credentials` object: You can pass in your customer's credentials for supported providers and use their credits when our proxy is calling models from those providers. See details here.
Example

```json
"customer_credentials": {
  "openai": {
    "api_key": "YOUR_OPENAI_API_KEY"
  }
}
```
`credential_override` object: One-off credential overrides. Instead of using what is uploaded for each provider, this targets credentials for individual models. Go to the provider page to see how to add your own credentials and override them for a specific model.

Example (the credential fields follow the Azure shape shown in the load balancing example):

```json
"credential_override": {
  "azure/gpt-4o": { // override for a specific model
    "api_base": "Your own Azure api_base",
    "api_version": "Your own Azure api_version",
    "api_key": "Your own Azure api_key"
  }
}
```
`cache_ttl` number: The time-to-live (TTL) for the cache, in seconds. Optional; the default value is currently 30 days.
Example

```json
{
  "cache_ttl": 3600 // in seconds
}
```
`cache_options` object: Specifies the cache options. Currently we support the cache_by_customer option, which you can set to true or false. If cache_by_customer is set to true, the cache will be stored per customer identifier. This is an optional parameter.
```json
{
  "cache_options": { // optional
    "cache_by_customer": true // or false
  }
}
```
OpenAI-style responses surface cached prompt tokens in usage.prompt_tokens_details.cached_tokens. When cache entries are created, usage.prompt_tokens_details.cache_creation_tokens may be present.
Anthropic-style responses also include usage.cache_read_input_tokens (tokens read from cache) and usage.cache_creation_input_tokens (tokens added to cache). Depending on the model/provider, you may see both the prompt_tokens_details and cache_* fields.
If caching didn’t occur for a call, these values are 0 or may be omitted by the provider.
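Putting those fields together, the usage object for a call that reads a previously cached prefix might look like the sketch below. The values are illustrative, and which fields appear depends on the model and provider:

```json
{
  "usage": {
    "prompt_tokens": 2048,
    "completion_tokens": 150,
    "total_tokens": 2198,
    "prompt_tokens_details": {
      "cached_tokens": 1920,
      "cache_creation_tokens": 0
    },
    "cache_read_input_tokens": 1920,
    "cache_creation_input_tokens": 0
  }
}
```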
`prompt` object: The prompt template to use for the completion. You can build and deploy prompts in Prompt Management.
Properties
`prompt_id` string, required: The ID of the prompt to use. You can find this on the Prompts page.
`variables` object: The variables to replace in the prompt template. Values can be plain strings or typed prompt objects for prompt composition.
To reference another prompt as a variable, pass a typed object instead of a string:

```json
{
  "variables": {
    "regular_var": "plain string value",
    "composed_var": {
      "_type": "prompt",
      "prompt_id": "CHILD_PROMPT_ID",
      "version": 1,
      "variables": {
        "child_var": "value"
      }
    }
  }
}
```
Typed prompt variable fields:
- `_type`: must be "prompt"
- `prompt_id`: the child prompt ID
- `version`: integer or "latest" (optional; defaults to the deployed version)
- `variables`: nested variables for the child prompt (optional)
The child prompt is rendered first, flattened to plain text, and injected into the parent. Max depth is 2; circular references return HTTP 400. Resolved values appear in logs as _rendered_result.
`version` number or string: The prompt version to use.
- Omit to use the deployed live version.
- Set to a specific number (e.g., 3) to pin that version.
- Use the reserved keyword "latest" to use the most recent draft version (not deployed). Useful for testing.
`echo` boolean: With echo on, the response body will have an extra field. This is an optional parameter.

```json
"prompt_message": [an array of messages]
```
`override` boolean: Turn on override to use params in override_params instead of the params in the prompt.

```json
{
  "override": true
}
```
`override_params` object: You can put any OpenAI chat/completions parameters here to override the prompt's parameters. This only works if override is set to true.

```json
{
  "override_params": {
    "temperature": 0.5,
    "max_tokens": 100
  }
}
```
`override_config` object: This parameter allows you to control how you can override the parameters in the prompt.
Properties
`messages_override_mode` string:
- append: append the new messages to the existing messages
- override: override the existing messages
Example

```python
request_body = {
    "prompt": {
        "prompt_id": "xxxxxx",
        "override_config": {"messages_override_mode": "append"},  # append or override
    }
}
```
`schema_version` integer: Controls the prompt merge strategy.
- 1 (default, legacy): uses override flag logic; when override=true, the prompt config wins, and when override=false, the request body wins for conflicts.
- 2 (recommended): the prompt config always wins. Uses prepend/instructions-style merging. Supports the patch field. Requires raw HTTP requests, because OpenAI SDKs strip v2 fields. See Prompt schema.
`patch` object: Additional parameter overrides applied in v2 mode (schema_version=2). Must not contain messages or input. Useful for overriding fields like temperature or max_tokens while letting the prompt config control messages and model.
```json
{
  "patch": {
    "temperature": 0.9,
    "max_tokens": 500
  }
}
```
Example

```json
{
  "prompt": {
    "prompt_id": "prompt_id", // paste this from the prompt management page
    "variables": {
      "variable_name": "variable_value"
    },
    // "echo": true // optional parameter
  }
}
```
Prompt composition example

```json
{
  "prompt": {
    "prompt_id": "PARENT_PROMPT_ID",
    "override": true,
    "variables": {
      "request": "dispute a charge from last month",
      "conversation": {
        "_type": "prompt",
        "prompt_id": "CHILD_PROMPT_ID",
        "version": 2,
        "variables": {
          "customer_name": "Sarah",
          "department": "billing"
        }
      }
    }
  }
}
```
v2 merge mode example

```json
{
  "prompt": {
    "prompt_id": "YOUR_PROMPT_ID",
    "schema_version": 2,
    "variables": {
      "task_description": "Square a number"
    },
    "patch": {
      "temperature": 0.9,
      "max_tokens": 500
    }
  }
}
```
With schema_version=2, the prompt config always wins for conflicting fields. The patch object lets you override non-message parameters like temperature and max_tokens. The patch must not contain messages or input.
`retry_params` object: Enable or disable retries and set the number of retries and the time to wait before retrying. Check the details of retries here.
Properties
`retry_enabled` boolean, required: Enable or disable retries.
`num_retries` number: The number of retries to attempt.
`retry_after` number: The time to wait before retrying, in seconds.
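Combining the properties above, a request fragment enabling three retries with a two-second wait between attempts might look like:

```json
{
  "retry_params": {
    "retry_enabled": true,
    "num_retries": 3,
    "retry_after": 2.0
  }
}
```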
`disable_log` boolean: When set to true, only the request and performance metrics will be recorded; input and output messages will be omitted from the log.
`model_name_map` object: This parameter is for Azure deployments only. We understand that you may have a custom name for your Azure deployment. Respan uses the model's original name, which may not match your deployment name. You can use this parameter to map the default name to your custom name.
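As a sketch (the deployment name below is hypothetical), mapping the model's default name to your custom Azure deployment name might look like:

```json
{
  "model_name_map": {
    "azure/gpt-4o": "azure/my-custom-gpt-4o-deployment"
  }
}
```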
`models` array: Specify the list of models for the Respan LLM router to choose between. If not specified, all models will be used. See the list of models here. If only one model is specified, it will be treated as if the model parameter were used and the router will not trigger. When the model parameter is used, the router will not trigger, and this parameter behaves as fallback_models.
`exclude_providers` array: The list of providers to exclude from the LLM router's selection. All models under the provider will be excluded. See the list of providers here. This only excludes providers in the LLM router; the model parameter takes precedence over this parameter, and fallback_models and the safety net will still use the excluded models to catch failures.
`exclude_models` array: The list of models to exclude from the LLM router's selection. See the list of models here. This only excludes models in the LLM router; the model parameter takes precedence over this parameter, and fallback_models and the safety net will still use the excluded models to catch failures.
### Observability parameters
`metadata` dict: You can add any key-value pair to this metadata field for your reference. Check the details of metadata here. Contact team@respan.ai if you need extra parameter support for your use case.
Example

```json
{
  "metadata": {
    "my_key": "my_value"
    // Add any key-value pair here
  }
}
```
`custom_identifier` string: You can use this parameter to send an extra custom tag with your request. It helps you identify LLM spans faster than the metadata parameter because it's indexed. You can see it in Spans under the Custom ID field.
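For example (the tag value is arbitrary):

```json
{
  "custom_identifier": "checkout-flow-retry-2"
}
```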
`customer_params` object: Pass the customer's parameters in the API call to monitor the user's data in the Respan platform. See how to get insights into your users' data here.
Properties
`customer_identifier` string, required: The unique identifier for the customer. It can be any string.
`group_identifier` string: Group identifier. Use a group identifier to group spans together.
`name` string: The name of the customer. It can be any string.
`email` string: The email of the customer. It should be a valid email.
`period_start` string: The start date of the period, in the format YYYY-MM-DD.
`period_end` string: The end date of the period, in the format YYYY-MM-DD.
`budget_duration` string: Choices are yearly, monthly, weekly, and daily.
`period_budget` float: The budget for the period. It should be a float.
`markup_percentage` float: The markup percentage for the period. Usage reported for your customers through this key will be increased by this percentage.
`total_budget` float: The total budget for a user.
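A sketch combining the properties above (all values are illustrative):

```json
{
  "customer_params": {
    "customer_identifier": "customer_123",
    "name": "Jane Doe",
    "email": "jane@example.com",
    "budget_duration": "monthly",
    "period_budget": 10.0,
    "markup_percentage": 5.0
  }
}
```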
`request_breakdown` boolean: Adding this returns a summarization of the response in the response body. If streaming is on, the metrics will be streamed as the last chunk.
`positive_feedback` boolean: Whether the user liked the output. True means the user liked the output.
### Deprecated parameters
`customer_api_keys` object: You can pass in a dictionary of your customer's API keys for specific models. If the router selects a model that is in the dictionary, it will attempt to use the customer's API key for calling the model before using your integration API key or Respan's default API key.
```json
{
  "gpt-3.5-turbo": "your_customer_api_key",
  "gpt-4": "your_customer_api_key"
}
```
`loadbalance_models` array: Balance the load of your requests between different models. See the details of load balancing here. This parameter will override the model parameter.
Example

```json
{
  // ...other parameters...
  "loadbalance_models": [
    {
      "model": "claude-3-5-sonnet-20240620",
      "weight": 34,
      "credentials": { // Your own Anthropic API key, optional for team plan and above
        "api_key": "Your own Anthropic API key"
      }
    },
    {
      "model": "azure/gpt-35-turbo",
      "weight": 34,
      "credentials": { // Your own Azure credentials, optional for team plan and above
        "api_base": "Your own Azure api_base",
        "api_version": "Your own Azure api_version",
        "api_key": "Your own Azure api_key"
      }
    }
  ]
}
```
## Response
Below is an example response payload with the usage object. This helps you reconcile token accounting across providers and caching scenarios.
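The payload below is a sketch that assumes the standard OpenAI `chat.completion` shape (which the gateway is compatible with); the IDs, content, usage values, and cache fields are illustrative and vary by provider and caching behavior:

```json
{
  "id": "chatcmpl-abc123",
  "object": "chat.completion",
  "model": "gpt-4o-mini",
  "choices": [
    {
      "index": 0,
      "message": { "role": "assistant", "content": "Hello! How can I help?" },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 2048,
    "completion_tokens": 12,
    "total_tokens": 2060,
    "prompt_tokens_details": { "cached_tokens": 1920 },
    "cache_read_input_tokens": 1920,
    "cache_creation_input_tokens": 0
  }
}
```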