MockServer can mock LLM API responses from any major provider using the httpLlmResponse action. You describe the completion in a provider-neutral format and MockServer encodes it into the correct wire format for the target provider — headers, JSON structure, streaming framing, and all. This lets you test AI-powered applications deterministically, without calling a real LLM.

 

Basic Completion Mock

Create an expectation with an httpLlmResponse action. The provider tells MockServer which wire format to produce; the completion describes what to return.

The Java client has a typed fluent API (LlmMockBuilder + HttpLlmResponse). All other clients send the expectation as raw JSON using their generic expectation method (mockAnyResponse in JavaScript; a direct PUT to /mockserver/expectation in other languages).

import static org.mockserver.client.LlmMockBuilder.llmMock;
import static org.mockserver.model.Completion.completion;
import static org.mockserver.model.Provider.OPENAI;
import static org.mockserver.model.Usage.usage;

// OpenAI — POST /v1/chat/completions
llmMock("/v1/chat/completions")
    .withProvider(OPENAI)
    .withModel("gpt-4o")
    .respondingWith(
        completion()
            .withText("MockServer is an open-source HTTP mock server.")
            .withStopReason("stop")
            .withUsage(usage().withInputTokens(12).withOutputTokens(8))
    )
    .applyTo(mockServerClient);
var mockServerClient = require('mockserver-client').mockServerClient;

// OpenAI — POST /v1/chat/completions
mockServerClient("localhost", 1080).mockAnyResponse({
    "httpRequest": {
        "method": "POST",
        "path": "/v1/chat/completions"
    },
    "httpLlmResponse": {
        "provider": "OPENAI",
        "model": "gpt-4o",
        "completion": {
            "text": "MockServer is an open-source HTTP mock server.",
            "stopReason": "stop",
            "usage": { "inputTokens": 12, "outputTokens": 8 }
        }
    }
}).then(
    function () { console.log("expectation created"); },
    function (error) { console.log(error); }
);
import requests

# OpenAI — POST /v1/chat/completions
requests.put(
    "http://localhost:1080/mockserver/expectation",
    json={
        "httpRequest": {
            "method": "POST",
            "path": "/v1/chat/completions"
        },
        "httpLlmResponse": {
            "provider": "OPENAI",
            "model": "gpt-4o",
            "completion": {
                "text": "MockServer is an open-source HTTP mock server.",
                "stopReason": "stop",
                "usage": {"inputTokens": 12, "outputTokens": 8}
            }
        }
    }
)
require 'net/http'
require 'json'

# OpenAI — POST /v1/chat/completions
uri = URI('http://localhost:1080/mockserver/expectation')
http = Net::HTTP.new(uri.host, uri.port)
req = Net::HTTP::Put.new(uri.path, 'Content-Type' => 'application/json')
req.body = JSON.generate({
  'httpRequest' => { 'method' => 'POST', 'path' => '/v1/chat/completions' },
  'httpLlmResponse' => {
    'provider' => 'OPENAI',
    'model' => 'gpt-4o',
    'completion' => {
      'text' => 'MockServer is an open-source HTTP mock server.',
      'stopReason' => 'stop',
      'usage' => { 'inputTokens' => 12, 'outputTokens' => 8 }
    }
  }
})
http.request(req)
package main

import (
    "bytes"
    "encoding/json"
    "net/http"
)

// OpenAI — POST /v1/chat/completions
// createLlmExpectation registers an OpenAI chat-completions mock by sending the
// expectation as raw JSON in a PUT to MockServer's /mockserver/expectation endpoint.
func createLlmExpectation() {
    // The payload is built only from string and int literals, so Marshal cannot fail here.
    body, _ := json.Marshal(map[string]interface{}{
        "httpRequest": map[string]interface{}{
            "method": "POST",
            "path":   "/v1/chat/completions",
        },
        "httpLlmResponse": map[string]interface{}{
            "provider": "OPENAI",
            "model":    "gpt-4o",
            "completion": map[string]interface{}{
                "text":       "MockServer is an open-source HTTP mock server.",
                "stopReason": "stop",
                "usage":      map[string]int{"inputTokens": 12, "outputTokens": 8},
            },
        },
    })
    // The method and URL are constant and well-formed, so NewRequest cannot fail here.
    req, _ := http.NewRequest(http.MethodPut,
        "http://localhost:1080/mockserver/expectation", bytes.NewReader(body))
    req.Header.Set("Content-Type", "application/json")
    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        // The request never completed (e.g. MockServer is not reachable),
        // so there is no response body to release.
        return
    }
    // Always close the response body; otherwise the underlying connection is leaked.
    defer resp.Body.Close()
}
using System.Net.Http;
using System.Text;
using System.Text.Json;

// OpenAI — POST /v1/chat/completions
var expectation = new
{
    httpRequest = new { method = "POST", path = "/v1/chat/completions" },
    httpLlmResponse = new
    {
        provider = "OPENAI",
        model = "gpt-4o",
        completion = new
        {
            text = "MockServer is an open-source HTTP mock server.",
            stopReason = "stop",
            usage = new { inputTokens = 12, outputTokens = 8 }
        }
    }
};
using var client = new HttpClient();
var json = JsonSerializer.Serialize(expectation);
await client.PutAsync(
    "http://localhost:1080/mockserver/expectation",
    new StringContent(json, Encoding.UTF8, "application/json"));
use serde_json::json;

// OpenAI — POST /v1/chat/completions
let client = reqwest::blocking::Client::new();
client.put("http://localhost:1080/mockserver/expectation")
    .json(&json!({
        "httpRequest": {
            "method": "POST",
            "path": "/v1/chat/completions"
        },
        "httpLlmResponse": {
            "provider": "OPENAI",
            "model": "gpt-4o",
            "completion": {
                "text": "MockServer is an open-source HTTP mock server.",
                "stopReason": "stop",
                "usage": { "inputTokens": 12, "outputTokens": 8 }
            }
        }
    }))
    .send()
    .unwrap();
<?php
// OpenAI — POST /v1/chat/completions
$expectation = [
    'httpRequest' => ['method' => 'POST', 'path' => '/v1/chat/completions'],
    'httpLlmResponse' => [
        'provider' => 'OPENAI',
        'model' => 'gpt-4o',
        'completion' => [
            'text' => 'MockServer is an open-source HTTP mock server.',
            'stopReason' => 'stop',
            'usage' => ['inputTokens' => 12, 'outputTokens' => 8],
        ],
    ],
];
$ch = curl_init('http://localhost:1080/mockserver/expectation');
curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'PUT');
curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($expectation));
curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
curl_exec($ch);
curl_close($ch);
# OpenAI — POST /v1/chat/completions
curl -v -X PUT "http://localhost:1080/mockserver/expectation" -d '{
  "httpRequest": {
    "method": "POST",
    "path": "/v1/chat/completions"
  },
  "httpLlmResponse": {
    "provider": "OPENAI",
    "model": "gpt-4o",
    "completion": {
      "text": "MockServer is an open-source HTTP mock server.",
      "stopReason": "stop",
      "usage": { "inputTokens": 12, "outputTokens": 8 }
    }
  }
}'

# Anthropic — POST /v1/messages
curl -v -X PUT "http://localhost:1080/mockserver/expectation" -d '{
  "httpRequest": {
    "method": "POST",
    "path": "/v1/messages"
  },
  "httpLlmResponse": {
    "provider": "ANTHROPIC",
    "model": "claude-sonnet-4-20250514",
    "completion": {
      "text": "MockServer is an open-source HTTP mock server.",
      "stopReason": "end_turn",
      "usage": { "inputTokens": 12, "outputTokens": 8 }
    }
  }
}'

When a request matches POST /v1/chat/completions, MockServer returns an OpenAI-formatted JSON response with the specified text, model, stop reason, and token usage — including the correct id, created, and object fields that the OpenAI SDK expects.

 

Supported Providers

Each provider value produces the correct API response format for that provider's SDK:

Provider | Typical API path | Notes
OPENAI | /v1/chat/completions | Chat Completions API format
OPENAI_RESPONSES | /v1/responses | OpenAI Responses API format
ANTHROPIC | /v1/messages | Anthropic Messages API format
GEMINI | /v1beta/models/{model}:generateContent | Google Gemini format
BEDROCK | /model/{model}/converse | AWS Bedrock Converse API; streaming uses application/vnd.amazon.eventstream binary framing
AZURE_OPENAI | /openai/deployments/{deployment}/chat/completions | Azure-hosted OpenAI format (delegates to OpenAI codec)
OLLAMA | /api/chat | Ollama local model API format
 

Streaming Responses

Set streaming to true on the completion to return a stream instead of a single JSON response. MockServer splits the text into token-sized chunks and sends them in the provider's native streaming format (SSE for most providers; NDJSON for Ollama; application/vnd.amazon.eventstream binary framing for Bedrock).

Use streamingPhysics to control timing — useful for testing loading indicators, timeouts, and backpressure handling:

{
  "httpRequest": {
    "method": "POST",
    "path": "/v1/chat/completions"
  },
  "httpLlmResponse": {
    "provider": "OPENAI",
    "model": "gpt-4o",
    "completion": {
      "text": "This response is streamed token by token.",
      "streaming": true,
      "streamingPhysics": {
        "timeToFirstToken": {
          "timeUnit": "MILLISECONDS",
          "value": 200
        },
        "tokensPerSecond": 50,
        "jitter": 0.1,
        "seed": 42
      },
      "usage": {
        "inputTokens": 10,
        "outputTokens": 8
      }
    }
  }
}
Field | Description
timeToFirstToken | Delay before the first SSE event is sent
tokensPerSecond | Base token emission rate (1 – 10000)
jitter | Fractional uniform deviation from the base rate (0.0 – 1.0)
seed | PRNG seed for reproducible inter-token timing
 

Tool Calls

To mock an LLM response that invokes tools (function calling), add toolCalls to the completion:

{
  "httpRequest": {
    "method": "POST",
    "path": "/v1/chat/completions"
  },
  "httpLlmResponse": {
    "provider": "OPENAI",
    "model": "gpt-4o",
    "completion": {
      "toolCalls": [
        {
          "id": "call_abc123",
          "name": "get_weather",
          "arguments": "{\"location\": \"London\"}"
        }
      ],
      "stopReason": "tool_use"
    }
  }
}
 

Multi-Turn Conversations

For agent testing, you often need to script a sequence of LLM responses that depend on what the agent sent. MockServer supports this through conversation predicates combined with scenario state to create multi-turn conversation flows.

Each turn uses conversationPredicates to match against the conversation history in the request body, and scenario state to track which turn the conversation is on:

[
  {
    "httpRequest": {
      "method": "POST",
      "path": "/v1/chat/completions"
    },
    "httpLlmResponse": {
      "provider": "OPENAI",
      "model": "gpt-4o",
      "completion": {
        "text": "I'll look up the weather for you.",
        "toolCalls": [
          {
            "id": "call_1",
            "name": "get_weather",
            "arguments": "{\"location\": \"London\"}"
          }
        ],
        "stopReason": "tool_use"
      },
      "conversationPredicates": {
        "turnIndex": 0,
        "latestMessageContains": "weather"
      }
    },
    "times": { "remainingTimes": 1 }
  },
  {
    "httpRequest": {
      "method": "POST",
      "path": "/v1/chat/completions"
    },
    "httpLlmResponse": {
      "provider": "OPENAI",
      "model": "gpt-4o",
      "completion": {
        "text": "The weather in London is 18C and sunny.",
        "stopReason": "stop"
      },
      "conversationPredicates": {
        "containsToolResultFor": "get_weather"
      }
    },
    "times": { "remainingTimes": 1 }
  }
]

Conversation predicates

Predicates match against the parsed conversation in the request body (decoded using the provider's message format):

Predicate | Description
turnIndex | Match when the assistant turn count equals this value (0-based)
latestMessageContains | Match when the last message contains this substring
latestMessageMatches | Match when the last message matches this regex pattern
latestMessageRole | Match when the last message has this role (e.g. user, tool)
containsToolResultFor | Match when the conversation contains a tool result for this tool name
semanticMatchAgainst | Opt-in, exploratory: the expected meaning the latest message should express, judged by a runtime LLM. Off by default — ignored unless mockserver.llmSemanticMatchingEnabled is set and a backend resolves. Non-deterministic; for exploration only, never for CI assertions.

Prompt normalisation

Agent prompts are dynamically assembled, so exact-byte matching can be brittle. Add a normalization object to the predicates to apply deterministic transforms before matching:

"conversationPredicates": {
  "latestMessageContains": "search for weather",
  "normalization": {
    "collapseWhitespace": true,
    "lowercase": true,
    "sortJsonKeys": true
  }
}
 

Session Isolation

When multiple agents or test threads share a MockServer instance, each conversation needs its own independent state. Use an isolation source to extract a session key from each request (a header, query parameter, or cookie) so conversation state is tracked per session.

Session isolation is configured via the Java client’s conversation builder (isolateBy(IsolationSource.header("x-session-id"))) or the create_llm_conversation MCP tool’s isolateBy parameter. The isolation source is encoded into the expectation’s scenario name — it is not a separate field in the raw expectation JSON. Common isolation sources:

  • Header: extract the session key from a request header (e.g. x-session-id)
  • Query parameter: extract from a URL query parameter
  • Cookie: extract from a cookie value

Example using the create_llm_conversation MCP tool:

{
  "provider": "OPENAI",
  "path": "/v1/chat/completions",
  "isolateBy": {
    "source": "header",
    "name": "x-session-id"
  },
  "turns": [
    {
      "match": { "turnIndex": 0 },
      "response": { "text": "Hello!", "stopReason": "stop" }
    }
  ]
}

Each unique isolation key value creates an independent conversation state machine, so multiple agents can run conversations in parallel without interfering with each other.

 

Cost Budget

When using MockServer as a proxy in front of a real LLM provider, the llmCostBudgetUsd property sets a cumulative cost ceiling. Once the estimated cost of all forwarded LLM completions exceeds the budget, further LLM forwards are blocked with a 429 response. This is a safety net for CI pipelines or development environments where runaway agents could generate unexpected charges.

Property | Env var | Default | Description
mockserver.llmCostBudgetUsd | MOCKSERVER_LLM_COST_BUDGET_USD | -1.0 (disabled) | Cumulative cost budget in USD. Set to a positive value to enable; negative or unset means no limit.

The budget is enforced on all forward paths (matched forward actions, proxy-pass, and reverse-proxy routes) and resets on PUT /mockserver/reset. Cost estimation uses an internal pricing table and is approximate — treat it as a safety guard, not an invoice.

 

Chaos / Fault Injection

Add a chaos object to the httpLlmResponse to test how your application handles LLM failures:

{
  "httpRequest": {
    "method": "POST",
    "path": "/v1/chat/completions"
  },
  "httpLlmResponse": {
    "provider": "OPENAI",
    "model": "gpt-4o",
    "completion": {
      "text": "Normal response text",
      "streaming": true
    },
    "chaos": {
      "errorStatus": 503,
      "errorProbability": 0.3,
      "truncateMode": "MID_STREAM",
      "truncateAtFraction": 0.5,
      "malformedSse": true
    }
  }
}
Field | Description
errorStatus | Return this HTTP status as a provider error (e.g. 503, 429). With no errorProbability, always fires.
retryAfter | Value for the Retry-After header on an injected error (e.g. "30"). Useful alongside errorStatus: 429 to test retry-after handling.
errorProbability | Probability (0.0 – 1.0) that the error fires on each request
truncateMode | NONE (default) or MID_STREAM. Must be set to MID_STREAM for truncateAtFraction to take effect.
truncateAtFraction | For streaming responses, cut the stream short at this fraction of SSE events (0.0 – 1.0, default 0.5). Only applies when truncateMode is MID_STREAM.
malformedSse | Inject a deliberately broken-JSON SSE chunk
quotaName / quotaLimit / quotaWindowMillis | Deterministic fixed-window rate limit
quotaErrorStatus | HTTP status returned when the quota is exceeded (default 429). Must be between 100 and 599.
seed | PRNG seed for reproducible probabilistic faults

See Chaos Testing & Fault Injection for more about chaos profiles.

 

Streaming — Client Examples

These examples show how to create a streaming LLM expectation with realistic timing physics. The Java client uses the typed fluent API; all other clients send the expectation as raw JSON.

import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static org.mockserver.client.Llm.jitter;
import static org.mockserver.client.Llm.timeToFirstToken;
import static org.mockserver.client.Llm.tokensPerSecond;
import static org.mockserver.client.LlmMockBuilder.llmMock;
import static org.mockserver.model.Completion.completion;
import static org.mockserver.model.Provider.OPENAI;

llmMock("/v1/chat/completions")
    .withProvider(OPENAI)
    .withModel("gpt-4o")
    .respondingWith(
        completion()
            .withText("This response is streamed token by token.")
            .streaming()
            .withStreamingPhysics(
                timeToFirstToken(200, MILLISECONDS),
                tokensPerSecond(50),
                jitter(0.1))
    )
    .applyTo(mockServerClient);
var mockServerClient = require('mockserver-client').mockServerClient;

mockServerClient("localhost", 1080).mockAnyResponse({
    "httpRequest": {
        "method": "POST",
        "path": "/v1/chat/completions"
    },
    "httpLlmResponse": {
        "provider": "OPENAI",
        "model": "gpt-4o",
        "completion": {
            "text": "This response is streamed token by token.",
            "streaming": true,
            "streamingPhysics": {
                "timeToFirstToken": { "timeUnit": "MILLISECONDS", "value": 200 },
                "tokensPerSecond": 50,
                "jitter": 0.1,
                "seed": 42
            }
        }
    }
}).then(
    function () { console.log("expectation created"); },
    function (error) { console.log(error); }
);
import requests

requests.put(
    "http://localhost:1080/mockserver/expectation",
    json={
        "httpRequest": {
            "method": "POST",
            "path": "/v1/chat/completions"
        },
        "httpLlmResponse": {
            "provider": "OPENAI",
            "model": "gpt-4o",
            "completion": {
                "text": "This response is streamed token by token.",
                "streaming": True,
                "streamingPhysics": {
                    "timeToFirstToken": {"timeUnit": "MILLISECONDS", "value": 200},
                    "tokensPerSecond": 50,
                    "jitter": 0.1,
                    "seed": 42
                }
            }
        }
    }
)
require 'net/http'
require 'json'

uri = URI('http://localhost:1080/mockserver/expectation')
http = Net::HTTP.new(uri.host, uri.port)
req = Net::HTTP::Put.new(uri.path, 'Content-Type' => 'application/json')
req.body = JSON.generate({
  'httpRequest' => { 'method' => 'POST', 'path' => '/v1/chat/completions' },
  'httpLlmResponse' => {
    'provider' => 'OPENAI',
    'model' => 'gpt-4o',
    'completion' => {
      'text' => 'This response is streamed token by token.',
      'streaming' => true,
      'streamingPhysics' => {
        'timeToFirstToken' => { 'timeUnit' => 'MILLISECONDS', 'value' => 200 },
        'tokensPerSecond' => 50,
        'jitter' => 0.1,
        'seed' => 42
      }
    }
  }
})
http.request(req)
package main

import (
    "bytes"
    "encoding/json"
    "net/http"
)

// createStreamingLlmExpectation registers a streaming OpenAI chat-completions mock
// (with streamingPhysics timing) by sending the expectation as raw JSON in a PUT
// to MockServer's /mockserver/expectation endpoint.
func createStreamingLlmExpectation() {
    // The payload is built only from string, bool and numeric literals, so Marshal cannot fail here.
    body, _ := json.Marshal(map[string]interface{}{
        "httpRequest": map[string]interface{}{
            "method": "POST",
            "path":   "/v1/chat/completions",
        },
        "httpLlmResponse": map[string]interface{}{
            "provider": "OPENAI",
            "model":    "gpt-4o",
            "completion": map[string]interface{}{
                "text":      "This response is streamed token by token.",
                "streaming": true,
                "streamingPhysics": map[string]interface{}{
                    "timeToFirstToken": map[string]interface{}{
                        "timeUnit": "MILLISECONDS", "value": 200,
                    },
                    "tokensPerSecond": 50,
                    "jitter":          0.1,
                    "seed":            42,
                },
            },
        },
    })
    // The method and URL are constant and well-formed, so NewRequest cannot fail here.
    req, _ := http.NewRequest(http.MethodPut,
        "http://localhost:1080/mockserver/expectation", bytes.NewReader(body))
    req.Header.Set("Content-Type", "application/json")
    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        // The request never completed (e.g. MockServer is not reachable),
        // so there is no response body to release.
        return
    }
    // Always close the response body; otherwise the underlying connection is leaked.
    defer resp.Body.Close()
}
using System.Net.Http;
using System.Text;
using System.Text.Json;

var expectation = new
{
    httpRequest = new { method = "POST", path = "/v1/chat/completions" },
    httpLlmResponse = new
    {
        provider = "OPENAI",
        model = "gpt-4o",
        completion = new
        {
            text = "This response is streamed token by token.",
            streaming = true,
            streamingPhysics = new
            {
                timeToFirstToken = new { timeUnit = "MILLISECONDS", value = 200 },
                tokensPerSecond = 50,
                jitter = 0.1,
                seed = 42
            }
        }
    }
};
using var client = new HttpClient();
var json = JsonSerializer.Serialize(expectation);
await client.PutAsync(
    "http://localhost:1080/mockserver/expectation",
    new StringContent(json, Encoding.UTF8, "application/json"));
use serde_json::json;

let client = reqwest::blocking::Client::new();
client.put("http://localhost:1080/mockserver/expectation")
    .json(&json!({
        "httpRequest": {
            "method": "POST",
            "path": "/v1/chat/completions"
        },
        "httpLlmResponse": {
            "provider": "OPENAI",
            "model": "gpt-4o",
            "completion": {
                "text": "This response is streamed token by token.",
                "streaming": true,
                "streamingPhysics": {
                    "timeToFirstToken": { "timeUnit": "MILLISECONDS", "value": 200 },
                    "tokensPerSecond": 50,
                    "jitter": 0.1,
                    "seed": 42
                }
            }
        }
    }))
    .send()
    .unwrap();
<?php
$expectation = [
    'httpRequest' => ['method' => 'POST', 'path' => '/v1/chat/completions'],
    'httpLlmResponse' => [
        'provider' => 'OPENAI',
        'model' => 'gpt-4o',
        'completion' => [
            'text' => 'This response is streamed token by token.',
            'streaming' => true,
            'streamingPhysics' => [
                'timeToFirstToken' => ['timeUnit' => 'MILLISECONDS', 'value' => 200],
                'tokensPerSecond' => 50,
                'jitter' => 0.1,
                'seed' => 42,
            ],
        ],
    ],
];
$ch = curl_init('http://localhost:1080/mockserver/expectation');
curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'PUT');
curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($expectation));
curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
curl_exec($ch);
curl_close($ch);
curl -v -X PUT "http://localhost:1080/mockserver/expectation" -d '{
  "httpRequest": {
    "method": "POST",
    "path": "/v1/chat/completions"
  },
  "httpLlmResponse": {
    "provider": "OPENAI",
    "model": "gpt-4o",
    "completion": {
      "text": "This response is streamed token by token.",
      "streaming": true,
      "streamingPhysics": {
        "timeToFirstToken": { "timeUnit": "MILLISECONDS", "value": 200 },
        "tokensPerSecond": 50,
        "jitter": 0.1,
        "seed": 42
      }
    }
  }
}'
 

Java Client API (Simple Completion)

The Java client provides a typed fluent API for creating LLM expectations without hand-assembling JSON:

import static org.mockserver.model.HttpRequest.request;
import static org.mockserver.model.HttpLlmResponse.llmResponse;
import static org.mockserver.model.Completion.completion;
import static org.mockserver.model.Provider.OPENAI;

// Match POST /v1/chat/completions and answer with an OpenAI-formatted completion.
mockServerClient
    .when(
        request()
            .withMethod("POST")
            .withPath("/v1/chat/completions")
    )
    .respond(
        llmResponse()
            // The provider selects the wire format MockServer encodes the completion into.
            .withProvider(OPENAI)
            .withModel("gpt-4o")
            .withCompletion(
                completion()
                    .withText("Hello from MockServer!")
                    .withStopReason("stop")
            )
    );

Related Pages