How text goes from input string to generated tokens, step by step through the stack.
Every inference in llama.cpp follows this sequence. Click a step to explore its component page.
Opaque struct holding all model tensors (embeddings, attention matrices, FFN weights) plus hyperparameters loaded from the GGUF file. Shared across contexts and never mutated after load.
// include/llama.h — forward declaration only (opaque)
// The struct body is never exposed to callers; they only hold pointers to it.
struct llama_model; // ← allocated by llama_model_load_from_file()

// Key hyperparameters exposed via accessor functions:
//   llama_model_n_embd()   → embedding dimension (e.g. 4096)
//   llama_model_n_layer()  → transformer depth (e.g. 32)
//   llama_model_n_head()   → attention heads
//   llama_vocab_n_tokens() → vocabulary size (called on the vocab returned
//                            by llama_model_get_vocab(), not on the model)
Holds the KV cache, logits buffer, backend scheduler, and all per-request state. One context per inference session; multiple contexts can share a model.
// include/llama.h#L336 — context parameters
// Per-context configuration: sizes the context window and the batches,
// sets the CPU thread count, and selects the KV cache dtypes and the
// attention kernel. This listing is an excerpt; the trailing "// ..."
// stands for fields not shown here.
struct llama_context_params {
uint32_t n_ctx; // token context window (default 512)
uint32_t n_batch; // max tokens per llama_decode() call (logical batch)
uint32_t n_ubatch; // physical micro-batch size
uint32_t n_threads; // threads for CPU ops
enum ggml_type type_k; // KV cache key dtype (F16 default)
enum ggml_type type_v; // KV cache value dtype
bool flash_attn; // use flash attention kernel
// ... (remaining fields omitted from this excerpt)
};
All data needed for a single llama_decode() call. Can contain multiple tokens from multiple sequences simultaneously.
// include/llama.h#L240
// Input for one llama_decode() call. The per-token arrays are parallel:
// entry i of each array describes token i of the batch, so a single batch
// can carry tokens that belong to different sequences.
typedef struct llama_batch {
int32_t n_tokens; // total tokens in this batch
llama_token * token; // token IDs [n_tokens]
float * embd; // embeddings [n_tokens * n_embd] (or NULL)
              // NOTE(review): presumably an alternative to `token`; confirm in llama.h
llama_pos * pos; // positions [n_tokens]
int32_t * n_seq_id; // seq count [n_tokens]: how many sequence IDs token i has
llama_seq_id** seq_id; // seq IDs [n_tokens]: per-token pointer to that token's IDs
                       // (presumably n_seq_id[i] entries each; confirm in llama.h)
int8_t * logits; // per-token flag [n_tokens]: 1 = want logits, 0 = no
} llama_batch;
Every weight matrix and intermediate activation is a ggml_tensor. Tensors are nodes in the computation graph; their src[] pointers form the graph edges. No computation happens at construction time.
// ggml/include/ggml.h#L666
// One node of the computation graph. `op` names the operation that produces
// this tensor and `src[]` points at its inputs, so following `src[]` walks
// the graph edges.
// NOTE(review): this looks like an abridged listing; check ggml.h for the
// full field set before relying on the layout.
struct ggml_tensor {
enum ggml_type type; // element type: F32, F16, Q4_0, Q8_0, ...
int64_t ne[4]; // dimensions [cols, rows, batch, batch2]
size_t nb[4]; // byte strides per dimension
enum ggml_op op; // producing op: NONE | MUL_MAT | SOFT_MAX | ROPE | ...
struct ggml_tensor * src[GGML_MAX_SRC]; // input tensors (graph edges)
void * data; // raw data pointer (on device)
char name[GGML_MAX_NAME]; // fixed-size name buffer
};
This is the skeleton every llama.cpp-based program follows. The server and CLI examples wrap this pattern.
// 1. Init
llama_backend_init(); // llama.h#L449
// 2. Load model from GGUF file
llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = 35; // offload to GPU
struct llama_model * model =
llama_model_load_from_file("model.gguf", mparams); // llama.h#L484
// 3. Create inference context
llama_context_params cparams = llama_context_default_params();
cparams.n_ctx = 4096;
cparams.n_batch = 512;
struct llama_context * ctx =
llama_init_from_model(model, cparams); // llama.h#L509
// 4. Tokenize
const char * prompt = "Hello, world!";
int n_tokens = llama_tokenize( // llama.h#L1125
llama_model_get_vocab(model),
prompt, strlen(prompt),
tokens, max_tokens, true, false);
// 5. Create sampler chain
struct llama_sampler * smpl =
llama_sampler_chain_init(llama_sampler_chain_default_params());
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(50));
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8));
llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
// 6. Prefill prompt (process all prompt tokens at once)
struct llama_batch batch = llama_batch_get_one(tokens, n_tokens);
llama_decode(ctx, batch); // llama.h#L953
// 7. Autoregressive generation loop
for (int i = 0; i < max_new_tokens; i++) {
llama_token id =
llama_sampler_sample(smpl, ctx, -1); // llama.h#L1481
if (llama_vocab_is_eog(llama_model_get_vocab(model), id)) break;
// Print the token
char piece[32];
llama_token_to_piece(llama_model_get_vocab(model), id, piece, 32, 0, true);
printf("%s", piece);
// Feed sampled token back as next input
llama_sampler_accept(smpl, id);
batch = llama_batch_get_one(&id, 1);
llama_decode(ctx, batch); // KV cache grows +1
}
// 8. Cleanup
llama_sampler_free(smpl);
llama_free(ctx);
llama_model_free(model);
llama_backend_free();
llama.cpp/
├── include/llama.h ← Public C API (all structs & functions)
├── src/
│ ├── llama.cpp ← Thin wrappers → internal implementations
│ ├── llama-model.cpp ← GGUF loading, architecture dispatch
│ ├── llama-context.cpp ← decode(), KV cache, logits extraction
│ ├── llama-graph.cpp ← ggml_cgraph construction per architecture
│ ├── llama-batch.cpp ← Batch splitting into micro-batches
│ ├── llama-vocab.cpp ← Tokenizer (BPE / SPM / WPM / UGM / RWKV)
│ ├── llama-sampler.cpp ← Sampler chain implementations
│ └── models/llama*.cpp ← Per-architecture graph builders
├── ggml/
│ ├── include/ggml.h ← Tensor API (ggml_tensor, all ops)
│ ├── src/ggml.c ← CPU tensor math, graph executor
│ ├── src/ggml-backend.cpp ← Backend abstraction & scheduler
│ ├── src/ggml-cpu/ ← SIMD kernels (AVX2, NEON, …)
│ ├── src/ggml-cuda/ ← CUDA kernels
│ └── src/ggml-metal/ ← Metal shaders
└── tools/server/
├── server.cpp ← HTTP server entry point
├── server-context.cpp ← Inference loop, slot management
└── server-http.cpp ← Route handlers, JSON I/O