docs: Optimize SEO #23

Merged 1 commit on Jul 25, 2025
2 changes: 1 addition & 1 deletion docs/getting-started/calculating-gpu-memory-for-llms.md
@@ -2,7 +2,7 @@
  sidebar_position: 2
  description: Learn how to calculate GPU memory for serving LLMs.
  keywords:
- - GPU memory calculation
+ - GPU memory calculation, LLM inference hardware calculator
  - VRAM calculation
  - LLM memory requirements
  ---

@@ -2,9 +2,8 @@
  sidebar_position: 5
  description: Select the right inference frameworks for your use case.
  keywords:
- - Inference frameworks
- - Inference backends
- - Inference runtimes
+ - Inference frameworks, inference backends, inference runtimes, inference engines, inference platforms
+ - Best inference frameworks, best LLM inference providers, LLM inference benchmark
  - vLLM, SGLang, LMDeploy, TensorRT-LLM, Hugging Face TGI, llama.cpp, MLC-LLM, Ollama
  ---

2 changes: 1 addition & 1 deletion docs/getting-started/choosing-the-right-model.md
@@ -2,7 +2,7 @@
  sidebar_position: 1
  description: Select the right models for your use case.
  keywords:
- - LLMs
+ - LLMs, dense models
  - Base models
  - Instruction-tuned models
  - Mixture of Experts models

6 changes: 3 additions & 3 deletions docs/getting-started/llm-fine-tuning.md
@@ -1,9 +1,9 @@
  ---
  sidebar_position: 3
- description: Understand LLM fine-tuning and different fine-tuning frameworks
+ description: Understand LLM fine-tuning and different fine-tuning frameworks.
  keywords:
- - LLM fine-tuning
- - Fine-tuning frameworks
+ - LLM fine-tuning, LoRA, how does LLM fine-tuning work
+ - Fine-tuning frameworks, open source LLM fine-tuning, types of LLM fine-tuning
  - Axolotl, Unsloth, Torchtune, LLaMA Factory
  ---

6 changes: 3 additions & 3 deletions docs/getting-started/llm-quantization.md
@@ -1,9 +1,9 @@
  ---
  sidebar_position: 4
- description: Understand LLM quantization and different quantization formats and methods
+ description: Understand LLM quantization and different quantization formats and methods.
  keywords:
- - LLM quantization
- - Quantization formats
+ - LLM quantization, how does quantization work, LLM quantization accuracy
+ - Quantization formats, quantization types, quantization techniques
  - AWQ, SmoothQuant, GPTQ
  ---

1 change: 1 addition & 0 deletions docs/getting-started/tool-integration/_category_.json
@@ -1,6 +1,7 @@
  {
  "label": "Tool integration",
  "position": 6,
+ "collapsed": false,
  "link": {
  "type": "generated-index"
  }

3 changes: 2 additions & 1 deletion docs/getting-started/tool-integration/function-calling.md
@@ -2,7 +2,8 @@
  sidebar_position: 1
  description: Learn what function calling is and its use case.
  keywords:
- - Function calling
+ - Function calling, function calling APIs
+ - LLM tool use, LLM tool integration
  ---

  import LinkList from '@site/src/components/LinkList';

@@ -2,8 +2,8 @@
  sidebar_position: 2
  description: Learn what Model Context Protocol (MCP) is and its use case.
  keywords:
- - Model Context Protocol
- - MCP
+ - Model Context Protocol, MCP protocol
+ - MCP host, MCP clients, MCP servers
  ---

  # Model Context Protocol

@@ -2,8 +2,9 @@
  sidebar_position: 9
  description: Understand the differences between data, tensor, pipeline, expert and hybrid parallelisms.
  keywords:
- - LLM inference optimization
+ - LLM inference optimization, LLM inference optimization techniques
  - Data parallelism, tensor parallelism, pipeline parallelism, expert parallelism and hybrid parallelism
+ - Speed up LLM inference
  ---

  import LinkList from '@site/src/components/LinkList';

@@ -3,9 +3,10 @@ sidebar_position: 8
  description: Route LLM requests based on KV cache usage for faster, smarter inference.
  keywords:
  - KV cache
- - Load balancing
- - LLM inference optimization
+ - Load balancing, LLM load balancing
+ - LLM inference optimization, LLM inference optimization techniques
  - Gateway API Inference Extension
+ - Speed up LLM inference
  ---

  # KV cache utilization-aware load balancing

9 changes: 4 additions & 5 deletions docs/inference-optimization/llm-inference-metrics.md
@@ -2,12 +2,11 @@
  sidebar_position: 1
  description: Measure key metrics like latency and throughput to optimize LLM inference performance.
  keywords:
- - LLM inference metrics
- - LLM benchmarks
- - Time to First Token (TTFT), Time per Output Token (TPOT)
- - Requests per Second (RPS), Tokens per Second (TPS)
- - Latency, Throughput, Goodput
+ - LLM inference, inference metrics
+ - LLM benchmarks, inference benchmarks
+ - Time to First Token (TTFT), Time per Output Token (TPOT), Inter-token Latency (ITL), Requests per Second (RPS), Tokens per Second (TPS), Latency, Throughput, Goodput
  - Service-Level Objective (SLO)
+ - Speed up LLM inference
  ---

  import LinkList from '@site/src/components/LinkList';

5 changes: 3 additions & 2 deletions docs/inference-optimization/offline-batch-inference.md
@@ -2,8 +2,9 @@
  sidebar_position: 10
  description: Run predictions at scale with offline batch inference for efficient, non-real-time processing.
  keywords:
- - Offline batch inference
- - Batch inference
+ - Offline batch inference, batch inference, batch LLM inference, batch requests, batch processing, LLM inference batching
+ - LLM inference optimization, LLM inference optimization techniques
+ - Speed up LLM inference
  ---

  # Offline batch inference

4 changes: 3 additions & 1 deletion docs/inference-optimization/pagedattention.md
@@ -4,7 +4,9 @@ description: Improve LLM memory usage with block-based KV cache storage via Page
  keywords:
  - vLLM, Hugging Face TGI, TensorRT-LLM
  - PagedAttention
- - KV cache
+ - KV cache, KV cache optimization, KV caching
+ - LLM inference optimization, LLM inference optimization techniques
+ - Speed up LLM inference
  ---

  import LinkList from '@site/src/components/LinkList';

3 changes: 2 additions & 1 deletion docs/inference-optimization/prefill-decode-disaggregation.md
@@ -6,7 +6,8 @@ keywords:
  - Disaggregating prefill and decode
  - Prefill, decode
  - Distributed LLM inference
- - Inference optimization
+ - LLM inference optimization, LLM inference optimization techniques
+ - Speed up LLM inference
  ---

  import LinkList from '@site/src/components/LinkList';

@@ -8,6 +8,8 @@ keywords:
  - Distributed inference, distributed LLM inference
  - Inference optimization
  - Dynamo, SGLang, vLLM, llm-d
+ - LLM inference optimization, LLM inference optimization techniques
+ - Speed up LLM inference
  ---

  # Prefix cache-aware routing

2 changes: 2 additions & 0 deletions docs/inference-optimization/prefix-caching.md
@@ -7,6 +7,8 @@ keywords:
  - Distributed inference, distributed LLM inference
  - Inference optimization
  - Dynamo, SGLang, vLLM, llm-d
+ - LLM inference optimization, LLM inference optimization techniques
+ - Speed up LLM inference
  ---

  import LinkList from '@site/src/components/LinkList';

3 changes: 2 additions & 1 deletion docs/inference-optimization/speculative-decoding.md
@@ -5,7 +5,8 @@ keywords:
  - Speculative decoding, speculative sampling
  - Draft model, target model
  - Distributed inference, distributed LLM inference
- - Inference optimization
+ - LLM inference optimization, LLM inference optimization techniques
+ - Speed up LLM inference
  ---

  import LinkList from '@site/src/components/LinkList';

@@ -3,6 +3,10 @@ sidebar_position: 2
  description: Optimize LLM inference with static, dynamic, and continuous batching for better GPU utilization.
  keywords:
  - Static batching, dynamic batching and continuous batching
+ - Batch LLM inference, batch requests, batch processing, LLM inference batching, LLM batching
+ - Batch size, batch window
+ - LLM inference optimization, LLM inference optimization techniques, LLM batch API
+ - Speed up LLM inference
  ---

  import LinkList from '@site/src/components/LinkList';

@@ -1,7 +1,8 @@
  {
  "label": "Challenges in building infrastructure for LLM inference",
  "position": 2,
+ "collapsed": false,
  "link": {
  "type": "generated-index"
  }
  }

@@ -2,7 +2,7 @@
  sidebar_position: 2
  description: Building LLM infrastructure in-house is costly, complex, and slows AI product development and innovation.
  keywords:
- - LLM infrastructure cost
+ - LLM infrastructure cost, inference challenges
  - Self-hosted LLM challenges
  - Building vs buying AI infrastructure
  - AI inference infrastructure

@@ -2,8 +2,7 @@
  sidebar_position: 3
  description: Ensure reliable LLM inference with comprehensive observability across metrics, logs, and GPU performance.
  keywords:
- - LLM inference observability
- - LLM-specific metrics
+ - LLM inference observability, LLM-specific metrics, inference metrics
  - GPU utilization, tokens per second, time to first token, total generation time, time per output token
  - LLM monitoring, logging, alerts, tracing
  - Self-hosted LLM challenges

@@ -2,7 +2,7 @@
  sidebar_position: 1
  description: Fast scaling enables AI systems to handle dynamic LLM inference workloads while minimizing latency and cost.
  keywords:
- - Scalable LLM inference, fast scaling, scalability
+ - Scalable LLM inference, fast scaling, scalability, LLM inference scaling, LLM scaling law
  - LLM cold starts, Kubernetes cold starts, LLM container cold starts
  - Concurrency, QPS, CPU and GPU utilization
  - Self-hosted LLM challenges

@@ -5,6 +5,7 @@ keywords:
  - LLM inference infrastructure management
  - InferenceOps, inference operations
  - AI infrastructure
+ - LLM inference best practices
  ---

  # InferenceOps and management

@@ -2,7 +2,7 @@
  sidebar_position: 1
  description: Deploy, scale, and manage LLMs with purpose-built inference infrastructure.
  keywords:
- - LLM inference infrastructure
+ - LLM inference infrastructure, inference platform, in-house infrastructure
  - Self-hosted LLM inference
  - AI infrastructure
  ---

7 changes: 5 additions & 2 deletions docs/introduction.md
@@ -4,8 +4,11 @@ sidebar_position: 0
  sidebar_class_name: hidden
  description: A practical handbook for engineers building, optimizing, scaling and operating LLM inference systems in production.
  keywords:
- - LLM inference guide, LLM inference handbook
- - LLM, LLM inference, AI inference
+ - LLM inference guide, LLM inference handbook, LLM inference book, LLM inference best practices
+ - Inference, LLM, LLM inference, AI inference, GenAI inference
+ - Inference optimization, inference techniques, LLM fast inference
+ - Inference platform, inference operations
+ - Efficient generative LLM inference, distributed LLM inference
  ---

  import Features from '@site/src/components/Features';

8 changes: 4 additions & 4 deletions docs/llm-inference-basics/cpu-vs-gpu-vs-tpu.md
@@ -1,10 +1,10 @@
  ---
  sidebar_position: 4
- description: Learn the differences between CPUs, GPUs, and TPUs
+ description: Learn the differences between CPUs, GPUs, and TPUs and where you can deploy them.
  keywords:
- - CPUs
- - GPUs
- - TPUs
+ - CPUs, GPUs, TPUs, CPU vs GPU vs TPU
+ - Cloud LLM inference, On-prem LLM inference, On-device LLM inference, GPU inference, Edge LLM inference
+ - LLM inference hardware
  ---

  import LinkList from '@site/src/components/LinkList';

11 changes: 5 additions & 6 deletions docs/llm-inference-basics/how-does-llm-inference-work.md
@@ -1,12 +1,11 @@
  ---
  sidebar_position: 3
- description: Learn how prefill and decode work in LLM inference.
+ description: Learn how LLM inference works, from tokenization to prefill and decode stages, with tips on performance, KV caching, and optimization strategies.
  keywords:
- - LLM inference
- - Prefill
- - Decode
- - Tokenization
- - Tokens
+ - LLM inference, how LLM inference works, autoregressive decoding, transformer inference
+ - Prefill and decode
+ - LLM tokenization, tokens
+ - KV cache LLM
  ---

  import LinkList from '@site/src/components/LinkList';

8 changes: 4 additions & 4 deletions docs/llm-inference-basics/openai-compatible-api.md
@@ -1,10 +1,10 @@
  ---
  sidebar_position: 6
- description: Learn the concept of OpenAI-compatible API and why you need it.
+ description: An OpenAI-compatible API implements the same request and response formats as OpenAI's official API, allowing developers to switch between different models without changing existing code.
  keywords:
- - OpenAI-compatible API, OpenAI-compatible endpoint
- - OpenAI API, OpenAI compatibility
- - ChatGPT
+ - OpenAI-compatible API, OpenAI-compatible endpoint, OpenAI-compatible server
+ - OpenAI API, OpenAI compatibility, ChatGPT
+ - LLM inference API
  ---

  import LinkList from '@site/src/components/LinkList';

@@ -1,11 +1,11 @@
  ---
  sidebar_position: 5
- description: Understand the differences between serverless AI APIs and self-hosted deployments.
+ description: Understand the differences between serverless LLM APIs and self-hosted LLM deployments.
  keywords:
- - Serverless APIs
- - Managed APIs
- - Self-hosted LLMs
- - Self-hosted inference
- - Serverless vs. Self-hosted LLM inference
+ - Serverless APIs, Managed APIs, serverless LLMs, serverless inference
+ - Self-hosted LLMs, Self-hosted inference, open source LLMs, custom LLMs
  - LLM inference cost
  ---

  import LinkList from '@site/src/components/LinkList';

6 changes: 3 additions & 3 deletions docs/llm-inference-basics/training-inference-differences.md
@@ -1,11 +1,11 @@
  ---
  sidebar_position: 2
- description: LLM training builds the model; inference applies it to generate real-time outputs from new inputs.
+ description: LLM training builds the model while LLM inference applies it to generate real-time outputs from new inputs.
  keywords:
  - LLM training vs. inference
  - LLM training, LLM inference
- - AI inference
- - AI training
  - Differences between LLM inference and training
+ - AI training, training techniques
+ - Traning and inference
  ---

5 changes: 2 additions & 3 deletions docs/llm-inference-basics/what-is-llm-inference.md
@@ -3,9 +3,8 @@ sidebar_position: 1
  description: LLM inference is the process of using a trained language model to generate responses or predictions based on prompts.
  keywords:
  - Large Language Models, LLM
- - LLM inference meaning
- - LLM inference
- - AI inference
+ - LLM inference meaning, LLM inference concept
+ - LLM inference, AI inference, inference layer
  ---

  # What is LLM inference?