vllm.v1.core.kv_cache_metrics ¶

KV cache metrics tracking.

BlockMetricsState ¶

Tracks lifecycle metrics for a single KV cache block.

Source code in vllm/v1/core/kv_cache_metrics.py

class BlockMetricsState:
    """Lifecycle timestamps recorded for one KV cache block.

    Every timestamp is a ``time.monotonic_ns()`` reading in nanoseconds; the
    ``get_*_seconds`` methods convert differences to seconds.
    """

    def __init__(self):
        # A new state starts with identical birth and last-access timestamps.
        self.birth_time_ns = self.last_access_ns = time.monotonic_ns()
        # Keep only the four most recent access timestamps so that a block
        # accessed many times cannot grow this history without bound.
        self.access_history: deque[int] = deque(maxlen=4)

    def record_access(self) -> None:
        """Stamp the current time as the latest access and add it to the history."""
        stamp_ns = time.monotonic_ns()
        self.last_access_ns = stamp_ns
        self.access_history.append(stamp_ns)

    def get_lifetime_seconds(self) -> float:
        """Return the seconds elapsed since this state was created."""
        elapsed_ns = time.monotonic_ns() - self.birth_time_ns
        return elapsed_ns / 1e9

    def get_idle_time_seconds(self) -> float:
        """Return the seconds elapsed since the last recorded access.

        Before any access is recorded this is measured from creation time.
        """
        elapsed_ns = time.monotonic_ns() - self.last_access_ns
        return elapsed_ns / 1e9

    def get_reuse_gaps_seconds(self) -> list[float]:
        """Return the gaps, in seconds, between consecutive recorded accesses.

        The list is empty when fewer than two accesses are in the history.
        """
        stamps = list(self.access_history)
        # Pairing the list with itself shifted by one yields nothing for
        # histories of length 0 or 1, so no explicit length guard is needed.
        return [(later - earlier) / 1e9 for earlier, later in zip(stamps, stamps[1:])]

access_history `instance-attribute` ¶

access_history: deque[int] = deque(maxlen=4)

birth_time_ns `instance-attribute` ¶

birth_time_ns = now_ns

last_access_ns `instance-attribute` ¶

last_access_ns = now_ns

__init__ ¶

__init__()

Source code in vllm/v1/core/kv_cache_metrics.py

def __init__(self):
    # A new state starts with identical birth and last-access timestamps.
    self.birth_time_ns = self.last_access_ns = time.monotonic_ns()
    # Keep only the four most recent access timestamps so that a block
    # accessed many times cannot grow this history without bound.
    self.access_history: deque[int] = deque(maxlen=4)

get_idle_time_seconds ¶

get_idle_time_seconds() -> float

Source code in vllm/v1/core/kv_cache_metrics.py

def get_idle_time_seconds(self) -> float:
    """Return the seconds elapsed since ``self.last_access_ns``.

    The current time is read with ``time.monotonic_ns()``.
    """
    elapsed_ns = time.monotonic_ns() - self.last_access_ns
    return elapsed_ns / 1e9

get_lifetime_seconds ¶

get_lifetime_seconds() -> float

Source code in vllm/v1/core/kv_cache_metrics.py

def get_lifetime_seconds(self) -> float:
    """Return the seconds elapsed since ``self.birth_time_ns``.

    The current time is read with ``time.monotonic_ns()``.
    """
    elapsed_ns = time.monotonic_ns() - self.birth_time_ns
    return elapsed_ns / 1e9

get_reuse_gaps_seconds ¶

get_reuse_gaps_seconds() -> list[float]

Source code in vllm/v1/core/kv_cache_metrics.py

def get_reuse_gaps_seconds(self) -> list[float]:
    """Return the gaps, in seconds, between consecutive recorded accesses.

    The list is empty when fewer than two accesses are in the history.
    """
    stamps = list(self.access_history)
    # Pairing the list with itself shifted by one yields nothing for
    # histories of length 0 or 1, so no explicit length guard is needed.
    return [(later - earlier) / 1e9 for earlier, later in zip(stamps, stamps[1:])]

record_access ¶

record_access() -> None

Source code in vllm/v1/core/kv_cache_metrics.py

def record_access(self) -> None:
    """Stamp the current time as the latest access and add it to the history."""
    stamp_ns = time.monotonic_ns()
    self.last_access_ns = stamp_ns
    self.access_history.append(stamp_ns)

KVCacheMetricsCollector ¶

Collects KV cache residency metrics with sampling.

Source code in vllm/v1/core/kv_cache_metrics.py

class KVCacheMetricsCollector:
    """Records residency metrics for a random sample of KV cache blocks.

    Each allocated block is tracked with probability ``sample_rate``. When a
    tracked block is evicted, its lifetime, idle time and reuse gaps are
    stored as a ``KVCacheEvictionEvent`` until ``drain_events`` is called.
    """

    def __init__(self, sample_rate: float = 0.01):
        in_range = 0 < sample_rate <= 1.0
        assert in_range, (
            f"sample_rate must be in (0, 1.0], got {sample_rate}"
        )

        # Eviction events buffered until the next drain_events() call.
        self._eviction_events: list[KVCacheEvictionEvent] = []
        # Per-block state for the sampled blocks, keyed by block_id.
        self.block_metrics: dict[int, BlockMetricsState] = {}
        self.sample_rate = sample_rate

    def should_sample_block(self) -> bool:
        """Return True when a fresh uniform draw falls below ``sample_rate``."""
        draw = random.random()
        return draw < self.sample_rate

    def on_block_allocated(self, block: "KVCacheBlock") -> None:
        """Start tracking ``block`` if the sampler selects it."""
        if not self.should_sample_block():
            return
        self.block_metrics[block.block_id] = BlockMetricsState()

    def on_block_accessed(self, block: "KVCacheBlock") -> None:
        """Record an access on ``block`` if it is being tracked."""
        state = self.block_metrics.get(block.block_id)
        if not state:
            return
        state.record_access()

    def on_block_evicted(self, block: "KVCacheBlock") -> None:
        """Stop tracking ``block`` and buffer an eviction event for it.

        Blocks that were never sampled are ignored.
        """
        state = self.block_metrics.pop(block.block_id, None)
        if not state:
            return

        event = KVCacheEvictionEvent(
            lifetime_seconds=state.get_lifetime_seconds(),
            idle_seconds=state.get_idle_time_seconds(),
            reuse_gaps_seconds=tuple(state.get_reuse_gaps_seconds()),
        )
        self._eviction_events.append(event)

    def reset(self) -> None:
        """Drop all tracked blocks and buffered events on cache reset."""
        # Both containers are emptied in place rather than rebound.
        self.block_metrics.clear()
        self._eviction_events.clear()

    def drain_events(self) -> list[KVCacheEvictionEvent]:
        """Return the buffered eviction events and start a new empty buffer."""
        # Rebinding (instead of clearing) leaves the returned list untouched
        # by later appends or resets.
        drained, self._eviction_events = self._eviction_events, []
        return drained

_eviction_events `instance-attribute` ¶

_eviction_events: list[KVCacheEvictionEvent] = []

block_metrics `instance-attribute` ¶

block_metrics: dict[int, BlockMetricsState] = {}

sample_rate `instance-attribute` ¶

sample_rate = sample_rate

__init__ ¶

__init__(sample_rate: float = 0.01)

Source code in vllm/v1/core/kv_cache_metrics.py

def __init__(self, sample_rate: float = 0.01):
    in_range = 0 < sample_rate <= 1.0
    assert in_range, (
        f"sample_rate must be in (0, 1.0], got {sample_rate}"
    )

    # Eviction events buffered until the next drain_events() call.
    self._eviction_events: list[KVCacheEvictionEvent] = []
    # Per-block state for the sampled blocks, keyed by block_id.
    self.block_metrics: dict[int, BlockMetricsState] = {}
    self.sample_rate = sample_rate

drain_events ¶

drain_events() -> list[KVCacheEvictionEvent]

Source code in vllm/v1/core/kv_cache_metrics.py

def drain_events(self) -> list[KVCacheEvictionEvent]:
    """Return the buffered eviction events and start a new empty buffer."""
    # Rebinding (instead of clearing) leaves the returned list untouched
    # by later appends to the new buffer.
    drained, self._eviction_events = self._eviction_events, []
    return drained

on_block_accessed ¶

on_block_accessed(block: KVCacheBlock) -> None

Source code in vllm/v1/core/kv_cache_metrics.py

def on_block_accessed(self, block: "KVCacheBlock") -> None:
    """Record an access on ``block`` if it is being tracked."""
    state = self.block_metrics.get(block.block_id)
    if not state:
        return
    state.record_access()

on_block_allocated ¶

on_block_allocated(block: KVCacheBlock) -> None

Source code in vllm/v1/core/kv_cache_metrics.py

def on_block_allocated(self, block: "KVCacheBlock") -> None:
    """Start tracking ``block`` if the sampler selects it."""
    if not self.should_sample_block():
        return
    self.block_metrics[block.block_id] = BlockMetricsState()

on_block_evicted ¶

on_block_evicted(block: KVCacheBlock) -> None

Source code in vllm/v1/core/kv_cache_metrics.py

def on_block_evicted(self, block: "KVCacheBlock") -> None:
    """Stop tracking ``block`` and buffer an eviction event for it.

    Blocks that are not present in ``self.block_metrics`` are ignored.
    """
    state = self.block_metrics.pop(block.block_id, None)
    if not state:
        return

    event = KVCacheEvictionEvent(
        lifetime_seconds=state.get_lifetime_seconds(),
        idle_seconds=state.get_idle_time_seconds(),
        reuse_gaps_seconds=tuple(state.get_reuse_gaps_seconds()),
    )
    self._eviction_events.append(event)

reset ¶

reset() -> None

Clear all state on cache reset.

Source code in vllm/v1/core/kv_cache_metrics.py

def reset(self) -> None:
    """Drop all tracked blocks and buffered events on cache reset."""
    # Both containers are emptied in place rather than rebound.
    self.block_metrics.clear()
    self._eviction_events.clear()

should_sample_block ¶

should_sample_block() -> bool

Source code in vllm/v1/core/kv_cache_metrics.py

def should_sample_block(self) -> bool:
    """Return True when a fresh uniform draw falls below ``sample_rate``."""
    draw = random.random()
    return draw < self.sample_rate

vllm.v1.core.kv_cache_metrics ¶

BlockMetricsState ¶

access_history instance-attribute ¶

birth_time_ns instance-attribute ¶

last_access_ns instance-attribute ¶

__init__ ¶

get_idle_time_seconds ¶

get_lifetime_seconds ¶

get_reuse_gaps_seconds ¶

record_access ¶

KVCacheMetricsCollector ¶

_eviction_events instance-attribute ¶

block_metrics instance-attribute ¶

sample_rate instance-attribute ¶

__init__ ¶

drain_events ¶

on_block_accessed ¶

on_block_allocated ¶

on_block_evicted ¶

reset ¶

should_sample_block ¶

access_history `instance-attribute` ¶

birth_time_ns `instance-attribute` ¶

last_access_ns `instance-attribute` ¶

__init__ ¶

_eviction_events `instance-attribute` ¶

block_metrics `instance-attribute` ¶

sample_rate `instance-attribute` ¶

__init__ ¶