Strange increase of non-torch memory from unexpected functions

Hi team, I’m investigating how to measure the CUDA memory taken by non-torch components, such as NCCL. I have come up with a function that seems to do the trick:

def measure_current_non_torch():
    free, total = torch.cuda.mem_get_info()  # free/total device memory as reported by the driver
    current_used = total - free
    current_torch = torch.cuda.memory_reserved()  # memory held by PyTorch's caching allocator
    current_non_torch = current_used - current_torch
    return current_non_torch

If I just allocate tensors in PyTorch, I find this gives a consistent value: the only non-torch memory is the CUDA context.
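For example, here is a quick sanity check along those lines (a sketch; the tensor sizes are arbitrary, and the exact numbers depend on the GPU, driver, and allocation granularity), using the function above:

import torch

torch.cuda.init()
baseline = measure_current_non_torch()  # roughly the size of the CUDA context

# allocate some tensors through PyTorch's caching allocator
tensors = [torch.empty(256 * 1024 * 1024, dtype=torch.uint8, device="cuda")
           for _ in range(4)]

# the new memory is fully accounted for by torch.cuda.memory_reserved(),
# so the non-torch value should stay (almost) the same
delta = measure_current_non_torch() - baseline
print(f"non-torch memory changed by {delta / 1024 / 1024} MiB")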

However, I find that when I run some computation, unexpected non-torch memory shows up.

I tried tracing every line of Python code to monitor changes in the non-torch memory:

import contextlib
import dataclasses
import sys
import traceback
from typing import Callable, Generator, Generic, TypeVar

_T = TypeVar("_T")


@dataclasses.dataclass
class MonitoredValues(Generic[_T]):
    values: list[_T] = dataclasses.field(default_factory=list)
    trace_stacks: list[str] = dataclasses.field(default_factory=list)


@contextlib.contextmanager
def monitor(
    measure_func: Callable[[],
                           _T]) -> Generator[MonitoredValues[_T], None, None]:
    """
    Trace the function calls to continuously monitor the change of
    a value.

    Usage:

    ```python

    def measure_func():
        ... # measure the current value
        return current_value

    with monitor(measure_func) as monitored_values:
        # do something
    
        monitored_values.values # all changes of the values
        monitored_values.trace_stacks # trace stacks of every change
    ```
    """
    monitored_values = MonitoredValues[_T]()

    def _trace_calls(frame, event, arg=None):
        nonlocal monitored_values
        if event in ['line']:
            # triggered by every line of Python code.
            # only Python functions will trigger it,
            # c/cpp functions will not trigger it.
            try:
                # Temporarily disable the trace function
                sys.settrace(None)
                # do a measurement
                current_value = measure_func()
                if len(monitored_values.values
                       ) == 0 or current_value != monitored_values.values[-1]:
                    monitored_values.values.append(current_value)
                    monitored_values.trace_stacks.append("".join(
                        traceback.format_stack()))
                # Re-enable the trace function
                sys.settrace(_trace_calls)
            except NameError:
                # modules are deleted during shutdown
                pass
        return _trace_calls

    try:
        sys.settrace(_trace_calls)
        yield monitored_values
    finally:
        sys.settrace(None)

import torch
import torchvision

def f():
    net = torchvision.models.resnet50().cuda()
    inputs = torch.randn((64, 3, 224, 224)).cuda()
    outputs = net(inputs)

def measure_current_non_torch():
    free, total = torch.cuda.mem_get_info()
    current_used = total - free
    current_torch = torch.cuda.memory_reserved()
    current_non_torch = current_used - current_torch
    return current_non_torch

with monitor(measure_current_non_torch) as monitor_values:
    f()

for value, stack in zip(monitor_values.values, \
        monitor_values.trace_stacks):
    print(f"non_torch memory changed to {value / 1024 / 1024} MiB in\n")
    print(stack + "\n")

And the outputs are surprising:

non_torch memory changed to 529.0625 MiB in

  File "/data/youkaichao/vllm/testf.py", line 86, in <module>
    f()
  File "/data/youkaichao/vllm/testf.py", line 74, in f
    net = torchvision.models.resnet50().cuda()
  File "/data/youkaichao/vllm/testf.py", line 56, in _trace_calls
    traceback.format_stack()))


non_torch memory changed to 541.0625 MiB in

  File "/data/youkaichao/vllm/testf.py", line 86, in <module>
    f()
  File "/data/youkaichao/vllm/testf.py", line 76, in f
    outputs = net(inputs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torchvision/models/resnet.py", line 285, in forward
    return self._forward_impl(x)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torchvision/models/resnet.py", line 269, in _forward_impl
    x = self.bn1(x)
  File "/data/youkaichao/vllm/testf.py", line 56, in _trace_calls
    traceback.format_stack()))


non_torch memory changed to 631.0625 MiB in

  File "/data/youkaichao/vllm/testf.py", line 86, in <module>
    f()
  File "/data/youkaichao/vllm/testf.py", line 76, in f
    outputs = net(inputs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torchvision/models/resnet.py", line 285, in forward
    return self._forward_impl(x)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torchvision/models/resnet.py", line 269, in _forward_impl
    x = self.bn1(x)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/batchnorm.py", line 174, in forward
    if self.momentum is None:  # use cumulative moving average
  File "/data/youkaichao/vllm/testf.py", line 56, in _trace_calls
    traceback.format_stack()))


non_torch memory changed to 633.0625 MiB in

  File "/data/youkaichao/vllm/testf.py", line 86, in <module>
    f()
  File "/data/youkaichao/vllm/testf.py", line 76, in f
    outputs = net(inputs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torchvision/models/resnet.py", line 285, in forward
    return self._forward_impl(x)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torchvision/models/resnet.py", line 273, in _forward_impl
    x = self.layer1(x)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/container.py", line 250, in forward
    input = module(input)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torchvision/models/resnet.py", line 155, in forward
    out = self.bn3(out)
  File "/data/youkaichao/vllm/testf.py", line 56, in _trace_calls
    traceback.format_stack()))


non_torch memory changed to 697.0625 MiB in

  File "/data/youkaichao/vllm/testf.py", line 86, in <module>
    f()
  File "/data/youkaichao/vllm/testf.py", line 76, in f
    outputs = net(inputs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torchvision/models/resnet.py", line 285, in forward
    return self._forward_impl(x)
  File "/data/youkaichao/uv_envs/py310/lib/python3.10/site-packages/torchvision/models/resnet.py", line 282, in _forward_impl
    return x
  File "/data/youkaichao/vllm/testf.py", line 56, in _trace_calls
    traceback.format_stack()))

In particular, x = self.bn1(x) causes the non-torch memory to increase.
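For reference, the batchnorm case can be reproduced in isolation. Here is a minimal sketch (the shapes roughly mirror the resnet50 trace above; the exact delta will vary with the GPU and CUDA/cuDNN version):

import torch

def measure_current_non_torch():
    free, total = torch.cuda.mem_get_info()
    return (total - free) - torch.cuda.memory_reserved()

bn = torch.nn.BatchNorm2d(64).cuda()
x = torch.randn(64, 64, 112, 112, device="cuda")  # roughly the activation shape going into bn1

before = measure_current_non_torch()
y = bn(x)  # the first batchnorm call seems to grow the non-torch memory
torch.cuda.synchronize()
after = measure_current_non_torch()
print(f"non-torch memory grew by {(after - before) / 1024 / 1024} MiB")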

I also have a very long profile from running an LLM: tracing_non_torch_memory_for_rank_0_in_tp_1.txt (shared on Google Drive).

It is even stranger with cudagraph capture: running a linear layer causes a sudden increase and then decrease in non-torch memory.
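I have not extracted a minimal repro from the LLM run, but the experiment I have in mind looks roughly like this (a sketch; the layer size, warmup pattern, and measurement points are illustrative, not taken from the profile):

import torch

def measure_current_non_torch():
    free, total = torch.cuda.mem_get_info()
    return (total - free) - torch.cuda.memory_reserved()

linear = torch.nn.Linear(4096, 4096).cuda()
x = torch.randn(8, 4096, device="cuda")

# warm up on a side stream so lazy initialization happens outside of capture
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        y = linear(x)
torch.cuda.current_stream().wait_stream(s)

print("before capture:", measure_current_non_torch() / 1024 / 1024, "MiB")

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    y = linear(x)

print("after capture: ", measure_current_non_torch() / 1024 / 1024, "MiB")

g.replay()
torch.cuda.synchronize()
print("after replay:  ", measure_current_non_torch() / 1024 / 1024, "MiB")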

So, my question is:

Is my measure_current_non_torch function accurate? Is there anything I missed?

Thanks!

Okay, I did an analysis hooking into cudaMalloc and cudaFree to compute the ground-truth amount of memory that PyTorch takes from CUDA, and it confirms that torch.cuda.memory_reserved() is accurate.

Just for the record:

// save as audit_cuda.cpp
// compile with g++ -shared -fPIC -ldl -o libaudit.so audit_cuda.cpp -I/usr/local/cuda/include
#define _GNU_SOURCE
#include <link.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// include cuda_runtime.h
#include <cuda_runtime.h>

// use a map of uint to uint to record the size of the memory
#include <map>
#include <stdint.h>
using namespace std;
std::map<uint64_t, uint64_t> pointer_to_size;

// use file
#include <fstream>

extern "C"
{

typedef cudaError_t (*cudaMalloc_t)(void **, size_t);
typedef cudaError_t (*cudaFree_t)(void *);

// Global pointers to the REAL functions
static cudaMalloc_t real_cudaMalloc = NULL;
static cudaFree_t   real_cudaFree   = NULL;

uint64_t total_memory_from_cudaMalloc = 0;

// Interceptor for cudaMalloc
static cudaError_t my_cudaMalloc(void **devPtr, size_t size) {
    total_memory_from_cudaMalloc += size;
    // Call the real cudaMalloc
    cudaError_t val = real_cudaMalloc(devPtr, size);
    pointer_to_size[(uint64_t)*devPtr] = size;
    // print pointer and size
    printf("cudaMalloc: %p, %lu\n", *devPtr, size);
    // open a file named ans.txt, delete the old content
    ofstream fout("ans.txt", ios::out | ios::trunc);
    fout << total_memory_from_cudaMalloc << endl;
    return val;
}

// Interceptor for cudaFree
static cudaError_t my_cudaFree(void *devPtr) {
    // Remove the memory from the map
    uint64_t size = pointer_to_size[(uint64_t)devPtr];
    pointer_to_size.erase((uint64_t)devPtr);
    total_memory_from_cudaMalloc -= size;
    // print pointer and size
    printf("cudaFree: %p, %lu\n", devPtr, size);
    // open a file named ans.txt, delete the old content
    ofstream fout("ans.txt", ios::out | ios::trunc);
    fout << total_memory_from_cudaMalloc << endl;
    // Call the real cudaFree
    return real_cudaFree(devPtr);
}

unsigned int la_version(unsigned int version) {
    return LAV_CURRENT;
}

char *la_objsearch(const char *name, uintptr_t *cookie, unsigned int flag) {
    return (char *)name;
}

unsigned int la_objopen(struct link_map *map, Lmid_t lmid, uintptr_t *cookie) {
    *cookie = (uintptr_t)map;
    return LA_FLG_BINDTO | LA_FLG_BINDFROM;
}

uintptr_t la_symbind64(Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, uintptr_t *defcook, unsigned int *flags, const char *symname) {
    struct link_map *map = (struct link_map *)*defcook;

    // If we detect cudaMalloc, store the real function pointer and return our interceptor
    if (strcmp(symname, "cudaMalloc") == 0 && map) {
        real_cudaMalloc = (cudaMalloc_t)((uintptr_t)sym->st_value);

        // Return the address of our "my_cudaMalloc" so the calls get rerouted
        return (uintptr_t)my_cudaMalloc;
    }

    // If we detect cudaFree, store the real function pointer and return our interceptor
    if (strcmp(symname, "cudaFree") == 0 && map) {
        real_cudaFree = (cudaFree_t)((uintptr_t)sym->st_value);
        return (uintptr_t)my_cudaFree;
    }
    return sym->st_value;
}

unsigned int la_objclose(uintptr_t *cookie) {
    return 0;
}

void __attribute__((destructor)) finalize() {
}

} // extern "C"

Run the following Python code:

# run with LD_AUDIT=$PWD/libaudit.so python test.py
import torch
import torchvision
import sys
import traceback

def get_total_memory_from_cudaMalloc():
    with open("ans.txt") as f:
        return int(f.read().strip())

def f():
    net = torchvision.models.resnet50().cuda()
    inputs = torch.randn((64, 3, 224, 224)).cuda()
    outputs = net(inputs)

def _trace_calls(frame, event, arg=None):
    if event in ['line']:
        # triggered by every line of Python code.
        # only Python functions will trigger it,
        # c/cpp functions will not trigger it.
        try:
            # Temporarily disable the trace function
            sys.settrace(None)
            # do a measurement
            gt_value = get_total_memory_from_cudaMalloc()
            pt_value = torch.cuda.memory_reserved()  # what PyTorch has requested from CUDA via cudaMalloc
            assert gt_value == pt_value, "".join(
                    traceback.format_stack())
            # Re-enable the trace function
            sys.settrace(_trace_calls)
        except NameError:
            # modules are deleted during shutdown
            pass
    return _trace_calls

sys.settrace(_trace_calls)

f()

and it finishes normally, which means torch.cuda.memory_reserved() accurately reflects the memory PyTorch requests from CUDA via cudaMalloc.
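As a side note, this is also why memory_reserved() (rather than memory_allocated()) is the quantity that matches the cudaMalloc total: freed tensors go back to the caching allocator, not to the driver. This is just my understanding of the allocator, illustrated below:

import torch

a = torch.randn(1024, 1024, device="cuda")
print(torch.cuda.memory_allocated())  # > 0: bytes held by live tensors
print(torch.cuda.memory_reserved())   # >= allocated: bytes obtained from CUDA via cudaMalloc

del a  # the block is returned to the caching allocator; cudaFree is NOT called

print(torch.cuda.memory_allocated())  # drops back (to 0 in a fresh process)
print(torch.cuda.memory_reserved())   # unchanged: the segment stays reserved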

Okay, I think I get the point: the memory cost of the CUDA context is not constant; it can change.

For example, capturing a cudagraph can increase the CUDA context size. Calling matmul also increases the context size, possibly because of CUDA module loading (kernels being loaded into the context on first use).

Here is a test script:

import torch
def measure_current_non_torch():
    free, total = torch.cuda.mem_get_info()
    current_used = total - free
    current_torch = torch.cuda.memory_reserved()
    current_non_torch = current_used - current_torch
    return current_non_torch

torch.cuda.init()
# print in MiB
print(measure_current_non_torch() / 1024 / 1024)

# call matmul with 4096 x 4096 matrix
a = torch.randn(4096, 4096).cuda()
b = torch.randn(4096, 4096).cuda()
c = torch.matmul(a, b)

print(measure_current_non_torch() / 1024 / 1024)

Run with python test.py:

529.0625
597.0625

Run with CUDA_MODULE_LOADING=EAGER python test.py:

1179.0625
1805.0625

It is surprising that the increase can be so large: with eager module loading, a single matmul grows the non-torch memory by more than 600 MiB (from 1179 MiB to 1805 MiB).

I’m asking on the CUDA forum as well, but am also cross-posting here:

My original motivation is to monitor the memory allocated outside of PyTorch.

If I could get the exact amount of memory taken by the CUDA context, and use torch.cuda.memory_reserved() as the memory taken by PyTorch, then I could track the memory allocated outside of PyTorch.

Right now, it seems there is no way to query the exact amount of memory taken by the current CUDA context.
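The workaround I’m leaning towards (a sketch, not a precise solution) is to warm up the kernels the workload will need, take a baseline of the non-torch value, and then only track the delta from that baseline, so that the context growth is folded into the baseline:

import torch

def measure_current_non_torch():
    free, total = torch.cuda.mem_get_info()
    return (total - free) - torch.cuda.memory_reserved()

# warm up: run representative kernels so their modules are loaded into the context
a = torch.randn(4096, 4096, device="cuda")
b = torch.randn(4096, 4096, device="cuda")
torch.matmul(a, b)
torch.cuda.synchronize()

baseline_non_torch = measure_current_non_torch()  # CUDA context + loaded modules

# ... later, after other components (e.g. NCCL) have allocated their buffers ...
extra_non_torch = measure_current_non_torch() - baseline_non_torch
print(f"memory allocated outside PyTorch since the baseline: {extra_non_torch / 1024 / 1024} MiB")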