A CUDA stream is a sequence of commands (possibly issued by different host threads) that execute in order. Applications can manage concurrent execution of kernels through multiple streams.

This puzzler launches kernels and performs data transfers concurrently using multiple streams. Two implementations are provided. Which one achieves better overlap, and why?

def non_blocking_streams():
    """Issue compute and copies on three streams with ``non_blocking=True``.

    Work is submitted in three phases, each under its own stream context:
    matrix products on ``first_stream``, copies of ``data_on_gpu`` to the
    ``cpu`` device on ``second_stream``, and copies of ``data_on_cpu`` to the
    ``cuda`` device on ``third_stream``. Every result is discarded; the
    function exists only to generate the workload and returns ``None``.
    """
    # Phase 1: multiply each GPU matrix by itself.
    with torch.cuda.stream(first_stream):
        for matrix in matrix_on_gpu:
            torch.matmul(matrix, matrix)

    # Phase 2: copy each GPU tensor to the CPU device, requesting a
    # non-blocking copy.
    with torch.cuda.stream(second_stream):
        for gpu_tensor in data_on_gpu:
            gpu_tensor.to(cpu, non_blocking=True)

    # Phase 3: copy each CPU tensor to the CUDA device, requesting a
    # non-blocking copy.
    with torch.cuda.stream(third_stream):
        for cpu_tensor in data_on_cpu:
            cpu_tensor.to(cuda, non_blocking=True)

def blocking_streams():
    """Issue compute and copies on three streams with ``non_blocking=False``.

    Work is submitted in three phases, each under its own stream context:
    matrix products on ``first_stream``, copies of ``data_on_gpu`` to the
    ``cpu`` device on ``second_stream``, and copies of ``data_on_cpu`` to the
    ``cuda`` device on ``third_stream``. Every result is discarded; the
    function exists only to generate the workload and returns ``None``.
    """
    # Phase 1: multiply each GPU matrix by itself.
    with torch.cuda.stream(first_stream):
        for matrix in matrix_on_gpu:
            torch.matmul(matrix, matrix)

    # Phase 2: copy each GPU tensor to the CPU device with
    # non_blocking explicitly disabled.
    with torch.cuda.stream(second_stream):
        for gpu_tensor in data_on_gpu:
            gpu_tensor.to(cpu, non_blocking=False)

    # Phase 3: copy each CPU tensor to the CUDA device with
    # non_blocking explicitly disabled.
    with torch.cuda.stream(third_stream):
        for cpu_tensor in data_on_cpu:
            cpu_tensor.to(cuda, non_blocking=False)

# Device handles used both for allocation and as copy destinations.
cpu = torch.device("cpu")
cuda = torch.device("cuda")

# One stream per workload: matmuls, GPU-to-CPU copies, CPU-to-GPU copies.
first_stream = torch.cuda.Stream()
second_stream = torch.cuda.Stream()
third_stream = torch.cuda.Stream()

# Copy payloads: 100 random 1024x1024 tensors on each device.
data_on_gpu = [torch.rand(1024, 1024, device=cuda) for _ in range(100)]
data_on_cpu = [torch.rand(1024, 1024, device=cpu) for _ in range(100)]

# Compute payload: 1000 random 1024x1024 matrices on the CUDA device.
matrix_on_gpu = [torch.rand(1024, 1024, device=cuda) for _ in range(1000)]

The attached trace file contains the answer. Check if it matches your intuition.

See answer and discussion

Note: the overlap generated by tracing the above code snippet will vary by GPU.