# addvecs.jl
using CuArrays
using CUDAnative
using CUDAdrv: synchronize

# Disallow scalar indexing of device arrays; element-by-element access from
# the host is a common source of accidental slowness, so make it an error.
CuArrays.allowscalar(false)
# Serial CPU kernel: elementwise c[i] = a[i] + b[i]
function addvecs!(c, a, b)
    N = length(a)
    @inbounds for i = 1:N
        c[i] = a[i] + b[i]
    end
end
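
# Usage sketch (illustrative, not part of the original file):
#   a = rand(10); b = rand(10); c = similar(a)
#   addvecs!(c, a, b)   # afterwards c == a .+ b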
# CPU emulation of a GPU kernel launch: serial loops over "blocks" and
# "threads" stand in for the grid of threads a real launch would create
function fake_knl_addvecs!(c, a, b, num_threads_per_block, num_blocks)
    dim = num_threads_per_block
    N = length(a)
    for bid = 1:num_blocks
        for tid = 1:num_threads_per_block
            # The work that thread tid of block bid should do
            i = tid + dim * (bid - 1) # unique global thread index
            if i <= N
                c[i] = a[i] + b[i]
            end
        end
    end
end
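
# Note: on a real GPU the (bid, tid) iterations above execute concurrently
# across blocks and threads; this serial emulation only reproduces the index
# arithmetic and the bounds check that the CUDA kernel below performs.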
# CUDA kernel: each thread adds exactly one element
function knl_addvecs!(c, a, b)
    N = length(a)

    bid = blockIdx().x  # this thread's block ID
    tid = threadIdx().x # this thread's ID within its block
    dim = blockDim().x  # number of threads in each block

    i = tid + dim * (bid - 1) # unique global thread index
    if i <= N
        c[i] = a[i] + b[i]
    end

    return nothing
end
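
# Worked index example (illustrative): with blockDim().x == 64, the thread
# with threadIdx().x == 3 in block blockIdx().x == 2 computes
#   i = 3 + 64 * (2 - 1) = 67
# so consecutive threads touch consecutive elements of c, a, and b.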
let
    N = 100000000
    b = rand(N)
    a = rand(N)
    c = similar(a)
    c0 = a + b # Reference result ("truth"), computed on the host

    # CPU kernel
    # addvecs!(c, a, b)
    # @assert isapprox(c0, c)
    # c .= 0

    # threads = 64
    # N <= blocks * threads -> N / threads <= blocks -> blocks = cld(N, threads)
    # blocks = cld(N, threads)
    # @show blocks * threads
    # @assert N <= blocks * threads
    # fake_knl_addvecs!(c, a, b, threads, blocks)
    # @assert isapprox(c0, c)

    # Create buffers on the device for the computation
    d_a = CuArray(a)   # Allocate on the device, then copy a to the device
    d_b = CuArray(b)   # Allocate on the device, then copy b to the device
    d_c = similar(d_a) # Allocate space on the device (uninitialized)

    # threads -> number of threads per block
    # blocks  -> total number of blocks
    num_threads = 64
    num_blocks = cld(N, num_threads)

    # First launch forces compilation; launches are asynchronous, so
    # synchronize before starting the timer
    @cuda threads=num_threads blocks=num_blocks knl_addvecs!(d_c, d_a, d_b)
    synchronize()
    t = @elapsed begin
        @cuda threads=num_threads blocks=num_blocks knl_addvecs!(d_c, d_a, d_b)
        synchronize()
    end
    @show t

    # Copy the result back to the host before comparing (scalar indexing of
    # device arrays is disabled above)
    @assert Array(d_c) ≈ c0
end
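
# Hedged follow-up sketch (not part of the original file): time the serial CPU
# kernel on same-sized data for a rough host-vs-device comparison. The first
# call warms up the compiler, mirroring the warm-up launch used for the GPU
# timing above.
let
    N = 100000000
    a = rand(N)
    b = rand(N)
    c = similar(a)
    addvecs!(c, a, b) # warm-up call (forces compilation)
    t_cpu = @elapsed addvecs!(c, a, b)
    @show t_cpu
end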