# addvecs.jl
using CuArrays
using CUDAnative
using CUDAdrv: synchronize

# Disallow scalar indexing of device arrays; element-by-element access from
# the host is a common source of accidental slowness, so make it an error.
CuArrays.allowscalar(false)
# Serial CPU kernel: elementwise c[i] = a[i] + b[i]
function addvecs!(c, a, b)
    N = length(a)
    @inbounds for i = 1:N
        c[i] = a[i] + b[i]
    end
end
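
# Usage sketch (illustrative, not part of the original file):
#   a = rand(10); b = rand(10); c = similar(a)
#   addvecs!(c, a, b)   # afterwards c == a .+ b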
# CPU emulation of a GPU kernel launch: serial loops over "blocks" and
# "threads" stand in for the grid of threads a real launch would create
function fake_knl_addvecs!(c, a, b, num_threads_per_block, num_blocks)
    dim = num_threads_per_block
    N = length(a)
    for bid = 1:num_blocks
        for tid = 1:num_threads_per_block
            # The work that thread tid of block bid should do
            i = tid + dim * (bid - 1) # unique global thread index
            if i <= N
                c[i] = a[i] + b[i]
            end
        end
    end
end
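
# Note: on a real GPU the (bid, tid) iterations above execute concurrently
# across blocks and threads; this serial emulation only reproduces the index
# arithmetic and the bounds check that the CUDA kernel below performs.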
# CUDA kernel: each thread adds exactly one element
function knl_addvecs!(c, a, b)
    N = length(a)

    bid = blockIdx().x  # this thread's block ID
    tid = threadIdx().x # this thread's ID within its block
    dim = blockDim().x  # number of threads in each block

    i = tid + dim * (bid - 1) # unique global thread index
    if i <= N
        c[i] = a[i] + b[i]
    end

    return nothing
end
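
# Worked index example (illustrative): with blockDim().x == 64, the thread
# with threadIdx().x == 3 in block blockIdx().x == 2 computes
#   i = 3 + 64 * (2 - 1) = 67
# so consecutive threads touch consecutive elements of c, a, and b.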
let
    N = 100000000
    b = rand(N)
    a = rand(N)
    c = similar(a)
    c0 = a + b # Reference result ("truth"), computed on the host

    # CPU kernel
    # addvecs!(c, a, b)
    # @assert isapprox(c0, c)
    # c .= 0

    # threads = 64
    # N <= blocks * threads -> N / threads <= blocks -> blocks = cld(N, threads)
    # blocks = cld(N, threads)
    # @show blocks * threads
    # @assert N <= blocks * threads
    # fake_knl_addvecs!(c, a, b, threads, blocks)
    # @assert isapprox(c0, c)

    # Create buffers on the device for the computation
    d_a = CuArray(a)   # Allocate on the device, then copy a to the device
    d_b = CuArray(b)   # Allocate on the device, then copy b to the device
    d_c = similar(d_a) # Allocate space on the device (uninitialized)

    # threads -> number of threads per block
    # blocks  -> total number of blocks
    num_threads = 64
    num_blocks = cld(N, num_threads)

    # First launch forces compilation; launches are asynchronous, so
    # synchronize before starting the timer
    @cuda threads=num_threads blocks=num_blocks knl_addvecs!(d_c, d_a, d_b)
    synchronize()
    t = @elapsed begin
        @cuda threads=num_threads blocks=num_blocks knl_addvecs!(d_c, d_a, d_b)
        synchronize()
    end
    @show t

    # Copy the result back to the host before comparing (scalar indexing of
    # device arrays is disabled above)
    @assert Array(d_c) ≈ c0
end
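
# Hedged follow-up sketch (not part of the original file): time the serial CPU
# kernel on same-sized data for a rough host-vs-device comparison. The first
# call warms up the compiler, mirroring the warm-up launch used for the GPU
# timing above.
let
    N = 100000000
    a = rand(N)
    b = rand(N)
    c = similar(a)
    addvecs!(c, a, b) # warm-up call (forces compilation)
    t_cpu = @elapsed addvecs!(c, a, b)
    @show t_cpu
end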