Skip to content
Snippets Groups Projects
Commit 8f197a35 authored by Dirk Ribbrock, committed by Dirk Ribbrock
Browse files

[CUDA] Various minor fixes

parent fff511d7
No related branches found
No related tags found
No related merge requests found
......@@ -34,7 +34,7 @@ include $(top_srcdir)/am/global-rules
EXTRA_DIST = CMakeLists.txt
LINK = $CC -o $@ -lcudart -L/sfw/cuda/5.5/lib64 -lcublas
LINK = $(CC) -o $@ -lcudart -L/sfw/cuda/5.5/lib64 -lcublas
lib_LIBRARIES = libvec_cuda.a
libvec_cuda_a_SOURCES = cuda_kernels.cu
......
......@@ -209,23 +209,23 @@ namespace Dune
//-------------- axpy ---------------
template <typename DT_>
__global__ void device_axpy(DT_ * r, const DT_ * x, DT_ a, const DT_ * y, const unsigned long count)
__global__ void device_axpy(DT_ * r, const DT_ * y, DT_ a, const DT_ * x, const unsigned long count)
{
unsigned long idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx >= count)
return;
r[idx] = x[idx] * a + y[idx];
r[idx] = (x[idx] * a) + y[idx];
}
template <typename DT_>
DT_ * axpy(DT_ * r, const DT_* x, DT_ a, const DT_ * y, const unsigned long size)
DT_ * axpy(DT_ * r, const DT_* y, DT_ a, const DT_ * x, const unsigned long size)
{
unsigned long blocksize(128);
dim3 grid;
dim3 block;
block.x = blocksize;
grid.x = (unsigned)ceil((size)/(double)(block.x));
device_axpy<<<grid, block>>>(r, x, a, y, size);
device_axpy<<<grid, block>>>(r, y, a, x, size);
return r;
}
......
......@@ -33,7 +33,7 @@ namespace Dune
DT_ * division_scalar(DT_ * r, const DT_* x, DT_ y, const unsigned long size);
template <typename DT_>
DT_ * axpy(DT_ * r, const DT_* x, DT_ a, const DT_ * y, const unsigned long size);
DT_ * axpy(DT_ * r, const DT_* y, DT_ a, const DT_ * x, const unsigned long size);
template <typename DT_>
DT_ dot(const DT_ * x, const DT_ * y, const unsigned long size);
......
......@@ -46,9 +46,10 @@ void Dune::Cuda::copy(DT_ * dst, const DT_ * src, size_t count)
}
template <typename DT_>
void Dune::Cuda::set(DT_ * dst, const DT_ & val)
void Dune::Cuda::set(DT_ * dst, const DT_ & val, size_t count)
{
cudaMemcpy(dst, &val, sizeof(DT_), cudaMemcpyHostToDevice);
for (size_t i(0) ; i < count ; ++i)
cudaMemcpy(dst + i, &val, sizeof(DT_), cudaMemcpyHostToDevice);
}
template <typename DT_>
......@@ -65,7 +66,7 @@ template void Dune::Cuda::download(float *, const float *, size_t);
template void Dune::Cuda::download(double *, const double *, size_t);
template void Dune::Cuda::copy(float *, const float *, size_t);
template void Dune::Cuda::copy(double *, const double *, size_t);
template void Dune::Cuda::set(float *, const float &);
template void Dune::Cuda::set(double *, const double &);
template void Dune::Cuda::set(float *, const float &, size_t);
template void Dune::Cuda::set(double *, const double &, size_t);
template float Dune::Cuda::get(float *);
template double Dune::Cuda::get(double *);
......@@ -46,7 +46,7 @@ namespace Dune {
void copy(DT_ * dst, const DT_ * src, size_t count);
template <typename DT_>
void set(DT_ * dst, const DT_ & val);
void set(DT_ * dst, const DT_ & val, size_t count = 1);
template <typename DT_>
DT_ get(DT_ * src);
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment