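[CUDA] Various minor fixes

- kernel/vec/Makefile.am: reference the compiler as $(CC) instead of $CC in the LINK rule.
- kernel/vec/cuda_kernels.cu, cuda_kernels.hh: swap the x and y pointer parameters of axpy and device_axpy.
- memory/cuda_allocator.cu, cuda_allocator.hh: add a count parameter (default 1) to Cuda::set so it can fill several elements.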

Signed-off-by: Dirk Ribbrock <dirk.ribbrock@math.uni-dortmund.de>

[CUDA] Various minor fixes
8f197a35 · Dirk Ribbrock · Dirk Ribbrock · fff511d7 · 8f197a35 · 8f197a35
Commit 8f197a35, authored 11 years ago by Dirk Ribbrock; committed by Dirk Ribbrock 11 years ago.
--- a/dune/common/kernel/vec/Makefile.am
+++ b/dune/common/kernel/vec/Makefile.am
@@ -34,7 +34,7 @@ include $(top_srcdir)/am/global-rules

 EXTRA_DIST = CMakeLists.txt

-LINK = $CC -o $@ -lcudart -L/sfw/cuda/5.5/lib64 -lcublas
+LINK = $(CC) -o $@ -lcudart -L/sfw/cuda/5.5/lib64 -lcublas
 lib_LIBRARIES = libvec_cuda.a
 libvec_cuda_a_SOURCES = cuda_kernels.cu


--- a/dune/common/kernel/vec/cuda_kernels.cu
+++ b/dune/common/kernel/vec/cuda_kernels.cu
@@ -209,23 +209,23 @@ namespace Dune

    //-------------- axpy ---------------
    template <typename DT_>
-    __global__ void device_axpy(DT_ * r, const DT_ * x, DT_ a, const DT_ * y, const unsigned long count)
+    __global__ void device_axpy(DT_ * r, const DT_ * y, DT_ a, const DT_ * x, const unsigned long count)
    {
      unsigned long idx = threadIdx.x + blockDim.x * blockIdx.x;
      if (idx >= count)
        return;
-      r[idx] = x[idx] * a + y[idx];
+      r[idx] = (x[idx] * a) + y[idx];
    }

    template <typename DT_>
-    DT_ * axpy(DT_ * r, const DT_* x, DT_ a, const DT_ * y, const unsigned long size)
+    DT_ * axpy(DT_ * r, const DT_* y, DT_ a, const DT_ * x, const unsigned long size)
    {
      unsigned long blocksize(128);
      dim3 grid;
      dim3 block;
      block.x = blocksize;
      grid.x = (unsigned)ceil((size)/(double)(block.x));
-      device_axpy<<<grid, block>>>(r, x, a, y, size);
+      device_axpy<<<grid, block>>>(r, y, a, x, size);
      return r;
    }


--- a/dune/common/kernel/vec/cuda_kernels.hh
+++ b/dune/common/kernel/vec/cuda_kernels.hh
@@ -33,7 +33,7 @@ namespace Dune
    DT_ * division_scalar(DT_ * r, const DT_* x, DT_ y, const unsigned long size);

    template <typename DT_>
-    DT_ * axpy(DT_ * r, const DT_* x, DT_ a, const DT_ * y, const unsigned long size);
+    DT_ * axpy(DT_ * r, const DT_* y, DT_ a, const DT_ * x, const unsigned long size);

    template <typename DT_>
    DT_ dot(const DT_ * x, const DT_ * y, const unsigned long size);

--- a/dune/common/memory/cuda_allocator.cu
+++ b/dune/common/memory/cuda_allocator.cu
@@ -46,9 +46,10 @@ void Dune::Cuda::copy(DT_ * dst, const DT_ * src, size_t count)
 }

 template <typename DT_>
-void Dune::Cuda::set(DT_ * dst, const DT_ & val)
+void Dune::Cuda::set(DT_ * dst, const DT_ & val, size_t count)
 {
-  cudaMemcpy(dst, &val, sizeof(DT_), cudaMemcpyHostToDevice);
+  for (size_t i(0) ; i < count ; ++i)
+    cudaMemcpy(dst + i, &val, sizeof(DT_), cudaMemcpyHostToDevice);
 }

 template <typename DT_>
@@ -65,7 +66,7 @@ template void Dune::Cuda::download(float *, const float *, size_t);
 template void Dune::Cuda::download(double *, const double *, size_t);
 template void Dune::Cuda::copy(float *, const float *, size_t);
 template void Dune::Cuda::copy(double *, const double *, size_t);
-template void Dune::Cuda::set(float *, const float &);
-template void Dune::Cuda::set(double *, const double &);
+template void Dune::Cuda::set(float *, const float &, size_t);
+template void Dune::Cuda::set(double *, const double &, size_t);
 template float Dune::Cuda::get(float *);
 template double Dune::Cuda::get(double *);
--- a/dune/common/memory/cuda_allocator.hh
+++ b/dune/common/memory/cuda_allocator.hh
@@ -46,7 +46,7 @@ namespace Dune {
    void copy(DT_ * dst, const DT_ * src, size_t count);

    template <typename DT_>
-    void set(DT_ * dst, const DT_ & val);
+    void set(DT_ * dst, const DT_ & val, size_t count = 1);

    template <typename DT_>
    DT_ get(DT_ * src);