Execution policy

UAMMD exposes a custom execution policy compatible with thrust. When used on a thrust algorithm this policy will make thrust leverage UAMMD’s cached allocator meachanism. This is useful, for instance, when calling algorithms that require memory allocation such as thrust::sort

You should just use this when you would put thrust::device.

Example

#include"utils/execution_policy.cuh"
#include<thrust/device_vector.h>
#include<thrust/sequence.h>
#include <sstream>
#include <cuda_profiler_api.h>
//Lets fill a vector with a decrasing sequence of number
auto get_unsorted_vector(int n){
  thrust::device_vector<int> vec(n);
  thrust::sequence(vec.rbegin(), vec.rend(), 0);
  return vec;
}

template<class Vector>
void print_first_elements(Vector &vec, int n = 3){
   thrust::copy_n(vec.begin(), 3, std::ostream_iterator<int>(std::cout," "));
   std::cout<<std::endl;
}

int main(){

  int n = 16384;
  //Lets fill a vector with a decrasing sequence of numbers
  auto vec = get_unsorted_vector(n);
  print_first_elements(vec);
  //We can use thrust::sort to sort it in increasing order
  thrust::sort(vec.begin(), vec.end());
  print_first_elements(vec);
  //However, sort requires allocating some temporary memory as part of the computation.
  //If this sorting is required a lot of times memory allocation will become a non-neglegible part of the time.
  //Moreover, the fact that memory allocation happens in the default stream would impose a synchronization barrier.
  //We can use uammd's execution policy to mitigate this:
  vec = get_unsorted_vector(n); //Unsorted vector again
  cudaStream_t st; cudaStreamCreate(&st);
  for(int i = 0;i<10;i++){
    if(i==1) cudaProfilerStart();
    thrust::sort(uammd::cached_device_execution_policy.on(st), vec.begin(), vec.end());
  }
  cudaProfilerStop();
  cudaStreamDestroy(st);
  print_first_elements(vec);
  //If you inspect the execution of this second batch of calls to sort you will see that calls to cudaMalloc/cudaFree
  // only happen during the first call (in fact cudaFree will never be called).
  //Moreover, all kernels will run in the stream "st", without synchronization barriers.
  //You can check this by using nsys profile, for instance:
  //If you run $ nsys profile -c cudaProfilerApi --stats=true ./execution_policy
  //Inspecting the part of the output about cuda api calls you will see something similar to:
  // Time (%)  Total Time (ns)  Num Calls  Avg (ns)          Name
  //  --------  ---------------  ---------  --------  ---------------------
  //      51.2          399,468        144   2,774.1  cudaLaunchKernel
  //      48.8          380,612         18  21,145.1  cudaStreamSynchronize
  //However, if you remove uammds execution policy and run again:
  //  Time (%)  Total Time (ns)  Num Calls  Avg (ns)  StdDev (ns)          Name
  // --------  ---------------  ---------  --------  -----------  ---------------------
  //     54.7          499,330         20  24,966.5     25,204.9  cudaStreamSynchronize
  //     40.6          370,021        160   2,312.6      1,957.3  cudaLaunchKernel
  //      3.1           27,919         10   2,791.9      4,715.2  cudaMalloc
  //      1.7           15,071         10   1,507.1      1,046.1  cudaFree
  //Granted, not a miraculous improvement in this case, take it as a proof of concept.
  return 0;
}