/// C1z/nstream-memcpy-target.c

///
/// Copyright (c) 2019, Intel Corporation
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions
/// are met:
///
/// * Redistributions of source code must retain the above copyright
///       notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above
///       copyright notice, this list of conditions and the following
///       disclaimer in the documentation and/or other materials provided
///       with the distribution.
/// * Neither the name of Intel Corporation nor the names of its
///       contributors may be used to endorse or promote products
///       derived from this software without specific prior written
///       permission.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.

//////////////////////////////////////////////////////////////////////
///
/// NAME:    nstream
///
/// PURPOSE: To compute memory bandwidth when adding a vector of a given
///          number of double precision values to the scalar multiple of
///          another vector of the same length, and storing the result in
///          a third vector.
///
/// USAGE:   The program takes as input the number
///          of iterations to loop over the triad vectors,
///          the length of the vectors, and optionally the
///          number of the OpenMP device to run on.
///
///          <progname> <# iterations> <vector length> [<device>]
///
///          The output consists of diagnostics to make sure the
///          algorithm worked, and of timing statistics.
///
/// NOTES:   Bandwidth is determined as the number of words read, plus the
///          number of words written, times the size of the words, divided
///          by the execution time. For a vector length of N, the total
///          number of bytes read and written is 4*N*sizeof(double).
///
///
/// HISTORY: This code is loosely based on the Stream benchmark by John
///          McCalpin, but does not follow all the Stream rules. Hence,
///          reported results should not be associated with Stream in
///          external publications.
///
///          Converted to C++11 by Jeff Hammond, November 2017.
///          Converted to C11 by Jeff Hammond, February 2019.
///
//////////////////////////////////////////////////////////////////////

#pragma omp requires unified_address

#include "prk_util.h"
#include "prk_openmp.h"

/// Run the nstream triad (A += B + scalar * C) on an OpenMP target device,
/// using explicit omp_target_alloc / omp_target_memcpy for data movement.
///
/// Command-line arguments:
///   argv[1]  number of timed iterations (must be >= 1)
///   argv[2]  vector length in elements (must be > 0)
///   argv[3]  optional OpenMP device number; defaults to
///            omp_get_default_device()
///
/// Returns 0 when the result checksum validates, 1 on invalid input or
/// failed validation. Aborts if an allocation or a host<->device copy fails.
int main(int argc, char * argv[])
{
  printf("Parallel Research Kernels version %d\n", PRKVERSION );
  printf("C11/OpenMP TARGET STREAM triad: A = B + scalar * C\n");

  //////////////////////////////////////////////////////////////////////
  /// Read and test input parameters
  //////////////////////////////////////////////////////////////////////

  if (argc < 3) {
    printf("Usage: <# iterations> <vector length>\n");
    return 1;
  }

  int iterations = atoi(argv[1]);
  if (iterations < 1) {
    printf("ERROR: iterations must be >= 1\n");
    return 1;
  }

  // Length of the vectors. Parse into a signed type first: storing a
  // negative atol() result directly in a size_t would wrap around to a
  // huge positive value and slip past the "greater than 0" check.
  long length_arg = atol(argv[2]);
  if (length_arg <= 0) {
    printf("ERROR: Vector length must be greater than 0\n");
    return 1;
  }
  size_t length = (size_t)length_arg;

  // Reject lengths whose byte count would overflow size_t below.
  if (length > (size_t)-1 / sizeof(double)) {
    printf("ERROR: Vector length %zu is too large\n", length);
    return 1;
  }

  int device = (argc > 3) ? (int)atol(argv[3]) : omp_get_default_device();
  // A device number outside [0, num_devices) is accepted only when it is
  // the runtime's own default device.
  if ( (device < 0 || omp_get_num_devices() <= device ) && (device != omp_get_default_device()) ) {
    printf("ERROR: device number %d is not valid.\n", device);
    return 1;
  }

  printf("Number of iterations = %d\n", iterations);
  printf("Vector length        = %zu\n", length);
  printf("OpenMP Device        = %d\n", device);

  //////////////////////////////////////////////////////////////////////
  // Allocate space and perform the computation
  //////////////////////////////////////////////////////////////////////

  double nstream_time = 0.0;

  int host = omp_get_initial_device();

  size_t bytes = length*sizeof(double);
  double * restrict h_A = omp_target_alloc(bytes, host);
  double * restrict h_B = omp_target_alloc(bytes, host);
  double * restrict h_C = omp_target_alloc(bytes, host);
  if (h_A == NULL || h_B == NULL || h_C == NULL) {
    printf("ERROR: omp_target_alloc of %zu bytes failed on host device %d\n", bytes, host);
    abort();
  }

  double scalar = 3.0;

  #pragma omp parallel for simd schedule(static)
  for (size_t i=0; i<length; i++) {
      h_A[i] = 0.0;
      h_B[i] = 2.0;
      h_C[i] = 2.0;
  }

  double * restrict d_A = omp_target_alloc(bytes, device);
  double * restrict d_B = omp_target_alloc(bytes, device);
  double * restrict d_C = omp_target_alloc(bytes, device);
  if (d_A == NULL || d_B == NULL || d_C == NULL) {
    printf("ERROR: omp_target_alloc of %zu bytes failed on device %d\n", bytes, device);
    abort();
  }

  // Copy the initialized host vectors to the device buffers.
  int rc = 0;
  rc = omp_target_memcpy(d_A, h_A, bytes, 0, 0, device, host);
  if (rc) { printf("ERROR: omp_target_memcpy(A) returned %d\n", rc); abort(); }
  rc = omp_target_memcpy(d_B, h_B, bytes, 0, 0, device, host);
  if (rc) { printf("ERROR: omp_target_memcpy(B) returned %d\n", rc); abort(); }
  rc = omp_target_memcpy(d_C, h_C, bytes, 0, 0, device, host);
  if (rc) { printf("ERROR: omp_target_memcpy(C) returned %d\n", rc); abort(); }

  // B and C are inputs only; their host copies are no longer needed.
  omp_target_free(h_C, host);
  omp_target_free(h_B, host);

  {
    // The loop runs iterations+1 times: iteration 0 is untimed, and the
    // timer starts at iteration 1.
    for (int iter = 0; iter<=iterations; iter++) {

      if (iter==1) nstream_time = omp_get_wtime();

      #pragma omp target teams distribute parallel for simd schedule(static) device(device) is_device_ptr(d_A,d_B,d_C)
      for (size_t i=0; i<length; i++) {
          d_A[i] += d_B[i] + scalar * d_C[i];
      }
    }
    nstream_time = omp_get_wtime() - nstream_time;
  }

  // Bring the result vector back to the host for validation.
  rc = omp_target_memcpy(h_A, d_A, bytes, 0, 0, host, device);
  if (rc) { printf("ERROR: omp_target_memcpy(A) returned %d\n", rc); abort(); }

  omp_target_free(d_C, device);
  omp_target_free(d_B, device);
  omp_target_free(d_A, device);

  //////////////////////////////////////////////////////////////////////
  /// Analyze and output results
  //////////////////////////////////////////////////////////////////////

  // Replay the per-element recurrence on scalars (same iterations+1 count
  // as the kernel loop) to obtain the expected value of every A[i].
  double ar = 0.0;
  double br = 2.0;
  double cr = 2.0;
  for (int i=0; i<=iterations; i++) {
      ar += br + scalar * cr;
  }

  // Expected checksum: every element holds the same value.
  ar *= length;

  double asum = 0.0;
  #pragma omp parallel for reduction(+:asum)
  for (size_t i=0; i<length; i++) {
      asum += fabs(h_A[i]);
  }

  omp_target_free(h_A, host);

  double epsilon=1.e-8;
  if (fabs(ar-asum)/asum > epsilon) {
      printf("Failed Validation on output array\n"
             "       Expected checksum: %lf\n"
             "       Observed checksum: %lf\n"
             "ERROR: solution did not validate\n", ar, asum);
      return 1;
  } else {
      printf("Solution validates\n");
      double avgtime = nstream_time/iterations;
      // Four vector-sized memory operations per element and iteration:
      // reads of A, B and C plus the write of A.
      double nbytes = 4.0 * length * sizeof(double);
      printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime);
  }

  return 0;
}