/*
 *     Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto.  Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 */
#include <stdarg.h>

/* Signature prefix used by every helper in this file.  When the includer has
   not defined NVHPC_UTILRT_DEV_SIG it defaults to a static inline device
   function; when it has, the two external helpers called further down are
   declared here with that same prefix. */
#ifndef NVHPC_UTILRT_DEV_SIG
#define NVHPC_UTILRT_DEV_SIG __device__ static __inline__
#else
NVHPC_UTILRT_DEV_SIG void __pgi_memcpy(signed char *dst, signed char *src,
        size_t num_bytes);
NVHPC_UTILRT_DEV_SIG void pgf90_sect_alldim_i8(void *pd, void *pa, long long *lower,
        long long *upper, long long *stride, long long flags, int rank, int wrank);
#endif

/*
 * Fill `to` (to_len bytes) with the concatenation of n source byte strings,
 * truncating at to_len and padding any remainder with blanks (' ').
 *
 * The variadic tail holds n (pointer, length) pairs: a `signed char *`
 * followed by a `long long` byte count.  Non-positive source lengths
 * contribute nothing.  Nothing is written when to_len <= 0.
 */
NVHPC_UTILRT_DEV_SIG void
pgf90_str_copy_klen(int n, signed char *to, long long to_len, ...)
{
  va_list va;
  signed char *from, *stage, *out;
  long long from_len, remaining, i;

  if (to_len <= 0) return;

  va_start(va, to_len);
  if (n == 1) {
    /* N==1 is the most common case */
    from = va_arg(va, signed char *);
    from_len = va_arg(va, long long);
    if (from_len < 0) from_len = 0;
    if ((from < to) && ((from + from_len) > to)) {
      /* The source starts below the destination and runs into it.  Walking
         from the high end reads every source byte before it is overwritten,
         so no staging buffer (and no fallible malloc) is needed. */
      for (i = to_len - 1; i >= 0; --i)
        to[i] = (i < from_len) ? from[i] : ' ';
    } else {
      for (i = 0; i < to_len; ++i)
        to[i] = (i < from_len) ? from[i] : ' ';
    }
  } else {
    /* N>1 is not a common case: assemble the result in one staging buffer so
       that sources overlapping `to` are read before `to` is written. */
    remaining = to_len;
    stage = (signed char *) malloc((size_t) to_len);
    /* If the allocation fails, assemble directly in `to` instead of
       dereferencing NULL; that is only inexact when a source overlaps the
       destination. */
    out = stage ? stage : to;
    while (n-- > 0) {
      from = va_arg(va, signed char *);
      from_len = va_arg(va, long long);
      if (from_len > remaining) from_len = remaining;
      if (from_len > 0) {
        __pgi_memcpy(out, from, (size_t) from_len);
        out += from_len;
        remaining -= from_len;
      }
    }
    while (remaining-- > 0)
      *out++ = ' ';
    if (stage) {
      __pgi_memcpy(to, stage, (size_t) to_len);
      free(stage);
    }
  }
  /* Every va_start must be paired with va_end before returning. */
  va_end(va);
}

/*
 * Variadic front end for pgf90_sect_alldim_i8.
 *
 * After prank the caller passes, for each of the *prank dimensions, three
 * `long long *` arguments (lower, upper, stride in that order), followed by
 * one `long long *` to the flags word.  The values are gathered into local
 * arrays and forwarded together with two counts: the number of set bits in
 * the low seven bits of flags, and the rank field read from the descriptor
 * at pa.
 */
NVHPC_UTILRT_DEV_SIG void
pgf90_sect_i8(void *pd, void *pa, void *prank, ...)
{
  /* Only the two leading words of the descriptor at pa are looked at. */
  typedef struct F90_DumDesc_la {
    long long tag;
    long long rank;
  } F90_DumDesc_la;

  va_list args;
  long long lower[7], upper[7], stride[7];
  long long flags;
  int dim, ndims, pairs, quads, rank, wrank;
  F90_DumDesc_la *desc = (F90_DumDesc_la *)pa;

  /* Number of (lower, upper, stride) triplets in the variadic list. */
  ndims = (int)(*((long long *)(prank)));

  va_start(args, prank);
  for (dim = 0; dim < ndims; ++dim) {
    lower[dim] = *va_arg(args, long long *);
    upper[dim] = *va_arg(args, long long *);
    stride[dim] = *va_arg(args, long long *);
  }
  flags = *va_arg(args, long long *);
  va_end(args);

  /* Population count of flags bits 0..6: sum neighbouring bits into 2-bit
     fields, then those into nibbles, then fold the two nibbles together. */
  pairs = (flags & 0x55) + (flags >> 1 & 0x15);
  quads = (pairs & 0x33) + (pairs >> 2 & 0x13);
  rank = (quads + (quads >> 4)) & 0x7;

  wrank = desc->rank;

  pgf90_sect_alldim_i8(pd, pa, lower, upper, stride, flags, rank, wrank);
}

/* Device versions for loads/stores with cache operators */

/* ldca */
/* Load a signed 32-bit integer from ptr with PTX ld.global.ca.s32. */
NVHPC_UTILRT_DEV_SIG int
__ldca_i4(signed char *ptr)
{
  unsigned int loaded;
  asm ("ld.global.ca.s32 %0, [%1];"
       : "=r"(loaded)
       : "l"(ptr)
       : "memory");
  return (int)loaded;
}

/* Load a signed 64-bit integer from ptr with PTX ld.global.ca.s64. */
NVHPC_UTILRT_DEV_SIG long long
__ldca_i8(signed char *ptr)
{
  unsigned long long loaded;
  asm ("ld.global.ca.s64 %0, [%1];"
       : "=l"(loaded)
       : "l"(ptr)
       : "memory");
  return (long long)loaded;
}

/* Load an unsigned 64-bit value from ptr with PTX ld.global.ca.u64. */
NVHPC_UTILRT_DEV_SIG unsigned long long
__ldca_cd(signed char *ptr)
{
  unsigned long long loaded;
  asm ("ld.global.ca.u64 %0, [%1];"
       : "=l"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load 16 raw bits from ptr with PTX ld.global.ca.b16. */
NVHPC_UTILRT_DEV_SIG unsigned short
__ldca_r2(signed char *ptr)
{
  unsigned short loaded;
  asm ("ld.global.ca.b16 %0, [%1];"
       : "=h"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load a float from ptr with PTX ld.global.ca.f32. */
NVHPC_UTILRT_DEV_SIG float
__ldca_r4(signed char *ptr)
{
  float loaded;
  asm ("ld.global.ca.f32 %0, [%1];"
       : "=f"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load a double from ptr with PTX ld.global.ca.f64. */
NVHPC_UTILRT_DEV_SIG double
__ldca_r8(signed char *ptr)
{
  double loaded;
  asm ("ld.global.ca.f64 %0, [%1];"
       : "=d"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load two floats from src with PTX ld.global.ca.v2.f32 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldca_c4_(signed char *dst, signed char *src)
{
  float *out = (float *)dst;
  float e0, e1;
  asm ("ld.global.ca.v2.f32 {%0,%1}, [%2];"
       : "=f"(e0), "=f"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* Load two floats from src with PTX ld.global.ca.v2.f32 and return them as
   the r and i members of a cmplx2. */
NVHPC_UTILRT_DEV_SIG cmplx2
__ldca_c4x(signed char *src)
{
  float e0, e1;
  cmplx2 result;
  asm ("ld.global.ca.v2.f32 {%0,%1}, [%2];"
       : "=f"(e0), "=f"(e1)
       : "l"(src)
       : "memory");
  result.r = e0;
  result.i = e1;
  return result;
}

/* Load two doubles from src with PTX ld.global.ca.v2.f64 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldca_c8_(signed char *dst, signed char *src)
{
  double *out = (double *)dst;
  double e0, e1;
  asm ("ld.global.ca.v2.f64 {%0,%1}, [%2];"
       : "=d"(e0), "=d"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* Load two doubles from src with PTX ld.global.ca.v2.f64 and return them as
   the r and i members of a dcmplx2. */
NVHPC_UTILRT_DEV_SIG dcmplx2
__ldca_c8x(signed char *src)
{
  double e0, e1;
  dcmplx2 result;
  asm ("ld.global.ca.v2.f64 {%0,%1}, [%2];"
       : "=d"(e0), "=d"(e1)
       : "l"(src)
       : "memory");
  result.r = e0;
  result.i = e1;
  return result;
}

/* Load four ints from src with PTX ld.global.ca.v4.s32 into dst[0..3]. */
NVHPC_UTILRT_DEV_SIG void
__ldca_i4x4_(signed char *dst, signed char *src)
{
  int *out = (int *)dst;
  int e0, e1, e2, e3;
  asm ("ld.global.ca.v4.s32 {%0,%1,%2,%3}, [%4];"
       : "=r"(e0), "=r"(e1), "=r"(e2), "=r"(e3)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
  out[2] = e2;
  out[3] = e3;
}

/* Load two long longs from src with PTX ld.global.ca.v2.s64 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldca_i8x2_(signed char *dst, signed char *src)
{
  long long *out = (long long *)dst;
  long long e0, e1;
  asm ("ld.global.ca.v2.s64 {%0,%1}, [%2];"
       : "=l"(e0), "=l"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* Load 32 raw bits from src with PTX ld.global.ca.b32 and store them at dst. */
NVHPC_UTILRT_DEV_SIG void
__ldca_r2x2_(signed char *dst, signed char *src)
{
  unsigned int packed;
  asm ("ld.global.ca.b32 {%0}, [%1];"
       : "=r"(packed)
       : "l"(src)
       : "memory");
  *((unsigned int *)dst) = packed;
}

/* Load four floats from src with PTX ld.global.ca.v4.f32 into dst[0..3]. */
NVHPC_UTILRT_DEV_SIG void
__ldca_r4x4_(signed char *dst, signed char *src)
{
  float *out = (float *)dst;
  float e0, e1, e2, e3;
  asm ("ld.global.ca.v4.f32 {%0,%1,%2,%3}, [%4];"
       : "=f"(e0), "=f"(e1), "=f"(e2), "=f"(e3)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
  out[2] = e2;
  out[3] = e3;
}

/* Load two doubles from src with PTX ld.global.ca.v2.f64 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldca_r8x2_(signed char *dst, signed char *src)
{
  double *out = (double *)dst;
  double e0, e1;
  asm ("ld.global.ca.v2.f64 {%0,%1}, [%2];"
       : "=d"(e0), "=d"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* ldcg */
/* Load a signed 32-bit integer from ptr with PTX ld.global.cg.s32. */
NVHPC_UTILRT_DEV_SIG int
__ldcg_i4(signed char *ptr)
{
  unsigned int loaded;
  asm ("ld.global.cg.s32 %0, [%1];"
       : "=r"(loaded)
       : "l"(ptr)
       : "memory");
  return (int)loaded;
}

/* Load a signed 64-bit integer from ptr with PTX ld.global.cg.s64. */
NVHPC_UTILRT_DEV_SIG long long
__ldcg_i8(signed char *ptr)
{
  unsigned long long loaded;
  asm ("ld.global.cg.s64 %0, [%1];"
       : "=l"(loaded)
       : "l"(ptr)
       : "memory");
  return (long long)loaded;
}

/* Load an unsigned 64-bit value from ptr with PTX ld.global.cg.u64. */
NVHPC_UTILRT_DEV_SIG unsigned long long
__ldcg_cd(signed char *ptr)
{
  unsigned long long loaded;
  asm ("ld.global.cg.u64 %0, [%1];"
       : "=l"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load 16 raw bits from ptr with PTX ld.global.cg.b16. */
NVHPC_UTILRT_DEV_SIG unsigned short
__ldcg_r2(signed char *ptr)
{
  unsigned short loaded;
  asm ("ld.global.cg.b16 %0, [%1];"
       : "=h"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load a float from ptr with PTX ld.global.cg.f32. */
NVHPC_UTILRT_DEV_SIG float
__ldcg_r4(signed char *ptr)
{
  float loaded;
  asm ("ld.global.cg.f32 %0, [%1];"
       : "=f"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load a double from ptr with PTX ld.global.cg.f64. */
NVHPC_UTILRT_DEV_SIG double
__ldcg_r8(signed char *ptr)
{
  double loaded;
  asm ("ld.global.cg.f64 %0, [%1];"
       : "=d"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load two floats from src with PTX ld.global.cg.v2.f32 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldcg_c4_(signed char *dst, signed char *src)
{
  float *out = (float *)dst;
  float e0, e1;
  asm ("ld.global.cg.v2.f32 {%0,%1}, [%2];"
       : "=f"(e0), "=f"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* Load two floats from src with PTX ld.global.cg.v2.f32 and return them as
   the r and i members of a cmplx2. */
NVHPC_UTILRT_DEV_SIG cmplx2
__ldcg_c4x(signed char *src)
{
  float e0, e1;
  cmplx2 result;
  asm ("ld.global.cg.v2.f32 {%0,%1}, [%2];"
       : "=f"(e0), "=f"(e1)
       : "l"(src)
       : "memory");
  result.r = e0;
  result.i = e1;
  return result;
}

/* Load two doubles from src with PTX ld.global.cg.v2.f64 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldcg_c8_(signed char *dst, signed char *src)
{
  double *out = (double *)dst;
  double e0, e1;
  asm ("ld.global.cg.v2.f64 {%0,%1}, [%2];"
       : "=d"(e0), "=d"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* Load two doubles from src with PTX ld.global.cg.v2.f64 and return them as
   the r and i members of a dcmplx2. */
NVHPC_UTILRT_DEV_SIG dcmplx2
__ldcg_c8x(signed char *src)
{
  double e0, e1;
  dcmplx2 result;
  asm ("ld.global.cg.v2.f64 {%0,%1}, [%2];"
       : "=d"(e0), "=d"(e1)
       : "l"(src)
       : "memory");
  result.r = e0;
  result.i = e1;
  return result;
}

/* Load four ints from src with PTX ld.global.cg.v4.s32 into dst[0..3]. */
NVHPC_UTILRT_DEV_SIG void
__ldcg_i4x4_(signed char *dst, signed char *src)
{
  int *out = (int *)dst;
  int e0, e1, e2, e3;
  asm ("ld.global.cg.v4.s32 {%0,%1,%2,%3}, [%4];"
       : "=r"(e0), "=r"(e1), "=r"(e2), "=r"(e3)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
  out[2] = e2;
  out[3] = e3;
}

/* Load two long longs from src with PTX ld.global.cg.v2.s64 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldcg_i8x2_(signed char *dst, signed char *src)
{
  long long *out = (long long *)dst;
  long long e0, e1;
  asm ("ld.global.cg.v2.s64 {%0,%1}, [%2];"
       : "=l"(e0), "=l"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* Load 32 raw bits from src with PTX ld.global.cg.b32 and store them at dst. */
NVHPC_UTILRT_DEV_SIG void
__ldcg_r2x2_(signed char *dst, signed char *src)
{
  unsigned int packed;
  asm ("ld.global.cg.b32 {%0}, [%1];"
       : "=r"(packed)
       : "l"(src)
       : "memory");
  *((unsigned int *)dst) = packed;
}

/* Load four floats from src with PTX ld.global.cg.v4.f32 into dst[0..3]. */
NVHPC_UTILRT_DEV_SIG void
__ldcg_r4x4_(signed char *dst, signed char *src)
{
  float *out = (float *)dst;
  float e0, e1, e2, e3;
  asm ("ld.global.cg.v4.f32 {%0,%1,%2,%3}, [%4];"
       : "=f"(e0), "=f"(e1), "=f"(e2), "=f"(e3)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
  out[2] = e2;
  out[3] = e3;
}

/* Load two doubles from src with PTX ld.global.cg.v2.f64 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldcg_r8x2_(signed char *dst, signed char *src)
{
  double *out = (double *)dst;
  double e0, e1;
  asm ("ld.global.cg.v2.f64 {%0,%1}, [%2];"
       : "=d"(e0), "=d"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* ldcs */
/* Load a signed 32-bit integer from ptr with PTX ld.global.cs.s32. */
NVHPC_UTILRT_DEV_SIG int
__ldcs_i4(signed char *ptr)
{
  unsigned int loaded;
  asm ("ld.global.cs.s32 %0, [%1];"
       : "=r"(loaded)
       : "l"(ptr)
       : "memory");
  return (int)loaded;
}

/* Load a signed 64-bit integer from ptr with PTX ld.global.cs.s64. */
NVHPC_UTILRT_DEV_SIG long long
__ldcs_i8(signed char *ptr)
{
  unsigned long long loaded;
  asm ("ld.global.cs.s64 %0, [%1];"
       : "=l"(loaded)
       : "l"(ptr)
       : "memory");
  return (long long)loaded;
}

/* Load an unsigned 64-bit value from ptr with PTX ld.global.cs.u64. */
NVHPC_UTILRT_DEV_SIG unsigned long long
__ldcs_cd(signed char *ptr)
{
  unsigned long long loaded;
  asm ("ld.global.cs.u64 %0, [%1];"
       : "=l"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load 16 raw bits from ptr with PTX ld.global.cs.b16. */
NVHPC_UTILRT_DEV_SIG unsigned short
__ldcs_r2(signed char *ptr)
{
  unsigned short loaded;
  asm ("ld.global.cs.b16 %0, [%1];"
       : "=h"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load a float from ptr with PTX ld.global.cs.f32. */
NVHPC_UTILRT_DEV_SIG float
__ldcs_r4(signed char *ptr)
{
  float loaded;
  asm ("ld.global.cs.f32 %0, [%1];"
       : "=f"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load a double from ptr with PTX ld.global.cs.f64. */
NVHPC_UTILRT_DEV_SIG double
__ldcs_r8(signed char *ptr)
{
  double loaded;
  asm ("ld.global.cs.f64 %0, [%1];"
       : "=d"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load two floats from src with PTX ld.global.cs.v2.f32 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldcs_c4_(signed char *dst, signed char *src)
{
  float *out = (float *)dst;
  float e0, e1;
  asm ("ld.global.cs.v2.f32 {%0,%1}, [%2];"
       : "=f"(e0), "=f"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* Load two floats from src with PTX ld.global.cs.v2.f32 and return them as
   the r and i members of a cmplx2. */
NVHPC_UTILRT_DEV_SIG cmplx2
__ldcs_c4x(signed char *src)
{
  float e0, e1;
  cmplx2 result;
  asm ("ld.global.cs.v2.f32 {%0,%1}, [%2];"
       : "=f"(e0), "=f"(e1)
       : "l"(src)
       : "memory");
  result.r = e0;
  result.i = e1;
  return result;
}

/* Load two doubles from src with PTX ld.global.cs.v2.f64 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldcs_c8_(signed char *dst, signed char *src)
{
  double *out = (double *)dst;
  double e0, e1;
  asm ("ld.global.cs.v2.f64 {%0,%1}, [%2];"
       : "=d"(e0), "=d"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* Load two doubles from src with PTX ld.global.cs.v2.f64 and return them as
   the r and i members of a dcmplx2. */
NVHPC_UTILRT_DEV_SIG dcmplx2
__ldcs_c8x(signed char *src)
{
  double e0, e1;
  dcmplx2 result;
  asm ("ld.global.cs.v2.f64 {%0,%1}, [%2];"
       : "=d"(e0), "=d"(e1)
       : "l"(src)
       : "memory");
  result.r = e0;
  result.i = e1;
  return result;
}

/* Load four ints from src with PTX ld.global.cs.v4.s32 into dst[0..3]. */
NVHPC_UTILRT_DEV_SIG void
__ldcs_i4x4_(signed char *dst, signed char *src)
{
  int *out = (int *)dst;
  int e0, e1, e2, e3;
  asm ("ld.global.cs.v4.s32 {%0,%1,%2,%3}, [%4];"
       : "=r"(e0), "=r"(e1), "=r"(e2), "=r"(e3)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
  out[2] = e2;
  out[3] = e3;
}

/* Load two long longs from src with PTX ld.global.cs.v2.s64 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldcs_i8x2_(signed char *dst, signed char *src)
{
  long long *out = (long long *)dst;
  long long e0, e1;
  asm ("ld.global.cs.v2.s64 {%0,%1}, [%2];"
       : "=l"(e0), "=l"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* Load 32 raw bits from src with PTX ld.global.cs.b32 and store them at dst. */
NVHPC_UTILRT_DEV_SIG void
__ldcs_r2x2_(signed char *dst, signed char *src)
{
  unsigned int packed;
  asm ("ld.global.cs.b32 {%0}, [%1];"
       : "=r"(packed)
       : "l"(src)
       : "memory");
  *((unsigned int *)dst) = packed;
}

/* Load four floats from src with PTX ld.global.cs.v4.f32 into dst[0..3]. */
NVHPC_UTILRT_DEV_SIG void
__ldcs_r4x4_(signed char *dst, signed char *src)
{
  float *out = (float *)dst;
  float e0, e1, e2, e3;
  asm ("ld.global.cs.v4.f32 {%0,%1,%2,%3}, [%4];"
       : "=f"(e0), "=f"(e1), "=f"(e2), "=f"(e3)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
  out[2] = e2;
  out[3] = e3;
}

/* Load two doubles from src with PTX ld.global.cs.v2.f64 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldcs_r8x2_(signed char *dst, signed char *src)
{
  double *out = (double *)dst;
  double e0, e1;
  asm ("ld.global.cs.v2.f64 {%0,%1}, [%2];"
       : "=d"(e0), "=d"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* ldlu */
/* Load a signed 32-bit integer from ptr with PTX ld.global.lu.s32. */
NVHPC_UTILRT_DEV_SIG int
__ldlu_i4(signed char *ptr)
{
  unsigned int loaded;
  asm ("ld.global.lu.s32 %0, [%1];"
       : "=r"(loaded)
       : "l"(ptr)
       : "memory");
  return (int)loaded;
}

/* Load a signed 64-bit integer from ptr with PTX ld.global.lu.s64. */
NVHPC_UTILRT_DEV_SIG long long
__ldlu_i8(signed char *ptr)
{
  unsigned long long loaded;
  asm ("ld.global.lu.s64 %0, [%1];"
       : "=l"(loaded)
       : "l"(ptr)
       : "memory");
  return (long long)loaded;
}

/* Load an unsigned 64-bit value from ptr with PTX ld.global.lu.u64. */
NVHPC_UTILRT_DEV_SIG unsigned long long
__ldlu_cd(signed char *ptr)
{
  unsigned long long loaded;
  asm ("ld.global.lu.u64 %0, [%1];"
       : "=l"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load 16 raw bits from ptr with PTX ld.global.lu.b16. */
NVHPC_UTILRT_DEV_SIG unsigned short
__ldlu_r2(signed char *ptr)
{
  unsigned short loaded;
  asm ("ld.global.lu.b16 %0, [%1];"
       : "=h"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load a float from ptr with PTX ld.global.lu.f32. */
NVHPC_UTILRT_DEV_SIG float
__ldlu_r4(signed char *ptr)
{
  float loaded;
  asm ("ld.global.lu.f32 %0, [%1];"
       : "=f"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load a double from ptr with PTX ld.global.lu.f64. */
NVHPC_UTILRT_DEV_SIG double
__ldlu_r8(signed char *ptr)
{
  double loaded;
  asm ("ld.global.lu.f64 %0, [%1];"
       : "=d"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load two floats from src with PTX ld.global.lu.v2.f32 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldlu_c4_(signed char *dst, signed char *src)
{
  float *out = (float *)dst;
  float e0, e1;
  asm ("ld.global.lu.v2.f32 {%0,%1}, [%2];"
       : "=f"(e0), "=f"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* Load two floats from src with PTX ld.global.lu.v2.f32 and return them as
   the r and i members of a cmplx2. */
NVHPC_UTILRT_DEV_SIG cmplx2
__ldlu_c4x(signed char *src)
{
  float e0, e1;
  cmplx2 result;
  asm ("ld.global.lu.v2.f32 {%0,%1}, [%2];"
       : "=f"(e0), "=f"(e1)
       : "l"(src)
       : "memory");
  result.r = e0;
  result.i = e1;
  return result;
}

/* Load two doubles from src with PTX ld.global.lu.v2.f64 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldlu_c8_(signed char *dst, signed char *src)
{
  double *out = (double *)dst;
  double e0, e1;
  asm ("ld.global.lu.v2.f64 {%0,%1}, [%2];"
       : "=d"(e0), "=d"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* Load two doubles from src with PTX ld.global.lu.v2.f64 and return them as
   the r and i members of a dcmplx2. */
NVHPC_UTILRT_DEV_SIG dcmplx2
__ldlu_c8x(signed char *src)
{
  double e0, e1;
  dcmplx2 result;
  asm ("ld.global.lu.v2.f64 {%0,%1}, [%2];"
       : "=d"(e0), "=d"(e1)
       : "l"(src)
       : "memory");
  result.r = e0;
  result.i = e1;
  return result;
}

/* Load four ints from src with PTX ld.global.lu.v4.s32 into dst[0..3]. */
NVHPC_UTILRT_DEV_SIG void
__ldlu_i4x4_(signed char *dst, signed char *src)
{
  int *out = (int *)dst;
  int e0, e1, e2, e3;
  asm ("ld.global.lu.v4.s32 {%0,%1,%2,%3}, [%4];"
       : "=r"(e0), "=r"(e1), "=r"(e2), "=r"(e3)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
  out[2] = e2;
  out[3] = e3;
}

/* Load two long longs from src with PTX ld.global.lu.v2.s64 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldlu_i8x2_(signed char *dst, signed char *src)
{
  long long *out = (long long *)dst;
  long long e0, e1;
  asm ("ld.global.lu.v2.s64 {%0,%1}, [%2];"
       : "=l"(e0), "=l"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* Load 32 raw bits from src with PTX ld.global.lu.b32 and store them at dst. */
NVHPC_UTILRT_DEV_SIG void
__ldlu_r2x2_(signed char *dst, signed char *src)
{
  unsigned int packed;
  asm ("ld.global.lu.b32 {%0}, [%1];"
       : "=r"(packed)
       : "l"(src)
       : "memory");
  *((unsigned int *)dst) = packed;
}

/* Load four floats from src with PTX ld.global.lu.v4.f32 into dst[0..3]. */
NVHPC_UTILRT_DEV_SIG void
__ldlu_r4x4_(signed char *dst, signed char *src)
{
  float *out = (float *)dst;
  float e0, e1, e2, e3;
  asm ("ld.global.lu.v4.f32 {%0,%1,%2,%3}, [%4];"
       : "=f"(e0), "=f"(e1), "=f"(e2), "=f"(e3)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
  out[2] = e2;
  out[3] = e3;
}

/* Load two doubles from src with PTX ld.global.lu.v2.f64 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldlu_r8x2_(signed char *dst, signed char *src)
{
  double *out = (double *)dst;
  double e0, e1;
  asm ("ld.global.lu.v2.f64 {%0,%1}, [%2];"
       : "=d"(e0), "=d"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* ldcv */
/* Load a signed 32-bit integer from ptr with PTX ld.global.cv.s32. */
NVHPC_UTILRT_DEV_SIG int
__ldcv_i4(signed char *ptr)
{
  unsigned int loaded;
  asm ("ld.global.cv.s32 %0, [%1];"
       : "=r"(loaded)
       : "l"(ptr)
       : "memory");
  return (int)loaded;
}

/* Load a signed 64-bit integer from ptr with PTX ld.global.cv.s64. */
NVHPC_UTILRT_DEV_SIG long long
__ldcv_i8(signed char *ptr)
{
  unsigned long long loaded;
  asm ("ld.global.cv.s64 %0, [%1];"
       : "=l"(loaded)
       : "l"(ptr)
       : "memory");
  return (long long)loaded;
}

/* Load an unsigned 64-bit value from ptr with PTX ld.global.cv.u64. */
NVHPC_UTILRT_DEV_SIG unsigned long long
__ldcv_cd(signed char *ptr)
{
  unsigned long long loaded;
  asm ("ld.global.cv.u64 %0, [%1];"
       : "=l"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load 16 raw bits from ptr with PTX ld.global.cv.b16. */
NVHPC_UTILRT_DEV_SIG unsigned short
__ldcv_r2(signed char *ptr)
{
  unsigned short loaded;
  asm ("ld.global.cv.b16 %0, [%1];"
       : "=h"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load a float from ptr with PTX ld.global.cv.f32. */
NVHPC_UTILRT_DEV_SIG float
__ldcv_r4(signed char *ptr)
{
  float loaded;
  asm ("ld.global.cv.f32 %0, [%1];"
       : "=f"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load a double from ptr with PTX ld.global.cv.f64. */
NVHPC_UTILRT_DEV_SIG double
__ldcv_r8(signed char *ptr)
{
  double loaded;
  asm ("ld.global.cv.f64 %0, [%1];"
       : "=d"(loaded)
       : "l"(ptr)
       : "memory");
  return loaded;
}

/* Load two floats from src with PTX ld.global.cv.v2.f32 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldcv_c4_(signed char *dst, signed char *src)
{
  float *out = (float *)dst;
  float e0, e1;
  asm ("ld.global.cv.v2.f32 {%0,%1}, [%2];"
       : "=f"(e0), "=f"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* Load two floats from src with PTX ld.global.cv.v2.f32 and return them as
   the r and i members of a cmplx2. */
NVHPC_UTILRT_DEV_SIG cmplx2
__ldcv_c4x(signed char *src)
{
  float e0, e1;
  cmplx2 result;
  asm ("ld.global.cv.v2.f32 {%0,%1}, [%2];"
       : "=f"(e0), "=f"(e1)
       : "l"(src)
       : "memory");
  result.r = e0;
  result.i = e1;
  return result;
}

/* Load two doubles from src with PTX ld.global.cv.v2.f64 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldcv_c8_(signed char *dst, signed char *src)
{
  double *out = (double *)dst;
  double e0, e1;
  asm ("ld.global.cv.v2.f64 {%0,%1}, [%2];"
       : "=d"(e0), "=d"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* Load two doubles from src with PTX ld.global.cv.v2.f64 and return them as
   the r and i members of a dcmplx2. */
NVHPC_UTILRT_DEV_SIG dcmplx2
__ldcv_c8x(signed char *src)
{
  double e0, e1;
  dcmplx2 result;
  asm ("ld.global.cv.v2.f64 {%0,%1}, [%2];"
       : "=d"(e0), "=d"(e1)
       : "l"(src)
       : "memory");
  result.r = e0;
  result.i = e1;
  return result;
}

/* Load four ints from src with PTX ld.global.cv.v4.s32 into dst[0..3]. */
NVHPC_UTILRT_DEV_SIG void
__ldcv_i4x4_(signed char *dst, signed char *src)
{
  int *out = (int *)dst;
  int e0, e1, e2, e3;
  asm ("ld.global.cv.v4.s32 {%0,%1,%2,%3}, [%4];"
       : "=r"(e0), "=r"(e1), "=r"(e2), "=r"(e3)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
  out[2] = e2;
  out[3] = e3;
}

/* Load two long longs from src with PTX ld.global.cv.v2.s64 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldcv_i8x2_(signed char *dst, signed char *src)
{
  long long *out = (long long *)dst;
  long long e0, e1;
  asm ("ld.global.cv.v2.s64 {%0,%1}, [%2];"
       : "=l"(e0), "=l"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* Load 32 raw bits from src with PTX ld.global.cv.b32 and store them at dst. */
NVHPC_UTILRT_DEV_SIG void
__ldcv_r2x2_(signed char *dst, signed char *src)
{
  unsigned int packed;
  asm ("ld.global.cv.b32 {%0}, [%1];"
       : "=r"(packed)
       : "l"(src)
       : "memory");
  *((unsigned int *)dst) = packed;
}

/* Load four floats from src with PTX ld.global.cv.v4.f32 into dst[0..3]. */
NVHPC_UTILRT_DEV_SIG void
__ldcv_r4x4_(signed char *dst, signed char *src)
{
  float *out = (float *)dst;
  float e0, e1, e2, e3;
  asm ("ld.global.cv.v4.f32 {%0,%1,%2,%3}, [%4];"
       : "=f"(e0), "=f"(e1), "=f"(e2), "=f"(e3)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
  out[2] = e2;
  out[3] = e3;
}

/* Load two doubles from src with PTX ld.global.cv.v2.f64 into dst[0..1]. */
NVHPC_UTILRT_DEV_SIG void
__ldcv_r8x2_(signed char *dst, signed char *src)
{
  double *out = (double *)dst;
  double e0, e1;
  asm ("ld.global.cv.v2.f64 {%0,%1}, [%2];"
       : "=d"(e0), "=d"(e1)
       : "l"(src)
       : "memory");
  out[0] = e0;
  out[1] = e1;
}

/* stwb */
/* Store the 32-bit integer value at ptr with PTX st.global.wb.s32. */
NVHPC_UTILRT_DEV_SIG void
__stwb_i4(signed char *ptr, int value)
{
  asm ("st.global.wb.s32 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "r"(value)
       : "memory");
}

/* Store the 64-bit integer value at ptr with PTX st.global.wb.s64. */
NVHPC_UTILRT_DEV_SIG void
__stwb_i8(signed char *ptr, long long value)
{
  asm ("st.global.wb.s64 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "l"(value)
       : "memory");
}

/* Read an unsigned 64-bit value through `value` and store it at ptr with
   PTX st.global.wb.u64. */
NVHPC_UTILRT_DEV_SIG void
__stwb_cd(signed char *ptr, signed char *value)
{
  unsigned long long bits = *((unsigned long long *)value);
  asm ("st.global.wb.u64 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "l"(bits)
       : "memory");
}

/* Store the 16 raw bits in value at ptr with PTX st.global.wb.b16. */
NVHPC_UTILRT_DEV_SIG void
__stwb_r2(signed char *ptr, unsigned short value)
{
  asm ("st.global.wb.b16 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "h"(value)
       : "memory");
}

/* Store the float value at ptr with PTX st.global.wb.f32. */
NVHPC_UTILRT_DEV_SIG void
__stwb_r4(signed char *ptr, float value)
{
  asm ("st.global.wb.f32 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "f"(value)
       : "memory");
}

/* Store the double value at ptr with PTX st.global.wb.f64. */
NVHPC_UTILRT_DEV_SIG void
__stwb_r8(signed char *ptr, double value)
{
  asm ("st.global.wb.f64 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "d"(value)
       : "memory");
}

/* Read two floats through `value` and store them at ptr with PTX
   st.global.wb.v2.f32. */
NVHPC_UTILRT_DEV_SIG void
__stwb_c4(signed char *ptr, signed char *value)
{
  float *in = (float *)value;
  float e0 = in[0];
  float e1 = in[1];
  asm ("st.global.wb.v2.f32 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "f"(e0), "f"(e1)
       : "memory");
}

/* Read two doubles through `value` and store them at ptr with PTX
   st.global.wb.v2.f64. */
NVHPC_UTILRT_DEV_SIG void
__stwb_c8(signed char *ptr, signed char *value)
{
  double *in = (double *)value;
  double e0 = in[0];
  double e1 = in[1];
  asm ("st.global.wb.v2.f64 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "d"(e0), "d"(e1)
       : "memory");
}

/* Read four ints through `value` and store them at ptr with PTX
   st.global.wb.v4.s32. */
NVHPC_UTILRT_DEV_SIG void
__stwb_i4x4(signed char *ptr, signed char *value)
{
  int *in = (int *)value;
  int e0 = in[0];
  int e1 = in[1];
  int e2 = in[2];
  int e3 = in[3];
  asm ("st.global.wb.v4.s32 [%0], {%1,%2,%3,%4};"
       : /* no outputs */
       : "l"(ptr), "r"(e0), "r"(e1), "r"(e2), "r"(e3)
       : "memory");
}

/* Read two long longs through `value` and store them at ptr with PTX
   st.global.wb.v2.s64. */
NVHPC_UTILRT_DEV_SIG void
__stwb_i8x2(signed char *ptr, signed char *value)
{
  long long *in = (long long *)value;
  long long e0 = in[0];
  long long e1 = in[1];
  asm ("st.global.wb.v2.s64 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "l"(e0), "l"(e1)
       : "memory");
}

/* Read 32 raw bits through `value` and store them at ptr with PTX
   st.global.wb.b32. */
NVHPC_UTILRT_DEV_SIG void
__stwb_r2x2(signed char *ptr, signed char *value)
{
  int packed = *((int *)value);
  asm ("st.global.wb.b32 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "r"(packed)
       : "memory");
}

/* Read four floats through `value` and store them at ptr with PTX
   st.global.wb.v4.f32. */
NVHPC_UTILRT_DEV_SIG void
__stwb_r4x4(signed char *ptr, signed char *value)
{
  float *in = (float *)value;
  float e0 = in[0];
  float e1 = in[1];
  float e2 = in[2];
  float e3 = in[3];
  asm ("st.global.wb.v4.f32 [%0], {%1,%2,%3,%4};"
       : /* no outputs */
       : "l"(ptr), "f"(e0), "f"(e1), "f"(e2), "f"(e3)
       : "memory");
}

/* Read two doubles through `value` and store them at ptr with PTX
   st.global.wb.v2.f64. */
NVHPC_UTILRT_DEV_SIG void
__stwb_r8x2(signed char *ptr, signed char *value)
{
  double *in = (double *)value;
  double e0 = in[0];
  double e1 = in[1];
  asm ("st.global.wb.v2.f64 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "d"(e0), "d"(e1)
       : "memory");
}

/* stcg */
/* Store the 32-bit integer value at ptr with PTX st.global.cg.s32. */
NVHPC_UTILRT_DEV_SIG void
__stcg_i4(signed char *ptr, int value)
{
  asm ("st.global.cg.s32 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "r"(value)
       : "memory");
}

/* Store the 64-bit integer value at ptr with PTX st.global.cg.s64. */
NVHPC_UTILRT_DEV_SIG void
__stcg_i8(signed char *ptr, long long value)
{
  asm ("st.global.cg.s64 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "l"(value)
       : "memory");
}

/* Read an unsigned 64-bit value through `value` and store it at ptr with
   PTX st.global.cg.u64. */
NVHPC_UTILRT_DEV_SIG void
__stcg_cd(signed char *ptr, signed char *value)
{
  unsigned long long bits = *((unsigned long long *)value);
  asm ("st.global.cg.u64 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "l"(bits)
       : "memory");
}

/* Store the 16 raw bits in value at ptr with PTX st.global.cg.b16. */
NVHPC_UTILRT_DEV_SIG void
__stcg_r2(signed char *ptr, unsigned short value)
{
  asm ("st.global.cg.b16 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "h"(value)
       : "memory");
}

/* Store the float value at ptr with PTX st.global.cg.f32. */
NVHPC_UTILRT_DEV_SIG void
__stcg_r4(signed char *ptr, float value)
{
  asm ("st.global.cg.f32 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "f"(value)
       : "memory");
}

/* Store the double value at ptr with PTX st.global.cg.f64. */
NVHPC_UTILRT_DEV_SIG void
__stcg_r8(signed char *ptr, double value)
{
  asm ("st.global.cg.f64 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "d"(value)
       : "memory");
}

/* Read two floats through `value` and store them at ptr with PTX
   st.global.cg.v2.f32. */
NVHPC_UTILRT_DEV_SIG void
__stcg_c4(signed char *ptr, signed char *value)
{
  float *in = (float *)value;
  float e0 = in[0];
  float e1 = in[1];
  asm ("st.global.cg.v2.f32 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "f"(e0), "f"(e1)
       : "memory");
}

/* Read two doubles through `value` and store them at ptr with PTX
   st.global.cg.v2.f64. */
NVHPC_UTILRT_DEV_SIG void
__stcg_c8(signed char *ptr, signed char *value)
{
  double *in = (double *)value;
  double e0 = in[0];
  double e1 = in[1];
  asm ("st.global.cg.v2.f64 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "d"(e0), "d"(e1)
       : "memory");
}

/* Read four ints through `value` and store them at ptr with PTX
   st.global.cg.v4.s32. */
NVHPC_UTILRT_DEV_SIG void
__stcg_i4x4(signed char *ptr, signed char *value)
{
  int *in = (int *)value;
  int e0 = in[0];
  int e1 = in[1];
  int e2 = in[2];
  int e3 = in[3];
  asm ("st.global.cg.v4.s32 [%0], {%1,%2,%3,%4};"
       : /* no outputs */
       : "l"(ptr), "r"(e0), "r"(e1), "r"(e2), "r"(e3)
       : "memory");
}

/* Read two long longs through `value` and store them at ptr with PTX
   st.global.cg.v2.s64. */
NVHPC_UTILRT_DEV_SIG void
__stcg_i8x2(signed char *ptr, signed char *value)
{
  long long *in = (long long *)value;
  long long e0 = in[0];
  long long e1 = in[1];
  asm ("st.global.cg.v2.s64 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "l"(e0), "l"(e1)
       : "memory");
}

/* Read 32 raw bits through `value` and store them at ptr with PTX
   st.global.cg.b32. */
NVHPC_UTILRT_DEV_SIG void
__stcg_r2x2(signed char *ptr, signed char *value)
{
  int packed = *((int *)value);
  asm ("st.global.cg.b32 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "r"(packed)
       : "memory");
}

/* Read four floats through `value` and store them at ptr with PTX
   st.global.cg.v4.f32. */
NVHPC_UTILRT_DEV_SIG void
__stcg_r4x4(signed char *ptr, signed char *value)
{
  float *in = (float *)value;
  float e0 = in[0];
  float e1 = in[1];
  float e2 = in[2];
  float e3 = in[3];
  asm ("st.global.cg.v4.f32 [%0], {%1,%2,%3,%4};"
       : /* no outputs */
       : "l"(ptr), "f"(e0), "f"(e1), "f"(e2), "f"(e3)
       : "memory");
}

/* Read two doubles through `value` and store them at ptr with PTX
   st.global.cg.v2.f64. */
NVHPC_UTILRT_DEV_SIG void
__stcg_r8x2(signed char *ptr, signed char *value)
{
  double *in = (double *)value;
  double e0 = in[0];
  double e1 = in[1];
  asm ("st.global.cg.v2.f64 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "d"(e0), "d"(e1)
       : "memory");
}

/* stcs */
/* Store the 32-bit integer value at ptr with PTX st.global.cs.s32. */
NVHPC_UTILRT_DEV_SIG void
__stcs_i4(signed char *ptr, int value)
{
  asm ("st.global.cs.s32 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "r"(value)
       : "memory");
}

/* Store the 64-bit integer value at ptr with PTX st.global.cs.s64. */
NVHPC_UTILRT_DEV_SIG void
__stcs_i8(signed char *ptr, long long value)
{
  asm ("st.global.cs.s64 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "l"(value)
       : "memory");
}

/* Read an unsigned 64-bit value through `value` and store it at ptr with
   PTX st.global.cs.u64. */
NVHPC_UTILRT_DEV_SIG void
__stcs_cd(signed char *ptr, signed char *value)
{
  unsigned long long bits = *((unsigned long long *)value);
  asm ("st.global.cs.u64 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "l"(bits)
       : "memory");
}

/* Store the 16 raw bits in value at ptr with PTX st.global.cs.b16. */
NVHPC_UTILRT_DEV_SIG void
__stcs_r2(signed char *ptr, unsigned short value)
{
  asm ("st.global.cs.b16 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "h"(value)
       : "memory");
}

/* Store the float value at ptr with PTX st.global.cs.f32. */
NVHPC_UTILRT_DEV_SIG void
__stcs_r4(signed char *ptr, float value)
{
  asm ("st.global.cs.f32 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "f"(value)
       : "memory");
}

/* Store the double value at ptr with PTX st.global.cs.f64. */
NVHPC_UTILRT_DEV_SIG void
__stcs_r8(signed char *ptr, double value)
{
  asm ("st.global.cs.f64 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "d"(value)
       : "memory");
}

/* Read two floats through `value` and store them at ptr with PTX
   st.global.cs.v2.f32. */
NVHPC_UTILRT_DEV_SIG void
__stcs_c4(signed char *ptr, signed char *value)
{
  float *in = (float *)value;
  float e0 = in[0];
  float e1 = in[1];
  asm ("st.global.cs.v2.f32 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "f"(e0), "f"(e1)
       : "memory");
}

/* Read two doubles through `value` and store them at ptr with PTX
   st.global.cs.v2.f64. */
NVHPC_UTILRT_DEV_SIG void
__stcs_c8(signed char *ptr, signed char *value)
{
  double *in = (double *)value;
  double e0 = in[0];
  double e1 = in[1];
  asm ("st.global.cs.v2.f64 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "d"(e0), "d"(e1)
       : "memory");
}

/* Read four ints through `value` and store them at ptr with PTX
   st.global.cs.v4.s32. */
NVHPC_UTILRT_DEV_SIG void
__stcs_i4x4(signed char *ptr, signed char *value)
{
  int *in = (int *)value;
  int e0 = in[0];
  int e1 = in[1];
  int e2 = in[2];
  int e3 = in[3];
  asm ("st.global.cs.v4.s32 [%0], {%1,%2,%3,%4};"
       : /* no outputs */
       : "l"(ptr), "r"(e0), "r"(e1), "r"(e2), "r"(e3)
       : "memory");
}

/* Read two long longs through `value` and store them at ptr with PTX
   st.global.cs.v2.s64. */
NVHPC_UTILRT_DEV_SIG void
__stcs_i8x2(signed char *ptr, signed char *value)
{
  long long *in = (long long *)value;
  long long e0 = in[0];
  long long e1 = in[1];
  asm ("st.global.cs.v2.s64 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "l"(e0), "l"(e1)
       : "memory");
}

/* Read 32 raw bits through `value` and store them at ptr with PTX
   st.global.cs.b32. */
NVHPC_UTILRT_DEV_SIG void
__stcs_r2x2(signed char *ptr, signed char *value)
{
  int packed = *((int *)value);
  asm ("st.global.cs.b32 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "r"(packed)
       : "memory");
}

/* Read four floats through `value` and store them at ptr with PTX
   st.global.cs.v4.f32. */
NVHPC_UTILRT_DEV_SIG void
__stcs_r4x4(signed char *ptr, signed char *value)
{
  float *in = (float *)value;
  float e0 = in[0];
  float e1 = in[1];
  float e2 = in[2];
  float e3 = in[3];
  asm ("st.global.cs.v4.f32 [%0], {%1,%2,%3,%4};"
       : /* no outputs */
       : "l"(ptr), "f"(e0), "f"(e1), "f"(e2), "f"(e3)
       : "memory");
}

/* Read two doubles through `value` and store them at ptr with PTX
   st.global.cs.v2.f64. */
NVHPC_UTILRT_DEV_SIG void
__stcs_r8x2(signed char *ptr, signed char *value)
{
  double *in = (double *)value;
  double e0 = in[0];
  double e1 = in[1];
  asm ("st.global.cs.v2.f64 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "d"(e0), "d"(e1)
       : "memory");
}

/* stwt */
/* Store the 32-bit integer value at ptr with PTX st.global.wt.s32. */
NVHPC_UTILRT_DEV_SIG void
__stwt_i4(signed char *ptr, int value)
{
  asm ("st.global.wt.s32 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "r"(value)
       : "memory");
}

/* Store the 64-bit integer value at ptr with PTX st.global.wt.s64. */
NVHPC_UTILRT_DEV_SIG void
__stwt_i8(signed char *ptr, long long value)
{
  asm ("st.global.wt.s64 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "l"(value)
       : "memory");
}

/* Read an unsigned 64-bit value through `value` and store it at ptr with
   PTX st.global.wt.u64. */
NVHPC_UTILRT_DEV_SIG void
__stwt_cd(signed char *ptr, signed char *value)
{
  unsigned long long bits = *((unsigned long long *)value);
  asm ("st.global.wt.u64 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "l"(bits)
       : "memory");
}

/* Store the 16 raw bits in value at ptr with PTX st.global.wt.b16. */
NVHPC_UTILRT_DEV_SIG void
__stwt_r2(signed char *ptr, unsigned short value)
{
  asm ("st.global.wt.b16 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "h"(value)
       : "memory");
}

/* Store the float value at ptr with PTX st.global.wt.f32. */
NVHPC_UTILRT_DEV_SIG void
__stwt_r4(signed char *ptr, float value)
{
  asm ("st.global.wt.f32 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "f"(value)
       : "memory");
}

/* Store the double value at ptr with PTX st.global.wt.f64. */
NVHPC_UTILRT_DEV_SIG void
__stwt_r8(signed char *ptr, double value)
{
  asm ("st.global.wt.f64 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "d"(value)
       : "memory");
}

/* Read two floats through `value` and store them at ptr with PTX
   st.global.wt.v2.f32. */
NVHPC_UTILRT_DEV_SIG void
__stwt_c4(signed char *ptr, signed char *value)
{
  float *in = (float *)value;
  float e0 = in[0];
  float e1 = in[1];
  asm ("st.global.wt.v2.f32 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "f"(e0), "f"(e1)
       : "memory");
}

/* Read two doubles through `value` and store them at ptr with PTX
   st.global.wt.v2.f64. */
NVHPC_UTILRT_DEV_SIG void
__stwt_c8(signed char *ptr, signed char *value)
{
  double *in = (double *)value;
  double e0 = in[0];
  double e1 = in[1];
  asm ("st.global.wt.v2.f64 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "d"(e0), "d"(e1)
       : "memory");
}

/* Read four ints through `value` and store them at ptr with PTX
   st.global.wt.v4.s32. */
NVHPC_UTILRT_DEV_SIG void
__stwt_i4x4(signed char *ptr, signed char *value)
{
  int *in = (int *)value;
  int e0 = in[0];
  int e1 = in[1];
  int e2 = in[2];
  int e3 = in[3];
  asm ("st.global.wt.v4.s32 [%0], {%1,%2,%3,%4};"
       : /* no outputs */
       : "l"(ptr), "r"(e0), "r"(e1), "r"(e2), "r"(e3)
       : "memory");
}

/* Read two long longs through `value` and store them at ptr with PTX
   st.global.wt.v2.s64. */
NVHPC_UTILRT_DEV_SIG void
__stwt_i8x2(signed char *ptr, signed char *value)
{
  long long *in = (long long *)value;
  long long e0 = in[0];
  long long e1 = in[1];
  asm ("st.global.wt.v2.s64 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "l"(e0), "l"(e1)
       : "memory");
}

/* Read 32 raw bits through `value` and store them at ptr with PTX
   st.global.wt.b32. */
NVHPC_UTILRT_DEV_SIG void
__stwt_r2x2(signed char *ptr, signed char *value)
{
  int packed = *((int *)value);
  asm ("st.global.wt.b32 [%0], %1;"
       : /* no outputs */
       : "l"(ptr), "r"(packed)
       : "memory");
}

/* Read four floats through `value` and store them at ptr with PTX
   st.global.wt.v4.f32. */
NVHPC_UTILRT_DEV_SIG void
__stwt_r4x4(signed char *ptr, signed char *value)
{
  float *in = (float *)value;
  float e0 = in[0];
  float e1 = in[1];
  float e2 = in[2];
  float e3 = in[3];
  asm ("st.global.wt.v4.f32 [%0], {%1,%2,%3,%4};"
       : /* no outputs */
       : "l"(ptr), "f"(e0), "f"(e1), "f"(e2), "f"(e3)
       : "memory");
}

/* Read two doubles through `value` and store them at ptr with PTX
   st.global.wt.v2.f64. */
NVHPC_UTILRT_DEV_SIG void
__stwt_r8x2(signed char *ptr, signed char *value)
{
  double *in = (double *)value;
  double e0 = in[0];
  double e1 = in[1];
  asm ("st.global.wt.v2.f64 [%0], {%1,%2};"
       : /* no outputs */
       : "l"(ptr), "d"(e0), "d"(e1)
       : "memory");
}

/* Forward pred to syncthreads_count() and hand back whatever it returns.
   NOTE(review): syncthreads_count is declared outside this file; its exact
   semantics should be confirmed against that declaration. */
NVHPC_UTILRT_DEV_SIG int
__nvhpc_syncthreads_count(int pred)
{
  int count = syncthreads_count(pred);
  return count;
}

/*
 * Bit-field move: copy *len bits of *from, starting at bit *frompos, into
 * *to starting at bit *topos, leaving every other bit of *to unchanged.
 *
 * from/to point at a 4- or 8-byte integer as selected by *szfrom; any other
 * *szfrom makes the call a no-op.  frompos, len and topos each point at a
 * 4-byte integer when the matching sz* argument is 4 and at an 8-byte
 * integer otherwise.  The call is also a no-op when a position is negative,
 * the length is not positive, or the field does not fit in the operand.
 */
NVHPC_UTILRT_DEV_SIG void
pgf90_mvbits(signed char *from, signed char *frompos, signed char *len,
                 signed char *to, signed char *topos, signed char *szfrom,
                 signed char *szfrompos, signed char *szlen, signed char *sztopos)
{
  long long ifrp, ilen, itop;
  int nbits;

  /* Keep the full 64-bit values: narrowing to int before validating would
     let an out-of-range 8-byte argument wrap to a value that looks valid. */
  if (4 == (*((int *)szfrompos)))
    ifrp = *((int *)frompos);
  else
    ifrp = *((long long *)frompos);

  if (4 == (*((int *)szlen)))
    ilen = *((int *)len);
  else
    ilen = *((long long *)len);

  if (4 == (*((int *)sztopos)))
    itop = *((int *)topos);
  else
    itop = *((long long *)topos);

  if (ifrp < 0 || itop < 0 || ilen <= 0)
    return;

  if (4 == (*((int *)szfrom)))
    nbits = 32;
  else if (8 == (*((int *)szfrom)))
    nbits = 64;
  else
    return;

  /* Written with subtractions so the range test itself cannot overflow. */
  if (ilen > nbits || ifrp > nbits - ilen || itop > nbits - ilen)
    return;

  /* The masks are built in unsigned arithmetic: left-shifting a negative
     value, or shifting into the sign bit, is undefined for signed types. */
  if (nbits == 32) {
    if (ilen == 32) {
      *(int *)to = *(int *)from;
    } else {
      unsigned int ifr, ito, imsk;
      ifr = *((unsigned int *)from);
      ito = *((unsigned int *)to);
      imsk = ((1u << ilen) - 1u) << itop;
      *(unsigned int *)to = (ito & ~imsk) | (((ifr >> ifrp) << itop) & imsk);
    }
  } else {
    if (ilen == 64) {
      *(long long *)to = *(long long *)from;
    } else {
      unsigned long long ifr, ito, imsk;
      ifr = *((unsigned long long *)from);
      ito = *((unsigned long long *)to);
      imsk = ((1ULL << ilen) - 1ULL) << itop;
      *(unsigned long long *)to = (ito & ~imsk) | (((ifr >> ifrp) << itop) & imsk);
    }
  }
}

/* Read the PTX %globaltimer special register and return its 64-bit value. */
NVHPC_UTILRT_DEV_SIG unsigned long long
__nvf_global_timer()
{
  unsigned long long ret;
  /* volatile: the statement has no inputs, so without it the compiler may
     treat two reads as the same value and merge, hoist or drop them, which
     defeats timing code that calls this twice. */
  asm volatile("mov.u64 %0, %globaltimer;" : "=l"(ret));
  return ret;
}

#if defined(PGI_COMPILE_BITCODE)

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
#include <patched/cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
namespace cg = cooperative_groups;

/*
 * Synchronize on the cooperative-groups grid group of the calling thread.
 */
NVHPC_UTILRT_DEV_SIG void
__nvf_fastgridsync()
{
    cg::grid_group whole_grid = cg::this_grid();

    cg::sync(whole_grid);
}

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900

/*
 * Cluster descriptor written through a generic pointer by
 * __nvf_cg_this_cluster_().
 */
typedef struct __nvf_ClusterType {
  unsigned long long handle; /* not written by __nvf_cg_this_cluster_() */
  unsigned int size;         /* set from cluster.num_threads() */
  unsigned int rank;         /* set from cluster.block_rank() + 1 */
} __nvf_ClusterType;

/*
 * Synchronize on the cooperative-groups cluster group of the calling thread.
 */
NVHPC_UTILRT_DEV_SIG void
__nvf_cg_syncthreads_cluster()
{
    cg::this_cluster().sync();
}

/*
 * Return cluster.map_shared_rank() applied to 'addr' for the given block.
 * 'rank' is 1-based on entry and is converted to 0-based for the call.
 */
NVHPC_UTILRT_DEV_SIG void*
__nvf_cg_map_shared_rank(signed char *addr, int rank)
{
    const int zero_based_rank = rank - 1;

    return (void *)(cg::this_cluster().map_shared_rank(addr, zero_based_rank));
}

/*
 * Fortran symbol, for handling the struct arg.
 *
 * 'cg' is treated as a pointer to an __nvf_ClusterType; its size and rank
 * fields are filled in from the current cluster group (rank is stored
 * 1-based).  The handle field is left untouched.
 */
NVHPC_UTILRT_DEV_SIG void
__nvf_cg_this_cluster_(signed char *cg)
{
    cg::cluster_group this_cl = cg::this_cluster();
    __nvf_ClusterType *out = (__nvf_ClusterType *)cg;

    out->size = this_cl.num_threads();
    out->rank = this_cl.block_rank() + 1;
}

/*
 * Store the x, y and z components of the current cluster's dim_blocks()
 * into the three ints addressed by 'db'.
 */
NVHPC_UTILRT_DEV_SIG void
__nvf_dimblocks_this_cluster_(signed char *db)
{
    int *out = (int *)(db);
    dim3 extent = cg::this_cluster().dim_blocks();

    out[0] = extent.x;
    out[1] = extent.y;
    out[2] = extent.z;
}

/*
 * Store the x, y and z components of the current cluster's block_index(),
 * each incremented by one (1-based), into the three ints addressed by 'bi'.
 */
NVHPC_UTILRT_DEV_SIG void
__nvf_blockindex_this_cluster_(signed char *bi)
{
  int *out = (int *)(bi);
  dim3 idx = cg::this_cluster().block_index();

  out[0] = idx.x + 1;
  out[1] = idx.y + 1;
  out[2] = idx.z + 1;
}

/*
 * Call barrier_arrive() on the current cluster group.
 */
NVHPC_UTILRT_DEV_SIG void
__nvf_barrier_arrive_this_cluster_()
{
  cg::this_cluster().barrier_arrive();
}

/*
 * Call barrier_wait() on the current cluster group.
 */
NVHPC_UTILRT_DEV_SIG void
__nvf_barrier_wait_this_cluster_()
{
  cg::this_cluster().barrier_wait();
}

/* === Bulk loads and stores, and associated Barriers.  1st for CUF */
/* === Still within CUDA_ARCH >= 900                                */

#include <cuda_awbarrier_primitives.h>

/*
 * Issue a cp.async.bulk copy of 'size' bytes from 'src' to 'dst', with
 * completion reported to 'barrier' (mbarrier complete_tx in bytes).
 * 'dst' and 'barrier' are converted to shared-space addresses first.
 * No expect_tx is issued here.
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_g2s(__mbarrier_t *barrier, const void *src, void *dst, int size)
{
  const unsigned int smem_dst = static_cast<unsigned int>(__cvta_generic_to_shared(dst));
  const unsigned int smem_bar = static_cast<unsigned int>(__cvta_generic_to_shared(barrier));

  asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];"
      :: "r"(smem_dst), "l"(src), "r"(size), "r"(smem_bar)
      : "memory");
}

/*
 * Bulk load: scale 'size' by 4 to get a byte count, issue the cp.async.bulk
 * copy from 'src' to 'dst' signalling 'barrier', then issue
 * mbarrier.expect_tx on 'barrier' for the same byte count.
 * (Presumably 'size' counts 4-byte integer elements, per the name.)
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_ldi4(__mbarrier_t *barrier, const void *src, void *dst, int size)
{
  const int nbytes = size * 4;
  const unsigned int smem_dst = static_cast<unsigned int>(__cvta_generic_to_shared(dst));
  const unsigned int smem_bar = static_cast<unsigned int>(__cvta_generic_to_shared(barrier));

  asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];"
      :: "r"(smem_dst), "l"(src), "r"(nbytes), "r"(smem_bar)
      : "memory");
  asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
      :: "r"(smem_bar), "r"(nbytes)
      : "memory");
}

/*
 * Bulk load: scale 'size' by 8 to get a byte count, issue the cp.async.bulk
 * copy from 'src' to 'dst' signalling 'barrier', then issue
 * mbarrier.expect_tx on 'barrier' for the same byte count.
 * (Presumably 'size' counts 8-byte integer elements, per the name.)
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_ldi8(__mbarrier_t *barrier, const void *src, void *dst, int size)
{
  const int nbytes = size * 8;
  const unsigned int smem_dst = static_cast<unsigned int>(__cvta_generic_to_shared(dst));
  const unsigned int smem_bar = static_cast<unsigned int>(__cvta_generic_to_shared(barrier));

  asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];"
      :: "r"(smem_dst), "l"(src), "r"(nbytes), "r"(smem_bar)
      : "memory");
  asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
      :: "r"(smem_bar), "r"(nbytes)
      : "memory");
}

/*
 * Bulk load: scale 'size' by 2 to get a byte count, issue the cp.async.bulk
 * copy from 'src' to 'dst' signalling 'barrier', then issue
 * mbarrier.expect_tx on 'barrier' for the same byte count.
 * (Presumably 'size' counts 2-byte real elements, per the name.)
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_ldr2(__mbarrier_t *barrier, const void *src, void *dst, int size)
{
  const int nbytes = size * 2;
  const unsigned int smem_dst = static_cast<unsigned int>(__cvta_generic_to_shared(dst));
  const unsigned int smem_bar = static_cast<unsigned int>(__cvta_generic_to_shared(barrier));

  asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];"
      :: "r"(smem_dst), "l"(src), "r"(nbytes), "r"(smem_bar)
      : "memory");
  asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
      :: "r"(smem_bar), "r"(nbytes)
      : "memory");
}

/*
 * Bulk load: scale 'size' by 4 to get a byte count, issue the cp.async.bulk
 * copy from 'src' to 'dst' signalling 'barrier', then issue
 * mbarrier.expect_tx on 'barrier' for the same byte count.
 * (Presumably 'size' counts 4-byte real elements, per the name.)
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_ldr4(__mbarrier_t *barrier, const void *src, void *dst, int size)
{
  const int nbytes = size * 4;
  const unsigned int smem_dst = static_cast<unsigned int>(__cvta_generic_to_shared(dst));
  const unsigned int smem_bar = static_cast<unsigned int>(__cvta_generic_to_shared(barrier));

  asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];"
      :: "r"(smem_dst), "l"(src), "r"(nbytes), "r"(smem_bar)
      : "memory");
  asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
      :: "r"(smem_bar), "r"(nbytes)
      : "memory");
}

/*
 * Bulk load: scale 'size' by 8 to get a byte count, issue the cp.async.bulk
 * copy from 'src' to 'dst' signalling 'barrier', then issue
 * mbarrier.expect_tx on 'barrier' for the same byte count.
 * (Presumably 'size' counts 8-byte real elements, per the name.)
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_ldr8(__mbarrier_t *barrier, const void *src, void *dst, int size)
{
  const int nbytes = size * 8;
  const unsigned int smem_dst = static_cast<unsigned int>(__cvta_generic_to_shared(dst));
  const unsigned int smem_bar = static_cast<unsigned int>(__cvta_generic_to_shared(barrier));

  asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];"
      :: "r"(smem_dst), "l"(src), "r"(nbytes), "r"(smem_bar)
      : "memory");
  asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
      :: "r"(smem_bar), "r"(nbytes)
      : "memory");
}

/*
 * Bulk load: scale 'size' by 8 to get a byte count, issue the cp.async.bulk
 * copy from 'src' to 'dst' signalling 'barrier', then issue
 * mbarrier.expect_tx on 'barrier' for the same byte count.
 * (Presumably 'size' counts complex elements made of two 4-byte parts,
 * per the name.)
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_ldc4(__mbarrier_t *barrier, const void *src, void *dst, int size)
{
  const int nbytes = size * 8;
  const unsigned int smem_dst = static_cast<unsigned int>(__cvta_generic_to_shared(dst));
  const unsigned int smem_bar = static_cast<unsigned int>(__cvta_generic_to_shared(barrier));

  asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];"
      :: "r"(smem_dst), "l"(src), "r"(nbytes), "r"(smem_bar)
      : "memory");
  asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
      :: "r"(smem_bar), "r"(nbytes)
      : "memory");
}

/*
 * Bulk load: scale 'size' by 16 to get a byte count, issue the cp.async.bulk
 * copy from 'src' to 'dst' signalling 'barrier', then issue
 * mbarrier.expect_tx on 'barrier' for the same byte count.
 * (Presumably 'size' counts complex elements made of two 8-byte parts,
 * per the name.)
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_ldc8(__mbarrier_t *barrier, const void *src, void *dst, int size)
{
  const int nbytes = size * 16;
  const unsigned int smem_dst = static_cast<unsigned int>(__cvta_generic_to_shared(dst));
  const unsigned int smem_bar = static_cast<unsigned int>(__cvta_generic_to_shared(barrier));

  asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];"
      :: "r"(smem_dst), "l"(src), "r"(nbytes), "r"(smem_bar)
      : "memory");
  asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
      :: "r"(smem_bar), "r"(nbytes)
      : "memory");
}

/*
 * Issue a bulk_group cp.async.bulk copy of 'size' bytes from 'src'
 * (converted to a shared-space address) to 'dst'.  The copy is neither
 * committed nor waited on here.
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_s2gp(const void *src, void *dst, int size)
{
  const unsigned int smem_src = static_cast<unsigned int>(__cvta_generic_to_shared(src));

  asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;"
      :: "l"(dst), "r"(smem_src), "r"(size)
      : "memory");
}

/*
 * Issue cp.async.bulk.commit_group.  No operands and no clobbers.
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_commit_group()
{
  asm volatile("cp.async.bulk.commit_group;" : : :);
}

/*
 * Issue cp.async.bulk.wait_group with an immediate operand of 0.
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_wait_group()
{
  asm volatile("cp.async.bulk.wait_group %0;" : : "n"(0) : "memory");
}

/*
 * Bulk store of 'size' bytes from 'src' (converted to a shared-space
 * address) to 'dst', done in three steps: issue the bulk_group copy,
 * commit the group, then wait for group 0.
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_s2g(const void *src, void *dst, int size)
{
  const unsigned int smem_src = static_cast<unsigned int>(__cvta_generic_to_shared(src));

  /* 1. start the copy */
  asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;"
      :: "l"(dst), "r"(smem_src), "r"(size)
      : "memory");

  /* 2. commit it */
  asm volatile("cp.async.bulk.commit_group;" : : :);

  /* 3. wait, so that the shared-memory source can be reused */
  asm volatile("cp.async.bulk.wait_group %0;" : : "n"(0) : "memory");
}

/*
 * Bulk store wrapper: scales 'size' by 4 and forwards to
 * __nvhpc_tma_bulk_s2g().
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_sti4(const void *src, void *dst, int size)
{
  const int nbytes = size * 4;
  __nvhpc_tma_bulk_s2g(src, dst, nbytes);
}

/*
 * Bulk store wrapper: scales 'size' by 8 and forwards to
 * __nvhpc_tma_bulk_s2g().
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_sti8(const void *src, void *dst, int size)
{
  const int nbytes = size * 8;
  __nvhpc_tma_bulk_s2g(src, dst, nbytes);
}

/*
 * Bulk store wrapper: scales 'size' by 2 and forwards to
 * __nvhpc_tma_bulk_s2g().
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_str2(const void *src, void *dst, int size)
{
  const int nbytes = size * 2;
  __nvhpc_tma_bulk_s2g(src, dst, nbytes);
}

/*
 * Bulk store wrapper: scales 'size' by 4 and forwards to
 * __nvhpc_tma_bulk_s2g().
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_str4(const void *src, void *dst, int size)
{
  const int nbytes = size * 4;
  __nvhpc_tma_bulk_s2g(src, dst, nbytes);
}

/*
 * Bulk store wrapper: scales 'size' by 8 and forwards to
 * __nvhpc_tma_bulk_s2g().
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_str8(const void *src, void *dst, int size)
{
  const int nbytes = size * 8;
  __nvhpc_tma_bulk_s2g(src, dst, nbytes);
}

/*
 * Bulk store wrapper: scales 'size' by 8 and forwards to
 * __nvhpc_tma_bulk_s2g().
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_stc4(const void *src, void *dst, int size)
{
  const int nbytes = size * 8;
  __nvhpc_tma_bulk_s2g(src, dst, nbytes);
}

/*
 * Bulk store wrapper: scales 'size' by 16 and forwards to
 * __nvhpc_tma_bulk_s2g().
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_tma_bulk_stc8(const void *src, void *dst, int size)
{
  const int nbytes = size * 16;
  __nvhpc_tma_bulk_s2g(src, dst, nbytes);
}

/*
 * Issue mbarrier.init on 'barrier' (converted to a shared-space address).
 * 'count' is the number of threads participating in the barrier.
 * No proxy fence is issued here.
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_barrier_initp(__mbarrier_t *barrier, unsigned int count)
{
    const unsigned int smem_bar = static_cast<unsigned int>(__cvta_generic_to_shared(barrier));

    asm volatile("mbarrier.init.shared.b64 [%0], %1;"
                 :
                 : "r"(smem_bar), "r"(count)
                 : "memory");
}

/*
 * Issue the PTX fence.proxy.async.shared::cta instruction.
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_fence_proxy_async()
{
    asm(
        "fence.proxy.async.shared::cta;"
    );
}

/*
 * Issue mbarrier.init on 'barrier' (converted to a shared-space address)
 * with 'count', followed by fence.proxy.async.shared::cta.
 */
NVHPC_UTILRT_DEV_SIG void
__nvhpc_barrier_init(__mbarrier_t *barrier, unsigned int count)
{
    const unsigned int smem_bar = static_cast<unsigned int>(__cvta_generic_to_shared(barrier));

    asm volatile("mbarrier.init.shared.b64 [%0], %1;"
                 :
                 : "r"(smem_bar), "r"(count)
                 : "memory");

    asm("fence.proxy.async.shared::cta;");
}

/*
 * Issue mbarrier.arrive.expect_tx on 'barrier' (converted to a shared-space
 * address) and return the token the instruction produces.
 * 'tx_count' is the transaction count: the expected number of bytes to
 * arrive.
 */
NVHPC_UTILRT_DEV_SIG __mbarrier_token_t
__nvhpc_barrier_arrive(__mbarrier_t *barrier, unsigned int tx_count)
{
    const unsigned int smem_bar = static_cast<unsigned int>(__cvta_generic_to_shared(barrier));
    __mbarrier_token_t state;

    asm volatile("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
                 : "=l"(state)
                 : "r"(smem_bar), "r"(tx_count)
                 : "memory");
    return state;
}

/*
 * Issue a plain mbarrier.arrive (no expect_tx) on 'barrier' (converted to a
 * shared-space address) with a count operand of 1, and return the token the
 * instruction produces.
 */
NVHPC_UTILRT_DEV_SIG __mbarrier_token_t
__nvhpc_barrier_arrive_nocnt(__mbarrier_t *barrier)
{
    const unsigned int smem_bar = static_cast<unsigned int>(__cvta_generic_to_shared(barrier));
    /* Third operand of mbarrier.arrive: this call counts as one arrival. */
    const unsigned int arrive_count = 1;
    __mbarrier_token_t state;

    asm volatile("mbarrier.arrive.shared::cta.b64 %0, [%1], %2;"
                 : "=l"(state)
                 : "r"(smem_bar), "r"(arrive_count)
                 : "memory");
    return state;
}

/*
 * Issue one mbarrier.try_wait on 'barrier' (converted to a shared-space
 * address) for 'token', passing 'ns' as the instruction's fourth operand.
 * Returns 1 when the resulting predicate is set, 0 otherwise.
 */
NVHPC_UTILRT_DEV_SIG unsigned int
__nvhpc_barrier_try_wait_sleep(__mbarrier_t *barrier,
                           __mbarrier_token_t token, int ns) {
    const unsigned int smem_bar = static_cast<unsigned int>(__cvta_generic_to_shared(barrier));
    const uint32_t time_hint = static_cast<uint32_t>(ns);
    unsigned int done = 0;

    /* The predicate result is turned into 0/1 with selp. */
    asm volatile("{\n\t"
                 ".reg .pred p;\n\t"
                 "mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n\t"
                 "selp.b32 %0, 1, 0, p;\n\t"
                 "}"
                 : "=r"(done)
                 : "r"(smem_bar), "l"(token), "r"(time_hint)
                 : "memory");
    return done;
}

/*
 * Block until 'barrier' reports ready for 'token': calls
 * __nvhpc_barrier_try_wait_sleep() with a hint of 1000000 repeatedly until
 * it returns nonzero, and returns that nonzero value.
 */
NVHPC_UTILRT_DEV_SIG unsigned int
__nvhpc_barrier_try_wait(__mbarrier_t *barrier, __mbarrier_token_t token) {
    const unsigned int hint_ns = 1000000;
    volatile unsigned int done = 0;

    /* At least one attempt is always made, since done starts at 0. */
    do {
      done = __nvhpc_barrier_try_wait_sleep(barrier, token, hint_ns);
    } while (! done);

    return done;
}
/* === End of Bulk loads and stores, and associated Barriers. */
#endif

#endif
