/*******************************************************************************
* Copyright 2023 Intel Corporation.
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra     (dongarra@eecs.utk.edu)
// Piotr Luszczek    (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

#ifndef __ESIMD_HELPERS_HPP__
#define __ESIMD_HELPERS_HPP__

#include <sycl/ext/intel/esimd.hpp>

//
// The LSC apis for memory management on DG2, PVC +
//
// sycl::ext::intel::experimental::esimd::cache_hint : uint8_t:
//
//    none, uncached, cached, write_back, write_through, streaming, read_invalidate
//

namespace {

namespace esimd     = sycl::ext::intel::esimd;
namespace esimd_exp = sycl::ext::intel::experimental::esimd;

using cache_hint = esimd_exp::cache_hint;
using ds         = esimd_exp::lsc_data_size;
using lsc_atomic_op = esimd::native::lsc::atomic_op;

static constexpr auto nc = cache_hint::none;
static constexpr auto uc = cache_hint::uncached;
static constexpr auto ca = cache_hint::cached;
static constexpr auto st = cache_hint::streaming;
static constexpr auto wt = cache_hint::write_through;
static constexpr auto wb = cache_hint::write_back;

/******************************************************************************/
/******************************************************************************/
/******************************************************************************/
// SLM API WRAPPERS

//
// SLM scalar load
//
template <typename T, typename IntType, ds DS = ds::default_size>
static T esimd_lsc_slm_scalar_load(const IntType &offset)
{
    const std::uint32_t offsetBytes = offset * sizeof(T);
    const esimd::simd<T, 1> Res = esimd_exp::lsc_slm_block_load<T, 1, DS>(offsetBytes);
    return static_cast<T>(Res[0]);
}

//
// SLM block load
//
// Reads block_size consecutive elements of type T from SLM.
//
//   T           element type
//   IntType     type of the element index
//   block_size  number of elements to load (default 8)
//   DS          LSC data-size selector forwarded to lsc_slm_block_load
//   offset      index of the first element; scaled by sizeof(T) into a
//               32-bit byte offset
//
// Returns the vector produced by esimd_exp::lsc_slm_block_load.
//
template <typename T, typename IntType, int block_size = 8, ds DS = ds::default_size>
static esimd::simd<T, block_size> esimd_lsc_slm_block_load(const IntType &offset)
{
    constexpr auto elemBytes = sizeof(T);
    const std::uint32_t slmByteOffset = offset * elemBytes;

    return esimd_exp::lsc_slm_block_load<T, block_size, DS>(slmByteOffset);
}

//
// SLM scalar store
//
template <typename T, typename IntType, ds DS = ds::default_size>
static void esimd_lsc_slm_scalar_store(const IntType &offset, const T &val)
{
    const std::uint32_t offsetBytes = offset * sizeof(T);
    esimd_exp::lsc_slm_block_store<T, 1, DS>(offsetBytes, esimd::simd<T, 1>(val));
}

//
// SLM block store
//
// Writes block_size consecutive elements of type T to SLM.
//
//   T           element type
//   IntType     type of the element index
//   block_size  number of elements to store (default 8)
//   DS          LSC data-size selector forwarded to lsc_slm_block_store
//   offset      index of the first element; scaled by sizeof(T) into a
//               32-bit byte offset
//   val         vector of values to store
//
template <typename T, typename IntType, int block_size = 8, ds DS = ds::default_size>
static void esimd_lsc_slm_block_store(const IntType &offset, const esimd::simd<T, block_size> &val)
{
    constexpr auto elemBytes = sizeof(T);
    const std::uint32_t slmByteOffset = offset * elemBytes;

    esimd_exp::lsc_slm_block_store<T, block_size, DS>(slmByteOffset, val);
}

//
// SLM gather with optional mask
//
// Gathers one element of type T per lane from SLM.
//
//   T           element type
//   IntType     element type of the index vector
//   block_size  number of lanes (default 8)
//   DS          LSC data-size selector forwarded to lsc_slm_gather
//   offsets     per-lane element indices; scaled by sizeof(T) into 32-bit
//               byte offsets
//   mask        per-lane predicate forwarded to lsc_slm_gather (default 1)
//
// Returns the vector produced by esimd_exp::lsc_slm_gather, called with one
// element per offset (second template argument is 1).
//
template <typename T,
          typename IntType,
          int block_size = 8,
          ds DS          = ds::default_size>
static esimd::simd<T, block_size> esimd_lsc_slm_gather(const esimd::simd<IntType, block_size> &offsets,
                                                       const esimd::simd_mask<block_size> &mask = 1)
{
    using ByteOffsets = esimd::simd<uint32_t, block_size>;

    const ByteOffsets slmByteOffsets = offsets * sizeof(T);
    return esimd_exp::lsc_slm_gather<T, 1, DS, block_size>(slmByteOffsets, mask);
}


//
// SLM gather with mask and pass_thru
//
// Same as the masked SLM gather, but additionally forwards a pass_thru vector
// to esimd_exp::lsc_slm_gather.
//
//   T           element type
//   IntType     element type of the index vector
//   block_size  number of lanes (default 8)
//   DS          LSC data-size selector forwarded to lsc_slm_gather
//   offsets     per-lane element indices; scaled by sizeof(T) into 32-bit
//               byte offsets
//   mask        per-lane predicate forwarded to lsc_slm_gather
//   pass_thru   forwarded unchanged; presumably supplies the result for
//               lanes disabled by mask -- confirm against the ESIMD docs
//
// Returns the vector produced by esimd_exp::lsc_slm_gather.
//
template <typename T,
          typename IntType,
          int block_size = 8,
          ds DS          = ds::default_size>
static esimd::simd<T, block_size> esimd_lsc_slm_gather(const esimd::simd<IntType, block_size> &offsets,
                                                       const esimd::simd_mask<block_size> &mask,
                                                       const esimd::simd<T, block_size> &pass_thru)
{
    using ByteOffsets = esimd::simd<uint32_t, block_size>;

    const ByteOffsets slmByteOffsets = offsets * sizeof(T);
    return esimd_exp::lsc_slm_gather<T, 1, DS, block_size>(slmByteOffsets, mask, pass_thru);
}



//
// SLM scatter with optional mask
//
// Scatters one element of type T per lane into SLM.
//
//   T           element type
//   IntType     element type of the index vector
//   block_size  number of lanes (default 8)
//   DS          LSC data-size selector forwarded to lsc_slm_scatter
//   offsets     per-lane element indices; scaled by sizeof(T) into 32-bit
//               byte offsets
//   vals        per-lane values to store
//   mask        per-lane predicate forwarded to lsc_slm_scatter (default 1)
//
template <typename T,
          typename IntType,
          int block_size = 8,
          ds DS          = ds::default_size>
static void esimd_lsc_slm_scatter(const esimd::simd<IntType, block_size> &offsets,
                                  const esimd::simd<T, block_size> &vals,
                                  const esimd::simd_mask<block_size> &mask = 1)
{
    using ByteOffsets = esimd::simd<uint32_t, block_size>;

    const ByteOffsets slmByteOffsets = offsets * sizeof(T);
    esimd_exp::lsc_slm_scatter<T, 1, DS, block_size>(slmByteOffsets, vals, mask);
}

//
// SLM atomic update APIs
//
// Atomic update on SLM with no source operand.
//
//   Op          atomic operation to perform
//   T           element type
//   IntType     element type of the index vector
//   block_size  number of lanes (default 8)
//   DS          LSC data-size selector; forwarded to lsc_slm_atomic_update so
//               that a non-default size requested by the caller is honoured
//   offsets     per-lane element indices; scaled by sizeof(T) into 32-bit
//               byte offsets
//   mask        per-lane predicate forwarded to lsc_slm_atomic_update
//               (default 1)
//
// Returns the vector produced by esimd_exp::lsc_slm_atomic_update (per the
// ESIMD documentation, the values held in memory before the update).
//
template <esimd::atomic_op Op,
          typename T,
          typename IntType,
          int block_size = 8,
          ds DS = ds::default_size>
static esimd::simd<T, block_size> esimd_lsc_slm_atomic_update(const esimd::simd<IntType, block_size> &offsets,
                                                              const esimd::simd_mask<block_size> &mask = 1)
{
    const esimd::simd<std::uint32_t, block_size> offsetBytes = offsets * sizeof(T);
    return esimd_exp::lsc_slm_atomic_update<Op, T, block_size, DS>(offsetBytes, mask);
}

//
// Atomic update on SLM with one source operand.
//
//   Op          atomic operation to perform
//   T           element type
//   IntType     element type of the index vector
//   block_size  number of lanes (default 8)
//   DS          LSC data-size selector; forwarded to lsc_slm_atomic_update so
//               that a non-default size requested by the caller is honoured
//   offsets     per-lane element indices; scaled by sizeof(T) into 32-bit
//               byte offsets
//   src0        per-lane operand of the atomic operation
//   mask        per-lane predicate forwarded to lsc_slm_atomic_update
//               (default 1)
//
// Returns the vector produced by esimd_exp::lsc_slm_atomic_update (per the
// ESIMD documentation, the values held in memory before the update).
//
template <esimd::atomic_op Op,
          typename T,
          typename IntType,
          int block_size = 8,
          ds DS = ds::default_size>
static esimd::simd<T, block_size> esimd_lsc_slm_atomic_update(const esimd::simd<IntType, block_size> &offsets,
                                                              const esimd::simd<T, block_size> &src0,
                                                              const esimd::simd_mask<block_size> &mask = 1)
{
    const esimd::simd<std::uint32_t, block_size> offsetBytes = offsets * sizeof(T);
    return esimd_exp::lsc_slm_atomic_update<Op, T, block_size, DS>(offsetBytes, src0, mask);
}

//
// Atomic update on SLM with two source operands.
//
//   Op          atomic operation to perform
//   T           element type
//   IntType     element type of the index vector
//   block_size  number of lanes (default 8)
//   DS          LSC data-size selector; forwarded to lsc_slm_atomic_update so
//               that a non-default size requested by the caller is honoured
//   offsets     per-lane element indices; scaled by sizeof(T) into 32-bit
//               byte offsets
//   src0, src1  per-lane operands, passed through in this order
//   mask        per-lane predicate forwarded to lsc_slm_atomic_update
//               (default 1)
//
// Returns the vector produced by esimd_exp::lsc_slm_atomic_update (per the
// ESIMD documentation, the values held in memory before the update).
//
template <esimd::atomic_op Op,
          typename T,
          typename IntType,
          int block_size = 8,
          ds DS = ds::default_size>
static esimd::simd<T, block_size> esimd_lsc_slm_atomic_update(const esimd::simd<IntType, block_size> &offsets,
                                                              const esimd::simd<T, block_size> &src0,
                                                              const esimd::simd<T, block_size> &src1,
                                                              const esimd::simd_mask<block_size> &mask = 1)
{
    const esimd::simd<std::uint32_t, block_size> offsetBytes = offsets * sizeof(T);
    return esimd_exp::lsc_slm_atomic_update<Op, T, block_size, DS>(offsetBytes, src0, src1, mask);
}

/******************************************************************************/
/******************************************************************************/
/******************************************************************************/
// GLOBAL MEMORY API WRAPPERS

//
// scalar load
//
template <typename T,
          typename IntType,
          cache_hint L1H = cache_hint::none,
          cache_hint L3H = cache_hint::none,
          ds DS          = ds::default_size>
static T esimd_lsc_scalar_load(const T *acc, const IntType &offset)
{
    const esimd::simd<T, 1> Res = esimd_exp::lsc_block_load<T, 1, DS, L1H, L3H>(acc + offset);
    return static_cast<T>(Res[0]);
}

//
// scalar store
//
template <typename T,
          typename IntType,
          cache_hint L1H = cache_hint::none,
          cache_hint L3H = cache_hint::none,
          ds DS          = ds::default_size>
static void esimd_lsc_scalar_store(T *acc, const IntType &offset, const T &val)
{
    esimd_exp::lsc_block_store<T, 1, DS, L1H, L3H>(acc + offset, esimd::simd<T, 1>(val));
}

//
// Global block load
//
// Reads block_size consecutive elements of type T starting at acc[offset].
//
// Alignment: the byte offset offset * sizeof(T) is NOT assumed to be oword
// (16-byte) aligned. Non-LSC block loads assume vector-size alignment unless
// told otherwise (esimd::vector_aligned_tag is the esimd default), and that
// assumption may give incorrect results for addresses that are only
// element-aligned. The intent here is element-size alignment, which should
// carry no performance penalty.
// NOTE(review): this wrapper passes no alignment tag to lsc_block_load;
// confirm that the LSC path only requires element alignment.
//
//   T           element type
//   IntType     type of the element index
//   block_size  number of elements to load (default 8)
//   L1H/L3H     cache hints forwarded to lsc_block_load (default none)
//   DS          LSC data-size selector forwarded to lsc_block_load
//   acc         base pointer
//   offset      index of the first element (plain pointer arithmetic)
//
// Returns the vector produced by esimd_exp::lsc_block_load.
//
template <typename T,
          typename IntType,
          int block_size = 8,
          cache_hint L1H = cache_hint::none,
          cache_hint L3H = cache_hint::none,
          ds DS          = ds::default_size>
static esimd::simd<T, block_size> esimd_lsc_block_load(const T *acc, const IntType &offset)
{
    const T *const src = acc + offset;
    return esimd_exp::lsc_block_load<T, block_size, DS, L1H, L3H>(src);
}

//
// Global block store
//
// Writes block_size consecutive elements of type T starting at acc[offset].
//
//   T           element type
//   IntType     type of the element index
//   block_size  number of elements to store (default 8)
//   L1H/L3H     cache hints forwarded to lsc_block_store (default none)
//   DS          LSC data-size selector forwarded to lsc_block_store
//   acc         base pointer
//   offset      index of the first element (plain pointer arithmetic)
//   vals        vector of values to store
//
template <typename T,
          typename IntType,
          int block_size = 8,
          cache_hint L1H = cache_hint::none,
          cache_hint L3H = cache_hint::none,
          ds DS          = ds::default_size>
static void
esimd_lsc_block_store(T *acc, const IntType &offset, const esimd::simd<T, block_size> &vals)
{
    T *const dst = acc + offset;
    esimd_exp::lsc_block_store<T, block_size, DS, L1H, L3H>(dst, vals);
}

//
// Global gather with optional mask
//
// Gathers one element of type T per lane from memory based at acc.
//
//   T           element type
//   IntType     element type of the index vector
//   block_size  number of lanes (default 8)
//   L1H/L3H     cache hints forwarded to lsc_gather (default none)
//   DS          LSC data-size selector forwarded to lsc_gather
//   acc         base pointer
//   offsets     per-lane element indices relative to acc; scaled by
//               sizeof(T) into 64-bit byte offsets
//   mask        per-lane predicate forwarded to lsc_gather (default 1)
//
// Returns the vector produced by esimd_exp::lsc_gather, called with one
// element per offset (second template argument is 1).
//
template <typename T,
          typename IntType,
          int block_size = 8,
          cache_hint L1H = cache_hint::none,
          cache_hint L3H = cache_hint::none,
          ds DS          = ds::default_size>
static esimd::simd<T, block_size> esimd_lsc_gather(const T *acc,
                                                   const esimd::simd<IntType, block_size> &offsets,
                                                   const esimd::simd_mask<block_size> &mask = 1)
{
    using ByteOffsets = esimd::simd<uint64_t, block_size>;

    const ByteOffsets byteOffsets = offsets * sizeof(T);
    return esimd_exp::lsc_gather<T, 1, DS, L1H, L3H, block_size>(acc, byteOffsets, mask);
}

//
// Global gather with mask and pass_thru
//
// Same as the masked global gather, but additionally forwards a pass_thru
// vector to esimd_exp::lsc_gather.
//
//   T           element type
//   IntType     element type of the index vector
//   block_size  number of lanes (default 8)
//   L1H/L3H     cache hints forwarded to lsc_gather (default none)
//   DS          LSC data-size selector forwarded to lsc_gather
//   acc         base pointer
//   offsets     per-lane element indices relative to acc; scaled by
//               sizeof(T) into 64-bit byte offsets
//   mask        per-lane predicate forwarded to lsc_gather
//   pass_thru   forwarded unchanged; presumably supplies the result for
//               lanes disabled by mask -- confirm against the ESIMD docs
//
// Returns the vector produced by esimd_exp::lsc_gather.
//
template <typename T,
          typename IntType,
          int block_size = 8,
          cache_hint L1H = cache_hint::none,
          cache_hint L3H = cache_hint::none,
          ds DS          = ds::default_size>
static esimd::simd<T, block_size> esimd_lsc_gather(const T *acc,
                                                   const esimd::simd<IntType, block_size> &offsets,
                                                   const esimd::simd_mask<block_size> &mask,
                                                   const esimd::simd<T, block_size> &pass_thru)
{
    using ByteOffsets = esimd::simd<uint64_t, block_size>;

    const ByteOffsets byteOffsets = offsets * sizeof(T);
    return esimd_exp::lsc_gather<T, 1, DS, L1H, L3H, block_size>(acc, byteOffsets, mask, pass_thru);
}
//
// Global scatter with optional mask
//
// Scatters one element of type T per lane into memory based at acc.
//
//   T           element type
//   IntType     element type of the index vector
//   block_size  number of lanes (default 8)
//   L1H/L3H     cache hints forwarded to lsc_scatter (default none)
//   DS          LSC data-size selector forwarded to lsc_scatter
//   acc         base pointer
//   offsets     per-lane element indices relative to acc; scaled by
//               sizeof(T) into 64-bit byte offsets
//   vals        per-lane values to store
//   mask        per-lane predicate forwarded to lsc_scatter (default 1)
//
template <typename T,
          typename IntType,
          int block_size = 8,
          cache_hint L1H = cache_hint::none,
          cache_hint L3H = cache_hint::none,
          ds DS          = ds::default_size>
static void esimd_lsc_scatter(T *acc,
                              const esimd::simd<IntType, block_size> &offsets,
                              const esimd::simd<T, block_size> &vals,
                              const esimd::simd_mask<block_size> &mask = 1)
{
    using ByteOffsets = esimd::simd<uint64_t, block_size>;

    const ByteOffsets byteOffsets = offsets * sizeof(T);
    esimd_exp::lsc_scatter<T, 1, DS, L1H, L3H, block_size>(acc, byteOffsets, vals, mask);
}

//
// LSC Prefetch instructions
//
// Single-address prefetch: issues esimd_exp::lsc_prefetch for NElts elements
// of type T starting at acc[offset].
//
//   T        element type
//   IntType  type of the element index
//   NElts    element count forwarded to lsc_prefetch (default 1)
//   L1H/L3H  cache hints forwarded to lsc_prefetch (default none)
//            NOTE(review): some ESIMD releases may reject none/none for
//            prefetch -- confirm, callers may need to pass explicit hints
//   DS       LSC data-size selector forwarded to lsc_prefetch
//   acc      base pointer
//   offset   element index added to acc (plain pointer arithmetic)
//
template <typename T,
          typename IntType,
          uint8_t NElts  = 1,
          cache_hint L1H = cache_hint::none,
          cache_hint L3H = cache_hint::none,
          ds DS          = ds::default_size>
static void esimd_lsc_prefetch(const T *acc, const IntType &offset)
{
    const T *const target = acc + offset;
    esimd_exp::lsc_prefetch<T, NElts, DS, L1H, L3H>(target);
}

//
// Per-lane prefetch: issues esimd_exp::lsc_prefetch for NElts elements of type
// T at each lane's offset from acc.
//
//   T           element type
//   IntType     element type of the index vector
//   block_size  number of lanes (no default; deduced from offsets or given
//               explicitly)
//   NElts       per-offset element count forwarded to lsc_prefetch (default 1)
//   L1H/L3H     cache hints forwarded to lsc_prefetch (default none)
//               NOTE(review): some ESIMD releases may reject none/none for
//               prefetch -- confirm, callers may need to pass explicit hints
//   DS          LSC data-size selector forwarded to lsc_prefetch
//   acc         base pointer
//   offsets     per-lane element indices relative to acc; scaled by
//               sizeof(T) into 64-bit byte offsets
//   mask        per-lane predicate forwarded to lsc_prefetch (default 1)
//
template <typename T,
          typename IntType,
          int block_size,
          uint8_t NElts  = 1,
          cache_hint L1H = cache_hint::none,
          cache_hint L3H = cache_hint::none,
          ds DS          = ds::default_size>
static void esimd_lsc_prefetch(const T *acc,
                               const esimd::simd<IntType, block_size> &offsets,
                               const esimd::simd_mask<block_size> &mask = 1)
{
    using ByteOffsets = esimd::simd<uint64_t, block_size>;

    const ByteOffsets byteOffsets = offsets * sizeof(T);
    esimd_exp::lsc_prefetch<T, NElts, DS, L1H, L3H, block_size>(acc, byteOffsets, mask);
}


//
// global  atomic update APIs
//
// Atomic update on memory based at p with no source operand.
//
//   Op          atomic operation to perform
//   T           element type
//   IntType     element type of the index vector
//   block_size  number of lanes (default 1)
//   DS          LSC data-size selector; forwarded to lsc_atomic_update so
//               that a non-default size requested by the caller is honoured
//   p           base pointer
//   offsets     per-lane element indices relative to p; scaled by sizeof(T)
//               into 64-bit byte offsets
//   mask        per-lane predicate forwarded to lsc_atomic_update (default 1)
//
// Returns the vector produced by esimd_exp::lsc_atomic_update (per the ESIMD
// documentation, the values held in memory before the update).
//
template <esimd::atomic_op Op,
          typename T,
          typename IntType,
          int block_size = 1,
          ds DS = ds::default_size>
static esimd::simd<T, block_size> esimd_lsc_atomic_update(T * p,
                                                          const esimd::simd<IntType, block_size> &offsets,
                                                          const esimd::simd_mask<block_size> &mask = 1)
{
    const esimd::simd<uint64_t, block_size> offsetBytes = offsets * sizeof(T);
    return esimd_exp::lsc_atomic_update<Op, T, block_size, DS>(p, offsetBytes, mask);
}

//
// Atomic update on memory based at p with one source operand.
//
//   Op          atomic operation to perform
//   T           element type
//   IntType     element type of the index vector
//   block_size  number of lanes (default 1)
//   DS          LSC data-size selector; forwarded to lsc_atomic_update so
//               that a non-default size requested by the caller is honoured
//   p           base pointer
//   offsets     per-lane element indices relative to p; scaled by sizeof(T)
//               into 64-bit byte offsets
//   src0        per-lane operand of the atomic operation
//   mask        per-lane predicate forwarded to lsc_atomic_update (default 1)
//
// Returns the vector produced by esimd_exp::lsc_atomic_update (per the ESIMD
// documentation, the values held in memory before the update).
//
template <esimd::atomic_op Op,
          typename T,
          typename IntType,
          int block_size = 1,
          ds DS = ds::default_size>
static esimd::simd<T, block_size> esimd_lsc_atomic_update(T *p,
                                                          const esimd::simd<IntType, block_size> &offsets,
                                                          const esimd::simd<T, block_size> &src0,
                                                          const esimd::simd_mask<block_size> &mask = 1)
{
    const esimd::simd<uint64_t, block_size> offsetBytes = offsets * sizeof(T);
    return esimd_exp::lsc_atomic_update<Op, T, block_size, DS>(p, offsetBytes, src0, mask);
}

//
// Atomic update on memory based at p with two source operands.
//
//   Op          atomic operation to perform
//   T           element type
//   IntType     element type of the index vector
//   block_size  number of lanes (default 1)
//   DS          LSC data-size selector; forwarded to lsc_atomic_update so
//               that a non-default size requested by the caller is honoured
//   p           base pointer
//   offsets     per-lane element indices relative to p; scaled by sizeof(T)
//               into 64-bit byte offsets
//   src0, src1  per-lane operands, passed through in this order
//   mask        per-lane predicate forwarded to lsc_atomic_update (default 1)
//
// Returns the vector produced by esimd_exp::lsc_atomic_update (per the ESIMD
// documentation, the values held in memory before the update).
//
template <esimd::atomic_op Op,
          typename T,
          typename IntType,
          int block_size = 1,
          ds DS = ds::default_size>
static esimd::simd<T, block_size> esimd_lsc_atomic_update(T *p,
                                                          const esimd::simd<IntType, block_size> &offsets,
                                                          const esimd::simd<T, block_size> &src0,
                                                          const esimd::simd<T, block_size> &src1,
                                                          const esimd::simd_mask<block_size> &mask = 1)
{
    const esimd::simd<uint64_t, block_size> offsetBytes = offsets * sizeof(T);
    return esimd_exp::lsc_atomic_update<Op, T, block_size, DS>(p, offsetBytes, src0, src1, mask);
}


} // anonymous namespace

#endif // #ifndef __ESIMD_HELPERS_HPP__
