; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: not llc < %s -mtriple=nvptx -mattr=+ptx72 -mcpu=sm_52 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
; RUN: not llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_50 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS

; RUN: llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK-32
; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK-64
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}

; CHECK-FAILS: in function test_dynamic_stackalloc{{.*}}: Support for dynamic alloca introduced in PTX ISA version 7.3 and requires target sm_52.

; Dynamic alloca with an explicit 16-byte alignment whose address escapes into
; a call.
;
; The function allocates %n bytes on the stack (the size is a runtime value,
; not a constant), hands the resulting pointer to @bar, and returns @bar's i32
; result.
;
; What the expectations below pin down for both the 32-bit and 64-bit targets:
;   * the byte count is rounded up to a multiple of 8 (add 7, then and -8);
;   * the PTX alloca instruction is emitted with 16 as its alignment operand,
;     matching the `align 16` in the IR;
;   * the alloca result goes through cvta.local before it is stored as the
;     outgoing pointer parameter of the call to bar.
; On the 32-bit target the i64 parameter is read with a 32-bit load and all of
; the size arithmetic is done in 32-bit registers.
define i32 @test_dynamic_stackalloc(i64 %n) {
; CHECK-32-LABEL: test_dynamic_stackalloc(
; CHECK-32:       {
; CHECK-32-NEXT:    .reg .b32 %r<8>;
; CHECK-32-EMPTY:
; CHECK-32-NEXT:  // %bb.0:
; CHECK-32-NEXT:    ld.param.b32 %r1, [test_dynamic_stackalloc_param_0];
; CHECK-32-NEXT:    add.s32 %r2, %r1, 7;
; CHECK-32-NEXT:    and.b32 %r3, %r2, -8;
; CHECK-32-NEXT:    alloca.u32 %r4, %r3, 16;
; CHECK-32-NEXT:    cvta.local.u32 %r5, %r4;
; CHECK-32-NEXT:    { // callseq 0, 0
; CHECK-32-NEXT:    .param .b32 param0;
; CHECK-32-NEXT:    st.param.b32 [param0], %r5;
; CHECK-32-NEXT:    .param .b32 retval0;
; CHECK-32-NEXT:    call.uni (retval0), bar, (param0);
; CHECK-32-NEXT:    ld.param.b32 %r6, [retval0];
; CHECK-32-NEXT:    } // callseq 0
; CHECK-32-NEXT:    st.param.b32 [func_retval0], %r6;
; CHECK-32-NEXT:    ret;
;
; CHECK-64-LABEL: test_dynamic_stackalloc(
; CHECK-64:       {
; CHECK-64-NEXT:    .reg .b32 %r<3>;
; CHECK-64-NEXT:    .reg .b64 %rd<6>;
; CHECK-64-EMPTY:
; CHECK-64-NEXT:  // %bb.0:
; CHECK-64-NEXT:    ld.param.b64 %rd1, [test_dynamic_stackalloc_param_0];
; CHECK-64-NEXT:    add.s64 %rd2, %rd1, 7;
; CHECK-64-NEXT:    and.b64 %rd3, %rd2, -8;
; CHECK-64-NEXT:    alloca.u64 %rd4, %rd3, 16;
; CHECK-64-NEXT:    cvta.local.u64 %rd5, %rd4;
; CHECK-64-NEXT:    { // callseq 0, 0
; CHECK-64-NEXT:    .param .b64 param0;
; CHECK-64-NEXT:    st.param.b64 [param0], %rd5;
; CHECK-64-NEXT:    .param .b32 retval0;
; CHECK-64-NEXT:    call.uni (retval0), bar, (param0);
; CHECK-64-NEXT:    ld.param.b32 %r1, [retval0];
; CHECK-64-NEXT:    } // callseq 0
; CHECK-64-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-64-NEXT:    ret;
  ; Runtime-sized allocation: %n elements of i8, i.e. %n bytes, 16-byte aligned.
  %alloca = alloca i8, i64 %n, align 16
  ; Passing the pointer to @bar gives the allocation a use outside this function.
  %call = call i32 @bar(ptr %alloca)
  ret i32 %call
}

; Dynamic alloca of %0 floats that only requests 4-byte alignment, followed by
; a load of the first element directly from the fresh allocation.
;
; What the expectations below pin down for both the 32-bit and 64-bit targets:
;   * the element count is turned into a byte count with a left shift by 2
;     (4 bytes per float) and then rounded up to a multiple of 8
;     (add 7, then and -8);
;   * the PTX alloca instruction is emitted with 8 as its alignment operand,
;     even though the IR only asks for `align 4`;
;   * the load is an ld.local that uses the alloca result as its address, with
;     no cvta.local in between (unlike the case where the pointer is passed to
;     a call).
; The buffer is never stored to before it is read, so the returned value is not
; meaningful; the test is only about the emitted instruction sequence.
define float @test_dynamic_stackalloc_unaligned(i64 %0) {
; CHECK-32-LABEL: test_dynamic_stackalloc_unaligned(
; CHECK-32:       {
; CHECK-32-NEXT:    .reg .b32 %r<7>;
; CHECK-32-EMPTY:
; CHECK-32-NEXT:  // %bb.0:
; CHECK-32-NEXT:    ld.param.b32 %r1, [test_dynamic_stackalloc_unaligned_param_0];
; CHECK-32-NEXT:    shl.b32 %r2, %r1, 2;
; CHECK-32-NEXT:    add.s32 %r3, %r2, 7;
; CHECK-32-NEXT:    and.b32 %r4, %r3, -8;
; CHECK-32-NEXT:    alloca.u32 %r5, %r4, 8;
; CHECK-32-NEXT:    ld.local.b32 %r6, [%r5];
; CHECK-32-NEXT:    st.param.b32 [func_retval0], %r6;
; CHECK-32-NEXT:    ret;
;
; CHECK-64-LABEL: test_dynamic_stackalloc_unaligned(
; CHECK-64:       {
; CHECK-64-NEXT:    .reg .b32 %r<2>;
; CHECK-64-NEXT:    .reg .b64 %rd<6>;
; CHECK-64-EMPTY:
; CHECK-64-NEXT:  // %bb.0:
; CHECK-64-NEXT:    ld.param.b64 %rd1, [test_dynamic_stackalloc_unaligned_param_0];
; CHECK-64-NEXT:    shl.b64 %rd2, %rd1, 2;
; CHECK-64-NEXT:    add.s64 %rd3, %rd2, 7;
; CHECK-64-NEXT:    and.b64 %rd4, %rd3, -8;
; CHECK-64-NEXT:    alloca.u64 %rd5, %rd4, 8;
; CHECK-64-NEXT:    ld.local.b32 %r1, [%rd5];
; CHECK-64-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-64-NEXT:    ret;
  ; Runtime-sized allocation: %0 elements of float, 4-byte aligned.
  %4 = alloca float, i64 %0, align 4
  ; Index 0: the resulting pointer has the same address as %4.
  %5 = getelementptr float, ptr %4, i64 0
  %6 = load float, ptr %5, align 4
  ret float %6
}

; Callee with no body in this file; test_dynamic_stackalloc passes its
; dynamically allocated buffer here so that the pointer escapes through a call.
declare i32 @bar(ptr)

