; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE
; RUN: llc -mattr=+sve2 -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
; RUN: llc -mattr=+sme -force-streaming  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE

; This test only tests the legal types for a given vector width, as mulh nodes
; do not get generated for non-legal types.

target triple = "aarch64-unknown-linux-gnu"

;
; SMULH
;

; <4 x i8> signed multiply computed at i16: sign-extend both operands, multiply,
; logical shift right by 4, truncate back to i8.
; None of the expected outputs below contains an smulh instruction; each one
; multiplies the sign-extended elements and then shifts/extracts from bit 4
; (vector code for the SVE and SVE2 prefixes, scalar code through the stack
; when neither NEON nor SVE is available).
; NOTE(review): the shift amount is 4 rather than the element width 8, so this
; is not literally the high half of the product - presumably intentional for a
; non-legal type (see the note at the top of the file); confirm before changing.
define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
; SVE-LABEL: smulh_v4i8:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.h, vl4
; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE-NEXT:    sxtb z0.h, p0/m, z0.h
; SVE-NEXT:    sxtb z1.h, p0/m, z1.h
; SVE-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; SVE-NEXT:    lsr z0.h, z0.h, #4
; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: smulh_v4i8:
; SVE2:       // %bb.0:
; SVE2-NEXT:    ptrue p0.h, vl4
; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE2-NEXT:    sxtb z0.h, p0/m, z0.h
; SVE2-NEXT:    sxtb z1.h, p0/m, z1.h
; SVE2-NEXT:    mul z0.h, z0.h, z1.h
; SVE2-NEXT:    lsr z0.h, z0.h, #4
; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: smulh_v4i8:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #32
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #22]
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #8]
; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #20]
; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #18]
; NONEON-NOSVE-NEXT:    mul w8, w8, w12
; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #16]
; NONEON-NOSVE-NEXT:    mul w9, w9, w13
; NONEON-NOSVE-NEXT:    mul w10, w10, w14
; NONEON-NOSVE-NEXT:    mul w11, w11, w12
; NONEON-NOSVE-NEXT:    ubfx w8, w8, #4, #12
; NONEON-NOSVE-NEXT:    ubfx w9, w9, #4, #12
; NONEON-NOSVE-NEXT:    ubfx w10, w10, #4, #12
; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
; NONEON-NOSVE-NEXT:    ubfx w8, w11, #4, #12
; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT:    add sp, sp, #32
; NONEON-NOSVE-NEXT:    ret
  ; Widen both operands so the full 16-bit signed product is representable.
  %1 = sext <4 x i8> %op1 to <4 x i16>
  %2 = sext <4 x i8> %op2 to <4 x i16>
  %mul = mul <4 x i16> %1, %2
  ; Shift each product right by 4 and narrow the result back to i8.
  %shr = lshr <4 x i16> %mul, <i16 4, i16 4, i16 4, i16 4>
  %res = trunc <4 x i16> %shr to <4 x i8>
  ret <4 x i8> %res
}

; High half of the signed product for <8 x i8>: sign-extend both operands to
; i16, multiply, shift right by 8 and truncate back to i8.
; Expected output: a predicated smulh under a vl8 ptrue for the SVE prefix, an
; unpredicated smulh for the SVE2 prefix (shared by the +sve2 and +sme runs),
; and a scalar ldrsb/mul/lsr/strb sequence through the stack for the run that
; enables neither feature.
define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
; SVE-LABEL: smulh_v8i8:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.b, vl8
; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: smulh_v8i8:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE2-NEXT:    smulh z0.b, z0.b, z1.b
; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: smulh_v8i8:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #32
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #15]
; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #23]
; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #12]
; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #13]
; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #14]
; NONEON-NOSVE-NEXT:    ldrsb w17, [sp, #22]
; NONEON-NOSVE-NEXT:    mul w15, w15, w16
; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #21]
; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #20]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
; NONEON-NOSVE-NEXT:    mul w14, w14, w17
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
; NONEON-NOSVE-NEXT:    mul w13, w13, w16
; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #11]
; NONEON-NOSVE-NEXT:    ldrsb w17, [sp, #16]
; NONEON-NOSVE-NEXT:    mul w12, w12, w18
; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
; NONEON-NOSVE-NEXT:    ldrsb w0, [sp, #19]
; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #18]
; NONEON-NOSVE-NEXT:    lsr w14, w14, #8
; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #17]
; NONEON-NOSVE-NEXT:    mul w8, w8, w17
; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
; NONEON-NOSVE-NEXT:    mul w11, w11, w0
; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
; NONEON-NOSVE-NEXT:    strb w15, [sp, #31]
; NONEON-NOSVE-NEXT:    mul w10, w10, w16
; NONEON-NOSVE-NEXT:    strb w14, [sp, #30]
; NONEON-NOSVE-NEXT:    mul w9, w9, w18
; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
; NONEON-NOSVE-NEXT:    strb w13, [sp, #29]
; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
; NONEON-NOSVE-NEXT:    strb w12, [sp, #28]
; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
; NONEON-NOSVE-NEXT:    strb w11, [sp, #27]
; NONEON-NOSVE-NEXT:    strb w10, [sp, #26]
; NONEON-NOSVE-NEXT:    strb w9, [sp, #25]
; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT:    add sp, sp, #32
; NONEON-NOSVE-NEXT:    ret
  ; Widen both operands so the full 16-bit signed product is representable.
  %1 = sext <8 x i8> %op1 to <8 x i16>
  %2 = sext <8 x i8> %op2 to <8 x i16>
  %mul = mul <8 x i16> %1, %2
  ; Move the upper byte of each product into the low byte, then narrow to i8.
  %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <8 x i16> %shr to <8 x i8>
  ret <8 x i8> %res
}

; High half of the signed product for <16 x i8>: sign-extend both operands to
; i16, multiply, shift right by 8 and truncate back to i8.
; Expected output: a predicated smulh under a vl16 ptrue for the SVE prefix, an
; unpredicated smulh for the SVE2 prefix, and a fully scalarised
; ldrsb/mul/lsr/strb sequence (with x19-x27 spilled and reloaded) for the run
; that enables neither NEON nor SVE.
define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
; SVE-LABEL: smulh_v16i8:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.b, vl16
; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: smulh_v16i8:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE2-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE2-NEXT:    smulh z0.b, z0.b, z1.b
; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: smulh_v16i8:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #160
; NONEON-NOSVE-NEXT:    str x27, [sp, #80] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #96] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #112] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #128] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #144] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
; NONEON-NOSVE-NEXT:    str q0, [sp]
; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp]
; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #40]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
; NONEON-NOSVE-NEXT:    ldrsb w6, [sp, #44]
; NONEON-NOSVE-NEXT:    ldrsb w7, [sp, #45]
; NONEON-NOSVE-NEXT:    ldrsb w19, [sp, #46]
; NONEON-NOSVE-NEXT:    ldrsb w20, [sp, #47]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #49]
; NONEON-NOSVE-NEXT:    str d0, [sp, #56]
; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #50]
; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #51]
; NONEON-NOSVE-NEXT:    ldrsb w21, [sp, #63]
; NONEON-NOSVE-NEXT:    ldrsb w23, [sp, #62]
; NONEON-NOSVE-NEXT:    ldrsb w25, [sp, #61]
; NONEON-NOSVE-NEXT:    ldrsb w26, [sp, #60]
; NONEON-NOSVE-NEXT:    str d1, [sp, #88]
; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #52]
; NONEON-NOSVE-NEXT:    mul w20, w20, w21
; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #53]
; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #54]
; NONEON-NOSVE-NEXT:    mul w19, w19, w23
; NONEON-NOSVE-NEXT:    ldrsb w17, [sp, #55]
; NONEON-NOSVE-NEXT:    ldrsb w0, [sp, #40]
; NONEON-NOSVE-NEXT:    mul w7, w7, w25
; NONEON-NOSVE-NEXT:    ldrsb w2, [sp, #41]
; NONEON-NOSVE-NEXT:    ldrsb w3, [sp, #42]
; NONEON-NOSVE-NEXT:    mul w6, w6, w26
; NONEON-NOSVE-NEXT:    lsr w20, w20, #8
; NONEON-NOSVE-NEXT:    ldrsb w4, [sp, #43]
; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #88]
; NONEON-NOSVE-NEXT:    lsr w19, w19, #8
; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #89]
; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #90]
; NONEON-NOSVE-NEXT:    lsr w7, w7, #8
; NONEON-NOSVE-NEXT:    ldrsb w1, [sp, #91]
; NONEON-NOSVE-NEXT:    ldrsb w5, [sp, #92]
; NONEON-NOSVE-NEXT:    mul w9, w9, w16
; NONEON-NOSVE-NEXT:    lsr w6, w6, #8
; NONEON-NOSVE-NEXT:    ldrsb w22, [sp, #93]
; NONEON-NOSVE-NEXT:    ldrsb w24, [sp, #94]
; NONEON-NOSVE-NEXT:    mul w11, w11, w1
; NONEON-NOSVE-NEXT:    ldrsb w21, [sp, #95]
; NONEON-NOSVE-NEXT:    ldrsb w23, [sp, #56]
; NONEON-NOSVE-NEXT:    mul w12, w12, w5
; NONEON-NOSVE-NEXT:    ldrsb w27, [sp, #59]
; NONEON-NOSVE-NEXT:    ldrsb w25, [sp, #58]
; NONEON-NOSVE-NEXT:    mul w15, w15, w24
; NONEON-NOSVE-NEXT:    ldrsb w26, [sp, #57]
; NONEON-NOSVE-NEXT:    mul w0, w0, w23
; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
; NONEON-NOSVE-NEXT:    mul w4, w4, w27
; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
; NONEON-NOSVE-NEXT:    mul w3, w3, w25
; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
; NONEON-NOSVE-NEXT:    strb w20, [sp, #79]
; NONEON-NOSVE-NEXT:    mul w2, w2, w26
; NONEON-NOSVE-NEXT:    lsr w0, w0, #8
; NONEON-NOSVE-NEXT:    strb w19, [sp, #78]
; NONEON-NOSVE-NEXT:    mul w17, w17, w21
; NONEON-NOSVE-NEXT:    lsr w4, w4, #8
; NONEON-NOSVE-NEXT:    strb w7, [sp, #77]
; NONEON-NOSVE-NEXT:    mul w13, w13, w22
; NONEON-NOSVE-NEXT:    lsr w3, w3, #8
; NONEON-NOSVE-NEXT:    strb w6, [sp, #76]
; NONEON-NOSVE-NEXT:    mul w10, w10, w18
; NONEON-NOSVE-NEXT:    lsr w2, w2, #8
; NONEON-NOSVE-NEXT:    strb w4, [sp, #75]
; NONEON-NOSVE-NEXT:    mul w8, w8, w14
; NONEON-NOSVE-NEXT:    lsr w17, w17, #8
; NONEON-NOSVE-NEXT:    strb w3, [sp, #74]
; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
; NONEON-NOSVE-NEXT:    strb w2, [sp, #73]
; NONEON-NOSVE-NEXT:    ldr x27, [sp, #80] // 8-byte Folded Reload
; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
; NONEON-NOSVE-NEXT:    strb w0, [sp, #72]
; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
; NONEON-NOSVE-NEXT:    strb w17, [sp, #71]
; NONEON-NOSVE-NEXT:    strb w15, [sp, #70]
; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #144] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w13, [sp, #69]
; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #128] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w12, [sp, #68]
; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #112] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w11, [sp, #67]
; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #96] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w10, [sp, #66]
; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
; NONEON-NOSVE-NEXT:    add sp, sp, #160
; NONEON-NOSVE-NEXT:    ret
  ; Widen both operands so the full 16-bit signed product is representable.
  %1 = sext <16 x i8> %op1 to <16 x i16>
  %2 = sext <16 x i8> %op2 to <16 x i16>
  %mul = mul <16 x i16> %1, %2
  ; Move the upper byte of each product into the low byte, then narrow to i8.
  %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <16 x i16> %shr to <16 x i8>
  ret <16 x i8> %res
}

; High half of the signed product for <32 x i8>, operating on memory: both
; operands are loaded from %a and %b and the result is stored back to %a.
; Expected output: the SVE and SVE2 prefixes load and store the data as two
; q-register halves (ldp/stp) and issue one smulh per half - predicated with a
; vl16 ptrue (plus a movprfx) for SVE, unpredicated for SVE2. The run that
; enables neither NEON nor SVE scalarises every element through the stack and
; spills intermediate values as well as x19-x30.
define void @smulh_v32i8(ptr %a, ptr %b) {
; SVE-LABEL: smulh_v32i8:
; SVE:       // %bb.0:
; SVE-NEXT:    ldp q0, q3, [x1]
; SVE-NEXT:    ptrue p0.b, vl16
; SVE-NEXT:    ldp q1, q2, [x0]
; SVE-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; SVE-NEXT:    movprfx z1, z2
; SVE-NEXT:    smulh z1.b, p0/m, z1.b, z3.b
; SVE-NEXT:    stp q0, q1, [x0]
; SVE-NEXT:    ret
;
; SVE2-LABEL: smulh_v32i8:
; SVE2:       // %bb.0:
; SVE2-NEXT:    ldp q0, q3, [x1]
; SVE2-NEXT:    ldp q1, q2, [x0]
; SVE2-NEXT:    smulh z0.b, z1.b, z0.b
; SVE2-NEXT:    smulh z1.b, z2.b, z3.b
; SVE2-NEXT:    stp q0, q1, [x0]
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: smulh_v32i8:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #384
; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #288] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #304] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #320] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #336] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #352] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #368] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 384
; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
; NONEON-NOSVE-NEXT:    mov x29, x0
; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
; NONEON-NOSVE-NEXT:    str q1, [sp, #160]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
; NONEON-NOSVE-NEXT:    str q3, [sp, #144]
; NONEON-NOSVE-NEXT:    str q2, [sp, #192]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #184]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #185]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #186]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #187]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #188]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #189]
; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #229]
; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #227]
; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #228]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #190]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #191]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #192]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #176]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #177]
; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #226]
; NONEON-NOSVE-NEXT:    ldrsb w2, [sp, #214]
; NONEON-NOSVE-NEXT:    ldrsb w1, [sp, #215]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #178]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #179]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
; NONEON-NOSVE-NEXT:    ldrsb w4, [sp, #212]
; NONEON-NOSVE-NEXT:    ldrsb w3, [sp, #213]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #180]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #181]
; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #247]
; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #246]
; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #244]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #182]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #183]
; NONEON-NOSVE-NEXT:    mul w26, w12, w16
; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #242]
; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #250]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #232]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #233]
; NONEON-NOSVE-NEXT:    mul w30, w10, w12
; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #255]
; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #253]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #234]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #235]
; NONEON-NOSVE-NEXT:    ldrsb w0, [sp, #248]
; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #249]
; NONEON-NOSVE-NEXT:    ldrsb w6, [sp, #210]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #236]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #237]
; NONEON-NOSVE-NEXT:    ldrsb w5, [sp, #211]
; NONEON-NOSVE-NEXT:    ldrsb w19, [sp, #208]
; NONEON-NOSVE-NEXT:    ldrsb w7, [sp, #209]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #238]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #239]
; NONEON-NOSVE-NEXT:    ldrsb w21, [sp, #222]
; NONEON-NOSVE-NEXT:    ldrsb w20, [sp, #223]
; NONEON-NOSVE-NEXT:    ldrsb w23, [sp, #220]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #224]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #225]
; NONEON-NOSVE-NEXT:    ldrsb w22, [sp, #221]
; NONEON-NOSVE-NEXT:    ldrsb w24, [sp, #219]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #230]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #231]
; NONEON-NOSVE-NEXT:    mul w27, w8, w14
; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #245]
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #217]
; NONEON-NOSVE-NEXT:    mul w9, w9, w15
; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #251]
; NONEON-NOSVE-NEXT:    mul w25, w13, w14
; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #243]
; NONEON-NOSVE-NEXT:    lsr w14, w27, #8
; NONEON-NOSVE-NEXT:    ldrsb w27, [sp, #218]
; NONEON-NOSVE-NEXT:    lsr w17, w9, #8
; NONEON-NOSVE-NEXT:    mul w28, w11, w13
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #216]
; NONEON-NOSVE-NEXT:    strb w14, [sp, #287]
; NONEON-NOSVE-NEXT:    lsr w14, w25, #8
; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #241]
; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #240]
; NONEON-NOSVE-NEXT:    strb w14, [sp, #285]
; NONEON-NOSVE-NEXT:    lsr w14, w28, #8
; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #254]
; NONEON-NOSVE-NEXT:    mul w8, w25, w8
; NONEON-NOSVE-NEXT:    ldr w25, [sp, #28] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #252]
; NONEON-NOSVE-NEXT:    strb w14, [sp, #283]
; NONEON-NOSVE-NEXT:    ldr w14, [sp, #40] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w9, w25, w9
; NONEON-NOSVE-NEXT:    ldr w25, [sp, #32] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w17, [sp, #286]
; NONEON-NOSVE-NEXT:    mul w12, w14, w12
; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
; NONEON-NOSVE-NEXT:    lsr w17, w26, #8
; NONEON-NOSVE-NEXT:    mul w10, w25, w10
; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    ldr w14, [sp, #44] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
; NONEON-NOSVE-NEXT:    strb w8, [sp, #281]
; NONEON-NOSVE-NEXT:    mul w11, w25, w11
; NONEON-NOSVE-NEXT:    strb w17, [sp, #284]
; NONEON-NOSVE-NEXT:    lsr w17, w30, #8
; NONEON-NOSVE-NEXT:    mul w13, w14, w13
; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
; NONEON-NOSVE-NEXT:    ldr w10, [sp, #48] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #280]
; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #320] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
; NONEON-NOSVE-NEXT:    mul w10, w10, w15
; NONEON-NOSVE-NEXT:    ldr w11, [sp, #52] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w8, [sp, #279]
; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
; NONEON-NOSVE-NEXT:    ldr w12, [sp, #56] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w11, w11, w16
; NONEON-NOSVE-NEXT:    strb w9, [sp, #278]
; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
; NONEON-NOSVE-NEXT:    mul w12, w12, w18
; NONEON-NOSVE-NEXT:    ldr w13, [sp, #60] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w8, [sp, #277]
; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
; NONEON-NOSVE-NEXT:    ldr w10, [sp, #64] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #276]
; NONEON-NOSVE-NEXT:    mul w13, w13, w0
; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
; NONEON-NOSVE-NEXT:    ldr w11, [sp, #68] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w10, w10, w1
; NONEON-NOSVE-NEXT:    strb w8, [sp, #275]
; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
; NONEON-NOSVE-NEXT:    mul w11, w11, w2
; NONEON-NOSVE-NEXT:    ldr w12, [sp, #72] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #274]
; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
; NONEON-NOSVE-NEXT:    ldr w13, [sp, #76] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w8, [sp, #273]
; NONEON-NOSVE-NEXT:    mul w12, w12, w3
; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
; NONEON-NOSVE-NEXT:    ldr w10, [sp, #80] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w13, w13, w4
; NONEON-NOSVE-NEXT:    strb w9, [sp, #272]
; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
; NONEON-NOSVE-NEXT:    mul w10, w10, w5
; NONEON-NOSVE-NEXT:    ldr w11, [sp, #84] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w8, [sp, #271]
; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
; NONEON-NOSVE-NEXT:    ldr w12, [sp, #88] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #270]
; NONEON-NOSVE-NEXT:    mul w11, w11, w6
; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
; NONEON-NOSVE-NEXT:    ldr w13, [sp, #92] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w12, w12, w7
; NONEON-NOSVE-NEXT:    strb w8, [sp, #269]
; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
; NONEON-NOSVE-NEXT:    mul w13, w13, w19
; NONEON-NOSVE-NEXT:    ldr w10, [sp, #96] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #268]
; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
; NONEON-NOSVE-NEXT:    ldr w11, [sp, #100] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w8, [sp, #267]
; NONEON-NOSVE-NEXT:    mul w10, w10, w20
; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
; NONEON-NOSVE-NEXT:    ldr w12, [sp, #104] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w11, w11, w21
; NONEON-NOSVE-NEXT:    strb w9, [sp, #266]
; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
; NONEON-NOSVE-NEXT:    ldr w13, [sp, #108] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w12, w12, w22
; NONEON-NOSVE-NEXT:    strb w8, [sp, #265]
; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
; NONEON-NOSVE-NEXT:    ldr w10, [sp, #112] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #264]
; NONEON-NOSVE-NEXT:    mul w13, w13, w23
; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
; NONEON-NOSVE-NEXT:    ldr w11, [sp, #116] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    ldp w15, w14, [sp, #16] // 8-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w10, w10, w24
; NONEON-NOSVE-NEXT:    strb w8, [sp, #263]
; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
; NONEON-NOSVE-NEXT:    mul w11, w11, w27
; NONEON-NOSVE-NEXT:    ldr w12, [sp, #120] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #262]
; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
; NONEON-NOSVE-NEXT:    ldr w13, [sp, #124] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w8, [sp, #261]
; NONEON-NOSVE-NEXT:    mul w12, w12, w15
; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
; NONEON-NOSVE-NEXT:    strb w17, [sp, #282]
; NONEON-NOSVE-NEXT:    mul w13, w13, w14
; NONEON-NOSVE-NEXT:    strb w9, [sp, #260]
; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
; NONEON-NOSVE-NEXT:    strb w8, [sp, #259]
; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #368] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
; NONEON-NOSVE-NEXT:    strb w9, [sp, #258]
; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #352] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w8, [sp, #257]
; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #336] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #256]
; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #304] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #256]
; NONEON-NOSVE-NEXT:    stp q0, q1, [x29]
; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #288] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    add sp, sp, #384
; NONEON-NOSVE-NEXT:    ret
  ; Both operands come from memory.
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  ; Widen both operands so the full 16-bit signed product is representable.
  %1 = sext <32 x i8> %op1 to <32 x i16>
  %2 = sext <32 x i8> %op2 to <32 x i16>
  %mul = mul <32 x i16> %1, %2
  ; Move the upper byte of each product into the low byte, then narrow to i8.
  %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <32 x i16> %shr to <32 x i8>
  ; The result overwrites the first operand in place.
  store <32 x i8> %res, ptr %a
  ret void
}

; High half of the signed product for <2 x i16>: sign-extend both operands to
; i32, multiply, shift right by 16 and truncate back to i16.
; None of the expected outputs below contains an smulh instruction: the SVE and
; SVE2 prefixes expect sxth + mul + lsr on .s elements under a vl2 ptrue, and
; the run that enables neither NEON nor SVE expects scalar ldrsh/mul/lsr.
define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
; SVE-LABEL: smulh_v2i16:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.s, vl2
; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE-NEXT:    sxth z0.s, p0/m, z0.s
; SVE-NEXT:    sxth z1.s, p0/m, z1.s
; SVE-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; SVE-NEXT:    lsr z0.s, z0.s, #16
; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: smulh_v2i16:
; SVE2:       // %bb.0:
; SVE2-NEXT:    ptrue p0.s, vl2
; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE2-NEXT:    sxth z0.s, p0/m, z0.s
; SVE2-NEXT:    sxth z1.s, p0/m, z1.s
; SVE2-NEXT:    mul z0.s, z0.s, z1.s
; SVE2-NEXT:    lsr z0.s, z0.s, #16
; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: smulh_v2i16:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #32
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #20]
; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #16]
; NONEON-NOSVE-NEXT:    mul w8, w8, w10
; NONEON-NOSVE-NEXT:    mul w9, w9, w11
; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT:    add sp, sp, #32
; NONEON-NOSVE-NEXT:    ret
  ; Widen both operands so the full 32-bit signed product is representable.
  %1 = sext <2 x i16> %op1 to <2 x i32>
  %2 = sext <2 x i16> %op2 to <2 x i32>
  %mul = mul <2 x i32> %1, %2
  ; Move the upper half of each product into the low half, then narrow to i16.
  %shr = lshr <2 x i32> %mul, <i32 16, i32 16>
  %res = trunc <2 x i32> %shr to <2 x i16>
  ret <2 x i16> %res
}

; High half of the signed product for <4 x i16>: sign-extend both operands to
; i32, multiply, shift right by 16 and truncate back to i16.
; Expected output: a predicated smulh under a vl4 ptrue for the SVE prefix, an
; unpredicated smulh for the SVE2 prefix, and a scalar ldrsh/mul/lsr/strh
; sequence through the stack for the run that enables neither NEON nor SVE.
define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
; SVE-LABEL: smulh_v4i16:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.h, vl4
; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: smulh_v4i16:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE2-NEXT:    smulh z0.h, z0.h, z1.h
; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: smulh_v4i16:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #32
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #14]
; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #22]
; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
; NONEON-NOSVE-NEXT:    ldrsh w13, [sp, #20]
; NONEON-NOSVE-NEXT:    ldrsh w14, [sp, #18]
; NONEON-NOSVE-NEXT:    mul w11, w11, w12
; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #16]
; NONEON-NOSVE-NEXT:    mul w10, w10, w13
; NONEON-NOSVE-NEXT:    mul w9, w9, w14
; NONEON-NOSVE-NEXT:    mul w8, w8, w12
; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
; NONEON-NOSVE-NEXT:    strh w11, [sp, #30]
; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT:    add sp, sp, #32
; NONEON-NOSVE-NEXT:    ret
  ; Widen both operands so the full 32-bit signed product is representable.
  %1 = sext <4 x i16> %op1 to <4 x i32>
  %2 = sext <4 x i16> %op2 to <4 x i32>
  %mul = mul <4 x i32> %1, %2
  ; Move the upper half of each product into the low half, then narrow to i16.
  %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
  %res = trunc <4 x i32> %shr to <4 x i16>
  ret <4 x i16> %res
}

; High half of the signed product for <8 x i16>: sign-extend both operands to
; i32, multiply, shift right by 16 and truncate back to i16.
; Expected output: a predicated smulh under a vl8 ptrue for the SVE prefix, an
; unpredicated smulh for the SVE2 prefix, and a scalar ldrsh/mul/lsr/strh
; sequence through the stack for the run that enables neither NEON nor SVE.
define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
; SVE-LABEL: smulh_v8i16:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.h, vl8
; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: smulh_v8i16:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE2-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE2-NEXT:    smulh z0.h, z0.h, z1.h
; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: smulh_v8i16:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
; NONEON-NOSVE-NEXT:    ldrsh w15, [sp, #38]
; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #32]
; NONEON-NOSVE-NEXT:    ldrsh w13, [sp, #34]
; NONEON-NOSVE-NEXT:    ldrsh w14, [sp, #36]
; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #40]
; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #44]
; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #46]
; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #54]
; NONEON-NOSVE-NEXT:    ldrsh w17, [sp, #52]
; NONEON-NOSVE-NEXT:    ldrsh w18, [sp, #50]
; NONEON-NOSVE-NEXT:    ldrsh w0, [sp, #62]
; NONEON-NOSVE-NEXT:    mul w15, w15, w16
; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #48]
; NONEON-NOSVE-NEXT:    mul w14, w14, w17
; NONEON-NOSVE-NEXT:    ldrsh w17, [sp, #56]
; NONEON-NOSVE-NEXT:    mul w13, w13, w18
; NONEON-NOSVE-NEXT:    ldrsh w18, [sp, #60]
; NONEON-NOSVE-NEXT:    mul w12, w12, w16
; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #58]
; NONEON-NOSVE-NEXT:    lsr w15, w15, #16
; NONEON-NOSVE-NEXT:    mul w11, w11, w0
; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
; NONEON-NOSVE-NEXT:    mul w10, w10, w18
; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
; NONEON-NOSVE-NEXT:    strh w15, [sp, #78]
; NONEON-NOSVE-NEXT:    mul w9, w9, w16
; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
; NONEON-NOSVE-NEXT:    strh w14, [sp, #76]
; NONEON-NOSVE-NEXT:    mul w8, w8, w17
; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
; NONEON-NOSVE-NEXT:    strh w13, [sp, #74]
; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
; NONEON-NOSVE-NEXT:    strh w12, [sp, #72]
; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
; NONEON-NOSVE-NEXT:    strh w11, [sp, #70]
; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
; NONEON-NOSVE-NEXT:    strh w10, [sp, #68]
; NONEON-NOSVE-NEXT:    strh w9, [sp, #66]
; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
; NONEON-NOSVE-NEXT:    add sp, sp, #80
; NONEON-NOSVE-NEXT:    ret
  ; Widen both operands so the full 32-bit signed product is representable.
  %1 = sext <8 x i16> %op1 to <8 x i32>
  %2 = sext <8 x i16> %op2 to <8 x i32>
  %mul = mul <8 x i32> %1, %2
  ; Move the upper half of each product into the low half, then narrow to i16.
  %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <8 x i32> %shr to <8 x i16>
  ret <8 x i16> %res
}

; Signed high-half multiply of two <16 x i16> vectors loaded from %a and %b;
; the result is stored back to %a. The IR spells the operation as sext to
; i32, mul, lshr by 16, trunc to i16.
; Expected output per configuration:
;   - with +sve, two predicated smulh on .h lanes under ptrue vl8;
;   - with +sve2 or +sme, two unpredicated smulh on .h lanes;
;   - with neither, per-element scalar mul + lsr #16 through stack slots,
;     spilling and reloading x19-x28.
define void @smulh_v16i16(ptr %a, ptr %b) {
; SVE-LABEL: smulh_v16i16:
; SVE:       // %bb.0:
; SVE-NEXT:    ldp q0, q3, [x1]
; SVE-NEXT:    ptrue p0.h, vl8
; SVE-NEXT:    ldp q1, q2, [x0]
; SVE-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; SVE-NEXT:    movprfx z1, z2
; SVE-NEXT:    smulh z1.h, p0/m, z1.h, z3.h
; SVE-NEXT:    stp q0, q1, [x0]
; SVE-NEXT:    ret
;
; SVE2-LABEL: smulh_v16i16:
; SVE2:       // %bb.0:
; SVE2-NEXT:    ldp q0, q3, [x1]
; SVE2-NEXT:    ldp q1, q2, [x0]
; SVE2-NEXT:    smulh z0.h, z1.h, z0.h
; SVE2-NEXT:    smulh z1.h, z2.h, z3.h
; SVE2-NEXT:    stp q0, q1, [x0]
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: smulh_v16i16:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #240
; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #160] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #176] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #192] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #208] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #224] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 240
; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
; NONEON-NOSVE-NEXT:    str q0, [sp]
; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
; NONEON-NOSVE-NEXT:    str q3, [sp, #16]
; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #58]
; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #60]
; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #62]
; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #48]
; NONEON-NOSVE-NEXT:    ldrsh w13, [sp, #50]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
; NONEON-NOSVE-NEXT:    ldrsh w7, [sp, #96]
; NONEON-NOSVE-NEXT:    ldrsh w19, [sp, #98]
; NONEON-NOSVE-NEXT:    ldrsh w20, [sp, #100]
; NONEON-NOSVE-NEXT:    ldrsh w21, [sp, #102]
; NONEON-NOSVE-NEXT:    ldrsh w14, [sp, #52]
; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #54]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
; NONEON-NOSVE-NEXT:    ldrsh w18, [sp, #104]
; NONEON-NOSVE-NEXT:    ldrsh w2, [sp, #106]
; NONEON-NOSVE-NEXT:    ldrsh w4, [sp, #108]
; NONEON-NOSVE-NEXT:    ldrsh w5, [sp, #110]
; NONEON-NOSVE-NEXT:    ldrsh w15, [sp, #88]
; NONEON-NOSVE-NEXT:    ldrsh w17, [sp, #90]
; NONEON-NOSVE-NEXT:    ldrsh w1, [sp, #92]
; NONEON-NOSVE-NEXT:    ldrsh w3, [sp, #94]
; NONEON-NOSVE-NEXT:    mul w8, w8, w15
; NONEON-NOSVE-NEXT:    ldrsh w6, [sp, #80]
; NONEON-NOSVE-NEXT:    ldrsh w23, [sp, #82]
; NONEON-NOSVE-NEXT:    mul w11, w11, w3
; NONEON-NOSVE-NEXT:    ldrsh w25, [sp, #84]
; NONEON-NOSVE-NEXT:    mul w13, w13, w23
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
; NONEON-NOSVE-NEXT:    mul w14, w14, w25
; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
; NONEON-NOSVE-NEXT:    mul w12, w12, w6
; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
; NONEON-NOSVE-NEXT:    mul w10, w10, w1
; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
; NONEON-NOSVE-NEXT:    ldrsh w22, [sp, #118]
; NONEON-NOSVE-NEXT:    ldrsh w24, [sp, #116]
; NONEON-NOSVE-NEXT:    ldrsh w26, [sp, #114]
; NONEON-NOSVE-NEXT:    ldrsh w27, [sp, #112]
; NONEON-NOSVE-NEXT:    ldrsh w28, [sp, #126]
; NONEON-NOSVE-NEXT:    mul w9, w9, w17
; NONEON-NOSVE-NEXT:    mul w21, w21, w22
; NONEON-NOSVE-NEXT:    ldrsh w22, [sp, #86]
; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
; NONEON-NOSVE-NEXT:    mul w20, w20, w24
; NONEON-NOSVE-NEXT:    ldrsh w24, [sp, #120]
; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
; NONEON-NOSVE-NEXT:    mul w19, w19, w26
; NONEON-NOSVE-NEXT:    ldrsh w26, [sp, #124]
; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
; NONEON-NOSVE-NEXT:    mul w7, w7, w27
; NONEON-NOSVE-NEXT:    ldrsh w27, [sp, #122]
; NONEON-NOSVE-NEXT:    lsr w21, w21, #16
; NONEON-NOSVE-NEXT:    mul w5, w5, w28
; NONEON-NOSVE-NEXT:    lsr w20, w20, #16
; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
; NONEON-NOSVE-NEXT:    mul w4, w4, w26
; NONEON-NOSVE-NEXT:    lsr w19, w19, #16
; NONEON-NOSVE-NEXT:    strh w21, [sp, #158]
; NONEON-NOSVE-NEXT:    mul w2, w2, w27
; NONEON-NOSVE-NEXT:    lsr w7, w7, #16
; NONEON-NOSVE-NEXT:    strh w20, [sp, #156]
; NONEON-NOSVE-NEXT:    mul w18, w18, w24
; NONEON-NOSVE-NEXT:    lsr w5, w5, #16
; NONEON-NOSVE-NEXT:    strh w19, [sp, #154]
; NONEON-NOSVE-NEXT:    mul w16, w16, w22
; NONEON-NOSVE-NEXT:    lsr w4, w4, #16
; NONEON-NOSVE-NEXT:    strh w7, [sp, #152]
; NONEON-NOSVE-NEXT:    lsr w2, w2, #16
; NONEON-NOSVE-NEXT:    strh w5, [sp, #150]
; NONEON-NOSVE-NEXT:    lsr w18, w18, #16
; NONEON-NOSVE-NEXT:    strh w4, [sp, #148]
; NONEON-NOSVE-NEXT:    lsr w16, w16, #16
; NONEON-NOSVE-NEXT:    strh w2, [sp, #146]
; NONEON-NOSVE-NEXT:    strh w18, [sp, #144]
; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #224] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strh w16, [sp, #142]
; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #208] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strh w14, [sp, #140]
; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #192] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strh w13, [sp, #138]
; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #176] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strh w12, [sp, #136]
; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #160] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strh w11, [sp, #134]
; NONEON-NOSVE-NEXT:    strh w10, [sp, #132]
; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
; NONEON-NOSVE-NEXT:    add sp, sp, #240
; NONEON-NOSVE-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %1 = sext <16 x i16> %op1 to <16 x i32>
  %2 = sext <16 x i16> %op2 to <16 x i32>
  %mul = mul <16 x i32> %1, %2
  %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <16 x i32> %shr to <16 x i16>
  store <16 x i16> %res, ptr %a
  ret void
}

; Signed high-half multiply of two <2 x i32> arguments, written as sext to
; i64, mul, lshr by 32, trunc to i32.
; Expected output per configuration:
;   - with +sve, one predicated smulh on .s lanes under ptrue vl2;
;   - with +sve2 or +sme, one unpredicated smulh on .s lanes;
;   - with neither, scalar smull + lsr #32 per element through stack slots.
define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
; SVE-LABEL: smulh_v2i32:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.s, vl2
; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: smulh_v2i32:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE2-NEXT:    smulh z0.s, z0.s, z1.s
; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: smulh_v2i32:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #32
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #8]
; NONEON-NOSVE-NEXT:    ldpsw x11, x10, [sp, #16]
; NONEON-NOSVE-NEXT:    smull x9, w9, w10
; NONEON-NOSVE-NEXT:    smull x8, w8, w11
; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT:    add sp, sp, #32
; NONEON-NOSVE-NEXT:    ret
  %1 = sext <2 x i32> %op1 to <2 x i64>
  %2 = sext <2 x i32> %op2 to <2 x i64>
  %mul = mul <2 x i64> %1, %2
  %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
  %res = trunc <2 x i64> %shr to <2 x i32>
  ret <2 x i32> %res
}

; Signed high-half multiply of two <4 x i32> arguments, written as sext to
; i64, mul, lshr by 32, trunc to i32.
; Expected output per configuration:
;   - with +sve, one predicated smulh on .s lanes under ptrue vl4;
;   - with +sve2 or +sme, one unpredicated smulh on .s lanes;
;   - with neither, scalar smull + lsr #32 per element through stack slots.
define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
; SVE-LABEL: smulh_v4i32:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.s, vl4
; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: smulh_v4i32:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE2-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE2-NEXT:    smulh z0.s, z0.s, z1.s
; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: smulh_v4i32:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #40]
; NONEON-NOSVE-NEXT:    ldpsw x10, x11, [sp, #32]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
; NONEON-NOSVE-NEXT:    ldpsw x13, x12, [sp, #48]
; NONEON-NOSVE-NEXT:    smull x11, w11, w12
; NONEON-NOSVE-NEXT:    ldpsw x12, x14, [sp, #56]
; NONEON-NOSVE-NEXT:    smull x10, w10, w13
; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
; NONEON-NOSVE-NEXT:    smull x9, w9, w14
; NONEON-NOSVE-NEXT:    smull x8, w8, w12
; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #72]
; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
; NONEON-NOSVE-NEXT:    add sp, sp, #80
; NONEON-NOSVE-NEXT:    ret
  %1 = sext <4 x i32> %op1 to <4 x i64>
  %2 = sext <4 x i32> %op2 to <4 x i64>
  %mul = mul <4 x i64> %1, %2
  %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
  %res = trunc <4 x i64> %shr to <4 x i32>
  ret <4 x i32> %res
}

; Signed high-half multiply of two <8 x i32> vectors loaded from %a and %b;
; the result is stored back to %a. The IR spells the operation as sext to
; i64, mul, lshr by 32, trunc to i32.
; Expected output per configuration:
;   - with +sve, two predicated smulh on .s lanes under ptrue vl4;
;   - with +sve2 or +sme, two unpredicated smulh on .s lanes;
;   - with neither, scalar smull + lsr #32 per element through stack slots.
define void @smulh_v8i32(ptr %a, ptr %b) {
; SVE-LABEL: smulh_v8i32:
; SVE:       // %bb.0:
; SVE-NEXT:    ldp q0, q3, [x1]
; SVE-NEXT:    ptrue p0.s, vl4
; SVE-NEXT:    ldp q1, q2, [x0]
; SVE-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; SVE-NEXT:    movprfx z1, z2
; SVE-NEXT:    smulh z1.s, p0/m, z1.s, z3.s
; SVE-NEXT:    stp q0, q1, [x0]
; SVE-NEXT:    ret
;
; SVE2-LABEL: smulh_v8i32:
; SVE2:       // %bb.0:
; SVE2-NEXT:    ldp q0, q3, [x1]
; SVE2-NEXT:    ldp q1, q2, [x0]
; SVE2-NEXT:    smulh z0.s, z1.s, z0.s
; SVE2-NEXT:    smulh z1.s, z2.s, z3.s
; SVE2-NEXT:    stp q0, q1, [x0]
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: smulh_v8i32:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #160
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
; NONEON-NOSVE-NEXT:    stp q0, q3, [sp]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #56]
; NONEON-NOSVE-NEXT:    ldpsw x10, x11, [sp, #48]
; NONEON-NOSVE-NEXT:    ldpsw x12, x13, [sp, #104]
; NONEON-NOSVE-NEXT:    ldpsw x14, x15, [sp, #96]
; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
; NONEON-NOSVE-NEXT:    ldpsw x17, x16, [sp, #112]
; NONEON-NOSVE-NEXT:    smull x15, w15, w16
; NONEON-NOSVE-NEXT:    ldpsw x16, x18, [sp, #120]
; NONEON-NOSVE-NEXT:    smull x14, w14, w17
; NONEON-NOSVE-NEXT:    ldpsw x17, x1, [sp, #80]
; NONEON-NOSVE-NEXT:    smull x13, w13, w18
; NONEON-NOSVE-NEXT:    lsr x15, x15, #32
; NONEON-NOSVE-NEXT:    smull x12, w12, w16
; NONEON-NOSVE-NEXT:    lsr x14, x14, #32
; NONEON-NOSVE-NEXT:    ldpsw x16, x18, [sp, #88]
; NONEON-NOSVE-NEXT:    smull x11, w11, w1
; NONEON-NOSVE-NEXT:    lsr x13, x13, #32
; NONEON-NOSVE-NEXT:    stp w14, w15, [sp, #152]
; NONEON-NOSVE-NEXT:    smull x10, w10, w17
; NONEON-NOSVE-NEXT:    lsr x12, x12, #32
; NONEON-NOSVE-NEXT:    smull x9, w9, w18
; NONEON-NOSVE-NEXT:    smull x8, w8, w16
; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
; NONEON-NOSVE-NEXT:    stp w12, w13, [sp, #144]
; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #136]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
; NONEON-NOSVE-NEXT:    add sp, sp, #160
; NONEON-NOSVE-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %1 = sext <8 x i32> %op1 to <8 x i64>
  %2 = sext <8 x i32> %op2 to <8 x i64>
  %mul = mul <8 x i64> %1, %2
  %shr = lshr <8 x i64> %mul,  <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <8 x i64> %shr to <8 x i32>
  store <8 x i32> %res, ptr %a
  ret void
}

; Signed high-half multiply of two <1 x i64> arguments, written as sext to
; i128, mul, lshr by 64, trunc to i64.
; Expected output per configuration:
;   - with +sve, one predicated smulh on .d lanes under ptrue vl1;
;   - with +sve2 or +sme, one unpredicated smulh on .d lanes;
;   - with neither, a single scalar smulh on x registers.
define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
; SVE-LABEL: smulh_v1i64:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.d, vl1
; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: smulh_v1i64:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE2-NEXT:    smulh z0.d, z0.d, z1.d
; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: smulh_v1i64:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #16
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
; NONEON-NOSVE-NEXT:    fmov x8, d0
; NONEON-NOSVE-NEXT:    fmov x9, d1
; NONEON-NOSVE-NEXT:    smulh x8, x8, x9
; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
; NONEON-NOSVE-NEXT:    add sp, sp, #16
; NONEON-NOSVE-NEXT:    ret
; The shift amount is built as an insertelement + shufflevector splat of
; the constant 64, rather than as a constant vector literal.
  %insert = insertelement <1 x i128> undef, i128 64, i128 0
  %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
  %1 = sext <1 x i64> %op1 to <1 x i128>
  %2 = sext <1 x i64> %op2 to <1 x i128>
  %mul = mul <1 x i128> %1, %2
  %shr = lshr <1 x i128> %mul, %splat
  %res = trunc <1 x i128> %shr to <1 x i64>
  ret <1 x i64> %res
}

; Signed high-half multiply of two <2 x i64> arguments, written as sext to
; i128, mul, lshr by 64, trunc to i64.
; Expected output per configuration:
;   - with +sve, one predicated smulh on .d lanes under ptrue vl2;
;   - with +sve2 or +sme, one unpredicated smulh on .d lanes;
;   - with neither, two scalar smulh on x registers through stack slots.
define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
; SVE-LABEL: smulh_v2i64:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.d, vl2
; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: smulh_v2i64:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE2-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE2-NEXT:    smulh z0.d, z0.d, z1.d
; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: smulh_v2i64:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #16]
; NONEON-NOSVE-NEXT:    smulh x8, x8, x10
; NONEON-NOSVE-NEXT:    smulh x9, x9, x11
; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
; NONEON-NOSVE-NEXT:    add sp, sp, #64
; NONEON-NOSVE-NEXT:    ret
  %1 = sext <2 x i64> %op1 to <2 x i128>
  %2 = sext <2 x i64> %op2 to <2 x i128>
  %mul = mul <2 x i128> %1, %2
  %shr = lshr <2 x i128> %mul, <i128 64, i128 64>
  %res = trunc <2 x i128> %shr to <2 x i64>
  ret <2 x i64> %res
}

; Signed high-half multiply of two <4 x i64> vectors loaded from %a and %b;
; the result is stored back to %a. The IR spells the operation as sext to
; i128, mul, lshr by 64, trunc to i64.
; Expected output per configuration:
;   - with +sve, two predicated smulh on .d lanes under ptrue vl2;
;   - with +sve2 or +sme, two unpredicated smulh on .d lanes;
;   - with neither, four scalar smulh on x registers through stack slots.
define void @smulh_v4i64(ptr %a, ptr %b) {
; SVE-LABEL: smulh_v4i64:
; SVE:       // %bb.0:
; SVE-NEXT:    ldp q0, q3, [x1]
; SVE-NEXT:    ptrue p0.d, vl2
; SVE-NEXT:    ldp q1, q2, [x0]
; SVE-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; SVE-NEXT:    movprfx z1, z2
; SVE-NEXT:    smulh z1.d, p0/m, z1.d, z3.d
; SVE-NEXT:    stp q0, q1, [x0]
; SVE-NEXT:    ret
;
; SVE2-LABEL: smulh_v4i64:
; SVE2:       // %bb.0:
; SVE2-NEXT:    ldp q0, q3, [x1]
; SVE2-NEXT:    ldp q1, q2, [x0]
; SVE2-NEXT:    smulh z0.d, z1.d, z0.d
; SVE2-NEXT:    smulh z1.d, z2.d, z3.d
; SVE2-NEXT:    stp q0, q1, [x0]
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: smulh_v4i64:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #128
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp]
; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #16]
; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
; NONEON-NOSVE-NEXT:    smulh x10, x10, x12
; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #48]
; NONEON-NOSVE-NEXT:    smulh x11, x11, x13
; NONEON-NOSVE-NEXT:    smulh x8, x8, x12
; NONEON-NOSVE-NEXT:    smulh x9, x9, x14
; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #64]
; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #80]
; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
; NONEON-NOSVE-NEXT:    add sp, sp, #128
; NONEON-NOSVE-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %1 = sext <4 x i64> %op1 to <4 x i128>
  %2 = sext <4 x i64> %op2 to <4 x i128>
  %mul = mul <4 x i128> %1, %2
  %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
  %res = trunc <4 x i128> %shr to <4 x i64>
  store <4 x i64> %res, ptr %a
  ret void
}

;
; UMULH
;

; Unsigned widening multiply of two <4 x i8> arguments: zext to i16, mul,
; lshr by 4, trunc to i8.
; No umulh appears in any expected output below. The sve/sve2 outputs mask
; both inputs with #0xff, multiply on .h lanes and shift right by 4; the
; output without NEON or SVE uses scalar ldrb, mul and lsr #4 per element.
; NOTE(review): the shift amount is 4 rather than the i8 element width 8, so
; the IR does not compute the high half of the product - confirm this is
; intentional before changing it, since the generated checks depend on it.
define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
; SVE-LABEL: umulh_v4i8:
; SVE:       // %bb.0:
; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE-NEXT:    ptrue p0.h, vl4
; SVE-NEXT:    and z0.h, z0.h, #0xff
; SVE-NEXT:    and z1.h, z1.h, #0xff
; SVE-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; SVE-NEXT:    lsr z0.h, z0.h, #4
; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: umulh_v4i8:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE2-NEXT:    and z0.h, z0.h, #0xff
; SVE2-NEXT:    and z1.h, z1.h, #0xff
; SVE2-NEXT:    mul z0.h, z0.h, z1.h
; SVE2-NEXT:    lsr z0.h, z0.h, #4
; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: umulh_v4i8:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #32
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #22]
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #8]
; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #20]
; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #18]
; NONEON-NOSVE-NEXT:    mul w8, w8, w12
; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #16]
; NONEON-NOSVE-NEXT:    mul w9, w9, w13
; NONEON-NOSVE-NEXT:    mul w10, w10, w14
; NONEON-NOSVE-NEXT:    mul w11, w11, w12
; NONEON-NOSVE-NEXT:    lsr w8, w8, #4
; NONEON-NOSVE-NEXT:    lsr w9, w9, #4
; NONEON-NOSVE-NEXT:    lsr w10, w10, #4
; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
; NONEON-NOSVE-NEXT:    lsr w8, w11, #4
; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT:    add sp, sp, #32
; NONEON-NOSVE-NEXT:    ret
  %1 = zext <4 x i8> %op1 to <4 x i16>
  %2 = zext <4 x i8> %op2 to <4 x i16>
  %mul = mul <4 x i16> %1, %2
  %shr = lshr <4 x i16> %mul, <i16 4, i16 4, i16 4, i16 4>
  %res = trunc <4 x i16> %shr to <4 x i8>
  ret <4 x i8> %res
}

; Unsigned high-half multiply of two <8 x i8> arguments, written as zext to
; i16, mul, lshr by 8, trunc to i8.
; Expected output per configuration:
;   - with +sve, one predicated umulh on .b lanes under ptrue vl8;
;   - with +sve2 or +sme, one unpredicated umulh on .b lanes;
;   - with neither, scalar ldrb, mul and lsr #8 per element, stored back
;     with strb through stack slots.
define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
; SVE-LABEL: umulh_v8i8:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.b, vl8
; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: umulh_v8i8:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE2-NEXT:    umulh z0.b, z0.b, z1.b
; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: umulh_v8i8:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #32
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #15]
; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #23]
; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #12]
; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #22]
; NONEON-NOSVE-NEXT:    mul w15, w15, w16
; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #21]
; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #20]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
; NONEON-NOSVE-NEXT:    mul w14, w14, w17
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
; NONEON-NOSVE-NEXT:    mul w13, w13, w16
; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #11]
; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #16]
; NONEON-NOSVE-NEXT:    mul w12, w12, w18
; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #19]
; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #18]
; NONEON-NOSVE-NEXT:    lsr w14, w14, #8
; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #17]
; NONEON-NOSVE-NEXT:    mul w8, w8, w17
; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
; NONEON-NOSVE-NEXT:    mul w11, w11, w0
; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
; NONEON-NOSVE-NEXT:    strb w15, [sp, #31]
; NONEON-NOSVE-NEXT:    mul w10, w10, w16
; NONEON-NOSVE-NEXT:    strb w14, [sp, #30]
; NONEON-NOSVE-NEXT:    mul w9, w9, w18
; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
; NONEON-NOSVE-NEXT:    strb w13, [sp, #29]
; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
; NONEON-NOSVE-NEXT:    strb w12, [sp, #28]
; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
; NONEON-NOSVE-NEXT:    strb w11, [sp, #27]
; NONEON-NOSVE-NEXT:    strb w10, [sp, #26]
; NONEON-NOSVE-NEXT:    strb w9, [sp, #25]
; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT:    add sp, sp, #32
; NONEON-NOSVE-NEXT:    ret
  %1 = zext <8 x i8> %op1 to <8 x i16>
  %2 = zext <8 x i8> %op2 to <8 x i16>
  %mul = mul <8 x i16> %1, %2
  %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <8 x i16> %shr to <8 x i8>
  ret <8 x i8> %res
}

; Unsigned high-half multiply of two <16 x i8> arguments, written as zext to
; i16, mul, lshr by 8, trunc to i8.
; Expected output per configuration:
;   - with +sve, one predicated umulh on .b lanes under ptrue vl16;
;   - with +sve2 or +sme, one unpredicated umulh on .b lanes;
;   - with neither, scalar ldrb, mul and lsr #8 per element through stack
;     slots, spilling and reloading x19-x27.
define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
; SVE-LABEL: umulh_v16i8:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.b, vl16
; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: umulh_v16i8:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE2-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE2-NEXT:    umulh z0.b, z0.b, z1.b
; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: umulh_v16i8:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #160
; NONEON-NOSVE-NEXT:    str x27, [sp, #80] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #96] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #112] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #128] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #144] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
; NONEON-NOSVE-NEXT:    str q0, [sp]
; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp]
; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #40]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #44]
; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #45]
; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #46]
; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #47]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #49]
; NONEON-NOSVE-NEXT:    str d0, [sp, #56]
; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #50]
; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #51]
; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #63]
; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #62]
; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #61]
; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #60]
; NONEON-NOSVE-NEXT:    str d1, [sp, #88]
; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #52]
; NONEON-NOSVE-NEXT:    mul w20, w20, w21
; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #53]
; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #54]
; NONEON-NOSVE-NEXT:    mul w19, w19, w23
; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #55]
; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #40]
; NONEON-NOSVE-NEXT:    mul w7, w7, w25
; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #41]
; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #42]
; NONEON-NOSVE-NEXT:    mul w6, w6, w26
; NONEON-NOSVE-NEXT:    lsr w20, w20, #8
; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #43]
; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #88]
; NONEON-NOSVE-NEXT:    lsr w19, w19, #8
; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #89]
; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #90]
; NONEON-NOSVE-NEXT:    lsr w7, w7, #8
; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #91]
; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #92]
; NONEON-NOSVE-NEXT:    mul w9, w9, w16
; NONEON-NOSVE-NEXT:    lsr w6, w6, #8
; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #93]
; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #94]
; NONEON-NOSVE-NEXT:    mul w11, w11, w1
; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #95]
; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #56]
; NONEON-NOSVE-NEXT:    mul w12, w12, w5
; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #59]
; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #58]
; NONEON-NOSVE-NEXT:    mul w15, w15, w24
; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #57]
; NONEON-NOSVE-NEXT:    mul w0, w0, w23
; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
; NONEON-NOSVE-NEXT:    mul w4, w4, w27
; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
; NONEON-NOSVE-NEXT:    mul w3, w3, w25
; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
; NONEON-NOSVE-NEXT:    strb w20, [sp, #79]
; NONEON-NOSVE-NEXT:    mul w2, w2, w26
; NONEON-NOSVE-NEXT:    lsr w0, w0, #8
; NONEON-NOSVE-NEXT:    strb w19, [sp, #78]
; NONEON-NOSVE-NEXT:    mul w17, w17, w21
; NONEON-NOSVE-NEXT:    lsr w4, w4, #8
; NONEON-NOSVE-NEXT:    strb w7, [sp, #77]
; NONEON-NOSVE-NEXT:    mul w13, w13, w22
; NONEON-NOSVE-NEXT:    lsr w3, w3, #8
; NONEON-NOSVE-NEXT:    strb w6, [sp, #76]
; NONEON-NOSVE-NEXT:    mul w10, w10, w18
; NONEON-NOSVE-NEXT:    lsr w2, w2, #8
; NONEON-NOSVE-NEXT:    strb w4, [sp, #75]
; NONEON-NOSVE-NEXT:    mul w8, w8, w14
; NONEON-NOSVE-NEXT:    lsr w17, w17, #8
; NONEON-NOSVE-NEXT:    strb w3, [sp, #74]
; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
; NONEON-NOSVE-NEXT:    strb w2, [sp, #73]
; NONEON-NOSVE-NEXT:    ldr x27, [sp, #80] // 8-byte Folded Reload
; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
; NONEON-NOSVE-NEXT:    strb w0, [sp, #72]
; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
; NONEON-NOSVE-NEXT:    strb w17, [sp, #71]
; NONEON-NOSVE-NEXT:    strb w15, [sp, #70]
; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #144] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w13, [sp, #69]
; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #128] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w12, [sp, #68]
; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #112] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w11, [sp, #67]
; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #96] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w10, [sp, #66]
; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
; NONEON-NOSVE-NEXT:    add sp, sp, #160
; NONEON-NOSVE-NEXT:    ret
  %1 = zext <16 x i8> %op1 to <16 x i16>
  %2 = zext <16 x i8> %op2 to <16 x i16>
  %mul = mul <16 x i16> %1, %2
  %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <16 x i16> %shr to <16 x i8>
  ret <16 x i8> %res
}

define void @umulh_v32i8(ptr %a, ptr %b) {
; SVE-LABEL: umulh_v32i8:
; SVE:       // %bb.0:
; SVE-NEXT:    ldp q0, q3, [x1]
; SVE-NEXT:    ptrue p0.b, vl16
; SVE-NEXT:    ldp q1, q2, [x0]
; SVE-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; SVE-NEXT:    movprfx z1, z2
; SVE-NEXT:    umulh z1.b, p0/m, z1.b, z3.b
; SVE-NEXT:    stp q0, q1, [x0]
; SVE-NEXT:    ret
;
; SVE2-LABEL: umulh_v32i8:
; SVE2:       // %bb.0:
; SVE2-NEXT:    ldp q0, q3, [x1]
; SVE2-NEXT:    ldp q1, q2, [x0]
; SVE2-NEXT:    umulh z0.b, z1.b, z0.b
; SVE2-NEXT:    umulh z1.b, z2.b, z3.b
; SVE2-NEXT:    stp q0, q1, [x0]
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: umulh_v32i8:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #384
; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #288] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #304] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #320] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #336] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #352] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #368] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 384
; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
; NONEON-NOSVE-NEXT:    mov x29, x0
; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
; NONEON-NOSVE-NEXT:    str q1, [sp, #160]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
; NONEON-NOSVE-NEXT:    str q3, [sp, #144]
; NONEON-NOSVE-NEXT:    str q2, [sp, #192]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #184]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #185]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #186]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #187]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #188]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #189]
; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #229]
; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #227]
; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #228]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #190]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #191]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #192]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #176]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #177]
; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #226]
; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #214]
; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #215]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #178]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #179]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #212]
; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #213]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #180]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #181]
; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #247]
; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #246]
; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #244]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #182]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #183]
; NONEON-NOSVE-NEXT:    mul w26, w12, w16
; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #242]
; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #250]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #232]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #233]
; NONEON-NOSVE-NEXT:    mul w30, w10, w12
; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #255]
; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #253]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #234]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #235]
; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #248]
; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #249]
; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #210]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #236]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #237]
; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #211]
; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #208]
; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #209]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #238]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #239]
; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #222]
; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #223]
; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #220]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #224]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #225]
; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #221]
; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #219]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #230]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #231]
; NONEON-NOSVE-NEXT:    mul w27, w8, w14
; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #245]
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #217]
; NONEON-NOSVE-NEXT:    mul w9, w9, w15
; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #251]
; NONEON-NOSVE-NEXT:    mul w25, w13, w14
; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #243]
; NONEON-NOSVE-NEXT:    lsr w14, w27, #8
; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #218]
; NONEON-NOSVE-NEXT:    lsr w17, w9, #8
; NONEON-NOSVE-NEXT:    mul w28, w11, w13
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #216]
; NONEON-NOSVE-NEXT:    strb w14, [sp, #287]
; NONEON-NOSVE-NEXT:    lsr w14, w25, #8
; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #241]
; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #240]
; NONEON-NOSVE-NEXT:    strb w14, [sp, #285]
; NONEON-NOSVE-NEXT:    lsr w14, w28, #8
; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #254]
; NONEON-NOSVE-NEXT:    mul w8, w25, w8
; NONEON-NOSVE-NEXT:    ldr w25, [sp, #28] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #252]
; NONEON-NOSVE-NEXT:    strb w14, [sp, #283]
; NONEON-NOSVE-NEXT:    ldr w14, [sp, #40] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w9, w25, w9
; NONEON-NOSVE-NEXT:    ldr w25, [sp, #32] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w17, [sp, #286]
; NONEON-NOSVE-NEXT:    mul w12, w14, w12
; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
; NONEON-NOSVE-NEXT:    lsr w17, w26, #8
; NONEON-NOSVE-NEXT:    mul w10, w25, w10
; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    ldr w14, [sp, #44] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
; NONEON-NOSVE-NEXT:    strb w8, [sp, #281]
; NONEON-NOSVE-NEXT:    mul w11, w25, w11
; NONEON-NOSVE-NEXT:    strb w17, [sp, #284]
; NONEON-NOSVE-NEXT:    lsr w17, w30, #8
; NONEON-NOSVE-NEXT:    mul w13, w14, w13
; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
; NONEON-NOSVE-NEXT:    ldr w10, [sp, #48] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #280]
; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #320] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
; NONEON-NOSVE-NEXT:    mul w10, w10, w15
; NONEON-NOSVE-NEXT:    ldr w11, [sp, #52] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w8, [sp, #279]
; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
; NONEON-NOSVE-NEXT:    ldr w12, [sp, #56] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w11, w11, w16
; NONEON-NOSVE-NEXT:    strb w9, [sp, #278]
; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
; NONEON-NOSVE-NEXT:    mul w12, w12, w18
; NONEON-NOSVE-NEXT:    ldr w13, [sp, #60] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w8, [sp, #277]
; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
; NONEON-NOSVE-NEXT:    ldr w10, [sp, #64] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #276]
; NONEON-NOSVE-NEXT:    mul w13, w13, w0
; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
; NONEON-NOSVE-NEXT:    ldr w11, [sp, #68] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w10, w10, w1
; NONEON-NOSVE-NEXT:    strb w8, [sp, #275]
; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
; NONEON-NOSVE-NEXT:    mul w11, w11, w2
; NONEON-NOSVE-NEXT:    ldr w12, [sp, #72] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #274]
; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
; NONEON-NOSVE-NEXT:    ldr w13, [sp, #76] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w8, [sp, #273]
; NONEON-NOSVE-NEXT:    mul w12, w12, w3
; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
; NONEON-NOSVE-NEXT:    ldr w10, [sp, #80] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w13, w13, w4
; NONEON-NOSVE-NEXT:    strb w9, [sp, #272]
; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
; NONEON-NOSVE-NEXT:    mul w10, w10, w5
; NONEON-NOSVE-NEXT:    ldr w11, [sp, #84] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w8, [sp, #271]
; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
; NONEON-NOSVE-NEXT:    ldr w12, [sp, #88] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #270]
; NONEON-NOSVE-NEXT:    mul w11, w11, w6
; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
; NONEON-NOSVE-NEXT:    ldr w13, [sp, #92] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w12, w12, w7
; NONEON-NOSVE-NEXT:    strb w8, [sp, #269]
; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
; NONEON-NOSVE-NEXT:    mul w13, w13, w19
; NONEON-NOSVE-NEXT:    ldr w10, [sp, #96] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #268]
; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
; NONEON-NOSVE-NEXT:    ldr w11, [sp, #100] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w8, [sp, #267]
; NONEON-NOSVE-NEXT:    mul w10, w10, w20
; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
; NONEON-NOSVE-NEXT:    ldr w12, [sp, #104] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w11, w11, w21
; NONEON-NOSVE-NEXT:    strb w9, [sp, #266]
; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
; NONEON-NOSVE-NEXT:    ldr w13, [sp, #108] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w12, w12, w22
; NONEON-NOSVE-NEXT:    strb w8, [sp, #265]
; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
; NONEON-NOSVE-NEXT:    ldr w10, [sp, #112] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #264]
; NONEON-NOSVE-NEXT:    mul w13, w13, w23
; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
; NONEON-NOSVE-NEXT:    ldr w11, [sp, #116] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    ldp w15, w14, [sp, #16] // 8-byte Folded Reload
; NONEON-NOSVE-NEXT:    mul w10, w10, w24
; NONEON-NOSVE-NEXT:    strb w8, [sp, #263]
; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
; NONEON-NOSVE-NEXT:    mul w11, w11, w27
; NONEON-NOSVE-NEXT:    ldr w12, [sp, #120] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #262]
; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
; NONEON-NOSVE-NEXT:    ldr w13, [sp, #124] // 4-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w8, [sp, #261]
; NONEON-NOSVE-NEXT:    mul w12, w12, w15
; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
; NONEON-NOSVE-NEXT:    strb w17, [sp, #282]
; NONEON-NOSVE-NEXT:    mul w13, w13, w14
; NONEON-NOSVE-NEXT:    strb w9, [sp, #260]
; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
; NONEON-NOSVE-NEXT:    strb w8, [sp, #259]
; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #368] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
; NONEON-NOSVE-NEXT:    strb w9, [sp, #258]
; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #352] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w8, [sp, #257]
; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #336] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strb w9, [sp, #256]
; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #304] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #256]
; NONEON-NOSVE-NEXT:    stp q0, q1, [x29]
; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #288] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    add sp, sp, #384
; NONEON-NOSVE-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %1 = zext <32 x i8> %op1 to <32 x i16>
  %2 = zext <32 x i8> %op2 to <32 x i16>
  %mul = mul <32 x i16> %1, %2
  %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <32 x i16> %shr to <32 x i8>
  store <32 x i8> %res, ptr %a
  ret void
}

; Unsigned high-half multiply of two <2 x i16> vectors, written as
; zext to i32 / mul / lshr 16 / trunc.
; The vector expectations below contain no umulh instruction for this type:
; both operands are masked with 0xffff, multiplied as 32-bit lanes and then
; shifted right by 16. In the run without vector features the two lanes are
; reloaded from the stack with ldrh, multiplied as scalars and shifted.
define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
; SVE-LABEL: umulh_v2i16:
; SVE:       // %bb.0:
; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE-NEXT:    ptrue p0.s, vl2
; SVE-NEXT:    and z0.s, z0.s, #0xffff
; SVE-NEXT:    and z1.s, z1.s, #0xffff
; SVE-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; SVE-NEXT:    lsr z0.s, z0.s, #16
; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: umulh_v2i16:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE2-NEXT:    and z0.s, z0.s, #0xffff
; SVE2-NEXT:    and z1.s, z1.s, #0xffff
; SVE2-NEXT:    mul z0.s, z0.s, z1.s
; SVE2-NEXT:    lsr z0.s, z0.s, #16
; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: umulh_v2i16:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #32
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #20]
; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #16]
; NONEON-NOSVE-NEXT:    mul w8, w8, w10
; NONEON-NOSVE-NEXT:    mul w9, w9, w11
; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT:    add sp, sp, #32
; NONEON-NOSVE-NEXT:    ret
  ; Widen, multiply, then keep bits [31:16] of each 32-bit product.
  %1 = zext <2 x i16> %op1 to <2 x i32>
  %2 = zext <2 x i16> %op2 to <2 x i32>
  %mul = mul <2 x i32> %1, %2
  %shr = lshr <2 x i32> %mul, <i32 16, i32 16>
  %res = trunc <2 x i32> %shr to <2 x i16>
  ret <2 x i16> %res
}

; Unsigned high-half multiply of two <4 x i16> vectors, written as
; zext to i32 / mul / lshr 16 / trunc.
; With +sve the expectation is a single umulh on .h lanes governed by a
; "ptrue p0.h, vl4" predicate; with +sve2 / +sme it is the unpredicated
; umulh form. In the run without vector features each of the four lanes is
; reloaded from the stack, multiplied as a scalar and shifted right by 16.
define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
; SVE-LABEL: umulh_v4i16:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.h, vl4
; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: umulh_v4i16:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE2-NEXT:    umulh z0.h, z0.h, z1.h
; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: umulh_v4i16:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #32
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #14]
; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #22]
; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #20]
; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #18]
; NONEON-NOSVE-NEXT:    mul w11, w11, w12
; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
; NONEON-NOSVE-NEXT:    mul w10, w10, w13
; NONEON-NOSVE-NEXT:    mul w9, w9, w14
; NONEON-NOSVE-NEXT:    mul w8, w8, w12
; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
; NONEON-NOSVE-NEXT:    strh w11, [sp, #30]
; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT:    add sp, sp, #32
; NONEON-NOSVE-NEXT:    ret
  ; Widen, multiply, then keep bits [31:16] of each 32-bit product.
  %1 = zext <4 x i16> %op1 to <4 x i32>
  %2 = zext <4 x i16> %op2 to <4 x i32>
  %mul = mul <4 x i32> %1, %2
  %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
  %res = trunc <4 x i32> %shr to <4 x i16>
  ret <4 x i16> %res
}

; Unsigned high-half multiply of two <8 x i16> vectors (a full 128-bit
; register per operand), written as zext to i32 / mul / lshr 16 / trunc.
; With +sve the expectation is one umulh on .h lanes under a
; "ptrue p0.h, vl8" predicate; with +sve2 / +sme it is the unpredicated
; umulh form. In the run without vector features both operands are spilled
; to the stack and all eight lanes are multiplied as scalars, shifted right
; by 16, stored back with strh and reloaded as q0.
define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
; SVE-LABEL: umulh_v8i16:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.h, vl8
; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: umulh_v8i16:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE2-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE2-NEXT:    umulh z0.h, z0.h, z1.h
; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: umulh_v8i16:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #38]
; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #32]
; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #34]
; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #36]
; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #44]
; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #46]
; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #54]
; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #52]
; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #50]
; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #62]
; NONEON-NOSVE-NEXT:    mul w15, w15, w16
; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #48]
; NONEON-NOSVE-NEXT:    mul w14, w14, w17
; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #56]
; NONEON-NOSVE-NEXT:    mul w13, w13, w18
; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #60]
; NONEON-NOSVE-NEXT:    mul w12, w12, w16
; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #58]
; NONEON-NOSVE-NEXT:    lsr w15, w15, #16
; NONEON-NOSVE-NEXT:    mul w11, w11, w0
; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
; NONEON-NOSVE-NEXT:    mul w10, w10, w18
; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
; NONEON-NOSVE-NEXT:    strh w15, [sp, #78]
; NONEON-NOSVE-NEXT:    mul w9, w9, w16
; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
; NONEON-NOSVE-NEXT:    strh w14, [sp, #76]
; NONEON-NOSVE-NEXT:    mul w8, w8, w17
; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
; NONEON-NOSVE-NEXT:    strh w13, [sp, #74]
; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
; NONEON-NOSVE-NEXT:    strh w12, [sp, #72]
; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
; NONEON-NOSVE-NEXT:    strh w11, [sp, #70]
; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
; NONEON-NOSVE-NEXT:    strh w10, [sp, #68]
; NONEON-NOSVE-NEXT:    strh w9, [sp, #66]
; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
; NONEON-NOSVE-NEXT:    add sp, sp, #80
; NONEON-NOSVE-NEXT:    ret
  ; Widen, multiply, then keep bits [31:16] of each 32-bit product.
  %1 = zext <8 x i16> %op1 to <8 x i32>
  %2 = zext <8 x i16> %op2 to <8 x i32>
  %mul = mul <8 x i32> %1, %2
  %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <8 x i32> %shr to <8 x i16>
  ret <8 x i16> %res
}

; Unsigned high-half multiply of two <16 x i16> vectors held in memory:
; both operands are loaded from %a and %b and the result is stored back
; to %a. The IR pattern is zext to i32 / mul / lshr 16 / trunc.
; The vector expectations load each operand as a pair of q registers and
; issue two umulh operations on .h lanes. With +sve they are predicated by
; "ptrue p0.h, vl8" and the second one is preceded by a movprfx; with
; +sve2 / +sme they are unpredicated. In the run without vector features
; every lane is processed as a scalar mul + lsr #16 via stack slots, which
; also requires spilling and reloading x19-x28.
define void @umulh_v16i16(ptr %a, ptr %b) {
; SVE-LABEL: umulh_v16i16:
; SVE:       // %bb.0:
; SVE-NEXT:    ldp q0, q3, [x1]
; SVE-NEXT:    ptrue p0.h, vl8
; SVE-NEXT:    ldp q1, q2, [x0]
; SVE-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; SVE-NEXT:    movprfx z1, z2
; SVE-NEXT:    umulh z1.h, p0/m, z1.h, z3.h
; SVE-NEXT:    stp q0, q1, [x0]
; SVE-NEXT:    ret
;
; SVE2-LABEL: umulh_v16i16:
; SVE2:       // %bb.0:
; SVE2-NEXT:    ldp q0, q3, [x1]
; SVE2-NEXT:    ldp q1, q2, [x0]
; SVE2-NEXT:    umulh z0.h, z1.h, z0.h
; SVE2-NEXT:    umulh z1.h, z2.h, z3.h
; SVE2-NEXT:    stp q0, q1, [x0]
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: umulh_v16i16:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #240
; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #160] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #176] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #192] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #208] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #224] // 16-byte Folded Spill
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 240
; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
; NONEON-NOSVE-NEXT:    str q0, [sp]
; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
; NONEON-NOSVE-NEXT:    str q3, [sp, #16]
; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #60]
; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #62]
; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #48]
; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #50]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #96]
; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #98]
; NONEON-NOSVE-NEXT:    ldrh w20, [sp, #100]
; NONEON-NOSVE-NEXT:    ldrh w21, [sp, #102]
; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #52]
; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #54]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #104]
; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #106]
; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #108]
; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #110]
; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #88]
; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #90]
; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #92]
; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #94]
; NONEON-NOSVE-NEXT:    mul w8, w8, w15
; NONEON-NOSVE-NEXT:    ldrh w6, [sp, #80]
; NONEON-NOSVE-NEXT:    ldrh w23, [sp, #82]
; NONEON-NOSVE-NEXT:    mul w11, w11, w3
; NONEON-NOSVE-NEXT:    ldrh w25, [sp, #84]
; NONEON-NOSVE-NEXT:    mul w13, w13, w23
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
; NONEON-NOSVE-NEXT:    mul w14, w14, w25
; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
; NONEON-NOSVE-NEXT:    mul w12, w12, w6
; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
; NONEON-NOSVE-NEXT:    mul w10, w10, w1
; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #118]
; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #116]
; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #114]
; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #112]
; NONEON-NOSVE-NEXT:    ldrh w28, [sp, #126]
; NONEON-NOSVE-NEXT:    mul w9, w9, w17
; NONEON-NOSVE-NEXT:    mul w21, w21, w22
; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #86]
; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
; NONEON-NOSVE-NEXT:    mul w20, w20, w24
; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #120]
; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
; NONEON-NOSVE-NEXT:    mul w19, w19, w26
; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #124]
; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
; NONEON-NOSVE-NEXT:    mul w7, w7, w27
; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #122]
; NONEON-NOSVE-NEXT:    lsr w21, w21, #16
; NONEON-NOSVE-NEXT:    mul w5, w5, w28
; NONEON-NOSVE-NEXT:    lsr w20, w20, #16
; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
; NONEON-NOSVE-NEXT:    mul w4, w4, w26
; NONEON-NOSVE-NEXT:    lsr w19, w19, #16
; NONEON-NOSVE-NEXT:    strh w21, [sp, #158]
; NONEON-NOSVE-NEXT:    mul w2, w2, w27
; NONEON-NOSVE-NEXT:    lsr w7, w7, #16
; NONEON-NOSVE-NEXT:    strh w20, [sp, #156]
; NONEON-NOSVE-NEXT:    mul w18, w18, w24
; NONEON-NOSVE-NEXT:    lsr w5, w5, #16
; NONEON-NOSVE-NEXT:    strh w19, [sp, #154]
; NONEON-NOSVE-NEXT:    mul w16, w16, w22
; NONEON-NOSVE-NEXT:    lsr w4, w4, #16
; NONEON-NOSVE-NEXT:    strh w7, [sp, #152]
; NONEON-NOSVE-NEXT:    lsr w2, w2, #16
; NONEON-NOSVE-NEXT:    strh w5, [sp, #150]
; NONEON-NOSVE-NEXT:    lsr w18, w18, #16
; NONEON-NOSVE-NEXT:    strh w4, [sp, #148]
; NONEON-NOSVE-NEXT:    lsr w16, w16, #16
; NONEON-NOSVE-NEXT:    strh w2, [sp, #146]
; NONEON-NOSVE-NEXT:    strh w18, [sp, #144]
; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #224] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strh w16, [sp, #142]
; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #208] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strh w14, [sp, #140]
; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #192] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strh w13, [sp, #138]
; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #176] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strh w12, [sp, #136]
; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #160] // 16-byte Folded Reload
; NONEON-NOSVE-NEXT:    strh w11, [sp, #134]
; NONEON-NOSVE-NEXT:    strh w10, [sp, #132]
; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
; NONEON-NOSVE-NEXT:    add sp, sp, #240
; NONEON-NOSVE-NEXT:    ret
  ; Load both operands, widen, multiply, keep bits [31:16] of each 32-bit
  ; product, and write the narrowed result back over the first operand.
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %1 = zext <16 x i16> %op1 to <16 x i32>
  %2 = zext <16 x i16> %op2 to <16 x i32>
  %mul = mul <16 x i32> %1, %2
  %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <16 x i32> %shr to <16 x i16>
  store <16 x i16> %res, ptr %a
  ret void
}

; Unsigned high-half multiply of two <2 x i32> vectors, written as
; zext to i64 / mul / lshr 32 / trunc.
; With +sve the expectation is one umulh on .s lanes under a
; "ptrue p0.s, vl2" predicate; with +sve2 / +sme it is the unpredicated
; umulh form. In the run without vector features each lane is computed
; with a scalar umull (32x32 -> 64) followed by lsr #32.
define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
; SVE-LABEL: umulh_v2i32:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.s, vl2
; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: umulh_v2i32:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE2-NEXT:    umulh z0.s, z0.s, z1.s
; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: umulh_v2i32:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #32
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp, #16]
; NONEON-NOSVE-NEXT:    umull x9, w9, w10
; NONEON-NOSVE-NEXT:    umull x8, w8, w11
; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT:    add sp, sp, #32
; NONEON-NOSVE-NEXT:    ret
  ; Widen, multiply, then keep bits [63:32] of each 64-bit product.
  %1 = zext <2 x i32> %op1 to <2 x i64>
  %2 = zext <2 x i32> %op2 to <2 x i64>
  %mul = mul <2 x i64> %1, %2
  %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
  %res = trunc <2 x i64> %shr to <2 x i32>
  ret <2 x i32> %res
}

; Unsigned high-half multiply of two <4 x i32> vectors (a full 128-bit
; register per operand), written as zext to i64 / mul / lshr 32 / trunc.
; With +sve the expectation is one umulh on .s lanes under a
; "ptrue p0.s, vl4" predicate; with +sve2 / +sme it is the unpredicated
; umulh form. In the run without vector features both operands are spilled
; to the stack and each lane is computed with a scalar umull followed by
; lsr #32, then the result is reloaded as q0.
define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
; SVE-LABEL: umulh_v4i32:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.s, vl4
; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: umulh_v4i32:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE2-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE2-NEXT:    umulh z0.s, z0.s, z1.s
; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: umulh_v4i32:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #32]
; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
; NONEON-NOSVE-NEXT:    ldp w13, w12, [sp, #48]
; NONEON-NOSVE-NEXT:    umull x11, w11, w12
; NONEON-NOSVE-NEXT:    ldp w12, w14, [sp, #56]
; NONEON-NOSVE-NEXT:    umull x10, w10, w13
; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
; NONEON-NOSVE-NEXT:    umull x9, w9, w14
; NONEON-NOSVE-NEXT:    umull x8, w8, w12
; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #72]
; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
; NONEON-NOSVE-NEXT:    add sp, sp, #80
; NONEON-NOSVE-NEXT:    ret
  ; Widen, multiply, then keep bits [63:32] of each 64-bit product.
  %1 = zext <4 x i32> %op1 to <4 x i64>
  %2 = zext <4 x i32> %op2 to <4 x i64>
  %mul = mul <4 x i64> %1, %2
  %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
  %res = trunc <4 x i64> %shr to <4 x i32>
  ret <4 x i32> %res
}

; Unsigned high-half multiply of two <8 x i32> vectors held in memory:
; both operands are loaded from %a and %b and the result is stored back
; to %a. The IR pattern is zext to i64 / mul / lshr 32 / trunc.
; The vector expectations load each operand as a pair of q registers and
; issue two umulh operations on .s lanes. With +sve they are predicated by
; "ptrue p0.s, vl4" and the second one is preceded by a movprfx; with
; +sve2 / +sme they are unpredicated. In the run without vector features
; every lane is computed with a scalar umull followed by lsr #32 via
; stack slots.
define void @umulh_v8i32(ptr %a, ptr %b) {
; SVE-LABEL: umulh_v8i32:
; SVE:       // %bb.0:
; SVE-NEXT:    ldp q0, q3, [x1]
; SVE-NEXT:    ptrue p0.s, vl4
; SVE-NEXT:    ldp q1, q2, [x0]
; SVE-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; SVE-NEXT:    movprfx z1, z2
; SVE-NEXT:    umulh z1.s, p0/m, z1.s, z3.s
; SVE-NEXT:    stp q0, q1, [x0]
; SVE-NEXT:    ret
;
; SVE2-LABEL: umulh_v8i32:
; SVE2:       // %bb.0:
; SVE2-NEXT:    ldp q0, q3, [x1]
; SVE2-NEXT:    ldp q1, q2, [x0]
; SVE2-NEXT:    umulh z0.s, z1.s, z0.s
; SVE2-NEXT:    umulh z1.s, z2.s, z3.s
; SVE2-NEXT:    stp q0, q1, [x0]
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: umulh_v8i32:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #160
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
; NONEON-NOSVE-NEXT:    stp q0, q3, [sp]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #48]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #96]
; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #104]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
; NONEON-NOSVE-NEXT:    ldp w17, w16, [sp, #112]
; NONEON-NOSVE-NEXT:    umull x15, w15, w16
; NONEON-NOSVE-NEXT:    ldp w16, w18, [sp, #120]
; NONEON-NOSVE-NEXT:    umull x14, w14, w17
; NONEON-NOSVE-NEXT:    ldp w17, w1, [sp, #80]
; NONEON-NOSVE-NEXT:    umull x13, w13, w18
; NONEON-NOSVE-NEXT:    lsr x15, x15, #32
; NONEON-NOSVE-NEXT:    umull x12, w12, w16
; NONEON-NOSVE-NEXT:    lsr x14, x14, #32
; NONEON-NOSVE-NEXT:    ldp w16, w18, [sp, #88]
; NONEON-NOSVE-NEXT:    umull x11, w11, w1
; NONEON-NOSVE-NEXT:    lsr x13, x13, #32
; NONEON-NOSVE-NEXT:    stp w14, w15, [sp, #152]
; NONEON-NOSVE-NEXT:    umull x10, w10, w17
; NONEON-NOSVE-NEXT:    lsr x12, x12, #32
; NONEON-NOSVE-NEXT:    umull x9, w9, w18
; NONEON-NOSVE-NEXT:    umull x8, w8, w16
; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
; NONEON-NOSVE-NEXT:    stp w12, w13, [sp, #144]
; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #136]
; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
; NONEON-NOSVE-NEXT:    add sp, sp, #160
; NONEON-NOSVE-NEXT:    ret
  ; Load both operands, widen, multiply, keep bits [63:32] of each 64-bit
  ; product, and write the narrowed result back over the first operand.
  ; The shift amount is given directly as a constant vector.
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %1 = zext <8 x i32> %op1 to <8 x i64>
  %2 = zext <8 x i32> %op2 to <8 x i64>
  %mul = mul <8 x i64> %1, %2
  %shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <8 x i64> %shr to <8 x i32>
  store <8 x i32> %res, ptr %a
  ret void
}

; Unsigned high-half multiply of two <1 x i64> vectors, written as
; zext to i128 / mul / lshr 64 / trunc.
; With +sve the expectation is one umulh on .d lanes under a
; "ptrue p0.d, vl1" predicate; with +sve2 / +sme it is the unpredicated
; umulh form. In the run without vector features both operands are moved
; to general-purpose registers with fmov and a single scalar umulh
; produces the result, which is passed back to d0 through a stack slot.
define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
; SVE-LABEL: umulh_v1i64:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.d, vl1
; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: umulh_v1i64:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
; SVE2-NEXT:    umulh z0.d, z0.d, z1.d
; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: umulh_v1i64:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #16
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
; NONEON-NOSVE-NEXT:    fmov x8, d0
; NONEON-NOSVE-NEXT:    fmov x9, d1
; NONEON-NOSVE-NEXT:    umulh x8, x8, x9
; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
; NONEON-NOSVE-NEXT:    add sp, sp, #16
; NONEON-NOSVE-NEXT:    ret
  ; Widen, multiply, then keep bits [127:64] of the 128-bit product.
  %1 = zext <1 x i64> %op1 to <1 x i128>
  %2 = zext <1 x i64> %op2 to <1 x i128>
  %mul = mul <1 x i128> %1, %2
  %shr = lshr <1 x i128> %mul, <i128 64>
  %res = trunc <1 x i128> %shr to <1 x i64>
  ret <1 x i64> %res
}

; Unsigned multiply-high on two 64-bit lanes, passed and returned by value.
; Each lane is zero-extended to i128, multiplied, shifted right by 64 and
; truncated back to i64. The assertions below expect one vector umulh
; (predicated with ptrue vl2 under the SVE prefix, unpredicated under the SVE2
; prefix), and two scalar umulh instructions on lanes spilled to and reloaded
; from the stack under the NONEON-NOSVE prefix.
define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
; SVE-LABEL: umulh_v2i64:
; SVE:       // %bb.0:
; SVE-NEXT:    ptrue p0.d, vl2
; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT:    ret
;
; SVE2-LABEL: umulh_v2i64:
; SVE2:       // %bb.0:
; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
; SVE2-NEXT:    // kill: def $q1 killed $q1 def $z1
; SVE2-NEXT:    umulh z0.d, z0.d, z1.d
; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: umulh_v2i64:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #16]
; NONEON-NOSVE-NEXT:    umulh x8, x8, x10
; NONEON-NOSVE-NEXT:    umulh x9, x9, x11
; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
; NONEON-NOSVE-NEXT:    add sp, sp, #64
; NONEON-NOSVE-NEXT:    ret
  ; Widen both operands to 128 bits per lane so the full unsigned product is available.
  %1 = zext <2 x i64> %op1 to <2 x i128>
  %2 = zext <2 x i64> %op2 to <2 x i128>
  %mul = mul <2 x i128> %1, %2
  ; Keep only the upper 64 bits of each 128-bit product.
  %shr = lshr <2 x i128> %mul, <i128 64, i128 64>
  %res = trunc <2 x i128> %shr to <2 x i64>
  ret <2 x i64> %res
}

; Unsigned multiply-high on four 64-bit lanes held in memory: both operands
; are loaded (from %a and %b) and the result is stored back over %a.
; Each lane is zero-extended to i128, multiplied, shifted right by 64 and
; truncated back to i64. The assertions below expect the 256-bit vector to be
; handled as two 128-bit halves (ldp/stp of q registers): two predicated umulh
; instructions (ptrue vl2, with a movprfx for the second) under the SVE prefix,
; two unpredicated umulh instructions under the SVE2 prefix, and four scalar
; umulh instructions on lanes moved through the stack under the NONEON-NOSVE
; prefix.
define void @umulh_v4i64(ptr %a, ptr %b) {
; SVE-LABEL: umulh_v4i64:
; SVE:       // %bb.0:
; SVE-NEXT:    ldp q0, q3, [x1]
; SVE-NEXT:    ptrue p0.d, vl2
; SVE-NEXT:    ldp q1, q2, [x0]
; SVE-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; SVE-NEXT:    movprfx z1, z2
; SVE-NEXT:    umulh z1.d, p0/m, z1.d, z3.d
; SVE-NEXT:    stp q0, q1, [x0]
; SVE-NEXT:    ret
;
; SVE2-LABEL: umulh_v4i64:
; SVE2:       // %bb.0:
; SVE2-NEXT:    ldp q0, q3, [x1]
; SVE2-NEXT:    ldp q1, q2, [x0]
; SVE2-NEXT:    umulh z0.d, z1.d, z0.d
; SVE2-NEXT:    umulh z1.d, z2.d, z3.d
; SVE2-NEXT:    stp q0, q1, [x0]
; SVE2-NEXT:    ret
;
; NONEON-NOSVE-LABEL: umulh_v4i64:
; NONEON-NOSVE:       // %bb.0:
; NONEON-NOSVE-NEXT:    sub sp, sp, #128
; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp]
; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #16]
; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
; NONEON-NOSVE-NEXT:    umulh x10, x10, x12
; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #48]
; NONEON-NOSVE-NEXT:    umulh x11, x11, x13
; NONEON-NOSVE-NEXT:    umulh x8, x8, x12
; NONEON-NOSVE-NEXT:    umulh x9, x9, x14
; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #64]
; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #80]
; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
; NONEON-NOSVE-NEXT:    add sp, sp, #128
; NONEON-NOSVE-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  ; Widen both operands to 128 bits per lane so the full unsigned product is available.
  %1 = zext <4 x i64> %op1 to <4 x i128>
  %2 = zext <4 x i64> %op2 to <4 x i128>
  %mul = mul <4 x i128> %1, %2
  ; Keep only the upper 64 bits of each 128-bit product.
  %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
  %res = trunc <4 x i128> %shr to <4 x i64>
  ; The result overwrites the first operand in place.
  store <4 x i64> %res, ptr %a
  ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
