; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64 | FileCheck %s

; vld2: squares every lane of an interleaved <8 x float> load and adds the
; even-lane products to the odd-lane products. The expected assembly
; de-interleaves the data with one post-incremented ld2 and folds the
; arithmetic into fmul + fmla. %numSamples is not used; the loop bound is the
; constant 1024.
define void @vld2(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld2:
; CHECK:       .Lfunc_begin0:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB0_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x0], #32
; CHECK-NEXT:    fmul v2.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v2.4s, v1.4s, v1.4s
; CHECK-NEXT:    str q2, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB0_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; The source index is twice the destination index (two inputs per output).
  %0 = shl i64 %index, 1
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %next.gep19 = getelementptr float, ptr %pDst, i64 %index
  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  ; Each de-interleaving shuffle below reads its own copy of the wide square;
  ; %1 and %3 are textually identical multiplies.
  %1 = fmul fast <8 x float> %wide.vec, %wide.vec
  ; Even lanes.
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %3 = fmul fast <8 x float> %wide.vec, %wide.vec
  ; Odd lanes.
  %4 = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %5 = fadd fast <4 x float> %4, %2
  store <4 x float> %5, ptr %next.gep19, align 4
  ; Four results are produced per iteration; stop once 1024 have been written.
  %index.next = add i64 %index, 4
  %6 = icmp eq i64 %index.next, 1024
  br i1 %6, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; vld3: factor-3 variant of the squared-sum loop. A <12 x float> load is
; squared and the three stride-3 lane groups are summed. The expected assembly
; uses one post-incremented ld3 followed by fmul and two fmla instructions.
; %numSamples is not used; the loop bound is the constant 1024.
define void @vld3(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld3:
; CHECK:       .Lfunc_begin1:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB1_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
; CHECK-NEXT:    fmul v3.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v3.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmla v3.4s, v2.4s, v2.4s
; CHECK-NEXT:    str q3, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB1_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; The source index is three times the destination index.
  %0 = mul i64 %index, 3
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %next.gep23 = getelementptr float, ptr %pDst, i64 %index
  %wide.vec = load <12 x float>, ptr %next.gep, align 4
  ; Three textually identical wide squares (%1, %3, %6), one per shuffle.
  %1 = fmul fast <12 x float> %wide.vec, %wide.vec
  ; Lanes 0, 3, 6, 9.
  %2 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %3 = fmul fast <12 x float> %wide.vec, %wide.vec
  ; Lanes 1, 4, 7, 10.
  %4 = shufflevector <12 x float> %3, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %5 = fadd fast <4 x float> %4, %2
  %6 = fmul fast <12 x float> %wide.vec, %wide.vec
  ; Lanes 2, 5, 8, 11.
  %7 = shufflevector <12 x float> %6, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %8 = fadd fast <4 x float> %5, %7
  store <4 x float> %8, ptr %next.gep23, align 4
  ; Four results are produced per iteration; stop once 1024 have been written.
  %index.next = add i64 %index, 4
  %9 = icmp eq i64 %index.next, 1024
  br i1 %9, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; vld4: factor-4 de-interleaved load combined with a factor-2 interleaved
; store. A <16 x float> load is squared; stride-4 groups 0+1 and 2+3 are summed
; pairwise and the two <4 x float> sums are re-interleaved into one
; <8 x float> store. The expected assembly uses ld4 for the load and st2 for
; the store. %numSamples is not used; the loop bound is the constant 1024.
define void @vld4(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld4:
; CHECK:       .Lfunc_begin2:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB2_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
; CHECK-NEXT:    add x9, x1, x8
; CHECK-NEXT:    add x8, x8, #32
; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v4.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmul v5.4s, v2.4s, v2.4s
; CHECK-NEXT:    fmla v5.4s, v3.4s, v3.4s
; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x9]
; CHECK-NEXT:    b.ne .LBB2_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Source index is 4 * index; destination index (%1) is 2 * index.
  %0 = shl i64 %index, 2
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %1 = shl i64 %index, 1
  %wide.vec = load <16 x float>, ptr %next.gep, align 4
  ; Four textually identical wide squares, one per de-interleaving shuffle.
  %2 = fmul fast <16 x float> %wide.vec, %wide.vec
  %3 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %4 = fmul fast <16 x float> %wide.vec, %wide.vec
  %5 = shufflevector <16 x float> %4, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  ; Sum of groups 0 and 1.
  %6 = fadd fast <4 x float> %5, %3
  %7 = fmul fast <16 x float> %wide.vec, %wide.vec
  %8 = shufflevector <16 x float> %7, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %9 = fmul fast <16 x float> %wide.vec, %wide.vec
  %10 = shufflevector <16 x float> %9, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  ; Sum of groups 2 and 3.
  %11 = fadd fast <4 x float> %10, %8
  %12 = getelementptr inbounds float, ptr %pDst, i64 %1
  ; Alternate lanes of %6 and %11 (factor-2 interleave) before storing.
  %interleaved.vec = shufflevector <4 x float> %6, <4 x float> %11, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x float> %interleaved.vec, ptr %12, align 4
  %index.next = add i64 %index, 4
  %13 = icmp eq i64 %index.next, 1024
  br i1 %13, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; twosrc: like vld2, but the wide multiply takes its operands from two
; different interleaved <8 x float> loads (%pSrc and %pSrc2). The even-lane and
; odd-lane products are summed. The expected assembly issues one ld2 per source
; and combines them with fmul + fmla. %numSamples is not used; the loop bound
; is the constant 1024.
define void @twosrc(ptr nocapture readonly %pSrc, ptr nocapture readonly %pSrc2, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: twosrc:
; CHECK:       .Lfunc_begin3:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB3_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x9, x0, x8
; CHECK-NEXT:    add x10, x1, x8
; CHECK-NEXT:    add x8, x8, #32
; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x9]
; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
; CHECK-NEXT:    ld2 { v2.4s, v3.4s }, [x10]
; CHECK-NEXT:    fmul v4.4s, v2.4s, v0.4s
; CHECK-NEXT:    fmla v4.4s, v1.4s, v3.4s
; CHECK-NEXT:    str q4, [x2], #16
; CHECK-NEXT:    b.ne .LBB3_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Both sources are indexed at 2 * index; the destination at index.
  %0 = shl i64 %index, 1
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %1 = shl i64 %index, 1
  %next.gep23 = getelementptr float, ptr %pSrc2, i64 %1
  %next.gep24 = getelementptr float, ptr %pDst, i64 %index
  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  %wide.vec26 = load <8 x float>, ptr %next.gep23, align 4
  ; Lane-wise product of the two sources; %2 and %4 are textually identical.
  %2 = fmul fast <8 x float> %wide.vec26, %wide.vec
  ; Even lanes.
  %3 = shufflevector <8 x float> %2, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %4 = fmul fast <8 x float> %wide.vec26, %wide.vec
  ; Odd lanes.
  %5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %6 = fadd fast <4 x float> %5, %3
  store <4 x float> %6, ptr %next.gep24, align 4
  %index.next = add i64 %index, 4
  %7 = icmp eq i64 %index.next, 1024
  br i1 %7, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; vld2_multiuse: same computation as @vld2, except that a single wide fmul
; (%1) feeds both de-interleaving shuffles instead of each shuffle having its
; own multiply. The expected assembly is the same ld2 / fmul / fmla sequence.
; %numSamples is not used; the loop bound is the constant 1024.
define void @vld2_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld2_multiuse:
; CHECK:       .Lfunc_begin4:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB4_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x0], #32
; CHECK-NEXT:    fmul v2.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v2.4s, v1.4s, v1.4s
; CHECK-NEXT:    str q2, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB4_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; The source index is twice the destination index.
  %0 = shl i64 %index, 1
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %next.gep19 = getelementptr float, ptr %pDst, i64 %index
  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  ; One wide square with two shuffle users (%2 and %3).
  %1 = fmul fast <8 x float> %wide.vec, %wide.vec
  ; Even lanes.
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ; Odd lanes.
  %3 = shufflevector <8 x float> %1, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %4 = fadd fast <4 x float> %3, %2
  store <4 x float> %4, ptr %next.gep19, align 4
  %index.next = add i64 %index, 4
  %5 = icmp eq i64 %index.next, 1024
  br i1 %5, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; vld3_multiuse: same computation as @vld3, except that a single wide fmul
; (%1) feeds all three de-interleaving shuffles. The expected assembly is the
; same ld3 / fmul / fmla / fmla sequence. %numSamples is not used; the loop
; bound is the constant 1024.
define void @vld3_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld3_multiuse:
; CHECK:       .Lfunc_begin5:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB5_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
; CHECK-NEXT:    fmul v3.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v3.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmla v3.4s, v2.4s, v2.4s
; CHECK-NEXT:    str q3, [x1, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
; CHECK-NEXT:    b.ne .LBB5_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; The source index is three times the destination index.
  %0 = mul i64 %index, 3
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %next.gep23 = getelementptr float, ptr %pDst, i64 %index
  %wide.vec = load <12 x float>, ptr %next.gep, align 4
  ; One wide square with three shuffle users (%2, %3 and %5).
  %1 = fmul fast <12 x float> %wide.vec, %wide.vec
  %2 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %3 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %4 = fadd fast <4 x float> %3, %2
  %5 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %6 = fadd fast <4 x float> %4, %5
  store <4 x float> %6, ptr %next.gep23, align 4
  %index.next = add i64 %index, 4
  %7 = icmp eq i64 %index.next, 1024
  br i1 %7, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; vld4_multiuse: same computation as @vld4, except that a single wide fmul
; (%2) feeds all four de-interleaving shuffles. The expected assembly is the
; same ld4 ... st2 sequence. %numSamples is not used; the loop bound is the
; constant 1024.
define void @vld4_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: vld4_multiuse:
; CHECK:       .Lfunc_begin6:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB6_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
; CHECK-NEXT:    add x9, x1, x8
; CHECK-NEXT:    add x8, x8, #32
; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
; CHECK-NEXT:    fmla v4.4s, v1.4s, v1.4s
; CHECK-NEXT:    fmul v5.4s, v2.4s, v2.4s
; CHECK-NEXT:    fmla v5.4s, v3.4s, v3.4s
; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x9]
; CHECK-NEXT:    b.ne .LBB6_1
; CHECK-NEXT:  // %bb.2: // %while.end
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Source index is 4 * index; destination index (%1) is 2 * index.
  %0 = shl i64 %index, 2
  %next.gep = getelementptr float, ptr %pSrc, i64 %0
  %1 = shl i64 %index, 1
  %wide.vec = load <16 x float>, ptr %next.gep, align 4
  ; One wide square with four shuffle users (%3, %4, %6 and %7).
  %2 = fmul fast <16 x float> %wide.vec, %wide.vec
  %3 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %4 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  ; Sum of groups 0 and 1.
  %5 = fadd fast <4 x float> %4, %3
  %6 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %7 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  ; Sum of groups 2 and 3.
  %8 = fadd fast <4 x float> %7, %6
  %9 = getelementptr inbounds float, ptr %pDst, i64 %1
  ; Alternate lanes of %5 and %8 (factor-2 interleave) before storing.
  %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %8, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x float> %interleaved.vec, ptr %9, align 4
  %index.next = add i64 %index, 4
  %10 = icmp eq i64 %index.next, 1024
  br i1 %10, label %while.end, label %vector.body

while.end:                                        ; preds = %vector.body
  ret void
}

; This example has store(shuffle(shuffle(... that would be better to be treated
; as a single store. This avoids the vld2 for data that is already shuffled.
;
; Eight <8 x i16> rows are loaded from consecutive 16-byte slots of %a, shuffled
; in three stages, and two <4 x i32> results are stored back to slots 0 and 4.
; The expected assembly writes slot 0 with an st2 of the low halves and slot 4
; with a zip2 followed by a plain str.
define void @transpose_s16_8x8_simpler(ptr nocapture noundef %a) {
; CHECK-LABEL: transpose_s16_8x8_simpler:
; CHECK:       .Lfunc_begin7:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    ldp q0, q1, [x0]
; CHECK-NEXT:    ldp q2, q3, [x0, #64]
; CHECK-NEXT:    ldp q4, q5, [x0, #32]
; CHECK-NEXT:    ldp q6, q7, [x0, #96]
; CHECK-NEXT:    trn1 v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    trn1 v1.8h, v2.8h, v3.8h
; CHECK-NEXT:    trn1 v2.8h, v4.8h, v5.8h
; CHECK-NEXT:    trn1 v3.8h, v6.8h, v7.8h
; CHECK-NEXT:    trn1 v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v1.4s, v2.4s, v3.4s
; CHECK-NEXT:    zip2 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x0]
; CHECK-NEXT:    str q2, [x0, #64]
; CHECK-NEXT:    ret
entry:
  ; Stage 1: pair up rows (0,1), (2,3), (4,5), (6,7). Only i16 lanes 0/1 and
  ; 4/5 of each result are defined; the rest are undef.
  %0 = load <8 x i16>, ptr %a, align 16
  %arrayidx1 = getelementptr inbounds <8 x i16>, ptr %a, i64 1
  %1 = load <8 x i16>, ptr %arrayidx1, align 16
  %shuffle.i = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  %arrayidx2 = getelementptr inbounds <8 x i16>, ptr %a, i64 2
  %2 = load <8 x i16>, ptr %arrayidx2, align 16
  %arrayidx3 = getelementptr inbounds <8 x i16>, ptr %a, i64 3
  %3 = load <8 x i16>, ptr %arrayidx3, align 16
  %shuffle.i34 = shufflevector <8 x i16> %2, <8 x i16> %3, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  %arrayidx5 = getelementptr inbounds <8 x i16>, ptr %a, i64 4
  %4 = load <8 x i16>, ptr %arrayidx5, align 16
  %arrayidx6 = getelementptr inbounds <8 x i16>, ptr %a, i64 5
  %5 = load <8 x i16>, ptr %arrayidx6, align 16
  %shuffle.i35 = shufflevector <8 x i16> %4, <8 x i16> %5, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  %arrayidx8 = getelementptr inbounds <8 x i16>, ptr %a, i64 6
  %6 = load <8 x i16>, ptr %arrayidx8, align 16
  %arrayidx9 = getelementptr inbounds <8 x i16>, ptr %a, i64 7
  %7 = load <8 x i16>, ptr %arrayidx9, align 16
  %shuffle.i36 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  ; Stage 2: reinterpret as <4 x i32> and combine the even 32-bit lanes of the
  ; (0,1)/(4,5) pairs and of the (2,3)/(6,7) pairs.
  %8 = bitcast <8 x i16> %shuffle.i to <4 x i32>
  %9 = bitcast <8 x i16> %shuffle.i35 to <4 x i32>
  %shuffle.i37 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %10 = bitcast <8 x i16> %shuffle.i34 to <4 x i32>
  %11 = bitcast <8 x i16> %shuffle.i36 to <4 x i32>
  %shuffle.i38 = shufflevector <4 x i32> %10, <4 x i32> %11, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ; Stage 3: interleave the low halves (%vzip.i) and the high halves (%vzip1.i)
  ; of the two stage-2 vectors, then store each half separately.
  %vzip.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %vzip1.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  store <4 x i32> %vzip.i, ptr %a, align 16
  store <4 x i32> %vzip1.i, ptr %arrayidx5, align 16
  ret void
}

; Same as above with some different shuffles: the stage-2 masks are
; <1, 4, 2, 7> and <0, 5, 3, 6> instead of <0, 4, 2, 6>. The expected assembly
; builds the stage-2 vectors with lane moves, zip1 and uzp1, and still ends with
; an st2 of the low halves plus zip2 + str for the high halves.
define void @transpose_s16_8x8_simpler2(ptr nocapture noundef %a) {
; CHECK-LABEL: transpose_s16_8x8_simpler2:
; CHECK:       .Lfunc_begin8:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    ldp q0, q2, [x0]
; CHECK-NEXT:    ldp q3, q4, [x0, #64]
; CHECK-NEXT:    ldp q5, q6, [x0, #32]
; CHECK-NEXT:    ldp q7, q16, [x0, #96]
; CHECK-NEXT:    mov v0.h[5], v2.h[4]
; CHECK-NEXT:    zip1 v2.8h, v3.8h, v4.8h
; CHECK-NEXT:    zip1 v3.8h, v5.8h, v6.8h
; CHECK-NEXT:    mov v7.h[5], v16.h[4]
; CHECK-NEXT:    mov v0.s[1], v2.s[0]
; CHECK-NEXT:    uzp1 v1.4s, v3.4s, v7.4s
; CHECK-NEXT:    zip2 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x0]
; CHECK-NEXT:    str q2, [x0, #64]
; CHECK-NEXT:    ret
entry:
  ; Stage 1: pair up rows (0,1), (2,3), (4,5), (6,7). Only i16 lanes 0/1 and
  ; 4/5 of each result are defined; the rest are undef.
  %0 = load <8 x i16>, ptr %a, align 16
  %arrayidx1 = getelementptr inbounds <8 x i16>, ptr %a, i64 1
  %1 = load <8 x i16>, ptr %arrayidx1, align 16
  %shuffle.i = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  %arrayidx2 = getelementptr inbounds <8 x i16>, ptr %a, i64 2
  %2 = load <8 x i16>, ptr %arrayidx2, align 16
  %arrayidx3 = getelementptr inbounds <8 x i16>, ptr %a, i64 3
  %3 = load <8 x i16>, ptr %arrayidx3, align 16
  %shuffle.i34 = shufflevector <8 x i16> %2, <8 x i16> %3, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  %arrayidx5 = getelementptr inbounds <8 x i16>, ptr %a, i64 4
  %4 = load <8 x i16>, ptr %arrayidx5, align 16
  %arrayidx6 = getelementptr inbounds <8 x i16>, ptr %a, i64 5
  %5 = load <8 x i16>, ptr %arrayidx6, align 16
  %shuffle.i35 = shufflevector <8 x i16> %4, <8 x i16> %5, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  %arrayidx8 = getelementptr inbounds <8 x i16>, ptr %a, i64 6
  %6 = load <8 x i16>, ptr %arrayidx8, align 16
  %arrayidx9 = getelementptr inbounds <8 x i16>, ptr %a, i64 7
  %7 = load <8 x i16>, ptr %arrayidx9, align 16
  %shuffle.i36 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef>
  ; Stage 2: these masks also select 32-bit lanes 1 and 3 of the stage-1
  ; vectors, which stage 1 left undef.
  %8 = bitcast <8 x i16> %shuffle.i to <4 x i32>
  %9 = bitcast <8 x i16> %shuffle.i35 to <4 x i32>
  %shuffle.i37 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32> <i32 1, i32 4, i32 2, i32 7>
  %10 = bitcast <8 x i16> %shuffle.i34 to <4 x i32>
  %11 = bitcast <8 x i16> %shuffle.i36 to <4 x i32>
  %shuffle.i38 = shufflevector <4 x i32> %10, <4 x i32> %11, <4 x i32> <i32 0, i32 5, i32 3, i32 6>
  ; Stage 3: interleave the low halves (%vzip.i) and the high halves (%vzip1.i)
  ; of the two stage-2 vectors, then store each half separately.
  %vzip.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %vzip1.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  store <4 x i32> %vzip.i, ptr %a, align 16
  store <4 x i32> %vzip1.i, ptr %arrayidx5, align 16
  ret void
}


; transpose_s16_8x8: full three-stage shuffle network over eight <8 x i16>
; vectors, each passed through its own pointer argument (%0..%7) and written
; back through the same pointer. The expected assembly lowers the four
; low-half interleaves (stored to %0..%3) as st2 of 2s halves, and the four
; high-half interleaves (stored to %4..%7) as zip2 + str.
define void @transpose_s16_8x8(ptr nocapture noundef %0, ptr nocapture noundef %1, ptr nocapture noundef %2, ptr nocapture noundef %3, ptr nocapture noundef %4, ptr nocapture noundef %5, ptr nocapture noundef %6, ptr nocapture noundef %7) {
; CHECK-LABEL: transpose_s16_8x8:
; CHECK:       .Lfunc_begin9:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    ldr q3, [x4]
; CHECK-NEXT:    ldr q4, [x5]
; CHECK-NEXT:    ldr q2, [x2]
; CHECK-NEXT:    ldr q5, [x3]
; CHECK-NEXT:    trn1 v16.8h, v0.8h, v1.8h
; CHECK-NEXT:    trn2 v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ldr q6, [x6]
; CHECK-NEXT:    ldr q7, [x7]
; CHECK-NEXT:    trn1 v17.8h, v3.8h, v4.8h
; CHECK-NEXT:    trn2 v1.8h, v3.8h, v4.8h
; CHECK-NEXT:    trn1 v18.8h, v2.8h, v5.8h
; CHECK-NEXT:    trn2 v2.8h, v2.8h, v5.8h
; CHECK-NEXT:    trn1 v19.8h, v6.8h, v7.8h
; CHECK-NEXT:    trn2 v3.8h, v6.8h, v7.8h
; CHECK-NEXT:    trn1 v4.4s, v16.4s, v17.4s
; CHECK-NEXT:    trn1 v6.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn2 v16.4s, v16.4s, v17.4s
; CHECK-NEXT:    trn2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v5.4s, v18.4s, v19.4s
; CHECK-NEXT:    trn1 v7.4s, v2.4s, v3.4s
; CHECK-NEXT:    trn2 v17.4s, v18.4s, v19.4s
; CHECK-NEXT:    trn2 v1.4s, v2.4s, v3.4s
; CHECK-NEXT:    st2 { v4.2s, v5.2s }, [x0]
; CHECK-NEXT:    zip2 v2.4s, v4.4s, v5.4s
; CHECK-NEXT:    zip2 v3.4s, v6.4s, v7.4s
; CHECK-NEXT:    zip2 v4.4s, v16.4s, v17.4s
; CHECK-NEXT:    st2 { v6.2s, v7.2s }, [x1]
; CHECK-NEXT:    st2 { v16.2s, v17.2s }, [x2]
; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x3]
; CHECK-NEXT:    zip2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    str q2, [x4]
; CHECK-NEXT:    str q3, [x5]
; CHECK-NEXT:    str q4, [x6]
; CHECK-NEXT:    str q0, [x7]
; CHECK-NEXT:    ret
  ; Stage 1: for each adjacent pair of inputs, interleave the even i16 lanes
  ; (first mask) and the odd i16 lanes (second mask).
  %9 = load <8 x i16>, ptr %0, align 16
  %10 = load <8 x i16>, ptr %1, align 16
  %11 = shufflevector <8 x i16> %9, <8 x i16> %10, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %12 = shufflevector <8 x i16> %9, <8 x i16> %10, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %13 = load <8 x i16>, ptr %2, align 16
  %14 = load <8 x i16>, ptr %3, align 16
  %15 = shufflevector <8 x i16> %13, <8 x i16> %14, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %16 = shufflevector <8 x i16> %13, <8 x i16> %14, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %17 = load <8 x i16>, ptr %4, align 16
  %18 = load <8 x i16>, ptr %5, align 16
  %19 = shufflevector <8 x i16> %17, <8 x i16> %18, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %20 = shufflevector <8 x i16> %17, <8 x i16> %18, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %21 = load <8 x i16>, ptr %6, align 16
  %22 = load <8 x i16>, ptr %7, align 16
  %23 = shufflevector <8 x i16> %21, <8 x i16> %22, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %24 = shufflevector <8 x i16> %21, <8 x i16> %22, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; Stage 2: reinterpret as <4 x i32> and repeat the even/odd interleave on
  ; 32-bit lanes, pairing results from inputs (0,1) with (4,5) and from (2,3)
  ; with (6,7).
  %25 = bitcast <8 x i16> %11 to <4 x i32>
  %26 = bitcast <8 x i16> %19 to <4 x i32>
  %27 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %28 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %29 = bitcast <8 x i16> %12 to <4 x i32>
  %30 = bitcast <8 x i16> %20 to <4 x i32>
  %31 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %32 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %33 = bitcast <8 x i16> %15 to <4 x i32>
  %34 = bitcast <8 x i16> %23 to <4 x i32>
  %35 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %36 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %37 = bitcast <8 x i16> %16 to <4 x i32>
  %38 = bitcast <8 x i16> %24 to <4 x i32>
  %39 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %40 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ; Stage 3: interleave the low halves (mask 0,4,1,5) and the high halves
  ; (mask 2,6,3,7) of each stage-2 pair.
  %41 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %42 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %43 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %44 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %45 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %46 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %47 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %48 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ; Low-half results go to pointers %0..%3, high-half results to %4..%7.
  store <4 x i32> %41, ptr %0, align 16
  store <4 x i32> %43, ptr %1, align 16
  store <4 x i32> %45, ptr %2, align 16
  store <4 x i32> %47, ptr %3, align 16
  store <4 x i32> %42, ptr %4, align 16
  store <4 x i32> %44, ptr %5, align 16
  store <4 x i32> %46, ptr %6, align 16
  store <4 x i32> %48, ptr %7, align 16
  ret void
}

; transpose_s16_8x8_: the same three-stage shuffle network as
; @transpose_s16_8x8, but all eight <8 x i16> vectors live in consecutive
; 16-byte slots behind a single pointer and are updated in place. The expected
; assembly uses paired ldp/stp accesses with explicit zip1/zip2 for the final
; interleave. Every slot is accessed with align 16, for loads and stores alike.
define void @transpose_s16_8x8_(ptr nocapture noundef %0) {
; CHECK-LABEL: transpose_s16_8x8_:
; CHECK:       .Lfunc_begin10:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ldp q0, q1, [x0]
; CHECK-NEXT:    ldp q2, q3, [x0, #32]
; CHECK-NEXT:    ldp q4, q5, [x0, #64]
; CHECK-NEXT:    ldp q6, q7, [x0, #96]
; CHECK-NEXT:    trn1 v16.8h, v0.8h, v1.8h
; CHECK-NEXT:    trn2 v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    trn1 v1.8h, v2.8h, v3.8h
; CHECK-NEXT:    trn2 v2.8h, v2.8h, v3.8h
; CHECK-NEXT:    trn1 v17.8h, v4.8h, v5.8h
; CHECK-NEXT:    trn2 v3.8h, v4.8h, v5.8h
; CHECK-NEXT:    trn1 v18.8h, v6.8h, v7.8h
; CHECK-NEXT:    trn2 v4.8h, v6.8h, v7.8h
; CHECK-NEXT:    trn1 v5.4s, v16.4s, v17.4s
; CHECK-NEXT:    trn1 v7.4s, v0.4s, v3.4s
; CHECK-NEXT:    trn2 v16.4s, v16.4s, v17.4s
; CHECK-NEXT:    trn1 v6.4s, v1.4s, v18.4s
; CHECK-NEXT:    trn1 v19.4s, v2.4s, v4.4s
; CHECK-NEXT:    trn2 v1.4s, v1.4s, v18.4s
; CHECK-NEXT:    trn2 v0.4s, v0.4s, v3.4s
; CHECK-NEXT:    trn2 v2.4s, v2.4s, v4.4s
; CHECK-NEXT:    zip1 v3.4s, v5.4s, v6.4s
; CHECK-NEXT:    zip1 v4.4s, v7.4s, v19.4s
; CHECK-NEXT:    zip1 v17.4s, v16.4s, v1.4s
; CHECK-NEXT:    zip1 v18.4s, v0.4s, v2.4s
; CHECK-NEXT:    zip2 v5.4s, v5.4s, v6.4s
; CHECK-NEXT:    zip2 v1.4s, v16.4s, v1.4s
; CHECK-NEXT:    zip2 v0.4s, v0.4s, v2.4s
; CHECK-NEXT:    stp q3, q4, [x0]
; CHECK-NEXT:    zip2 v3.4s, v7.4s, v19.4s
; CHECK-NEXT:    stp q17, q18, [x0, #32]
; CHECK-NEXT:    stp q1, q0, [x0, #96]
; CHECK-NEXT:    stp q5, q3, [x0, #64]
; CHECK-NEXT:    ret
  ; Stage 1: for each adjacent pair of slots, interleave the even i16 lanes
  ; (first mask) and the odd i16 lanes (second mask).
  %2 = load <8 x i16>, ptr %0, align 16
  %3 = getelementptr inbounds <8 x i16>, ptr %0, i64 1
  ; Slot 1 uses align 16 like every other slot; the store back to %3 below
  ; already asserts that alignment.
  %4 = load <8 x i16>, ptr %3, align 16
  %5 = shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %6 = shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %7 = getelementptr inbounds <8 x i16>, ptr %0, i64 2
  %8 = load <8 x i16>, ptr %7, align 16
  %9 = getelementptr inbounds <8 x i16>, ptr %0, i64 3
  %10 = load <8 x i16>, ptr %9, align 16
  %11 = shufflevector <8 x i16> %8, <8 x i16> %10, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %12 = shufflevector <8 x i16> %8, <8 x i16> %10, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %13 = getelementptr inbounds <8 x i16>, ptr %0, i64 4
  %14 = load <8 x i16>, ptr %13, align 16
  %15 = getelementptr inbounds <8 x i16>, ptr %0, i64 5
  %16 = load <8 x i16>, ptr %15, align 16
  %17 = shufflevector <8 x i16> %14, <8 x i16> %16, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %18 = shufflevector <8 x i16> %14, <8 x i16> %16, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %19 = getelementptr inbounds <8 x i16>, ptr %0, i64 6
  %20 = load <8 x i16>, ptr %19, align 16
  %21 = getelementptr inbounds <8 x i16>, ptr %0, i64 7
  %22 = load <8 x i16>, ptr %21, align 16
  %23 = shufflevector <8 x i16> %20, <8 x i16> %22, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %24 = shufflevector <8 x i16> %20, <8 x i16> %22, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; Stage 2: reinterpret as <4 x i32> and repeat the even/odd interleave on
  ; 32-bit lanes, pairing results from slots (0,1) with (4,5) and from (2,3)
  ; with (6,7).
  %25 = bitcast <8 x i16> %5 to <4 x i32>
  %26 = bitcast <8 x i16> %17 to <4 x i32>
  %27 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %28 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %29 = bitcast <8 x i16> %6 to <4 x i32>
  %30 = bitcast <8 x i16> %18 to <4 x i32>
  %31 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %32 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %33 = bitcast <8 x i16> %11 to <4 x i32>
  %34 = bitcast <8 x i16> %23 to <4 x i32>
  %35 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %36 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %37 = bitcast <8 x i16> %12 to <4 x i32>
  %38 = bitcast <8 x i16> %24 to <4 x i32>
  %39 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %40 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ; Stage 3: interleave the low halves (mask 0,4,1,5) and the high halves
  ; (mask 2,6,3,7) of each stage-2 pair.
  %41 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %42 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %43 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %44 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %45 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %46 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %47 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %48 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ; Low-half results go to slots 0..3, high-half results to slots 4..7.
  store <4 x i32> %41, ptr %0, align 16
  store <4 x i32> %43, ptr %3, align 16
  store <4 x i32> %45, ptr %7, align 16
  store <4 x i32> %47, ptr %9, align 16
  store <4 x i32> %42, ptr %13, align 16
  store <4 x i32> %44, ptr %15, align 16
  store <4 x i32> %46, ptr %19, align 16
  store <4 x i32> %48, ptr %21, align 16
  ret void
}

; store_factor2: two shuffled <4 x i32> values are interleaved with factor 2
; into a single <8 x i32> store. The expected assembly materialises the two
; inner shuffles as trn1 and emits the interleaved store as one st2.
define void @store_factor2(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: store_factor2:
; CHECK:       .Lfunc_begin11:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v3.4s, v1.4s, v0.4s
; CHECK-NEXT:    st2 { v2.4s, v3.4s }, [x0]
; CHECK-NEXT:    ret
  ; Even lanes of (a0, a1) and of (a1, a0), alternated.
  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a0, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ; Factor-2 interleave of %v0 and %v1.
  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i32> %interleaved.vec, ptr %ptr, align 4
  ret void
}

; store_factor2_high: like @store_factor2, but the result is produced as two
; separate <4 x i32> stores to unrelated pointers. The first mask
; <0, 4, 1, 6> differs from an exact low-half interleave (<0, 4, 1, 5>) in its
; last element, and the expected assembly reflects that with zip1 followed by
; a trn1; the second store is an exact high-half interleave (zip2). No st2 is
; expected here.
define void @store_factor2_high(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: store_factor2_high:
; CHECK:       .Lfunc_begin12:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v0.4s, v1.4s, v0.4s
; CHECK-NEXT:    zip1 v1.4s, v2.4s, v0.4s
; CHECK-NEXT:    trn1 v1.4s, v1.4s, v0.4s
; CHECK-NEXT:    zip2 v0.4s, v2.4s, v0.4s
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a0, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ; Lanes v0[0], v1[0], v0[1], v1[2].
  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 4, i32 1, i32 6>
  ; Lanes v0[2], v1[2], v0[3], v1[3].
  %interleaved.vec2 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  store <4 x i32> %interleaved.vec, ptr %ptr, align 4
  store <4 x i32> %interleaved.vec2, ptr %ptr2, align 4
  ret void
}

; store_factor2_high2: same pair of stores as @store_factor2_high, but the
; shuffles are applied directly to the arguments with no inner shuffle stage.
; The expected assembly is zip1 + trn1 for the first store and zip2 for the
; second.
define void @store_factor2_high2(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: store_factor2_high2:
; CHECK:       .Lfunc_begin13:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    zip1 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    zip2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v2.4s, v2.4s, v1.4s
; CHECK-NEXT:    str q2, [x0]
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  ; Lanes a0[0], a1[0], a0[1], a1[2].
  %interleaved.vec = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 6>
  ; Lanes a0[2], a1[2], a0[3], a1[3].
  %interleaved.vec2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  store <4 x i32> %interleaved.vec, ptr %ptr, align 4
  store <4 x i32> %interleaved.vec2, ptr %ptr2, align 4
  ret void
}

; Factor-3 interleaved store. Three <4 x i32> values are each produced by a
; shuffle with mask <0, 5, 3, 6>, widened to 8 lanes (%s0 concatenates %v0 and
; %v1; %s1 pads %v2 with four unspecified lanes), and then interleaved with the
; mask <0, 4, 8, 1, 5, 9, ...> into a single <12 x i32> store. The expected
; assembly materialises the three shuffles and emits one st3.
define void @store_factor3(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: store_factor3:
; CHECK:       .Lfunc_begin14:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ext v3.16b, v0.16b, v1.16b, #12
; CHECK-NEXT:    ext v6.16b, v1.16b, v2.16b, #12
; CHECK-NEXT:    zip2 v3.4s, v0.4s, v3.4s
; CHECK-NEXT:    mov v3.s[0], v0.s[0]
; CHECK-NEXT:    ext v0.16b, v2.16b, v0.16b, #12
; CHECK-NEXT:    zip2 v4.4s, v1.4s, v6.4s
; CHECK-NEXT:    mov v4.s[0], v1.s[0]
; CHECK-NEXT:    zip2 v5.4s, v2.4s, v0.4s
; CHECK-NEXT:    mov v5.s[0], v2.s[0]
; CHECK-NEXT:    st3 { v3.4s, v4.4s, v5.4s }, [x0]
; CHECK-NEXT:    ret
  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 5, i32 3, i32 6>
  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 5, i32 3, i32 6>
  %v2 = shufflevector <4 x i32> %a2, <4 x i32> %a0, <4 x i32> <i32 0, i32 5, i32 3, i32 6>
  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; The upper four lanes are padding only: the interleave mask below never
  ; selects indices 12-15.
  %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i32> %interleaved.vec, ptr %ptr, align 4
  ret void
}

; Factor-4 interleaved store. Four <4 x i32> values are each produced by a
; shuffle with mask <0, 4, 2, 6>, concatenated pairwise into two <8 x i32>
; vectors (%s0, %s1), and then interleaved with the mask
; <0, 4, 8, 12, 1, 5, 9, 13, ...> into a single <16 x i32> store. The expected
; assembly is four trn1 instructions feeding one st4.
define void @store_factor4(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; CHECK-LABEL: store_factor4:
; CHECK:       .Lfunc_begin15:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    trn1 v4.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v5.4s, v1.4s, v2.4s
; CHECK-NEXT:    trn1 v6.4s, v2.4s, v3.4s
; CHECK-NEXT:    trn1 v7.4s, v3.4s, v0.4s
; CHECK-NEXT:    st4 { v4.4s, v5.4s, v6.4s, v7.4s }, [x0]
; CHECK-NEXT:    ret
  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %v2 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %v3 = shufflevector <4 x i32> %a3, <4 x i32> %a0, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i32> %interleaved.vec, ptr %ptr, align 4
  ret void
}

; Two zip-style shuffles of %a against a constant vector, stored to adjacent
; 16-byte slots, with an llvm.dbg.value call placed between the two stores.
; The lanes of the constant operand that each mask selects are all zero (the
; remaining lanes are poison). The expected assembly is a zeroing movi,
; zip1/zip2, and a single stp of both results.
define void @debuginfo(ptr nocapture noundef writeonly %buf, <8 x i16> noundef %a) {
; CHECK-LABEL: debuginfo:
; CHECK:       .Lfunc_begin16:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    movi v1.2d, #0000000000000000
; CHECK-NEXT:    zip1 v2.8h, v0.8h, v1.8h
; CHECK-NEXT:    zip2 v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    stp q2, q0, [x0]
; CHECK-NEXT:    ret
entry:
  ; Low half: lanes 0-3 of %a interleaved with lanes 0-3 (zeros) of the constant.
  %vzip.i = shufflevector <8 x i16> %a, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison>, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ; High half: lanes 4-7 of %a interleaved with lanes 4-7 (zeros) of the constant.
  %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 0, i16 0, i16 0, i16 0>, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <8 x i16> %vzip.i, ptr %buf, align 4
  ; Debug use of the second shuffle result, sitting between the two stores.
  call void @llvm.dbg.value(metadata <8 x i16> %vzip1.i, metadata !21, metadata !DIExpression()), !dbg !23
  ; Four i32 elements = 16 bytes past %buf, i.e. directly after the first store.
  %add.ptr = getelementptr inbounds i32, ptr %buf, i64 4
  store <8 x i16> %vzip1.i, ptr %add.ptr, align 4
  ret void
}

; Debug intrinsic called from @debuginfo.
declare void @llvm.dbg.value(metadata, metadata, metadata)

; Module-level debug-info roots: the compile unit list and the module flags.
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!6, !7, !8, !9, !10, !11}

; !0-!5: compile unit, source file, retained types, and the base type and
; subrange used by the vector type below.
!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, retainedTypes: !2, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "a64.c", directory: "", checksumkind: CSK_MD5, checksum: "a1a236fb20d703d1ea5963e75545b91a")
!2 = !{!15}
!3 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!4 = !{!5}
!5 = !DISubrange(count: 8)
; !6-!11: the module flags listed in !llvm.module.flags.
!6 = !{i32 7, !"Dwarf Version", i32 5}
!7 = !{i32 2, !"Debug Info Version", i32 3}
!8 = !{i32 1, !"wchar_size", i32 4}
!9 = !{i32 7, !"uwtable", i32 2}
!10 = !{i32 7, !"frame-pointer", i32 1}
!11 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
; !12-!18: subroutine type of !19 and the "int16x8_t" vector type (8 x short,
; 128 bits) used as the type of the local variable !21.
!12 = !DISubroutineType(types: !13)
!13 = !{null, !14, !15}
!14 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !3, size: 64)
!15 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !16)
!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "int16x8_t", file: !1, line: 57, baseType: !17)
!17 = !DICompositeType(tag: DW_TAG_array_type, baseType: !18, size: 128, flags: DIFlagVector, elements: !4)
!18 = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed)
; !19-!23: subprogram, its retained local variable "__s1" (!21), the lexical
; block that scopes it (!22), and the location (!23) attached to the
; llvm.dbg.value call in @debuginfo.
; NOTE(review): the define line of @debuginfo carries no !dbg attachment, so
; !19 is reached only through the scope chain !23 -> !22 -> !19; confirm that
; this is intended.
!19 = distinct !DISubprogram(name: "store_s16q_to_tran_low_", scope: !1, file: !1, line: 13, type: !12, scopeLine: 13, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !20)
!20 = !{!21}
!21 = !DILocalVariable(name: "__s1", scope: !22, file: !1, line: 16, type: !16)
!22 = distinct !DILexicalBlock(scope: !19, file: !1, line: 16, column: 3)
!23 = !DILocation(line: 0, scope: !22)
