; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)

declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)

; GCN-LABEL: {{^}}image_bvh_intersect_ray:
; GCN: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]{{$}}
; Arguments are flattened to represent the actual VGPR_A layout, so we have no
; extra moves in the generated kernel.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
main_body:
  %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
  %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
  %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
  %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
  %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
  %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
  %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
}

; GCN-LABEL: {{^}}image_bvh_intersect_ray_a16:
; GCN: image_bvh_intersect_ray v[0:3], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
main_body:
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; GCN-LABEL: {{^}}image_bvh64_intersect_ray:
; GCN:  image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]{{$}}
; Arguments are flattened to represent the actual VGPR_A layout, so we have no
; extra moves in the generated kernel.
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
main_body:
  %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
  %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
  %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
  %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
  %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
  %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
  %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
  %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
}

; GCN-LABEL: {{^}}image_bvh64_intersect_ray_a16:
; GCN: image_bvh64_intersect_ray v[0:3], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
main_body:
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; TODO: NSA reassign is very limited and cannot work with VGPR tuples and subregs.

; GCN-LABEL: {{^}}image_bvh_intersect_ray_nsa_reassign:
; GCN: image_bvh_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
  %node_ptr = load i32, i32* %gep_node_ptr, align 4
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
  %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
  %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
  %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

; GCN-LABEL: {{^}}image_bvh_intersect_ray_a16_nsa_reassign:
; GCN: image_bvh_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
  %node_ptr = load i32, i32* %gep_node_ptr, align 4
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
  %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
  %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
  %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
  %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
  %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

; GCN-LABEL: {{^}}image_bvh64_intersect_ray_nsa_reassign:
; GCN: image_bvh64_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
  %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
  %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
  %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

; GCN-LABEL: {{^}}image_bvh64_intersect_ray_a16_nsa_reassign:
; GCN: image_bvh64_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
  %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
  %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
  %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
  %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
  %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()