U
    jg                  
   @   s  d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d
dlmZ d dlmZ d dlmZmZmZ d dlm Z m!Z!m"Z" e
 Z#e#j$Z$e#j%Z&e#j'Z'dd Z(e&e)eddd Z*e&e)eddd Z+e&e)eddd Z,e&e)eddd Z-e&e)eddd Z.e&e d d!d" Z/e&e d#d$d% Z0e&e d&d'd( Z1e$ej2j3d)d* Z4e$d+e!d,d- Z5e$ej6j7ej8d.d/ Z9d a:d0d1 Z;e$ej<j=ej>ej?d2d3 Z@e$ej<j=ejAej?e$ej<j=ejBej?d4d5 ZCe$ejDj=ej>ej?d6d7 ZEe$ejDj=ejAej?e$ejDj=ejBej?d8d9 ZFe$ejGd:d; ZHe$ejId<d= ZJe$ejKd>d? ZLe$ejMd@dA ZNe$ejMejOdBdC ZPe$ejQejOejOejOejOejOe$ejQejOejOejRejOejOe$ejQejOejOejSejOejOe$ejQejOejOejTejOejOdDdE ZUe$ejVejOejOejWdFdG ZXe$ejYejOejOe$ejYejOejRe$ejYejOejSe$ejYejOejTdHdI ZZe$ej[ejOejOe$ej[ejOejRe$ej[ejOejSe$ej[ejOejTdJdK Z\e$ej]dLdM Z^e$ej_dNdO Z`e$ejaej?dPdQ Zbe$ejcej?ej?ej?dRdS ZddTdU ZeeejfejgdVdW ZheejgejfdXdY ZidZd[ Zjeejfejkd\d] Zleejkejfeej>ejfd^d_ Zmd`da Znenejojpdb enejqdb enejrdb enejojsdc enejtdc enejudc enejojvdd enejwdd enejxdd e$ejojyejfdedf Zze$ej{ejfdgdh Z|e$ejoj}ejfdidj Z~e$eejfdkdl Ze$ejojejfejfejfdmdn Ze$ejejfejfe$ejejfejfdodp ZdqZdrds Ze$ejojejfejfedt e$ejejfejfedt e$ejojejfejfedu e$ejejfejfedu e$ejojejfejfedv e$ejejfejfedv e$ejojejfejfedw e$ejejfejfedw e$ejojejfejfedx e$ejejfejfedx e$ejojejfejfedy e$ejejfejfedy dzd{ Zeejojd|dw eejojd}dy ejd~ejdiZe$ejeje$ejejdd Ze$ejejdd Ze$ejejdd Ze$ejej?dd Ze$ejejOe$ejejdd Ze$ejejRe$ejejdd Ze$ejej?ej?ej?dd Ze$eejSejSdd Ze$eejTejSe$eejSejTe$eejTejTdd Ze$eejSejSdd Ze$eejTejSe$eejSejTe$eejTejTdd Ze$eejSe$eejTdd Ze$eejSejke$eejTejkdd Zdd Zejd Zdej Ze$ejejSee e$ejejTee e$ejejSee e$ejejTee dd Zdd Ze$ejjqej8ejej?e$ejjqej8ejBej?e$ejjqej8ejAej?edd Ze$ejjtej8ejej?e$ejjtej8ejBej?e$ejjtej8ejAej?edd Ze$ejjej8ejej?e$ejjej8ejBej?e$ejjej8ejAej?edd Ze$ejjej8ejej?e$ejjej8ejBej?e$ejjej8ejAej?edd Zdd Zeejjd eejjd eejjd e$ejjej8ejej?e$ejjej8ejBej?e$ejjej8ejAej?edd Ze$ejjej8ejej?e$ejjej8ejAej?e$ejjej8ejBej?edd Ze$ejjej8ejej?e$ejjej8ejAej?e$ejjej8ejBej?edd Ze$ejjej8ejej?e$ejjej8ejAej?e$ejjej8ejBej?edd Ze$ejjej8ejej?e$ejjej8ejAej?e$ejjej8ejBej?edd Ze$ejjej8ej?ej?dd Ze$ejjej8ejej?ej?e$ejjej8ejAej?ej?e$ejjej8ejBej?ej?dd Ze$ejej҃dd ZdddZe'e"dd Zee֡ e$ dS )    )reduceN)ir)Registry
lower_cast)parse_dtype)models)typescgutils)ufunc_db)register_ufuncs   )nvvm)cuda)	nvvmutilsstubserrors)dim3
grid_groupCUDADispatcherc                 C   sB   t | d| }t | d| }t | d| }t| |||fS )Nz%s.xz%s.yz%s.z)r   	call_sregr	   Zpack_struct)builderprefixxyz r   7/tmp/pip-unpacked-wheel-qtpwf23r/numba/cuda/cudaimpl.pyinitialize_dim3   s    r   Z	threadIdxc                 C   s
   t |dS )Ntidr   contextr   sigargsr   r   r   cuda_threadIdx    s    r$   ZblockDimc                 C   s
   t |dS )NZntidr   r    r   r   r   cuda_blockDim%   s    r%   ZblockIdxc                 C   s
   t |dS )NZctaidr   r    r   r   r   cuda_blockIdx*   s    r&   ZgridDimc                 C   s
   t |dS )NZnctaidr   r    r   r   r   cuda_gridDim/   s    r'   laneidc                 C   s   t |dS )Nr(   )r   r   r    r   r   r   cuda_laneid4   s    r)   r   c                 C   s   | |dS Nr   extract_valuer    r   r   r   dim3_x9   s    r-   r   c                 C   s   | |dS Nr   r+   r    r   r   r   dim3_y>   s    r/   r   c                 C   s   | |dS )N   r+   r    r   r   r   dim3_zC   s    r1   c                 C   s(   |  tjd}|j}|t||fS r.   )get_constantr   int32modulecallr   Z declare_cudaCGGetIntrinsicHandle)r!   r   r"   r#   Zonelmodr   r   r   cg_this_gridH   s    r7   zGridGroup.syncc                 C   s,   |  tjd}|j}|t|||fS r*   )r2   r   r3   r4   r5   r   Zdeclare_cudaCGSynchronize)r!   r   r"   r#   flagsr6   r   r   r   ptx_sync_groupQ   s    r9   c                 C   s   |d S r*   r   r    r   r   r   cuda_const_array_like\   s    r:   c                 C   s   t d7 a d| t S )zDue to bug with NVVM invalid internalizing of shared memory in the
    PTX output.  We can't mark shared memory to be internal. We have to
    ensure unique name is generated for shared memory symbol.
    r   z{0}_{1})_unique_smem_idformatnamer   r   r   _get_unique_smem_idf   s    r?   c              	   C   s8   |j d j}t|j d }t| ||f|tdtjddS )Nr   r   _cudapy_smemTshapedtypesymbol_name	addrspacecan_dynsized)r#   literal_valuer   _generic_arrayr?   r   ADDRSPACE_SHAREDr!   r   r"   r#   lengthrC   r   r   r   cuda_shared_array_integerp   s    rL   c              	   C   s>   dd |j d D }t|j d }t| |||tdtjddS )Nc                 S   s   g | ]
}|j qS r   rG   .0sr   r   r   
<listcomp>}   s     z+cuda_shared_array_tuple.<locals>.<listcomp>r   r   r@   TrA   )r#   r   rH   r?   r   rI   r!   r   r"   r#   rB   rC   r   r   r   cuda_shared_array_tuplez   s    
rS   c              	   C   s4   |j d j}t|j d }t| ||f|dtjddS )Nr   r   _cudapy_lmemFrA   )r#   rG   r   rH   r   ADDRSPACE_LOCALrJ   r   r   r   cuda_local_array_integer   s    rV   c              	   C   s:   dd |j d D }t|j d }t| |||dtjddS )Nc                 S   s   g | ]
}|j qS r   rM   rN   r   r   r   rQ      s     z(ptx_lmem_alloc_array.<locals>.<listcomp>r   r   rT   FrA   )r#   r   rH   r   rU   rR   r   r   r   ptx_lmem_alloc_array   s    
rW   c                 C   sD   |rt d}|j}tt d}t|||}||d |  S )Nzllvm.nvvm.membar.ctar   	AssertionErrorr4   r   FunctionTypeVoidTyper	   get_or_insert_functionr5   get_dummy_valuer!   r   r"   r#   fnamer6   fntysyncr   r   r   ptx_threadfence_block   s    rb   c                 C   sD   |rt d}|j}tt d}t|||}||d |  S )Nzllvm.nvvm.membar.sysr   rX   r^   r   r   r   ptx_threadfence_system   s    rc   c                 C   sD   |rt d}|j}tt d}t|||}||d |  S )Nzllvm.nvvm.membar.glr   rX   r^   r   r   r   ptx_threadfence_device   s    rd   c                 C   s*   |  tjd}ttj}t| |||gS )Nl    )r2   r   r3   noneptx_syncwarp_mask)r!   r   r"   r#   maskZmask_sigr   r   r   ptx_syncwarp   s    rh   c                 C   sD   d}|j }tt tdf}t|||}||| |  S )Nzllvm.nvvm.bar.warp.sync    )	r4   r   rZ   r[   IntTyper	   r\   r5   r]   r^   r   r   r   rf      s    rf   c              
   C   s  |\}}}}}|j d }	|	tjkr6||t|	j}d}
|j}tt	tdtdftdtdtdtdtdf}t
|||
}|	jdkr|||||||f}|	tjkr||d}||d}||t }t
|||f}n||td}||| tjd}||td}|||||||f}|||||||f}||d}||d}||d}||td}||td}||| tjd}|||}|	tjkr||t }t
|||f}|S )a  
    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
    function supports both 32 and 64 bit ints and floats, so for feature parity,
    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
    an int, then shuffling, then bitcasting back. And 64-bit values by packing
    them into 2 32bit values, shuffling thoose, and then packing back together.
    r0   zllvm.nvvm.shfl.sync.i32ri   r   r   @   )r#   r   real_domainbitcastr   rj   bitwidthr4   rZ   LiteralStructTyper	   r\   r5   float32r,   	FloatTypeZmake_anonymous_structtruncZlshrr2   i8zextZshlor_float64
DoubleType)r!   r   r"   r#   rg   modevalueindexclampZ
value_typer_   r6   r`   funcretrvpredZfvZvalue1Z
value_lshrZvalue2Zret1Zret2Zrv1Zrv2Zrv1_64Zrv2_64Zrv_shlr   r   r   ptx_shfl_sync_i32   sJ    

 

r   c                 C   s^   d}|j }tttdtdftdtdtdf}t|||}|||S )Nzllvm.nvvm.vote.syncri   r   )r4   r   rZ   ro   rj   r	   r\   r5   )r!   r   r"   r#   r_   r6   r`   r|   r   r   r   ptx_vote_sync  s    r   c                 C   s   |\}}|j d j}|j d tjkr6||t|}d|}|j}t	tdtdt|f}	t
||	|}
||
||fS )Nr   zllvm.nvvm.match.any.sync.i{}ri   )r#   rn   r   rl   rm   r   rj   r<   r4   rZ   r	   r\   r5   r!   r   r"   r#   rg   ry   widthr_   r6   r`   r|   r   r   r   ptx_match_any_sync  s    
"r   c                 C   s   |\}}|j d j}|j d tjkr6||t|}d|}|j}t	t
tdtdftdt|f}	t||	|}
||
||fS )Nr   zllvm.nvvm.match.all.sync.i{}ri   )r#   rn   r   rl   rm   r   rj   r<   r4   rZ   ro   r	   r\   r5   r   r   r   r   ptx_match_all_sync  s    
r   c                 C   s,   t jt t dg dddd}||g S )Nri   zactivemask.b32 $0;=rTZside_effectr   	InlineAsmrZ   rj   r5   r!   r   r"   r#   
activemaskr   r   r   ptx_activemask/  s      r   c                 C   s,   t jt t dg dddd}||g S )Nri   zmov.u32 $0, %lanemask_lt;r   Tr   r   r   r   r   r   ptx_lanemask_lt6  s     r   c                 C   s   | |d S r*   )Zctpopr    r   r   r   ptx_popc>  s    r   c                 C   s
   |j | S N)fmar    r   r   r   ptx_fmaC  s    r   c                 C   sD   ddd}z
||  W S  t k
r>   d|  d}t|Y nX d S )N)Zf32f)Zf64d)ri   rk   z$Conversion between float16 and float unsupportedKeyErrorr   ZCudaLoweringErrorrn   typemapmsgr   r   r   float16_float_ty_constraintH  s    

r   c           	      C   sd   |j |j kr|S t|j \}}t| |tdg}t|d| dd| d}|||gS )N   zcvt..f16 $0, $1;=,h)rn   r   r   rZ   get_value_typerj   r   r5   	r!   r   fromtytotyvalty
constraintr`   asmr   r   r   float16_to_float_castR  s    r   c           	      C   sb   |j |j kr|S t|j \}}ttd| |g}t|d| dd| }|||gS )Nr   cvt.rn.f16. $0, $1;=h,)rn   r   r   rZ   rj   r   r   r5   r   r   r   r   float_to_float16_cast^  s    r   c                 C   sH   ddddd}z
||  W S  t k
rB   d|  d}t|Y nX d S )Nchrl)   r   ri   rk   z"Conversion between float16 and intr   r   r   r   r   r   float16_int_constraintj  s    
r   c           
      C   sf   |j }t|}|jrdnd}t| |tdg}t|d| | dd| d}	||	|gS )NrP   ur   zcvt.rni.r   r   r   )	rn   r   signedr   rZ   r   rj   r   r5   
r!   r   r   r   r   rn   r   Z
signednessr`   r   r   r   r   float16_to_integer_castt  s    
r   c           
      C   sd   |j }t|}|jrdnd}ttd| |g}t|d| | dd| }	||	|gS )NrP   r   r   r   r   r   )	rn   r   r   r   rZ   rj   r   r   r5   r   r   r   r   integer_to_float16_cast  s    
r   c                    s    t | tjtj fdd}d S )Nc                    sB   t t dt dt dg}t |  dd}|||S )Nr   z.f16 $0,$1,$2;=h,h,hr   rZ   rj   r   r5   r!   r   r"   r#   r`   r   opr   r   ptx_fp16_binary  s
    z*lower_fp16_binary.<locals>.ptx_fp16_binarylowerr   float16)fnr   r   r   r   r   lower_fp16_binary  s    r   addsubmulc                 C   s4   t t dt dg}t |dd}|||S )Nr   zneg.f16 $0, $1;=h,hr   r   r   r   r   ptx_fp16_hneg  s    r   c                 C   s   t | |||S r   )r   r    r   r   r   operator_hneg  s    r   c                 C   s4   t t dt dg}t |dd}|||S )Nr   zabs.f16 $0, $1;r   r   r   r   r   r   ptx_fp16_habs  s    r   c                 C   s   t | |||S r   )r   r    r   r   r   operator_habs  s    r   c                 C   sH   t dt dt dg}t t d|}t |dd}|||S )Nr   zfma.rn.f16 $0,$1,$2,$3;z=h,h,h,h)r   rj   rZ   r   r5   )r!   r   r"   r#   Zargtysr`   r   r   r   r   ptx_hfma  s    r   c                 C   s   dd }|  ||||S )Nc                 S   s   t j| |S r   )r   fp16Zhdiv)r   r   r   r   r   fp16_div  s    zfp16_div_impl.<locals>.fp16_divZcompile_internal)r!   r   r"   r#   r   r   r   r   fp16_div_impl  s    r   z{{
          .reg .pred __$$f16_cmp_tmp;
          setp.{op}.f16 __$$f16_cmp_tmp, $1, $2;
          selp.u16 $0, 1, 0, __$$f16_cmp_tmp;
        }}c                    s    fdd}|S )Nc           	         sr   t t dt dt dg}t |tj dd}|||}| tj	d}|
|t d}|d||S )Nr   r   r   r   z!=)r   rZ   rj   r   	_fp16_cmpr<   r5   r2   r   Zint16rm   Zicmp_unsigned)	r!   r   r"   r#   r`   r   resultZzeroZ
int_resultr   r   r   ptx_fp16_comparison  s    "z*_gen_fp16_cmp.<locals>.ptx_fp16_comparisonr   )r   r   r   r   r   _gen_fp16_cmp  s    r   eqnegegtleltc                    s    t | tjtj fdd}d S )Nc                    s(   t  | |||}|||d |d S )Nr   r   )r   select)r!   r   r"   r#   choicer   r   r   ptx_fp16_minmax  s    z*lower_fp16_minmax.<locals>.ptx_fp16_minmaxr   )r   r_   r   r   r   r   r   lower_fp16_minmax  s    r   maxminZ
__nv_cbrtfZ	__nv_cbrtc           
      C   sF   |j }t| }| |}|j}t||g}t|||}	||	|S r   )	return_type
cbrt_funcsr   r4   r   rZ   r	   r\   r5   )
r!   r   r"   r#   r   r_   Zftyr6   r`   r   r   r   r   ptx_cbrt  s    
r   c              	   C   s2   t |jttdtdfd}|||S )Nri   Z	__nv_brevr	   r\   r4   r   rZ   rj   r5   r!   r   r"   r#   r   r   r   r   ptx_brev_u4  s    r   c              	   C   s2   t |jttdtdfd}|||S )Nrk   Z__nv_brevllr   r   r   r   r   ptx_brev_u8  s    r   c                 C   s   | |d | tjdS r*   )Zctlzr2   r   booleanr    r   r   r   ptx_clz'  s    r   c              	   C   s2   t |jttdtdfd}|||S )Nri   Z__nv_ffsr   r   r   r   r   
ptx_ffs_32.  s    r   c              	   C   s2   t |jttdtdfd}|||S )Nri   rk   Z
__nv_ffsllr   r   r   r   r   
ptx_ffs_648  s    r   c                 C   s   |\}}}| |||S r   )r   )r!   r   r"   r#   testabr   r   r   ptx_selpB  s    
r   c              	   C   s4   t |jtt t t fd}|||S )NZ
__nv_fmaxfr	   r\   r4   r   rZ   rq   r5   r   r   r   r   
ptx_max_f4H  s    r   c              
   C   sh   t |jtt t t fd}||| ||d |jd t	j
| ||d |jd t	j
gS )NZ	__nv_fmaxr   r   r	   r\   r4   r   rZ   rw   r5   castr#   r   doubler   r   r   r   
ptx_max_f8S  s    r   c              	   C   s4   t |jtt t t fd}|||S )NZ
__nv_fminfr   r   r   r   r   
ptx_min_f4d  s    r   c              
   C   sh   t |jtt t t fd}||| ||d |jd t	j
| ||d |jd t	j
gS )NZ	__nv_fminr   r   r   r   r   r   r   
ptx_min_f8o  s    r   c              	   C   sJ   t |jttdt fd}||| ||d |j	d t
jgS )Nrk   Z__nv_llrintr   )r	   r\   r4   r   rZ   rj   rw   r5   r   r#   r   r   r   r   r   r   	ptx_round  s    r   c                 C   s   dd }|  ||||S )Nc                 S   s   t | st | r| S |dkrb|dkr:d|d  }d}nd| }d}| | | }t |rt| S nd|  }| | }t|}t || dkrdt|d  }|dkr|| | }n||9 }|S )Nr      g      $@gMDg      ?g      ?g       @)mathisinfisnanroundfabs)r   ndigitsZpow1Zpow2r   r   r   r   r   round_ndigits  s(    

z$round_to_impl.<locals>.round_ndigitsr   )r!   r   r"   r#   r   r   r   r   round_to_impl  s    !r   c                    s    fdd}|S )Nc                    s$   |j \}| | }|||d S r*   )r#   r2   Zfmul)r!   r   r"   r#   ZargtyZfactorconstr   r   impl  s    zgen_deg_rad.<locals>.implr   )r  r  r   r   r   gen_deg_rad  s    r  g     f@c                    s   |t jkr t j|dd}|g}ntj |t|d} fddt||D }|j}||krltd||f |j	t|krtd|j	t|f ||fS )z4
    Convert integer indices into tuple of intp
    r   )rC   count)r  c                    s"   g | ]\}}  ||tjqS r   )r   r   intp)rO   tir   r!   r   r   rQ     s   z&_normalize_indices.<locals>.<listcomp>zexpect %s but got %sz#indexing %d-D array with %d-D index)
r   Zinteger_domainUniTupler	   Zunpack_tuplelenziprC   	TypeErrorndim)r!   r   indtyindsarytyvaltyindicesrC   r   r  r   _normalize_indices  s    
r  c                    s    fdd}|S )Nc                    sj   |j \}}}|\}}}	|j}
t| |||||\}}| || ||}tj| ||||dd} | ||
||	S )NTZ
wraparound)r#   rC   r  
make_arrayr	   get_item_pointer)r!   r   r"   r#   r  r  r  aryr  r   rC   r  laryptrdispatch_fnr   r   imp  s    

 z_atomic_dispatcher.<locals>.impr   )r  r  r   r  r   _atomic_dispatcher  s    r  c                 C   s`   |t jkr&|j}|t|||fS |t jkrL|j}|t|||fS |d||dS d S )Nr   	monotonic)	r   rp   r4   r5   r   Zdeclare_atomic_add_float32rv   Zdeclare_atomic_add_float64
atomic_rmwr!   r   rC   r  r   r6   r   r   r   ptx_atomic_add_tuple  s    

r!  c                 C   s`   |t jkr&|j}|t|||fS |t jkrL|j}|t|||fS |d||dS d S )Nr   r  )	r   rp   r4   r5   r   Zdeclare_atomic_sub_float32rv   Zdeclare_atomic_sub_float64r  r   r   r   r   ptx_atomic_sub  s    

r"  c                 C   sP   |t jjkr<|j}|j}ttd| }|||||fS td| dd S )NZdeclare_atomic_inc_intzUnimplemented atomic inc with  array	r   cudadeclZunsigned_int_numba_typesrn   r4   getattrr   r5   r  r!   r   rC   r  r   Zbwr6   r   r   r   r   ptx_atomic_inc  s    r(  c                 C   sP   |t jjkr<|j}|j}ttd| }|||||fS td| dd S )NZdeclare_atomic_dec_intzUnimplemented atomic dec with r#  r$  r'  r   r   r   ptx_atomic_dec"  s    r)  c                    s@   t  fdd}tjtjtjfD ]}t| tj|tj| q d S )Nc                    s6   |t jjkr| ||dS td  d| dd S )Nr  zUnimplemented atomic z with r#  r   r%  integer_numba_typesr  r  r!   r   rC   r  r   r   r   r   impl_ptx_atomic1  s    z+ptx_atomic_bitwise.<locals>.impl_ptx_atomic)r  r   r  r	  Tupler   ArrayAny)Zstubr   r-  r   r   r   r   ptx_atomic_bitwise0  s    r1  andorxorc                 C   s0   |t jjkr|d||dS td| dd S )NZxchgr  zUnimplemented atomic exch with r#  r*  r,  r   r   r   ptx_atomic_exchA  s    r5  c                 C   s   |j }|tjkr&|t|||fS |tjkrF|t|||fS |tjtj	fkrh|j
d||ddS |tjtjfkr|j
d||ddS td| d S Nr   r  ZorderingZumaxz&Unimplemented atomic max with %s array)r4   r   rv   r5   r   Zdeclare_atomic_max_float64rp   Zdeclare_atomic_max_float32r3   int64r  uint32uint64r  r   r   r   r   ptx_atomic_maxL  s    

r;  c                 C   s   |j }|tjkr&|t|||fS |tjkrF|t|||fS |tjtj	fkrh|j
d||ddS |tjtjfkr|j
d||ddS td| d S Nr   r  r7  Zuminz&Unimplemented atomic min with %s array)r4   r   rv   r5   r   Zdeclare_atomic_min_float64rp   Zdeclare_atomic_min_float32r3   r8  r  r9  r:  r  r   r   r   r   ptx_atomic_min`  s    

r=  c                 C   s   |j }|tjkr&|t|||fS |tjkrF|t|||fS |tjtj	fkrh|j
d||ddS |tjtjfkr|j
d||ddS td| d S r6  )r4   r   rv   r5   r   Zdeclare_atomic_nanmax_float64rp   Zdeclare_atomic_nanmax_float32r3   r8  r  r9  r:  r  r   r   r   r   ptx_atomic_nanmaxt  s    

r>  c                 C   s   |j }|tjkr&|t|||fS |tjkrF|t|||fS |tjtj	fkrh|j
d||ddS |tjtjfkr|j
d||ddS td| d S r<  )r4   r   rv   r5   r   Zdeclare_atomic_nanmin_float64rp   Zdeclare_atomic_nanmin_float32r3   r8  r  r9  r:  r  r   r   r   r   ptx_atomic_nanmin  s    

r?  c                 C   sT   | |jd tj|jd |jd }|d | tjd|d |d f}t| |||S )Nr   r   r0   )r   r#   r   r  r2   ptx_atomic_casr    r   r   r   ptx_atomic_compare_and_swap  s    $"rA  c                 C   s   |j \}}}}|\}}	}
}t| |||	||\}}| || ||}tj| ||||dd}|jtjjkr|j	}|jj
}t|||||
|S td|j d S )NTr  z&Unimplemented atomic cas with %s array)r#   r  r  r	   r  rC   r   r%  r+  r4   rn   r   Zatomic_cmpxchgr  )r!   r   r"   r#   r  r  Zoldtyr  r  r  oldr   r  r  r  r6   rn   r   r   r   r@    s    r@  c                 C   s@   t jt t  t dgdddd}|d }|||g d S )Nri   znanosleep.u32 $0;r   Tr   r   )r   r   rZ   r[   rj   r5   )r!   r   r"   r#   	nanosleepnsr   r   r   ptx_nanosleep  s      rE  Fc               	      sf  t tj|d}|dko$|o$t|dk}|dkr:|s:td j| }	t|tjtj	fpjt|	t
jpj|tjk}
|tjkr|
std|  |}t||}|tjkrtj|||d}nh|j}t||||} |}d|d  > |_|rd|_nt|tj|_||t t!dd}t"#t$ j%} |}|&|}|}g }t't(|D ]\}}|)| ||9 }qXd	d
 t(|D } fdd
|D }|rtj*t+t!dg dddd}|,|-|g t!d} .tj/|}|0||g}n fdd
|D }t|}tj1||dd} 2| |} j3||4||j5j6|| .tj/|d d |7 S )Nr   r   zarray length <= 0zunsupported type: %sr=   Zexternalr   Zgenericc                 S   s   g | ]}|qS r   r   rN   r   r   r   rQ     s     z"_generic_array.<locals>.<listcomp>c                    s   g | ]}  tj|qS r   r2   r   r  rN   r!   r   r   rQ     s     ri   zmov.u32 $0, %dynamic_smem_size;r   Tr   rk   c                    s   g | ]}  tj|qS r   rF  rN   rG  r   r   rQ     s     C)rC   r  Zlayout)datarB   stridesitemsizeZmeminfo)8r   operatorr   r
  
ValueErrorZdata_model_manager
isinstancer   ZRecordBooleanr   ZStructModelr   Znumber_domainr  Zget_data_typer   Z	ArrayTyper   rU   r	   Zalloca_oncer4   Zadd_global_variableZget_abi_sizeof
bit_lengthalignlinkageConstant	UndefinedZinitializerZaddrspacecastZPointerTyperj   llZcreate_target_dataZNVVMZdata_layoutZget_abi_size	enumeratereversedappendr   rZ   rt   r5   r2   r  Zudivr/  r  Zpopulate_arrayrm   rI  typeZ	_getvalue) r!   r   rB   rC   rD   rE   rF   Z	elemcountZdynamic_smemZ
data_modelZother_supported_typeZlldtypeZlarytyZdataptrr6   ZgvmemrQ  Z
targetdatarK  Z
laststrideZrstridesr  ZlastsizerJ  ZkstridesZget_dynshared_sizeZdynsmem_sizeZ	kitemsizeZkshaper  r  r  r   rG  r   rH     sz    








 rH   c                 C   s   |   S r   )r]   )r!   r   r   Zpyvalr   r   r   cuda_dispatcher_const*  s    rZ  )F)	functoolsr   rL  r   Zllvmliter   Zllvmlite.bindingZbindingrU  Znumba.core.imputilsr   r   Znumba.core.typing.npydeclr   Znumba.core.datamodelr   Z
numba.corer   r	   Znumba.npr
   Znumba.np.npyimplr   Zcudadrvr   Znumbar   Z
numba.cudar   r   r   Znumba.cuda.typesr   r   r   registryr   Zlower_getattrZ
lower_attrZlower_constantr   Moduler$   r%   r&   r'   r)   r-   r/   r1   ZcgZ	this_gridr7   r9   r  Z
array_liker/  r:   r;   r?   ZsharedarrayZIntegerLiteralr0  rL   r.  r	  rS   localrV   rW   Zthreadfence_blockrb   Zthreadfence_systemrc   Zthreadfencerd   Zsyncwarprh   i4rf   Zshfl_sync_intrinsicrs   Zf4Zf8r   Zvote_sync_intrinsicr   r   Zmatch_any_syncr   Zmatch_all_syncr   r   r   Zlanemask_ltr   Zpopcr   r   r   r   r   ZFloatr   r   r   ZIntegerr   r   r   r   Zhaddr   iaddZhsubr   isubZhmulr   imulZhnegr   negr   Zhabsr   absr   Zhfmar   truedivitruedivr   r   r   Zheqr   hner   Zhger   Zhgtr   hler   Zhltr   r   ZhmaxZhminrp   rv   r   Zcbrtr   ZbrevZu4r   u8r   Zclzr   Zffsr   r   Zselpr   r   r   r   r   r   r   r   r   r   r  piZ_deg2radZ_rad2degradiansdegreesr  r  Zatomicr  r!  r"  incr(  decr)  r1  and_ru   r4  Zexchr5  r;  r=  Znanmaxr>  Znanminr?  Zcompare_and_swaprA  Zcasr@  rC  r9  rE  rH   rZ  Z
get_ufuncsr   r   r   r   <module>   s  














		
		







	.










	





  










%





 
d
