//    Licensed to the Apache Software Foundation (ASF) under one or more
//    contributor license agreements.  See the NOTICE file distributed with
//    this work for additional information regarding copyright ownership.
//    The ASF licenses this file to You under the Apache License, Version 2.0
//    (the "License"); you may not use this file except in compliance with
//    the License.  You may obtain a copy of the License at
// 
//      http://www.apache.org/licenses/LICENSE-2.0
// 
//   Unless required by applicable law or agreed to in writing, software
//   distributed under the License is distributed on an "AS IS" BASIS,
//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//   See the License for the specific language governing permissions and
//   limitations under the License.
//
// Author: Salikh Zakirov
// 
// Assembly code needed to interface JIT-ed code with C code.
//

// memset:	function to set a number of bytes to a char value
	
	.section .text
// -- Begin  memset
//
// void *memset(void *s, int c, size_t n)
//
// In:	r32 = s (destination), r33 = c (fill value), r34 = n (byte count)
// Out:	r8  = original value of s
// Written along the way:
//	r11, r16, r17, r21, r22, r24, r31, r32, r34, p7-p15.
//	ar.lc is copied to r11 at Is_aligned and restored before both returns
//	that follow it; the short (n < 16) paths never touch ar.lc.
//
// Layout:
//	n < 16	  Short / Align_short: predicated straight-line st8/st4/st2/st1
//	n >= 16	  Not_short: Align_long first advances s to a 16-byte boundary,
//		  then Is_aligned picks one of
//		    r33 == 0  stf.spill of f0, 16 bytes per store (Zero_*)
//		    r33 != 0  st8 of the replicated fill byte     (Nonzero_*)
//
// NOTE(review): all length compares are signed (cmp.le/cmp.ge), so a length
// with bit 63 set takes the early-exit in the short path.
//
	.proc  memset#
	.align 32
// Replicate the fill byte into all eight bytes of r31 using mux1 @brcst,
// then fall through to the short (<16 bytes) code.
// live out:	r8 (return value), r21 (s & 7), r22 (s & 15),
//		r31 (replicated c), r32(s), r33(c), r34(n)
	.global memset#
	.prologue
memset:
	mov	r8=r32			// Return value
	cmp.le	p14=16,r34		// p14: n >= 16, take the long path
	and	r22=0xF,r32		// Spec test for 16-byte boundary
	and	r21=7,r32		// Spec test for 8-byte boundary
	mux1	r31=r33,@brcst		// Replicate byte value
 (p14)	br.cond.dpnt	Not_short
	;;
// Handle short values quickly
	cmp.ne	p15=0,r21		// If zero, skip alignment
	cmp.le	p11,p10=8,r34		// Spec test for st8 safety
	tbit.nz	p13,p12=r32,0		// Spec test for st1 alignment
	cmp.ge	p14=0,r34		// Spec test for early exit (n <= 0)
 (p14)	br.ret.dpnt	b0
 (p15)	br.cond.dpnt	Align_short
	;;
// Entry condition: r32 is 8-byte aligned (or too few bytes remain for the
// wider stores), and p11/p10 is set/clear if we need to do the st8.
// Each length test compares against (store size + next store size) because
// it issues in the same group as the add that shrinks r34, so it still sees
// the old count.
Short:
{ .mmi
	.pred.rel "mutex",p10,p11
 (p11)	st8	[r32]=r31,8
 (p11)	cmp.le	p13,p12=12,r34		// >= 4 left after the st8?
 (p10)	cmp.le	p13,p12=4,r34		// >= 4 left with no st8?
} { .mmi
	.pred.rel "mutex",p12,p13
 (p11)	add	r34=-8,r34
	;;
 (p13)	st4	[r32]=r31,4
 (p13)	cmp.le	p11,p10=6,r34		// >= 2 left after the st4?
} { .mii
 (p12)	cmp.le	p11,p10=2,r34		// >= 2 left with no st4?
	.pred.rel "mutex",p10,p11
 (p13)	add	r34=-4,r34
	;;
 (p11)	cmp.le	p13=3,r34		// >= 1 left after the st2?
} { .mii
 (p11)	st2	[r32]=r31,2
 (p10)	cmp.le	p13=1,r34		// >= 1 left with no st2?
	;;
} { .mib
 (p13)	st1	[r32]=r31
	br.ret.sptk	b0
	;;
}
// Align, while taking care not to exceed length
// Similar to aligned code above, but adds an alignment test to length test:
// a store is issued only when enough bytes remain AND the matching address
// bit is set.  Ends by jumping to Short with p11/p10 prepared for its st8.
Align_short:
{ .mmi
	.pred.rel "mutex",p12,p13
 (p13)	st1	[r32]=r33,1		// odd address: one byte to reach 2-byte boundary
 (p13)	cmp.le	p11,p10=3,r34		// >= 2 left after the st1?
 (p12)	cmp.le	p11,p10=2,r34		// >= 2 left with no st1?
} { .mii
 (p13)	add	r34=-1,r34
	;;
 (p11)	tbit.nz	p11,p10=r32,1		// length is OK, are we on 2-byte boundary?
	;;
} { .mmi
	.pred.rel "mutex",p10,p11
 (p11)	st2	[r32]=r31,2
 (p10)	cmp.le	p13,p12=4,r34		// >= 4 left with no st2?
 (p11)	cmp.le	p13,p12=6,r34		// >= 4 left after the st2?
} { .mmi
 (p11)	add	r34=-2,r34
	;;
 (p13)	tbit.nz	p13,p12=r32,2		// length is OK, is address bit 2 set?
	;;
} { .mmi
	.pred.rel "mutex",p12,p13
 (p13)	st4	[r32]=r31,4
 (p12)	cmp.le	p11,p10=8,r34		// >= 8 left with no st4?
 (p13)	cmp.le	p11,p10=12,r34		// >= 8 left after the st4?
} { .mib
 (p13)	add	r34=-4,r34
 	br.cond.sptk	Short
	;;
}	
// Code for lengths >= 16
// If we're not on a 16-byte boundary, move to one
// live out: r31 (replicated c), r33(unsigned c), r32(s), r34(unsigned n)
Not_short:
	cmp.ne	p15=0,r22		//0: Low 4 bits zero?
	cmp.ne	p11,p10=0,r33		// NOTE(review): appears unused; every path below rewrites p10/p11 before reading them
	tbit.nz	p13,p12=r32,0		// Spec test for st1 alignment
  (p15)	br.cond.dpnt	Align_long
	;;
// OK, it's long, it's aligned to a 16-byte boundary.
// If r33 is not zero, skip to st8 code, otherwise fall into spill f0 version
// live out:	r11 (saved ar.lc), r16 (s+128, line-start store pointer),
//		r24 (n), r32 (s), r34 (n)
Is_aligned:
	cmp.ne	p14=0,r33		// Check value of fill character
	add	r16=128,r32	// prefetch pointer
	.save	ar.lc,r11,t01
[t01:]	mov	r11=ar.lc
	mov	r24=r34
  (p14)	br.cond.dpnt	Nonzero
	;;
//
// Version when memset is clearing memory
//
	.body
	add	r17=16,r32	// second spill pointer
	cmp.le	p13=32,r34	// Spec for first set of spills
	cmp.ge	p14=127,r34	// p14: n < 128, no loop needed
	and	r24=127,r34	// bytes left over after whole 128-byte lines
	mov	r21=144		// = 128+16, length needed for second prefetch
 (p14)	br.cond.dpnt		Zero_medium
	;;			// stop: r21 written above is read by the cmp.le/add below
//
/// Enter loop code when length is at least 128
/// Prefetch each line with a spill: r16 runs ahead of r32/r17 and stores the
/// first 16 bytes of each following 128-byte line, guarded by p9 so that it
/// never writes past s+n (r21 tracks the length needed for the next one).
///
	stf.spill	[r32]=f0,32	// first 16 bytes of line 0; r32 -> s+32
	cmp.le		p9=r21,r34
	shr.u		r22=r34,7	// line size is 128
	add		r21=128,r21	// next prefetch safe length
	;;
 (p9)	stf.spill	[r16]=f0,128
	cmp.le		p9=r21,r34
	add		r21=128,r21	// next prefetch safe length
	add		r22=-1,r22	// Loop count
	;;
	mov		ar.lc=r22
 (p9)	stf.spill	[r16]=f0,128
	cmp.le		p9=r21,r34
	add		r21=128,r21	// next prefetch safe length
	;;
 (p9)	stf.spill	[r16]=f0,128
	cmp.le		p9=r21,r34
	add		r21=128,r21	// next prefetch safe length
	;;
 (p9)	stf.spill	[r16]=f0,128
	cmp.le		p9=r21,r34
	add		r21=128,r21	// next prefetch safe length
	;;
 (p9)	stf.spill	[r16]=f0,128
	cmp.le		p9=r21,r34
	add		r21=128,r21	// next prefetch safe length
	;;
// Counted loop storing 128 bytes/iteration,
/// with out-of-order spills causing line prefetch.
// Per line: r17 writes offsets 16,48,80,112; r32 writes 32,64,96; offset 0
// was already written (line 0 by the first spill above, later lines via r16).
// The last r32 store advances by 64 so r32 lands on offset 32 of the next line.
// live out:	r11(ar.lc), r17(tail+16), r24(n&127), r32(tail+32),
//              r34(n), p13(n&127 >= 32)
Zero_loop:
 (p9)	stf.spill	[r16]=f0,128
	stf.spill	[r17]=f0,32
	cmp.le		p9=r21,r34
	;;
	stf.spill	[r32]=f0,32
	stf.spill	[r17]=f0,32
	add		r21=128,r21	// next prefetch safe length
	;;
	stf.spill	[r32]=f0,32
	stf.spill	[r17]=f0,32
	cmp.le		p13=32,r24	// Spec for Zero_medium
	;;
	stf.spill	[r32]=f0,64
	stf.spill	[r17]=f0,32
	br.cloop.sptk	Zero_loop
	;;
	add		r32=-32,r32	// back up to the start of the unwritten tail
	;;
// Fewer than 128 bytes remain (count in r24); r32 = tail, r17 = tail+16.
// Store 32 bytes at a time while the count allows, then peel 16/8/4/2/1
// according to the low bits of r24.
Zero_medium:
 (p13)	stf.spill	[r32]=f0,32	// Redundant if entered from loop path
 (p13)	stf.spill	[r17]=f0,32
	cmp.le		p12=64,r24
	;;
 (p12)	stf.spill	[r32]=f0,32
 (p12)	stf.spill	[r17]=f0,32
	cmp.le		p11=96,r24
	;;
 (p11)	stf.spill	[r32]=f0,32
 (p11)	stf.spill	[r17]=f0,32
	tbit.nz		p10=r24,4
	;;
 (p10)	stf.spill	[r32]=f0,16
	tbit.nz		p9=r24,3
	;;
 (p9)	st8		[r32]=r0,8
	tbit.nz		p13=r24,2
	;;
// 
// Clean up any partial word stores.
//	
	tbit.nz		p12=r24,1
 (p13)	st4		[r32]=r0,4
	;;
 (p12)	st2		[r32]=r0,2
	tbit.nz		p11=r24,0
	;;
 (p11)	st1		[r32]=r0,1
	mov		ar.lc=r11	// restore caller's loop count
	br.ret.sptk.many	b0
	;;
//
// Fill character is not zero
// Now that p is aligned to a 16-byte boundary
//     use straight-line code for n<MINIMUM_LONG, a loop otherwise
// live in:	r16 (s+128), r24 (n), r31 (replicated c), r32 (s), r34 (n)
// live out:	r8 (return value, original value of r32)
//		p14 (n<MINIMUM_LONG), p13 (n>=16)
//
Nonzero:
	MINIMUM_LONG=0x40
	add	r17=8,r32		//0: second pointer
	mov	r21=136		// = 128+8, length needed for second prefetch
	add	r22=64,r34	// May need extra 1/2 iteration
	cmp.le	p13=16,r34	// Spec for use when loop is skipped
	cmp.gt	p14=MINIMUM_LONG,r34
 (p14)	br.cond.dpnt	Nonzero_medium
	;;
//
/// Enter loop code when length is at least MINIMUM_LONG (64)
/// Prefetch each line with a st8: r16 runs ahead and stores the first 8 bytes
/// of each following 128-byte line, guarded by p9 so it stays inside s+n.
///
	st8		[r32]=r31,16	// first 8 bytes of line 0; r32 -> s+16
	cmp.le		p9=r21,r34
	shr.u		r22=r22,7	// line size is 128
	add		r21=128,r21	// next prefetch safe length
	;;
 (p9)	st8		[r16]=r31,128
	add		r22=-1,r22	// Loop count
	cmp.le		p9=r21,r34
	add		r21=128,r21	// next prefetch safe length
	;;
	mov		ar.lc=r22
 (p9)	st8		[r16]=r31,128
	cmp.le		p9=r21,r34
	add		r21=128,r21	// next prefetch safe length
	;;
 (p9)	st8		[r16]=r31,128
	cmp.le		p9=r21,r34
	add		r21=128,r21	// next prefetch safe length
	;;
 (p9)	st8		[r16]=r31,128
	cmp.le		p9=r21,r34
	add		r21=128,r21	// next prefetch safe length
	;;
 (p9)	st8		[r16]=r31,128
	cmp.le		p9=r21,r34
	add		r21=128,r21	// next prefetch safe length
	;;
// Counted loop storing 128 bytes/iteration (p10), or 64 bytes on a final
// half iteration (p11), with out-of-order st8 stores causing line prefetch.
// r17 writes offsets 8,24,...; r32 writes offsets 16,32,...; offset 0 of each
// line is written through r16 (line 0 by the first st8 above).
// live out:	r11(ar.lc), r17, r24(bytes still to store, < 64), r32,
//              r31(replicated c), r34(n), p10 (last iteration was a full one),
//              p13(r24 >= 16)
Nonzero_loop:
 (p9)	st8		[r16]=r31,128
	st8		[r17]=r31,16
	cmp.lt		p10,p11=127,r24	// should we store the last 64?
	;;
	st8		[r32]=r31,16
	st8		[r17]=r31,16
 (p10)	add		r24=-128,r24	// Update count of remaining bytes
	;;
	st8		[r32]=r31,16
	st8		[r17]=r31,16
 (p11)	add		r24=-64,r24	// Update count of remaining bytes
	;;
	st8		[r32]=r31,16
	st8		[r17]=r31,16
	cmp.le		p9=r21,r34	// Compare prefetch offset with length
	;;
 (p10)	st8		[r32]=r31,16
 (p10)	st8		[r17]=r31,16
	add		r21=128,r21	// next prefetch-safe length
	;;
 (p10)	st8		[r32]=r31,16
 (p10)	st8		[r17]=r31,16
	cmp.le		p13=16,r24	// Spec for epilog
	;;
 (p10)	st8		[r32]=r31,16
 (p10)	st8		[r17]=r31,16
	;;
// The last r32 store of a full iteration advances by 32 instead of 16,
// stepping over the 8 bytes at the start of the next line that are
// written through r16.
 (p10)	st8		[r32]=r31,32
 (p10)	st8		[r17]=r31,16
	br.cloop.sptk	Nonzero_loop
	;;
 (p10)	add	r32=-16,r32		// after a full last iteration, back up to the start of the tail
	;;
// Short memsets are done with predicated straightline code
// Fewer than 64 bytes remain (count in r24); r32 = tail, r17 = tail+8.
// live out:	r8 (return value, original value of r32)
Nonzero_medium:
 (p13)	st8	[r32]=r31,16
 (p13)	st8	[r17]=r31,16
	cmp.le	p12=0x20,r24		//0: 32 <= n?
	;;
 (p12)	st8	[r32]=r31,16
 (p12)	st8	[r17]=r31,16
	cmp.le	p11=0x30,r24		//0: 48 <= n?
	;;
 (p11)	st8	[r32]=r31,16
 (p11)	st8	[r17]=r31,16
	tbit.nz	p10=r24,3
	;;
 (p10)	st8	[r32]=r31,8
	tbit.nz	p9=r24,2
	;;
// 
// Clean up any partial word stores.
//	
	tbit.nz	p8=r24,1
 (p9)	st4	[r32]=r31,4
	;;
 (p8)	st2	[r32]=r31,2
	tbit.nz	p7=r24,0
	;;
 (p7)	st1	[r32]=r31,1
	mov	ar.lc=r11		// restore caller's loop count
	br.ret.sptk.many	b0
	;;
// n >= 16 but s is not 16-byte aligned: store 1, 2, 4 and 8 bytes as the
// corresponding address bits require (p13 on entry = address bit 0).
// Afterwards, go to Short if fewer than 16 bytes are left (p11/p10 set up
// for its st8), otherwise to Is_aligned.
Align_long:
 (p13)	st1	[r32]=r33,1
 (p13)	add	r34=-1,r34
	;;
	tbit.nz	p13=r32,1
	;;
 (p13)	st2	[r32]=r31,2
 (p13)	add	r34=-2,r34
	;;
	tbit.nz	p13=r32,2
	;;
 (p13)	st4	[r32]=r31,4
 (p13)	add	r34=-4,r34
	;;
	tbit.nz	p13,p12=r32,3
	;;
 (p13)	st8	[r32]=r31,8
 (p13)	add	r34=-8,r34
	;;
 	cmp.le	p11,p10=8,r34		// Spec for entry to Short
 	cmp.le	p13,p12=16,r34	
 (p12)	br.cond.dpnt	Short
	br.cond.dptk	Is_aligned
	;;
//
// -- End  memset
	.endp  memset#
// End