1 #ifndef MSHADOW_TENSOR_SSE_INL_HPP
2 #define MSHADOW_TENSOR_SSE_INL_HPP
27 pitch = ((lspace+15) >> 4) << 4;
29 void * res = _aligned_malloc( pitch*num_line, 16 );
32 void *res = malloc( pitch * num_line );
34 void * res = memalign( 16, pitch*num_line );
53 return !(pitch & ((1<<4)-1));
65 return (( (size*fsize+15) >> 4 ) << 4) / fsize;
73 return (( (size*fsize) >> 4 ) << 4) / fsize;
80 #include <emmintrin.h>
/*!
 * \brief float vector real type, used for vectorization
 * \tparam FloatType scalar type the vector is built from; this primary
 *         template is intentionally empty
 */
88 template<
typename FloatType>
struct FVec{};
/*! \brief construct from an existing DType value, which is copied into data_ */
102 FVec( DType data ):data_(data){}
104 FVec(
const float &s ){
105 data_ = _mm_set1_ps( s );
109 data_ = _mm_load_ps( src );
113 inline void Store(
float *dst )
const{
114 return _mm_store_ps( dst, data_ );
117 inline float Sum(
void )
const{
118 DType ans = _mm_add_ps( data_, _mm_movehl_ps( data_, data_ ) );
119 DType rst = _mm_add_ss( ans, _mm_shuffle_ps( ans, ans, 1 ) );
120 #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64)
121 return rst.m128_f32[ 0 ];
123 float rr = _mm_cvtss_f32( rst ) ;
133 typedef __m128d DType;
/*! \brief construct from an existing DType (__m128d) value, which is copied into data_ */
141 FVec( DType data ):data_(data){}
143 FVec(
const double &s ){
144 data_ = _mm_set1_pd( s );
148 data_ = _mm_load_pd( src );
152 inline void Store(
double *dst )
const{
153 return _mm_store_pd( dst, data_ );
156 inline double Sum(
void )
const{
157 DType tmp = _mm_add_sd( data_, _mm_unpackhi_pd( data_,data_ ) ) ;
158 #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64)
159 return tmp.m128d_f64[0];
161 double ans = _mm_cvtsd_f64( tmp );
170 template<
typename OP>
172 const static bool kEnabled =
false;
176 const static bool kEnabled =
true;
186 const static bool kEnabled =
true;
196 const static bool kEnabled =
true;
206 const static bool kEnabled =
true;
217 const static bool kEnabled =
true;
229 template<
typename SV,
typename TFloat>
237 template<
typename TFloat>
249 template<
typename ExpType>
260 template <
typename Device,
int dim>
264 :dptr_(t.
dptr),stride_(t.
shape.stride_){}
269 return dptr_[ y * stride_ + x ];
290 template<
typename OP,
typename TA,
typename TB,
int etype>
294 :lhs_(lhs), rhs_(rhs){}
299 return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) );
306 template<
typename OP,
typename TA,
int etype>
314 return OP::Map( src_.Eval( y, x ) );
320 template<
typename OP,
typename TA,
typename TB,
int etype>
328 inline SSEPlan<T> MakeSSEPlan(
const ContainerExp<T> &e ){
329 return SSEPlan<T>( e.self() );
332 template<
typename T,
int dim>
333 inline SSEPlan<T> MakeSSEPlan(
const MakeTensorExp<T,cpu,dim> &e ){
334 return SSEPlan<T>( e.real_self() );
337 template<
typename OP,
typename TA,
int etype>
338 inline SSEPlan< UnaryMapExp<OP,TA,etype> > MakeSSEPlan(
const UnaryMapExp<OP,TA,etype> &e ){
339 return SSEPlan< UnaryMapExp<OP,TA,etype> >( MakeSSEPlan(e.src_) );
342 template<
typename OP,
typename TA,
typename TB,
int etype>
343 inline SSEPlan< BinaryMapExp<OP,TA,TB,etype> > MakeSSEPlan(
const BinaryMapExp<OP,TA,TB,etype> &e ){
344 return SSEPlan< BinaryMapExp<OP,TA,TB,etype> >( MakeSSEPlan(e.lhs_), MakeSSEPlan(e.rhs_) );
358 const static bool kPass =
false;
362 const static bool kPass =
true;
366 const static bool kPass =
true;
369 template<
typename OP,
typename TA,
int etype>
373 template<
typename OP,
typename TA,
typename TB,
int etype>
380 template<
int dim,
typename E>
382 inline static bool Check(
const E &exp ){
388 inline static bool Check(
const ScalarExp &exp ){
398 template<
int dim,
typename OP,
typename TA,
int etype>
404 template<
int dim,
typename OP,
typename TA,
typename TB,
int etype>
416 template<
typename SV,
typename E,
int dim>
421 for(
index_t x = 0; x < xlen; x += sse2::FVec<real_t>::kSize ){
425 SV::Save( dst[y][x], plan.Eval(y,x) );
430 #endif // MSHADOW_USE_SSE
431 #endif // MSHADOW_TENSOR_SSE_INL_HPP
void MapSSEPlan(Tensor< cpu, dim > _dst, const expr::SSEPlan< E > &plan)
use SSEPlan to compute result
Definition: tensor_sse-inl.hpp:417
unsigned index_t
type that will be used for index
Definition: tensor_base.h:123
bool CheckAlign(size_t pitch)
check if a pitch is aligned to 16 bytes
Definition: tensor_sse-inl.hpp:52
void * AlignedMallocPitch(size_t &pitch, size_t lspace, size_t num_line)
analogous to cudaMallocPitch: allocates an aligned space with num_line * lspace cells
Definition: tensor_sse-inl.hpp:26
float Sum(void) const
sum of all content
Definition: tensor_sse-inl.hpp:117
binary map expression lhs [op] rhs
Definition: tensor_expr.h:225
void Assert(bool exp)
assert that an expression is true
Definition: tensor_base.h:285
Definition: tensor_sse-inl.hpp:277
void Store(float *dst) const
store data into dst space
Definition: tensor_sse-inl.hpp:113
static check of whether SSE is enabled: if an expression E cannot be evaluated using SSE, then kPass = false ...
Definition: tensor_sse-inl.hpp:357
float real_t
type that will be used for content
Definition: tensor_base.h:118
float vector real type, used for vectorization
Definition: tensor_sse-inl.hpp:88
index_t LowerAlign(index_t size, size_t fsize)
get lower bound of aligned index of size
Definition: tensor_sse-inl.hpp:72
sse2 operator type of certain operator
Definition: tensor_sse-inl.hpp:171
header file of tensor data structure and functions. Convention: this lib requires explicit memory alloc...
device name CPU
Definition: tensor.h:185
const TA & src_
source expression
Definition: tensor_expr.h:342
DType data_
data content
Definition: tensor_sse-inl.hpp:137
Definition: tensor_sse-inl.hpp:250
DType data_
data content
Definition: tensor_sse-inl.hpp:98
vector real type for float
Definition: tensor_sse-inl.hpp:92
MSHADOW_CINLINE sse2::FVec< real_t > EvalSSE(index_t y, index_t x) const
evaluate the expression at index [y][x]; x will be aligned to 4. To be implemented by SubType ...
FVec(const double *src)
load from pointer src
Definition: tensor_sse-inl.hpp:147
Definition: tensor_sse-inl.hpp:230
real_t * dptr
pointer to the data
Definition: tensor.h:215
MSHADOW_XINLINE Tensor< Device, 2 > FlatTo2D(void) const
flatten the tensor to 2 dimension, collapse the higher dimensions together
Definition: tensor.h:229
unary map expression op(src)
Definition: tensor_expr.h:340
FVec(const float *src)
load from pointer src
Definition: tensor_sse-inl.hpp:108
Shape< dimension > shape
shape of the tensor
Definition: tensor.h:217
#define MSHADOW_CINLINE
cpu force inline
Definition: tensor_base.h:101
scalar expression
Definition: tensor_expr.h:62
vector real type for double
Definition: tensor_sse-inl.hpp:131
const TA & lhs_
left operand
Definition: tensor_expr.h:227
void Store(double *dst) const
store data into dst space
Definition: tensor_sse-inl.hpp:152
definitions of abstract expressions and expressions template
void AlignedFree(void *ptr)
free aligned space
Definition: tensor_sse-inl.hpp:44
real_t scalar_
scalar value
Definition: tensor_expr.h:64
general tensor
Definition: tensor.h:206
Definition: tensor_sse-inl.hpp:381
const TB & rhs_
right operand
Definition: tensor_expr.h:229
double Sum(void) const
sum of all content
Definition: tensor_sse-inl.hpp:156
index_t UpperAlign(index_t size, size_t fsize)
get upper bound of aligned index of size
Definition: tensor_sse-inl.hpp:64