content/api/tensor__sse-inl_8hpp_source.html

 #ifndef MSHADOW_TENSOR_SSE_INL_HPP

 #define MSHADOW_TENSOR_SSE_INL_HPP


 #ifdef __APPLE__

 #include <stdlib.h>

 #else

 #include <malloc.h>

 #endif


 #include "tensor_expr.h"

 #include "tensor.h"


 namespace mshadow {

     namespace sse2{

         /*!
          * \brief analog to cudaMallocPitch: allocate storage for num_line rows whose
          *        byte length is padded up to a multiple of 16
          * \param pitch output parameter, receives the padded byte length of one row
          * \param lspace number of bytes requested for each row
          * \param num_line number of rows
          * \return pointer to a block of pitch * num_line bytes, to be released with AlignedFree;
          *         every failure (size overflow, allocation failure) is reported through utils::Assert
          */
         inline void* AlignedMallocPitch( size_t &pitch, size_t lspace, size_t num_line ){
             const size_t kMaxSize = static_cast<size_t>( -1 );
             // rounding up adds 15 first; reject row sizes for which that addition would wrap around
             utils::Assert( lspace <= kMaxSize - 15, "AlignedMallocPitch: row size overflow" );
             pitch = ((lspace+15) >> 4) << 4;
             // pitch * num_line must fit into size_t, otherwise the buffer would silently be too small
             utils::Assert( num_line == 0 || pitch <= kMaxSize / num_line, "AlignedMallocPitch: total size overflow" );
             const size_t nbyte = pitch * num_line;
             #if defined(_MSC_VER)
             void *res = _aligned_malloc( nbyte, 16 );
             #elif defined(__APPLE__)
             // NOTE(review): plain malloc here; presumably relies on the platform allocator
             // handing out 16-byte aligned blocks - confirm
             void *res = malloc( nbyte );
             #else
             void *res = memalign( 16, nbyte );
             #endif
             utils::Assert( res != NULL, "AlignedMallocPitch failed" );
             return res;
         }

         /*!
          * \brief free aligned space
          * \param ptr pointer to the space to release; handed to _aligned_free under MSVC
          *            and to free everywhere else
          */
         inline void AlignedFree( void *ptr ){
             #ifndef _MSC_VER
             free( ptr );
             #else
             _aligned_free( ptr );
             #endif
         }

         /*!
          * \brief check whether a byte count is a multiple of 16
          * \param pitch byte count (or an address converted to an integer) to test
          * \return true when the low four bits of pitch are all zero
          */
         inline bool CheckAlign( size_t pitch ){
             return ( pitch & 15 ) == 0;
         }
         /*!
          * \brief check whether a pointer is 16-byte aligned
          * \param ptr address to test; taken as pointer-to-const so that read-only
          *            buffers can be checked without a cast
          * \return true when the address is a multiple of 16
          */
         inline bool CheckAlign( const void *ptr ){
             return CheckAlign( reinterpret_cast<size_t>( ptr ) );
         }

         /*!
          * \brief get upper bound of aligned index of size
          * \param size number of elements
          * \param fsize size of one element in bytes
          * \return element count after the byte length size*fsize has been rounded up
          *         to a multiple of 16 and divided by fsize again
          */
         inline index_t UpperAlign( index_t size, size_t fsize ){
             const size_t nbyte = size * fsize;
             // clear the low four bits after adding 15: round up to the next multiple of 16
             const size_t padded = ( nbyte + 15 ) & ~static_cast<size_t>( 15 );
             return padded / fsize;
         }

         /*!
          * \brief get lower bound of aligned index of size
          * \param size number of elements
          * \param fsize size of one element in bytes
          * \return element count after the byte length size*fsize has been rounded down
          *         to a multiple of 16 and divided by fsize again
          */
         inline index_t LowerAlign( index_t size, size_t fsize ){
             const size_t nbyte = size * fsize;
             // clear the low four bits: round down to a multiple of 16
             const size_t trimmed = nbyte & ~static_cast<size_t>( 15 );
             return trimmed / fsize;
         }

     }; // namespace sse2

 }; // namespace  mshadow


 #if MSHADOW_USE_SSE

 // sse types are not compatible with nvcc, only use them in cpu mode

 #include <emmintrin.h>


 namespace mshadow{

     namespace sse2{

         /*!
          * \brief float vector real type, used for vectorization
          *        the generic template is empty, the usable versions are the
          *        specializations for float and double
          * \tparam FloatType element type of the vector
          */
         template<typename FloatType>
         struct FVec{
         };


         /*! \brief vector real type for float, wraps one __m128 value */
         template<>
         struct FVec<float> {
         public:
             /*! \brief underlying sse type */
             typedef __m128 DType;
             /*! \brief number of float in vector */
             static const index_t kSize = 4;
             /*! \brief data content */
             DType data_;
         public:
             /* constructors */
             /*! \brief default constructor, data_ is not initialized */
             FVec( void ){}
             /*! \brief wrap an existing sse value */
             FVec( DType data ) : data_( data ){}
             /*! \brief set every element of the vector to the float s */
             FVec( const float &s ) : data_( _mm_set1_ps( s ) ){}
             /*! \brief load from pointer src, done with _mm_load_ps */
             FVec( const float *src ) : data_( _mm_load_ps( src ) ){}
         public:
             /*! \brief store data into dst space, done with _mm_store_ps */
             inline void Store( float *dst ) const{
                 _mm_store_ps( dst, data_ );
             }
             /*! \brief sum of all content */
             inline float Sum( void ) const{
                 // add the upper two elements onto the lower two
                 const DType halves = _mm_add_ps( data_, _mm_movehl_ps( data_, data_ ) );
                 // add element 1 onto element 0, the total ends up in the lowest element
                 const DType total = _mm_add_ss( halves, _mm_shuffle_ps( halves, halves, 1 ) );
                 #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64)
                 // old 64 bit MSVC reads the field directly; presumably _mm_cvtss_f32 is unavailable there
                 return total.m128_f32[ 0 ];
                 #else
                 return _mm_cvtss_f32( total );
                 #endif
             }
         };


         /*! \brief vector real type for double, wraps one __m128d value */
         template<>
         struct FVec<double> {
         public:
             /*! \brief underlying sse type */
             typedef __m128d DType;
             /*! \brief number of double in vector */
             static const index_t kSize = 2;
             /*! \brief data content */
             DType data_;
         public:
             /* constructors */
             /*! \brief default constructor, data_ is not initialized */
             FVec( void ){}
             /*! \brief wrap an existing sse value */
             FVec( DType data ) : data_( data ){}
             /*! \brief set every element of the vector to the double s */
             FVec( const double &s ) : data_( _mm_set1_pd( s ) ){}
             /*! \brief load from pointer src, done with _mm_load_pd */
             FVec( const double *src ) : data_( _mm_load_pd( src ) ){}
         public:
             /*! \brief store data into dst space, done with _mm_store_pd */
             inline void Store( double *dst ) const{
                 _mm_store_pd( dst, data_ );
             }
             /*! \brief sum of all content */
             inline double Sum( void ) const{
                 // add the upper element onto the lower one, the total ends up in the lowest element
                 const DType total = _mm_add_sd( data_, _mm_unpackhi_pd( data_, data_ ) );
                 #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64)
                 // old 64 bit MSVC reads the field directly; presumably _mm_cvtsd_f64 is unavailable there
                 return total.m128d_f64[0];
                 #else
                 return _mm_cvtsd_f64( total );
                 #endif
             }
         };

     };


     namespace sse2{

         /*!
          * \brief sse2 operator type of certain operator
          *        the generic version marks the operator as not enabled,
          *        operators with an sse implementation specialize it
          * \tparam OP operator type
          */
         template<typename OP>
         struct SSEOp{
             /*! \brief whether an sse implementation exists for OP */
             static const bool kEnabled = false;
         };

         /*! \brief sse2 version of op::plus, element-wise lhs + rhs */
         template<>
         struct SSEOp<op::plus>{
             static const bool kEnabled = true;
             /*! \brief float version, uses _mm_add_ps */
             MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
                 FVec<float> sum( _mm_add_ps( lhs.data_, rhs.data_ ) );
                 return sum;
             }
             /*! \brief double version, uses _mm_add_pd */
             MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
                 FVec<double> sum( _mm_add_pd( lhs.data_, rhs.data_ ) );
                 return sum;
             }
         };

         /*! \brief sse2 version of op::minus, element-wise lhs - rhs */
         template<>
         struct SSEOp<op::minus>{
             static const bool kEnabled = true;
             /*! \brief float version, uses _mm_sub_ps */
             MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
                 FVec<float> diff( _mm_sub_ps( lhs.data_, rhs.data_ ) );
                 return diff;
             }
             /*! \brief double version, uses _mm_sub_pd */
             MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
                 FVec<double> diff( _mm_sub_pd( lhs.data_, rhs.data_ ) );
                 return diff;
             }
         };

         /*! \brief sse2 version of op::mul, element-wise lhs * rhs */
         template<>
         struct SSEOp<op::mul>{
             static const bool kEnabled = true;
             /*! \brief float version, uses _mm_mul_ps */
             MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
                 FVec<float> prod( _mm_mul_ps( lhs.data_, rhs.data_ ) );
                 return prod;
             }
             /*! \brief double version, uses _mm_mul_pd */
             MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
                 FVec<double> prod( _mm_mul_pd( lhs.data_, rhs.data_ ) );
                 return prod;
             }
         };

         /*! \brief sse2 version of op::div, element-wise lhs / rhs */
         template<>
         struct SSEOp<op::div>{
             static const bool kEnabled = true;
             /*! \brief float version, uses _mm_div_ps */
             MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
                 FVec<float> quot( _mm_div_ps( lhs.data_, rhs.data_ ) );
                 return quot;
             }
             /*! \brief double version, uses _mm_div_pd */
             MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
                 FVec<double> quot( _mm_div_pd( lhs.data_, rhs.data_ ) );
                 return quot;
             }
         };


         /*! \brief sse2 version of op::identity, the unary Map hands back its argument */
         template<>
         struct SSEOp<op::identity>{
             static const bool kEnabled = true;
             /*! \brief float version */
             MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &src ){
                 return src;
             }
             /*! \brief double version */
             MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &src ){
                 return src;
             }
         };

     }; // namespace sse2


     namespace sse2{

         /*!
          * \brief saver used to store a vector result into memory
          *        the generic version combines the content already at dst with src,
          *        using the sse version of the operator named by SV::OPType
          * \tparam SV saver type, must provide the member type OPType
          * \tparam TFloat element type
          */
         template<typename SV, typename TFloat>
         struct Saver{
             /*!
              * \brief load the vector at dst, combine it with src and store the result back to dst
              * \param dst destination, read and written through FVec load/Store
              * \param src value to combine into the destination
              */
             MSHADOW_CINLINE static void Save( TFloat *dst, const FVec<TFloat> &src ){
                 typedef SSEOp<typename SV::OPType> VecOp;
                 const FVec<TFloat> current( dst );
                 VecOp::Map( current, src ).Store( dst );
             }
         };

         /*!
          * \brief saver for sv::saveto: the destination is overwritten,
          *        its previous content is never loaded
          * \tparam TFloat element type
          */
         template<typename TFloat>
         struct Saver<sv::saveto,TFloat>{
             /*! \brief store src into dst */
             MSHADOW_CINLINE static void Save( TFloat *dst, const FVec<TFloat> &src ){
                 src.Store( dst );
             }
         };

     }; // namespace sse2

 }; // namespace mshadow


 namespace mshadow{

     namespace expr{

         /*!
          * \brief same as plan, but use sse2
          *        the generic version only declares the interface; the specializations
          *        in this file provide the implementations
          * \tparam ExpType type of the expression
          */
         template<typename ExpType>
         class SSEPlan {
         public:
             /*! \brief evaluate the expression at index [y][x], scalar version */
             MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const;
             /*!
              * \brief evaluate the expression at index [y][x], vector version,
              *        x will be aligned to 4
              */
             MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const;
         };


         /*!
          * \brief plan of a tensor: reads elements through the tensor's data pointer,
          *        rows are stride_ elements apart
          */
         template <typename Device, int dim>
         class SSEPlan< Tensor<Device,dim> >{
         public:
             /*! \brief remember data pointer and stride of t, no data is copied */
             SSEPlan( const Tensor<Device,dim> &t )
                 : dptr_( t.dptr ), stride_( t.shape.stride_ ){}
             /*! \brief load one vector starting at element [y][x] */
             MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
                 const real_t *src = dptr_ + ( y*stride_+x );
                 return sse2::FVec<real_t>( src );
             }
             /*! \brief read the single element [y][x] */
             MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
                 return *( dptr_ + ( y * stride_ + x ) );
             }
         private:
             /*! \brief pointer to the first element */
             const real_t  *dptr_;
             /*! \brief number of elements between the starts of two consecutive rows */
             index_t stride_;
         };


         /*! \brief plan of a scalar: every index evaluates to the same value */
         template<>
         class SSEPlan<ScalarExp>{
         public:
             /*! \brief keep a copy of the scalar value */
             SSEPlan( real_t scalar ) : scalar_( scalar ){}
             /*! \brief vector with every element set to the scalar, y and x are not used */
             MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
                 sse2::FVec<real_t> filled( scalar_ );
                 return filled;
             }
             /*! \brief the scalar itself, y and x are not used */
             MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
                 return scalar_;
             }
         private:
             /*! \brief scalar value */
             real_t scalar_;
         };


         /*! \brief plan of a binary expression lhs [op] rhs, built from the plans of both operands */
         template<typename OP, typename TA, typename TB,int etype>
         class SSEPlan< BinaryMapExp<OP,TA,TB,etype> >{
         public:
             /*! \brief keep copies of the operand plans */
             SSEPlan( const SSEPlan<TA> &lhs, const SSEPlan<TB> &rhs )
                 : lhs_( lhs ), rhs_( rhs ){}
             /*! \brief evaluate both operands as vectors and combine them with the sse version of OP */
             MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
                 const sse2::FVec<real_t> left = lhs_.EvalSSE( y, x );
                 const sse2::FVec<real_t> right = rhs_.EvalSSE( y, x );
                 return sse2::SSEOp<OP>::Map( left, right );
             }
             /*! \brief evaluate both operands at [y][x] and combine them with the scalar OP::Map */
             MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
                 const real_t left = lhs_.Eval( y, x );
                 const real_t right = rhs_.Eval( y, x );
                 return OP::Map( left, right );
             }
         private:
             /*! \brief plan of the left operand */
             SSEPlan<TA> lhs_;
             /*! \brief plan of the right operand */
             SSEPlan<TB> rhs_;
         };


         /*! \brief plan of a unary expression op(src), built from the plan of the source */
         template<typename OP, typename TA, int etype>
         class SSEPlan< UnaryMapExp<OP,TA,etype> >{
         public:
             /*! \brief keep a copy of the source plan */
             SSEPlan( const SSEPlan<TA> &src ) : src_( src ){}
             /*! \brief evaluate the source as a vector and apply the sse version of OP */
             MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
                 const sse2::FVec<real_t> value = src_.EvalSSE( y, x );
                 return sse2::SSEOp<OP>::Map( value );
             }
             /*! \brief evaluate the source at [y][x] and apply the scalar OP::Map */
             MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
                 const real_t value = src_.Eval( y, x );
                 return OP::Map( value );
             }
         private:
             /*! \brief plan of the source expression */
             SSEPlan<TA> src_;
         };


         /*!
          * \brief forward declaration of the MakeSSEPlan overload for binary expressions,
          *        the definition follows after the other overloads in this file
          *        NOTE(review): presumably declared early so that the overloads defined in
          *        between (e.g. the unary one calling MakeSSEPlan(e.src_)) can resolve to it - confirm
          */
         template<typename OP, typename TA, typename TB, int etype>

         inline SSEPlan< BinaryMapExp<OP,TA,TB,etype> > MakeSSEPlan( const BinaryMapExp<OP,TA,TB,etype> &e );


         /*! \brief make the plan of a scalar expression from its scalar value */
         inline SSEPlan<ScalarExp> MakeSSEPlan( const ScalarExp &e ){
             SSEPlan<ScalarExp> plan( e.scalar_ );
             return plan;
         }


         /*! \brief make the plan of a container expression from the object returned by e.self() */
         template<typename T>
         inline SSEPlan<T> MakeSSEPlan( const ContainerExp<T> &e ){
             SSEPlan<T> plan( e.self() );
             return plan;
         }


         /*! \brief make the plan of a cpu MakeTensorExp from the object returned by e.real_self() */
         template<typename T,int dim>
         inline SSEPlan<T> MakeSSEPlan( const MakeTensorExp<T,cpu,dim> &e ){
             SSEPlan<T> plan( e.real_self() );
             return plan;
         }


         /*! \brief make the plan of a unary expression: first the plan of its source, then the wrapper */
         template<typename OP, typename TA, int etype>
         inline SSEPlan< UnaryMapExp<OP,TA,etype> > MakeSSEPlan( const UnaryMapExp<OP,TA,etype> &e ){
             typedef SSEPlan< UnaryMapExp<OP,TA,etype> > PlanType;
             return PlanType( MakeSSEPlan( e.src_ ) );
         }


         /*! \brief make the plan of a binary expression from the plans of both operands */
         template<typename OP, typename TA, typename TB, int etype>
         inline SSEPlan< BinaryMapExp<OP,TA,TB,etype> > MakeSSEPlan( const BinaryMapExp<OP,TA,TB,etype> &e ){
             typedef SSEPlan< BinaryMapExp<OP,TA,TB,etype> > PlanType;
             return PlanType( MakeSSEPlan( e.lhs_ ), MakeSSEPlan( e.rhs_ ) );
         }

     };


     namespace expr{

         /*!
          * \brief static check of sse support
          *        if an expression E cannot be evaluated using sse, then kPass = false;
          *        this generic version is the fallback for every type without a specialization
          * \tparam E type of the expression
          */
         template<typename E>
         struct SSECheck{
             static const bool kPass = false;
         };

         /*! \brief a scalar expression passes the static sse check */
         template<>
         struct SSECheck<ScalarExp>{
             static const bool kPass = true;
         };

         /*! \brief a cpu tensor of any dimension passes the static sse check */
         template<int dim>
         struct SSECheck<Tensor<cpu,dim> >{
             static const bool kPass = true;
         };


         /*!
          * \brief a unary expression passes the static sse check when its source passes
          *        and the operator has an sse version
          */
         template<typename OP, typename TA, int etype>
         struct SSECheck<UnaryMapExp<OP,TA,etype> >{
             static const bool kPass =
                 SSECheck<TA>::kPass &&
                 sse2::SSEOp<OP>::kEnabled;
         };

         /*!
          * \brief a binary expression passes the static sse check when both operands pass
          *        and the operator has an sse version
          */
         template<typename OP, typename TA, typename TB, int etype>
         struct SSECheck< BinaryMapExp<OP,TA,TB,etype> >{
             static const bool kPass =
                 SSECheck<TA>::kPass &&
                 SSECheck<TB>::kPass &&
                 sse2::SSEOp<OP>::kEnabled;
         };

     }; // namespace expr

     namespace expr{

         /*!
          * \brief check if data is aligned and allow sse operation
          *        the generic version rejects every expression, the supported
          *        expression types are handled by specializations
          * \tparam dim dimension of the tensors involved
          * \tparam E type of the expression
          */
         template<int dim,typename E>
         struct SSEAlignCheck{
             /*! \brief always false, exp is not inspected */
             inline static bool Check( const E &exp ){
                 const bool kAligned = false;
                 return kAligned;
             }
         };

         /*! \brief a scalar expression always passes the alignment check */
         template<int dim>
         struct SSEAlignCheck< dim, ScalarExp >{
             /*! \brief always true, exp is not inspected */
             inline static bool Check( const ScalarExp &exp ){
                 const bool kAligned = true;
                 return kAligned;
             }
         };

         /*!
          * \brief alignment check of a cpu tensor: both the data pointer and the
          *        row stride in bytes have to pass sse2::CheckAlign
          */
         template<int dim>
         struct SSEAlignCheck< dim,Tensor<cpu,dim> >{
             inline static bool Check( const Tensor<cpu,dim> &t ){
                 // the pointer is tested first, the stride only when the pointer passed
                 if( !sse2::CheckAlign( t.dptr ) ) return false;
                 return sse2::CheckAlign( t.shape.stride_ * sizeof( real_t ) );
             }
         };

         /*! \brief alignment check of a unary expression: decided by its source expression */
         template<int dim, typename OP, typename TA, int etype>
         struct SSEAlignCheck< dim, UnaryMapExp<OP,TA,etype> >{
             inline static bool Check( const UnaryMapExp<OP,TA,etype> &t ){
                 typedef SSEAlignCheck<dim,TA> SrcCheck;
                 return SrcCheck::Check( t.src_ );
             }
         };

         /*! \brief alignment check of a binary expression: both operands have to pass */
         template<int dim, typename OP, typename TA, typename TB, int etype>
         struct SSEAlignCheck< dim, BinaryMapExp<OP,TA,TB,etype> >{
             inline static bool Check( const BinaryMapExp<OP,TA,TB,etype> &t ){
                 // the right operand is only inspected when the left one passed
                 if( !SSEAlignCheck<dim,TA>::Check( t.lhs_ ) ) return false;
                 return SSEAlignCheck<dim,TB>::Check( t.rhs_ );
             }
         };

     }; // namespace expr


     /*!
      * \brief use SSEPlan to compute result
      *        the destination is flattened to 2D; in every row the part of the x range that
      *        sse2::LowerAlign reports as aligned is processed one vector at a time,
      *        the remaining elements are processed one by one
      * \param _dst destination tensor
      * \param plan plan of the expression to evaluate
      * \tparam SV saver type used to write the result
      * \tparam E type of the expression behind the plan
      * \tparam dim dimension of the destination
      */
     template<typename SV, typename E, int dim>
     inline void MapSSEPlan(Tensor<cpu,dim> _dst, const expr::SSEPlan<E> &plan){
         typedef sse2::Saver<SV,real_t> VecSaver;
         Tensor<cpu,2> dst = _dst.FlatTo2D();
         // number of leading elements of a row that can be handled as whole vectors
         const index_t xlen = sse2::LowerAlign( dst.shape[0], sizeof(real_t) );
         index_t y = 0;
         while( y < dst.shape[1] ){
             // vector part
             for( index_t x = 0; x < xlen; x += sse2::FVec<real_t>::kSize ){
                 VecSaver::Save( &dst[y][x], plan.EvalSSE( y,x ) );
             }
             // scalar tail
             for( index_t x = xlen; x < dst.shape[0]; ++x ){
                 SV::Save( dst[y][x], plan.Eval(y,x) );
             }
             ++y;
         }
     }

 }; // namespace mshadow

 #endif // MSHADOW_USE_SSE

 #endif // MSHADOW_TENSOR_SSE_INL_HPP

mshadow::MapSSEPlan
void MapSSEPlan(Tensor< cpu, dim > _dst, const expr::SSEPlan< E > &plan)
use SSEPlan to compute result
Definition: tensor_sse-inl.hpp:417

mshadow::index_t
unsigned index_t
type that will be used for index
Definition: tensor_base.h:123

mshadow::sse2::CheckAlign
bool CheckAlign(size_t pitch)
check if a pointer is aligned
Definition: tensor_sse-inl.hpp:52

mshadow::sse2::AlignedMallocPitch
void * AlignedMallocPitch(size_t &pitch, size_t lspace, size_t num_line)
analog to cudaMallocPitch, allocate an aligned space with num_line * lspace cells
Definition: tensor_sse-inl.hpp:26

mshadow::sse2::FVec< float >::Sum
float Sum(void) const
sum of all content
Definition: tensor_sse-inl.hpp:117

mshadow::expr::BinaryMapExp
binary map expression lhs [op] rhs
Definition: tensor_expr.h:225

mshadow::utils::Assert
void Assert(bool exp)
assert an expression is true
Definition: tensor_base.h:285

mshadow::expr::SSEPlan< ScalarExp >
Definition: tensor_sse-inl.hpp:277

mshadow::sse2::FVec< float >::Store
void Store(float *dst) const
store data into dst space
Definition: tensor_sse-inl.hpp:113

mshadow::expr::SSECheck
static check of whether sse is enabled: if an expression E cannot be evaluated using sse, then kPass = false ...
Definition: tensor_sse-inl.hpp:357

mshadow::real_t
float real_t
type that will be used for content
Definition: tensor_base.h:118

mshadow::sse2::FVec
float vector real type, used for vectorization
Definition: tensor_sse-inl.hpp:88

mshadow::sse2::LowerAlign
index_t LowerAlign(index_t size, size_t fsize)
get lower bound of aligned index of size
Definition: tensor_sse-inl.hpp:72

mshadow::sse2::SSEOp
sse2 operator type of certain operator
Definition: tensor_sse-inl.hpp:171

tensor.h
header file of tensor data structure and functions. Convention: this lib requires explicit memory alloc...

mshadow::cpu
device name CPU
Definition: tensor.h:185

mshadow::expr::UnaryMapExp::src_
const TA & src_
source expression
Definition: tensor_expr.h:342

mshadow::sse2::FVec< double >::data_
DType data_
data content
Definition: tensor_sse-inl.hpp:137

mshadow::expr::SSEPlan
Definition: tensor_sse-inl.hpp:250

mshadow::sse2::FVec< float >::data_
DType data_
data content
Definition: tensor_sse-inl.hpp:98

mshadow::sse2::FVec< float >
vector real type for float
Definition: tensor_sse-inl.hpp:92

mshadow::expr::SSEPlan::EvalSSE
MSHADOW_CINLINE sse2::FVec< real_t > EvalSSE(index_t y, index_t x) const
evaluate the expression at index [y][x], x will be aligned to 4 to be implemented by SubType ...

mshadow::sse2::FVec< double >::FVec
FVec(const double *src)
load from pointer src
Definition: tensor_sse-inl.hpp:147

mshadow::sse2::Saver
Definition: tensor_sse-inl.hpp:230

mshadow::Tensor::dptr
real_t * dptr
pointer to the data
Definition: tensor.h:215

mshadow::Tensor::FlatTo2D
MSHADOW_XINLINE Tensor< Device, 2 > FlatTo2D(void) const
flatten the tensor to 2 dimension, collapse the higher dimensions together
Definition: tensor.h:229

mshadow::expr::UnaryMapExp
unary map expression op(src)
Definition: tensor_expr.h:340

mshadow::sse2::FVec< float >::FVec
FVec(const float *src)
load from pointer src
Definition: tensor_sse-inl.hpp:108

mshadow::Tensor::shape
Shape< dimension > shape
shape of the tensor
Definition: tensor.h:217

MSHADOW_CINLINE
#define MSHADOW_CINLINE
cpu force inline
Definition: tensor_base.h:101

mshadow::expr::ScalarExp
scalar expression
Definition: tensor_expr.h:62

mshadow::sse2::FVec< double >
vector real type for double
Definition: tensor_sse-inl.hpp:131

mshadow::expr::BinaryMapExp::lhs_
const TA & lhs_
left operand
Definition: tensor_expr.h:227

mshadow::sse2::FVec< double >::Store
void Store(double *dst) const
store data into dst space
Definition: tensor_sse-inl.hpp:152

tensor_expr.h
definitions of abstract expressions and expressions template

mshadow::sse2::AlignedFree
void AlignedFree(void *ptr)
free aligned space
Definition: tensor_sse-inl.hpp:44

mshadow::expr::ScalarExp::scalar_
real_t scalar_
scalar value
Definition: tensor_expr.h:64

mshadow::Tensor
general tensor
Definition: tensor.h:206

mshadow::expr::SSEAlignCheck
Definition: tensor_sse-inl.hpp:381

mshadow::expr::BinaryMapExp::rhs_
const TB & rhs_
right operand
Definition: tensor_expr.h:229

mshadow::sse2::FVec< double >::Sum
double Sum(void) const
sum of all content
Definition: tensor_sse-inl.hpp:156

mshadow::sse2::UpperAlign
index_t UpperAlign(index_t size, size_t fsize)
get upper bound of aligned index of size
Definition: tensor_sse-inl.hpp:64