Apache SINGA
A distributed deep learning platform.
 All Classes Namespaces Files Functions Variables Typedefs Enumerator Macros
tensor_sse-inl.hpp
Go to the documentation of this file.
1 #ifndef MSHADOW_TENSOR_SSE_INL_HPP
2 #define MSHADOW_TENSOR_SSE_INL_HPP
3 
8 #ifdef __APPLE__
9 #include <stdlib.h>
10 #else
11 #include <malloc.h>
12 #endif
13 
14 #include "tensor_expr.h"
15 #include "tensor.h"
16 
17 namespace mshadow {
19  namespace sse2{
26  inline void* AlignedMallocPitch( size_t &pitch, size_t lspace, size_t num_line ){
27  pitch = ((lspace+15) >> 4) << 4;
28  #ifdef _MSC_VER
29  void * res = _aligned_malloc( pitch*num_line, 16 );
30  #else
31  #ifdef __APPLE__
32  void *res = malloc( pitch * num_line );
33  #else
34  void * res = memalign( 16, pitch*num_line );
35  #endif
36  #endif
37  utils::Assert( res != NULL, "AlignedMallocPitch failed" );
38  return res;
39  }
/*!
 * \brief free aligned space
 * \param ptr pointer to the space to be released
 */
inline void AlignedFree( void *ptr ){
#if defined(_MSC_VER)
    // MSVC build: release through the aligned deallocator
    _aligned_free( ptr );
#else
    free( ptr );
#endif
}
/*!
 * \brief check whether a value is a multiple of 16
 * \param pitch value to test
 * \return true when the low four bits of pitch are all zero
 */
inline bool CheckAlign( size_t pitch ){
    const size_t low_bits = pitch & static_cast<size_t>( 15 );
    return low_bits == 0;
}
56  inline bool CheckAlign( void *ptr ){
57  return CheckAlign( (size_t)ptr );
58  }
64  inline index_t UpperAlign( index_t size, size_t fsize ){
65  return (( (size*fsize+15) >> 4 ) << 4) / fsize;
66  }
72  inline index_t LowerAlign( index_t size, size_t fsize ){
73  return (( (size*fsize) >> 4 ) << 4) / fsize;
74  }
75  }; // namespace sse2
76 }; // namespace mshadow
77 
78 #if MSHADOW_USE_SSE
79 // sse types are not compatible with nvcc, only use them in cpu mode
80 #include <emmintrin.h>
81 
82 namespace mshadow{
83  namespace sse2{
/*!
 * \brief float vector real type, used for vectorization
 *        the primary template is empty; usable content comes from specializations
 * \tparam FloatType scalar type carried by the vector
 */
template<typename FloatType>
struct FVec {
};
89 
91  template<>
92  struct FVec<float> {
93  public:
94  typedef __m128 DType;
96  const static index_t kSize = 4;
98  DType data_;
99  public:
100  /* constructors */
101  FVec( void ){}
102  FVec( DType data ):data_(data){}
103  /* set the float */
104  FVec( const float &s ){
105  data_ = _mm_set1_ps( s );
106  }
108  FVec( const float *src ){
109  data_ = _mm_load_ps( src );
110  }
111  public:
113  inline void Store( float *dst ) const{
114  return _mm_store_ps( dst, data_ );
115  }
117  inline float Sum( void ) const{
118  DType ans = _mm_add_ps( data_, _mm_movehl_ps( data_, data_ ) );
119  DType rst = _mm_add_ss( ans, _mm_shuffle_ps( ans, ans, 1 ) );
120  #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64)
121  return rst.m128_f32[ 0 ];
122  #else
123  float rr = _mm_cvtss_f32( rst ) ;
124  return rr;
125  #endif
126  }
127  };
128 
130  template<>
131  struct FVec<double> {
132  public:
133  typedef __m128d DType;
135  const static index_t kSize = 2;
137  DType data_;
138  public:
139  /* constructors */
140  FVec( void ){}
141  FVec( DType data ):data_(data){}
142  /* set the float */
143  FVec( const double &s ){
144  data_ = _mm_set1_pd( s );
145  }
147  FVec( const double *src ){
148  data_ = _mm_load_pd( src );
149  }
150  public:
152  inline void Store( double *dst ) const{
153  return _mm_store_pd( dst, data_ );
154  }
156  inline double Sum( void ) const{
157  DType tmp = _mm_add_sd( data_, _mm_unpackhi_pd( data_,data_ ) ) ;
158  #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64)
159  return tmp.m128d_f64[0];
160  #else
161  double ans = _mm_cvtsd_f64( tmp );
162  return ans;
163  #endif
164  }
165  };
166  };
167 
168  namespace sse2{
/*!
 * \brief sse2 operator type of certain operator
 *        the primary template marks the operator as not enabled
 * \tparam OP operator type
 */
template<typename OP>
struct SSEOp {
    static const bool kEnabled = false;
};
174  template<>
175  struct SSEOp<op::plus>{
176  const static bool kEnabled = true;
177  MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
178  return FVec<float>( _mm_add_ps( lhs.data_, rhs.data_ ) );
179  }
180  MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
181  return FVec<double>( _mm_add_pd( lhs.data_, rhs.data_ ) );
182  }
183  };
184  template<>
185  struct SSEOp<op::minus>{
186  const static bool kEnabled = true;
187  MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
188  return FVec<float>( _mm_sub_ps( lhs.data_, rhs.data_ ) );
189  }
190  MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
191  return FVec<double>( _mm_sub_pd( lhs.data_, rhs.data_ ) );
192  }
193  };
194  template<>
195  struct SSEOp<op::mul>{
196  const static bool kEnabled = true;
197  MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
198  return FVec<float>( _mm_mul_ps( lhs.data_, rhs.data_ ) );
199  }
200  MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
201  return FVec<double>( _mm_mul_pd( lhs.data_, rhs.data_ ) );
202  }
203  };
204  template<>
205  struct SSEOp<op::div>{
206  const static bool kEnabled = true;
207  MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
208  return FVec<float>( _mm_div_ps( lhs.data_, rhs.data_ ) );
209  }
210  MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
211  return FVec<double>( _mm_div_pd( lhs.data_, rhs.data_ ) );
212  }
213  };
214 
215  template<>
216  struct SSEOp<op::identity>{
217  const static bool kEnabled = true;
218  MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &src ){
219  return src;
220  }
221  MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &src ){
222  return src;
223  }
224  };
225  }; // namespace sse2
226 
227  namespace sse2{
228  // savers to do storage
229  template<typename SV, typename TFloat>
230  struct Saver{
231  MSHADOW_CINLINE static void Save( TFloat *dst, const FVec<TFloat> &src ){
232  FVec<TFloat> lhs( dst );
234  ans.Store( dst );
235  }
236  };
237  template<typename TFloat>
238  struct Saver<sv::saveto,TFloat>{
239  MSHADOW_CINLINE static void Save( TFloat *dst, const FVec<TFloat> &src ){
240  src.Store( dst );
241  }
242  };
243  }; // namespace sse2
244 }; // namespace mshadow
245 
246 namespace mshadow{
247  namespace expr{
248  // same as plan, but use sse2
249  template<typename ExpType>
250  class SSEPlan {
251  public:
257  MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const;
258  };
259 
260  template <typename Device, int dim>
261  class SSEPlan< Tensor<Device,dim> >{
262  public:
263  SSEPlan( const Tensor<Device,dim> &t )
264  :dptr_(t.dptr),stride_(t.shape.stride_){}
266  return sse2::FVec<real_t>( &dptr_[ y*stride_+x ] );
267  }
268  MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
269  return dptr_[ y * stride_ + x ];
270  }
271  private:
272  const real_t *dptr_;
273  index_t stride_;
274  };
275 
276  template<>
278  public:
279  SSEPlan( real_t scalar ):scalar_(scalar){}
281  return sse2::FVec<real_t>( scalar_ );
282  }
283  MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
284  return scalar_;
285  }
286  private:
287  real_t scalar_;
288  };
289 
290  template<typename OP, typename TA, typename TB,int etype>
291  class SSEPlan< BinaryMapExp<OP,TA,TB,etype> >{
292  public:
293  SSEPlan( const SSEPlan<TA> &lhs, const SSEPlan<TB> &rhs )
294  :lhs_(lhs), rhs_(rhs){}
296  return sse2::SSEOp<OP>::Map( lhs_.EvalSSE( y, x ), rhs_.EvalSSE( y, x ) );
297  }
298  MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
299  return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) );
300  }
301  private:
302  SSEPlan<TA> lhs_;
303  SSEPlan<TB> rhs_;
304  };
305 
306  template<typename OP, typename TA, int etype>
307  class SSEPlan< UnaryMapExp<OP,TA,etype> >{
308  public:
309  SSEPlan( const SSEPlan<TA> &src ):src_(src){}
311  return sse2::SSEOp<OP>::Map( src_.EvalSSE( y, x ) );
312  }
313  MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
314  return OP::Map( src_.Eval( y, x ) );
315  }
316  private:
317  SSEPlan<TA> src_;
318  };
319 
320  template<typename OP, typename TA, typename TB, int etype>
322 
323  inline SSEPlan<ScalarExp> MakeSSEPlan( const ScalarExp &e ){
324  return SSEPlan<ScalarExp>( e.scalar_ );
325  }
326 
327  template<typename T>
328  inline SSEPlan<T> MakeSSEPlan( const ContainerExp<T> &e ){
329  return SSEPlan<T>( e.self() );
330  }
331 
332  template<typename T,int dim>
333  inline SSEPlan<T> MakeSSEPlan( const MakeTensorExp<T,cpu,dim> &e ){
334  return SSEPlan<T>( e.real_self() );
335  }
336 
337  template<typename OP, typename TA, int etype>
338  inline SSEPlan< UnaryMapExp<OP,TA,etype> > MakeSSEPlan( const UnaryMapExp<OP,TA,etype> &e ){
339  return SSEPlan< UnaryMapExp<OP,TA,etype> >( MakeSSEPlan(e.src_) );
340  }
341 
342  template<typename OP, typename TA, typename TB, int etype>
343  inline SSEPlan< BinaryMapExp<OP,TA,TB,etype> > MakeSSEPlan( const BinaryMapExp<OP,TA,TB,etype> &e ){
344  return SSEPlan< BinaryMapExp<OP,TA,TB,etype> >( MakeSSEPlan(e.lhs_), MakeSSEPlan(e.rhs_) );
345  }
346  };
347 
348  namespace expr{
/*!
 * \brief static check of whether an expression E can be evaluated using sse
 *        the primary template reports kPass = false
 * \tparam E expression type
 */
template<typename E>
struct SSECheck {
    static const bool kPass = false;
};
360  template<>
362  const static bool kPass = true;
363  };
364  template<int dim>
365  struct SSECheck<Tensor<cpu,dim> >{
366  const static bool kPass = true;
367  };
368 
369  template<typename OP, typename TA, int etype>
370  struct SSECheck<UnaryMapExp<OP,TA,etype> >{
371  const static bool kPass = SSECheck<TA>::kPass && sse2::SSEOp<OP>::kEnabled;
372  };
373  template<typename OP, typename TA, typename TB, int etype>
374  struct SSECheck< BinaryMapExp<OP,TA,TB,etype> >{
375  const static bool kPass = SSECheck<TA>::kPass && SSECheck<TB>::kPass && sse2::SSEOp<OP>::kEnabled;
376  };
377  }; // namespace expr
378  namespace expr{
379  // check if data is aligned and allow sse operation
380  template<int dim,typename E>
382  inline static bool Check( const E &exp ){
383  return false;
384  }
385  };
386  template<int dim>
387  struct SSEAlignCheck< dim, ScalarExp >{
388  inline static bool Check( const ScalarExp &exp ){
389  return true;
390  }
391  };
392  template<int dim>
393  struct SSEAlignCheck< dim,Tensor<cpu,dim> >{
394  inline static bool Check( const Tensor<cpu,dim> &t ){
395  return sse2::CheckAlign( t.dptr ) && sse2::CheckAlign( t.shape.stride_ * sizeof( real_t ) );
396  }
397  };
398  template<int dim, typename OP, typename TA, int etype>
399  struct SSEAlignCheck< dim, UnaryMapExp<OP,TA,etype> >{
400  inline static bool Check( const UnaryMapExp<OP,TA,etype> &t ){
402  }
403  };
404  template<int dim, typename OP, typename TA, typename TB, int etype>
405  struct SSEAlignCheck< dim, BinaryMapExp<OP,TA,TB,etype> >{
406  inline static bool Check( const BinaryMapExp<OP,TA,TB,etype> &t ){
407  return SSEAlignCheck<dim,TA>::Check( t.lhs_ ) &&
409  }
410  };
411  }; // namespace expr
412 
416  template<typename SV, typename E, int dim>
417  inline void MapSSEPlan(Tensor<cpu,dim> _dst, const expr::SSEPlan<E> &plan){
418  Tensor<cpu,2> dst = _dst.FlatTo2D();
419  const index_t xlen = sse2::LowerAlign( dst.shape[0], sizeof(real_t) );
420  for ( index_t y = 0; y < dst.shape[1]; y ++ ) {
421  for( index_t x = 0; x < xlen; x += sse2::FVec<real_t>::kSize ){
422  sse2::Saver<SV,real_t>::Save( &dst[y][x], plan.EvalSSE( y,x ) );
423  }
424  for( index_t x = xlen; x < dst.shape[0]; x ++ ){
425  SV::Save( dst[y][x], plan.Eval(y,x) );
426  }
427  }
428  }
429 }; // namespace mshadow
430 #endif // MSHADOW_USE_SSE
431 #endif // MSHADOW_TENSOR_SSE_INL_HPP
void MapSSEPlan(Tensor< cpu, dim > _dst, const expr::SSEPlan< E > &plan)
use SSEPlan to compute result
Definition: tensor_sse-inl.hpp:417
unsigned index_t
type that will be used for index
Definition: tensor_base.h:123
bool CheckAlign(size_t pitch)
check if a pointer is aligned
Definition: tensor_sse-inl.hpp:52
void * AlignedMallocPitch(size_t &pitch, size_t lspace, size_t num_line)
analog to cudaMallocPitch, allocate an aligned space with num_line * lspace cells
Definition: tensor_sse-inl.hpp:26
float Sum(void) const
sum of all content
Definition: tensor_sse-inl.hpp:117
binary map expression lhs [op] rhs
Definition: tensor_expr.h:225
void Assert(bool exp)
assert an expression is true
Definition: tensor_base.h:285
Definition: tensor_sse-inl.hpp:277
void Store(float *dst) const
store data into dst space
Definition: tensor_sse-inl.hpp:113
static check of whether sse is enabled: if an expression E cannot be evaluated using sse, then kPass = false ...
Definition: tensor_sse-inl.hpp:357
float real_t
type that will be used for content
Definition: tensor_base.h:118
float vector real type, used for vectorization
Definition: tensor_sse-inl.hpp:88
index_t LowerAlign(index_t size, size_t fsize)
get lower bound of aligned index of size
Definition: tensor_sse-inl.hpp:72
sse2 operator type of certain operator
Definition: tensor_sse-inl.hpp:171
header file of tensor data structure and functions. Convention: this lib requires explicit memory alloc...
device name CPU
Definition: tensor.h:185
const TA & src_
source expression
Definition: tensor_expr.h:342
DType data_
data content
Definition: tensor_sse-inl.hpp:137
Definition: tensor_sse-inl.hpp:250
DType data_
data content
Definition: tensor_sse-inl.hpp:98
vector real type for float
Definition: tensor_sse-inl.hpp:92
MSHADOW_CINLINE sse2::FVec< real_t > EvalSSE(index_t y, index_t x) const
evaluate the expression at index [y][x]; x will be aligned to 4. To be implemented by SubType ...
FVec(const double *src)
load from pointer src
Definition: tensor_sse-inl.hpp:147
Definition: tensor_sse-inl.hpp:230
real_t * dptr
pointer to the data
Definition: tensor.h:215
MSHADOW_XINLINE Tensor< Device, 2 > FlatTo2D(void) const
flatten the tensor to 2 dimension, collapse the higher dimensions together
Definition: tensor.h:229
unary map expression op(src)
Definition: tensor_expr.h:340
FVec(const float *src)
load from pointer src
Definition: tensor_sse-inl.hpp:108
Shape< dimension > shape
shape of the tensor
Definition: tensor.h:217
#define MSHADOW_CINLINE
cpu force inline
Definition: tensor_base.h:101
scalar expression
Definition: tensor_expr.h:62
vector real type for double
Definition: tensor_sse-inl.hpp:131
const TA & lhs_
left operand
Definition: tensor_expr.h:227
void Store(double *dst) const
store data into dst space
Definition: tensor_sse-inl.hpp:152
definitions of abstract expressions and expressions template
void AlignedFree(void *ptr)
free aligned space
Definition: tensor_sse-inl.hpp:44
real_t scalar_
scalar value
Definition: tensor_expr.h:64
general tensor
Definition: tensor.h:206
Definition: tensor_sse-inl.hpp:381
const TB & rhs_
right operand
Definition: tensor_expr.h:229
double Sum(void) const
sum of all content
Definition: tensor_sse-inl.hpp:156
index_t UpperAlign(index_t size, size_t fsize)
get upper bound of aligned index of size
Definition: tensor_sse-inl.hpp:64