Eigen 3.2.92
GeneralBlockPanelKernel.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
11 #define EIGEN_GENERAL_BLOCK_PANEL_H
12 
13 
14 namespace Eigen {
15 
16 namespace internal {
17 
18 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
19 class gebp_traits;
20 
21 
23 inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
24 {
25  return a<=0 ? b : a;
26 }
27 
28 #if EIGEN_ARCH_i386_OR_x86_64
29 const std::ptrdiff_t defaultL1CacheSize = 32*1024;
30 const std::ptrdiff_t defaultL2CacheSize = 256*1024;
31 const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
32 #else
33 const std::ptrdiff_t defaultL1CacheSize = 16*1024;
34 const std::ptrdiff_t defaultL2CacheSize = 512*1024;
35 const std::ptrdiff_t defaultL3CacheSize = 512*1024;
36 #endif
37 
39 struct CacheSizes {
40  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
41  int l1CacheSize, l2CacheSize, l3CacheSize;
42  queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
43  m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
44  m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
45  m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
46  }
47 
48  std::ptrdiff_t m_l1;
49  std::ptrdiff_t m_l2;
50  std::ptrdiff_t m_l3;
51 };
52 
53 
55 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
56 {
57  static CacheSizes m_cacheSizes;
58 
59  if(action==SetAction)
60  {
61  // set the cpu cache size and cache all block sizes from a global cache size in bytes
62  eigen_internal_assert(l1!=0 && l2!=0);
63  m_cacheSizes.m_l1 = *l1;
64  m_cacheSizes.m_l2 = *l2;
65  m_cacheSizes.m_l3 = *l3;
66  }
67  else if(action==GetAction)
68  {
69  eigen_internal_assert(l1!=0 && l2!=0);
70  *l1 = m_cacheSizes.m_l1;
71  *l2 = m_cacheSizes.m_l2;
72  *l3 = m_cacheSizes.m_l3;
73  }
74  else
75  {
76  eigen_internal_assert(false);
77  }
78 }
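
// Usage sketch (illustrative): the cached sizes are read back exactly as done in
// evaluateProductBlockingSizesHeuristic below:
//   std::ptrdiff_t l1, l2, l3;
//   manage_caching_sizes(GetAction, &l1, &l2, &l3);
// With SetAction, the same call stores user-provided sizes instead of the queried ones.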
79 
80 /* Helper for computeProductBlockingSizes.
81  *
82  * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
83  * this function computes the blocking size parameters along the respective dimensions
84  * for matrix products and related algorithms. The blocking sizes depend on various
85  * parameters:
86  * - the L1 and L2 cache sizes,
87  * - the register level blocking sizes defined by gebp_traits,
88  * - the number of scalars that fit into a packet (when vectorization is enabled).
89  *
90  * \sa setCpuCacheSizes */
91 
92 template<typename LhsScalar, typename RhsScalar, int KcFactor>
93 void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
94 {
95  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
96 
97  // Explanations:
98  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
99  // kc x nc blocks B' on the rhs. B' has to fit into the L2/L3 cache. Moreover, A' is processed
100  // in small mr x kc horizontal panels, where mr is the blocking size along the m dimension
101  // at the register level. Each such small horizontal panel has to stay within the L1 cache.
102  std::ptrdiff_t l1, l2, l3;
103  manage_caching_sizes(GetAction, &l1, &l2, &l3);
104 
105  if (num_threads > 1) {
106  typedef typename Traits::ResScalar ResScalar;
107  enum {
108  kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
109  ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
110  k_mask = -8,
111 
112  mr = Traits::mr,
113  mr_mask = -mr,
114 
115  nr = Traits::nr,
116  nr_mask = -nr
117  };
118  // Increasing k gives us more time to prefetch the content of the "C"
119  // registers. However once the latency is hidden there is no point in
120  // increasing the value of k, so we'll cap it at 320 (value determined
121  // experimentally).
122  const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
123  if (k_cache < k) {
124  k = k_cache & k_mask;
125  eigen_internal_assert(k > 0);
126  }
127 
128  const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
129  const Index n_per_thread = numext::div_ceil(n, num_threads);
130  if (n_cache <= n_per_thread) {
131  // Don't exceed the capacity of the l2 cache.
132  eigen_internal_assert(n_cache >= static_cast<Index>(nr));
133  n = n_cache & nr_mask;
134  eigen_internal_assert(n > 0);
135  } else {
136  n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask);
137  }
138 
139  if (l3 > l2) {
140  // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
141  const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
142  const Index m_per_thread = numext::div_ceil(m, num_threads);
143  if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
144  m = m_cache & mr_mask;
145  eigen_internal_assert(m > 0);
146  } else {
147  m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask);
148  }
149  }
150  }
151  else {
152  // In unit tests we do not want to use extra large matrices,
153  // so we reduce the cache sizes to check that the blocking strategy is not flawed.
154 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
155  l1 = 9*1024;
156  l2 = 32*1024;
157  l3 = 512*1024;
158 #endif
159 
160  // Early return for small problems because the computations below are time consuming for them.
161  // Perhaps it would make more sense to consider k*n*m??
162  // Note that for very tiny problems, this function should be bypassed anyway
163  // because we use the coefficient-based implementation for them.
164  if((std::max)(k,(std::max)(m,n))<48)
165  return;
166 
167  typedef typename Traits::ResScalar ResScalar;
168  enum {
169  k_peeling = 8,
170  k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
171  k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
172  };
173 
174  // ---- 1st level of blocking on L1, yields kc ----
175 
176  // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
177  // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within the L1 cache.
178  // We also include a register-level block of the result (mr x nr).
179  // (In an ideal world only the lhs panel would stay in L1)
180  // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
181  const Index max_kc = ((l1-k_sub)/k_div) & (~(k_peeling-1));
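 // Worked example (illustrative): for a float product on a 16-register FMA target where
 // Traits::mr = 3*8 = 24, Traits::nr = 4, KcFactor = 1 and l1 = 32*1024, this gives
 // k_sub = 24*4*4 = 384, k_div = 24*4 + 4*4 = 112, hence max_kc = ((32768-384)/112) & ~7 = 288.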
182  const Index old_k = k;
183  if(k>max_kc)
184  {
185  // We are really blocking on the third dimension:
186  // -> reduce blocking size to make sure the last block is as large as possible
187  // while keeping the same number of sweeps over the result.
188  k = (k%max_kc)==0 ? max_kc
189  : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
190 
191  eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
192  }
193 
194  // ---- 2nd level of blocking on max(L2,L3), yields nc ----
195 
196  // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
197  // actual_l2 = max(l2, l3/nb_core_sharing_l3)
198  // The number below is quite conservative: it is better to underestimate the cache size than to overestimate it.
199  // For instance, it corresponds to 6MB of L3 shared among 4 cores.
200  #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
201  const Index actual_l2 = l3;
202  #else
203  const Index actual_l2 = 1572864; // == 1.5 MB
204  #endif
205 
206  // Here, nc is chosen such that a kc x nc block of the rhs fits within half of L2.
207  // The second half is implicitly reserved for accessing the result and the lhs coefficients.
208  // When k<max_kc, nc could grow arbitrarily. In practice, it seems fruitful
209  // to limit this growth: we bound nc to grow by a factor of at most 1.5.
210  // However, if the entire lhs block fits within L1, then we are not going to block on the rows at all,
211  // and it becomes beneficial to keep the packed rhs blocks in L1 if there is enough remaining space.
212  Index max_nc;
213  const Index lhs_bytes = m * k * sizeof(LhsScalar);
214  const Index remaining_l1 = l1- k_sub - lhs_bytes;
215  if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
216  {
217  // L1 blocking
218  max_nc = remaining_l1 / (k*sizeof(RhsScalar));
219  }
220  else
221  {
222  // L2 blocking
223  max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
224  }
225  // WARNING Below, we assume that Traits::nr is a power of two.
226  Index nc = std::min<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
227  if(n>nc)
228  {
229  // We are really blocking over the columns:
230  // -> reduce blocking size to make sure the last block is as large as possible
231  // while keeping the same number of sweeps over the packed lhs.
232  // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
233  n = (n%nc)==0 ? nc
234  : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
235  }
236  else if(old_k==k)
237  {
238  // So far, no blocking at all, i.e., kc==k, and nc==n.
239  // In this case, let's perform blocking over the rows such that the packed lhs data is kept in the L1/L2 cache.
240  // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
241  Index problem_size = k*n*sizeof(LhsScalar);
242  Index actual_lm = actual_l2;
243  Index max_mc = m;
244  if(problem_size<=1024)
245  {
246  // the problem is small enough to be kept in L1
247  // Let's choose m such that the lhs block fits in 1/3 of L1
248  actual_lm = l1;
249  }
250  else if(l3!=0 && problem_size<=32768)
251  {
252  // we have both L2 and L3, and the problem is small enough to be kept in L2
253  // Let's choose m such that the lhs block fits in 1/3 of L2
254  actual_lm = l2;
255  max_mc = 576;
256  }
257  Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
258  if (mc > Traits::mr) mc -= mc % Traits::mr;
259  else if (mc==0) return;
260  m = (m%mc)==0 ? mc
261  : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
262  }
263  }
264 }
265 
266 inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
267 {
268 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
269  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
270  k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
271  m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
272  n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
273  return true;
274  }
275 #else
276  EIGEN_UNUSED_VARIABLE(k)
277  EIGEN_UNUSED_VARIABLE(m)
278  EIGEN_UNUSED_VARIABLE(n)
279 #endif
280  return false;
281 }
282 
299 template<typename LhsScalar, typename RhsScalar, int KcFactor>
300 void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
301 {
302  if (!useSpecificBlockingSizes(k, m, n)) {
303  evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads);
304  }
305 
306  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
307  enum {
308  kr = 8,
309  mr = Traits::mr,
310  nr = Traits::nr
311  };
312  if (k > kr) k -= k % kr;
313  if (m > mr) m -= m % mr;
314  if (n > nr) n -= n % nr;
315 }
316 
317 template<typename LhsScalar, typename RhsScalar>
318 inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
319 {
320  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
321 }
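
// Usage sketch (illustrative): the caller passes the problem dimensions and receives the blocking
// sizes kc, mc, nc in place, e.g. for a double-precision m x k times k x n product:
//   Index kc = k, mc = m, nc = n;
//   computeProductBlockingSizes<double,double>(kc, mc, nc, /*num_threads=*/1);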
322 
323 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
324  #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
325 #else
326 
327  // FIXME (a bit overkill maybe ?)
328 
329  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
330  EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
331  {
332  c = cj.pmadd(a,b,c);
333  }
334  };
335 
336  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
337  EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
338  {
339  t = b; t = cj.pmul(a,t); c = padd(c,t);
340  }
341  };
342 
343  template<typename CJ, typename A, typename B, typename C, typename T>
344  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
345  {
346  gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
347  }
348 
349  #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
350 // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
351 #endif
352 
353 /* Vectorization logic
354  * real*real: unpack rhs to constant packets, ...
355  *
356  * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
357  * storing each res packet into two packets (2x2),
358  * at the end combine them: swap the second and addsub them
359  * cf*cf : same but with 2x4 blocks
360  * cplx*real : unpack rhs to constant packets, ...
361  * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
362  */
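// Worked expansion of the cd*cd case above: with a = a_r + i*a_i and b = b_r + i*b_i,
//   a*b = (a_r*b_r - a_i*b_i) + i*(a_r*b_i + a_i*b_r).
// The kernel accumulates the half-products (a_r*b_r, a_i*b_r) and (a_r*b_i, a_i*b_i) in two
// separate packets (see DoublePacket below); swapping the second one gives (a_i*b_i, a_r*b_i),
// and an add/sub against the first yields (a_r*b_r - a_i*b_i, a_i*b_r + a_r*b_i), i.e. the result.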
363 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
364 class gebp_traits
365 {
366 public:
367  typedef _LhsScalar LhsScalar;
368  typedef _RhsScalar RhsScalar;
369  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
370 
371  enum {
372  ConjLhs = _ConjLhs,
373  ConjRhs = _ConjRhs,
374  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
375  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
376  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
377  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
378 
379  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
380 
381  // register block size along the N direction must be 1 or 4
382  nr = 4,
383 
384  // register block size along the M direction (currently, this one cannot be modified)
385  default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
386 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
387  // we assume 16 registers
388  // See bug 992: if the scalar type is not vectorizable but EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
389  // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
390  mr = Vectorizable ? 3*LhsPacketSize : default_mr,
391 #else
392  mr = default_mr,
393 #endif
394 
395  LhsProgress = LhsPacketSize,
396  RhsProgress = 1
397  };
398 
399  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
400  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
401  typedef typename packet_traits<ResScalar>::type _ResPacket;
402 
403  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
404  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
405  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
406 
407  typedef ResPacket AccPacket;
408 
409  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
410  {
411  p = pset1<ResPacket>(ResScalar(0));
412  }
413 
414  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
415  {
416  pbroadcast4(b, b0, b1, b2, b3);
417  }
418 
419 // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
420 // {
421 // pbroadcast2(b, b0, b1);
422 // }
423 
424  template<typename RhsPacketType>
425  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
426  {
427  dest = pset1<RhsPacketType>(*b);
428  }
429 
430  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
431  {
432  dest = ploadquad<RhsPacket>(b);
433  }
434 
435  template<typename LhsPacketType>
436  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
437  {
438  dest = pload<LhsPacketType>(a);
439  }
440 
441  template<typename LhsPacketType>
442  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
443  {
444  dest = ploadu<LhsPacketType>(a);
445  }
446 
447  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
448  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
449  {
450  // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
451  // let gcc allocate the register in which to store the result of the pmul
452  // (in the case where there is no FMA) gcc fails to figure out how to avoid
453  // spilling registers.
454 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
455  EIGEN_UNUSED_VARIABLE(tmp);
456  c = pmadd(a,b,c);
457 #else
458  tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
459 #endif
460  }
461 
462  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
463  {
464  r = pmadd(c,alpha,r);
465  }
466 
467  template<typename ResPacketHalf>
468  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
469  {
470  r = pmadd(c,alpha,r);
471  }
472 
473 protected:
474 // conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
475 // conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
476 };
477 
478 template<typename RealScalar, bool _ConjLhs>
479 class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
480 {
481 public:
482  typedef std::complex<RealScalar> LhsScalar;
483  typedef RealScalar RhsScalar;
484  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
485 
486  enum {
487  ConjLhs = _ConjLhs,
488  ConjRhs = false,
489  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
490  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
491  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
492  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
493 
494  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
495  nr = 4,
496 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
497  // we assume 16 registers
498  mr = 3*LhsPacketSize,
499 #else
500  mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
501 #endif
502 
503  LhsProgress = LhsPacketSize,
504  RhsProgress = 1
505  };
506 
507  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
508  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
509  typedef typename packet_traits<ResScalar>::type _ResPacket;
510 
511  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
512  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
513  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
514 
515  typedef ResPacket AccPacket;
516 
517  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
518  {
519  p = pset1<ResPacket>(ResScalar(0));
520  }
521 
522  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
523  {
524  dest = pset1<RhsPacket>(*b);
525  }
526 
527  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
528  {
529  dest = pset1<RhsPacket>(*b);
530  }
531 
532  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
533  {
534  dest = pload<LhsPacket>(a);
535  }
536 
537  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
538  {
539  dest = ploadu<LhsPacket>(a);
540  }
541 
542  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
543  {
544  pbroadcast4(b, b0, b1, b2, b3);
545  }
546 
547 // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
548 // {
549 // pbroadcast2(b, b0, b1);
550 // }
551 
552  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
553  {
554  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
555  }
556 
557  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
558  {
559 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
560  EIGEN_UNUSED_VARIABLE(tmp);
561  c.v = pmadd(a.v,b,c.v);
562 #else
563  tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
564 #endif
565  }
566 
567  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
568  {
569  c += a * b;
570  }
571 
572  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
573  {
574  r = cj.pmadd(c,alpha,r);
575  }
576 
577 protected:
578  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
579 };
580 
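// A DoublePacket carries the two halves of a partially computed complex product: as an rhs packet,
// 'first' is filled with real(b) and 'second' with imag(b) (see loadRhs below); as an accumulator,
// 'first' collects the products against real(b) and 'second' those against imag(b), which acc()
// later recombines into the complex result.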
581 template<typename Packet>
582 struct DoublePacket
583 {
584  Packet first;
585  Packet second;
586 };
587 
588 template<typename Packet>
589 DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
590 {
591  DoublePacket<Packet> res;
592  res.first = padd(a.first, b.first);
593  res.second = padd(a.second,b.second);
594  return res;
595 }
596 
597 template<typename Packet>
598 const DoublePacket<Packet>& predux4(const DoublePacket<Packet> &a)
599 {
600  return a;
601 }
602 
603 template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
604 // template<typename Packet>
605 // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
606 // {
607 // DoublePacket<Packet> res;
608 // res.first = padd(a.first, b.first);
609 // res.second = padd(a.second,b.second);
610 // return res;
611 // }
612 
613 template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
614 class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
615 {
616 public:
617  typedef std::complex<RealScalar> Scalar;
618  typedef std::complex<RealScalar> LhsScalar;
619  typedef std::complex<RealScalar> RhsScalar;
620  typedef std::complex<RealScalar> ResScalar;
621 
622  enum {
623  ConjLhs = _ConjLhs,
624  ConjRhs = _ConjRhs,
625  Vectorizable = packet_traits<RealScalar>::Vectorizable
626  && packet_traits<Scalar>::Vectorizable,
627  RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
628  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
629  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
630  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
631 
632  // FIXME: should depend on NumberOfRegisters
633  nr = 4,
634  mr = ResPacketSize,
635 
636  LhsProgress = ResPacketSize,
637  RhsProgress = 1
638  };
639 
640  typedef typename packet_traits<RealScalar>::type RealPacket;
641  typedef typename packet_traits<Scalar>::type ScalarPacket;
642  typedef DoublePacket<RealPacket> DoublePacketType;
643 
644  typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
645  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
646  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
647  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
648 
649  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
650 
651  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
652  {
653  p.first = pset1<RealPacket>(RealScalar(0));
654  p.second = pset1<RealPacket>(RealScalar(0));
655  }
656 
657  // Scalar path
658  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
659  {
660  dest = pset1<ResPacket>(*b);
661  }
662 
663  // Vectorized path
664  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
665  {
666  dest.first = pset1<RealPacket>(real(*b));
667  dest.second = pset1<RealPacket>(imag(*b));
668  }
669 
670  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
671  {
672  loadRhs(b,dest);
673  }
674  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
675  {
676  eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
677  loadRhs(b,dest);
678  }
679 
680  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
681  {
682  // FIXME not sure that's the best way to implement it!
683  loadRhs(b+0, b0);
684  loadRhs(b+1, b1);
685  loadRhs(b+2, b2);
686  loadRhs(b+3, b3);
687  }
688 
689  // Vectorized path
690  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
691  {
692  // FIXME not sure that's the best way to implement it!
693  loadRhs(b+0, b0);
694  loadRhs(b+1, b1);
695  }
696 
697  // Scalar path
698  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
699  {
700  // FIXME not sure that's the best way to implement it!
701  loadRhs(b+0, b0);
702  loadRhs(b+1, b1);
703  }
704 
705  // nothing special here
706  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
707  {
708  dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
709  }
710 
711  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
712  {
713  dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
714  }
715 
716  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
717  {
718  c.first = padd(pmul(a,b.first), c.first);
719  c.second = padd(pmul(a,b.second),c.second);
720  }
721 
722  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
723  {
724  c = cj.pmadd(a,b,c);
725  }
726 
727  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
728 
729  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
730  {
731  // assemble c
732  ResPacket tmp;
733  if((!ConjLhs)&&(!ConjRhs))
734  {
735  tmp = pcplxflip(pconj(ResPacket(c.second)));
736  tmp = padd(ResPacket(c.first),tmp);
737  }
738  else if((!ConjLhs)&&(ConjRhs))
739  {
740  tmp = pconj(pcplxflip(ResPacket(c.second)));
741  tmp = padd(ResPacket(c.first),tmp);
742  }
743  else if((ConjLhs)&&(!ConjRhs))
744  {
745  tmp = pcplxflip(ResPacket(c.second));
746  tmp = padd(pconj(ResPacket(c.first)),tmp);
747  }
748  else if((ConjLhs)&&(ConjRhs))
749  {
750  tmp = pcplxflip(ResPacket(c.second));
751  tmp = psub(pconj(ResPacket(c.first)),tmp);
752  }
753 
754  r = pmadd(tmp,alpha,r);
755  }
756 
757 protected:
758  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
759 };
760 
761 template<typename RealScalar, bool _ConjRhs>
762 class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
763 {
764 public:
765  typedef std::complex<RealScalar> Scalar;
766  typedef RealScalar LhsScalar;
767  typedef Scalar RhsScalar;
768  typedef Scalar ResScalar;
769 
770  enum {
771  ConjLhs = false,
772  ConjRhs = _ConjRhs,
773  Vectorizable = packet_traits<RealScalar>::Vectorizable
774  && packet_traits<Scalar>::Vectorizable,
775  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
776  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
777  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
778 
779  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
780  // FIXME: should depend on NumberOfRegisters
781  nr = 4,
782  mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
783 
784  LhsProgress = ResPacketSize,
785  RhsProgress = 1
786  };
787 
788  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
789  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
790  typedef typename packet_traits<ResScalar>::type _ResPacket;
791 
792  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
793  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
794  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
795 
796  typedef ResPacket AccPacket;
797 
798  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
799  {
800  p = pset1<ResPacket>(ResScalar(0));
801  }
802 
803  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
804  {
805  dest = pset1<RhsPacket>(*b);
806  }
807 
808  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
809  {
810  pbroadcast4(b, b0, b1, b2, b3);
811  }
812 
813 // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
814 // {
815 // // FIXME not sure that's the best way to implement it!
816 // b0 = pload1<RhsPacket>(b+0);
817 // b1 = pload1<RhsPacket>(b+1);
818 // }
819 
820  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
821  {
822  dest = ploaddup<LhsPacket>(a);
823  }
824 
825  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
826  {
827  eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
828  loadRhs(b,dest);
829  }
830 
831  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
832  {
833  dest = ploaddup<LhsPacket>(a);
834  }
835 
836  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
837  {
838  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
839  }
840 
841  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
842  {
843 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
844  EIGEN_UNUSED_VARIABLE(tmp);
845  c.v = pmadd(a,b.v,c.v);
846 #else
847  tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
848 #endif
849 
850  }
851 
852  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
853  {
854  c += a * b;
855  }
856 
857  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
858  {
859  r = cj.pmadd(alpha,c,r);
860  }
861 
862 protected:
863  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
864 };
865 
866 // helper for the rotating kernel below
867 template <typename GebpKernel, bool UseRotatingKernel = GebpKernel::UseRotatingKernel>
868 struct PossiblyRotatingKernelHelper
869 {
870  // default implementation, not rotating
871 
872  typedef typename GebpKernel::Traits Traits;
873  typedef typename Traits::RhsScalar RhsScalar;
874  typedef typename Traits::RhsPacket RhsPacket;
875  typedef typename Traits::AccPacket AccPacket;
876 
877  const Traits& traits;
878  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
879 
880 
881  template <size_t K, size_t Index>
882  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
883  {
884  traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to);
885  }
886 
887  void unrotateResult(AccPacket&,
888  AccPacket&,
889  AccPacket&,
890  AccPacket&)
891  {
892  }
893 };
894 
895 // rotating implementation
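// Idea (used on ARM with 4-wide float packets, cf. UseRotatingKernel below): instead of issuing four
// broadcast loads per step, the full rhs packet is loaded once (Index==0) and then rotated by one
// lane for the three remaining uses. Each accumulator thus gathers contributions from the four rhs
// columns spread across its lanes; unrotateResult puts them back in column order with the
// transpose / rotate / transpose sequence below.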
896 template <typename GebpKernel>
897 struct PossiblyRotatingKernelHelper<GebpKernel, true>
898 {
899  typedef typename GebpKernel::Traits Traits;
900  typedef typename Traits::RhsScalar RhsScalar;
901  typedef typename Traits::RhsPacket RhsPacket;
902  typedef typename Traits::AccPacket AccPacket;
903 
904  const Traits& traits;
905  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
906 
907  template <size_t K, size_t Index>
908  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
909  {
910  if (Index == 0) {
911  to = pload<RhsPacket>(from + 4*K*Traits::RhsProgress);
912  } else {
913  EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers");
914  to = protate<1>(to);
915  }
916  }
917 
918  void unrotateResult(AccPacket& res0,
919  AccPacket& res1,
920  AccPacket& res2,
921  AccPacket& res3)
922  {
923  PacketBlock<AccPacket> resblock;
924  resblock.packet[0] = res0;
925  resblock.packet[1] = res1;
926  resblock.packet[2] = res2;
927  resblock.packet[3] = res3;
928  ptranspose(resblock);
929  resblock.packet[3] = protate<1>(resblock.packet[3]);
930  resblock.packet[2] = protate<2>(resblock.packet[2]);
931  resblock.packet[1] = protate<3>(resblock.packet[1]);
932  ptranspose(resblock);
933  res0 = resblock.packet[0];
934  res1 = resblock.packet[1];
935  res2 = resblock.packet[2];
936  res3 = resblock.packet[3];
937  }
938 };
939 
940 /* optimized GEneral packed Block * packed Panel product kernel
941  *
942  * Mixing type logic: C += A * B
943  * | A | B | comments
944  * |real |cplx | no vectorization yet, would require packing A with duplication
945  * |cplx |real | easy vectorization
946  */
947 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
948 struct gebp_kernel
949 {
950  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
951  typedef typename Traits::ResScalar ResScalar;
952  typedef typename Traits::LhsPacket LhsPacket;
953  typedef typename Traits::RhsPacket RhsPacket;
954  typedef typename Traits::ResPacket ResPacket;
955  typedef typename Traits::AccPacket AccPacket;
956 
957  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
958  typedef typename SwappedTraits::ResScalar SResScalar;
959  typedef typename SwappedTraits::LhsPacket SLhsPacket;
960  typedef typename SwappedTraits::RhsPacket SRhsPacket;
961  typedef typename SwappedTraits::ResPacket SResPacket;
962  typedef typename SwappedTraits::AccPacket SAccPacket;
963 
964  typedef typename DataMapper::LinearMapper LinearMapper;
965 
966  enum {
967  Vectorizable = Traits::Vectorizable,
968  LhsProgress = Traits::LhsProgress,
969  RhsProgress = Traits::RhsProgress,
970  ResPacketSize = Traits::ResPacketSize
971  };
972 
973 
974  static const bool UseRotatingKernel =
975  EIGEN_ARCH_ARM &&
976  internal::is_same<LhsScalar, float>::value &&
977  internal::is_same<RhsScalar, float>::value &&
978  internal::is_same<ResScalar, float>::value &&
979  Traits::LhsPacketSize == 4 &&
980  Traits::RhsPacketSize == 4 &&
981  Traits::ResPacketSize == 4;
982 
983  EIGEN_DONT_INLINE
984  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
985  Index rows, Index depth, Index cols, ResScalar alpha,
986  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
987 };
988 
989 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
990 EIGEN_DONT_INLINE
991 void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
992  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
993  Index rows, Index depth, Index cols, ResScalar alpha,
994  Index strideA, Index strideB, Index offsetA, Index offsetB)
995  {
996  Traits traits;
997  SwappedTraits straits;
998 
999  if(strideA==-1) strideA = depth;
1000  if(strideB==-1) strideB = depth;
1001  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
1002  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
1003  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
1004  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
1005  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
1006  enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
1007  const Index peeled_kc = depth & ~(pk-1);
1008  const Index prefetch_res_offset = 32/sizeof(ResScalar);
1009 // const Index depth2 = depth & ~1;
1010 
1011  //---------- Process 3 * LhsProgress rows at once ----------
1012  // This corresponds to 3*LhsProgress x nr register blocks.
1013  // Usually makes sense only with FMA
1014  if(mr>=3*Traits::LhsProgress)
1015  {
1016  PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits);
1017 
1018  // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
1019  // and on each largest micro vertical panel of the rhs (depth * nr).
1020  // The blocking size, i.e., 'depth', has been computed so that the micro horizontal panel of the lhs fits in L1.
1021  // However, if depth is too small, we can extend the number of rows of these horizontal panels.
1022  // This actual number of rows is computed as follows:
1023  const Index l1 = defaultL1CacheSize; // in bytes; TODO: l1 should be passed to this function.
1024  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1025  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1026  // or because we are testing specific blocking sizes.
1027  const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
1028  for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
1029  {
1030  const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
1031  for(Index j2=0; j2<packet_cols4; j2+=nr)
1032  {
1033  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1034  {
1035 
1036  // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
1037  // stored into 3 x nr registers.
1038 
1039  const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
1040  prefetch(&blA[0]);
1041 
1042  // gets res block as register
1043  AccPacket C0, C1, C2, C3,
1044  C4, C5, C6, C7,
1045  C8, C9, C10, C11;
1046  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1047  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1048  traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
1049 
1050  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1051  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1052  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1053  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1054 
1055  r0.prefetch(0);
1056  r1.prefetch(0);
1057  r2.prefetch(0);
1058  r3.prefetch(0);
1059 
1060  // performs "inner" products
1061  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1062  prefetch(&blB[0]);
1063  LhsPacket A0, A1;
1064 
1065  for(Index k=0; k<peeled_kc; k+=pk)
1066  {
1067  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
1068  RhsPacket B_0, T0;
1069  LhsPacket A2;
1070 
1071 #define EIGEN_GEBP_ONESTEP(K) \
1072  do { \
1073  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1074  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1075  internal::prefetch(blA+(3*K+16)*LhsProgress); \
1076  if (EIGEN_ARCH_ARM) internal::prefetch(blB+(4*K+16)*RhsProgress); /* Bug 953 */ \
1077  traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
1078  traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
1079  traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
1080  possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 0>(B_0, blB); \
1081  traits.madd(A0, B_0, C0, T0); \
1082  traits.madd(A1, B_0, C4, T0); \
1083  traits.madd(A2, B_0, C8, B_0); \
1084  possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 1>(B_0, blB); \
1085  traits.madd(A0, B_0, C1, T0); \
1086  traits.madd(A1, B_0, C5, T0); \
1087  traits.madd(A2, B_0, C9, B_0); \
1088  possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 2>(B_0, blB); \
1089  traits.madd(A0, B_0, C2, T0); \
1090  traits.madd(A1, B_0, C6, T0); \
1091  traits.madd(A2, B_0, C10, B_0); \
1092  possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 3>(B_0, blB); \
1093  traits.madd(A0, B_0, C3 , T0); \
1094  traits.madd(A1, B_0, C7, T0); \
1095  traits.madd(A2, B_0, C11, B_0); \
1096  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1097  } while(false)
1098 
1099  internal::prefetch(blB);
1100  EIGEN_GEBP_ONESTEP(0);
1101  EIGEN_GEBP_ONESTEP(1);
1102  EIGEN_GEBP_ONESTEP(2);
1103  EIGEN_GEBP_ONESTEP(3);
1104  EIGEN_GEBP_ONESTEP(4);
1105  EIGEN_GEBP_ONESTEP(5);
1106  EIGEN_GEBP_ONESTEP(6);
1107  EIGEN_GEBP_ONESTEP(7);
1108 
1109  blB += pk*4*RhsProgress;
1110  blA += pk*3*Traits::LhsProgress;
1111 
1112  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
1113  }
1114  // process remaining peeled loop
1115  for(Index k=peeled_kc; k<depth; k++)
1116  {
1117  RhsPacket B_0, T0;
1118  LhsPacket A2;
1119  EIGEN_GEBP_ONESTEP(0);
1120  blB += 4*RhsProgress;
1121  blA += 3*Traits::LhsProgress;
1122  }
1123 
1124 #undef EIGEN_GEBP_ONESTEP
1125 
1126  possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3);
1127  possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7);
1128  possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11);
1129 
1130  ResPacket R0, R1, R2;
1131  ResPacket alphav = pset1<ResPacket>(alpha);
1132 
1133  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1134  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1135  R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1136  traits.acc(C0, alphav, R0);
1137  traits.acc(C4, alphav, R1);
1138  traits.acc(C8, alphav, R2);
1139  r0.storePacket(0 * Traits::ResPacketSize, R0);
1140  r0.storePacket(1 * Traits::ResPacketSize, R1);
1141  r0.storePacket(2 * Traits::ResPacketSize, R2);
1142 
1143  R0 = r1.loadPacket(0 * Traits::ResPacketSize);
1144  R1 = r1.loadPacket(1 * Traits::ResPacketSize);
1145  R2 = r1.loadPacket(2 * Traits::ResPacketSize);
1146  traits.acc(C1, alphav, R0);
1147  traits.acc(C5, alphav, R1);
1148  traits.acc(C9, alphav, R2);
1149  r1.storePacket(0 * Traits::ResPacketSize, R0);
1150  r1.storePacket(1 * Traits::ResPacketSize, R1);
1151  r1.storePacket(2 * Traits::ResPacketSize, R2);
1152 
1153  R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1154  R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1155  R2 = r2.loadPacket(2 * Traits::ResPacketSize);
1156  traits.acc(C2, alphav, R0);
1157  traits.acc(C6, alphav, R1);
1158  traits.acc(C10, alphav, R2);
1159  r2.storePacket(0 * Traits::ResPacketSize, R0);
1160  r2.storePacket(1 * Traits::ResPacketSize, R1);
1161  r2.storePacket(2 * Traits::ResPacketSize, R2);
1162 
1163  R0 = r3.loadPacket(0 * Traits::ResPacketSize);
1164  R1 = r3.loadPacket(1 * Traits::ResPacketSize);
1165  R2 = r3.loadPacket(2 * Traits::ResPacketSize);
1166  traits.acc(C3, alphav, R0);
1167  traits.acc(C7, alphav, R1);
1168  traits.acc(C11, alphav, R2);
1169  r3.storePacket(0 * Traits::ResPacketSize, R0);
1170  r3.storePacket(1 * Traits::ResPacketSize, R1);
1171  r3.storePacket(2 * Traits::ResPacketSize, R2);
1172  }
1173  }
1174 
1175  // Deal with remaining columns of the rhs
1176  for(Index j2=packet_cols4; j2<cols; j2++)
1177  {
1178  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1179  {
1180  // One column at a time
1181  const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
1182  prefetch(&blA[0]);
1183 
1184  // gets res block as register
1185  AccPacket C0, C4, C8;
1186  traits.initAcc(C0);
1187  traits.initAcc(C4);
1188  traits.initAcc(C8);
1189 
1190  LinearMapper r0 = res.getLinearMapper(i, j2);
1191  r0.prefetch(0);
1192 
1193  // performs "inner" products
1194  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1195  LhsPacket A0, A1, A2;
1196 
1197  for(Index k=0; k<peeled_kc; k+=pk)
1198  {
1199  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1200  RhsPacket B_0;
1201 #define EIGEN_GEBGP_ONESTEP(K) \
1202  do { \
1203  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1204  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1205  traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
1206  traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
1207  traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
1208  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1209  traits.madd(A0, B_0, C0, B_0); \
1210  traits.madd(A1, B_0, C4, B_0); \
1211  traits.madd(A2, B_0, C8, B_0); \
1212  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1213  } while(false)
1214 
1215  EIGEN_GEBGP_ONESTEP(0);
1216  EIGEN_GEBGP_ONESTEP(1);
1217  EIGEN_GEBGP_ONESTEP(2);
1218  EIGEN_GEBGP_ONESTEP(3);
1219  EIGEN_GEBGP_ONESTEP(4);
1220  EIGEN_GEBGP_ONESTEP(5);
1221  EIGEN_GEBGP_ONESTEP(6);
1222  EIGEN_GEBGP_ONESTEP(7);
1223 
1224  blB += pk*RhsProgress;
1225  blA += pk*3*Traits::LhsProgress;
1226 
1227  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
1228  }
1229 
1230  // process remaining peeled loop
1231  for(Index k=peeled_kc; k<depth; k++)
1232  {
1233  RhsPacket B_0;
1234  EIGEN_GEBGP_ONESTEP(0);
1235  blB += RhsProgress;
1236  blA += 3*Traits::LhsProgress;
1237  }
1238 #undef EIGEN_GEBGP_ONESTEP
1239  ResPacket R0, R1, R2;
1240  ResPacket alphav = pset1<ResPacket>(alpha);
1241 
1242  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1243  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1244  R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1245  traits.acc(C0, alphav, R0);
1246  traits.acc(C4, alphav, R1);
1247  traits.acc(C8, alphav, R2);
1248  r0.storePacket(0 * Traits::ResPacketSize, R0);
1249  r0.storePacket(1 * Traits::ResPacketSize, R1);
1250  r0.storePacket(2 * Traits::ResPacketSize, R2);
1251  }
1252  }
1253  }
1254  }
1255 
1256  //---------- Process 2 * LhsProgress rows at once ----------
1257  if(mr>=2*Traits::LhsProgress)
1258  {
1259  const Index l1 = defaultL1CacheSize; // in bytes; TODO: l1 should be passed to this function.
1260  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1261  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1262  // or because we are testing specific blocking sizes.
1263  Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
1264 
1265  for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
1266  {
1267  Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
1268  for(Index j2=0; j2<packet_cols4; j2+=nr)
1269  {
1270  for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1271  {
1272 
1273  // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
1274  // stored into 2 x nr registers.
1275 
1276  const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1277  prefetch(&blA[0]);
1278 
1279  // gets res block as register
1280  AccPacket C0, C1, C2, C3,
1281  C4, C5, C6, C7;
1282  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1283  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1284 
1285  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1286  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1287  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1288  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1289 
1290  r0.prefetch(prefetch_res_offset);
1291  r1.prefetch(prefetch_res_offset);
1292  r2.prefetch(prefetch_res_offset);
1293  r3.prefetch(prefetch_res_offset);
1294 
1295  // performs "inner" products
1296  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1297  prefetch(&blB[0]);
1298  LhsPacket A0, A1;
1299 
1300  for(Index k=0; k<peeled_kc; k+=pk)
1301  {
1302  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
1303  RhsPacket B_0, B1, B2, B3, T0;
1304 
1305  #define EIGEN_GEBGP_ONESTEP(K) \
1306  do { \
1307  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1308  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1309  traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1310  traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1311  traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1312  traits.madd(A0, B_0, C0, T0); \
1313  traits.madd(A1, B_0, C4, B_0); \
1314  traits.madd(A0, B1, C1, T0); \
1315  traits.madd(A1, B1, C5, B1); \
1316  traits.madd(A0, B2, C2, T0); \
1317  traits.madd(A1, B2, C6, B2); \
1318  traits.madd(A0, B3, C3, T0); \
1319  traits.madd(A1, B3, C7, B3); \
1320  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1321  } while(false)
1322 
1323  internal::prefetch(blB+(48+0));
1324  EIGEN_GEBGP_ONESTEP(0);
1325  EIGEN_GEBGP_ONESTEP(1);
1326  EIGEN_GEBGP_ONESTEP(2);
1327  EIGEN_GEBGP_ONESTEP(3);
1328  internal::prefetch(blB+(48+16));
1329  EIGEN_GEBGP_ONESTEP(4);
1330  EIGEN_GEBGP_ONESTEP(5);
1331  EIGEN_GEBGP_ONESTEP(6);
1332  EIGEN_GEBGP_ONESTEP(7);
1333 
1334  blB += pk*4*RhsProgress;
1335  blA += pk*(2*Traits::LhsProgress);
1336 
1337  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
1338  }
1339  // process remaining peeled loop
1340  for(Index k=peeled_kc; k<depth; k++)
1341  {
1342  RhsPacket B_0, B1, B2, B3, T0;
1343  EIGEN_GEBGP_ONESTEP(0);
1344  blB += 4*RhsProgress;
1345  blA += 2*Traits::LhsProgress;
1346  }
1347 #undef EIGEN_GEBGP_ONESTEP
1348 
1349  ResPacket R0, R1, R2, R3;
1350  ResPacket alphav = pset1<ResPacket>(alpha);
1351 
1352  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1353  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1354  R2 = r1.loadPacket(0 * Traits::ResPacketSize);
1355  R3 = r1.loadPacket(1 * Traits::ResPacketSize);
1356  traits.acc(C0, alphav, R0);
1357  traits.acc(C4, alphav, R1);
1358  traits.acc(C1, alphav, R2);
1359  traits.acc(C5, alphav, R3);
1360  r0.storePacket(0 * Traits::ResPacketSize, R0);
1361  r0.storePacket(1 * Traits::ResPacketSize, R1);
1362  r1.storePacket(0 * Traits::ResPacketSize, R2);
1363  r1.storePacket(1 * Traits::ResPacketSize, R3);
1364 
1365  R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1366  R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1367  R2 = r3.loadPacket(0 * Traits::ResPacketSize);
1368  R3 = r3.loadPacket(1 * Traits::ResPacketSize);
1369  traits.acc(C2, alphav, R0);
1370  traits.acc(C6, alphav, R1);
1371  traits.acc(C3, alphav, R2);
1372  traits.acc(C7, alphav, R3);
1373  r2.storePacket(0 * Traits::ResPacketSize, R0);
1374  r2.storePacket(1 * Traits::ResPacketSize, R1);
1375  r3.storePacket(0 * Traits::ResPacketSize, R2);
1376  r3.storePacket(1 * Traits::ResPacketSize, R3);
1377  }
1378  }
1379 
1380  // Deal with remaining columns of the rhs
1381  for(Index j2=packet_cols4; j2<cols; j2++)
1382  {
1383  for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1384  {
1385  // One column at a time
1386  const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1387  prefetch(&blA[0]);
1388 
1389  // gets res block as register
1390  AccPacket C0, C4;
1391  traits.initAcc(C0);
1392  traits.initAcc(C4);
1393 
1394  LinearMapper r0 = res.getLinearMapper(i, j2);
1395  r0.prefetch(prefetch_res_offset);
1396 
1397  // performs "inner" products
1398  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1399  LhsPacket A0, A1;
1400 
1401  for(Index k=0; k<peeled_kc; k+=pk)
1402  {
1403  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
1404  RhsPacket B_0, B1;
1405 
1406 #define EIGEN_GEBGP_ONESTEP(K) \
1407  do { \
1408  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
1409  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1410  traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1411  traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1412  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1413  traits.madd(A0, B_0, C0, B1); \
1414  traits.madd(A1, B_0, C4, B_0); \
1415  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1416  } while(false)
1417 
1418  EIGEN_GEBGP_ONESTEP(0);
1419  EIGEN_GEBGP_ONESTEP(1);
1420  EIGEN_GEBGP_ONESTEP(2);
1421  EIGEN_GEBGP_ONESTEP(3);
1422  EIGEN_GEBGP_ONESTEP(4);
1423  EIGEN_GEBGP_ONESTEP(5);
1424  EIGEN_GEBGP_ONESTEP(6);
1425  EIGEN_GEBGP_ONESTEP(7);
1426 
1427  blB += pk*RhsProgress;
1428  blA += pk*2*Traits::LhsProgress;
1429 
1430  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
1431  }
1432 
1433  // process remaining peeled loop
1434  for(Index k=peeled_kc; k<depth; k++)
1435  {
1436  RhsPacket B_0, B1;
1437  EIGEN_GEBGP_ONESTEP(0);
1438  blB += RhsProgress;
1439  blA += 2*Traits::LhsProgress;
1440  }
1441 #undef EIGEN_GEBGP_ONESTEP
1442  ResPacket R0, R1;
1443  ResPacket alphav = pset1<ResPacket>(alpha);
1444 
1445  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1446  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1447  traits.acc(C0, alphav, R0);
1448  traits.acc(C4, alphav, R1);
1449  r0.storePacket(0 * Traits::ResPacketSize, R0);
1450  r0.storePacket(1 * Traits::ResPacketSize, R1);
1451  }
1452  }
1453  }
1454  }
1455  //---------- Process 1 * LhsProgress rows at once ----------
1456  if(mr>=1*Traits::LhsProgress)
1457  {
1458  // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
1459  for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
1460  {
1461  // loops on each largest micro vertical panel of rhs (depth * nr)
1462  for(Index j2=0; j2<packet_cols4; j2+=nr)
1463  {
1464  // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
1465  // stored into 1 x nr registers.
1466 
1467  const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1468  prefetch(&blA[0]);
1469 
1470  // gets res block as register
1471  AccPacket C0, C1, C2, C3;
1472  traits.initAcc(C0);
1473  traits.initAcc(C1);
1474  traits.initAcc(C2);
1475  traits.initAcc(C3);
1476 
1477  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1478  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1479  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1480  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1481 
1482  r0.prefetch(prefetch_res_offset);
1483  r1.prefetch(prefetch_res_offset);
1484  r2.prefetch(prefetch_res_offset);
1485  r3.prefetch(prefetch_res_offset);
1486 
1487  // performs "inner" products
1488  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1489  prefetch(&blB[0]);
1490  LhsPacket A0;
1491 
1492  for(Index k=0; k<peeled_kc; k+=pk)
1493  {
1494  EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
1495  RhsPacket B_0, B1, B2, B3;
1496 
1497 #define EIGEN_GEBGP_ONESTEP(K) \
1498  do { \
1499  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
1500  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1501  traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1502  traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1503  traits.madd(A0, B_0, C0, B_0); \
1504  traits.madd(A0, B1, C1, B1); \
1505  traits.madd(A0, B2, C2, B2); \
1506  traits.madd(A0, B3, C3, B3); \
1507  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
1508  } while(false)
1509 
1510  internal::prefetch(blB+(48+0));
1511  EIGEN_GEBGP_ONESTEP(0);
1512  EIGEN_GEBGP_ONESTEP(1);
1513  EIGEN_GEBGP_ONESTEP(2);
1514  EIGEN_GEBGP_ONESTEP(3);
1515  internal::prefetch(blB+(48+16));
1516  EIGEN_GEBGP_ONESTEP(4);
1517  EIGEN_GEBGP_ONESTEP(5);
1518  EIGEN_GEBGP_ONESTEP(6);
1519  EIGEN_GEBGP_ONESTEP(7);
1520 
1521  blB += pk*4*RhsProgress;
1522  blA += pk*1*LhsProgress;
1523 
1524  EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
1525  }
1526  // process remaining peeled loop
1527  for(Index k=peeled_kc; k<depth; k++)
1528  {
1529  RhsPacket B_0, B1, B2, B3;
1530  EIGEN_GEBGP_ONESTEP(0);
1531  blB += 4*RhsProgress;
1532  blA += 1*LhsProgress;
1533  }
1534 #undef EIGEN_GEBGP_ONESTEP
1535 
1536  ResPacket R0, R1;
1537  ResPacket alphav = pset1<ResPacket>(alpha);
1538 
1539  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1540  R1 = r1.loadPacket(0 * Traits::ResPacketSize);
1541  traits.acc(C0, alphav, R0);
1542  traits.acc(C1, alphav, R1);
1543  r0.storePacket(0 * Traits::ResPacketSize, R0);
1544  r1.storePacket(0 * Traits::ResPacketSize, R1);
1545 
1546  R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1547  R1 = r3.loadPacket(0 * Traits::ResPacketSize);
1548  traits.acc(C2, alphav, R0);
1549  traits.acc(C3, alphav, R1);
1550  r2.storePacket(0 * Traits::ResPacketSize, R0);
1551  r3.storePacket(0 * Traits::ResPacketSize, R1);
1552  }
1553 
1554  // Deal with remaining columns of the rhs
1555  for(Index j2=packet_cols4; j2<cols; j2++)
1556  {
1557  // One column at a time
1558  const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1559  prefetch(&blA[0]);
1560 
1561  // gets res block as register
1562  AccPacket C0;
1563  traits.initAcc(C0);
1564 
1565  LinearMapper r0 = res.getLinearMapper(i, j2);
1566 
1567  // performs "inner" products
1568  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1569  LhsPacket A0;
1570 
1571  for(Index k=0; k<peeled_kc; k+=pk)
1572  {
1573  EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
1574  RhsPacket B_0;
1575 
1576 #define EIGEN_GEBGP_ONESTEP(K) \
1577  do { \
1578  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
1579  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1580  traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1581  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1582  traits.madd(A0, B_0, C0, B_0); \
1583  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
1584  } while(false)
1585 
1586  EIGEN_GEBGP_ONESTEP(0);
1587  EIGEN_GEBGP_ONESTEP(1);
1588  EIGEN_GEBGP_ONESTEP(2);
1589  EIGEN_GEBGP_ONESTEP(3);
1590  EIGEN_GEBGP_ONESTEP(4);
1591  EIGEN_GEBGP_ONESTEP(5);
1592  EIGEN_GEBGP_ONESTEP(6);
1593  EIGEN_GEBGP_ONESTEP(7);
1594 
1595  blB += pk*RhsProgress;
1596  blA += pk*1*Traits::LhsProgress;
1597 
1598  EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
1599  }
1600 
1601  // process the remaining depth not covered by the peeled loop above
1602  for(Index k=peeled_kc; k<depth; k++)
1603  {
1604  RhsPacket B_0;
1605  EIGEN_GEBGP_ONESTEP(0);
1606  blB += RhsProgress;
1607  blA += 1*Traits::LhsProgress;
1608  }
1609 #undef EIGEN_GEBGP_ONESTEP
1610  ResPacket R0;
1611  ResPacket alphav = pset1<ResPacket>(alpha);
1612  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1613  traits.acc(C0, alphav, R0);
1614  r0.storePacket(0 * Traits::ResPacketSize, R0);
1615  }
1616  }
1617  }
1618  //---------- Process remaining rows, one at a time ----------
1619  if(peeled_mc1<rows)
1620  {
1621  // loop on each panel of the rhs
1622  for(Index j2=0; j2<packet_cols4; j2+=nr)
1623  {
1624  // loop on each row of the lhs (1*LhsProgress x depth)
1625  for(Index i=peeled_mc1; i<rows; i+=1)
1626  {
1627  const LhsScalar* blA = &blockA[i*strideA+offsetA];
1628  prefetch(&blA[0]);
1629  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1630 
1631  if( (SwappedTraits::LhsProgress % 4)==0 )
1632  {
1633  // NOTE The following piece of code won't work for 512-bit registers
1634  SAccPacket C0, C1, C2, C3;
1635  straits.initAcc(C0);
1636  straits.initAcc(C1);
1637  straits.initAcc(C2);
1638  straits.initAcc(C3);
1639 
1640  const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
1641  const Index endk = (depth/spk)*spk;
1642  const Index endk4 = (depth/(spk*4))*(spk*4);
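        // endk4 is the depth rounded down to a multiple of 4*spk (for the 4-way
        // unrolled loop just below); endk is the depth rounded down to a multiple
        // of spk (for the single-step loop that follows it).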
1643 
1644  Index k=0;
1645  for(; k<endk4; k+=4*spk)
1646  {
1647  SLhsPacket A0,A1;
1648  SRhsPacket B_0,B_1;
1649 
1650  straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
1651  straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
1652 
1653  straits.loadRhsQuad(blA+0*spk, B_0);
1654  straits.loadRhsQuad(blA+1*spk, B_1);
1655  straits.madd(A0,B_0,C0,B_0);
1656  straits.madd(A1,B_1,C1,B_1);
1657 
1658  straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
1659  straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
1660  straits.loadRhsQuad(blA+2*spk, B_0);
1661  straits.loadRhsQuad(blA+3*spk, B_1);
1662  straits.madd(A0,B_0,C2,B_0);
1663  straits.madd(A1,B_1,C3,B_1);
1664 
1665  blB += 4*SwappedTraits::LhsProgress;
1666  blA += 4*spk;
1667  }
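        // Collapse the four partial accumulators of the unrolled loop into C0,
        // then finish the remaining depth with this single accumulator.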
1668  C0 = padd(padd(C0,C1),padd(C2,C3));
1669  for(; k<endk; k+=spk)
1670  {
1671  SLhsPacket A0;
1672  SRhsPacket B_0;
1673 
1674  straits.loadLhsUnaligned(blB, A0);
1675  straits.loadRhsQuad(blA, B_0);
1676  straits.madd(A0,B_0,C0,B_0);
1677 
1678  blB += SwappedTraits::LhsProgress;
1679  blA += spk;
1680  }
1681  if(SwappedTraits::LhsProgress==8)
1682  {
1683  // Special case where we have to first reduce the accumulation register C0
1684  typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
1685  typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
1686  typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1687  typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
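        // With 8-wide packets only four result values are produced here (one per
        // rhs column), so C0 is first reduced to a half-size packet: predux4 adds
        // the two halves of the 8-lane accumulator, leaving one partial sum per
        // column, which is then scaled by alpha and scattered back into res.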
1688 
1689  SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
1690  SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
1691 
1692  if(depth-endk>0)
1693  {
1694  // We have to handle the last row of the rhs which corresponds to a half-packet
1695  SLhsPacketHalf a0;
1696  SRhsPacketHalf b0;
1697  straits.loadLhsUnaligned(blB, a0);
1698  straits.loadRhs(blA, b0);
1699  SAccPacketHalf c0 = predux4(C0);
1700  straits.madd(a0,b0,c0,b0);
1701  straits.acc(c0, alphav, R);
1702  }
1703  else
1704  {
1705  straits.acc(predux4(C0), alphav, R);
1706  }
1707  res.scatterPacket(i, j2, R);
1708  }
1709  else
1710  {
1711  SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
1712  SResPacket alphav = pset1<SResPacket>(alpha);
1713  straits.acc(C0, alphav, R);
1714  res.scatterPacket(i, j2, R);
1715  }
1716  }
1717  else // scalar path
1718  {
1719  // get a 1 x 4 res block as registers
1720  ResScalar C0(0), C1(0), C2(0), C3(0);
1721 
1722  for(Index k=0; k<depth; k++)
1723  {
1724  LhsScalar A0;
1725  RhsScalar B_0, B_1;
1726 
1727  A0 = blA[k];
1728 
1729  B_0 = blB[0];
1730  B_1 = blB[1];
1731  CJMADD(cj,A0,B_0,C0, B_0);
1732  CJMADD(cj,A0,B_1,C1, B_1);
1733 
1734  B_0 = blB[2];
1735  B_1 = blB[3];
1736  CJMADD(cj,A0,B_0,C2, B_0);
1737  CJMADD(cj,A0,B_1,C3, B_1);
1738 
1739  blB += 4;
1740  }
1741  res(i, j2 + 0) += alpha * C0;
1742  res(i, j2 + 1) += alpha * C1;
1743  res(i, j2 + 2) += alpha * C2;
1744  res(i, j2 + 3) += alpha * C3;
1745  }
1746  }
1747  }
1748  // remaining columns
1749  for(Index j2=packet_cols4; j2<cols; j2++)
1750  {
1751  // loop on each row of the lhs (1*LhsProgress x depth)
1752  for(Index i=peeled_mc1; i<rows; i+=1)
1753  {
1754  const LhsScalar* blA = &blockA[i*strideA+offsetA];
1755  prefetch(&blA[0]);
1756  // gets a 1 x 1 res block as registers
1757  ResScalar C0(0);
1758  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1759  for(Index k=0; k<depth; k++)
1760  {
1761  LhsScalar A0 = blA[k];
1762  RhsScalar B_0 = blB[k];
1763  CJMADD(cj, A0, B_0, C0, B_0);
1764  }
1765  res(i, j2) += alpha * C0;
1766  }
1767  }
1768  }
1769  }
1770 
1771 
1772 #undef CJMADD
1773 
1774 // pack a block of the lhs
1775 // The traversal is as follows (mr==4):
1776 // 0 4 8 12 ...
1777 // 1 5 9 13 ...
1778 // 2 6 10 14 ...
1779 // 3 7 11 15 ...
1780 //
1781 // 16 20 24 28 ...
1782 // 17 21 25 29 ...
1783 // 18 22 26 30 ...
1784 // 19 23 27 31 ...
1785 //
1786 // 32 33 34 35 ...
1787 // 36 37 38 39 ...
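// In other words, each panel of mr rows is stored column-interleaved: for the
// first panel, element (row r, depth k) lands at position mr*k + r of blockA
// (e.g. 4*k + r in the mr==4 picture above). Leftover rows that do not fill a
// panel are appended one row at a time, contiguous along the depth dimension.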
1788 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1789 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
1790 {
1791  typedef typename DataMapper::LinearMapper LinearMapper;
1792  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
1793 };
1794 
1795 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1796 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
1797  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1798 {
1799  typedef typename packet_traits<Scalar>::type Packet;
1800  enum { PacketSize = packet_traits<Scalar>::size };
1801 
1802  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1803  EIGEN_UNUSED_VARIABLE(stride);
1804  EIGEN_UNUSED_VARIABLE(offset);
1805  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1806  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
1807  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1808  Index count = 0;
1809 
1810  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1811  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1812  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1813  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
1814  : Pack2>1 ? (rows/Pack2)*Pack2 : 0;
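  // peeled_mc3/peeled_mc2/peeled_mc1 are the row indices up to which rows can be
  // packed three, two, or one full packet(s) at a time; peeled_mc0 extends this
  // to the Pack2-wide remainder handled by the scalar loop further below.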
1815 
1816  Index i=0;
1817 
1818  // Pack 3 packets
1819  if(Pack1>=3*PacketSize)
1820  {
1821  for(; i<peeled_mc3; i+=3*PacketSize)
1822  {
1823  if(PanelMode) count += (3*PacketSize) * offset;
1824 
1825  for(Index k=0; k<depth; k++)
1826  {
1827  Packet A, B, C;
1828  A = lhs.loadPacket(i+0*PacketSize, k);
1829  B = lhs.loadPacket(i+1*PacketSize, k);
1830  C = lhs.loadPacket(i+2*PacketSize, k);
1831  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1832  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
1833  pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
1834  }
1835  if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
1836  }
1837  }
1838  // Pack 2 packets
1839  if(Pack1>=2*PacketSize)
1840  {
1841  for(; i<peeled_mc2; i+=2*PacketSize)
1842  {
1843  if(PanelMode) count += (2*PacketSize) * offset;
1844 
1845  for(Index k=0; k<depth; k++)
1846  {
1847  Packet A, B;
1848  A = lhs.loadPacket(i+0*PacketSize, k);
1849  B = lhs.loadPacket(i+1*PacketSize, k);
1850  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1851  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
1852  }
1853  if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
1854  }
1855  }
1856  // Pack 1 packet
1857  if(Pack1>=1*PacketSize)
1858  {
1859  for(; i<peeled_mc1; i+=1*PacketSize)
1860  {
1861  if(PanelMode) count += (1*PacketSize) * offset;
1862 
1863  for(Index k=0; k<depth; k++)
1864  {
1865  Packet A;
1866  A = lhs.loadPacket(i+0*PacketSize, k);
1867  pstore(blockA+count, cj.pconj(A));
1868  count+=PacketSize;
1869  }
1870  if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
1871  }
1872  }
1873  // Pack scalars
1874  if(Pack2<PacketSize && Pack2>1)
1875  {
1876  for(; i<peeled_mc0; i+=Pack2)
1877  {
1878  if(PanelMode) count += Pack2 * offset;
1879 
1880  for(Index k=0; k<depth; k++)
1881  for(Index w=0; w<Pack2; w++)
1882  blockA[count++] = cj(lhs(i+w, k));
1883 
1884  if(PanelMode) count += Pack2 * (stride-offset-depth);
1885  }
1886  }
1887  for(; i<rows; i++)
1888  {
1889  if(PanelMode) count += offset;
1890  for(Index k=0; k<depth; k++)
1891  blockA[count++] = cj(lhs(i, k));
1892  if(PanelMode) count += (stride-offset-depth);
1893  }
1894 }
1895 
1896 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1897 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
1898 {
1899  typedef typename DataMapper::LinearMapper LinearMapper;
1900  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
1901 };
1902 
1903 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1904 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
1905  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1906 {
1907  typedef typename packet_traits<Scalar>::type Packet;
1908  enum { PacketSize = packet_traits<Scalar>::size };
1909 
1910  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1911  EIGEN_UNUSED_VARIABLE(stride);
1912  EIGEN_UNUSED_VARIABLE(offset);
1913  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1914  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1915  Index count = 0;
1916 
1917 // const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1918 // const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1919 // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1920 
1921  int pack = Pack1;
1922  Index i = 0;
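  // Pack groups of `pack` rows, starting from the register-level width Pack1 and
  // falling back to smaller widths (down to Pack2) once fewer than `pack` rows
  // remain; rows left over at the end are packed one at a time below.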
1923  while(pack>0)
1924  {
1925  Index remaining_rows = rows-i;
1926  Index peeled_mc = i+(remaining_rows/pack)*pack;
1927  for(; i<peeled_mc; i+=pack)
1928  {
1929  if(PanelMode) count += pack * offset;
1930 
1931  const Index peeled_k = (depth/PacketSize)*PacketSize;
1932  Index k=0;
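      // For full-width groups, load PacketSize x PacketSize tiles of the row-major
      // lhs, transpose them in registers and store them so that, within the group,
      // the `pack` row entries for each depth index k are contiguous in blockA
      // (the same layout the column-major path above produces directly).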
1933  if(pack>=PacketSize)
1934  {
1935  for(; k<peeled_k; k+=PacketSize)
1936  {
1937  for (Index m = 0; m < pack; m += PacketSize)
1938  {
1939  PacketBlock<Packet> kernel;
1940  for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
1941  ptranspose(kernel);
1942  for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
1943  }
1944  count += PacketSize*pack;
1945  }
1946  }
1947  for(; k<depth; k++)
1948  {
1949  Index w=0;
1950  for(; w<pack-3; w+=4)
1951  {
1952  Scalar a(cj(lhs(i+w+0, k))),
1953  b(cj(lhs(i+w+1, k))),
1954  c(cj(lhs(i+w+2, k))),
1955  d(cj(lhs(i+w+3, k)));
1956  blockA[count++] = a;
1957  blockA[count++] = b;
1958  blockA[count++] = c;
1959  blockA[count++] = d;
1960  }
1961  if(pack%4)
1962  for(;w<pack;++w)
1963  blockA[count++] = cj(lhs(i+w, k));
1964  }
1965 
1966  if(PanelMode) count += pack * (stride-offset-depth);
1967  }
1968 
1969  pack -= PacketSize;
1970  if(pack<Pack2 && (pack+PacketSize)!=Pack2)
1971  pack = Pack2;
1972  }
1973 
1974  for(; i<rows; i++)
1975  {
1976  if(PanelMode) count += offset;
1977  for(Index k=0; k<depth; k++)
1978  blockA[count++] = cj(lhs(i, k));
1979  if(PanelMode) count += (stride-offset-depth);
1980  }
1981 }
1982 
1983 // copy a complete panel of the rhs
1984 // this version is optimized for column major matrices
1985 // The traversal order is as follows (nr==4):
1986 // 0 1 2 3 12 13 14 15 24 27
1987 // 4 5 6 7 16 17 18 19 25 28
1988 // 8 9 10 11 20 21 22 23 26 29
1989 // . . . . . . . . . .
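// Within each group of four columns the packing is interleaved by row: for the
// first group, element (row k, column c) lands at position 4*k + c of blockB.
// Columns that do not fill a group of four (columns 8 and 9 above) are appended
// one column at a time, contiguous along the depth dimension.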
1990 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
1991 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
1992 {
1993  typedef typename packet_traits<Scalar>::type Packet;
1994  typedef typename DataMapper::LinearMapper LinearMapper;
1995  enum { PacketSize = packet_traits<Scalar>::size };
1996  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
1997 };
1998 
1999 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2000 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2001  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2002 {
2003  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
2004  EIGEN_UNUSED_VARIABLE(stride);
2005  EIGEN_UNUSED_VARIABLE(offset);
2006  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2007  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2008  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2009  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2010  Index count = 0;
2011  const Index peeled_k = (depth/PacketSize)*PacketSize;
2012 // if(nr>=8)
2013 // {
2014 // for(Index j2=0; j2<packet_cols8; j2+=8)
2015 // {
2016 // // skip what we have before
2017 // if(PanelMode) count += 8 * offset;
2018 // const Scalar* b0 = &rhs[(j2+0)*rhsStride];
2019 // const Scalar* b1 = &rhs[(j2+1)*rhsStride];
2020 // const Scalar* b2 = &rhs[(j2+2)*rhsStride];
2021 // const Scalar* b3 = &rhs[(j2+3)*rhsStride];
2022 // const Scalar* b4 = &rhs[(j2+4)*rhsStride];
2023 // const Scalar* b5 = &rhs[(j2+5)*rhsStride];
2024 // const Scalar* b6 = &rhs[(j2+6)*rhsStride];
2025 // const Scalar* b7 = &rhs[(j2+7)*rhsStride];
2026 // Index k=0;
2027 // if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
2028 // {
2029 // for(; k<peeled_k; k+=PacketSize) {
2030 // PacketBlock<Packet> kernel;
2031 // for (int p = 0; p < PacketSize; ++p) {
2032 // kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
2033 // }
2034 // ptranspose(kernel);
2035 // for (int p = 0; p < PacketSize; ++p) {
2036 // pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
2037 // count+=PacketSize;
2038 // }
2039 // }
2040 // }
2041 // for(; k<depth; k++)
2042 // {
2043 // blockB[count+0] = cj(b0[k]);
2044 // blockB[count+1] = cj(b1[k]);
2045 // blockB[count+2] = cj(b2[k]);
2046 // blockB[count+3] = cj(b3[k]);
2047 // blockB[count+4] = cj(b4[k]);
2048 // blockB[count+5] = cj(b5[k]);
2049 // blockB[count+6] = cj(b6[k]);
2050 // blockB[count+7] = cj(b7[k]);
2051 // count += 8;
2052 // }
2053 // // skip what we have after
2054 // if(PanelMode) count += 8 * (stride-offset-depth);
2055 // }
2056 // }
2057 
2058  if(nr>=4)
2059  {
2060  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2061  {
2062  // skip what we have before
2063  if(PanelMode) count += 4 * offset;
2064  const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
2065  const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
2066  const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
2067  const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
2068 
2069  Index k=0;
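      // When PacketSize is a multiple of 4, load one packet per column, transpose
      // the 4 x PacketSize block in registers and store it so that the packed
      // output matches the interleaved-by-k layout of the scalar fallback below.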
2070  if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
2071  {
2072  for(; k<peeled_k; k+=PacketSize) {
2073  PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
2074  kernel.packet[0] = dm0.loadPacket(k);
2075  kernel.packet[1%PacketSize] = dm1.loadPacket(k);
2076  kernel.packet[2%PacketSize] = dm2.loadPacket(k);
2077  kernel.packet[3%PacketSize] = dm3.loadPacket(k);
2078  ptranspose(kernel);
2079  pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
2080  pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
2081  pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
2082  pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
2083  count+=4*PacketSize;
2084  }
2085  }
2086  for(; k<depth; k++)
2087  {
2088  blockB[count+0] = cj(dm0(k));
2089  blockB[count+1] = cj(dm1(k));
2090  blockB[count+2] = cj(dm2(k));
2091  blockB[count+3] = cj(dm3(k));
2092  count += 4;
2093  }
2094  // skip what we have after
2095  if(PanelMode) count += 4 * (stride-offset-depth);
2096  }
2097  }
2098 
2099  // copy the remaining columns one at a time (nr==1)
2100  for(Index j2=packet_cols4; j2<cols; ++j2)
2101  {
2102  if(PanelMode) count += offset;
2103  const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
2104  for(Index k=0; k<depth; k++)
2105  {
2106  blockB[count] = cj(dm0(k));
2107  count += 1;
2108  }
2109  if(PanelMode) count += (stride-offset-depth);
2110  }
2111 }
2112 
2113 // this version is optimized for row major matrices
2114 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2115 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2116 {
2117  typedef typename packet_traits<Scalar>::type Packet;
2118  typedef typename DataMapper::LinearMapper LinearMapper;
2119  enum { PacketSize = packet_traits<Scalar>::size };
2120  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2121 };
2122 
2123 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2124 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2125  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2126 {
2127  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
2128  EIGEN_UNUSED_VARIABLE(stride);
2129  EIGEN_UNUSED_VARIABLE(offset);
2130  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2131  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2132  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2133  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2134  Index count = 0;
2135 
2136 // if(nr>=8)
2137 // {
2138 // for(Index j2=0; j2<packet_cols8; j2+=8)
2139 // {
2140 // // skip what we have before
2141 // if(PanelMode) count += 8 * offset;
2142 // for(Index k=0; k<depth; k++)
2143 // {
2144 // if (PacketSize==8) {
2145 // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2146 // pstoreu(blockB+count, cj.pconj(A));
2147 // } else if (PacketSize==4) {
2148 // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2149 // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
2150 // pstoreu(blockB+count, cj.pconj(A));
2151 // pstoreu(blockB+count+PacketSize, cj.pconj(B));
2152 // } else {
2153 // const Scalar* b0 = &rhs[k*rhsStride + j2];
2154 // blockB[count+0] = cj(b0[0]);
2155 // blockB[count+1] = cj(b0[1]);
2156 // blockB[count+2] = cj(b0[2]);
2157 // blockB[count+3] = cj(b0[3]);
2158 // blockB[count+4] = cj(b0[4]);
2159 // blockB[count+5] = cj(b0[5]);
2160 // blockB[count+6] = cj(b0[6]);
2161 // blockB[count+7] = cj(b0[7]);
2162 // }
2163 // count += 8;
2164 // }
2165 // // skip what we have after
2166 // if(PanelMode) count += 8 * (stride-offset-depth);
2167 // }
2168 // }
2169  if(nr>=4)
2170  {
2171  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2172  {
2173  // skip what we have before
2174  if(PanelMode) count += 4 * offset;
2175  for(Index k=0; k<depth; k++)
2176  {
2177  if (PacketSize==4) {
2178  Packet A = rhs.loadPacket(k, j2);
2179  pstoreu(blockB+count, cj.pconj(A));
2180  count += PacketSize;
2181  } else {
2182  const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
2183  blockB[count+0] = cj(dm0(0));
2184  blockB[count+1] = cj(dm0(1));
2185  blockB[count+2] = cj(dm0(2));
2186  blockB[count+3] = cj(dm0(3));
2187  count += 4;
2188  }
2189  }
2190  // skip what we have after
2191  if(PanelMode) count += 4 * (stride-offset-depth);
2192  }
2193  }
2194  // copy the remaining columns one at a time (nr==1)
2195  for(Index j2=packet_cols4; j2<cols; ++j2)
2196  {
2197  if(PanelMode) count += offset;
2198  for(Index k=0; k<depth; k++)
2199  {
2200  blockB[count] = cj(rhs(k, j2));
2201  count += 1;
2202  }
2203  if(PanelMode) count += stride-offset-depth;
2204  }
2205 }
2206 
2207 } // end namespace internal
2208 
2211 inline std::ptrdiff_t l1CacheSize()
2212 {
2213  std::ptrdiff_t l1, l2, l3;
2214  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2215  return l1;
2216 }
2217 
2220 inline std::ptrdiff_t l2CacheSize()
2221 {
2222  std::ptrdiff_t l1, l2, l3;
2223  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2224  return l2;
2225 }
2226 
2232 inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
2233 {
2234  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
2235 }
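// A minimal usage sketch (the values below are illustrative only): query the
// cache sizes currently used for blocking, or override them, e.g. on a machine
// whose cache topology is not detected correctly.
//
//   std::ptrdiff_t l1 = Eigen::l1CacheSize();   // L1 data cache size, in bytes
//   std::ptrdiff_t l2 = Eigen::l2CacheSize();   // L2 cache size, in bytes
//   Eigen::setCpuCacheSizes(64*1024, 512*1024, 8*1024*1024); // pretend 64KB/512KB/8MB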
2236 
2237 } // end namespace Eigen
2238 
2239 #endif // EIGEN_GENERAL_BLOCK_PANEL_H