TensorReduction.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

namespace Eigen {

namespace internal {
template<typename Op, typename Dims, typename XprType>
struct traits<TensorReductionOp<Op, Dims, XprType> >
  : traits<XprType>
{
  typedef typename traits<XprType>::Scalar Scalar;
  typedef typename internal::packet_traits<Scalar>::type Packet;
  typedef typename traits<XprType>::StorageKind StorageKind;
  typedef typename traits<XprType>::Index Index;
  typedef typename XprType::Nested Nested;
};

template<typename Op, typename Dims, typename XprType>
struct eval<TensorReductionOp<Op, Dims, XprType>, Eigen::Dense>
{
  typedef const TensorReductionOp<Op, Dims, XprType>& type;
};

template<typename Op, typename Dims, typename XprType>
struct nested<TensorReductionOp<Op, Dims, XprType>, 1, typename eval<TensorReductionOp<Op, Dims, XprType> >::type>
{
  typedef TensorReductionOp<Op, Dims, XprType> type;
};

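// Splits the dimensions of the input tensor into the dimensions that are kept
// in the output and the dimensions that are reduced away, based on the
// `reduced` bitmap. The Sizes<> specialization below handles full reductions,
// where every input dimension is reduced and the output is 0-dimensional.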
template <typename OutputDims> struct DimInitializer {
  template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims,
                  const array<bool, internal::array_size<InputDims>::value>& reduced,
                  OutputDims* output_dims, ReducedDims* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    int outputIndex = 0;
    int reduceIndex = 0;
    for (int i = 0; i < NumInputDims; ++i) {
      if (reduced[i]) {
        (*reduced_dims)[reduceIndex] = input_dims[i];
        ++reduceIndex;
      } else {
        (*output_dims)[outputIndex] = input_dims[i];
        ++outputIndex;
      }
    }
  }
};

template <> struct DimInitializer<Sizes<> > {
  template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims, const array<bool, Rank>&,
                  Sizes<>*, array<Index, Rank>* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    for (int i = 0; i < NumInputDims; ++i) {
      (*reduced_dims)[i] = input_dims[i];
    }
  }
};

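// Compile-time detection of whether the reduced dimensions cover exactly the
// inner-most dimensions of the tensor (are_inner_most_dims), or leave the
// inner-most dimension untouched (preserve_inner_most_dims). Both default to
// false and are only refined when constexpr and variadic templates are
// available, since the checks rely on static index introspection.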
template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims {
  static const bool value = false;
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims {
  static const bool value = false;
};

#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES)
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
  static const bool value = tmp1 & tmp2;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2;
};
#endif

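// Recursively reduces all coefficients needed for one output value by walking
// the reduced dimensions from the outermost (DimIndex) to the innermost (0).
// The DimIndex == 0 specialization feeds the individual input coefficients to
// the reducer; the DimIndex == -1 specialization handles the degenerate case
// where there is nothing to iterate over.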
template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reduce(self.m_impl.coeff(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<-1, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
    reducer.reduce(self.m_impl.coeff(index), accum);
  }
};

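// Reduces numValuesToReduce contiguous input coefficients starting at
// firstIndex. The vectorized specialization accumulates full packets first and
// then folds the remaining tail coefficients into a scalar accumulator,
// combining both with reducer.finalizeBoth().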
template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalize(accum);
  }
};

template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
    const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
    typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
    for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
    }
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalizeBoth(accum, p);
  }
};

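// Packet-wise counterpart of GenericDimReducer, used when the inner-most
// dimension is preserved: packets loaded along the preserved dimension are
// reduced across the reduced dimensions. The non-vectorizable and
// DimIndex == -1 specializations only exist to satisfy the compiler and must
// never be called.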
template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimPreserver {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};

template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<-1, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

// Default full reducer
template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducer {
  static const bool HasOptimizedImplementation = false;

  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
    *output = InnerMostDimReducer<Self, Op>::reduce(self, 0, num_coeffs, reducer);
  }
};


#ifdef EIGEN_USE_THREADS
// Multithreaded full reducers
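// Each FullReducerShard reduces a contiguous block of input coefficients into
// a per-shard accumulator on a worker thread; the ThreadPoolDevice FullReducer
// below merges the shard accumulators (and the tail block it handles itself)
// into the final result on the calling thread.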
template <typename Eval, typename Op, bool Vectorizable = (Eval::InputPacketAccess & Op::PacketAccess)>
struct FullReducerShard {
  static void run(const Eval& eval, typename Eval::Index firstIndex, typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) {
    shard->saccum = reducer.initialize();
    for (typename Eval::Index j = 0; j < numValuesToReduce; ++j) {
      reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum);
    }
  }

  typename Eval::CoeffReturnType saccum;
};

template <typename Eval, typename Op>
struct FullReducerShard<Eval, Op, true> {
  static void run(const Eval& eval, typename Eval::Index firstIndex, typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) {
    const int packetSize = internal::unpacket_traits<typename Eval::PacketReturnType>::size;
    const typename Eval::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;

    shard->paccum = reducer.template initializePacket<typename Eval::PacketReturnType>();
    for (typename Eval::Index j = 0; j < VectorizedSize; j += packetSize) {
      reducer.reducePacket(eval.m_impl.template packet<Unaligned>(firstIndex + j), &shard->paccum);
    }
    shard->saccum = reducer.initialize();
    for (typename Eval::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
      reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum);
    }
  }

  typename Eval::PacketReturnType paccum;
  typename Eval::CoeffReturnType saccum;
};


template <typename Self, typename Op>
struct FullReducer<Self, Op, ThreadPoolDevice, false> {
  static const bool HasOptimizedImplementation = !Op::IsStateful;

  // launch one reducer per thread and accumulate the result.
  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs)/device.numThreads());
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    eigen_assert(num_coeffs >= numblocks * blocksize);

    std::vector<Notification*> results;
    results.reserve(numblocks);
    std::vector<FullReducerShard<Self, Op, false> > shards;
    shards.resize(numblocks);
    for (Index i = 0; i < numblocks; ++i) {
      results.push_back(device.enqueue(&FullReducerShard<Self, Op, false>::run, self, i*blocksize, blocksize, reducer, &shards[i]));
    }

    FullReducerShard<Self, Op, false> finalShard;
    if (numblocks * blocksize < num_coeffs) {
      FullReducerShard<Self, Op, false>::run(self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard);
    } else {
      finalShard.saccum = reducer.initialize();
    }

    for (Index i = 0; i < numblocks; ++i) {
      wait_until_ready(results[i]);
      delete results[i];
    }

    for (Index i = 0; i < numblocks; ++i) {
      reducer.reduce(shards[i].saccum, &finalShard.saccum);
    }
    *output = reducer.finalize(finalShard.saccum);
  }
};

template <typename Self, typename Op>
struct FullReducer<Self, Op, ThreadPoolDevice, true> {
  static const bool HasOptimizedImplementation = !Op::IsStateful;

  // launch one reducer per thread and accumulate the result.
  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs)/device.numThreads());
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    eigen_assert(num_coeffs >= numblocks * blocksize);

    std::vector<Notification*> results;
    results.reserve(numblocks);
    std::vector<FullReducerShard<Self, Op, true> > shards;
    shards.resize(numblocks);
    for (Index i = 0; i < numblocks; ++i) {
      results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run, self, i*blocksize, blocksize, reducer, &shards[i]));
    }

    FullReducerShard<Self, Op, true> finalShard;
    if (numblocks * blocksize < num_coeffs) {
      FullReducerShard<Self, Op, true>::run(self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard);
    } else {
      finalShard.paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
      finalShard.saccum = reducer.initialize();
    }

    for (Index i = 0; i < numblocks; ++i) {
      wait_until_ready(results[i]);
      delete results[i];
    }

    for (Index i = 0; i < numblocks; ++i) {
      reducer.reducePacket(shards[i].paccum, &finalShard.paccum);
      reducer.reduce(shards[i].saccum, &finalShard.saccum);
    }

    *output = reducer.finalizeBoth(finalShard.saccum, finalShard.paccum);
  }
};
#endif


#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
#endif

} // end namespace internal


template <typename Op, typename Dims, typename XprType>
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType>, ReadOnlyAccessors> {
  public:
    typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
    typedef typename Eigen::internal::traits<TensorReductionOp>::Packet Packet;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
    typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
    typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
    { }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
    { }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const XprType& expression() const { return m_expr; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Dims& dims() const { return m_dims; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Op& reducer() const { return m_reducer; }

  protected:
    typename XprType::Nested m_expr;
    const Dims m_dims;
    const Op m_reducer;
};
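
// Usage sketch (not part of this header): a TensorReductionOp is normally not
// constructed directly but obtained through the reduction methods of
// TensorBase, e.g. sum(), maximum() or mean() with a set of dimensions to
// reduce over. Roughly:
//
//   Eigen::Tensor<float, 3> t(2, 3, 4);
//   t.setRandom();
//   Eigen::array<int, 1> dims{{1}};
//   // Reduces dimension 1; the result is a 2x4 tensor.
//   Eigen::Tensor<float, 2> s = t.sum(dims);
//
// Assigning the expression triggers the TensorEvaluator defined below.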


// Eval as rvalue
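// The evaluator precomputes, from the expression's dimensions and the set of
// reduced dimensions, the output dimensions, the output strides, and the
// strides of the preserved and reduced input dimensions. Depending on whether
// the reduced dimensions are the inner-most ones, preserve the inner-most
// dimension, or cover the whole tensor, coeff() and packet() pick the most
// efficient reduction helper defined above.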
template<typename Op, typename Dims, typename ArgType, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
{
  typedef TensorReductionOp<Op, Dims, ArgType> XprType;
  typedef typename XprType::Index Index;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  static const int NumInputDims = internal::array_size<InputDimensions>::value;
  static const int NumReducedDims = internal::array_size<Dims>::value;
  static const int NumOutputDims = NumInputDims - NumReducedDims;
  typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self;
  static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;

  enum {
    IsAligned = false,
    PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
  };

  static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool RunningFullReduction = (NumOutputDims==0);

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
  {
    EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE);
    EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    // Bitmap indicating if an input dimension is reduced or not.
    array<bool, NumInputDims> reduced;
    for (int i = 0; i < NumInputDims; ++i) {
      reduced[i] = false;
    }
    for (int i = 0; i < NumReducedDims; ++i) {
      eigen_assert(op.dims()[i] >= 0);
      eigen_assert(op.dims()[i] < NumInputDims);
      reduced[op.dims()[i]] = true;
    }

    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    internal::DimInitializer<Dimensions>::run(input_dims, reduced, &m_dimensions, &m_reducedDims);

    // Precompute output strides.
    if (NumOutputDims > 0) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        m_outputStrides[0] = 1;
        for (int i = 1; i < NumOutputDims; ++i) {
          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
        }
      } else {
        m_outputStrides[NumOutputDims - 1] = 1;
        for (int i = NumOutputDims - 2; i >= 0; --i) {
          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
        }
      }
    }

    // Precompute input strides.
    if (NumInputDims > 0) {
      array<Index, NumInputDims> input_strides;
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        input_strides[0] = 1;
        for (int i = 1; i < NumInputDims; ++i) {
          input_strides[i] = input_strides[i-1] * input_dims[i-1];
        }
      } else {
        input_strides[NumInputDims - 1] = 1;
        for (int i = NumInputDims - 2; i >= 0; --i) {
          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
        }
      }

      int outputIndex = 0;
      int reduceIndex = 0;
      for (int i = 0; i < NumInputDims; ++i) {
        if (reduced[i]) {
          m_reducedStrides[reduceIndex] = input_strides[i];
          ++reduceIndex;
        } else {
          m_preservedStrides[outputIndex] = input_strides[i];
          ++outputIndex;
        }
      }
    }

    // Special case for full reductions
    if (NumOutputDims == 0) {
      m_preservedStrides[0] = internal::array_prod(input_dims);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;

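  // For full reductions with an optimized FullReducer, evaluate eagerly:
  // either into the caller-provided buffer (return false, nothing left to
  // assign) or into an internally allocated m_result that coeff() returns
  // later (return true). On GPU this requires compute capability 3.0+; on
  // other devices it is only used for large inputs.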
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
    m_impl.evalSubExprsIfNeeded(NULL);

    // Use the FullReducer if possible.
    if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
        ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
         (internal::array_prod(m_impl.dimensions()) > 1024 * 1024))) {

      bool need_assign = false;
      if (!data) {
        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
        data = m_result;
        need_assign = true;
      }

      Op reducer(m_reducer);
      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
      return need_assign;
    }
    return true;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
    if (m_result) {
      m_device.deallocate(m_result);
    }
  }

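  // Returns the reduced value at output index "index". For a precomputed full
  // reduction the cached result is returned directly; when the reduced
  // dimensions are the inner-most ones the values to reduce are contiguous and
  // InnerMostDimReducer is used; otherwise the generic recursive reducer walks
  // the reduced dimensions.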
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if (RunningFullReduction && m_result) {
      return *m_result;
    }
    Op reducer(m_reducer);
    if (ReducingInnerMostDims || RunningFullReduction) {
      const Index num_values_to_reduce =
          (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
                                                             num_values_to_reduce, reducer);
    } else {
      typename Self::CoeffReturnType accum = reducer.initialize();
      internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
      return reducer.finalize(accum);
    }
  }

  // TODO(bsteiner): provide a more efficient implementation.
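  // Computes packetSize consecutive outputs. When the reduced dimensions are
  // the inner-most ones, each output is reduced independently; when the
  // inner-most dimension is preserved and the packet does not straddle its
  // boundary, the packet is reduced directly with InnerMostDimPreserver;
  // otherwise the outputs are gathered one coeff() at a time.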
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + packetSize - 1 < dimensions().TotalSize());

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
    if (ReducingInnerMostDims) {
      const Index num_values_to_reduce =
          (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      const Index firstIndex = firstInput(index);
      for (Index i = 0; i < packetSize; ++i) {
        Op reducer(m_reducer);
        values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
                                                                    num_values_to_reduce, reducer);
      }
    } else if (PreservingInnerMostDims) {
      const Index firstIndex = firstInput(index);
      const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
      // TBD: extend this to the n innermost dimensions that we preserve.
      if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) {
        Op reducer(m_reducer);
        typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
        internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
        return reducer.finalizePacket(accum);
      } else {
        for (int i = 0; i < packetSize; ++i) {
          values[i] = coeff(index + i);
        }
      }
    } else {
      for (int i = 0; i < packetSize; ++i) {
        values[i] = coeff(index + i);
      }
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 private:
  template <int, typename, typename> friend struct internal::GenericDimReducer;
  template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
  template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
  template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
#ifdef EIGEN_USE_THREADS
  template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
#endif

  // Returns the Index in the input tensor of the first value that needs to be
  // used to compute the reduction at output index "index".
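  // For example, in column-major layout the output index is decomposed into
  // per-dimension coordinates using m_outputStrides, each coordinate is
  // rescaled by the corresponding preserved input stride, and the partial
  // products are summed to obtain the position of the first input coefficient
  // contributing to that output.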
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    if (ReducingInnerMostDims) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        return index * m_preservedStrides[0];
      } else {
        return index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    // TBD: optimize the case where we preserve the innermost dimensions.
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumOutputDims - 1; i > 0; --i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[0] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[0];
      }
    } else {
      for (int i = 0; i < NumOutputDims - 1; ++i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    return startInput;
  }

  // Dimensions of the output of the operation.
  Dimensions m_dimensions;
  // Precomputed strides for the output tensor.
  array<Index, NumOutputDims> m_outputStrides;
  // Subset of strides of the input tensor for the non-reduced dimensions.
  // Indexed by output dimensions.
  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
  array<Index, NumPreservedStrides> m_preservedStrides;

  // Subset of strides of the input tensor for the reduced dimensions.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedStrides;
  // Size of the input dimensions that are reduced.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedDims;

  // Evaluator for the input expression.
  TensorEvaluator<ArgType, Device> m_impl;

  // Operation to apply for computing the reduction.
  Op m_reducer;

  // For full reductions
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
#else
  static const bool RunningOnGPU = false;
#endif
  CoeffReturnType* m_result;

  const Device& m_device;
};

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H