Diffstat (limited to '2.3-1/thirdparty/includes/OpenCV/opencv2/gpu/device/detail/reduce.hpp')
-rw-r--r-- | 2.3-1/thirdparty/includes/OpenCV/opencv2/gpu/device/detail/reduce.hpp | 361
1 file changed, 361 insertions, 0 deletions
diff --git a/2.3-1/thirdparty/includes/OpenCV/opencv2/gpu/device/detail/reduce.hpp b/2.3-1/thirdparty/includes/OpenCV/opencv2/gpu/device/detail/reduce.hpp
new file mode 100644
index 00000000..091a160e
--- /dev/null
+++ b/2.3-1/thirdparty/includes/OpenCV/opencv2/gpu/device/detail/reduce.hpp
@@ -0,0 +1,361 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__
+#define __OPENCV_GPU_REDUCE_DETAIL_HPP__
+
+#include <thrust/tuple.h>
+#include "../warp.hpp"
+#include "../warp_shuffle.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    namespace reduce_detail
+    {
+        template <typename T> struct GetType;
+        template <typename T> struct GetType<T*>
+        {
+            typedef T type;
+        };
+        template <typename T> struct GetType<volatile T*>
+        {
+            typedef T type;
+        };
+        template <typename T> struct GetType<T&>
+        {
+            typedef T type;
+        };
+
+        template <unsigned int I, unsigned int N>
+        struct For
+        {
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
+            {
+                thrust::get<I>(smem)[tid] = thrust::get<I>(val);
+
+                For<I + 1, N>::loadToSmem(smem, val, tid);
+            }
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
+            {
+                thrust::get<I>(val) = thrust::get<I>(smem)[tid];
+
+                For<I + 1, N>::loadFromSmem(smem, val, tid);
+            }
+
+            template <class PointerTuple, class ValTuple, class OpTuple>
+            static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
+            {
+                typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
+                thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
+
+                For<I + 1, N>::merge(smem, val, tid, delta, op);
+            }
+            template <class ValTuple, class OpTuple>
+            static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
+            {
+                typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
+                thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
+
+                For<I + 1, N>::mergeShfl(val, delta, width, op);
+            }
+        };
+        template <unsigned int N>
+        struct For<N, N>
+        {
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
+            {
+            }
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
+            {
+            }
+
+            template <class PointerTuple, class ValTuple, class OpTuple>
+            static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
+            {
+            }
+            template <class ValTuple, class OpTuple>
+            static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
+            {
+            }
+        };
+
+        template <typename T>
+        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
+        {
+            smem[tid] = val;
+        }
+        template <typename T>
+        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
+        {
+            val = smem[tid];
+        }
+        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
+        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                                   const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                                   unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
+        }
+        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
+        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                                     const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                                     unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
+        }
+
+        template <typename T, class Op>
+        __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
+        {
+            T reg = smem[tid + delta];
+            smem[tid] = val = op(val, reg);
+        }
+        template <typename T, class Op>
+        __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
+        {
+            T reg = shfl_down(val, delta, width);
+            val = op(val, reg);
+        }
+        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
+                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
+        __device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                              const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                              unsigned int tid,
+                                              unsigned int delta,
+                                              const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
+        }
+        template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
+                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
+        __device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                                  unsigned int delta,
+                                                  unsigned int width,
+                                                  const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
+        }
+
+        template <unsigned int N> struct Generic
+        {
+            template <typename Pointer, typename Reference, class Op>
+            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+                loadToSmem(smem, val, tid);
+                if (N >= 32)
+                    __syncthreads();
+
+                if (N >= 2048)
+                {
+                    if (tid < 1024)
+                        merge(smem, val, tid, 1024, op);
+
+                    __syncthreads();
+                }
+                if (N >= 1024)
+                {
+                    if (tid < 512)
+                        merge(smem, val, tid, 512, op);
+
+                    __syncthreads();
+                }
+                if (N >= 512)
+                {
+                    if (tid < 256)
+                        merge(smem, val, tid, 256, op);
+
+                    __syncthreads();
+                }
+                if (N >= 256)
+                {
+                    if (tid < 128)
+                        merge(smem, val, tid, 128, op);
+
+                    __syncthreads();
+                }
+                if (N >= 128)
+                {
+                    if (tid < 64)
+                        merge(smem, val, tid, 64, op);
+
+                    __syncthreads();
+                }
+                if (N >= 64)
+                {
+                    if (tid < 32)
+                        merge(smem, val, tid, 32, op);
+                }
+
+                if (tid < 16)
+                {
+                    merge(smem, val, tid, 16, op);
+                    merge(smem, val, tid, 8, op);
+                    merge(smem, val, tid, 4, op);
+                    merge(smem, val, tid, 2, op);
+                    merge(smem, val, tid, 1, op);
+                }
+            }
+        };
+
+        template <unsigned int I, typename Pointer, typename Reference, class Op>
+        struct Unroll
+        {
+            static __device__ void loopShfl(Reference val, Op op, unsigned int N)
+            {
+                mergeShfl(val, I, N, op);
+                Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
+            }
+            static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+                merge(smem, val, tid, I, op);
+                Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
+            }
+        };
+        template <typename Pointer, typename Reference, class Op>
+        struct Unroll<0, Pointer, Reference, Op>
+        {
+            static __device__ void loopShfl(Reference, Op, unsigned int)
+            {
+            }
+            static __device__ void loop(Pointer, Reference, unsigned int, Op)
+            {
+            }
+        };
+
+        template <unsigned int N> struct WarpOptimized
+        {
+            template <typename Pointer, typename Reference, class Op>
+            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+            #if __CUDA_ARCH__ >= 300
+                (void) smem;
+                (void) tid;
+
+                Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
+            #else
+                loadToSmem(smem, val, tid);
+
+                if (tid < N / 2)
+                    Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
+            #endif
+            }
+        };
+
+        template <unsigned int N> struct GenericOptimized32
+        {
+            enum { M = N / 32 };
+
+            template <typename Pointer, typename Reference, class Op>
+            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+                const unsigned int laneId = Warp::laneId();
+
+            #if __CUDA_ARCH__ >= 300
+                Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);
+
+                if (laneId == 0)
+                    loadToSmem(smem, val, tid / 32);
+            #else
+                loadToSmem(smem, val, tid);
+
+                if (laneId < 16)
+                    Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);
+
+                __syncthreads();
+
+                if (laneId == 0)
+                    loadToSmem(smem, val, tid / 32);
+            #endif
+
+                __syncthreads();
+
+                loadFromSmem(smem, val, tid);
+
+                if (tid < 32)
+                {
+                #if __CUDA_ARCH__ >= 300
+                    Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
+                #else
+                    Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
+                #endif
+                }
+            }
+        };
+
+        template <bool val, class T1, class T2> struct StaticIf;
+        template <class T1, class T2> struct StaticIf<true, T1, T2>
+        {
+            typedef T1 type;
+        };
+        template <class T1, class T2> struct StaticIf<false, T1, T2>
+        {
+            typedef T2 type;
+        };
+
+        template <unsigned int N> struct IsPowerOf2
+        {
+            enum { value = ((N != 0) && !(N & (N - 1))) };
+        };
+
+        template <unsigned int N> struct Dispatcher
+        {
+            typedef typename StaticIf<
+                (N <= 32) && IsPowerOf2<N>::value,
+                WarpOptimized<N>,
+                typename StaticIf<
+                    (N <= 1024) && IsPowerOf2<N>::value,
+                    GenericOptimized32<N>,
+                    Generic<N>
+                >::type
+            >::type reductor;
+        };
+    }
+}}}
+
+#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__
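
For context, a minimal sketch of how the dispatcher in this patch is meant to be driven from a kernel. The kernel blockSum256, the Sum functor, and the fixed 256-thread block are illustrative assumptions, not part of this file; upstream OpenCV normally reaches this code through a public reduce<N>() wrapper rather than calling reduce_detail directly. Note the explicit template arguments: the reductors take val by a deduced Reference type, so Reference must be forced to float& for thread 0 to observe the reduced value (otherwise val would be copied).

#include "opencv2/gpu/device/detail/reduce.hpp" // include path as vendored in this tree

using namespace cv::gpu::device;

// Illustrative functor: an Op only needs a __device__ binary operator().
struct Sum
{
    __device__ float operator()(float a, float b) const { return a + b; }
};

// Each block sums 256 consecutive floats. 256 is a power of two <= 1024,
// so Dispatcher<256>::reductor resolves to GenericOptimized32<256>.
__global__ void blockSum256(const float* src, float* dst)
{
    __shared__ float sbuf[256];
    volatile float* smem = sbuf;

    const unsigned int tid = threadIdx.x;
    float val = src[blockIdx.x * 256 + tid];

    // Explicit template arguments keep val a reference inside the
    // reductor, so the result lands back in this local variable.
    reduce_detail::Dispatcher<256>::reductor::reduce<volatile float*, float&, const Sum&>(smem, val, tid, Sum());

    if (tid == 0)
        dst[blockIdx.x] = val; // thread 0 holds the block total
}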
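The thrust::tuple overloads and the For<I, N> recursion exist so several statistics can ride through one reduction tree. A sketch of a joint min/max over a 256-thread block, again with illustrative names (blockMinMax256, MinOp, MaxOp): the reductor must be instantiated with const-reference tuple types, because a tuple of references passed through thrust::tie only resolves to the tuple overloads of merge/mergeShfl when bound as a const reference.

// Illustrative functors for a joint min/max reduction.
struct MinOp { __device__ float operator()(float a, float b) const { return ::fminf(a, b); } };
struct MaxOp { __device__ float operator()(float a, float b) const { return ::fmaxf(a, b); } };

__global__ void blockMinMax256(const float* src, float* dstMin, float* dstMax)
{
    __shared__ float sminBuf[256];
    __shared__ float smaxBuf[256];
    volatile float* smin = sminBuf;
    volatile float* smax = smaxBuf;

    const unsigned int tid = threadIdx.x;
    float vmin = src[blockIdx.x * 256 + tid];
    float vmax = vmin;

    // One shared array per reduced value; For<0, 2> walks both tuple slots.
    typedef thrust::tuple<volatile float*, volatile float*> SmemTuple;
    typedef thrust::tuple<float&, float&> ValTuple;
    typedef thrust::tuple<MinOp, MaxOp> OpTuple;

    SmemTuple smem = thrust::make_tuple(smin, smax);
    ValTuple val = thrust::tie(vmin, vmax); // references, so results land back in vmin/vmax
    OpTuple op = thrust::make_tuple(MinOp(), MaxOp());

    reduce_detail::Dispatcher<256>::reductor::reduce<const SmemTuple&, const ValTuple&, const OpTuple&>(smem, val, tid, op);

    if (tid == 0)
    {
        dstMin[blockIdx.x] = vmin;
        dstMax[blockIdx.x] = vmax;
    }
}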
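Finally, the WarpOptimized path: for a power-of-two N <= 32 the Dispatcher selects it, and on compute capability 3.0+ the whole reduction compiles down to log2(N) shfl_down rounds with no shared memory traffic and no __syncthreads(). A sketch assuming a single warp per block (warpSum is illustrative and reuses the Sum functor from the first sketch; the fallback array below is touched only on pre-3.0 devices and would need per-warp slots if several warps called this concurrently).

// Warp-wide sum: N = 32, so Dispatcher<32>::reductor is WarpOptimized<32>.
__device__ float warpSum(float val)
{
    __shared__ float sbuf[32]; // used only by the pre-CC-3.0 smem fallback
    volatile float* smem = sbuf;

    reduce_detail::WarpOptimized<32>::reduce<volatile float*, float&, const Sum&>(
        smem, val, threadIdx.x, Sum());

    return val; // meaningful on lane 0
}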