/*M/////////////////////////////////////////////////////////////////////////////////////// // // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. // // By downloading, copying, installing or using the software you agree to this license. // If you do not agree to this license, do not download, install, // copy or use the software. // // // License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // // * Redistribution's of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. // // This software is provided by the copyright holders and contributors "as is" and // any express or implied warranties, including, but not limited to, the implied // warranties of merchantability and fitness for a particular purpose are disclaimed. 
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__

#include <thrust/tuple.h>

#include "../warp.hpp"
#include "../warp_shuffle.hpp"

namespace cv { namespace gpu { namespace device
{
    // Implementation details of the predicated key/value block reduction:
    // every thread contributes a key plus one or more associated values, and the
    // reduction keeps, per comparator (e.g. "less" for an arg-min), the winning
    // key together with its values. Keys and values may be scalars or
    // thrust::tuples (at most 10 elements — the thrust::tuple limit).
    namespace reduce_key_val_detail
    {
        // GetType<T>: recovers the element type behind a (volatile) pointer or a
        // reference, so a register temporary of the right type can be declared.
        template <typename T> struct GetType;
        template <typename T> struct GetType<T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<volatile T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<T&>
        {
            typedef T type;
        };

        // For<I, N>: compile-time loop applying an operation to tuple elements [I, N).
        template <unsigned int I, unsigned int N>
        struct For
        {
            // Store each tuple element of this thread's data into its shared-memory slot.
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
            {
                thrust::get<I>(smem)[tid] = thrust::get<I>(data);

                For<I + 1, N>::loadToSmem(smem, data, tid);
            }
            // Reload each tuple element from shared memory into registers.
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
            {
                thrust::get<I>(data) = thrust::get<I>(smem)[tid];

                For<I + 1, N>::loadFromSmem(smem, data, tid);
            }

            // Pull every value element from the lane `delta` lanes below via warp shuffle
            // (legacy pre-Volta shfl_down wrapper from warp_shuffle.hpp; requires SM30+).
            template <class ReferenceTuple>
            static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
            {
                thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);

                For<I + 1, N>::copyShfl(val, delta, width);
            }
            // Copy every value element from the shared-memory slot `delta` entries away,
            // updating both the register copy and this thread's slot.
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
            {
                thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];

                For<I + 1, N>::copy(svals, val, tid, delta);
            }

            // Shuffle-based merge: fetch the partner lane's key; if the per-element
            // comparator prefers it, adopt the partner's key and its value.
            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp,
                                             unsigned int delta, int width)
            {
                typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg
                    = shfl_down(thrust::get<I>(key), delta, width);

                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
                {
                    thrust::get<I>(key) = reg;
                    thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
                }

                For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
            }
            // Shared-memory merge against the thread `delta` slots away; winner is
            // written back to both registers and shared memory.
            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
                                         const ValPointerTuple& svals, const ValReferenceTuple& val,
                                         const CmpTuple& cmp,
                                         unsigned int tid, unsigned int delta)
            {
                typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg
                    = thrust::get<I>(skeys)[tid + delta];

                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
                {
                    thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
                    thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
                }

                For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
            }
        };
        // Recursion terminator: no-ops once I == N.
        template <unsigned int N>
        struct For<N, N>
        {
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
            {
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
            {
            }

            template <class ReferenceTuple>
            static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
            {
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
            {
            }

            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
            {
            }
            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
                                         const ValPointerTuple&, const ValReferenceTuple&,
                                         const CmpTuple&,
                                         unsigned int, unsigned int)
            {
            }
        };

        //////////////////////////////////////////////////////
        // loadToSmem

        template <typename T>
        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
        {
            smem[tid] = data;
        }
        template <typename T>
        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
        {
            data = smem[tid];
        }
        // Tuple overloads: iterate over all elements with For<0, tuple_size>.
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                   const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& data,
                                                   unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, data, tid);
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                     const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& data,
                                                     unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, data, tid);
        }

        //////////////////////////////////////////////////////
        // copyVals

        template <typename V>
        __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
        {
            val = shfl_down(val, delta, width);
        }
        template <typename V>
        __device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
        {
            svals[tid] = val = svals[tid + delta];
        }
        template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void copyValsShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                     unsigned int delta, int width)
        {
            For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::copyShfl(val, delta, width);
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void copyVals(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& svals,
                                                 const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                 unsigned int tid, unsigned int delta)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::copy(svals, val, tid, delta);
        }

        //////////////////////////////////////////////////////
        // merge

        // Scalar key / scalar value, shuffle flavour.
        template <typename K, typename V, class Cmp>
        __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
        {
            K reg = shfl_down(key, delta, width);

            if (cmp(reg, key))
            {
                key = reg;
                copyValsShfl(val, delta, width);
            }
        }
        // Scalar key / scalar value, shared-memory flavour.
        template <typename K, typename V, class Cmp>
        __device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp,
                                              unsigned int tid, unsigned int delta)
        {
            K reg = skeys[tid + delta];

            if (cmp(reg, key))
            {
                skeys[tid] = key = reg;
                copyVals(svals, val, tid, delta);
            }
        }
        // Scalar key / tuple of values, shuffle flavour.
        template <typename K,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
                  class Cmp>
        __device__ __forceinline__ void mergeShfl(K& key,
                                                  const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                  const Cmp& cmp,
                                                  unsigned int delta, int width)
        {
            K reg = shfl_down(key, delta, width);

            if (cmp(reg, key))
            {
                key = reg;
                copyValsShfl(val, delta, width);
            }
        }
        // Scalar key / tuple of values, shared-memory flavour.
        template <typename K,
                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp>
        __device__ __forceinline__ void merge(volatile K* skeys, K& key,
                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                              const Cmp& cmp, unsigned int tid, unsigned int delta)
        {
            K reg = skeys[tid + delta];

            if (cmp(reg, key))
            {
                skeys[tid] = key = reg;
                copyVals(svals, val, tid, delta);
            }
        }
        // Tuple of keys / tuple of values, shuffle flavour (one comparator per key).
        template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
        __device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                  const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                                  unsigned int delta, int width)
        {
            For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
        }
        // Tuple of keys / tuple of values, shared-memory flavour.
        template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
                  typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
        __device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
                                              const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                              const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                              unsigned int tid, unsigned int delta)
        {
            For<0, thrust::tuple_size<thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
        }

        //////////////////////////////////////////////////////
        // Generic

        // Generic<N>: tree reduction over N threads through shared memory.
        // Expects skeys/svals to hold at least N slots; tid in [0, N).
        // Below warp size (the final tid < 16 cascade) it relies on implicit
        // warp-synchronous execution (pre-Volta assumption, as elsewhere here).
        template <unsigned int N> struct Generic
        {
            template <class KP, class KR, class VP, class VR, class Cmp>
            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
                loadToSmem(skeys, key, tid);
                // NOTE(review): upstream called an undefined `loadValsToSmem` here,
                // a latent instantiation-time error; the existing loadToSmem
                // overloads cover both scalar and tuple values.
                loadToSmem(svals, val, tid);
                if (N >= 32)
                    __syncthreads();

                if (N >= 2048)
                {
                    if (tid < 1024)
                        merge(skeys, key, svals, val, cmp, tid, 1024);

                    __syncthreads();
                }
                if (N >= 1024)
                {
                    if (tid < 512)
                        merge(skeys, key, svals, val, cmp, tid, 512);

                    __syncthreads();
                }
                if (N >= 512)
                {
                    if (tid < 256)
                        merge(skeys, key, svals, val, cmp, tid, 256);

                    __syncthreads();
                }
                if (N >= 256)
                {
                    if (tid < 128)
                        merge(skeys, key, svals, val, cmp, tid, 128);

                    __syncthreads();
                }
                if (N >= 128)
                {
                    if (tid < 64)
                        merge(skeys, key, svals, val, cmp, tid, 64);

                    __syncthreads();
                }
                if (N >= 64)
                {
                    if (tid < 32)
                        merge(skeys, key, svals, val, cmp, tid, 32);
                }

                // Final intra-warp cascade — no __syncthreads(), warp-synchronous.
                if (tid < 16)
                {
                    merge(skeys, key, svals, val, cmp, tid, 16);
                    merge(skeys, key, svals, val, cmp, tid, 8);
                    merge(skeys, key, svals, val, cmp, tid, 4);
                    merge(skeys, key, svals, val, cmp, tid, 2);
                    merge(skeys, key, svals, val, cmp, tid, 1);
                }
            }
        };

        // Unroll<I, ...>: compile-time halving cascade (I, I/2, ..., 1).
        template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
        struct Unroll
        {
            static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
            {
                mergeShfl(key, val, cmp, I, N);
                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
            }
            static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
                merge(skeys, key, svals, val, cmp, tid, I);
                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
            }
        };
        template <class KP, class KR, class VP, class VR, class Cmp>
        struct Unroll<0, KP, KR, VP, VR, Cmp>
        {
            static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
            {
            }
            static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
            {
            }
        };

        // WarpOptimized<N>: N a power of two, N <= 32 — the whole reduction fits
        // in a single warp, so no block barriers are needed.
        template <unsigned int N> struct WarpOptimized
        {
            template <class KP, class KR, class VP, class VR, class Cmp>
            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
            #if 0 // __CUDA_ARCH__ >= 300 — shuffle path deliberately disabled upstream
                (void) skeys;
                (void) svals;
                (void) tid;

                Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
            #else
                loadToSmem(skeys, key, tid);
                loadToSmem(svals, val, tid);

                if (tid < N / 2)
                    Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
            #endif
            }
        };

        // GenericOptimized32<N>: N a power of two, 32 < N <= 1024. Each warp
        // reduces locally, warp leaders park results in slots [0, N/32), then
        // the first warp finishes the job.
        template <unsigned int N> struct GenericOptimized32
        {
            enum { M = N / 32 }; // number of warps / partial results

            template <class KP, class KR, class VP, class VR, class Cmp>
            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
                const unsigned int laneId = Warp::laneId();

            #if 0 // __CUDA_ARCH__ >= 300 — shuffle path deliberately disabled upstream
                Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);

                if (laneId == 0)
                {
                    loadToSmem(skeys, key, tid / 32);
                    loadToSmem(svals, val, tid / 32);
                }
            #else
                loadToSmem(skeys, key, tid);
                loadToSmem(svals, val, tid);

                // Intra-warp reduction; lanes [0,16) drive the halving cascade.
                if (laneId < 16)
                    Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);

                __syncthreads();

                // Each warp leader compacts its winner into slot tid/32.
                if (laneId == 0)
                {
                    loadToSmem(skeys, key, tid / 32);
                    loadToSmem(svals, val, tid / 32);
                }
            #endif

                __syncthreads();

                loadFromSmem(skeys, key, tid);

                // First warp reduces the M per-warp partials.
                if (tid < 32)
                {
                #if 0 // __CUDA_ARCH__ >= 300
                    loadFromSmem(svals, val, tid);

                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
                #else
                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
                #endif
                }
            }
        };

        // Compile-time if: selects T1 when the condition is true, else T2.
        template <bool val, class T1, class T2> struct StaticIf;
        template <class T1, class T2> struct StaticIf<true, T1, T2>
        {
            typedef T1 type;
        };
        template <class T1, class T2> struct StaticIf<false, T1, T2>
        {
            typedef T2 type;
        };

        template <unsigned int N> struct IsPowerOf2
        {
            enum { value = ((N != 0) && !(N & (N - 1))) };
        };

        // Dispatcher<N>::reductor picks the best strategy for a block of N threads:
        // warp-only, warp-cooperative, or the generic shared-memory tree.
        template <unsigned int N> struct Dispatcher
        {
            typedef typename StaticIf<
                (N <= 32) && IsPowerOf2<N>::value,
                WarpOptimized<N>,
                typename StaticIf<
                    (N <= 1024) && IsPowerOf2<N>::value,
                    GenericOptimized32<N>,
                    Generic<N>
                >::type
            >::type reductor;
        };
    }
}}}

#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__