diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index cfe4e0b195..5dfa6fdf3c 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -52,14 +52,6 @@ set(FAISS_GPU_SRC impl/PQScanMultiPassPrecomputed.cu impl/RemapIndices.cpp impl/VectorResidual.cu - impl/scan/IVFInterleaved1.cu - impl/scan/IVFInterleaved32.cu - impl/scan/IVFInterleaved64.cu - impl/scan/IVFInterleaved128.cu - impl/scan/IVFInterleaved256.cu - impl/scan/IVFInterleaved512.cu - impl/scan/IVFInterleaved1024.cu - impl/scan/IVFInterleaved2048.cu impl/IcmEncoder.cu utils/BlockSelectFloat.cu utils/DeviceUtils.cu @@ -176,6 +168,74 @@ set(FAISS_GPU_HEADERS utils/warpselect/WarpSelectImpl.cuh ) +function(generate_ivf_interleaved_code) + set(SUB_CODEC_TYPE + "faiss::gpu::Codec<0, 1>" + "faiss::gpu::Codec<1, 1>" + "faiss::gpu::Codec<2, 1>" + "faiss::gpu::Codec<3, 1>" + "faiss::gpu::Codec<4, 1>" + "faiss::gpu::Codec<5, 1>" + "faiss::gpu::Codec<6, 1>" + "faiss::gpu::CodecFloat" + ) + + set(SUB_METRIC_TYPE + "faiss::gpu::IPDistance" + "faiss::gpu::L2Distance" + ) + + # Used for SUB_THREADS, SUB_NUM_WARP_Q, SUB_NUM_THREAD_Q + set(THREADS_AND_WARPS + "128|1024|8" + "128|1|1" + "128|128|3" + "128|256|4" + "128|32|2" + "128|512|8" + "128|64|3" + "64|2048|8" + ) + + # Traverse through the Cartesian product of X and Y + foreach(sub_codec ${SUB_CODEC_TYPE}) + foreach(metric_type ${SUB_METRIC_TYPE}) + foreach(threads_and_warps_str ${THREADS_AND_WARPS}) + string(REPLACE "|" ";" threads_and_warps ${threads_and_warps_str}) + list(GET threads_and_warps 0 sub_threads) + list(GET threads_and_warps 1 sub_num_warp_q) + list(GET threads_and_warps 2 sub_num_thread_q) + + # Define the output file name + set(filename "template_${sub_codec}_${metric_type}_${sub_threads}_${sub_num_warp_q}_${sub_num_thread_q}") + # Remove illegal characters from filename + string(REGEX REPLACE "[^A-Za-z0-9_]" "" filename ${filename}) + set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.cu") + + # Read the template file + file(READ "${CMAKE_CURRENT_SOURCE_DIR}/impl/scan/IVFInterleavedScanKernelTemplate.cu" template_content) + + # Replace the placeholders + string(REPLACE "SUB_CODEC_TYPE" "${sub_codec}" template_content "${template_content}") + string(REPLACE "SUB_METRIC_TYPE" "${metric_type}" template_content "${template_content}") + string(REPLACE "SUB_THREADS" "${sub_threads}" template_content "${template_content}") + string(REPLACE "SUB_NUM_WARP_Q" "${sub_num_warp_q}" template_content "${template_content}") + string(REPLACE "SUB_NUM_THREAD_Q" "${sub_num_thread_q}" template_content "${template_content}") + + # Write the modified content to the output file + file(WRITE "${output_file}" "${template_content}") + + # Add the file to the sources + list(APPEND FAISS_GPU_SRC "${output_file}") + endforeach() + endforeach() + endforeach() + # Propagate modified variable to the parent scope + set(FAISS_GPU_SRC "${FAISS_GPU_SRC}" PARENT_SCOPE) +endfunction() + +generate_ivf_interleaved_code() + if(FAISS_ENABLE_RAFT) list(APPEND FAISS_GPU_HEADERS impl/RaftFlatIndex.cuh) diff --git a/faiss/gpu/impl/IVFInterleaved.cu b/faiss/gpu/impl/IVFInterleaved.cu index c9ee87ee42..9e595b4a59 100644 --- a/faiss/gpu/impl/IVFInterleaved.cu +++ b/faiss/gpu/impl/IVFInterleaved.cu @@ -210,25 +210,23 @@ void runIVFInterleavedScan( }; if (k == 1) { - ivf_interleaved_call(ivfInterleavedScanImpl); + ivf_interleaved_call(ivfInterleavedScanImpl<128, 1, 1>); } else if (k <= 32) { - ivf_interleaved_call(ivfInterleavedScanImpl); + ivf_interleaved_call(ivfInterleavedScanImpl<128, 32, 2>); } else if (k <= 64) { - ivf_interleaved_call(ivfInterleavedScanImpl); + ivf_interleaved_call(ivfInterleavedScanImpl<128, 64, 3>); } else if (k <= 128) { - ivf_interleaved_call(ivfInterleavedScanImpl); + ivf_interleaved_call(ivfInterleavedScanImpl<128, 128, 3>); } else if (k <= 256) { - ivf_interleaved_call(ivfInterleavedScanImpl); + ivf_interleaved_call(ivfInterleavedScanImpl<128, 256, 4>); } else if (k <= 512) { - ivf_interleaved_call(ivfInterleavedScanImpl); + ivf_interleaved_call(ivfInterleavedScanImpl<128, 512, 8>); } else if (k <= 1024) { - ivf_interleaved_call( - ivfInterleavedScanImpl); + ivf_interleaved_call(ivfInterleavedScanImpl<128, 1024, 8>); } #if GPU_MAX_SELECTION_K >= 2048 else if (k <= 2048) { - ivf_interleaved_call( - ivfInterleavedScanImpl); + ivf_interleaved_call(ivfInterleavedScanImpl<64, 2048, 8>); } #endif } diff --git a/faiss/gpu/impl/IVFInterleaved.cuh b/faiss/gpu/impl/IVFInterleaved.cuh index 053f99db09..5f92c366e3 100644 --- a/faiss/gpu/impl/IVFInterleaved.cuh +++ b/faiss/gpu/impl/IVFInterleaved.cuh @@ -35,8 +35,7 @@ template < typename Metric, int ThreadsPerBlock, int NumWarpQ, - int NumThreadQ, - bool Residual> + int NumThreadQ> __global__ void ivfInterleavedScan( Tensor queries, Tensor residualBase, @@ -48,7 +47,8 @@ __global__ void ivfInterleavedScan( int k, // [query][probe][k] Tensor distanceOut, - Tensor indicesOut) { + Tensor indicesOut, + const bool Residual) { extern __shared__ float smem[]; constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; @@ -124,7 +124,7 @@ __global__ void ivfInterleavedScan( for (int dBase = 0; dBase < dimBlocks; dBase += kWarpSize) { const int loadDim = dBase + laneId; const float queryReg = query[loadDim]; - [[maybe_unused]] const float residualReg = + const float residualReg = Residual ? residualBaseSlice[loadDim] : 0; constexpr int kUnroll = 4; @@ -152,7 +152,7 @@ __global__ void ivfInterleavedScan( decV[j] = codec.decodeNew(dBase + d, encV[j]); } - if constexpr (Residual) { + if (Residual) { #pragma unroll for (int j = 0; j < kUnroll; ++j) { int d = i * kUnroll + j; @@ -174,9 +174,9 @@ __global__ void ivfInterleavedScan( const bool loadDimInBounds = loadDim < dim; const float queryReg = loadDimInBounds ? query[loadDim] : 0; - [[maybe_unused]] const float residualReg = - Residual && loadDimInBounds ? residualBaseSlice[loadDim] - : 0; + const float residualReg = Residual && loadDimInBounds + ? residualBaseSlice[loadDim] + : 0; for (int d = 0; d < dim - dimBlocks; ++d, data += wordsPerVectorBlockDim) { @@ -187,7 +187,7 @@ __global__ void ivfInterleavedScan( enc = WarpPackedBits::postRead( laneId, enc); float dec = codec.decodeNew(dimBlocks + d, enc); - if constexpr (Residual) { + if (Residual) { dec += SHFL_SYNC(residualReg, d, kWarpSize); } diff --git a/faiss/gpu/impl/scan/IVFInterleaved1.cu b/faiss/gpu/impl/scan/IVFInterleaved1.cu deleted file mode 100644 index c898ec2d6d..0000000000 --- a/faiss/gpu/impl/scan/IVFInterleaved1.cu +++ /dev/null @@ -1,16 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace faiss { -namespace gpu { - -IVF_INTERLEAVED_IMPL(IVFINTERLEAVED_1_PARAMS) - -} -} // namespace faiss diff --git a/faiss/gpu/impl/scan/IVFInterleaved1024.cu b/faiss/gpu/impl/scan/IVFInterleaved1024.cu deleted file mode 100644 index d067a8b228..0000000000 --- a/faiss/gpu/impl/scan/IVFInterleaved1024.cu +++ /dev/null @@ -1,16 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace faiss { -namespace gpu { - -IVF_INTERLEAVED_IMPL(IVFINTERLEAVED_1024_PARAMS) - -} -} // namespace faiss diff --git a/faiss/gpu/impl/scan/IVFInterleaved128.cu b/faiss/gpu/impl/scan/IVFInterleaved128.cu deleted file mode 100644 index 1814df4074..0000000000 --- a/faiss/gpu/impl/scan/IVFInterleaved128.cu +++ /dev/null @@ -1,16 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace faiss { -namespace gpu { - -IVF_INTERLEAVED_IMPL(IVFINTERLEAVED_128_PARAMS) - -} -} // namespace faiss diff --git a/faiss/gpu/impl/scan/IVFInterleaved2048.cu b/faiss/gpu/impl/scan/IVFInterleaved2048.cu deleted file mode 100644 index 1ffb6fc9aa..0000000000 --- a/faiss/gpu/impl/scan/IVFInterleaved2048.cu +++ /dev/null @@ -1,18 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace faiss { -namespace gpu { - -#if GPU_MAX_SELECTION_K >= 2048 -IVF_INTERLEAVED_IMPL(IVFINTERLEAVED_2048_PARAMS) -#endif - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/impl/scan/IVFInterleaved256.cu b/faiss/gpu/impl/scan/IVFInterleaved256.cu deleted file mode 100644 index c7817460f4..0000000000 --- a/faiss/gpu/impl/scan/IVFInterleaved256.cu +++ /dev/null @@ -1,16 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace faiss { -namespace gpu { - -IVF_INTERLEAVED_IMPL(IVFINTERLEAVED_256_PARAMS) - -} -} // namespace faiss diff --git a/faiss/gpu/impl/scan/IVFInterleaved32.cu b/faiss/gpu/impl/scan/IVFInterleaved32.cu deleted file mode 100644 index 401a2f5ab2..0000000000 --- a/faiss/gpu/impl/scan/IVFInterleaved32.cu +++ /dev/null @@ -1,16 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace faiss { -namespace gpu { - -IVF_INTERLEAVED_IMPL(IVFINTERLEAVED_32_PARAMS) - -} -} // namespace faiss diff --git a/faiss/gpu/impl/scan/IVFInterleaved512.cu b/faiss/gpu/impl/scan/IVFInterleaved512.cu deleted file mode 100644 index ac3c0d3e22..0000000000 --- a/faiss/gpu/impl/scan/IVFInterleaved512.cu +++ /dev/null @@ -1,16 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace faiss { -namespace gpu { - -IVF_INTERLEAVED_IMPL(IVFINTERLEAVED_512_PARAMS) - -} -} // namespace faiss diff --git a/faiss/gpu/impl/scan/IVFInterleaved64.cu b/faiss/gpu/impl/scan/IVFInterleaved64.cu deleted file mode 100644 index 56a02b5054..0000000000 --- a/faiss/gpu/impl/scan/IVFInterleaved64.cu +++ /dev/null @@ -1,16 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace faiss { -namespace gpu { - -IVF_INTERLEAVED_IMPL(IVFINTERLEAVED_64_PARAMS) - -} -} // namespace faiss diff --git a/faiss/gpu/impl/scan/IVFInterleavedImpl.cuh b/faiss/gpu/impl/scan/IVFInterleavedImpl.cuh index 7511374a69..f52753e5f3 100644 --- a/faiss/gpu/impl/scan/IVFInterleavedImpl.cuh +++ b/faiss/gpu/impl/scan/IVFInterleavedImpl.cuh @@ -36,74 +36,7 @@ void IVFINT_RUN( GpuScalarQuantizer* scalarQ, Tensor& outDistances, Tensor& outIndices, - GpuResources* res) { - const auto nq = queries.getSize(0); - const auto dim = queries.getSize(1); - const auto nprobe = listIds.getSize(1); - - const auto stream = res->getDefaultStreamCurrentDevice(); - - DeviceTensor distanceTemp( - res, - makeTempAlloc(AllocType::Other, stream), - {queries.getSize(0), listIds.getSize(1), k}); - DeviceTensor indicesTemp( - res, - makeTempAlloc(AllocType::Other, stream), - {queries.getSize(0), listIds.getSize(1), k}); - - const dim3 grid(nprobe, std::min(nq, (idx_t)getMaxGridCurrentDevice().y)); - - if (useResidual) { - ivfInterleavedScan< - CODEC_TYPE, - METRIC_TYPE, - THREADS, - NUM_WARP_Q, - NUM_THREAD_Q, - true><<>>( - queries, - residualBase, - listIds, - listData.data(), - listLengths.data(), - codec, - metric, - k, - distanceTemp, - indicesTemp); - } else { - ivfInterleavedScan< - CODEC_TYPE, - METRIC_TYPE, - THREADS, - NUM_WARP_Q, - NUM_THREAD_Q, - false><<>>( - queries, - residualBase, - listIds, - listData.data(), - listLengths.data(), - codec, - metric, - k, - distanceTemp, - indicesTemp); - } - - runIVFInterleavedScan2( - distanceTemp, - indicesTemp, - listIds, - k, - listIndices, - indicesOptions, - METRIC_TYPE::kDirection, - outDistances, - outIndices, - stream); -} + GpuResources* res); template void IVFINT_CODECS( @@ -270,24 +203,22 @@ void IVFINT_CODECS( } } -#define IVF_INTERLEAVED_SCAN_IMPL_ARGS \ - (Tensor & queries, \ - Tensor & listIds, \ - DeviceVector & listData, \ - DeviceVector & listIndices, \ - IndicesOptions indicesOptions, \ - DeviceVector & listLengths, \ - const int k, \ - faiss::MetricType metric_name, \ - const bool useResidual, \ - Tensor& residualBase, \ - GpuScalarQuantizer* scalarQ, \ - Tensor& outDistances, \ - Tensor& outIndices, \ - GpuResources* res) - template -void IVF_METRICS IVF_INTERLEAVED_SCAN_IMPL_ARGS { +void ivfInterleavedScanImpl( + Tensor& queries, + Tensor& listIds, + DeviceVector& listData, + DeviceVector& listIndices, + IndicesOptions indicesOptions, + DeviceVector& listLengths, + const int k, + faiss::MetricType metric_name, + const bool useResidual, + Tensor& residualBase, + GpuScalarQuantizer* scalarQ, + Tensor& outDistances, + Tensor& outIndices, + GpuResources* res) { FAISS_ASSERT(k <= NUM_WARP_Q); const auto call_codec = [&](const auto& func, const auto& metric) { @@ -324,42 +255,5 @@ void IVF_METRICS IVF_INTERLEAVED_SCAN_IMPL_ARGS { CUDA_TEST_ERROR(); } -template -void ivfInterleavedScanImpl IVF_INTERLEAVED_SCAN_IMPL_ARGS; - -#define IVF_INTERLEAVED_IMPL_HELPER(THREADS, NUM_WARP_Q, NUM_THREAD_Q) \ - template <> \ - void ivfInterleavedScanImpl \ - IVF_INTERLEAVED_SCAN_IMPL_ARGS { \ - IVF_METRICS( \ - queries, \ - listIds, \ - listData, \ - listIndices, \ - indicesOptions, \ - listLengths, \ - k, \ - metric_name, \ - useResidual, \ - residualBase, \ - scalarQ, \ - outDistances, \ - outIndices, \ - res); \ - } - -#define IVF_INTERLEAVED_IMPL(...) IVF_INTERLEAVED_IMPL_HELPER(__VA_ARGS__) - -// clang-format off -#define IVFINTERLEAVED_1_PARAMS 128,1,1 -#define IVFINTERLEAVED_32_PARAMS 128,32,2 -#define IVFINTERLEAVED_64_PARAMS 128,64,3 -#define IVFINTERLEAVED_128_PARAMS 128,128,3 -#define IVFINTERLEAVED_256_PARAMS 128,256,4 -#define IVFINTERLEAVED_512_PARAMS 128,512,8 -#define IVFINTERLEAVED_1024_PARAMS 128,1024,8 -#define IVFINTERLEAVED_2048_PARAMS 64,2048,8 -// clang-format on - } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/scan/IVFInterleavedScanKernelTemplate.cu b/faiss/gpu/impl/scan/IVFInterleavedScanKernelTemplate.cu new file mode 100644 index 0000000000..d25c25a780 --- /dev/null +++ b/faiss/gpu/impl/scan/IVFInterleavedScanKernelTemplate.cu @@ -0,0 +1,78 @@ +#include + +namespace faiss { +namespace gpu { + +template <> +void IVFINT_RUN< + SUB_CODEC_TYPE, + SUB_METRIC_TYPE, + SUB_THREADS, + SUB_NUM_WARP_Q, + SUB_NUM_THREAD_Q>( + SUB_CODEC_TYPE& codec, + Tensor& queries, + Tensor& listIds, + DeviceVector& listData, + DeviceVector& listIndices, + IndicesOptions indicesOptions, + DeviceVector& listLengths, + const int k, + SUB_METRIC_TYPE metric, + const bool useResidual, + Tensor& residualBase, + GpuScalarQuantizer* scalarQ, + Tensor& outDistances, + Tensor& outIndices, + GpuResources* res) { + const auto nq = queries.getSize(0); + const auto dim = queries.getSize(1); + const auto nprobe = listIds.getSize(1); + + const auto stream = res->getDefaultStreamCurrentDevice(); + + DeviceTensor distanceTemp( + res, + makeTempAlloc(AllocType::Other, stream), + {queries.getSize(0), listIds.getSize(1), k}); + DeviceTensor indicesTemp( + res, + makeTempAlloc(AllocType::Other, stream), + {queries.getSize(0), listIds.getSize(1), k}); + + const dim3 grid(nprobe, std::min(nq, (idx_t)getMaxGridCurrentDevice().y)); + + ivfInterleavedScan< + SUB_CODEC_TYPE, + SUB_METRIC_TYPE, + SUB_THREADS, + SUB_NUM_WARP_Q, + SUB_NUM_THREAD_Q> + <<>>( + queries, + residualBase, + listIds, + listData.data(), + listLengths.data(), + codec, + metric, + k, + distanceTemp, + indicesTemp, + useResidual); + + runIVFInterleavedScan2( + distanceTemp, + indicesTemp, + listIds, + k, + listIndices, + indicesOptions, + SUB_METRIC_TYPE::kDirection, + outDistances, + outIndices, + stream); +} + +} // namespace gpu +} // namespace faiss