Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

小彭老师,麻烦使用SSE优化一下下面的代码 #5

Open
zhangnatha opened this issue Aug 30, 2023 · 5 comments
Open

小彭老师,麻烦使用SSE优化一下下面的代码 #5

zhangnatha opened this issue Aug 30, 2023 · 5 comments

Comments

@zhangnatha
Copy link

zhangnatha commented Aug 30, 2023

以下的calSimilarity函数在算法过程中会执行(-180°~180°步长为0.1)很多次,下面时从工程中摘出来的代码,运行单次时,在编译器开O0优化,耗时为2046.41ms;在编译器开O3优化,耗时为310.06ms

gcc normal_calSimilarity.cpp -O0 -o normal_calSimilarity_gcc -lstdc++ -lm

硬件
CPU: i7-10700 CPU@2.90GHz x 16**

//********************************************************************************************
//***********************************normal_calSimilarity.cpp********************************
//********************************************************************************************
#include <iostream>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>

struct templateFeat
{
	int x;
	int y;
	short dx;
	short dy;
	float mag;
};

struct matchResult
{
	int i;
	int j;
	int angle;
	float score;
};

struct searchFeat
{
	short dx;
	short dy;
	float mag;
};

bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
    std::ofstream file(txt_path);

    if (file.is_open()) {
        for (int i = 0; i < match_result.size(); ++i) {
            file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
        }

        file.close();
        std::cout << "Match result datas saved to match_result.txt!" << std::endl;
    } else {
        std::cout << "Unable to open file" << std::endl;
    }
    return true;
}

std::vector<matchResult> calSimilarity(const std::vector<templateFeat> template_point,const std::vector<searchFeat> search_point)
{
    int template_feat_size =  template_point.size();
    int search_feat_size =  search_point.size();

    std::vector<matchResult> results0Deg;
    float anMinScore     = 0.4 - 1;
    float NormMinScore   = 0.4 / template_feat_size;
    float NormGreediness = ((1 - 0.8 * 0.4) / (1 - 0.8)) / template_feat_size;
    
    for(int i = 0; i < 1920; i++)
    {
        for(int j = 0; j < 1200; j++)
        {
            float PartialScore = 0;
            float PartialSum   = 0;
            int   SumOfCoords  = 0;

            for(int m = 0; m < template_feat_size; m++)
            {
                int curX = i + template_point[m].x;
                int curY = j + template_point[m].y;
				
                if(curX < 0 || curY < 0 || curX > 230 || curY > 350)
                {
                    continue;
                }

                short iTx = template_point[m].dx;
                short iTy = template_point[m].dy;
                float iTm = template_point[m].mag;

                int offSet = curY * 350 + curX;
                short iSx        = search_point[offSet].dx;
                short iSy        = search_point[offSet].dy; 
                float iSm        = search_point[offSet].mag;

                if((iSx != 0 || iSy != 0) && (iTx != 0 || iTy != 0))
                {
                    PartialSum += ((iSx * iTx) + (iSy * iTy)) * (iSm * iTm);
                }
                SumOfCoords  = m + 1;
                PartialScore = PartialSum / SumOfCoords;

                if(PartialScore < (std::min(anMinScore + NormGreediness * SumOfCoords, NormMinScore * SumOfCoords)))
                {
                    break;
                }
            }

            if(PartialScore > 0.4)
            {
                results0Deg.push_back({i, j, 0, PartialScore});
            } 
        }
    }

    return results0Deg;
}

int main()
{
    //为了测试函数而生成的随机数:template_point/search_point
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<int> intXDist(-18, 32);
    std::uniform_int_distribution<int> intYDist(-48, 49);
    std::uniform_int_distribution<short> shortDXDist(-348, 349);
    std::uniform_int_distribution<short> shortDYDist(-421, 352);
    std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);

    std::vector<templateFeat> template_point(215);
    for (auto &feat : template_point)
    {
        feat.x = intXDist(gen);
        feat.y = intYDist(gen);
        feat.dx = shortDXDist(gen);
        feat.dy = shortDYDist(gen);
        feat.mag = floatDist(gen);
    }

    std::uniform_int_distribution<short> shortDxDist(-460, 460);
    std::uniform_int_distribution<short> shortDyDist(-476, 460);
    std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);

    std::vector<searchFeat> search_point(2304000);
    for (auto &feat : search_point)
    {
        feat.dx = shortDxDist(gen);
        feat.dy = shortDyDist(gen);
        feat.mag = float_Dist(gen);
    }

    //运行算法,计算耗时
    auto start = std::chrono::high_resolution_clock::now();
    std::vector<matchResult> results0Deg = calSimilarity(template_point,search_point);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> duration = end - start;
    std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;

    //保存匹配结果
    saveDataToTxt("./match_result.txt",results0Deg);

    return true;
}
@archibate
Copy link
Contributor

archibate commented Aug 30, 2023

原版 418.72ms
SSE 50.32ms
SSE+OpenMP 21.99ms
g++ normal_calSimilarity.cpp -O3 -fopenmp -mavx2 -o normal_calSimilarity_gcc

#include <iostream>
#include <immintrin.h>
#include <atomic>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
#define REP4(x) (x, x, x, x)

struct templateFeat
{
	__m128i x;
	__m128i y;
	__m64 dx;
	__m64 dy;
	__m128 mag;
};

struct matchResult
{
	int i;
	int j;
	int angle;
	float score;
};

struct searchFeat
{
	short dx;
	short dy;
	float mag;
};

bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
    std::ofstream file(txt_path);

    if (file.is_open()) {
        for (int i = 0; i < match_result.size(); ++i) {
            file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
        }

        file.close();
        std::cout << "Match result datas saved to " << txt_path << "!" << std::endl;
    } else {
        std::cout << "Unable to open file" << std::endl;
    }
    return true;
}

std::vector<matchResult> const &calSimilarity(const std::vector<templateFeat> &template_point,size_t template_feat_size,const std::vector<searchFeat> &search_point,int search_feat_size)
{
    static std::vector<matchResult> results0Deg;
    std::atomic<size_t> results0DegSize{0};
    results0Deg.resize(1920 * 1200);
    auto $anMinScore     = _mm_set1_ps(0.4f - 1);
    auto $NormMinScore   = _mm_set1_ps(0.4f / template_feat_size);
    auto $NormGreediness = _mm_set1_ps(((1 - 0.8f * 0.4f) / (1 - 0.8f)) / template_feat_size);
    
    #pragma omp parallel for collapse(2)
    for(int i = 0; i < 1920; i++)
    {
        for(int j = 0; j < 1200; j++)
        {
            auto $PartialSum = _mm_setzero_ps();

            auto $i = _mm_set1_epi32(i);
            auto $j = _mm_set1_epi32(j);
            auto $m1 = _mm_setr_ps(1, 2, 3, 4);
            auto $0 = _mm_setzero_si128();
            auto $4 = _mm_set1_ps(4);
            auto $230 = _mm_set1_epi32(230);
            auto $350 = _mm_set1_epi32(350);

            for(int m = 0; m < template_feat_size / 4 * 4; m += 4)
            {
                auto $curX = _mm_add_epi32($i, template_point[m / 4].x);
                auto $curY = _mm_add_epi32($j, template_point[m / 4].y);

                auto $ok = _mm_or_si128(
                    _mm_or_si128(_mm_cmplt_epi32($curX, $0), _mm_cmplt_epi32($curY, $0)),
                    _mm_or_si128(_mm_cmpgt_epi32($curX, $230), _mm_cmpgt_epi32($curY, $350)));

                auto $iTx = _mm_cvtpi16_ps(template_point[m / 4].dx);
                auto $iTy = _mm_cvtpi16_ps(template_point[m / 4].dy);
                auto $iTm = template_point[m / 4].mag;

                auto $offSet = _mm_and_si128($curY * $350 + $curX, $ok);
                auto $iSxy = _mm_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
                // isx isy isx isy isx isy isx isy
                // isx 0 isx 0 isx 0 isx 0
                // isy 0 isy 0 isy 0 isy 0
                auto $iSx = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16($0, $iSxy), 16));
                auto $iSy = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16($0, $iSxy), 16));
                auto $iSm = _mm_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));

                $PartialSum = _mm_add_ps($PartialSum,
                                         _mm_and_ps(_mm_castsi128_ps($ok),
                                                    _mm_mul_ps(
                                                    _mm_mul_ps($iSm, $iTm),
                                                    _mm_add_ps(
                                                    _mm_mul_ps($iSx, $iTx),
                                                    _mm_mul_ps($iSy, $iTy))
                                                    )));
                auto $cond = _mm_mul_ps($m1,
                                        _mm_min_ps(
                                        _mm_add_ps($anMinScore, _mm_mul_ps($NormGreediness, $m1)),
                                        _mm_mul_ps($NormMinScore, $m1)));
                /* for (int _ = 0; _ < 4; _++) */
                /*      printf("%f %f %f %f %f %f %f %f\n", $iSx[_], $iSy[_], $iTx[_], $iTy[_], $iSm[_], $PartialSum[_], $m1[_], $cond[_]); */

                auto $cmp = _mm_cmplt_ps($PartialSum, $cond);
                int cmpmask = _mm_movemask_ps($cmp);
                if (cmpmask) [[unlikely]] {
                    $PartialSum = _mm_setzero_ps();
                    break;
                }
                $m1 = _mm_add_ps($m1, $4);
            }
            $PartialSum = _mm_hadd_ps($PartialSum, $PartialSum);
            $PartialSum = _mm_hadd_ps($PartialSum, $PartialSum);
            auto PartialScore = _mm_cvtss_f32($PartialSum) / (template_feat_size + 1);
            if(PartialScore > 0.4f)
            {
                results0Deg[results0DegSize.fetch_add(1, std::memory_order_relaxed)] = {i, j, 0, PartialScore};
            }
        }
    }

    results0Deg.resize(results0DegSize.load(std::memory_order_relaxed));
    return results0Deg;
}

int main()
{
    //为了测试函数而生成的随机数:template_point/search_point
    std::mt19937 gen;
    std::uniform_int_distribution<int> intXDist(-18, 32);
    std::uniform_int_distribution<int> intYDist(-48, 49);
    std::uniform_int_distribution<short> shortDXDist(-348, 349);
    std::uniform_int_distribution<short> shortDYDist(-421, 352);
    std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);

    std::vector<templateFeat> template_point(215 / 4);
    for (auto &feat : template_point)
    {
        feat.x = _mm_setr_epi32 REP4(intXDist(gen));
        feat.y = _mm_setr_epi32 REP4(intYDist(gen));
        feat.dx = _mm_setr_pi16 REP4(shortDXDist(gen));
        feat.dy = _mm_setr_pi16 REP4(shortDYDist(gen));
        feat.mag = _mm_setr_ps REP4(floatDist(gen));
    }

    std::uniform_int_distribution<short> shortDxDist(-460, 460);
    std::uniform_int_distribution<short> shortDyDist(-476, 460);
    std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);

    std::vector<searchFeat> search_point(2304000);
    for (auto &feat : search_point)
    {
        feat.dx = (shortDxDist(gen));
        feat.dy = (shortDyDist(gen));
        feat.mag = (float_Dist(gen));
    }

    //运行算法,计算耗时
    auto start = std::chrono::high_resolution_clock::now();
    std::vector<matchResult> const &results0Deg = calSimilarity(template_point,215,search_point,2304000);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> duration = end - start;
    std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;

    //保存匹配结果
    saveDataToTxt("/tmp/match_result.txt",results0Deg);

    return 0;
}

@zhangnatha
Copy link
Author

速度是上去了,但是:将随机数据改成固定数据,运行费SSE与SSE代码,产生的结果不一致,match_result.txt文件内容不一致。

  • 问题项

for循环内部计算:非SSEPartialScore = PartialSum / SumOfCoords; 的SumOfCoords值不应该是 SSEauto PartialScore = _mm_cvtss_f32($PartialSum) / (template_feat_size + 1); 中的template_feat_size固定值

@archibate
Copy link
Contributor

修复后的SSE版本

#include <iostream>
#include <immintrin.h>
#include <atomic>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
#define REP4(x) (x, x, x, x)

struct templateFeat
{
	__m128i x;
	__m128i y;
	__m64 dx;
	__m64 dy;
	__m128 mag;
};

struct matchResult
{
	int i;
	int j;
	int angle;
	float score;
};

struct searchFeat
{
	short dx;
	short dy;
	float mag;
};

bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
    std::ofstream file(txt_path);

    if (file.is_open()) {
        for (int i = 0; i < match_result.size(); ++i) {
            file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
        }

        file.close();
        std::cout << "Match result datas saved to " << txt_path << "!" << std::endl;
    } else {
        std::cout << "Unable to open file" << std::endl;
    }
    return true;
}

std::vector<matchResult> const &calSimilarity(const std::vector<templateFeat> &template_point,size_t template_feat_size,const std::vector<searchFeat> &search_point,int search_feat_size)
{
    static std::vector<matchResult> results0Deg;
    std::atomic<size_t> results0DegSize{0};
    results0Deg.resize(1920 * 1200);
    auto anMinScore     = (0.4 - 1);
    auto NormMinScore   = (0.4 / template_feat_size);
    auto NormGreediness = (((1 - 0.8 * 0.4) / (1 - 0.8)) / template_feat_size);

    static std::vector<float> thresholds;
    thresholds.resize(((template_feat_size + 3) & ~3));
    for (int m = 0; m < ((template_feat_size + 3) & ~3); m++) {
        auto SumOfCoords = m + 1;
        thresholds[m] = SumOfCoords * std::min(anMinScore + NormGreediness * SumOfCoords, NormMinScore * SumOfCoords);
    }
    
    /* #pragma omp parallel for collapse(2) */
    for(int i = 0; i < 1920; i++)
    {
        for(int j = 0; j < 1200; j++)
        {
            float PartialScore = -999999;
            auto $PartialScore = _mm_setzero_ps();
            auto $PartialSum = _mm_setzero_ps();
            auto $SumOfCoords = _mm_setr_ps(-3, -2, -1, 0);
            auto $4 = _mm_set1_ps(4);
            auto $template_feat_size = _mm_set1_ps(template_feat_size);
            auto $i = _mm_set1_epi32(i);
            auto $j = _mm_set1_epi32(j);
            auto $0 = _mm_setzero_si128();
            auto $230 = _mm_set1_epi32(230);
            auto $350 = _mm_set1_epi32(350);

            for(int m = 0; m < ((template_feat_size + 3) & ~3); m += 4)
            {
                $SumOfCoords = _mm_add_ps($SumOfCoords, $4);

                auto $curX = _mm_add_epi32($i, template_point[m / 4].x);
                auto $curY = _mm_add_epi32($j, template_point[m / 4].y);

                auto $notok = _mm_or_si128(_mm_or_si128(_mm_castps_si128(_mm_cmpge_ps($SumOfCoords, $template_feat_size)),
                    _mm_or_si128(_mm_cmplt_epi32($curX, $0), _mm_cmplt_epi32($curY, $0))),
                    _mm_or_si128(_mm_cmpgt_epi32($curX, $230), _mm_cmpgt_epi32($curY, $350)));
                /* if (0xffff == _mm_movemask_epi8($notok)) { */
                /*     continue; */
                /* } */

                auto $iTx = _mm_cvtepi16_epi32(_mm_set1_epi64(template_point[m / 4].dx));
                auto $iTy = _mm_cvtepi16_epi32(_mm_set1_epi64(template_point[m / 4].dy));
                auto $iTm = template_point[m / 4].mag;

                auto $offSet = _mm_andnot_si128($notok, _mm_add_epi32(_mm_mullo_epi32($curY, $350), $curX));
                auto $iSxy = _mm_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
                // isx isy isx isy isx isy isx isy
                auto $iSx = _mm_srai_epi32(_mm_slli_epi32($iSxy, 16), 16);
                auto $iSy = _mm_srai_epi32($iSxy, 16);
                auto $iSm = _mm_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));

                auto $accum = _mm_andnot_ps(_mm_castsi128_ps($notok),
                                            _mm_mul_ps(
                                            _mm_mul_ps($iSm, $iTm),
                                            _mm_cvtepi32_ps(
                                            _mm_add_epi32(
                                            _mm_mullo_epi32($iSx, $iTx),
                                            _mm_mullo_epi32($iSy, $iTy)))));
                /* auto $cond = _mm_mul_ps($m1, */
                /*                         _mm_min_ps( */
                /*                         _mm_add_ps($anMinScore, _mm_mul_ps($NormGreediness, $m1)), */
                /*                         _mm_mul_ps($NormMinScore, $m1))); */
                /* for (int _ = 0; _ < 4; _++) */
                /*      printf("%d %f %f %f %f %f %f %f %f %f %f %f\n", m + _, $accum[_], _mm_cvtepi32_ps($offSet)[_], _mm_cvtepi32_ps($curX)[_], _mm_cvtepi32_ps($curY)[_], _mm_cvtepi32_ps($iSx)[_], _mm_cvtepi32_ps($iSy)[_], _mm_cvtepi32_ps($iTx)[_], _mm_cvtepi32_ps($iTy)[_], $iSm[_], $iTm[_], _mm_cvtepi32_ps($notok)[_]); */
                /* exit(1); */

                // 0 1 2 3 (HI)
                $accum = _mm_add_ps($accum, _mm_castsi128_ps(_mm_bslli_si128(_mm_castps_si128($accum), 4)));
                // 0 1+0 2+1 3+2
                $accum = _mm_add_ps($accum, _mm_castsi128_ps(_mm_bslli_si128(_mm_castps_si128($accum), 8)));
                // 0 1+0 2+1+0 3+2+1+0

                $PartialScore = _mm_add_ps($PartialSum, $accum);
                /* auto $cond = _mm_mul_ps($SumOfCoords, _mm_min_ps( */
                /*     _mm_add_ps(_mm_set1_ps(anMinScore), _mm_mul_ps(_mm_set1_ps(NormGreediness), $SumOfCoords)), */
                /*     _mm_mul_ps(_mm_set1_ps(NormMinScore), $SumOfCoords))); */
                auto $cond = _mm_load_ps(thresholds.data() + m);
                auto $cmp = _mm_andnot_ps(_mm_castsi128_ps($notok), _mm_cmplt_ps($PartialScore, $cond));
                int cmpmask = _mm_movemask_ps($cmp);
                if (cmpmask) {
                    int bit = __builtin_ctz(cmpmask);
                    PartialScore = $PartialScore[bit] / (m + bit + 1);
                    goto out;
                }
                $PartialSum = _mm_add_ps($PartialSum, _mm_shuffle_ps($accum, $accum, 0xff));
            }
            $PartialScore = _mm_div_ps($PartialScore, $SumOfCoords);
            PartialScore = $PartialScore[(template_feat_size - 1) & 3];
        out:
            if(PartialScore > 0.4f)
            {
                results0Deg[results0DegSize.fetch_add(1, std::memory_order_relaxed)] = {i, j, 0, PartialScore};
            }
        }
    }

    results0Deg.resize(results0DegSize.load(std::memory_order_relaxed));
    return results0Deg;
}

int main()
{
    //为了测试函数而生成的随机数:template_point/search_point
    std::mt19937 gen(0), gen1(1), gen2(2), gen3(3), gen4(4), gen5(5);
    std::uniform_int_distribution<int> intXDist(-18, 32);
    std::uniform_int_distribution<int> intYDist(-48, 49);
    std::uniform_int_distribution<short> shortDXDist(-348, 349);
    std::uniform_int_distribution<short> shortDYDist(-421, 352);
    std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);

    std::vector<templateFeat> template_point((215 + 3) / 4);
    for (auto &feat : template_point)
    {
        feat.x = _mm_set_epi32 REP4(intXDist(gen1));
        feat.y = _mm_set_epi32 REP4(intYDist(gen2));
        feat.dx = _mm_set_pi16 REP4(shortDXDist(gen3));
        feat.dy = _mm_set_pi16 REP4(shortDYDist(gen4));
        feat.mag = _mm_set_ps REP4(floatDist(gen5));
    }

    std::uniform_int_distribution<short> shortDxDist(-460, 460);
    std::uniform_int_distribution<short> shortDyDist(-476, 460);
    std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);

    std::vector<searchFeat> search_point(2304000);
    for (auto &feat : search_point)
    {
        feat.dx = (shortDxDist(gen));
        /* static int once = printf("%d\n", feat.dx); */
        feat.dy = (shortDyDist(gen));
        feat.mag = (float_Dist(gen));
    }

    //运行算法,计算耗时
    auto start = std::chrono::high_resolution_clock::now();
    std::vector<matchResult> const &results0Deg = calSimilarity(template_point,215,search_point,2304000);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> duration = end - start;
    std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;

    //保存匹配结果
    saveDataToTxt("/tmp/match_result.txt",results0Deg);

    return 0;
}

旧版本

//********************************************************************************************
//***********************************normal_calSimilarity.cpp********************************
//********************************************************************************************
#include <iostream>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>

struct templateFeat
{
	int x;
	int y;
	short dx;
	short dy;
	float mag;
};

struct matchResult
{
	int i;
	int j;
	int angle;
	float score;
};

struct searchFeat
{
	short dx;
	short dy;
	float mag;
};

bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
    std::ofstream file(txt_path);

    if (file.is_open()) {
        for (int i = 0; i < match_result.size(); ++i) {
            file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
        }

        file.close();
        std::cout << "Match result datas saved to match_result.txt!" << std::endl;
    } else {
        std::cout << "Unable to open file" << std::endl;
    }
    return true;
}

std::vector<matchResult> calSimilarity(const std::vector<templateFeat> template_point,const std::vector<searchFeat> search_point)
{
    int template_feat_size =  template_point.size();
    int search_feat_size =  search_point.size();

    std::vector<matchResult> results0Deg;
    float anMinScore     = 0.4 - 1;
    float NormMinScore   = 0.4 / template_feat_size;
    float NormGreediness = ((1 - 0.8 * 0.4) / (1 - 0.8)) / template_feat_size;
    
    for(int i = 0; i < 1920; i++)
    {
        for(int j = 0; j < 1200; j++)
        {
            float PartialScore = 0;
            float PartialSum   = 0;
            int   SumOfCoords  = 0;

            for(int m = 0; m < template_feat_size; m++)
            {
                int curX = i + template_point[m].x;
                int curY = j + template_point[m].y;
				
                if(curX < 0 || curY < 0 || curX > 230 || curY > 350)
                {
                    continue;
                }

                int iTx = template_point[m].dx;
                int iTy = template_point[m].dy;
                float iTm = template_point[m].mag;

                int offSet = curY * 350 + curX;
                int iSx        = search_point[offSet].dx;
                int iSy        = search_point[offSet].dy; 
                float iSm        = search_point[offSet].mag;

                if((iSx != 0 || iSy != 0) && (iTx != 0 || iTy != 0))
                {
                    PartialSum += ((iSx * iTx) + (iSy * iTy)) * (iSm * iTm);
                }
                SumOfCoords  = m + 1;
                PartialScore = PartialSum / SumOfCoords;

                /* printf("%d %f %d %d %d %d %d %d %d %f %f %f\n", m, ((iSx * iTx) + (iSy * iTy)) * (iSm * iTm), offSet, curX, curY, iSx, iSy, iTx, iTy, iSm, iTm, PartialSum); */
                /* exit(1); */
                /*  */
                /* printf("%d %f %f\n", m, PartialSum, ((iSx * iTx) + (iSy * iTy)) * (iSm * iTm)); */

                if(PartialScore < (std::min(anMinScore + NormGreediness * SumOfCoords, NormMinScore * SumOfCoords)))
                {
                    break;
                }
            }

            /* printf("%f %d\n", PartialScore, SumOfCoords); */
            if(PartialScore > 0.4)
            {
            /* printf("%d %d %f\n", i, j, PartialScore); */
            /*     exit(1); */
                results0Deg.push_back({i, j, 0, PartialScore});
            } 
        }
    }

    return results0Deg;
}

int main()
{
    //为了测试函数而生成的随机数:template_point/search_point
    std::mt19937 gen(0), gen1(1), gen2(2), gen3(3), gen4(4), gen5(5);
    std::uniform_int_distribution<int> intXDist(-18, 32);
    std::uniform_int_distribution<int> intYDist(-48, 49);
    std::uniform_int_distribution<short> shortDXDist(-348, 349);
    std::uniform_int_distribution<short> shortDYDist(-421, 352);
    std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);

    std::vector<templateFeat> template_point(215);
    for (auto &feat : template_point)
    {
        feat.x = intXDist(gen1);
        feat.y = intYDist(gen2);
        feat.dx = shortDXDist(gen3);
        feat.dy = shortDYDist(gen4);
        feat.mag = floatDist(gen5);
    }

    std::uniform_int_distribution<short> shortDxDist(-460, 460);
    std::uniform_int_distribution<short> shortDyDist(-476, 460);
    std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);

    std::vector<searchFeat> search_point(2304000);
    for (auto &feat : search_point)
    {
        feat.dx = shortDxDist(gen);
        /* static int once = printf("%d\n", feat.dx); */
        feat.dy = shortDyDist(gen);
        feat.mag = float_Dist(gen);
    }

    //运行算法,计算耗时
    auto start = std::chrono::high_resolution_clock::now();
    std::vector<matchResult> results0Deg = calSimilarity(template_point,search_point);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> duration = end - start;
    std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;

    //保存匹配结果
    saveDataToTxt("./match_result.txt",results0Deg);

    return 0;
}

@zhangnatha
Copy link
Author

最新版本的测试结果未有加速效果,测试记录:

  • 非SSE版本
$ g++ normal_calSimilarity.cpp -O3 -o normal_calSimilarity_gcc -lstdc++ -lm
$ ./normal_calSimilarity_gcc
Function took 311.048 milliseconds.
Match result datas saved to match_result.txt!

耗时:311.048毫秒

  • SSE版本
$ g++ normal_calSimilaritySSE.cpp -O3 -mavx2 -o normal_calSimilarity_gccSSE
$ ./normal_calSimilarity_gccSSE 
Function took 817.517 milliseconds.
Match result datas saved to /tmp/match_result.txt!

耗时:817.517毫秒


考虑到:

  1. 测试数据的不可变性

  2. 数据的计算复杂度

  3. 数据类型转换

  4. 非必要条件判断

我将原先的非SSE代码修改如下:

  • 代码
//********************************************************************************************
//***********************************normal_calSimilarity.cpp********************************
//********************************************************************************************
#include <iostream>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>

struct templateFeat
{
	int x;
	int y;
	float dx;
	float dy;
};

struct matchResult
{
	int i;
	int j;
	float angle;
	float score;
};

bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
    std::ofstream file(txt_path);

    if (file.is_open()) {
        for (int i = 0; i < match_result.size(); ++i) {
            file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
        }

        file.close();
        std::cout << "Match result datas saved to match_result.txt!" << std::endl;
    } else {
        std::cout << "Unable to open file" << std::endl;
    }
    return true;
}

std::vector<matchResult> calSimilarity(const std::vector<float> search_grad_x,const std::vector<float> search_grad_y,const std::vector<templateFeat> template_grad,int width,int height)
{
	std::vector<matchResult> results0Deg;

	float anMinScore = 0.5 - 1;
	float NormMinScore = 0.5 / template_grad.size();
	float NormGreediness = ((1 - 0.9 * 0.5) / (1 - 0.9))/template_grad.size();

	for (int i = 12; i < 117; i++) 
	{
	  for (int j = 13; j < 123; j++) 
	  {
		// 去掉条件[1]
		std::vector<templateFeat> template_grad_copy = template_grad;
		for(auto it = template_grad_copy.begin(); it != template_grad_copy.end();)
		{
			int curX = i + it->x;
			int curY = j + it->y;
			if(curX < 0 || curY < 0 || curX > width - 1 || curY > height - 1)
			{
				it = template_grad_copy.erase(it);
			}
			else
			{
				++it;
			}
		}

		float PartialScore = 0;
		float PartialSum = 0;
		int SumOfCoords = 0;

		for (int m = 0; m < template_grad.size(); m++) 
		{
		  int curX = i + template_grad[m].x;
		  int curY = j + template_grad[m].y;

		//   条件[1]
		//   if (curX < 0 || curY < 0 || curX > width - 1 || curY > height - 1) {
		// 	continue; 
		//   }

		  int offSet = curY * width + curX;
		  float iTx = template_grad[m].dx;
		  float iTy = template_grad[m].dy;
		  
		  float iSx = search_grad_x[offSet];
		  float iSy = search_grad_y[offSet];

		  if ((iSx != 0 || iSy != 0) && (iTx != 0 || iTy != 0)) 
		  {
			PartialSum += (iSx * iTx) + (iSy * iTy);
		  }
		  SumOfCoords = m + 1;
		  PartialScore = PartialSum / SumOfCoords;

		  if (PartialScore < (std::min(anMinScore + NormGreediness * SumOfCoords,NormMinScore * SumOfCoords)))
			break;
		}
		if(PartialScore > 0.5)
		{
			results0Deg.push_back({i, j, 0, PartialScore});
		} 
	  }
	}
	return results0Deg;
}

int main()
{	
    int width = 1273;
    int height = 1231;
    int buffer_size = width * height;
	
    std::vector<float> search_grad_x(buffer_size);
    std::vector<float> search_grad_y(buffer_size);

    for (int i = 0; i < buffer_size; ++i) {
        search_grad_x[i] = static_cast<float>(i) * 0.5f;
    }

    for (int i = 0; i < buffer_size; ++i) {
        search_grad_y[i] = static_cast<float>(i) * 1.5f;
    }
    
    std::vector<templateFeat> template_grad;
    for (int i = 0; i < 125; ++i) {
        templateFeat feat;
        feat.x = i * 2.0;
        feat.y = i * 2.0 + 1.0;
		feat.dx = static_cast<float>(i) * 0.25f;
		feat.dy = static_cast<float>(i) * 1.5f;
        template_grad.push_back(feat);
    }
	
    //运行算法,计算耗时:std::milli 就是毫秒,std::micro 就是微秒seconds
    auto start = std::chrono::high_resolution_clock::now();
    std::vector<matchResult> results0Deg = calSimilarity(search_grad_x,search_grad_y,template_grad,width,height);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> duration = end - start;
    std::cout << "Function took " << duration.count() << " ms." << std::endl;
	
	//保存匹配结果
    saveDataToTxt("./match_result.txt",results0Deg);
	
    return 1;
}
  • 运行结果
$ g++ normal_calSimilarity.cpp -O3 -o normal_calSimilarity_gcc -lstdc++ -lm
$ ./normal_calSimilarity_gcc
Function took 10.2154 ms.
Match result datas saved to match_result.txt!

新版本的耗时:10.2154 ms

请在上面新的基准代码上进行SSE修改吧!谢谢~

对比结果文件 match_result.txt 可以使用Meld工具www.meldmerge.org/ 进行对比测试~~~

@archibate
Copy link
Contributor

archibate commented Sep 1, 2023

//********************************************************************************************
//***********************************normal_calSimilarity.cpp********************************
//********************************************************************************************
//#define USE_MULTISCORE//causing wrong result
#define USE_FIRSTUNROLL
#define USE_OPENMP
#define USE_UNROLLM4
#include <iostream>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
#include <immintrin.h>
#ifdef USE_OPENMP
#include <atomic>
#endif

struct templateFeat
{
	int x;
	int y;
	short dx;
	short dy;
	float mag;
};

#ifdef USE_MULTISCORE
struct templateFeatSimd
{
	__m256i x;
	__m256i y;
	__m128i dx;
	__m128i dy;
	__m256 mag;
};
#endif

struct matchResult
{
	int i;
	int j;
	int angle;
	float score;
};

struct searchFeat
{
	short dx;
	short dy;
	float mag;
};

bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
    std::ofstream file(txt_path);

    if (file.is_open()) {
        for (int i = 0; i < match_result.size(); ++i) {
            file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
        }

        file.close();
        std::cout << "Match result datas saved to match_result.txt!" << std::endl;
    } else {
        std::cout << "Unable to open file" << std::endl;
    }
    return true;
}

#ifdef USE_MULTISCORE
static float multiScore(templateFeatSimd const *template_point_simd,
                        searchFeat const *search_point,
                        float const *thresholds,
                        int template_feat_size,
                        int search_feat_size,
                        int i, int j, int mbase,
                        float PartialSumBase) {
    auto $PartialScore = _mm256_setzero_ps();
    auto $PartialSum = _mm256_set1_ps(PartialSumBase);
    auto $SumOfCoords = _mm256_add_ps(_mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8), _mm256_set1_ps(mbase));
    auto $8 = _mm256_set1_ps(8);
    auto $template_feat_size_plus1 = _mm256_set1_ps(template_feat_size + 1);
    auto $i = _mm256_set1_epi32(i);
    auto $j = _mm256_set1_epi32(j);
    auto $0 = _mm256_setzero_si256();
    auto $230 = _mm256_set1_epi32(230);
    auto $350 = _mm256_set1_epi32(350);

    /* printf("%d\n", mbase); */
    for(int m = mbase; m < ((template_feat_size + 7) & ~7); m += 8, $SumOfCoords = _mm256_add_ps($SumOfCoords, $8))
    {
        auto $curX = _mm256_add_epi32($i, template_point_simd[m / 8].x);
        auto $curY = _mm256_add_epi32($j, template_point_simd[m / 8].y);

        auto $notok = _mm256_or_si256(_mm256_or_si256(_mm256_castps_si256(_mm256_cmp_ps($SumOfCoords, $template_feat_size_plus1, _CMP_GE_OQ)),
                                                _mm256_or_si256(_mm256_srai_epi32($curX, 31), _mm256_srai_epi32($curY, 31))),
                                   _mm256_or_si256(_mm256_cmpgt_epi32($curX, $230), _mm256_cmpgt_epi32($curY, $350)));
        /* if (0xffff == _mm256_movemask_epi8($notok)) { */
        /*     continue; */
        /* } */

        auto $iTx = _mm256_cvtepi16_epi32(template_point_simd[m / 8].dx);
        auto $iTy = _mm256_cvtepi16_epi32(template_point_simd[m / 8].dy);
        auto $iTm = template_point_simd[m / 8].mag;

        auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), $curX));
        auto $iSxy = _mm256_i32gather_epi32((int *)search_point, $offSet, sizeof(searchFeat));
        // isx isy isx isy isx isy isx isy
        auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
        auto $iSy = _mm256_srai_epi32($iSxy, 16);
        auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point, $offSet, sizeof(searchFeat));

        auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
                                    _mm256_mul_ps(
                                    _mm256_mul_ps($iSm, $iTm),
                                    _mm256_cvtepi32_ps(
                                    _mm256_add_epi32(
                                    _mm256_mullo_epi32($iSx, $iTx),
                                    _mm256_mullo_epi32($iSy, $iTy)))));
        /* auto $cond = _mm256_mul_ps($m1, */
        /*                         _mm256_min_ps( */
        /*                         _mm256_add_ps($anMinScore, _mm256_mul_ps($NormGreediness, $m1)), */
        /*                         _mm256_mul_ps($NormMinScore, $m1))); */
        /* for (int _ = 0; _ < 4; _++) */
        /*      printf("%d %f %f %f %f %f %f %f %f %f %f %f\n", m + _, $accum[_], _mm256_cvtepi32_ps($offSet)[_], _mm256_cvtepi32_ps($curX)[_], _mm256_cvtepi32_ps($curY)[_], _mm256_cvtepi32_ps($iSx)[_], _mm256_cvtepi32_ps($iSy)[_], _mm256_cvtepi32_ps($iTx)[_], _mm256_cvtepi32_ps($iTy)[_], $iSm[_], $iTm[_], _mm256_cvtepi32_ps($notok)[_]); */
        /* exit(1); */

        /* for (int _ = 0; _ < 8; _++) */
        /*      printf("%d %f\n", _, $accum[_]); */

        // 0 1 2 3 4 5 6 7
        $accum = _mm256_add_ps($accum, _mm256_castsi256_ps(_mm256_slli_si256(_mm256_castps_si256($accum), 4)));
        // 0 1+0 2+1 3+2 4 5+4 6+5 7+6
        $accum = _mm256_add_ps($accum, _mm256_castsi256_ps(_mm256_slli_si256(_mm256_castps_si256($accum), 8)));
        // 0 1+0 2+1+0 3+2+1+0 4 5+4 6+5+4 7+6+5+4
        $accum = _mm256_add_ps($accum, _mm256_set_m128(_mm_permute_ps(_mm256_castps256_ps128($accum), 0xff), _mm_setzero_ps()));
        // 0 1+0 2+1+0 3+2+1+0 3+2+1+0+4 3+2+1+0+5+4 3+2+1+0+6+5+4 3+2+1+0+7+6+5+4

        /* for (int _ = 0; _ < 8; _++) */
        /*      printf("%d %f\n", _, $accum[_]); */
        /* exit(1); */

        $PartialScore = _mm256_add_ps($PartialSum, $accum);
        /* auto $cond = _mm256_mul_ps($SumOfCoords, _mm256_min_ps( */
        /*     _mm256_add_ps(_mm256_set1_ps(anMinScore), _mm256_mul_ps(_mm256_set1_ps(NormGreediness), $SumOfCoords)), */
        /*     _mm256_mul_ps(_mm256_set1_ps(NormMinScore), $SumOfCoords))); */
        auto $cond = _mm256_load_ps(thresholds + m);
        auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialScore, $cond, _CMP_LT_OQ));
        int cmpmask = _mm256_movemask_ps($cmp);
        if (cmpmask) [[unlikely]] {
            int bit = __builtin_ctz(cmpmask);
            return $PartialScore[bit] / (m + bit + 1);
        }
        $PartialSum = _mm256_add_ps($PartialSum, _mm256_broadcastss_ps(_mm256_castps256_ps128(_mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(_mm256_permute_ps($accum, 0xff)), 0xf)))));
    }

        /* for (int _ = 0; _ < 8; _++) */
        /*      printf("%d %f %f\n", _, $PartialSum[_], $SumOfCoords[_]); */
        /* exit(1); */

    $PartialScore = _mm256_div_ps($PartialScore, $SumOfCoords);
    return $PartialScore[(template_feat_size - 1) & 7];
}
#endif

std::vector<matchResult> const &calSimilarity(const std::vector<templateFeat> &template_point,const std::vector<searchFeat> &search_point)
{
    int template_feat_size =  template_point.size();
    int search_feat_size =  search_point.size();

#ifdef USE_MULTISCORE
    static std::vector<templateFeatSimd> template_point_simd;
    template_point_simd.resize((template_feat_size + 7) & ~7);
    for (int m = 0; m < template_point_simd.size() * 8; m += 8) {
        template_point_simd[m / 8].x = _mm256_setr_epi32(template_point[(m) % template_feat_size].x, template_point[(m + 1) % template_feat_size].x, template_point[(m + 2) % template_feat_size].x, template_point[(m + 3) % template_feat_size].x, template_point[(m + 4) % template_feat_size].x, template_point[(m + 4 + 1) % template_feat_size].x, template_point[(m + 4 + 2) % template_feat_size].x, template_point[(m + 4 + 3) % template_feat_size].x);
        template_point_simd[m / 8].y = _mm256_setr_epi32(template_point[(m) % template_feat_size].y, template_point[(m + 1) % template_feat_size].y, template_point[(m + 2) % template_feat_size].y, template_point[(m + 3) % template_feat_size].y, template_point[(m + 4) % template_feat_size].y, template_point[(m + 4 + 1) % template_feat_size].y, template_point[(m + 4 + 2) % template_feat_size].y, template_point[(m + 4 + 3) % template_feat_size].y);
        template_point_simd[m / 8].dx = _mm_setr_epi16(template_point[(m) % template_feat_size].dx, template_point[(m + 1) % template_feat_size].dx, template_point[(m + 2) % template_feat_size].dx, template_point[(m + 3) % template_feat_size].dx, template_point[(m + 4) % template_feat_size].dx, template_point[(m + 4 + 1) % template_feat_size].dx, template_point[(m + 4 + 2) % template_feat_size].dx, template_point[(m + 4 + 3) % template_feat_size].dx);
        template_point_simd[m / 8].dy = _mm_setr_epi16(template_point[(m) % template_feat_size].dy, template_point[(m + 1) % template_feat_size].dy, template_point[(m + 2) % template_feat_size].dy, template_point[(m + 3) % template_feat_size].dy, template_point[(m + 4) % template_feat_size].dy, template_point[(m + 4 + 1) % template_feat_size].dy, template_point[(m + 4 + 2) % template_feat_size].dy, template_point[(m + 4 + 3) % template_feat_size].dy);
        template_point_simd[m / 8].mag = _mm256_setr_ps(template_point[(m) % template_feat_size].mag, template_point[(m + 1) % template_feat_size].mag, template_point[(m + 2) % template_feat_size].mag, template_point[(m + 3) % template_feat_size].mag, template_point[(m + 4) % template_feat_size].mag, template_point[(m + 4 + 1) % template_feat_size].mag, template_point[(m + 4 + 2) % template_feat_size].mag, template_point[(m + 4 + 3) % template_feat_size].mag);
    }
#endif

    static std::vector<matchResult> results0Deg;
    results0Deg.resize(1920 * 1200);
#ifdef USE_OPENMP
    std::atomic<size_t> results0DegSize = 0;
#else
    size_t results0DegSize = 0;
#endif
    float anMinScore     = 0.4 - 1;
    float NormMinScore   = 0.4 / template_feat_size;
    float NormGreediness = ((1 - 0.8 * 0.4) / (1 - 0.8)) / template_feat_size;

    static std::vector<float> thresholds;
    thresholds.resize(((template_feat_size + 7) & ~7));
    for (int m = 0; m < ((template_feat_size + 7) & ~7); m++) {
        auto SumOfCoords = m + 1;
        thresholds[m] = SumOfCoords * std::min(anMinScore + NormGreediness * SumOfCoords, NormMinScore * SumOfCoords);
    }
    
#ifdef USE_OPENMP
    #pragma omp parallel for schedule(dynamic, 8)
#endif
    for(int i = 0; i < 1920; i++)
    {
        for(int j = 0; j < 1200; j += 8)
        {
            int m;
            auto $PartialSum   = _mm256_setzero_ps();
            #ifdef USE_FIRSTUNROLL
            auto $SumOfCoords  = _mm256_set1_epi32(1);
            #else
            auto $SumOfCoords  = _mm256_set1_epi32(1);
            #endif
            auto $j = _mm256_add_epi32(_mm256_set1_epi32(j), _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7));
            auto $350 = _mm256_set1_epi32(350);
            auto $loopterm = _mm256_setzero_si256();

            #ifdef USE_FIRSTUNROLL
            {
                int curX = i + template_point[0].x;
                if (!(curX < 0 || curX > 230)) [[likely]] {
                    auto $curY = _mm256_add_epi32($j, _mm256_set1_epi32(template_point[0].y));
                    auto $notok = _mm256_or_si256(_mm256_srai_epi32($curY, 31), _mm256_cmpgt_epi32($curY, $350));

                    int iTx = template_point[0].dx;
                    int iTy = template_point[0].dy;
                    float iTm = template_point[0].mag;

                    /* int offSet = curY * 350 + curX; */
                    auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), _mm256_set1_epi32(curX)));
                    auto $iSxy = _mm256_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
                    // isx isy isx isy isx isy isx isy
                    auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
                    auto $iSy = _mm256_srai_epi32($iSxy, 16);
                    auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));

                    auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
                                                _mm256_mul_ps(
                                                _mm256_mul_ps($iSm, _mm256_set1_ps(iTm)),
                                                _mm256_cvtepi32_ps(
                                                _mm256_add_epi32(
                                                _mm256_mullo_epi32($iSx, _mm256_set1_epi32(iTx)),
                                                _mm256_mullo_epi32($iSy, _mm256_set1_epi32(iTy))))));
                    $PartialSum = _mm256_add_ps($PartialSum, $accum);

                    auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialSum, _mm256_set1_ps(thresholds[0]), _CMP_LT_OQ));
                    $loopterm = _mm256_castps_si256($cmp);
                    $SumOfCoords = _mm256_add_epi32($SumOfCoords, _mm256_andnot_si256($loopterm, _mm256_set1_epi32(1)));
                }
            }
            if (_mm256_movemask_epi8($loopterm) != -1) {
                int curX = i + template_point[1].x;
                if (!(curX < 0 || curX > 230)) [[likely]] {
                    auto $curY = _mm256_add_epi32($j, _mm256_set1_epi32(template_point[1].y));
                    auto $notok = _mm256_or_si256($loopterm, _mm256_or_si256(_mm256_srai_epi32($curY, 31), _mm256_cmpgt_epi32($curY, $350)));

                    int iTx = template_point[1].dx;
                    int iTy = template_point[1].dy;
                    float iTm = template_point[1].mag;

                    /* int offSet = curY * 350 + curX; */
                    auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), _mm256_set1_epi32(curX)));
                    auto $iSxy = _mm256_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
                    // isx isy isx isy isx isy isx isy
                    auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
                    auto $iSy = _mm256_srai_epi32($iSxy, 16);
                    auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));

                    auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
                                                _mm256_mul_ps(
                                                _mm256_mul_ps($iSm, _mm256_set1_ps(iTm)),
                                                _mm256_cvtepi32_ps(
                                                _mm256_add_epi32(
                                                _mm256_mullo_epi32($iSx, _mm256_set1_epi32(iTx)),
                                                _mm256_mullo_epi32($iSy, _mm256_set1_epi32(iTy))))));
                    $PartialSum = _mm256_add_ps($PartialSum, $accum);

                    auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialSum, _mm256_set1_ps(thresholds[1]), _CMP_LT_OQ));
                    $loopterm = _mm256_or_si256($loopterm, _mm256_castps_si256($cmp));
                    $SumOfCoords = _mm256_add_epi32($SumOfCoords, _mm256_andnot_si256($loopterm, _mm256_set1_epi32(1)));
                }
                if (_mm256_movemask_epi8($loopterm) != -1) [[unlikely]] {
                    m = 2;
                    #else
                    m = 0;
                    #endif

                    #ifdef USE_UNROLLM4
                    #pragma GCC unroll 4
                    #endif
                    for(;
                        m < template_feat_size;
                        m++, $SumOfCoords = _mm256_add_epi32($SumOfCoords, _mm256_andnot_si256($loopterm, _mm256_set1_epi32(1)))
                    ) {
                        int curX = i + template_point[m].x;
                        if (curX < 0 || curX > 230) [[unlikely]] { continue; }

                        auto $curY = _mm256_add_epi32($j, _mm256_set1_epi32(template_point[m].y));
                        auto $notok = _mm256_or_si256($loopterm, _mm256_or_si256(_mm256_srai_epi32($curY, 31), _mm256_cmpgt_epi32($curY, $350)));

                        int iTx = template_point[m].dx;
                        int iTy = template_point[m].dy;
                        float iTm = template_point[m].mag;

                        /* int offSet = curY * 350 + curX; */
                        auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), _mm256_set1_epi32(curX)));
                        auto $iSxy = _mm256_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
                        // isx isy isx isy isx isy isx isy
                        auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
                        auto $iSy = _mm256_srai_epi32($iSxy, 16);
                        auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));

                        auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
                                                    _mm256_mul_ps(
                                                    _mm256_mul_ps($iSm, _mm256_set1_ps(iTm)),
                                                    _mm256_cvtepi32_ps(
                                                    _mm256_add_epi32(
                                                    _mm256_mullo_epi32($iSx, _mm256_set1_epi32(iTx)),
                                                    _mm256_mullo_epi32($iSy, _mm256_set1_epi32(iTy))))));
                        $PartialSum = _mm256_add_ps($PartialSum, $accum);

                        auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialSum, _mm256_set1_ps(thresholds[m]), _CMP_LT_OQ));
                        $loopterm = _mm256_or_si256($loopterm, _mm256_castps_si256($cmp));
                        #ifndef USE_MULTISCORE
                        if (_mm256_movemask_epi8($loopterm) == -1) {
                            break;
                        }
                        #else
                        auto popcorn = __builtin_popcount(_mm256_movemask_ps(_mm256_castsi256_ps($loopterm)));
                        if ((popcorn == 8 || popcorn >= 7) && !(m & 7)) [[unlikely]] {
                            break;
                        }
                        #endif
                    }
            #ifdef USE_FIRSTUNROLL
                }
            }
            #endif

            auto $PartialScore = _mm256_div_ps($PartialSum, _mm256_cvtepi32_ps($SumOfCoords));
            #ifdef USE_MULTISCORE
            auto mask = _mm256_movemask_ps(_mm256_castsi256_ps($loopterm));
            if (mask != 0xff && m < template_feat_size) {
                auto k = __builtin_ctz(~mask);
                $PartialScore[k] = multiScore(
                    template_point_simd.data(), search_point.data(), thresholds.data(),
                    template_feat_size, search_feat_size, i, j, m, $PartialSum[k]);
            }
            #endif
            auto $PartialScoreAbove = _mm256_cmp_ps($PartialScore, _mm256_set1_ps(0.4f), _CMP_GT_OQ);
            for (int k = 0; k < 8; k++) {
                if($PartialScoreAbove[k]) {
                /* printf("%d %d %f\n", i, j, PartialScore); */
                /*     exit(1); */
#ifdef USE_OPENMP
                    results0Deg[results0DegSize.fetch_add(1, std::memory_order_relaxed)] = {i, j + k, 0, $PartialScore[k]};
#else
                    results0Deg[results0DegSize++] = {i, j + k, 0, $PartialScore[k]};
#endif
                }
            }
        }
    }

#ifdef USE_OPENMP
    results0Deg.resize(results0DegSize.load(std::memory_order_relaxed));
#else
    results0Deg.resize(results0DegSize);
#endif
    return results0Deg;
}

int main()
{
    //为了测试函数而生成的随机数:template_point/search_point
    std::mt19937 gen(0), gen1(1), gen2(2), gen3(3), gen4(4), gen5(5);
    std::uniform_int_distribution<int> intXDist(-18, 32);
    std::uniform_int_distribution<int> intYDist(-48, 49);
    std::uniform_int_distribution<short> shortDXDist(-348, 349);
    std::uniform_int_distribution<short> shortDYDist(-421, 352);
    std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);

    std::vector<templateFeat> template_point(215);
    for (auto &feat : template_point)
    {
        feat.x = intXDist(gen1);
        feat.y = intYDist(gen2);
        feat.dx = shortDXDist(gen3);
        feat.dy = shortDYDist(gen4);
        feat.mag = floatDist(gen5);
    }

    std::uniform_int_distribution<short> shortDxDist(-460, 460);
    std::uniform_int_distribution<short> shortDyDist(-476, 460);
    std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);

    std::vector<searchFeat> search_point(2304000);
    for (auto &feat : search_point)
    {
        feat.dx = shortDxDist(gen);
        /* static int once = printf("%d\n", feat.dx); */
        feat.dy = shortDyDist(gen);
        feat.mag = float_Dist(gen);
    }

    //运行算法,计算耗时
    auto start = std::chrono::high_resolution_clock::now();
    /* for (int i = 0; i < 100; i++) */
    /*     calSimilarity(template_point,search_point); */
    std::vector<matchResult> const &results0Deg = calSimilarity(template_point,search_point);
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> duration = end - start;
    std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;

    //保存匹配结果
    saveDataToTxt("/tmp/match_result.txt",results0Deg);

    return 0;
}

修复了性能提升不大的问题,现在是418ms -> 21ms了。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants