实现TopK问题的三种算法

2019-11-03  本文已影响0人  crazyhank

在检索类的应用中往往实现TopK的应用,比如特征检索场景下,要对一个向量进行距离查询,输出距离最近的前10个向量。

那么,实现这种TopK问题,可以基于哪些算法呢?我这里给出了三种方法:

写了一个Demo程序,用于展现着三种方法的性能:

#include <sys/time.h>
#include <iostream>
#include <queue>
#include <algorithm>
#include <limits.h>

using namespace std;
static uint64_t getTimeMS()
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return (tv.tv_sec * 1000 + tv.tv_usec / 1000);
}

static void printTopK(vector<int>& nums, int k)
{
    cout << "Top" << k << " results: ";
    for (int i = 0; i < k; i++) {
        cout << nums[i] << " ";
    }
    cout << endl;
}

static void printTopK(priority_queue<int> pq, int k)
{
    cout << "Top" << k << " results: ";
    for (int i = 0; i < k; i++) {
        cout << pq.top() << " ";
        pq.pop();
    }
    cout << endl;
}



int main()
{
    int num = 10000000, k = 10;

    vector<int> myVec1;
    vector<int> myVec2;

    for (int i = 0; i < num; i++) {
        if (i < 10)
            myVec1.push_back(i);
        else
            myVec1.push_back(rand() % 1000 + 10);
    }
    myVec2 = myVec1;

    uint64_t startTime, endTime;

    startTime = getTimeMS();
    sort(myVec1.begin(), myVec1.end());
    endTime = getTimeMS();
    printTopK(myVec1, k);
    cout << "Sort costs " << (endTime - startTime) << " ms" << endl;

    priority_queue<int> pq;

    for (int i = 0; i < k; i++) {
        pq.push(INT_MAX);
    }

    startTime = getTimeMS();
    for (int i = 0; i < num; i++) {
        if (myVec2[i] < pq.top()) {
            pq.pop();
            pq.push(myVec2[i]);
        }
    }
    endTime = getTimeMS();
    cout << "Priority Queue costs " << (endTime - startTime) << " ms" << endl;
    printTopK(pq, k);

    vector<int> topK;
    startTime = getTimeMS();
    for (int i = 0; i < k; i++) {
        int min = INT_MAX;
        int index = 0;
        for (int j = 0; j < num; j++) {
            if (myVec2[j] < min) {
                min = myVec2[j];
                index = j;
            }
        }
        myVec2[index] = INT_MAX;
        topK.push_back(min);
    }
    endTime = getTimeMS();
    cout << "Loop costs " << (endTime - startTime) << " ms" << endl;
    printTopK(topK, k);

    //printTopK(myVec2, 20);
    return 0;
}

以下是三种方法运行的结果:

hank@hank-desktop:~/Study/Alg/topK$ ./main
Top10 results: 0 1 2 3 4 5 6 7 8 9
Sort costs 3987 ms
Priority Queue costs 175 ms
Top10 results: 9 8 7 6 5 4 3 2 1 0
Loop costs 432 ms
Top10 results: 0 1 2 3 4 5 6 7 8 9

构造了一个1000万个INT类型数据,随机生成,检索出最小的10个数据,可以看到基于二叉堆的实现方案性能最优,在C++语言中,二叉堆直接使用priority queue直接可以调用,非常方便。

上一篇 下一篇

猜你喜欢

热点阅读