Sorting in CUDA (code-only edition, code provided by GPT-4)

2023-08-14  寽虎非虫003

Version 1: odd-even transposition sort that runs and sorts correctly

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define N 512

__global__ void gpu_odd_even_sort(int *d, int n, int phase) {
    int index = threadIdx.x + blockDim.x * blockIdx.x;
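    // Each thread handles one adjacent pair per phase:
    // even phase -> (2*index, 2*index+1), odd phase -> (2*index+1, 2*index+2).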
    if (phase % 2 == 0) { // even phase
        if (index < n / 2) {
            int i = 2 * index;
            if (d[i] > d[i + 1]) {
                // swap elements
                int temp = d[i];
                d[i] = d[i + 1];
                d[i + 1] = temp;
            }
        }
    } else { // odd phase
        if (index < (n - 1) / 2) {
            int i = 2 * index + 1;
            if (d[i] > d[i + 1]) {
                // swap elements
                int temp = d[i];
                d[i] = d[i + 1];
                d[i + 1] = temp;
            }
        }
    }
}


int main() {
    int h[N];
    int *d;

    // Initialize host array
    for (int i = 0; i < N; i++) {
        // h[i] = rand() % N;
        h[i] = N-i;
    }

    // Print the initial data
    printf("====== Initial data ======\n");
    for (size_t i = 0; i < N; i++)
    {
        printf("%d\t",h[i]);
    }
    printf("\n");

    // Allocate device memory
    cudaMalloc((void**)&d, sizeof(int) * N);

    // Copy host array to device array
    cudaMemcpy(d, h, sizeof(int) * N, cudaMemcpyHostToDevice);

    // Launch one kernel per phase; odd-even transposition sort needs N phases in the worst case
    for (int i = 0; i < N; i++) {
        gpu_odd_even_sort<<<N / 256, 256>>>(d, N, i);
        cudaDeviceSynchronize();
    }

    // Copy back to host
    cudaMemcpy(h, d, sizeof(int) * N, cudaMemcpyDeviceToHost);

    // Cleanup
    cudaFree(d);

    // Print the sorted result
    printf("====== Result ======\n");
    for (size_t i = 0; i < N; i++)
    {
        printf("%d\t",h[i]);
    }
    printf("\n");

    return 0;
}

Result

(screenshot: output of the run)

The drawback is still obvious: the host has to launch the kernel N times in a row, once per phase.
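
For an array that fits into a single thread block's shared memory (N = 512 here), one way around this is to run all N phases inside a single kernel and separate them with __syncthreads(). A rough sketch under that assumption (the kernel name is mine, and it only handles n up to the block size, i.e. at most 1024 elements):

__global__ void odd_even_sort_single_block(int *d, int n) {
    extern __shared__ int s[];       // dynamically sized shared copy of the array
    int tid = threadIdx.x;

    if (tid < n) s[tid] = d[tid];    // stage the data in shared memory
    __syncthreads();

    for (int phase = 0; phase < n; ++phase) {
        int i = (phase % 2 == 0) ? 2 * tid : 2 * tid + 1;
        if (i + 1 < n && s[i] > s[i + 1]) {
            int tmp = s[i];
            s[i] = s[i + 1];
            s[i + 1] = tmp;
        }
        __syncthreads();             // every phase must finish before the next starts
    }

    if (tid < n) d[tid] = s[tid];    // write the sorted block back to global memory
}

/* Launch: one block, N threads, N ints of dynamic shared memory:
 *   odd_even_sort_single_block<<<1, N, N * sizeof(int)>>>(d, N);
 */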

Version 2: bitonic sort, which also runs correctly

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define N 512

__global__ void bitonic_sort_step(int *dev_values, int j, int k) {
    unsigned int i, ixj; /* Sorting partners: i and ixj */
    i = threadIdx.x + blockDim.x * blockIdx.x;
    ixj = i^j;
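    /* Example: with j = 4, thread i = 2 (binary 0010) is paired with ixj = 6 (0110).
       Only the lower-indexed thread of each pair performs the compare-and-swap below,
       and the (i & k) test decides whether the pair is ordered ascending or descending. */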

    /* The threads with the lowest ids sort the array. */
    if ((ixj)>i) {
        if ((i&k)==0) {
            /* Sort ascending */
            if (dev_values[i]>dev_values[ixj]) {
                /* exchange(i,ixj); */
                int temp = dev_values[i];
                dev_values[i] = dev_values[ixj];
                dev_values[ixj] = temp;
            }
        }
        if ((i&k)!=0) {
            /* Sort descending */
            if (dev_values[i]<dev_values[ixj]) {
                /* exchange(i,ixj); */
                int temp = dev_values[i];
                dev_values[i] = dev_values[ixj];
                dev_values[ixj] = temp;
            }
        }
    }
}

/**
 * Inplace bitonic sort using CUDA.
 */
void bitonic_sort(int *values) {
    int *dev_values;
    size_t size = N * sizeof(int);

    cudaMalloc((void**) &dev_values, size);
    cudaMemcpy(dev_values, values, size, cudaMemcpyHostToDevice);

    dim3 blocks(N/256, 1);   /* Number of blocks: N threads in total, one per element */
    dim3 threads(256, 1);    /* Threads per block */

    int j, k;
    static int iter_times{0};
    /* Major step */
    for (k = 2; k <= N; k <<= 1) {
        /* Minor step */
        for (j=k>>1; j>0; j=j>>1) {
            bitonic_sort_step<<<blocks, threads>>>(dev_values, j, k);
            printf("iter time is %d,k= %d, j=%d\n",++iter_times,k,j);
        }
    }
    cudaMemcpy(values, dev_values, size, cudaMemcpyDeviceToHost);
    cudaFree(dev_values);
}

int main() {
    int h[N];
    for (int i = 0; i < N; i++) {
        h[i] = N-i;
    }

    bitonic_sort(h);

    for (int i = 0; i < N; i++) {
        printf("%d ", h[i]);
    }
    printf("\n");

    return 0;
}
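
Both versions above are judged correct by eyeballing the printed output; a small host-side check makes that mechanical. (The iter_times counter also shows the gain over version 1: the double loop issues only 1 + 2 + ... + 9 = 45 kernel launches for N = 512, instead of 512.) A minimal sketch, assuming the h array from main (verify_sorted is my own helper name):

#include <algorithm>   // std::is_sorted
#include <cstdio>

// Call after the sorted data has been copied back to the host,
// e.g. right after bitonic_sort(h) in main().
void verify_sorted(const int *h, int n) {
    if (std::is_sorted(h, h + n)) {
        printf("OK: %d elements are in ascending order\n", n);
    } else {
        printf("FAIL: array is not sorted\n");
    }
}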

Version 3: Thrust, which also runs correctly

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <cstdio>    // printf
#include <cstdlib>   // rand

#define N 512

int main() {
    // Generate random numbers on the host
    thrust::host_vector<int> h(N);
    for (int i = 0; i < N; i++) {
        h[i] = rand() % N;
    }

    // Copy data to device
    thrust::device_vector<int> d = h;

    // Sort data on the device (for built-in types such as int, Thrust dispatches
    // to a parallel radix sort, not quicksort)
    thrust::sort(d.begin(), d.end());

    // Copy data back to host
    thrust::copy(d.begin(), d.end(), h.begin());

    // Print sorted data
    for (int i = 0; i < N; i++) {
        printf("%d ", h[i]);
    }
    printf("\n");

    return 0;
}
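
thrust::sort also accepts a comparator, and thrust::sort_by_key sorts one sequence by the order of another, so common variations need no hand-written kernels either. A short sketch (the wrapper function and its arguments are just for illustration):

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/functional.h>   // thrust::greater

// Illustrative wrapper: descending sort and key-value sort on the device.
void thrust_variants(thrust::device_vector<int> &d,
                     thrust::device_vector<int> &keys,
                     thrust::device_vector<int> &vals) {
    // Descending order: supply thrust::greater<int>() as the comparator.
    thrust::sort(d.begin(), d.end(), thrust::greater<int>());

    // Reorder vals so that they follow the ascending order of keys.
    thrust::sort_by_key(keys.begin(), keys.end(), vals.begin());
}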