oneAPI 并行计算 PI 的值

2020/8/31 OpenMP CUDA MPI

这篇文章的延续(OpenMP MPI CUDA 并行计算 PI 的值)

上篇文章简单地利用 OpenMP 和 CUDA 计算了 PI 的值,这次改用 oneAPI 再试一次,大体思路保持一致。

# 环境配置

  • oneAPI 2021.1-beta08、gcc 10
  • Manjaro 20
  • CPU: i7-6700HQ
  • 显卡:GTX 965M
  • 内存:2133 16G

# 串行

写了两个版本,主要是想对比额外使用数组存储随机数对速度的影响。

# 版本 0

/// Serial Monte Carlo estimate of Pi, "version 0": only the current sample is
/// kept, no array of random numbers is stored.
///
/// Draws n_points points (x, y) from a uniform distribution over [0, 1) and
/// counts how many satisfy x^2 + y^2 <= 1 (inside the quarter unit circle).
/// The hit ratio approximates Pi / 4.
///
/// @param n_points Number of random points to sample.
/// @return The estimated value of Pi. For n_points == 0 the ratio is 0/0,
///         which is NaN under IEEE-754 arithmetic.
double estimate_pi_0(size_t n_points) {
    // Step 1. Generator initialization (seeded from std::random_device)
    std::random_device rd;
    std::default_random_engine engine(rd());
    std::uniform_real_distribution<double> distr(0, 1);

    size_t n_under_curve = 0; // Number of points fallen under the curve

    // Step 2. Generate the points and count the hits in a single pass.
    // The index is size_t to match n_points: an int index would overflow
    // (undefined behavior) once n_points exceeds INT_MAX.
    for (size_t i = 0; i < n_points; ++i) {
        const double x = distr(engine);
        const double y = distr(engine);
        if (x * x + y * y <= 1.0)
            ++n_under_curve;
    }

    // Step 3. Calculate approximated value of Pi
    return n_under_curve * 4.0 / static_cast<double>(n_points);
}

# 版本 1

/// Serial Monte Carlo estimate of Pi, "version 1": all random coordinates are
/// generated and stored first, then counted in a second pass.
///
/// Allocates two vectors of n_points doubles each, fills them with values
/// drawn from a uniform distribution over [0, 1), and counts the points with
/// x^2 + y^2 <= 1. The hit ratio approximates Pi / 4.
///
/// @param n_points Number of random points to sample (also the length of
///                 each of the two temporary vectors).
/// @return The estimated value of Pi. For n_points == 0 the ratio is 0/0,
///         which is NaN under IEEE-754 arithmetic.
double estimate_pi_1(size_t n_points) {
    // Allocate storage for random numbers
    std::vector<double> x(n_points);
    std::vector<double> y(n_points);

    // Step 1. Generate n_points random numbers
    // 1.1. Generator initialization (seeded from std::random_device)
    std::random_device rd;
    std::default_random_engine engine(rd());
    std::uniform_real_distribution<double> distr(0, 1);
    // 1.2. Random number generation.
    // size_t index: an int index would overflow once n_points exceeds INT_MAX.
    for (size_t i = 0; i < n_points; ++i) {
        x[i] = distr(engine);
        y[i] = distr(engine);
    }

    // Step 2. Count the number of points fallen under the curve
    size_t n_under_curve = 0;
    for (size_t i = 0; i < n_points; ++i) {
        // Double literal, matching the element type and version 0.
        if (x[i] * x[i] + y[i] * y[i] <= 1.0)
            ++n_under_curve;
    }

    // Step 3. Calculate approximated value of Pi
    return n_under_curve * 4.0 / static_cast<double>(n_points);
}

# 并行

# OpenMP

/// OpenMP Monte Carlo estimate of Pi.
///
/// Opens a parallel region with 4 threads. Each thread builds its own
/// std::mt19937_64 generator seeded from std::random_device, the n_points
/// iterations are divided among the threads by `omp for`, and the per-thread
/// hit counts are combined through the `reduction(+)` clause.
///
/// The hit test is x^2 + y^2 <= 1, the same test the serial versions use, so
/// that timings are comparable; it avoids the costlier hypot() call.
///
/// When compiled without OpenMP support the pragmas are ignored and the loop
/// simply runs serially.
///
/// @param n_points Number of random points to sample.
/// @return The estimated value of Pi. For n_points == 0 the ratio is 0/0,
///         which is NaN under IEEE-754 arithmetic.
double estimate_pi_openmp(size_t n_points) {

	uint64_t n_under_curve = 0;

#pragma omp parallel num_threads(4)
	{
		// Random number generation: one generator per thread, so threads
		// never share generator state.
		std::random_device rd;
		std::mt19937_64 gen(rd());
		std::uniform_real_distribution<double> dis(0.0, 1.0);

#pragma omp for reduction(+:n_under_curve)
		for (size_t j = 0; j < n_points; j++)
		{
			// Declared inside the loop body, hence private to each thread
			// without needing a private() clause.
			const double x = dis(gen);
			const double y = dis(gen);
			if (x * x + y * y <= 1.0)
				n_under_curve++;
		}
	}

	return 4.0 * n_under_curve / static_cast<double>(n_points);
}

# with MKL

/// Monte Carlo estimate of Pi using the oneMKL random number generators.
///
/// Creates a SYCL queue from a GPU selector, prints the selected device name
/// to std::cout, generates the x and y coordinates into two SYCL buffers with
/// mkl::rng::generate, and then counts the points with x^2 + y^2 <= 1 in a
/// plain loop over accessors obtained from the buffers.
///
/// NOTE(review): the accessors are requested without a command-group handler,
/// so the counting loop presumably runs on the host — confirm against the
/// SYCL version in use.
///
/// @param n_points Number of random points to sample (also the size of each
///                 of the two buffers).
/// @return The estimated value of Pi. For n_points == 0 the ratio is 0/0,
///         which is NaN under IEEE-754 arithmetic.
double estimate_pi_mkl(size_t n_points) {
    double estimated_pi;          // Estimated value of Pi
    size_t n_under_curve = 0;    // Number of points fallen under the curve

    // Allocate storage for random numbers
    cl::sycl::buffer<double, 1> x_buf(cl::sycl::range<1>{n_points});
    cl::sycl::buffer<double, 1> y_buf(cl::sycl::range<1>{n_points});

    // Choose device to run on and create queue
    cl::sycl::gpu_selector selector;
    cl::sycl::queue queue(selector);

    std::cout << "Running on: " <<
        queue.get_device().get_info<cl::sycl::info::device::name>()  << " - ";
    // Step 1. Generate n_points random numbers
    // 1.1. Generator initialization (seeded from std::random_device)
    std::random_device SEED;
    mkl::rng::philox4x32x10 engine(queue, SEED());
    // Double literals, matching the distribution's value type.
    mkl::rng::uniform<double, mkl::rng::uniform_method::standard> distr(0.0, 1.0);

    // 1.2. Random number generation
    mkl::rng::generate(distr, engine, n_points, x_buf);
    mkl::rng::generate(distr, engine, n_points, y_buf);

    //Step 2. Count the number of points fallen under the curve
    auto x_acc = x_buf.template get_access<cl::sycl::access::mode::read>();
    auto y_acc = y_buf.template get_access<cl::sycl::access::mode::read>();
    // size_t index: an int index would overflow once n_points exceeds INT_MAX.
    for (size_t i = 0; i < n_points; i++) {
        if (x_acc[i] * x_acc[i] + y_acc[i] * y_acc[i] <= 1.0)
            n_under_curve++;
    }

    // Step 3. Calculate approximated value of Pi
    estimated_pi = n_under_curve / ((double)n_points) * 4.0;

    return estimated_pi;
}

# 编译及运行

# 编译

# Build with dpcpp: OpenMP and DPC++ (SYCL) enabled, -O3 optimization, MKL linked with 64-bit integers.
dpcpp -fiopenmp -fsycl -DMKL_ILP64 -lmkl_intel_ilp64 \
      -lmkl_sequential -lmkl_core -lmkl_sycl -O3 -std=c++14 \
      Pi_OpenMP_OneApi.cpp -o Pi_OpenMP_OneApi.out
  • -fiopenmp : 开启 OpenMP
  • -O3 : 启用优化
  • -fsycl : 启用 DPC++
  • -DMKL_ILP64 : 使用 64 位整数类型(可选)
  • 链接库 :
    • -lmkl_intel_ilp64
    • -lmkl_sequential
    • -lmkl_core -lmkl_sycl

# 运行

./Pi_OpenMP_OneApi.out <Number>
  • `<Number>` 为模拟点数因子:实际模拟的点数为 `Number * 4096 * 128`,其中 4096 和 128 分别是 [此文](Pi_OpenMP_CUDA_OpenACC.md) 中 CUDA 线程块和线程数目的预设参数,保持一致便于对比

# 结果展示

$ ./a.out 1
           
随机点数:524288

Serial_1        : The simulated value of pi: 3.1423873901  Relative error: 0.025297%  Takes 90.841303 ms
Serial_2        : The simulated value of pi: 3.1393356323  Relative error: 0.071843%  Takes 94.079305 ms
OpenMP          : The simulated value of pi: 3.1412429810  Relative error: 0.011130%  Takes 32.634521 ms
Intel Mkl       : The simulated value of pi: 3.1422576904  Relative error: 0.021169%  Takes 239.586461 ms

$ ./a.out 10

随机点数:5242880

Serial_1        : The simulated value of pi: 3.1406837463  Relative error: 0.028931%  Takes 909.770236 ms
Serial_2        : The simulated value of pi: 3.1408027649  Relative error: 0.025143%  Takes 949.799343 ms
OpenMP          : The simulated value of pi: 3.1410621643  Relative error: 0.016886%  Takes 246.051302 ms
Intel Mkl       : The simulated value of pi: 3.1413108826  Relative error: 0.008969%  Takes 276.707375 ms

$ ./a.out 100

随机点数:52428800

Serial_1        : The simulated value of pi: 3.1415371704  Relative error: 0.001766%  Takes 9159.288490 ms
Serial_2        : The simulated value of pi: 3.1415546417  Relative error: 0.001210%  Takes 9863.436424 ms
OpenMP          : The simulated value of pi: 3.1416116333  Relative error: 0.000604%  Takes 2459.353222 ms
Intel Mkl       : The simulated value of pi: 3.1411964417  Relative error: 0.012612%  Takes 684.947111 ms

# 完整代码

点此下载

# TODO

  • 完善代码
  • 添加 MKL 优化 示例
  • 添加 USM 示例
  • 添加队列示例
Last Updated: 2023-10-29T08:26:04.000Z