diff --git a/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cluster/cg.DVMH_cuda.cu_opt b/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cluster/cg.DVMH_cuda.cu_opt new file mode 100644 index 0000000..90ed1a4 --- /dev/null +++ b/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cluster/cg.DVMH_cuda.cu_opt @@ -0,0 +1,2285 @@ + +#include +#define dcmplx2 Complex +#define cmplx2 Complex +typedef int __indexTypeInt; +typedef long long __indexTypeLLong; + + + + + +//--------------------- Kernel for loop on line 229 --------------------- + + __global__ void loop_cg_229_cuda_kernel_int(double _x[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Local needs + __indexTypeInt _i; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_i <= end_1) + { + +// Loop body + _x[_i] = 1.0e0; + } + } + + +//--------------------- Kernel for loop on line 229 --------------------- + + __global__ void loop_cg_229_cuda_kernel_llong(double _x[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _i; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_i <= end_1) + { + +// Loop body + _x[_i] = 1.0e0; + } + } + + +//--------------------- Kernel for loop on line 233 --------------------- + + __global__ void loop_cg_233_cuda_kernel_int(double _p[], double _r[], double _z[], double _q[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _q[_j] = 0.0e0; + _z[_j] = 0.0e0; + _r[_j] = 0.0e0; + _p[_j] = 0.0e0; + } + } + + +//--------------------- Kernel for loop on line 233 --------------------- + + __global__ void loop_cg_233_cuda_kernel_llong(double _p[], double _r[], double _z[], double _q[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _q[_j] = 0.0e0; + _z[_j] = 0.0e0; + _r[_j] = 0.0e0; + _p[_j] = 0.0e0; + } + } + + +//--------------------- Kernel for loop on line 272 --------------------- + + __global__ void loop_cg_272_cuda_kernel_int(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1; + _norm_temp2 = _z[_j] * _z[_j] + _norm_temp2; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1); + _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2); + if (_j % warpSize == 0) + { + norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2; + norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1; + } + } + + +//--------------------- Kernel for loop on line 272 --------------------- + + __global__ void loop_cg_272_cuda_kernel_llong(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1; + _norm_temp2 = _z[_j] * _z[_j] + _norm_temp2; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1); + _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2); + if (_j % warpSize == 0) + { + norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2; + norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1; + } + } + + +//--------------------- Kernel for loop on line 285 --------------------- + + __global__ void loop_cg_285_cuda_kernel_int(double _x[], double _z[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _norm_temp2) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _x[_j] = _z[_j] * _norm_temp2; + } + } + + +//--------------------- Kernel for loop on line 285 --------------------- + + __global__ void loop_cg_285_cuda_kernel_llong(double _x[], double _z[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _norm_temp2) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _x[_j] = _z[_j] * _norm_temp2; + } + } + + +//--------------------- Kernel for loop on line 301 --------------------- + + __global__ void loop_cg_301_cuda_kernel_int(double _x[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Local needs + __indexTypeInt _i; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_i <= end_1) + { + +// Loop body + _x[_i] = 1.0e0; + } + } + + +//--------------------- Kernel for loop on line 301 --------------------- + + __global__ void loop_cg_301_cuda_kernel_llong(double _x[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _i; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_i <= end_1) + { + +// Loop body + _x[_i] = 1.0e0; + } + } + + +//--------------------- Kernel for loop on line 347 --------------------- + + __global__ void loop_cg_347_cuda_kernel_int(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1; + _norm_temp2 = _z[_j] * _z[_j] + _norm_temp2; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1); + _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2); + if (_j % warpSize == 0) + { + norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2; + norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1; + } + } + + +//--------------------- Kernel for loop on line 347 --------------------- + + __global__ void loop_cg_347_cuda_kernel_llong(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1; + _norm_temp2 = _z[_j] * _z[_j] + _norm_temp2; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1); + _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2); + if (_j % warpSize == 0) + { + norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2; + norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1; + } + } + + +//--------------------- Kernel for loop on line 367 --------------------- + + __global__ void loop_cg_367_cuda_kernel_int(double _x[], double _z[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _norm_temp2) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _x[_j] = _z[_j] * _norm_temp2; + } + } + + +//--------------------- Kernel for loop on line 367 --------------------- + + __global__ void loop_cg_367_cuda_kernel_llong(double _x[], double _z[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _norm_temp2) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _x[_j] = _z[_j] * _norm_temp2; + } + } + + +//--------------------- Kernel for loop on line 522 --------------------- + + __global__ void loop_cg_522_cuda_kernel_int(double _p[], double _r[], double _x[], double _z[], double _q[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Private variables + double _d; + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _q[_j] = 0.0e0; + _z[_j] = 0.0e0; + _d = _x[_j]; + _r[_j] = _d; + _p[_j] = _d; + } + } + + +//--------------------- Kernel for loop on line 522 --------------------- + + __global__ void loop_cg_522_cuda_kernel_llong(double _p[], double _r[], double _x[], double _z[], double _q[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Private variables + double _d; + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _q[_j] = 0.0e0; + _z[_j] = 0.0e0; + _d = _x[_j]; + _r[_j] = _d; + _p[_j] = _d; + } + } + + +//--------------------- Kernel for loop on line 537 --------------------- + + __global__ void loop_cg_537_cuda_kernel_int(double _r[], double _rho, double rho_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _rho = _r[_j] * _r[_j] + _rho; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _rho = __dvmh_blockReduceSum(_rho); + if (_j % warpSize == 0) + { + rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho; + } + } + + +//--------------------- Kernel for loop on line 537 --------------------- + + __global__ void loop_cg_537_cuda_kernel_llong(double _r[], double _rho, double rho_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _rho = _r[_j] * _r[_j] + _rho; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _rho = __dvmh_blockReduceSum(_rho); + if (_j % warpSize == 0) + { + rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho; + } + } + + +//--------------------- Kernel for loop on line 558 --------------------- + + __global__ void loop_cg_558_cuda_kernel_int(double _p_rma[], double _q[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Private variables + int _k; + double _sum; + int cond_0; + int __k; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = gid / warpSize; + int lid = gid % warpSize; + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize; + if (_j <= end_1) + { + +// Loop body + _sum = 0.e0; + for (_k = _rowstr[_j] + lid, + (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? + cond_0 = (-1) : + cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1), + __k = 0 + lid ; + __k < cond_0 ; + _k = _k + warpSize, __k = __k + warpSize) + { + _sum = _p_rma[_colidx[_k]] * _a[_k] + _sum; + } + _sum = __dvmh_warpReduceSum(_sum); + if (lid == 0) { + _q[_j] = _sum; + } + } + } + + +//--------------------- Kernel for loop on line 558 --------------------- + + __global__ void loop_cg_558_cuda_kernel_llong(double _p_rma[], double _q[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Private variables + int _k; + double _sum; + int cond_0; + int __k; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = gid / warpSize; + int lid = gid % warpSize; +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize; + if (_j <= end_1) + { + +// Loop body + _sum = 0.e0; + for (_k = _rowstr[_j] + lid, + (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? + cond_0 = (-1) : + cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1), + __k = 0 + lid ; + __k < cond_0 ; + _k = _k + warpSize, __k = __k + warpSize) + { + _sum = _p_rma[_colidx[_k]] * _a[_k] + _sum; + } + _sum = __dvmh_warpReduceSum(_sum); + if (lid == 0) { + _q[_j] = _sum; + } + } + } + + +//--------------------- Kernel for loop on line 567 --------------------- + + __global__ void loop_cg_567_cuda_kernel_int(double _q[], double _p[], double _d, double d_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _d = _q[_j] * _p[_j] + _d; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _d = __dvmh_blockReduceSum(_d); + if (_j % warpSize == 0) + { + d_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _d; + } + } + + +//--------------------- Kernel for loop on line 567 --------------------- + + __global__ void loop_cg_567_cuda_kernel_llong(double _q[], double _p[], double _d, double d_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _d = _q[_j] * _p[_j] + _d; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _d = __dvmh_blockReduceSum(_d); + if (_j % warpSize == 0) + { + d_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _d; + } + } + + +//--------------------- Kernel for loop on line 577 --------------------- + + __global__ void loop_cg_577_cuda_kernel_int(double _q[], double _r[], double _p[], double _z[], double _rho, double rho_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _alpha) + { + +// Private variables + double _d; + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _z[_j] = _p[_j] * _alpha + _z[_j]; + _d = (-(_alpha * _q[_j])) + _r[_j]; + _r[_j] = _d; + _rho = _d * _d + _rho; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _rho = __dvmh_blockReduceSum(_rho); + if (_j % warpSize == 0) + { + rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho; + } + } + + +//--------------------- Kernel for loop on line 577 --------------------- + + __global__ void loop_cg_577_cuda_kernel_llong(double _q[], double _r[], double _p[], double _z[], double _rho, double rho_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _alpha) + { + +// Private variables + double _d; + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _z[_j] = _p[_j] * _alpha + _z[_j]; + _d = (-(_alpha * _q[_j])) + _r[_j]; + _r[_j] = _d; + _rho = _d * _d + _rho; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _rho = __dvmh_blockReduceSum(_rho); + if (_j % warpSize == 0) + { + rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho; + } + } + + +//--------------------- Kernel for loop on line 588 --------------------- + + __global__ void loop_cg_588_cuda_kernel_int(double _p[], double _r[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _beta) + { + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _p[_j] = _p[_j] * _beta + _r[_j]; + } + } + + +//--------------------- Kernel for loop on line 588 --------------------- + + __global__ void loop_cg_588_cuda_kernel_llong(double _p[], double _r[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _beta) + { + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _p[_j] = _p[_j] * _beta + _r[_j]; + } + } + + +//--------------------- Kernel for loop on line 605 --------------------- + + __global__ void loop_cg_605_cuda_kernel_int(double _z_rma[], double _r[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Private variables + int _k; + double _d; + int cond_0; + int __k; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = gid / warpSize; + int lid = gid % warpSize; +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize; + if (_j <= end_1) + { + +// Loop body + _d = 0.e0; + for (_k = _rowstr[_j] + lid, + (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? + cond_0 = (-1) : + cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1), + __k = 0 + lid; + __k < cond_0 ; + _k = _k + warpSize, __k = __k + warpSize) + { + _d = _z_rma[_colidx[_k]] * _a[_k] + _d; + } + _d = __dvmh_warpReduceSum(_d); + if (lid == 0) { + _r[_j] = _d; + } + } + } + + +//--------------------- Kernel for loop on line 605 --------------------- + + __global__ void loop_cg_605_cuda_kernel_llong(double _z_rma[], double _r[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Private variables + int _k; + double _d; + int cond_0; + int __k; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int tid = gid / warpSize; + int lid = gid % warpSize; +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize; + if (_j <= end_1) + { + +// Loop body + _d = 0.e0; + for (_k = _rowstr[_j] + lid, + (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? + cond_0 = (-1) : + cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1), + __k = 0 + lid; + __k < cond_0 ; + _k = _k + warpSize, __k = __k + warpSize) + { + _d = _z_rma[_colidx[_k]] * _a[_k] + _d; + } + _d = __dvmh_warpReduceSum(_d); + if (lid == 0) { + _r[_j] = _d; + } + } + } + + +//--------------------- Kernel for loop on line 618 --------------------- + + __global__ void loop_cg_618_cuda_kernel_int(double _r[], double _x[], double _sum, double sum_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks) + { + +// Private variables + double _d; + +// Local needs + __indexTypeInt _j; + __indexTypeInt rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _d = (-_r[_j]) + _x[_j]; + _sum = _d * _d + _sum; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _sum = __dvmh_blockReduceSum(_sum); + if (_j % warpSize == 0) + { + sum_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _sum; + } + } + + +//--------------------- Kernel for loop on line 618 --------------------- + + __global__ void loop_cg_618_cuda_kernel_llong(double _r[], double _x[], double _sum, double sum_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks) + { + +// Private variables + double _d; + +// Local needs + __indexTypeLLong _j; + __indexTypeLLong rest_blocks, cur_blocks; + +// Calculate each thread's loop variables' values + rest_blocks = add_blocks + blockIdx.x; + cur_blocks = rest_blocks; + _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x); + if (_j <= end_1) + { + +// Loop body + _d = (-_r[_j]) + _x[_j]; + _sum = _d * _d + _sum; + } + +// Reduction + _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y); + _sum = __dvmh_blockReduceSum(_sum); + if (_j % warpSize == 0) + { + sum_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _sum; + } + } + + + +#ifdef _MS_F_ +#define loop_cg_229_cuda_ loop_cg_229_cuda +#define loop_cg_233_cuda_ loop_cg_233_cuda +#define loop_cg_272_cuda_ loop_cg_272_cuda +#define loop_cg_285_cuda_ loop_cg_285_cuda +#define loop_cg_301_cuda_ loop_cg_301_cuda +#define loop_cg_347_cuda_ loop_cg_347_cuda +#define loop_cg_367_cuda_ loop_cg_367_cuda +#define loop_cg_522_cuda_ loop_cg_522_cuda +#define loop_cg_537_cuda_ loop_cg_537_cuda +#define loop_cg_558_cuda_ loop_cg_558_cuda +#define loop_cg_567_cuda_ loop_cg_567_cuda +#define loop_cg_577_cuda_ loop_cg_577_cuda +#define loop_cg_588_cuda_ loop_cg_588_cuda +#define loop_cg_605_cuda_ loop_cg_605_cuda +#define loop_cg_618_cuda_ loop_cg_618_cuda +#endif + +extern "C" { + extern DvmType loop_cg_618_cuda_kernel_llong_regs, loop_cg_618_cuda_kernel_int_regs, loop_cg_605_cuda_kernel_llong_regs, loop_cg_605_cuda_kernel_int_regs, loop_cg_588_cuda_kernel_llong_regs, loop_cg_588_cuda_kernel_int_regs, loop_cg_577_cuda_kernel_llong_regs, loop_cg_577_cuda_kernel_int_regs, loop_cg_567_cuda_kernel_llong_regs, loop_cg_567_cuda_kernel_int_regs, loop_cg_558_cuda_kernel_llong_regs, loop_cg_558_cuda_kernel_int_regs, loop_cg_537_cuda_kernel_llong_regs, loop_cg_537_cuda_kernel_int_regs, loop_cg_522_cuda_kernel_llong_regs, loop_cg_522_cuda_kernel_int_regs, loop_cg_367_cuda_kernel_llong_regs, loop_cg_367_cuda_kernel_int_regs, loop_cg_347_cuda_kernel_llong_regs, loop_cg_347_cuda_kernel_int_regs, loop_cg_301_cuda_kernel_llong_regs, loop_cg_301_cuda_kernel_int_regs, loop_cg_285_cuda_kernel_llong_regs, loop_cg_285_cuda_kernel_int_regs, loop_cg_272_cuda_kernel_llong_regs, loop_cg_272_cuda_kernel_int_regs, loop_cg_233_cuda_kernel_llong_regs, loop_cg_233_cuda_kernel_int_regs, loop_cg_229_cuda_kernel_llong_regs, loop_cg_229_cuda_kernel_int_regs; + + +// CUDA handler for loop on line 229 + + void loop_cg_229_cuda_(DvmType *loop_ref, DvmType _x[]) + { + void *x_base; + DvmType d_x[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + x_base = dvmh_get_natural_base(&device_num, _x); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, x_base, _x, d_x); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_229_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_229_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_229_cuda_kernel_int<<>>((double *)x_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_229_cuda_kernel_llong<<>>((double *)x_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 233 + + void loop_cg_233_cuda_(DvmType *loop_ref, DvmType _p[], DvmType _r[], DvmType _z[], DvmType _q[]) + { + void *p_base, *r_base, *z_base, *q_base; + DvmType d_p[4], d_r[4], d_z[4], d_q[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + p_base = dvmh_get_natural_base(&device_num, _p); + r_base = dvmh_get_natural_base(&device_num, _r); + z_base = dvmh_get_natural_base(&device_num, _z); + q_base = dvmh_get_natural_base(&device_num, _q); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, p_base, _p, d_p); + dvmh_fill_header_(&device_num, r_base, _r, d_r); + dvmh_fill_header_(&device_num, z_base, _z, d_z); + dvmh_fill_header_(&device_num, q_base, _q, d_q); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_233_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_233_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_233_cuda_kernel_int<<>>((double *)p_base, (double *)r_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_233_cuda_kernel_llong<<>>((double *)p_base, (double *)r_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 272 + + void loop_cg_272_cuda_(DvmType *loop_ref, DvmType _z[], DvmType _x[]) + { + void *z_base, *x_base; + DvmType d_z[4], d_x[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *norm_temp2_grid; + double _norm_temp2; + void *norm_temp1_grid; + double _norm_temp1; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &norm_temp1_grid, 0); + loop_red_init_(loop_ref, &red_num, &_norm_temp1, 0); + red_num = 2; + loop_cuda_register_red(loop_ref, red_num, &norm_temp2_grid, 0); + loop_red_init_(loop_ref, &red_num, &_norm_temp2, 0); + +// Get 'natural' bases + z_base = dvmh_get_natural_base(&device_num, _z); + x_base = dvmh_get_natural_base(&device_num, _x); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, z_base, _z, d_z); + dvmh_fill_header_(&device_num, x_base, _x, d_x); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_272_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_272_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + red_num = 2; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_272_cuda_kernel_int<<>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_272_cuda_kernel_llong<<>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + +// Finish reduction + red_num = 1; + loop_red_finish(loop_ref, red_num); + red_num = 2; + loop_red_finish(loop_ref, red_num); + } + + +// CUDA handler for loop on line 285 + + void loop_cg_285_cuda_(DvmType *loop_ref, DvmType _x[], DvmType _z[], double *_norm_temp2) + { + void *x_base, *z_base; + DvmType d_x[4], d_z[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + x_base = dvmh_get_natural_base(&device_num, _x); + z_base = dvmh_get_natural_base(&device_num, _z); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, x_base, _x, d_x); + dvmh_fill_header_(&device_num, z_base, _z, d_z); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_285_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_285_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_285_cuda_kernel_int<<>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2); + } + else + { + loop_cg_285_cuda_kernel_llong<<>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 301 + + void loop_cg_301_cuda_(DvmType *loop_ref, DvmType _x[]) + { + void *x_base; + DvmType d_x[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + x_base = dvmh_get_natural_base(&device_num, _x); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, x_base, _x, d_x); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_301_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_301_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_301_cuda_kernel_int<<>>((double *)x_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_301_cuda_kernel_llong<<>>((double *)x_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 347 + + void loop_cg_347_cuda_(DvmType *loop_ref, DvmType _z[], DvmType _x[]) + { + void *z_base, *x_base; + DvmType d_z[4], d_x[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *norm_temp2_grid; + double _norm_temp2; + void *norm_temp1_grid; + double _norm_temp1; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &norm_temp1_grid, 0); + loop_red_init_(loop_ref, &red_num, &_norm_temp1, 0); + red_num = 2; + loop_cuda_register_red(loop_ref, red_num, &norm_temp2_grid, 0); + loop_red_init_(loop_ref, &red_num, &_norm_temp2, 0); + +// Get 'natural' bases + z_base = dvmh_get_natural_base(&device_num, _z); + x_base = dvmh_get_natural_base(&device_num, _x); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, z_base, _z, d_z); + dvmh_fill_header_(&device_num, x_base, _x, d_x); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_347_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_347_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + red_num = 2; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_347_cuda_kernel_int<<>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_347_cuda_kernel_llong<<>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + +// Finish reduction + red_num = 1; + loop_red_finish(loop_ref, red_num); + red_num = 2; + loop_red_finish(loop_ref, red_num); + } + + +// CUDA handler for loop on line 367 + + void loop_cg_367_cuda_(DvmType *loop_ref, DvmType _x[], DvmType _z[], double *_norm_temp2) + { + void *x_base, *z_base; + DvmType d_x[4], d_z[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + x_base = dvmh_get_natural_base(&device_num, _x); + z_base = dvmh_get_natural_base(&device_num, _z); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, x_base, _x, d_x); + dvmh_fill_header_(&device_num, z_base, _z, d_z); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_367_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_367_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_367_cuda_kernel_int<<>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2); + } + else + { + loop_cg_367_cuda_kernel_llong<<>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 522 + + void loop_cg_522_cuda_(DvmType *loop_ref, DvmType _p[], DvmType _r[], DvmType _x[], DvmType _z[], DvmType _q[]) + { + void *p_base, *r_base, *x_base, *z_base, *q_base; + DvmType d_p[4], d_r[4], d_x[4], d_z[4], d_q[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + p_base = dvmh_get_natural_base(&device_num, _p); + r_base = dvmh_get_natural_base(&device_num, _r); + x_base = dvmh_get_natural_base(&device_num, _x); + z_base = dvmh_get_natural_base(&device_num, _z); + q_base = dvmh_get_natural_base(&device_num, _q); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, p_base, _p, d_p); + dvmh_fill_header_(&device_num, r_base, _r, d_r); + dvmh_fill_header_(&device_num, x_base, _x, d_x); + dvmh_fill_header_(&device_num, z_base, _z, d_z); + dvmh_fill_header_(&device_num, q_base, _q, d_q); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_522_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_522_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_522_cuda_kernel_int<<>>((double *)p_base, (double *)r_base, (double *)x_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_522_cuda_kernel_llong<<>>((double *)p_base, (double *)r_base, (double *)x_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 537 + + void loop_cg_537_cuda_(DvmType *loop_ref, DvmType _r[]) + { + void *r_base; + DvmType d_r[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *rho_grid; + double _rho; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &rho_grid, 0); + loop_red_init_(loop_ref, &red_num, &_rho, 0); + +// Get 'natural' bases + r_base = dvmh_get_natural_base(&device_num, _r); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, r_base, _r, d_r); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_537_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_537_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_537_cuda_kernel_int<<>>((double *)r_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_537_cuda_kernel_llong<<>>((double *)r_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + +// Finish reduction + red_num = 1; + loop_red_finish(loop_ref, red_num); + } + + +// CUDA handler for loop on line 558 + + void loop_cg_558_cuda_(DvmType *loop_ref, DvmType _p_rma[], DvmType _q[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[]) + { + void *p_rma_base, *q_base, *colidx_base, *a_base, *rowstr_base; + DvmType d_p_rma[4], d_q[4], d_colidx[4], d_a[4], d_rowstr[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + p_rma_base = dvmh_get_natural_base(&device_num, _p_rma); + q_base = dvmh_get_natural_base(&device_num, _q); + colidx_base = dvmh_get_natural_base(&device_num, _colidx); + a_base = dvmh_get_natural_base(&device_num, _a); + rowstr_base = dvmh_get_natural_base(&device_num, _rowstr); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, p_rma_base, _p_rma, d_p_rma); + dvmh_fill_header_(&device_num, q_base, _q, d_q); + dvmh_fill_header_(&device_num, colidx_base, _colidx, d_colidx); + dvmh_fill_header_(&device_num, a_base, _a, d_a); + dvmh_fill_header_(&device_num, rowstr_base, _rowstr, d_rowstr); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_558_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_558_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks * dvmh_get_warp_size(loop_ref); + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + maxBlocks = maxBlocks / dvmh_get_warp_size(loop_ref) * dvmh_get_warp_size(loop_ref); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_558_cuda_kernel_int<<>>((double *)p_rma_base, (double *)q_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_558_cuda_kernel_llong<<>>((double *)p_rma_base, (double *)q_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 567 + + void loop_cg_567_cuda_(DvmType *loop_ref, DvmType _q[], DvmType _p[]) + { + void *q_base, *p_base; + DvmType d_q[4], d_p[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *d_grid; + double _d; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &d_grid, 0); + loop_red_init_(loop_ref, &red_num, &_d, 0); + +// Get 'natural' bases + q_base = dvmh_get_natural_base(&device_num, _q); + p_base = dvmh_get_natural_base(&device_num, _p); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, q_base, _q, d_q); + dvmh_fill_header_(&device_num, p_base, _p, d_p); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_567_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_567_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_567_cuda_kernel_int<<>>((double *)q_base, (double *)p_base, _d, (double *)d_grid, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_567_cuda_kernel_llong<<>>((double *)q_base, (double *)p_base, _d, (double *)d_grid, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + +// Finish reduction + red_num = 1; + loop_red_finish(loop_ref, red_num); + } + + +// CUDA handler for loop on line 577 + + void loop_cg_577_cuda_(DvmType *loop_ref, DvmType _q[], DvmType _r[], DvmType _p[], DvmType _z[], double *_alpha) + { + void *q_base, *r_base, *p_base, *z_base; + DvmType d_q[4], d_r[4], d_p[4], d_z[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *rho_grid; + double _rho; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &rho_grid, 0); + loop_red_init_(loop_ref, &red_num, &_rho, 0); + +// Get 'natural' bases + q_base = dvmh_get_natural_base(&device_num, _q); + r_base = dvmh_get_natural_base(&device_num, _r); + p_base = dvmh_get_natural_base(&device_num, _p); + z_base = dvmh_get_natural_base(&device_num, _z); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, q_base, _q, d_q); + dvmh_fill_header_(&device_num, r_base, _r, d_r); + dvmh_fill_header_(&device_num, p_base, _p, d_p); + dvmh_fill_header_(&device_num, z_base, _z, d_z); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_577_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_577_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_577_cuda_kernel_int<<>>((double *)q_base, (double *)r_base, (double *)p_base, (double *)z_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks, *_alpha); + } + else + { + loop_cg_577_cuda_kernel_llong<<>>((double *)q_base, (double *)r_base, (double *)p_base, (double *)z_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks, *_alpha); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + +// Finish reduction + red_num = 1; + loop_red_finish(loop_ref, red_num); + } + + +// CUDA handler for loop on line 588 + + void loop_cg_588_cuda_(DvmType *loop_ref, DvmType _p[], DvmType _r[], double *_beta) + { + void *p_base, *r_base; + DvmType d_p[4], d_r[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + p_base = dvmh_get_natural_base(&device_num, _p); + r_base = dvmh_get_natural_base(&device_num, _r); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, p_base, _p, d_p); + dvmh_fill_header_(&device_num, r_base, _r, d_r); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_588_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_588_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_588_cuda_kernel_int<<>>((double *)p_base, (double *)r_base, idxL[0], idxH[0], addBlocks, *_beta); + } + else + { + loop_cg_588_cuda_kernel_llong<<>>((double *)p_base, (double *)r_base, idxL[0], idxH[0], addBlocks, *_beta); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 605 + + void loop_cg_605_cuda_(DvmType *loop_ref, DvmType _z_rma[], DvmType _r[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[]) + { + void *z_rma_base, *r_base, *colidx_base, *a_base, *rowstr_base; + DvmType d_z_rma[4], d_r[4], d_colidx[4], d_a[4], d_rowstr[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Get 'natural' bases + z_rma_base = dvmh_get_natural_base(&device_num, _z_rma); + r_base = dvmh_get_natural_base(&device_num, _r); + colidx_base = dvmh_get_natural_base(&device_num, _colidx); + a_base = dvmh_get_natural_base(&device_num, _a); + rowstr_base = dvmh_get_natural_base(&device_num, _rowstr); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, z_rma_base, _z_rma, d_z_rma); + dvmh_fill_header_(&device_num, r_base, _r, d_r); + dvmh_fill_header_(&device_num, colidx_base, _colidx, d_colidx); + dvmh_fill_header_(&device_num, a_base, _a, d_a); + dvmh_fill_header_(&device_num, rowstr_base, _rowstr, d_rowstr); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, 0, loop_cg_605_cuda_kernel_int_regs, &threads, &stream, 0); + } + else + { + loop_cuda_get_config(loop_ref, 0, loop_cg_605_cuda_kernel_llong_regs, &threads, &stream, 0); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks * dvmh_get_warp_size(loop_ref); + addBlocks = 0; + blocks = dim3(1, 1, 1); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + maxBlocks = maxBlocks / dvmh_get_warp_size(loop_ref) * dvmh_get_warp_size(loop_ref); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_605_cuda_kernel_int<<>>((double *)z_rma_base, (double *)r_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_605_cuda_kernel_llong<<>>((double *)z_rma_base, (double *)r_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + } + + +// CUDA handler for loop on line 618 + + void loop_cg_618_cuda_(DvmType *loop_ref, DvmType _r[], DvmType _x[]) + { + void *r_base, *x_base; + DvmType d_r[4], d_x[4]; + DvmType idxTypeInKernel; + dim3 blocks, threads; + cudaStream_t stream; + DvmType idxL[1], idxH[1], loopSteps[1]; + DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks; + void *sum_grid; + double _sum; + DvmType red_num, num_of_red_blocks, fill_flag; + DvmType shared_mem; + DvmType device_num; + +// Get device number + device_num = loop_get_device_num_(loop_ref); + +// Register reduction for CUDA-execution + red_num = 1; + loop_cuda_register_red(loop_ref, red_num, &sum_grid, 0); + loop_red_init_(loop_ref, &red_num, &_sum, 0); + +// Get 'natural' bases + r_base = dvmh_get_natural_base(&device_num, _r); + x_base = dvmh_get_natural_base(&device_num, _x); + +// Fill 'device' headers + dvmh_fill_header_(&device_num, r_base, _r, d_r); + dvmh_fill_header_(&device_num, x_base, _x, d_x); + +// Guess index type in CUDA kernel + idxTypeInKernel = loop_guess_index_type_(loop_ref); + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) + { + idxTypeInKernel = rt_INT; + } + if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) + { + idxTypeInKernel = rt_LLONG; + } + +// Get CUDA configuration parameters + threads = dim3(0, 0, 0); +#ifdef CUDA_FERMI_ARCH + shared_mem = 8; +#else + shared_mem = 0; +#endif + if (idxTypeInKernel == rt_INT) + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_618_cuda_kernel_int_regs, &threads, &stream, &shared_mem); + } + else + { + loop_cuda_get_config(loop_ref, shared_mem, loop_cg_618_cuda_kernel_llong_regs, &threads, &stream, &shared_mem); + } + loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps); + blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x; + overallBlocks = blocksS[0]; + restBlocks = overallBlocks; + addBlocks = 0; + blocks = dim3(1, 1, 1); + +// Prepare reduction + num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref)); + fill_flag = 0; + red_num = 1; + loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag); + maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X); + +// GPU execution + while (restBlocks > 0) + { + if (restBlocks <= maxBlocks) + { + blocks = restBlocks; + } + else + { + blocks = maxBlocks; + } + if (idxTypeInKernel == rt_INT) + { + loop_cg_618_cuda_kernel_int<<>>((double *)r_base, (double *)x_base, _sum, (double *)sum_grid, idxL[0], idxH[0], addBlocks); + } + else + { + loop_cg_618_cuda_kernel_llong<<>>((double *)r_base, (double *)x_base, _sum, (double *)sum_grid, idxL[0], idxH[0], addBlocks); + } + addBlocks += blocks.x; + restBlocks -= blocks.x; + } + +// Finish reduction + red_num = 1; + loop_red_finish(loop_ref, red_num); + } + +} diff --git a/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cluster/cg.fdv b/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cluster/cg.fdv new file mode 100644 index 0000000..f077345 --- /dev/null +++ b/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cluster/cg.fdv @@ -0,0 +1,1008 @@ +!-------------------------------------------------------------------------! +! ! +! N A S P A R A L L E L B E N C H M A R K S 3.3 ! +! ! +! S E R I A L V E R S I O N ! +! ! +! C G ! +! ! +!-------------------------------------------------------------------------! +! ! +! This benchmark is a serial version of the NPB CG code. ! +! Refer to NAS Technical Reports 95-020 for details. ! +! ! +! Permission to use, copy, distribute and modify this software ! +! for any purpose with or without fee is hereby granted. We ! +! request, however, that all derived work reference the NAS ! +! Parallel Benchmarks 3.3. This software is provided "as is" ! +! without express or implied warranty. ! +! ! +! Information on NPB 3.3, including the technical report, the ! +! original specifications, source code, results and information ! +! on how to submit new results, is available at: ! +! ! +! http://www.nas.nasa.gov/Software/NPB/ ! +! ! +! Send comments or suggestions to npb@nas.nasa.gov ! +! ! +! NAS Parallel Benchmarks Group ! +! NASA Ames Research Center ! +! Mail Stop: T27A-1 ! +! Moffett Field, CA 94035-1000 ! +! ! +! E-mail: npb@nas.nasa.gov ! +! Fax: (650) 604-3957 ! +! ! +!-------------------------------------------------------------------------! + + +c--------------------------------------------------------------------- +c NPB CG serial version +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c +c Authors: M. Yarrow +c C. Kuszmaul +c A.S. Kolganov +c +c--------------------------------------------------------------------- + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + program cg +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + + implicit none + + include 'globals.h' + + + common / main_int_mem / colidx, rowstr, + > iv, arow, acol + integer colidx(nz), rowstr(na+1), + > iv(na), arow(na), acol(naz), + > bl_low, bl_high, blGen,gBL(2) + + + common / main_flt_mem / aelt, a, + > x, + > z, + > p, + > q, + > r + double precision aelt(naz), a(nz), + > x(na+1), + > z(na+1), + > p(na+1), + > q(na+1), + > r(na+1) + + + + +CDVM$ TEMPLATE ttt(na+2) +CDVM$ DISTRIBUTE ttt(BLOCK) +CDVM$ ALIGN z(I) WITH ttt(I) + +CDVM$ ALIGN x(I) WITH z(I) +CDVM$ ALIGN r(I) WITH z(I) +CDVM$ ALIGN p(I) WITH z(I) +CDVM$ ALIGN q(I) WITH z(I) + + + integer i, j, k, it, sumL + + double precision zeta, randlc + external randlc + double precision rnorm + double precision norm_temp1,norm_temp2 + + double precision t, mflops, tmax + character class + logical verified + double precision zeta_verify_value, epsilon, err + + integer fstatus + character t_names(t_last)*8 + + do i = 1, T_last + call timer_clear( i ) + end do + + open(unit=2, file='timer.flag', status='old', iostat=fstatus) + if (fstatus .eq. 0) then + timeron = .true. + t_names(t_init) = 'init' + t_names(t_bench) = 'benchmk' + t_names(t_conj_grad) = 'conjgd' + close(2) + else + timeron = .false. + endif + + call timer_start( T_init ) + + firstrow = 1 + lastrow = na + firstcol = 1 + lastcol = na + + + if( na .eq. 1400 .and. + & nonzer .eq. 7 .and. + & niter .eq. 15 .and. + & shift .eq. 10.d0 ) then + class = 'S' + zeta_verify_value = 8.5971775078648d0 + else if( na .eq. 7000 .and. + & nonzer .eq. 8 .and. + & niter .eq. 15 .and. + & shift .eq. 12.d0 ) then + class = 'W' + zeta_verify_value = 10.362595087124d0 + else if( na .eq. 14000 .and. + & nonzer .eq. 11 .and. + & niter .eq. 15 .and. + & shift .eq. 20.d0 ) then + class = 'A' + zeta_verify_value = 17.130235054029d0 + else if( na .eq. 75000 .and. + & nonzer .eq. 13 .and. + & niter .eq. 75 .and. + & shift .eq. 60.d0 ) then + class = 'B' + zeta_verify_value = 22.712745482631d0 + else if( na .eq. 150000 .and. + & nonzer .eq. 15 .and. + & niter .eq. 75 .and. + & shift .eq. 110.d0 ) then + class = 'C' + zeta_verify_value = 28.973605592845d0 + else if( na .eq. 1500000 .and. + & nonzer .eq. 21 .and. + & niter .eq. 100 .and. + & shift .eq. 500.d0 ) then + class = 'D' + zeta_verify_value = 52.514532105794d0 + else if( na .eq. 9000000 .and. + & nonzer .eq. 26 .and. + & niter .eq. 100 .and. + & shift .eq. 1.5d3 ) then + class = 'E' + zeta_verify_value = 77.522164599383d0 + else + class = 'U' + endif + + write( *,1000 ) + write( *,1001 ) na + write( *,1002 ) niter + write( *,* ) + 1000 format(//,' NAS Parallel Benchmarks (NPB3.3-DVMH)', + > ' - CG Benchmark', /) + 1001 format(' Size: ', i11 ) + 1002 format(' Iterations: ', i5 ) + + naa = na + nzz = nz + + +c--------------------------------------------------------------------- +c Inialize random number generator +c--------------------------------------------------------------------- + tran = 314159265.0D0 + amult = 1220703125.0D0 + zeta = randlc( tran, amult ) + +c--------------------------------------------------------------------- +c +c--------------------------------------------------------------------- + call makea(naa, nzz, a, colidx, rowstr, + > firstrow, lastrow, firstcol, lastcol, + > arow, acol, aelt, iv) + + + +c--------------------------------------------------------------------- +c Note: as a result of the above call to makea: +c values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1 +c values of colidx which are col indexes go from firstcol --> lastcol +c So: +c Shift the col index vals from actual (firstcol --> lastcol ) +c to local, i.e., (1 --> lastcol-firstcol+1) +c--------------------------------------------------------------------- + do j=1,lastrow-firstrow+1 + do k=rowstr(j),rowstr(j+1)-1 + colidx(k) = colidx(k) - firstcol + 1 + enddo + enddo + +c--------------------------------------------------------------------- +c set starting vector to (1, 1, .... 1) +c--------------------------------------------------------------------- +CDVM$ region +CDVM$ parallel (i) on x(i) + do i = 1, na+1 + x(i) = 1.0D0 + enddo +CDVM$ parallel (j) on x(j) + do j=1, lastcol-firstcol+1 + q(j) = 0.0d0 + z(j) = 0.0d0 + r(j) = 0.0d0 + p(j) = 0.0d0 + enddo +CDVM$ end region + zeta = 0.0d0 + +c--------------------------------------------------------------------- +c----> +c Do one iteration untimed to init all code and data page tables +c----> (then reinit, start timing, to niter its) +c--------------------------------------------------------------------- + do it = 1, 1 + +c--------------------------------------------------------------------- +c The call to the conjugate gradient routine: +c--------------------------------------------------------------------- + call conj_grad ( colidx, + > rowstr, + > x, + > z, + > a, + > p, + > q, + > r, + > rnorm ) + +c--------------------------------------------------------------------- +c zeta = shift + 1/(x.z) +c So, first: (x.z) +c Also, find norm of z +c So, first: (z.z) +c--------------------------------------------------------------------- + norm_temp1 = 0.0d0 + norm_temp2 = 0.0d0 +CDVM$ region +CDVM$ parallel (j) on x(j),reduction(SUM(norm_temp1),SUM(norm_temp2)) + do j=1, lastcol-firstcol+1 + norm_temp1 = norm_temp1 + x(j)*z(j) + norm_temp2 = norm_temp2 + z(j)*z(j) + enddo +CDVM$ end region + norm_temp2 = 1.0d0 / sqrt( norm_temp2 ) + + +c--------------------------------------------------------------------- +c Normalize z to obtain x +c--------------------------------------------------------------------- +CDVM$ region +CDVM$ parallel (j) on x(j) + do j=1, lastcol-firstcol+1 + x(j) = norm_temp2*z(j) + enddo +CDVM$ end region + + enddo ! end of do one iteration untimed + + +c--------------------------------------------------------------------- +c set starting vector to (1, 1, .... 1) +c--------------------------------------------------------------------- +c +c +c +CDVM$ region +CDVM$ parallel (i) on x(i) + do i = 1, na+1 + x(i) = 1.0D0 + enddo +CDVM$ end region + zeta = 0.0d0 + + call timer_stop( T_init ) + + write (*, 2000) timer_read(T_init) + 2000 format(' Initialization time = ',f15.3,' seconds') + + call timer_start( T_bench ) + +c--------------------------------------------------------------------- +c----> +c Main Iteration for inverse power method +c----> +c--------------------------------------------------------------------- + do it = 1, niter + +c--------------------------------------------------------------------- +c The call to the conjugate gradient routine: +c--------------------------------------------------------------------- + if ( timeron ) call timer_start( T_conj_grad ) + call conj_grad ( colidx, + > rowstr, + > x, + > z, + > a, + > p, + > q, + > r, + > rnorm ) + if ( timeron ) call timer_stop( T_conj_grad ) + + +c--------------------------------------------------------------------- +c zeta = shift + 1/(x.z) +c So, first: (x.z) +c Also, find norm of z +c So, first: (z.z) +c--------------------------------------------------------------------- + norm_temp1 = 0.0d0 + norm_temp2 = 0.0d0 +CDVM$ region +CDVM$ parallel (j) on x(j),reduction(SUM(norm_temp1),SUM(norm_temp2)) + do j=1, lastcol-firstcol+1 + norm_temp1 = norm_temp1 + x(j)*z(j) + norm_temp2 = norm_temp2 + z(j)*z(j) + enddo +CDVM$ end region + norm_temp2 = 1.0d0 / sqrt( norm_temp2 ) + + + zeta = shift + 1.0d0 / norm_temp1 + if( it .eq. 1 ) write( *,9000 ) + write( *,9001 ) it, rnorm, zeta + + 9000 format( /,' iteration ||r|| zeta' ) + 9001 format( 4x, i5, 7x, e20.14, f20.13 ) + +c--------------------------------------------------------------------- +c Normalize z to obtain x +c--------------------------------------------------------------------- +CDVM$ region +CDVM$ parallel (j) on x(j) + do j=1, lastcol-firstcol+1 + x(j) = norm_temp2*z(j) + enddo +CDVM$ end region + + enddo ! end of main iter inv pow meth + + call timer_stop( T_bench ) + +c--------------------------------------------------------------------- +c End of timed section +c--------------------------------------------------------------------- + + t = timer_read( T_bench ) + + + write(*,100) + 100 format(' Benchmark completed ') + + epsilon = 1.d-10 + if (class .ne. 'U') then + +c err = abs( zeta - zeta_verify_value) + err = abs( zeta - zeta_verify_value )/zeta_verify_value + if( err .le. epsilon .and. ( .not. isnan(err))) then + verified = .TRUE. + write(*, 200) + write(*, 201) zeta + write(*, 202) err + 200 format(' VERIFICATION SUCCESSFUL ') + 201 format(' Zeta is ', E20.13) + 202 format(' Error is ', E20.13) + else + verified = .FALSE. + write(*, 300) + write(*, 301) zeta + write(*, 302) zeta_verify_value + 300 format(' VERIFICATION FAILED') + 301 format(' Zeta ', E20.13) + 302 format(' The correct zeta is ', E20.13) + endif + else + verified = .FALSE. + write (*, 400) + write (*, 401) + write (*, 201) zeta + 400 format(' Problem size unknown') + 401 format(' NO VERIFICATION PERFORMED') + endif + + + if( t .ne. 0. ) then + mflops = float( 2*niter*na ) + & * ( 3.+float( nonzer*(nonzer+1) ) + & + 25.*(5.+float( nonzer*(nonzer+1) )) + & + 3. ) / t / 1000000.0 + else + mflops = 0.0 + endif + + + call print_results('CG', class, na, 0, 0, + > niter, t, + > mflops, ' floating point', + > verified, npbversion, compiletime, + > cs1, cs2, cs3, cs4, cs5, cs6, cs7) + + + + 600 format( i4, 2e19.12) + + +c--------------------------------------------------------------------- +c More timers +c--------------------------------------------------------------------- + if (.not.timeron) goto 999 + + tmax = timer_read(T_bench) + if (tmax .eq. 0.0) tmax = 1.0 + + write(*,800) + 800 format(' SECTION Time (secs)') + do i=1, t_last + t = timer_read(i) + if (i.eq.t_init) then + write(*,810) t_names(i), t + else + write(*,810) t_names(i), t, t*100./tmax + if (i.eq.t_conj_grad) then + t = tmax - t + write(*,820) 'rest', t, t*100./tmax + endif + endif + 810 format(2x,a8,':',f9.3:' (',f6.2,'%)') + 820 format(' --> ',a8,':',f9.3,' (',f6.2,'%)') + end do + + 999 continue + + + end ! end main + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + subroutine conj_grad ( colidx, + > rowstr, + > x, + > z, + > a, + > p, + > q, + > r, + > rnorm ) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + +c--------------------------------------------------------------------- +c Floaging point arrays here are named as in NPB1 spec discussion of +c CG algorithm +c--------------------------------------------------------------------- + + implicit none + + + include 'globals.h' + + + double precision x(*), + > z(*), + > a(nzz) + integer colidx(nzz), rowstr(naa+1) + + double precision p(*), + > q(*), + > r(*) + + + integer j, k + integer cgit, cgitmax, mlen,idx, idxl + + double precision d, sum, rho, rho0, alpha, beta, rnorm + + data cgitmax / 25 / +CDVM$ INHERIT x, z, r, p, q + + rho = 0.0d0 + +c--------------------------------------------------------------------- +c Initialize the CG algorithm: +c--------------------------------------------------------------------- + +CDVM$ region +CDVM$ parallel (j) on q(j), private(d) + do j=1,naa+1 + q(j) = 0.0d0 + z(j) = 0.0d0 + d = x(j) + r(j) = d + p(j) = d + enddo + + +c--------------------------------------------------------------------- +c rho = r.r +c Now, obtain the norm of r: First, sum squares of r elements locally... +c--------------------------------------------------------------------- + +CDVM$ parallel(j) on r(j), reduction(SUM(rho)) + do j=1, lastcol-firstcol+1 + rho = rho + r(j)*r(j) + enddo +! mlen = 128 +! DVM$ parallel(j) on r(j), reduction(MAX(mlen)) +! do j=1,lastrow-firstrow+1 +! mlen = max(mlen, rowstr(j+1) - rowstr(j)) +! enddo +CDVM$ end region +! write(*,*) 'maxlen = ', mlen +c--------------------------------------------------------------------- +c----> +c The conj grad iteration loop +c----> +c--------------------------------------------------------------------- + do cgit = 1, cgitmax + + d = 0.0d0 +CDVM$ region + +CDVM$ parallel (j) on p(j), private(sum,k), remote_access(p(:)) + do j=1,lastrow-firstrow+1 + sum = 0.d0 + do k=rowstr(j),rowstr(j+1)-1 + sum = sum + a(k)*p(colidx(k)) + enddo + q(j) = sum + enddo + +CDVM$ parallel (j) on q(j), reduction(SUM(d)) + do j=1, lastcol-firstcol+1 + d = d + p(j)*q(j) + enddo +CDVM$ end region + alpha = rho / d + rho0 = rho + + rho = 0.0d0 +CDVM$ region +CDVM$ parallel (j) on r(j), private(d), reduction(SUM(rho)) + do j=1, lastcol-firstcol+1 + z(j) = z(j) + alpha*p(j) + d = r(j) - alpha*q(j) + r(j) = d + rho = rho + d*d + enddo +CDVM$ end region + beta = rho / rho0 + +CDVM$ region +CDVM$ parallel (j) on r(j) + do j=1, lastcol-firstcol+1 + p(j) = r(j) + beta*p(j) + enddo +CDVM$ end region + + enddo ! end of do cgit=1,cgitmax + + +c--------------------------------------------------------------------- +c Compute residual norm explicitly: ||r|| = ||x - A.z|| +c First, form A.z +c The partition submatrix-vector multiply +c--------------------------------------------------------------------- + + sum = 0.0d0 +CDVM$ region +CDVM$ parallel (j) on r(j), private(d,k),remote_access(z(:)) + do j=1,lastrow-firstrow+1 + d = 0.d0 + do k=rowstr(j),rowstr(j+1)-1 + d = d + a(k)*z(colidx(k)) + enddo + r(j) = d + enddo + + +c--------------------------------------------------------------------- +c At this point, r contains A.z +c--------------------------------------------------------------------- +CDVM$ parallel (j) on r(j), private(d), reduction(SUM(sum)) + do j=1, lastcol-firstcol+1 + d = x(j) - r(j) + sum = sum + d*d + enddo +CDVM$ end region + rnorm = sqrt( sum ) + + + + return + end ! end of routine conj_grad + + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + subroutine makea( n, nz, a, colidx, rowstr, + > firstrow, lastrow, firstcol, lastcol, + > arow, acol, aelt, iv ) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + include 'npbparams.h' + integer n, nz + integer firstrow, lastrow, firstcol, lastcol + integer colidx(nz), rowstr(n+1) + integer iv(n), arow(n), acol(nonzer+1,n) + double precision aelt(nonzer+1,n) + double precision a(nz) + +c--------------------------------------------------------------------- +c generate the test problem for benchmark 6 +c makea generates a sparse matrix with a +c prescribed sparsity distribution +c +c parameter type usage +c +c input +c +c n i number of cols/rows of matrix +c nz i nonzeros as declared array size +c rcond r*8 condition number +c shift r*8 main diagonal shift +c +c output +c +c a r*8 array for nonzeros +c colidx i col indices +c rowstr i row pointers +c +c workspace +c +c iv, arow, acol i +c aelt r*8 +c--------------------------------------------------------------------- + + integer i, iouter, ivelt, nzv, nn1 + integer ivc(nonzer+1) + double precision vc(nonzer+1) + +c--------------------------------------------------------------------- +c nonzer is approximately (int(sqrt(nnza /n))); +c--------------------------------------------------------------------- + + external sparse, sprnvc, vecset + +c--------------------------------------------------------------------- +c nn1 is the smallest power of two not less than n +c--------------------------------------------------------------------- + + nn1 = 1 + 50 continue + nn1 = 2 * nn1 + if (nn1 .lt. n) goto 50 + +c--------------------------------------------------------------------- +c Generate nonzero positions and save for the use in sparse. +c--------------------------------------------------------------------- + + do iouter = 1, n + nzv = nonzer + call sprnvc( n, nzv, nn1, vc, ivc ) + call vecset( n, vc, ivc, nzv, iouter, .5D0 ) + arow(iouter) = nzv + do ivelt = 1, nzv + acol(ivelt, iouter) = ivc(ivelt) + aelt(ivelt, iouter) = vc(ivelt) + enddo + enddo + +c--------------------------------------------------------------------- +c ... make the sparse matrix from list of elements with duplicates +c (iv is used as workspace) +c--------------------------------------------------------------------- + call sparse( a, colidx, rowstr, n, nz, nonzer, arow, acol, + > aelt, firstrow, lastrow, + > iv, rcond, shift ) + return + + end +c-------end of makea------------------------------ + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + subroutine sparse( a, colidx, rowstr, n, nz, nonzer, arow, acol, + > aelt, firstrow, lastrow, + > nzloc, rcond, shift ) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + integer colidx(*), rowstr(*) + integer firstrow, lastrow + integer n, nz, nonzer, arow(*), acol(nonzer+1,*) + double precision a(*), aelt(nonzer+1,*), rcond, shift + +c--------------------------------------------------------------------- +c rows range from firstrow to lastrow +c the rowstr pointers are defined for nrows = lastrow-firstrow+1 values +c--------------------------------------------------------------------- + integer nzloc(n), nrows + +c--------------------------------------------------- +c generate a sparse matrix from a list of +c [col, row, element] tri +c--------------------------------------------------- + + integer i, j, j1, j2, nza, k, kk, nzrow, jcol + double precision xi, size, scale, ratio, va + +c--------------------------------------------------------------------- +c how many rows of result +c--------------------------------------------------------------------- + nrows = lastrow - firstrow + 1 + +c--------------------------------------------------------------------- +c ...count the number of triples in each row +c--------------------------------------------------------------------- + do j = 1, nrows+1 + rowstr(j) = 0 + enddo + + do i = 1, n + do nza = 1, arow(i) + j = acol(nza, i) + 1 + rowstr(j) = rowstr(j) + arow(i) + end do + end do + + rowstr(1) = 1 + do j = 2, nrows+1 + rowstr(j) = rowstr(j) + rowstr(j-1) + enddo + nza = rowstr(nrows+1) - 1 + +c--------------------------------------------------------------------- +c ... rowstr(j) now is the location of the first nonzero +c of row j of a +c--------------------------------------------------------------------- + + if (nza .gt. nz) then + write(*,*) 'Space for matrix elements exceeded in sparse' + write(*,*) 'nza, nzmax = ',nza, nz + stop + endif + + +c--------------------------------------------------------------------- +c ... preload data pages +c--------------------------------------------------------------------- + do j = 1, nrows + do k = rowstr(j), rowstr(j+1)-1 + a(k) = 0.d0 + colidx(k) = 0 + enddo + nzloc(j) = 0 + enddo + +c--------------------------------------------------------------------- +c ... generate actual values by summing duplicates +c--------------------------------------------------------------------- + + size = 1.0D0 + ratio = rcond ** (1.0D0 / dfloat(n)) + + do i = 1, n + do nza = 1, arow(i) + j = acol(nza, i) + + scale = size * aelt(nza, i) + do nzrow = 1, arow(i) + jcol = acol(nzrow, i) + va = aelt(nzrow, i) * scale + +c--------------------------------------------------------------------- +c ... add the identity * rcond to the generated matrix to bound +c the smallest eigenvalue from below by rcond +c--------------------------------------------------------------------- + if (jcol .eq. j .and. j .eq. i) then + va = va + rcond - shift + endif + + do k = rowstr(j), rowstr(j+1)-1 + if (colidx(k) .gt. jcol) then +c--------------------------------------------------------------------- +c ... insert colidx here orderly +c--------------------------------------------------------------------- + do kk = rowstr(j+1)-2, k, -1 + if (colidx(kk) .gt. 0) then + a(kk+1) = a(kk) + colidx(kk+1) = colidx(kk) + endif + enddo + colidx(k) = jcol + a(k) = 0.d0 + goto 40 + else if (colidx(k) .eq. 0) then + colidx(k) = jcol + goto 40 + else if (colidx(k) .eq. jcol) then +c--------------------------------------------------------------------- +c ... mark the duplicated entry +c--------------------------------------------------------------------- + nzloc(j) = nzloc(j) + 1 + goto 40 + endif + enddo + print *,'internal error in sparse: i=',i + stop + 40 continue + a(k) = a(k) + va + enddo + 60 continue + enddo + size = size * ratio + enddo + + +c--------------------------------------------------------------------- +c ... remove empty entries and generate final results +c--------------------------------------------------------------------- + do j = 2, nrows + nzloc(j) = nzloc(j) + nzloc(j-1) + enddo + + do j = 1, nrows + if (j .gt. 1) then + j1 = rowstr(j) - nzloc(j-1) + else + j1 = 1 + endif + j2 = rowstr(j+1) - nzloc(j) - 1 + nza = rowstr(j) + do k = j1, j2 + a(k) = a(nza) + colidx(k) = colidx(nza) + nza = nza + 1 + enddo + enddo + do j = 2, nrows+1 + rowstr(j) = rowstr(j) - nzloc(j-1) + enddo + nza = rowstr(nrows+1) - 1 + + +CC write (*, 11000) nza + return +11000 format ( //,'final nonzero count in sparse ', + 1 /,'number of nonzeros = ', i16 ) + end +c-------end of sparse----------------------------- + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + subroutine sprnvc( n, nz, nn1, v, iv ) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + double precision v(*) + integer n, nz, nn1, iv(*) + common /urando/ amult, tran + double precision amult, tran + + +c--------------------------------------------------------------------- +c generate a sparse n-vector (v, iv) +c having nzv nonzeros +c +c mark(i) is set to 1 if position i is nonzero. +c mark is all zero on entry and is reset to all zero before exit +c this corrects a performance bug found by John G. Lewis, caused by +c reinitialization of mark on every one of the n calls to sprnvc +c--------------------------------------------------------------------- + + integer nzv, ii, i, icnvrt + + external randlc, icnvrt + double precision randlc, vecelt, vecloc + + + nzv = 0 + +100 continue + if (nzv .ge. nz) goto 110 + + vecelt = randlc( tran, amult ) + +c--------------------------------------------------------------------- +c generate an integer between 1 and n in a portable manner +c--------------------------------------------------------------------- + vecloc = randlc(tran, amult) + i = icnvrt(vecloc, nn1) + 1 + if (i .gt. n) goto 100 + +c--------------------------------------------------------------------- +c was this integer generated already? +c--------------------------------------------------------------------- + do ii = 1, nzv + if (iv(ii) .eq. i) goto 100 + enddo + nzv = nzv + 1 + v(nzv) = vecelt + iv(nzv) = i + goto 100 +110 continue + + return + end +c-------end of sprnvc----------------------------- + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + function icnvrt(x, ipwr2) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + double precision x + integer ipwr2, icnvrt + +c--------------------------------------------------------------------- +c scale a double precision number x in (0,1) by a power of 2 and chop it +c--------------------------------------------------------------------- + icnvrt = int(ipwr2 * x) + + return + end +c-------end of icnvrt----------------------------- + + +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + subroutine vecset(n, v, iv, nzv, i, val) +c--------------------------------------------------------------------- +c--------------------------------------------------------------------- + + implicit none + integer n, iv(*), nzv, i, k + double precision v(*), val + +c--------------------------------------------------------------------- +c set ith element of sparse vector (v, iv) with +c nzv nonzeros to val +c--------------------------------------------------------------------- + + logical set + + set = .false. + do k = 1, nzv + if (iv(k) .eq. i) then + v(k) = val + set = .true. + endif + enddo + if (.not. set) then + nzv = nzv + 1 + v(nzv) = val + iv(nzv) = i + endif + return + end +c-------end of vecset----------------------------- + + include 'print_results.f' + include 'timers.f' + include 'randdp.f' + diff --git a/sapfor/experts/Sapfor_2017/CMakeLists.txt b/sapfor/experts/Sapfor_2017/CMakeLists.txt index 97f4aeb..c93d69d 100644 --- a/sapfor/experts/Sapfor_2017/CMakeLists.txt +++ b/sapfor/experts/Sapfor_2017/CMakeLists.txt @@ -94,7 +94,9 @@ set(UTILS _src/Utils/AstWrapper.h _src/Utils/types.h _src/Utils/utils.cpp _src/Utils/utils.h - _src/Utils/version.h) + _src/Utils/version.h + _src/Utils/module_utils.h + _src/Utils/module_utils.cpp) set(OMEGA _src/SageAnalysisTool/OmegaForSage/add-assert.cpp _src/SageAnalysisTool/OmegaForSage/affine.cpp diff --git a/sapfor/experts/Sapfor_2017/_src/DirectiveProcessing/insert_directive.cpp b/sapfor/experts/Sapfor_2017/_src/DirectiveProcessing/insert_directive.cpp index 0b58816..642a5b7 100644 --- a/sapfor/experts/Sapfor_2017/_src/DirectiveProcessing/insert_directive.cpp +++ b/sapfor/experts/Sapfor_2017/_src/DirectiveProcessing/insert_directive.cpp @@ -832,15 +832,22 @@ static pair return make_pair(retDir, lastReturn); } -static pair getModuleRename(const map>& byUse, const DIST::Array* array, - const string& filename, const pair& lineRange) +static pair getModuleRename(const set& allocatableStmts, const DIST::Array* array) { - auto declS = array->GetDeclSymbol(filename, lineRange, getAllFilesInProject())->GetOriginal(); - for (auto& elem : byUse) - for (auto& localS : setToMapWithSortByStr(elem.second)) - if (OriginalSymbol(localS.second) == declS) - return make_pair(elem.first, localS.second->identifier()); - return make_pair("", ""); + if (array->GetLocation().first == DIST::l_MODULE) + { + set arrayNames; + for (auto& alloc : allocatableStmts) + if (alloc->variant() == ALLOCATE_STMT) + arrayNames.insert(getNameByUse(alloc, array->GetShortName(), array->GetLocation().second)); + + if (arrayNames.size() > 1 || arrayNames.size() == 0) + printInternalError(convertFileName(__FILE__).c_str(), __LINE__); + + return make_pair(array->GetShortName(), *arrayNames.begin()); + } + else + return make_pair("", ""); } static pair @@ -848,10 +855,7 @@ getNewDirective(const string &fullArrayName, const vector &distrRules, const vector &alignRules, const DataDirective &dataDir, - const map> &byUse, - const string& filename, - const pair& lineRange, - bool alignToRealign) + const set& allocatableStmts) { string out = ""; DIST::Array* outA = NULL; @@ -873,7 +877,7 @@ getNewDirective(const string &fullArrayName, if (dataDir.alignRules[i].alignArray->GetName() == fullArrayName) { string rule = alignRules[i]; - if (alignToRealign) + if (allocatableStmts.size()) { auto it = rule.find("ALIGN"); while (it != string::npos) @@ -881,8 +885,8 @@ getNewDirective(const string &fullArrayName, rule = rule.replace(it, 5, "REALIGN"); it = rule.find("ALIGN", it + 7); } - - auto renamePair = getModuleRename(byUse, dataDir.alignRules[i].alignArray, filename, lineRange); + + auto renamePair = getModuleRename(allocatableStmts, dataDir.alignRules[i].alignArray); if (renamePair.first != "") { it = rule.find(renamePair.first); @@ -1722,29 +1726,12 @@ void insertDistributionToFile(SgFile *file, const char *fin_name, const DataDire if (distrArrays.find(fullArrayName) != distrArrays.end()) { - map> byUseInFunc; - const vector &allocatableStmtsCopy = getAttributes(st, set{ ALLOCATE_STMT }); set allocatableStmts; if (allocatableStmtsCopy.size()) - { allocatableStmts = filterAllocateStats(file, allocatableStmtsCopy, currSymb->identifier()); - - - for (auto &alloc : allocatableStmts) - { - if (alloc->fileName() != currFilename) - if (!alloc->switchToFile()) - printInternalError(convertFileName(__FILE__).c_str(), __LINE__); - - auto byUse = moduleRefsByUseInFunction(alloc); - for (auto &byUseElem : byUse) - byUseInFunc[byUseElem.first].insert(byUseElem.second.begin(), byUseElem.second.end()); - - SgFile::switchToFile(currFilename); - } - } - pair dirWithArray = getNewDirective(fullArrayName, distrRules, alignRules, dataDir, byUseInFunc, filename, lineRange, allocatableStmts.size() != 0); + + pair dirWithArray = getNewDirective(fullArrayName, distrRules, alignRules, dataDir, allocatableStmts); string toInsert = dirWithArray.second; if (toInsert != "") diff --git a/sapfor/experts/Sapfor_2017/_src/DvmhRegions/DvmhRegionInserter.cpp b/sapfor/experts/Sapfor_2017/_src/DvmhRegions/DvmhRegionInserter.cpp index 5dad847..9b760c7 100644 --- a/sapfor/experts/Sapfor_2017/_src/DvmhRegions/DvmhRegionInserter.cpp +++ b/sapfor/experts/Sapfor_2017/_src/DvmhRegions/DvmhRegionInserter.cpp @@ -166,155 +166,6 @@ void DvmhRegionInserter::updateParallelFunctions(const map>> &modByUse, const string& varName, - const set& locNames, vector &altNames) -{ - for (auto& elem : modByUse) - { - if (locNames.count(elem.first)) - { - for (auto& byUse : elem.second) - { - SgSymbol* toCmp = byUse.second ? byUse.second : byUse.first; - checkNull(toCmp, convertFileName(__FILE__).c_str(), __LINE__); - if (toCmp->identifier() == varName) - altNames.push_back(byUse.first->identifier()); - } - } - } -} - -static void fillInfo(SgStatement *start, - set &useMod, - map>> &modByUse, - map>> &modByUseOnly) -{ - for (SgStatement* st = start; st != start->lastNodeOfStmt(); st = st->lexNext()) - { - if (isSgExecutableStatement(st)) - break; - if (st->variant() == CONTAINS_STMT) - break; - if (st != start && (st->variant() == PROC_HEDR || st->variant() == FUNC_HEDR)) - break; - fillUseStatement(st, useMod, modByUse, modByUseOnly); - } -} - -static SgStatement* findModWithName(const vector &modules, const string &name) -{ - for (auto& elem : modules) - if (elem->variant() == MODULE_STMT) - if (elem->symbol()->identifier() == name) - return elem; - return NULL; -} - -static string getNameByUse(SgStatement *place, const string &varName, const string &locName) -{ - SgStatement* func = getFuncStat(place); - if (func == NULL) - return varName; - else - { - map> graphUse; - - set useMod; - map>> modByUse; - map>> modByUseOnly; - - fillInfo(func, useMod, modByUse, modByUseOnly); - SgStatement* cp = func->controlParent(); - if (isSgProgHedrStmt(cp) || cp->variant() == MODULE_STMT) // if function in contains region - fillInfo(cp, useMod, modByUse, modByUseOnly); - - set useModDone; - bool needRepeat = true; - - vector modules; - findModulesInFile(func->getFile(), modules); - - while (needRepeat) - { - needRepeat = false; - set newUseMod; - for (auto& useM : useMod) - { - if (useModDone.find(useM) == useModDone.end()) - { - auto modSt = findModWithName(modules, useM); - if (modSt == NULL || useM == "dvmh_template_mod") - continue; - - checkNull(modSt, convertFileName(__FILE__).c_str(), __LINE__); - - set tmpUse; - fillInfo(modSt, tmpUse, modByUse, modByUseOnly); - useModDone.insert(useM); - - for (auto& use : tmpUse) - { - newUseMod.insert(use); - - if (use != "dvmh_template_mod") - graphUse[use].insert(useM); - } - } - } - - for (auto& newU : newUseMod) - { - if (useModDone.find(newU) == useModDone.end()) - { - useModDone.insert(newU); - needRepeat = true; - } - } - } - - vector altNames; - findByUse(modByUse, varName, { locName }, altNames); - findByUse(modByUseOnly, varName, { locName }, altNames); - - if (altNames.size() == 0) - { - set locations = { locName }; - bool changed = true; - while (changed) - { - changed = false; - for (auto& loc : locations) - { - if (graphUse.find(loc) != graphUse.end()) - { - for (auto& use : graphUse[loc]) - { - if (locations.find(use) == locations.end()) - { - locations.insert(use); - changed = true; - } - } - } - } - } - - findByUse(modByUse, varName, locations, altNames); - findByUse(modByUseOnly, varName, locations, altNames); - } - - if (altNames.size() == 0) - return varName; - else if (altNames.size() >= 1) - { - set setAlt(altNames.begin(), altNames.end()); - return *setAlt.begin(); - } - else - printInternalError(convertFileName(__FILE__).c_str(), __LINE__); - } -} - static SgStatement* skipDvmhRegionInterval(SgStatement *start) { if (start->variant() != ACC_REGION_DIR) diff --git a/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls.cpp b/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls.cpp index 35c3dfb..f583a20 100644 --- a/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls.cpp +++ b/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls.cpp @@ -2296,123 +2296,6 @@ void checkForRecursion(SgFile *file, map> &allFuncInfo } } -static void fillUseStmt(SgStatement *stat, map> &byUse) -{ - if (stat->variant() != USE_STMT) - printInternalError(convertFileName(__FILE__).c_str(), __LINE__); - - SgExpression* ex = stat->expr(0); - if (ex && ex->variant() == ONLY_NODE) - { - for (auto exI = ex->lhs(); exI; exI = exI->rhs()) - { - if (exI->lhs()->variant() == RENAME_NODE) - { - SgExpression* ren = exI->lhs(); - if (ren->lhs()->symbol() && ren->rhs() && ren->rhs()->symbol()) - byUse[ren->rhs()->symbol()->identifier()].insert(ren->lhs()->symbol()); - } - } - } - else if (ex && ex->lhs()) - { - for (auto exI = ex; exI; exI = exI->rhs()) - { - if (exI->lhs()->variant() == RENAME_NODE) - { - SgExpression* ren = exI->lhs(); - if (ren->lhs()->symbol() && ren->rhs() && ren->rhs()->symbol()) - byUse[ren->rhs()->symbol()->identifier()].insert(ren->lhs()->symbol()); - } - } - } -} - -map> moduleRefsByUseInFunction(SgStatement *stIn) -{ - checkNull(stIn, convertFileName(__FILE__).c_str(), __LINE__); - - map> byUse; - int var = stIn->variant(); - while (var != PROG_HEDR && var != PROC_HEDR && var != FUNC_HEDR) - { - stIn = stIn->controlParent(); - if (stIn == NULL) - return byUse; - var = stIn->variant(); - } - - auto mapOfUses = createMapOfModuleUses(stIn->getFile()); - set useMods; - - for (SgStatement *stat = stIn->lexNext(); !isSgExecutableStatement(stat); stat = stat->lexNext()) - { - if (stat->variant() == USE_STMT) - { - fillUseStmt(stat, byUse); - useMods.insert(stat->symbol()->identifier()); - } - } - - const int cpOfSt = stIn->controlParent()->variant(); - //contains of func - if (cpOfSt == PROG_HEDR || cpOfSt == PROC_HEDR || cpOfSt == FUNC_HEDR) - { - for (SgStatement *stat = stIn->controlParent()->lexNext(); !isSgExecutableStatement(stat); stat = stat->lexNext()) - { - if (stat->variant() == USE_STMT) - { - fillUseStmt(stat, byUse); - useMods.insert(stat->symbol()->identifier()); - } - } - } - - bool chages = true; - while (chages) - { - chages = false; - set newUseMods(useMods); - for (auto &elem : useMods) - { - auto it = mapOfUses.find(elem); - if (it != mapOfUses.end()) - { - for (auto &elem2 : it->second) - { - if (newUseMods.find(elem2) == newUseMods.end()) - { - newUseMods.insert(elem2); - chages = true; - } - } - } - } - useMods = newUseMods; - } - - vector modules; - findModulesInFile(stIn->getFile(), modules); - for (auto &mod : modules) - { - if (useMods.find(mod->symbol()->identifier()) != useMods.end()) - { - for (SgStatement *stat = mod->lexNext(); stat != mod->lastNodeOfStmt(); stat = stat->lexNext()) - { - const int var = stat->variant(); - if (var == USE_STMT) - { - fillUseStmt(stat, byUse); - useMods.insert(stat->symbol()->identifier()); - } - else if (var == PROC_HEDR || var == FUNC_HEDR) - break; - } - } - } - return byUse; -} - void propagateWritesToArrays(map> &allFuncInfo) { map funcByName; diff --git a/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls_func.h b/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls_func.h index a4d3715..0660d67 100644 --- a/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls_func.h +++ b/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls_func.h @@ -45,7 +45,6 @@ int CheckFunctionsToInline(SgProject *proj, const std::map &fi void checkForRecursion(SgFile *file, std::map> &allFuncInfo, std::vector &messagesForFile); bool isPassFullArray(SgExpression *ex); void doMacroExpand(SgFile *file, std::vector &messages); -std::map> moduleRefsByUseInFunction(SgStatement *stIn); void propagateWritesToArrays(std::map> &allFuncInfo); void detectCopies(std::map> &allFuncInfo); void fillInterfaceBlock(std::map>& allFuncInfo); diff --git a/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.cpp b/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.cpp index eb9b0e2..677e0a9 100644 --- a/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.cpp +++ b/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.cpp @@ -879,46 +879,6 @@ void initTags() #include "tag.h" } - -void findModulesInFile(SgFile *file, vector &modules) -{ - SgStatement *first = file->firstStatement(); - set functions; - - int funcNum = file->numberOfFunctions(); - for (int i = 0; i < funcNum; ++i) - functions.insert(file->functions(i)); - - while (first) - { - if (first->variant() == MODULE_STMT) - { - modules.push_back(first); - first = first->lastNodeOfStmt(); - } - else - { - if (functions.size()) - { - auto it = functions.find(first); - if (it != functions.end()) - first = (*it)->lastNodeOfStmt(); - } - } - - first = first->lexNext(); - } -} - -void getModulesAndFunctions(SgFile *file, vector &modulesAndFunctions) -{ - findModulesInFile(file, modulesAndFunctions); - - int funcNum = file->numberOfFunctions(); - for (int i = 0; i < funcNum; ++i) - modulesAndFunctions.push_back(file->functions(i)); -} - void tryToFindPrivateInAttributes(SgStatement *st, set &privates, bool onlyReduction, bool onlyUsers) { set privatesVars; @@ -2365,76 +2325,6 @@ objT& getObjectForFileFromMap(const char *fileName, map &mapObject template vector& getObjectForFileFromMap(const char *fileName, map>&); template PredictorStats& getObjectForFileFromMap(const char *fileName, map&); -SgSymbol* getFromModule(const map> &byUse, SgSymbol *orig, bool processAsModule) -{ - if (!processAsModule) - { - checkNull(orig->scope(), convertFileName(__FILE__).c_str(), __LINE__); - if (orig->scope()->variant() != MODULE_STMT) - return orig; - } - - if (byUse.size()) - { - for (auto& elem : byUse) - { - for (auto& localS : setToMapWithSortByStr(elem.second)) - if (OriginalSymbol(localS.second)->thesymb == orig->thesymb) - return localS.second; - } - } - return orig; -} - -map> createMapOfModuleUses(SgFile *file) -{ - map> retValMap; - - vector modules; - findModulesInFile(file, modules); - - for (int z = 0; z < modules.size(); ++z) - { - SgStatement *curr = modules[z]; - string modName = curr->symbol()->identifier(); - for (SgStatement *st = curr->lexNext(); st != curr->lastNodeOfStmt(); st = st->lexNext()) - { - if (st->variant() == USE_STMT) - retValMap[modName].insert(st->symbol()->identifier()); - else if (st->variant() == PROC_HEDR || st->variant() == FUNC_HEDR) - break; - } - } - - bool repeat = true; - while (repeat) - { - repeat = false; - for (auto &elem : retValMap) - { - set toAdd(elem.second); - for (auto &inUse : elem.second) - { - auto it = retValMap.find(inUse); - if (it != retValMap.end()) - { - for (auto &inUseToAdd : it->second) - { - if (toAdd.find(inUseToAdd) == toAdd.end()) - { - toAdd.insert(inUseToAdd); - repeat = true; - } - } - } - } - elem.second = toAdd; - } - } - - return retValMap; -} - void printSymbolTable(SgFile *file, string filter, const set& vars) { for (auto s = file->firstSymbol(); s; s = s->next()) @@ -2574,96 +2464,6 @@ SgStatement* duplicateProcedure(SgStatement *toDup, const string *newName, bool return toMove; } -void fillModuleUse(SgFile *file, map> &moduleUses, map &moduleDecls) -{ - const string currFN = file->filename(); - for (SgStatement* st = file->firstStatement(); st; st = st->lexNext()) - { - if (st->fileName() == currFN) - { - if (st->variant() == USE_STMT) - moduleUses[currFN].insert(st->symbol()->identifier()); - - if (st->variant() == MODULE_STMT) - { - string moduleN = st->symbol()->identifier(); - auto it = moduleDecls.find(moduleN); - if (it != moduleDecls.end()) - printInternalError(convertFileName(__FILE__).c_str(), __LINE__); - moduleDecls[moduleN] = currFN; - } - } - } -} - -void filterModuleUse(map>& moduleUsesByFile, map& moduleDecls) -{ - for (auto& elem : moduleUsesByFile) - { - set newSet; - for (auto& setElem : elem.second) - { - auto it = moduleDecls.find(setElem); - if (it == moduleDecls.end()) - newSet.insert(setElem); - else if (elem.first != it->second) - newSet.insert(setElem); - } - elem.second = newSet; - } - - /*map> modIncludeMod; - - for (auto& mod : moduleDecls) - { - string name = mod.first; - string file = mod.second; - - auto it = moduleUsesByFile.find(file); - if (it != moduleUsesByFile.end()) - modIncludeMod[name] = it->second; - } - - bool change = true; - while (change) - { - change = false; - for (auto& mod : modIncludeMod) - { - set newSet = mod.second; - for (auto& included : mod.second) - { - auto it = modIncludeMod.find(included); - if (it == modIncludeMod.end()) - continue; - - for (auto& elem : it->second) - { - if (newSet.find(elem) == newSet.end()) - { - newSet.insert(elem); - change = true; - } - } - } - mod.second = newSet; - } - } - - for (auto& elem : moduleUsesByFile) - { - set newSet = elem.second; - for (auto& setElem : elem.second) - { - auto it = modIncludeMod.find(setElem); - if (it != modIncludeMod.end()) - for (auto& toRem : it->second) - newSet.erase(toRem); - } - elem.second = newSet; - }*/ -} - SgExpression* makeExprList(const vector& items, bool withSort) { SgExpression* list = NULL; @@ -2847,59 +2647,6 @@ int getNextFreeLabel() return -1; } -static void addUseStatements(SgStatement* currF, SgStatement* obj, vector& useStats, - const vector& funcContains) -{ - for (auto& funcSt : funcContains) - { - if (currF == funcSt) - { - SgStatement* last = obj->lastNodeOfStmt(); - for (SgStatement* st = obj->lexNext(); st != last; st = st->lexNext()) - { - if (st->variant() == USE_STMT) - useStats.push_back(st); - else if (st->variant() == CONTAINS_STMT) - break; - } - break; - } - } -} - -void fillUsedModulesInFunction(SgStatement *st, vector &useStats) -{ - checkNull(st, convertFileName(__FILE__).c_str(), __LINE__); - - int var = st->variant(); - while (var != PROG_HEDR && var != PROC_HEDR && var != FUNC_HEDR) - { - st = st->controlParent(); - checkNull(st, convertFileName(__FILE__).c_str(), __LINE__); - var = st->variant(); - } - - for (SgStatement *stat = st->lexNext(); !isSgExecutableStatement(stat); stat = stat->lexNext()) - if (stat->variant() == USE_STMT) - useStats.push_back(stat); - - for (int i = 0; i < current_file->numberOfFunctions(); ++i) - { - vector funcContains; - findContainsFunctions(current_file->functions(i), funcContains); - addUseStatements(st, current_file->functions(i), useStats, funcContains); - } - - vector modules; - findModulesInFile(st->getFile(), modules); - for (auto &module : modules) - { - vector funcContains; - findContainsFunctions(module, funcContains, true); - addUseStatements(st, module, useStats, funcContains); - } -} - static void recFillUsedVars(SgExpression *exp, map &vars) { if (exp) diff --git a/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.h b/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.h index ff889c0..5ce6d38 100644 --- a/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.h +++ b/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.h @@ -5,6 +5,7 @@ #include "../Distribution/Distribution.h" #include "../GraphCall/graph_calls.h" #include "../DynamicAnalysis/gcov_info.h" +#include "module_utils.h" SgStatement* declaratedInStmt(SgSymbol *toFind, std::vector *allDecls = NULL, bool printInternal = true, SgStatement* scope = NULL); @@ -15,8 +16,7 @@ std::string removeIncludeStatsAndUnparse(SgFile *file, const char *fileName, con SgSymbol* findSymbolOrCreate(SgFile *file, const std::string toFind, SgType *type = NULL, SgStatement *scope = NULL); void recExpressionPrint(SgExpression *exp); void removeSubstrFromStr(std::string &str, const std::string &del); -void getModulesAndFunctions(SgFile *file, std::vector &modulesAndFunctions); -void findModulesInFile(SgFile *file, std::vector &modules); + void tryToFindPrivateInAttributes(SgStatement* st, std::set& privatesVars, bool onlyReduction = false, bool onlyUsers = false); void fillNonDistrArraysAsPrivate(SgStatement *st, @@ -60,15 +60,11 @@ const CommonBlock* isArrayInCommon(const std::map &co std::vector fillArraysFromDir(Statement *st); -SgSymbol* getFromModule(const std::map> &byUse, SgSymbol *orig, bool processAsModule = false); -std::map> createMapOfModuleUses(SgFile* file); void printSymbolTable(SgFile* file, std::string filter = "", const std::set& vars = {}); SgStatement* getFuncStat(SgStatement *st, const std::set additional = std::set()); std::map> createDefUseMapByPlace(); SgStatement* duplicateProcedure(SgStatement* toDup, const std::string* newName, bool withAttributes = false, bool withComment = false, bool withSameLines = true, bool dontInsert = false); -void fillModuleUse(SgFile* file, std::map>& moduleUses, std::map& moduleDecls); -void filterModuleUse(std::map>& moduleUses, std::map& moduleDecls); SgExpression* makeExprList(const std::vector& items, bool withSort = true); std::string unparseProjectToString(SgFile* file, const int curr_regime); @@ -77,7 +73,6 @@ std::vector makeDeclaration(const std::vector& symbolsT int getNextFreeLabel(); -void fillUsedModulesInFunction(SgStatement *st, std::vector &useStats); void fillVisibleInUseVariables(SgStatement *useSt, std::map &vars); std::string preprocDataString(std::string data, bool full = true); @@ -91,7 +86,13 @@ void getVariables(SgExpression* ex, std::set& variables, const std::s template std::set getAllVariables(SgStatement* stFrom, SgStatement* stTo, const std::set& variants); -SgProject* createProject(const char* proj_name, std::vector& parallelRegions, std::vector& subs_parallelRegions, std::map>& hiddenData, std::map& filesNameWithoutExt, std::map>& moduleUsesByFile, std::map& moduleDecls, std::map>>& exctactedModuleStats, bool printSymbTable); +SgProject* createProject(const char* proj_name, std::vector& parallelRegions, + std::vector& subs_parallelRegions, + std::map>& hiddenData, + std::map& filesNameWithoutExt, + std::map>& moduleUsesByFile, + std::map& moduleDecls, + std::map>>& exctactedModuleStats, bool printSymbTable); bool isArrayType(SgType* type); bool isArrayRef(SgExpression* ex); diff --git a/sapfor/experts/Sapfor_2017/_src/Utils/module_utils.cpp b/sapfor/experts/Sapfor_2017/_src/Utils/module_utils.cpp new file mode 100644 index 0000000..99df6ba --- /dev/null +++ b/sapfor/experts/Sapfor_2017/_src/Utils/module_utils.cpp @@ -0,0 +1,676 @@ + +#include +#include +#include +#include + +#include "dvm.h" +#include "errors.h" +#include "utils.h" +#include "../GraphCall/graph_calls_func.h" + +#include "module_utils.h" + +using std::vector; +using std::set; +using std::string; +using std::map; +using std::pair; +using std::make_pair; + +void findModulesInFile(SgFile* file, vector& modules) +{ + SgStatement* first = file->firstStatement(); + set functions; + + int funcNum = file->numberOfFunctions(); + for (int i = 0; i < funcNum; ++i) + functions.insert(file->functions(i)); + + while (first) + { + if (first->variant() == MODULE_STMT) + { + modules.push_back(first); + first = first->lastNodeOfStmt(); + } + else + { + if (functions.size()) + { + auto it = functions.find(first); + if (it != functions.end()) + first = (*it)->lastNodeOfStmt(); + } + } + + first = first->lexNext(); + } +} + +void getModulesAndFunctions(SgFile* file, vector& modulesAndFunctions) +{ + findModulesInFile(file, modulesAndFunctions); + + int funcNum = file->numberOfFunctions(); + for (int i = 0; i < funcNum; ++i) + modulesAndFunctions.push_back(file->functions(i)); +} + +SgSymbol* getFromModule(const map>& byUse, SgSymbol* orig, bool processAsModule) +{ + if (!processAsModule) + { + checkNull(orig->scope(), convertFileName(__FILE__).c_str(), __LINE__); + if (orig->scope()->variant() != MODULE_STMT) + return orig; + } + + if (byUse.size()) + { + for (auto& elem : byUse) + { + for (auto& localS : setToMapWithSortByStr(elem.second)) + if (OriginalSymbol(localS.second)->thesymb == orig->thesymb) + return localS.second; + } + } + return orig; +} + +map> createMapOfModuleUses(SgFile* file) +{ + map> retValMap; + + vector modules; + findModulesInFile(file, modules); + + for (int z = 0; z < modules.size(); ++z) + { + SgStatement* curr = modules[z]; + string modName = curr->symbol()->identifier(); + for (SgStatement* st = curr->lexNext(); st != curr->lastNodeOfStmt(); st = st->lexNext()) + { + if (st->variant() == USE_STMT) + retValMap[modName].insert(st->symbol()->identifier()); + else if (st->variant() == PROC_HEDR || st->variant() == FUNC_HEDR) + break; + } + } + + bool repeat = true; + while (repeat) + { + repeat = false; + for (auto& elem : retValMap) + { + set toAdd(elem.second); + for (auto& inUse : elem.second) + { + auto it = retValMap.find(inUse); + if (it != retValMap.end()) + { + for (auto& inUseToAdd : it->second) + { + if (toAdd.find(inUseToAdd) == toAdd.end()) + { + toAdd.insert(inUseToAdd); + repeat = true; + } + } + } + } + elem.second = toAdd; + } + } + + return retValMap; +} + +void fillModuleUse(SgFile* file, map>& moduleUses, map& moduleDecls) +{ + const string currFN = file->filename(); + for (SgStatement* st = file->firstStatement(); st; st = st->lexNext()) + { + if (st->fileName() == currFN) + { + if (st->variant() == USE_STMT) + moduleUses[currFN].insert(st->symbol()->identifier()); + + if (st->variant() == MODULE_STMT) + { + string moduleN = st->symbol()->identifier(); + auto it = moduleDecls.find(moduleN); + if (it != moduleDecls.end()) + printInternalError(convertFileName(__FILE__).c_str(), __LINE__); + moduleDecls[moduleN] = currFN; + } + } + } +} + +void filterModuleUse(map>& moduleUsesByFile, map& moduleDecls) +{ + for (auto& elem : moduleUsesByFile) + { + set newSet; + for (auto& setElem : elem.second) + { + auto it = moduleDecls.find(setElem); + if (it == moduleDecls.end()) + newSet.insert(setElem); + else if (elem.first != it->second) + newSet.insert(setElem); + } + elem.second = newSet; + } + + /*map> modIncludeMod; + + for (auto& mod : moduleDecls) + { + string name = mod.first; + string file = mod.second; + + auto it = moduleUsesByFile.find(file); + if (it != moduleUsesByFile.end()) + modIncludeMod[name] = it->second; + } + + bool change = true; + while (change) + { + change = false; + for (auto& mod : modIncludeMod) + { + set newSet = mod.second; + for (auto& included : mod.second) + { + auto it = modIncludeMod.find(included); + if (it == modIncludeMod.end()) + continue; + + for (auto& elem : it->second) + { + if (newSet.find(elem) == newSet.end()) + { + newSet.insert(elem); + change = true; + } + } + } + mod.second = newSet; + } + } + + for (auto& elem : moduleUsesByFile) + { + set newSet = elem.second; + for (auto& setElem : elem.second) + { + auto it = modIncludeMod.find(setElem); + if (it != modIncludeMod.end()) + for (auto& toRem : it->second) + newSet.erase(toRem); + } + elem.second = newSet; + }*/ +} + +static void addUseStatements(SgStatement* currF, SgStatement* obj, vector& useStats, + const vector& funcContains) +{ + for (auto& funcSt : funcContains) + { + if (currF == funcSt) + { + SgStatement* last = obj->lastNodeOfStmt(); + for (SgStatement* st = obj->lexNext(); st != last; st = st->lexNext()) + { + if (st->variant() == USE_STMT) + useStats.push_back(st); + else if (st->variant() == CONTAINS_STMT) + break; + } + break; + } + } +} + +void fillUsedModulesInFunction(SgStatement* st, vector& useStats) +{ + checkNull(st, convertFileName(__FILE__).c_str(), __LINE__); + + int var = st->variant(); + while (var != PROG_HEDR && var != PROC_HEDR && var != FUNC_HEDR) + { + st = st->controlParent(); + checkNull(st, convertFileName(__FILE__).c_str(), __LINE__); + var = st->variant(); + } + + for (SgStatement* stat = st->lexNext(); !isSgExecutableStatement(stat); stat = stat->lexNext()) + if (stat->variant() == USE_STMT) + useStats.push_back(stat); + + for (int i = 0; i < current_file->numberOfFunctions(); ++i) + { + vector funcContains; + findContainsFunctions(current_file->functions(i), funcContains); + addUseStatements(st, current_file->functions(i), useStats, funcContains); + } + + vector modules; + findModulesInFile(st->getFile(), modules); + for (auto& module : modules) + { + vector funcContains; + findContainsFunctions(module, funcContains, true); + addUseStatements(st, module, useStats, funcContains); + } +} + +static void findByUse(map>>& modByUse, const string& varName, + const set& locNames, vector& altNames) +{ + for (auto& elem : modByUse) + { + if (locNames.count(elem.first)) + { + for (auto& byUse : elem.second) + { + SgSymbol* toCmp = byUse.second ? byUse.second : byUse.first; + checkNull(toCmp, convertFileName(__FILE__).c_str(), __LINE__); + if (toCmp->identifier() == varName) + altNames.push_back(byUse.first->identifier()); + } + } + } +} + +static void fillInfo(SgStatement* start, + set& useMod, + map>>& modByUse, + map>>& modByUseOnly) +{ + for (SgStatement* st = start; st != start->lastNodeOfStmt(); st = st->lexNext()) + { + if (isSgExecutableStatement(st)) + break; + if (st->variant() == CONTAINS_STMT) + break; + if (st != start && (st->variant() == PROC_HEDR || st->variant() == FUNC_HEDR)) + break; + fillUseStatement(st, useMod, modByUse, modByUseOnly); + } +} + +static SgStatement* findModWithName(const vector& modules, const string& name) +{ + for (auto& elem : modules) + if (elem->variant() == MODULE_STMT) + if (elem->symbol()->identifier() == name) + return elem; + return NULL; +} + +string getNameByUse(SgStatement* place, const string& varName, const string& locName) +{ + int old_id = -1; + string oldFileName = ""; + if (place->getFileId() != current_file_id) + { + old_id = current_file_id; + oldFileName = current_file->filename(); + if (!place->switchToFile()) + printInternalError(convertFileName(__FILE__).c_str(), __LINE__); + } + + SgStatement* func = getFuncStat(place, { MODULE_STMT }); + string returnVal = varName; + if (func != NULL) + { + map> graphUse; + + set useMod; + map>> modByUse; + map>> modByUseOnly; + + fillInfo(func, useMod, modByUse, modByUseOnly); + SgStatement* cp = func->controlParent(); + if (isSgProgHedrStmt(cp) || cp->variant() == MODULE_STMT) // if function in contains region + fillInfo(cp, useMod, modByUse, modByUseOnly); + + set useModDone; + bool needRepeat = true; + + vector modules; + findModulesInFile(func->getFile(), modules); + + while (needRepeat) + { + needRepeat = false; + set newUseMod; + for (auto& useM : useMod) + { + if (useModDone.find(useM) == useModDone.end()) + { + auto modSt = findModWithName(modules, useM); + if (modSt == NULL || useM == "dvmh_template_mod") + continue; + + checkNull(modSt, convertFileName(__FILE__).c_str(), __LINE__); + + set tmpUse; + fillInfo(modSt, tmpUse, modByUse, modByUseOnly); + useModDone.insert(useM); + + for (auto& use : tmpUse) + { + newUseMod.insert(use); + + if (use != "dvmh_template_mod") + graphUse[use].insert(useM); + } + } + } + + for (auto& newU : newUseMod) + { + if (useModDone.find(newU) == useModDone.end()) + { + useModDone.insert(newU); + needRepeat = true; + } + } + } + + vector altNames; + findByUse(modByUse, varName, { locName }, altNames); + findByUse(modByUseOnly, varName, { locName }, altNames); + + if (altNames.size() == 0) + { + set locations = { locName }; + bool changed = true; + while (changed) + { + changed = false; + for (auto& loc : locations) + { + if (graphUse.find(loc) != graphUse.end()) + { + for (auto& use : graphUse[loc]) + { + if (locations.find(use) == locations.end()) + { + locations.insert(use); + changed = true; + } + } + } + } + } + + findByUse(modByUse, varName, locations, altNames); + findByUse(modByUseOnly, varName, locations, altNames); + } + + if (altNames.size() == 0) + returnVal = varName; + else if (altNames.size() >= 1) + { + set setAlt(altNames.begin(), altNames.end()); + returnVal = *setAlt.begin(); + } + else + printInternalError(convertFileName(__FILE__).c_str(), __LINE__); + } + + if (old_id != -1) + { + if (SgFile::switchToFile(oldFileName) == -1) + printInternalError(convertFileName(__FILE__).c_str(), __LINE__); + } + + return returnVal; +} + +void fixUseOnlyStmt(SgFile *file, const vector ®s) +{ + for (int z = 0; z < file->numberOfFunctions(); ++z) + { + vector modules; + findModulesInFile(file, modules); + map mod; + for (auto &elem : modules) + mod[elem->symbol()->identifier()] = elem; + + if (modules.size()) + { + SgStatement *func = file->functions(z); + bool hasTemplateUse = false; + set needToAdd; + + for (auto st = func; st != func->lastNodeOfStmt(); st = st->lexNext()) + { + if (isSgExecutableStatement(st)) + break; + + if (st->variant() == USE_STMT) + { + SgExpression *ex = st->expr(0); + string modName = st->symbol()->identifier(); + + auto it = mod.find(modName); + if (modName == "dvmh_Template_Mod") + { + hasTemplateUse = true; + break; + } + + if (ex && ex->variant() == ONLY_NODE && it != mod.end()) + { + set allS; + for (auto exI = ex->lhs(); exI; exI = exI->rhs()) + { + if (exI->lhs()->variant() == RENAME_NODE) + { + if (exI->lhs()->lhs()->symbol()) + allS.insert(exI->lhs()->lhs()->symbol()->identifier()); + if (exI->lhs()->rhs() && exI->lhs()->rhs()->symbol()) + allS.insert(exI->lhs()->rhs()->symbol()->identifier()); + } + } + + for (auto &parReg : regs) + { + const DataDirective &dataDir = parReg->GetDataDir(); + for (auto &rule : dataDir.distrRules) + { + DIST::Array *curr = rule.first; + auto location = curr->GetLocation(); + if (location.first == 2 && location.second == modName) + needToAdd.insert(curr); + } + + for (auto& rule : dataDir.alignRules) + { + DIST::Array* curr = rule.alignArray; + auto location = curr->GetLocation(); + if (location.first == 2 && location.second == modName) + needToAdd.insert(curr); + } + } + } + } + } + + if (!hasTemplateUse && needToAdd.size()) + { + SgStatement* useSt = new SgStatement(USE_STMT); + useSt->setSymbol(*findSymbolOrCreate(file, "dvmh_Template_Mod")); + useSt->setlineNumber(getNextNegativeLineNumber()); + + func->insertStmtAfter(*useSt, *func); + } + } + } +} + +void fillUseStatement(SgStatement *st, set &useMod, + map>> &modByUse, + map>> &modByUseOnly) +{ + if (st->variant() == USE_STMT) + { + SgExpression *ex = st->expr(0); + string modName = st->symbol()->identifier(); + convertToLower(modName); + useMod.insert(modName); + + if (ex) + { + SgExpression *start = ex; + bool only = false; + if (ex->variant() == ONLY_NODE) + { + start = ex->lhs(); + only = true; + } + + for (auto exI = start; exI; exI = exI->rhs()) + { + if (exI->lhs()->variant() == RENAME_NODE) + { + SgSymbol *left = NULL, *right = NULL; + if (exI->lhs()->lhs()->symbol()) + left = exI->lhs()->lhs()->symbol(); + if (exI->lhs()->rhs() && exI->lhs()->rhs()->symbol()) + right = exI->lhs()->rhs()->symbol(); + if (only) + modByUseOnly[modName].push_back(std::make_pair(left, right)); + else + modByUse[modName].push_back(std::make_pair(left, right)); + } + } + } + } +} + +static void fillUseStmt(SgStatement* stat, map>& byUse) +{ + if (stat->variant() != USE_STMT) + printInternalError(convertFileName(__FILE__).c_str(), __LINE__); + + SgExpression* ex = stat->expr(0); + if (ex && ex->variant() == ONLY_NODE) + { + for (auto exI = ex->lhs(); exI; exI = exI->rhs()) + { + if (exI->lhs()->variant() == RENAME_NODE) + { + SgExpression* ren = exI->lhs(); + if (ren->lhs()->symbol() && ren->rhs() && ren->rhs()->symbol()) + byUse[ren->rhs()->symbol()->identifier()].insert(ren->lhs()->symbol()); + } + } + } + else if (ex && ex->lhs()) + { + for (auto exI = ex; exI; exI = exI->rhs()) + { + if (exI->lhs()->variant() == RENAME_NODE) + { + SgExpression* ren = exI->lhs(); + if (ren->lhs()->symbol() && ren->rhs() && ren->rhs()->symbol()) + byUse[ren->rhs()->symbol()->identifier()].insert(ren->lhs()->symbol()); + } + } + } +} + +map> moduleRefsByUseInFunction(SgStatement* stIn) +{ + checkNull(stIn, convertFileName(__FILE__).c_str(), __LINE__); + + map> byUse; + int var = stIn->variant(); + while (var != PROG_HEDR && var != PROC_HEDR && var != FUNC_HEDR) + { + stIn = stIn->controlParent(); + if (stIn == NULL) + return byUse; + var = stIn->variant(); + } + + auto mapOfUses = createMapOfModuleUses(stIn->getFile()); + set useMods; + + for (SgStatement* stat = stIn->lexNext(); !isSgExecutableStatement(stat); stat = stat->lexNext()) + { + if (stat->variant() == USE_STMT) + { + fillUseStmt(stat, byUse); + useMods.insert(stat->symbol()->identifier()); + } + } + + const int cpOfSt = stIn->controlParent()->variant(); + //contains of func + if (cpOfSt == PROG_HEDR || cpOfSt == PROC_HEDR || cpOfSt == FUNC_HEDR) + { + for (SgStatement* stat = stIn->controlParent()->lexNext(); !isSgExecutableStatement(stat); stat = stat->lexNext()) + { + if (stat->variant() == USE_STMT) + { + fillUseStmt(stat, byUse); + useMods.insert(stat->symbol()->identifier()); + } + } + } + + bool chages = true; + while (chages) + { + chages = false; + set newUseMods(useMods); + for (auto& elem : useMods) + { + auto it = mapOfUses.find(elem); + if (it != mapOfUses.end()) + { + for (auto& elem2 : it->second) + { + if (newUseMods.find(elem2) == newUseMods.end()) + { + newUseMods.insert(elem2); + chages = true; + } + } + } + } + useMods = newUseMods; + } + + vector modules; + findModulesInFile(stIn->getFile(), modules); + for (auto& mod : modules) + { + if (useMods.find(mod->symbol()->identifier()) != useMods.end()) + { + for (SgStatement* stat = mod->lexNext(); stat != mod->lastNodeOfStmt(); stat = stat->lexNext()) + { + const int var = stat->variant(); + if (var == USE_STMT) + { + fillUseStmt(stat, byUse); + useMods.insert(stat->symbol()->identifier()); + } + else if (var == PROC_HEDR || var == FUNC_HEDR) + break; + } + } + } + return byUse; +} diff --git a/sapfor/experts/Sapfor_2017/_src/Utils/version.h b/sapfor/experts/Sapfor_2017/_src/Utils/version.h index bbc2467..94a1911 100644 --- a/sapfor/experts/Sapfor_2017/_src/Utils/version.h +++ b/sapfor/experts/Sapfor_2017/_src/Utils/version.h @@ -1,3 +1,3 @@ #pragma once -#define VERSION_SPF "2388" +#define VERSION_SPF "2389" diff --git a/sapfor/experts/Sapfor_2017/_src/VerificationCode/CorrectVarDecl.cpp b/sapfor/experts/Sapfor_2017/_src/VerificationCode/CorrectVarDecl.cpp index 69d2607..96625f0 100644 --- a/sapfor/experts/Sapfor_2017/_src/VerificationCode/CorrectVarDecl.cpp +++ b/sapfor/experts/Sapfor_2017/_src/VerificationCode/CorrectVarDecl.cpp @@ -60,128 +60,6 @@ void VarDeclCorrecter(SgFile *file) } } -void fixUseOnlyStmt(SgFile *file, const vector ®s) -{ - for (int z = 0; z < file->numberOfFunctions(); ++z) - { - vector modules; - findModulesInFile(file, modules); - map mod; - for (auto &elem : modules) - mod[elem->symbol()->identifier()] = elem; - - if (modules.size()) - { - SgStatement *func = file->functions(z); - bool hasTemplateUse = false; - set needToAdd; - - for (auto st = func; st != func->lastNodeOfStmt(); st = st->lexNext()) - { - if (isSgExecutableStatement(st)) - break; - - if (st->variant() == USE_STMT) - { - SgExpression *ex = st->expr(0); - string modName = st->symbol()->identifier(); - - auto it = mod.find(modName); - if (modName == "dvmh_Template_Mod") - { - hasTemplateUse = true; - break; - } - - if (ex && ex->variant() == ONLY_NODE && it != mod.end()) - { - set allS; - for (auto exI = ex->lhs(); exI; exI = exI->rhs()) - { - if (exI->lhs()->variant() == RENAME_NODE) - { - if (exI->lhs()->lhs()->symbol()) - allS.insert(exI->lhs()->lhs()->symbol()->identifier()); - if (exI->lhs()->rhs() && exI->lhs()->rhs()->symbol()) - allS.insert(exI->lhs()->rhs()->symbol()->identifier()); - } - } - - for (auto &parReg : regs) - { - const DataDirective &dataDir = parReg->GetDataDir(); - for (auto &rule : dataDir.distrRules) - { - DIST::Array *curr = rule.first; - auto location = curr->GetLocation(); - if (location.first == 2 && location.second == modName) - needToAdd.insert(curr); - } - - for (auto& rule : dataDir.alignRules) - { - DIST::Array* curr = rule.alignArray; - auto location = curr->GetLocation(); - if (location.first == 2 && location.second == modName) - needToAdd.insert(curr); - } - } - } - } - } - - if (!hasTemplateUse && needToAdd.size()) - { - SgStatement* useSt = new SgStatement(USE_STMT); - useSt->setSymbol(*findSymbolOrCreate(file, "dvmh_Template_Mod")); - useSt->setlineNumber(getNextNegativeLineNumber()); - - func->insertStmtAfter(*useSt, *func); - } - } - } -} - -void fillUseStatement(SgStatement *st, set &useMod, - map>> &modByUse, - map>> &modByUseOnly) -{ - if (st->variant() == USE_STMT) - { - SgExpression *ex = st->expr(0); - string modName = st->symbol()->identifier(); - convertToLower(modName); - useMod.insert(modName); - - if (ex) - { - SgExpression *start = ex; - bool only = false; - if (ex->variant() == ONLY_NODE) - { - start = ex->lhs(); - only = true; - } - - for (auto exI = start; exI; exI = exI->rhs()) - { - if (exI->lhs()->variant() == RENAME_NODE) - { - SgSymbol *left = NULL, *right = NULL; - if (exI->lhs()->lhs()->symbol()) - left = exI->lhs()->lhs()->symbol(); - if (exI->lhs()->rhs() && exI->lhs()->rhs()->symbol()) - right = exI->lhs()->rhs()->symbol(); - if (only) - modByUseOnly[modName].push_back(std::make_pair(left, right)); - else - modByUse[modName].push_back(std::make_pair(left, right)); - } - } - } - } -} - struct ModuleInfo { set useMod; diff --git a/sapfor/experts/Sapfor_2017/_src/VerificationCode/verifications.h b/sapfor/experts/Sapfor_2017/_src/VerificationCode/verifications.h index 65a6fc5..ae62d4f 100644 --- a/sapfor/experts/Sapfor_2017/_src/VerificationCode/verifications.h +++ b/sapfor/experts/Sapfor_2017/_src/VerificationCode/verifications.h @@ -38,7 +38,6 @@ void resolveFunctionCalls(SgFile* file, const std::set& toResolve, bool checkAndMoveFormatOperators(SgFile* file, std::vector &currMessage, bool withError = true); int VerifyFile(SgFile *file); -void fixUseOnlyStmt(SgFile *file, const std::vector ®s); void correctModuleProcNames(SgFile *file, const std::set& globalF); void correctModuleSymbols(SgFile *file); void replaceStructuresToSimpleTypes(SgFile* file); @@ -48,7 +47,6 @@ bool checkArgumentsDeclaration(SgProject *project, const std::map &derivedTypesDecl); bool isDerivedAssign(SgStatement *st); std::map createDerivedTypeDeclMap(SgStatement *forS); -void fillUseStatement(SgStatement* st, std::set& useMod, std::map>>& modByUse, std::map>>& modByUseOnly); void removeExecutableFromModuleDeclaration(SgFile* current, const std::set& filesInProj, std::vector& hiddenData); bool needToReplaceInterfaceName(SgStatement* interf);