fixed dead flag for functions

Author: ALEXks
Date: 2025-02-09 20:48:06 +03:00
Committed by: Dudarenko
parent c6b09ad285
commit 68c779790d
19 changed files with 897 additions and 241 deletions
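The two excerpts shown here are two of the 19 changed files: the DVMH-generated CUDA handler source for the CG benchmark and the Fortran source it is generated from. In the kernels for the loops at source lines 558 and 605, the p and z vectors are no longer passed through remote-access buffers (_p_rma, _z_rma) but as ordinary device arrays, and the per-thread warp and lane indices (tid, lid) are computed with a literal 32 instead of warpSize. The kernels implement a warp-per-row sparse matrix-vector product with a warp-level sum reduction; below is a minimal sketch of that pattern, assuming a C-style 0-based CSR layout and a fixed warp width of 32. The names warp_reduce_sum and spmv_warp_per_row are illustrative only and are not part of the DVMH runtime.

// Minimal sketch (not the generated code) of the warp-per-row CSR
// matrix-vector product these kernels follow; a 0-based CSR layout and
// a fixed warp width of 32 are assumed.
__device__ double warp_reduce_sum(double v)
{
    // Combine the 32 lane-private partial sums with shuffle operations.
    for (int offset = 16; offset > 0; offset >>= 1)
        v += __shfl_down_sync(0xffffffffu, v, offset);
    return v;
}

__global__ void spmv_warp_per_row(const double *a, const int *colidx,
                                  const int *rowstr, const double *p,
                                  double *q, int nrows)
{
    int gid  = blockIdx.x * blockDim.x + threadIdx.x;
    int row  = gid / 32;          // one warp per matrix row
    int lane = gid % 32;          // lane index within the warp
    if (row >= nrows) return;

    double sum = 0.0;
    // Each lane strides over the nonzeros of its row.
    for (int k = rowstr[row] + lane; k < rowstr[row + 1]; k += 32)
        sum += a[k] * p[colidx[k]];

    sum = warp_reduce_sum(sum);   // reduce the partial sums across the warp
    if (lane == 0)
        q[row] = sum;             // lane 0 stores the row result
}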


@@ -487,7 +487,7 @@ typedef long long __indexTypeLLong;
//--------------------- Kernel for loop on line 558 ---------------------
-__global__ void loop_cg_558_cuda_kernel_int(double _p_rma[], double _q[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+__global__ void loop_cg_558_cuda_kernel_int(double _q[], double _p[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
{
// Private variables
@@ -496,8 +496,8 @@ typedef long long __indexTypeLLong;
int cond_0;
int __k;
int gid = blockIdx.x * blockDim.x + threadIdx.x;
-int tid = gid / warpSize;
-int lid = gid % warpSize;
+int tid = gid / 32;
+int lid = gid % 32;
// Local needs
__indexTypeInt _j;
@@ -520,7 +520,7 @@ typedef long long __indexTypeLLong;
__k < cond_0 ;
_k = _k + warpSize, __k = __k + warpSize)
{
-_sum = _p_rma[_colidx[_k]] * _a[_k] + _sum;
+_sum = _p[_colidx[_k]] * _a[_k] + _sum;
}
_sum = __dvmh_warpReduceSum(_sum);
if (lid == 0) {
@@ -532,7 +532,7 @@ typedef long long __indexTypeLLong;
//--------------------- Kernel for loop on line 558 ---------------------
-__global__ void loop_cg_558_cuda_kernel_llong(double _p_rma[], double _q[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+__global__ void loop_cg_558_cuda_kernel_llong(double _q[], double _p[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
{
// Private variables
@@ -541,8 +541,8 @@ typedef long long __indexTypeLLong;
int cond_0;
int __k;
int gid = blockIdx.x * blockDim.x + threadIdx.x;
-int tid = gid / warpSize;
-int lid = gid % warpSize;
+int tid = gid / 32;
+int lid = gid % 32;
// Local needs
__indexTypeLLong _j;
__indexTypeLLong rest_blocks, cur_blocks;
@@ -564,7 +564,7 @@ typedef long long __indexTypeLLong;
__k < cond_0 ;
_k = _k + warpSize, __k = __k + warpSize)
{
-_sum = _p_rma[_colidx[_k]] * _a[_k] + _sum;
+_sum = _p[_colidx[_k]] * _a[_k] + _sum;
}
_sum = __dvmh_warpReduceSum(_sum);
if (lid == 0) {
@@ -752,7 +752,7 @@ typedef long long __indexTypeLLong;
//--------------------- Kernel for loop on line 605 ---------------------
-__global__ void loop_cg_605_cuda_kernel_int(double _z_rma[], double _r[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+__global__ void loop_cg_605_cuda_kernel_int(double _r[], double _z[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
{
// Private variables
@@ -761,8 +761,8 @@ typedef long long __indexTypeLLong;
int cond_0;
int __k;
int gid = blockIdx.x * blockDim.x + threadIdx.x;
-int tid = gid / warpSize;
-int lid = gid % warpSize;
+int tid = gid / 32;
+int lid = gid % 32;
// Local needs
__indexTypeInt _j;
__indexTypeInt rest_blocks, cur_blocks;
@@ -784,7 +784,7 @@ typedef long long __indexTypeLLong;
__k < cond_0 ;
_k = _k + warpSize, __k = __k + warpSize)
{
-_d = _z_rma[_colidx[_k]] * _a[_k] + _d;
+_d = _z[_colidx[_k]] * _a[_k] + _d;
}
_d = __dvmh_warpReduceSum(_d);
if (lid == 0) {
@@ -796,7 +796,7 @@ typedef long long __indexTypeLLong;
//--------------------- Kernel for loop on line 605 ---------------------
-__global__ void loop_cg_605_cuda_kernel_llong(double _z_rma[], double _r[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+__global__ void loop_cg_605_cuda_kernel_llong(double _r[], double _z[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
{
// Private variables
@@ -805,8 +805,8 @@ typedef long long __indexTypeLLong;
int cond_0;
int __k;
int gid = blockIdx.x * blockDim.x + threadIdx.x;
-int tid = gid / warpSize;
-int lid = gid % warpSize;
+int tid = gid / 32;
+int lid = gid % 32;
// Local needs
__indexTypeLLong _j;
__indexTypeLLong rest_blocks, cur_blocks;
@@ -828,7 +828,7 @@ typedef long long __indexTypeLLong;
__k < cond_0 ;
_k = _k + warpSize, __k = __k + warpSize)
{
-_d = _z_rma[_colidx[_k]] * _a[_k] + _d;
+_d = _z[_colidx[_k]] * _a[_k] + _d;
}
_d = __dvmh_warpReduceSum(_d);
if (lid == 0) {
@@ -1727,10 +1727,11 @@ extern "C" {
// CUDA handler for loop on line 558
-void loop_cg_558_cuda_(DvmType *loop_ref, DvmType _p_rma[], DvmType _q[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
+void loop_cg_558_cuda_(DvmType *loop_ref, DvmType _q[], DvmType _p[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
{
-void *p_rma_base, *q_base, *colidx_base, *a_base, *rowstr_base;
-DvmType d_p_rma[4], d_q[4], d_colidx[4], d_a[4], d_rowstr[4];
+void *q_base, *p_base, *colidx_base, *a_base, *rowstr_base;
+DvmType d_q[4], d_p[4], d_colidx[4], d_a[4], d_rowstr[4];
DvmType idxTypeInKernel;
dim3 blocks, threads;
cudaStream_t stream;
@@ -1742,15 +1743,15 @@ extern "C" {
device_num = loop_get_device_num_(loop_ref);
// Get 'natural' bases
-p_rma_base = dvmh_get_natural_base(&device_num, _p_rma);
q_base = dvmh_get_natural_base(&device_num, _q);
+p_base = dvmh_get_natural_base(&device_num, _p);
colidx_base = dvmh_get_natural_base(&device_num, _colidx);
a_base = dvmh_get_natural_base(&device_num, _a);
rowstr_base = dvmh_get_natural_base(&device_num, _rowstr);
// Fill 'device' headers
-dvmh_fill_header_(&device_num, p_rma_base, _p_rma, d_p_rma);
dvmh_fill_header_(&device_num, q_base, _q, d_q);
+dvmh_fill_header_(&device_num, p_base, _p, d_p);
dvmh_fill_header_(&device_num, colidx_base, _colidx, d_colidx);
dvmh_fill_header_(&device_num, a_base, _a, d_a);
dvmh_fill_header_(&device_num, rowstr_base, _rowstr, d_rowstr);
@@ -1778,8 +1779,8 @@ extern "C" {
}
loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
-overallBlocks = blocksS[0];
-restBlocks = overallBlocks * dvmh_get_warp_size(loop_ref);
+overallBlocks = blocksS[0]* dvmh_get_warp_size(loop_ref);;
+restBlocks = overallBlocks;
addBlocks = 0;
blocks = dim3(1, 1, 1);
maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
@@ -1798,11 +1799,11 @@ extern "C" {
}
if (idxTypeInKernel == rt_INT)
{
-loop_cg_558_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)p_rma_base, (double *)q_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
+loop_cg_558_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)q_base, (double *)p_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
}
else
{
-loop_cg_558_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)p_rma_base, (double *)q_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
+loop_cg_558_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)q_base, (double *)p_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
}
addBlocks += blocks.x;
restBlocks -= blocks.x;
@@ -2098,10 +2099,10 @@ extern "C" {
// CUDA handler for loop on line 605
-void loop_cg_605_cuda_(DvmType *loop_ref, DvmType _z_rma[], DvmType _r[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
+void loop_cg_605_cuda_(DvmType *loop_ref, DvmType _r[], DvmType _z[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
{
-void *z_rma_base, *r_base, *colidx_base, *a_base, *rowstr_base;
-DvmType d_z_rma[4], d_r[4], d_colidx[4], d_a[4], d_rowstr[4];
+void *r_base, *z_base, *colidx_base, *a_base, *rowstr_base;
+DvmType d_r[4], d_z[4], d_colidx[4], d_a[4], d_rowstr[4];
DvmType idxTypeInKernel;
dim3 blocks, threads;
cudaStream_t stream;
@@ -2113,15 +2114,15 @@ extern "C" {
device_num = loop_get_device_num_(loop_ref);
// Get 'natural' bases
-z_rma_base = dvmh_get_natural_base(&device_num, _z_rma);
r_base = dvmh_get_natural_base(&device_num, _r);
+z_base = dvmh_get_natural_base(&device_num, _z);
colidx_base = dvmh_get_natural_base(&device_num, _colidx);
a_base = dvmh_get_natural_base(&device_num, _a);
rowstr_base = dvmh_get_natural_base(&device_num, _rowstr);
// Fill 'device' headers
-dvmh_fill_header_(&device_num, z_rma_base, _z_rma, d_z_rma);
dvmh_fill_header_(&device_num, r_base, _r, d_r);
+dvmh_fill_header_(&device_num, z_base, _z, d_z);
dvmh_fill_header_(&device_num, colidx_base, _colidx, d_colidx);
dvmh_fill_header_(&device_num, a_base, _a, d_a);
dvmh_fill_header_(&device_num, rowstr_base, _rowstr, d_rowstr);
@@ -2155,7 +2156,7 @@ extern "C" {
blocks = dim3(1, 1, 1);
maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
maxBlocks = maxBlocks / dvmh_get_warp_size(loop_ref) * dvmh_get_warp_size(loop_ref);
// GPU execution
while (restBlocks > 0)
{
@@ -2169,15 +2170,15 @@ extern "C" {
}
if (idxTypeInKernel == rt_INT)
{
-loop_cg_605_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)z_rma_base, (double *)r_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
+loop_cg_605_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)r_base, (double *)z_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
}
else
{
-loop_cg_605_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)z_rma_base, (double *)r_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
+loop_cg_605_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)r_base, (double *)z_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
}
addBlocks += blocks.x;
restBlocks -= blocks.x;
}
}
}
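Host-side, the handlers split a loop over several kernel launches whenever the required grid exceeds CUDA_MAX_GRID_X. Because every loop iteration is served by a whole warp, the block count has to be expressed in warp units: the change above makes overallBlocks carry the dvmh_get_warp_size factor so it agrees with restBlocks, and maxBlocks is kept at a multiple of the warp size so a launch never splits a warp's worth of blocks. A rough sketch of this chunked-launch logic follows; the helper name launch_in_chunks and the fixed warp width of 32 are assumptions for illustration, not DVMH API.

// Rough sketch of the chunked launch: as many blocks as the grid limit
// allows are launched per pass, and the kernel is told how many blocks
// have already been processed (add_blocks) so it can offset its rows.
#include <cuda_runtime.h>

void launch_in_chunks(long long rows, int threads_per_block,
                      long long max_grid_x, cudaStream_t stream)
{
    const long long warp = 32;
    long long total_blocks = (rows * warp + threads_per_block - 1) / threads_per_block;
    long long max_chunk = max_grid_x / warp * warp;   // keep launches warp-aligned
    if (max_chunk == 0) max_chunk = max_grid_x;       // fall back for tiny grid limits
    long long done = 0;
    while (done < total_blocks) {
        long long chunk = total_blocks - done;
        if (chunk > max_chunk) chunk = max_chunk;
        // spmv_warp_per_row<<<chunk, threads_per_block, 0, stream>>>(..., /*add_blocks=*/done);
        done += chunk;
    }
}

The second excerpt, below, is from the Fortran CG source from which these handlers are generated.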


@@ -550,11 +550,11 @@ c The conj grad iteration loop
c---->
c---------------------------------------------------------------------
do cgit = 1, cgitmax
d = 0.0d0
! DVM$ interval 11
CDVM$ region
-CDVM$ parallel (j) on p(j), private(sum,k), remote_access(p(:))
!WANR for many process, remote_access(p(:)) is needed
+CDVM$ parallel (j) on p(j), private(sum,k)
do j=1,lastrow-firstrow+1
sum = 0.d0
do k=rowstr(j),rowstr(j+1)-1
@@ -570,7 +570,7 @@ CDVM$ parallel (j) on q(j), reduction(SUM(d))
CDVM$ end region
alpha = rho / d
rho0 = rho
! DVM$ end interval
rho = 0.0d0
CDVM$ region
CDVM$ parallel (j) on r(j), private(d), reduction(SUM(rho))
@@ -598,10 +598,10 @@ c Compute residual norm explicitly: ||r|| = ||x - A.z||
c First, form A.z
c The partition submatrix-vector multiply
c---------------------------------------------------------------------
!WANR for many process, remote_access(z(:)) is needed
sum = 0.0d0
CDVM$ region
-CDVM$ parallel (j) on r(j), private(d,k),remote_access(z(:))
+CDVM$ parallel (j) on r(j), private(d,k)
do j=1,lastrow-firstrow+1
d = 0.d0
do k=rowstr(j),rowstr(j+1)-1
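In the Fortran source the remote_access(p(:)) and remote_access(z(:)) clauses are dropped from the CDVM$ parallel directives, which is why the generated handlers above now take p and z as ordinary device arrays instead of the _p_rma/_z_rma buffers. The !WANR comments kept in the source note that remote_access would still be needed when running on many processes, presumably because p(colidx(k)) and z(colidx(k)) can then refer to non-local elements of the distributed vectors.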