diff --git a/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cluster/cg.DVMH_cuda.cu_opt b/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cluster/cg.DVMH_cuda.cu_opt
new file mode 100644
index 0000000..90ed1a4
--- /dev/null
+++ b/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cluster/cg.DVMH_cuda.cu_opt
@@ -0,0 +1,2285 @@
+
+#include <dvmhlib_cuda.h>
+#define dcmplx2 Complex<double>
+#define cmplx2 Complex<float>
+typedef int __indexTypeInt; 
+typedef long long __indexTypeLLong;
+
+
+
+
+
+//--------------------- Kernel for loop on line 229 ---------------------
+
+      __global__ void   loop_cg_229_cuda_kernel_int(double _x[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+      {
+
+// Local needs
+         __indexTypeInt _i;
+         __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_i <= end_1) 
+         {
+
+// Loop body
+            _x[_i] = 1.0e0;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 229 ---------------------
+
+      __global__ void   loop_cg_229_cuda_kernel_llong(double _x[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+      {
+
+// Local needs
+         __indexTypeLLong _i;
+         __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_i <= end_1) 
+         {
+
+// Loop body
+            _x[_i] = 1.0e0;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 233 ---------------------
+
+      __global__ void   loop_cg_233_cuda_kernel_int(double _p[], double _r[], double _z[], double _q[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+      {
+
+// Local needs
+         __indexTypeInt _j;
+         __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _q[_j] = 0.0e0;
+            _z[_j] = 0.0e0;
+            _r[_j] = 0.0e0;
+            _p[_j] = 0.0e0;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 233 ---------------------
+
+      __global__ void   loop_cg_233_cuda_kernel_llong(double _p[], double _r[], double _z[], double _q[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+      {
+
+// Local needs
+         __indexTypeLLong _j;
+         __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _q[_j] = 0.0e0;
+            _z[_j] = 0.0e0;
+            _r[_j] = 0.0e0;
+            _p[_j] = 0.0e0;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 272 ---------------------
+
+      __global__ void   loop_cg_272_cuda_kernel_int(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+      {
+
+// Local needs
+         __indexTypeInt _j;
+         __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1;
+            _norm_temp2 = _z[_j] * _z[_j] + _norm_temp2;
+         }
+
+// Reduction
+         _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y);
+         _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1);
+         _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2);
+         if (_j % warpSize == 0) 
+         {
+            norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2;
+            norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 272 ---------------------
+
+      __global__ void   loop_cg_272_cuda_kernel_llong(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+      {
+
+// Local needs
+         __indexTypeLLong _j;
+         __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1;
+            _norm_temp2 = _z[_j] * _z[_j] + _norm_temp2;
+         }
+
+// Reduction
+         _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y);
+         _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1);
+         _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2);
+         if (_j % warpSize == 0) 
+         {
+            norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2;
+            norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 285 ---------------------
+
+      __global__ void   loop_cg_285_cuda_kernel_int(double _x[], double _z[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _norm_temp2)
+      {
+
+// Local needs
+         __indexTypeInt _j;
+         __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _x[_j] = _z[_j] * _norm_temp2;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 285 ---------------------
+
+      __global__ void   loop_cg_285_cuda_kernel_llong(double _x[], double _z[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _norm_temp2)
+      {
+
+// Local needs
+         __indexTypeLLong _j;
+         __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _x[_j] = _z[_j] * _norm_temp2;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 301 ---------------------
+
+      __global__ void   loop_cg_301_cuda_kernel_int(double _x[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+      {
+
+// Local needs
+         __indexTypeInt _i;
+         __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_i <= end_1) 
+         {
+
+// Loop body
+            _x[_i] = 1.0e0;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 301 ---------------------
+
+      __global__ void   loop_cg_301_cuda_kernel_llong(double _x[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+      {
+
+// Local needs
+         __indexTypeLLong _i;
+         __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _i = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_i <= end_1) 
+         {
+
+// Loop body
+            _x[_i] = 1.0e0;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 347 ---------------------
+
+      __global__ void   loop_cg_347_cuda_kernel_int(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+      {
+
+// Local needs
+         __indexTypeInt _j;
+         __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1;
+            _norm_temp2 = _z[_j] * _z[_j] + _norm_temp2;
+         }
+
+// Reduction
+         _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y);
+         _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1);
+         _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2);
+         if (_j % warpSize == 0) 
+         {
+            norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2;
+            norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 347 ---------------------
+
+      __global__ void   loop_cg_347_cuda_kernel_llong(double _z[], double _x[], double _norm_temp1, double norm_temp1_grid[], double _norm_temp2, double norm_temp2_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+      {
+
+// Local needs
+         __indexTypeLLong _j;
+         __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _norm_temp1 = _z[_j] * _x[_j] + _norm_temp1;
+            _norm_temp2 = _z[_j] * _z[_j] + _norm_temp2;
+         }
+
+// Reduction
+         _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y);
+         _norm_temp1 = __dvmh_blockReduceSum(_norm_temp1);
+         _norm_temp2 = __dvmh_blockReduceSum(_norm_temp2);
+         if (_j % warpSize == 0) 
+         {
+            norm_temp2_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp2;
+            norm_temp1_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _norm_temp1;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 367 ---------------------
+
+      __global__ void   loop_cg_367_cuda_kernel_int(double _x[], double _z[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _norm_temp2)
+      {
+
+// Local needs
+         __indexTypeInt _j;
+         __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _x[_j] = _z[_j] * _norm_temp2;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 367 ---------------------
+
+      __global__ void   loop_cg_367_cuda_kernel_llong(double _x[], double _z[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _norm_temp2)
+      {
+
+// Local needs
+         __indexTypeLLong _j;
+         __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _x[_j] = _z[_j] * _norm_temp2;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 522 ---------------------
+
+      __global__ void   loop_cg_522_cuda_kernel_int(double _p[], double _r[], double _x[], double _z[], double _q[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+      {
+
+// Private variables
+         double _d;
+
+// Local needs
+         __indexTypeInt _j;
+         __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _q[_j] = 0.0e0;
+            _z[_j] = 0.0e0;
+            _d = _x[_j];
+            _r[_j] = _d;
+            _p[_j] = _d;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 522 ---------------------
+
+      __global__ void   loop_cg_522_cuda_kernel_llong(double _p[], double _r[], double _x[], double _z[], double _q[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+      {
+
+// Private variables
+         double _d;
+
+// Local needs
+         __indexTypeLLong _j;
+         __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _q[_j] = 0.0e0;
+            _z[_j] = 0.0e0;
+            _d = _x[_j];
+            _r[_j] = _d;
+            _p[_j] = _d;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 537 ---------------------
+
+      __global__ void   loop_cg_537_cuda_kernel_int(double _r[], double _rho, double rho_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+      {
+
+// Local needs
+         __indexTypeInt _j;
+         __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _rho = _r[_j] * _r[_j] + _rho;
+         }
+
+// Reduction
+         _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y);
+         _rho = __dvmh_blockReduceSum(_rho);
+         if (_j % warpSize == 0) 
+         {
+            rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 537 ---------------------
+
+      __global__ void   loop_cg_537_cuda_kernel_llong(double _r[], double _rho, double rho_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+      {
+
+// Local needs
+         __indexTypeLLong _j;
+         __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _rho = _r[_j] * _r[_j] + _rho;
+         }
+
+// Reduction
+         _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y);
+         _rho = __dvmh_blockReduceSum(_rho);
+         if (_j % warpSize == 0) 
+         {
+            rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 558 ---------------------
+
+      __global__ void   loop_cg_558_cuda_kernel_int(double _p_rma[], double _q[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+      {
+
+// Private variables
+         int _k;
+         double _sum;
+         int cond_0;
+         int __k;
+         int gid = blockIdx.x * blockDim.x + threadIdx.x;
+         int tid = gid / warpSize;
+         int lid = gid % warpSize;
+         
+// Local needs
+         __indexTypeInt _j;
+         __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize;
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _sum = 0.e0;
+            for (_k = _rowstr[_j] + lid, 
+                    (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? 
+                        cond_0 = (-1) : 
+                        cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1),
+                    __k = 0 + lid ; 
+                 __k < cond_0 ; 
+                 _k = _k + warpSize, __k = __k + warpSize)
+            {
+               _sum = _p_rma[_colidx[_k]] * _a[_k] + _sum;
+            }
+            _sum = __dvmh_warpReduceSum(_sum);
+            if (lid == 0) {
+                _q[_j] = _sum;
+            }
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 558 ---------------------
+
+      __global__ void   loop_cg_558_cuda_kernel_llong(double _p_rma[], double _q[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+      {
+
+// Private variables
+         int _k;
+         double _sum;
+         int cond_0;
+         int __k;
+         int gid = blockIdx.x * blockDim.x + threadIdx.x;
+         int tid = gid / warpSize;
+         int lid = gid % warpSize;
+// Local needs
+         __indexTypeLLong _j;
+         __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize;
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _sum = 0.e0;
+            for (_k = _rowstr[_j] + lid, 
+                    (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? 
+                        cond_0 = (-1) : 
+                        cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1),
+                    __k = 0 + lid ; 
+                 __k < cond_0 ; 
+                 _k = _k + warpSize, __k = __k + warpSize)
+            {
+               _sum = _p_rma[_colidx[_k]] * _a[_k] + _sum;
+            }
+            _sum = __dvmh_warpReduceSum(_sum);
+            if (lid == 0) {
+                _q[_j] = _sum;
+            }
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 567 ---------------------
+
+      __global__ void   loop_cg_567_cuda_kernel_int(double _q[], double _p[], double _d, double d_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+      {
+
+// Local needs
+         __indexTypeInt _j;
+         __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _d = _q[_j] * _p[_j] + _d;
+         }
+
+// Reduction
+         _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y);
+         _d = __dvmh_blockReduceSum(_d);
+         if (_j % warpSize == 0) 
+         {
+            d_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _d;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 567 ---------------------
+
+      __global__ void   loop_cg_567_cuda_kernel_llong(double _q[], double _p[], double _d, double d_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+      {
+
+// Local needs
+         __indexTypeLLong _j;
+         __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _d = _q[_j] * _p[_j] + _d;
+         }
+
+// Reduction
+         _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y);
+         _d = __dvmh_blockReduceSum(_d);
+         if (_j % warpSize == 0) 
+         {
+            d_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _d;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 577 ---------------------
+
+      __global__ void   loop_cg_577_cuda_kernel_int(double _q[], double _r[], double _p[], double _z[], double _rho, double rho_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _alpha)
+      {
+
+// Private variables
+         double _d;
+
+// Local needs
+         __indexTypeInt _j;
+         __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _z[_j] = _p[_j] * _alpha + _z[_j];
+            _d = (-(_alpha * _q[_j])) + _r[_j];
+            _r[_j] = _d;
+            _rho = _d * _d + _rho;
+         }
+
+// Reduction
+         _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y);
+         _rho = __dvmh_blockReduceSum(_rho);
+         if (_j % warpSize == 0) 
+         {
+            rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 577 ---------------------
+
+      __global__ void   loop_cg_577_cuda_kernel_llong(double _q[], double _r[], double _p[], double _z[], double _rho, double rho_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _alpha)
+      {
+
+// Private variables
+         double _d;
+
+// Local needs
+         __indexTypeLLong _j;
+         __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _z[_j] = _p[_j] * _alpha + _z[_j];
+            _d = (-(_alpha * _q[_j])) + _r[_j];
+            _r[_j] = _d;
+            _rho = _d * _d + _rho;
+         }
+
+// Reduction
+         _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y);
+         _rho = __dvmh_blockReduceSum(_rho);
+         if (_j % warpSize == 0) 
+         {
+            rho_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _rho;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 588 ---------------------
+
+      __global__ void   loop_cg_588_cuda_kernel_int(double _p[], double _r[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks, double _beta)
+      {
+
+// Local needs
+         __indexTypeInt _j;
+         __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _p[_j] = _p[_j] * _beta + _r[_j];
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 588 ---------------------
+
+      __global__ void   loop_cg_588_cuda_kernel_llong(double _p[], double _r[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks, double _beta)
+      {
+
+// Local needs
+         __indexTypeLLong _j;
+         __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _p[_j] = _p[_j] * _beta + _r[_j];
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 605 ---------------------
+
+      __global__ void   loop_cg_605_cuda_kernel_int(double _z_rma[], double _r[], int _colidx[], double _a[], int _rowstr[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+      {
+
+// Private variables
+         int _k;
+         double _d;
+         int cond_0;
+         int __k;
+         int gid = blockIdx.x * blockDim.x + threadIdx.x;
+         int tid = gid / warpSize;
+         int lid = gid % warpSize;
+// Local needs
+         __indexTypeInt _j;
+         __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize;
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _d = 0.e0;
+            for (_k = _rowstr[_j] + lid, 
+                 (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? 
+                    cond_0 = (-1) :
+                    cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1), 
+                    __k = 0  + lid;
+                  __k < cond_0 ; 
+                  _k = _k + warpSize, __k = __k + warpSize)
+            {
+               _d = _z_rma[_colidx[_k]] * _a[_k] + _d;
+            }
+            _d = __dvmh_warpReduceSum(_d);
+            if (lid == 0) {
+                _r[_j] = _d;
+            }
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 605 ---------------------
+
+      __global__ void   loop_cg_605_cuda_kernel_llong(double _z_rma[], double _r[], int _colidx[], double _a[], int _rowstr[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+      {
+
+// Private variables
+         int _k;
+         double _d;
+         int cond_0;
+         int __k;
+         int gid = blockIdx.x * blockDim.x + threadIdx.x;
+         int tid = gid / warpSize;
+         int lid = gid % warpSize;
+// Local needs
+         __indexTypeLLong _j;
+         __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x) / warpSize;
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _d = 0.e0;
+            for (_k = _rowstr[_j] + lid, 
+                 (_rowstr[_j] > _rowstr[_j + 1] - 1 && 1 > 0 || _rowstr[_j] < _rowstr[_j + 1] - 1 && 1 < 0) ? 
+                    cond_0 = (-1) :
+                    cond_0 = abs(_rowstr[_j] - (_rowstr[_j + 1] - 1)) + abs(1), 
+                    __k = 0  + lid;
+                  __k < cond_0 ; 
+                  _k = _k + warpSize, __k = __k + warpSize)
+            {
+               _d = _z_rma[_colidx[_k]] * _a[_k] + _d;
+            }
+            _d = __dvmh_warpReduceSum(_d);
+            if (lid == 0) {
+                _r[_j] = _d;
+            }
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 618 ---------------------
+
+      __global__ void   loop_cg_618_cuda_kernel_int(double _r[], double _x[], double _sum, double sum_grid[], __indexTypeInt begin_1, __indexTypeInt end_1, __indexTypeInt add_blocks)
+      {
+
+// Private variables
+         double _d;
+
+// Local needs
+         __indexTypeInt _j;
+         __indexTypeInt rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _d = (-_r[_j]) + _x[_j];
+            _sum = _d * _d + _sum;
+         }
+
+// Reduction
+         _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y);
+         _sum = __dvmh_blockReduceSum(_sum);
+         if (_j % warpSize == 0) 
+         {
+            sum_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _sum;
+         }
+      }
+
+
+//--------------------- Kernel for loop on line 618 ---------------------
+
+      __global__ void   loop_cg_618_cuda_kernel_llong(double _r[], double _x[], double _sum, double sum_grid[], __indexTypeLLong begin_1, __indexTypeLLong end_1, __indexTypeLLong add_blocks)
+      {
+
+// Private variables
+         double _d;
+
+// Local needs
+         __indexTypeLLong _j;
+         __indexTypeLLong rest_blocks, cur_blocks;
+
+// Calculate each thread's loop variables' values
+         rest_blocks = add_blocks + blockIdx.x;
+         cur_blocks = rest_blocks;
+         _j = begin_1 + (cur_blocks * blockDim.x + threadIdx.x);
+         if (_j <= end_1) 
+         {
+
+// Loop body
+            _d = (-_r[_j]) + _x[_j];
+            _sum = _d * _d + _sum;
+         }
+
+// Reduction
+         _j = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.x * blockDim.y);
+         _sum = __dvmh_blockReduceSum(_sum);
+         if (_j % warpSize == 0) 
+         {
+            sum_grid[(add_blocks + blockIdx.x) * (blockDim.x * blockDim.y * blockDim.z / warpSize) + _j / warpSize] = _sum;
+         }
+      }
+
+
+
+#ifdef _MS_F_
+#define loop_cg_229_cuda_ loop_cg_229_cuda
+#define loop_cg_233_cuda_ loop_cg_233_cuda
+#define loop_cg_272_cuda_ loop_cg_272_cuda
+#define loop_cg_285_cuda_ loop_cg_285_cuda
+#define loop_cg_301_cuda_ loop_cg_301_cuda
+#define loop_cg_347_cuda_ loop_cg_347_cuda
+#define loop_cg_367_cuda_ loop_cg_367_cuda
+#define loop_cg_522_cuda_ loop_cg_522_cuda
+#define loop_cg_537_cuda_ loop_cg_537_cuda
+#define loop_cg_558_cuda_ loop_cg_558_cuda
+#define loop_cg_567_cuda_ loop_cg_567_cuda
+#define loop_cg_577_cuda_ loop_cg_577_cuda
+#define loop_cg_588_cuda_ loop_cg_588_cuda
+#define loop_cg_605_cuda_ loop_cg_605_cuda
+#define loop_cg_618_cuda_ loop_cg_618_cuda
+#endif
+
+extern "C" {
+      extern  DvmType loop_cg_618_cuda_kernel_llong_regs, loop_cg_618_cuda_kernel_int_regs, loop_cg_605_cuda_kernel_llong_regs, loop_cg_605_cuda_kernel_int_regs, loop_cg_588_cuda_kernel_llong_regs, loop_cg_588_cuda_kernel_int_regs, loop_cg_577_cuda_kernel_llong_regs, loop_cg_577_cuda_kernel_int_regs, loop_cg_567_cuda_kernel_llong_regs, loop_cg_567_cuda_kernel_int_regs, loop_cg_558_cuda_kernel_llong_regs, loop_cg_558_cuda_kernel_int_regs, loop_cg_537_cuda_kernel_llong_regs, loop_cg_537_cuda_kernel_int_regs, loop_cg_522_cuda_kernel_llong_regs, loop_cg_522_cuda_kernel_int_regs, loop_cg_367_cuda_kernel_llong_regs, loop_cg_367_cuda_kernel_int_regs, loop_cg_347_cuda_kernel_llong_regs, loop_cg_347_cuda_kernel_int_regs, loop_cg_301_cuda_kernel_llong_regs, loop_cg_301_cuda_kernel_int_regs, loop_cg_285_cuda_kernel_llong_regs, loop_cg_285_cuda_kernel_int_regs, loop_cg_272_cuda_kernel_llong_regs, loop_cg_272_cuda_kernel_int_regs, loop_cg_233_cuda_kernel_llong_regs, loop_cg_233_cuda_kernel_int_regs, loop_cg_229_cuda_kernel_llong_regs, loop_cg_229_cuda_kernel_int_regs;
+
+
+//    CUDA handler for loop on line 229 
+
+      void   loop_cg_229_cuda_(DvmType *loop_ref, DvmType _x[])
+      {
+         void   *x_base;
+         DvmType d_x[4];
+         DvmType idxTypeInKernel;
+         dim3 blocks, threads;
+         cudaStream_t stream;
+         DvmType idxL[1], idxH[1], loopSteps[1];
+         DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+         DvmType device_num;
+
+// Get device number
+         device_num = loop_get_device_num_(loop_ref);
+
+// Get 'natural' bases
+         x_base = dvmh_get_natural_base(&device_num, _x);
+
+// Fill 'device' headers
+         dvmh_fill_header_(&device_num, x_base, _x, d_x);
+
+// Guess index type in CUDA kernel
+         idxTypeInKernel = loop_guess_index_type_(loop_ref);
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) 
+         {
+            idxTypeInKernel = rt_INT;
+         }
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) 
+         {
+            idxTypeInKernel = rt_LLONG;
+         }
+
+// Get CUDA configuration parameters
+         threads = dim3(0, 0, 0);
+         if (idxTypeInKernel == rt_INT) 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_229_cuda_kernel_int_regs, &threads, &stream, 0);
+         }
+         else 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_229_cuda_kernel_llong_regs, &threads, &stream, 0);
+         }
+         loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+         blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+         overallBlocks = blocksS[0];
+         restBlocks = overallBlocks;
+         addBlocks = 0;
+         blocks = dim3(1, 1, 1);
+         maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+         while (restBlocks > 0)
+         {
+            if (restBlocks <= maxBlocks) 
+            {
+               blocks = restBlocks;
+            }
+            else 
+            {
+               blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT) 
+            {
+               loop_cg_229_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)x_base, idxL[0], idxH[0], addBlocks);
+            }
+            else 
+            {
+               loop_cg_229_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)x_base, idxL[0], idxH[0], addBlocks);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+         }
+      }
+
+
+//    CUDA handler for loop on line 233 
+
+      void   loop_cg_233_cuda_(DvmType *loop_ref, DvmType _p[], DvmType _r[], DvmType _z[], DvmType _q[])
+      {
+         void   *p_base, *r_base, *z_base, *q_base;
+         DvmType d_p[4], d_r[4], d_z[4], d_q[4];
+         DvmType idxTypeInKernel;
+         dim3 blocks, threads;
+         cudaStream_t stream;
+         DvmType idxL[1], idxH[1], loopSteps[1];
+         DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+         DvmType device_num;
+
+// Get device number
+         device_num = loop_get_device_num_(loop_ref);
+
+// Get 'natural' bases
+         p_base = dvmh_get_natural_base(&device_num, _p);
+         r_base = dvmh_get_natural_base(&device_num, _r);
+         z_base = dvmh_get_natural_base(&device_num, _z);
+         q_base = dvmh_get_natural_base(&device_num, _q);
+
+// Fill 'device' headers
+         dvmh_fill_header_(&device_num, p_base, _p, d_p);
+         dvmh_fill_header_(&device_num, r_base, _r, d_r);
+         dvmh_fill_header_(&device_num, z_base, _z, d_z);
+         dvmh_fill_header_(&device_num, q_base, _q, d_q);
+
+// Guess index type in CUDA kernel
+         idxTypeInKernel = loop_guess_index_type_(loop_ref);
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) 
+         {
+            idxTypeInKernel = rt_INT;
+         }
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) 
+         {
+            idxTypeInKernel = rt_LLONG;
+         }
+
+// Get CUDA configuration parameters
+         threads = dim3(0, 0, 0);
+         if (idxTypeInKernel == rt_INT) 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_233_cuda_kernel_int_regs, &threads, &stream, 0);
+         }
+         else 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_233_cuda_kernel_llong_regs, &threads, &stream, 0);
+         }
+         loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+         blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+         overallBlocks = blocksS[0];
+         restBlocks = overallBlocks;
+         addBlocks = 0;
+         blocks = dim3(1, 1, 1);
+         maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+         while (restBlocks > 0)
+         {
+            if (restBlocks <= maxBlocks) 
+            {
+               blocks = restBlocks;
+            }
+            else 
+            {
+               blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT) 
+            {
+               loop_cg_233_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)p_base, (double *)r_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks);
+            }
+            else 
+            {
+               loop_cg_233_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)p_base, (double *)r_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+         }
+      }
+
+
+//    CUDA handler for loop on line 272 
+
+      void   loop_cg_272_cuda_(DvmType *loop_ref, DvmType _z[], DvmType _x[])
+      {
+         void   *z_base, *x_base;
+         DvmType d_z[4], d_x[4];
+         DvmType idxTypeInKernel;
+         dim3 blocks, threads;
+         cudaStream_t stream;
+         DvmType idxL[1], idxH[1], loopSteps[1];
+         DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+         void   *norm_temp2_grid;
+         double _norm_temp2;
+         void   *norm_temp1_grid;
+         double _norm_temp1;
+         DvmType red_num, num_of_red_blocks, fill_flag;
+         DvmType shared_mem;
+         DvmType device_num;
+
+// Get device number
+         device_num = loop_get_device_num_(loop_ref);
+
+// Register reduction for CUDA-execution
+         red_num = 1;
+         loop_cuda_register_red(loop_ref, red_num, &norm_temp1_grid, 0);
+         loop_red_init_(loop_ref, &red_num, &_norm_temp1, 0);
+         red_num = 2;
+         loop_cuda_register_red(loop_ref, red_num, &norm_temp2_grid, 0);
+         loop_red_init_(loop_ref, &red_num, &_norm_temp2, 0);
+
+// Get 'natural' bases
+         z_base = dvmh_get_natural_base(&device_num, _z);
+         x_base = dvmh_get_natural_base(&device_num, _x);
+
+// Fill 'device' headers
+         dvmh_fill_header_(&device_num, z_base, _z, d_z);
+         dvmh_fill_header_(&device_num, x_base, _x, d_x);
+
+// Guess index type in CUDA kernel
+         idxTypeInKernel = loop_guess_index_type_(loop_ref);
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) 
+         {
+            idxTypeInKernel = rt_INT;
+         }
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) 
+         {
+            idxTypeInKernel = rt_LLONG;
+         }
+
+// Get CUDA configuration parameters
+         threads = dim3(0, 0, 0);
+#ifdef CUDA_FERMI_ARCH
+         shared_mem = 8;
+#else
+         shared_mem = 0;
+#endif
+         if (idxTypeInKernel == rt_INT) 
+         {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_272_cuda_kernel_int_regs, &threads, &stream, &shared_mem);
+         }
+         else 
+         {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_272_cuda_kernel_llong_regs, &threads, &stream, &shared_mem);
+         }
+         loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+         blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+         overallBlocks = blocksS[0];
+         restBlocks = overallBlocks;
+         addBlocks = 0;
+         blocks = dim3(1, 1, 1);
+
+// Prepare reduction
+         num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref));
+         fill_flag = 0;
+         red_num = 1;
+         loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag);
+         red_num = 2;
+         loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag);
+         maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+         while (restBlocks > 0)
+         {
+            if (restBlocks <= maxBlocks) 
+            {
+               blocks = restBlocks;
+            }
+            else 
+            {
+               blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT) 
+            {
+               loop_cg_272_cuda_kernel_int<<<blocks, threads, shared_mem, stream>>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks);
+            }
+            else 
+            {
+               loop_cg_272_cuda_kernel_llong<<<blocks, threads, shared_mem, stream>>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+         }
+
+// Finish reduction
+         red_num = 1;
+         loop_red_finish(loop_ref, red_num);
+         red_num = 2;
+         loop_red_finish(loop_ref, red_num);
+      }
+
+
+//    CUDA handler for loop on line 285 
+
+      void   loop_cg_285_cuda_(DvmType *loop_ref, DvmType _x[], DvmType _z[], double *_norm_temp2)
+      {
+         void   *x_base, *z_base;
+         DvmType d_x[4], d_z[4];
+         DvmType idxTypeInKernel;
+         dim3 blocks, threads;
+         cudaStream_t stream;
+         DvmType idxL[1], idxH[1], loopSteps[1];
+         DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+         DvmType device_num;
+
+// Get device number
+         device_num = loop_get_device_num_(loop_ref);
+
+// Get 'natural' bases
+         x_base = dvmh_get_natural_base(&device_num, _x);
+         z_base = dvmh_get_natural_base(&device_num, _z);
+
+// Fill 'device' headers
+         dvmh_fill_header_(&device_num, x_base, _x, d_x);
+         dvmh_fill_header_(&device_num, z_base, _z, d_z);
+
+// Guess index type in CUDA kernel
+         idxTypeInKernel = loop_guess_index_type_(loop_ref);
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) 
+         {
+            idxTypeInKernel = rt_INT;
+         }
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) 
+         {
+            idxTypeInKernel = rt_LLONG;
+         }
+
+// Get CUDA configuration parameters
+         threads = dim3(0, 0, 0);
+         if (idxTypeInKernel == rt_INT) 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_285_cuda_kernel_int_regs, &threads, &stream, 0);
+         }
+         else 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_285_cuda_kernel_llong_regs, &threads, &stream, 0);
+         }
+         loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+         blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+         overallBlocks = blocksS[0];
+         restBlocks = overallBlocks;
+         addBlocks = 0;
+         blocks = dim3(1, 1, 1);
+         maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+         while (restBlocks > 0)
+         {
+            if (restBlocks <= maxBlocks) 
+            {
+               blocks = restBlocks;
+            }
+            else 
+            {
+               blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT) 
+            {
+               loop_cg_285_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2);
+            }
+            else 
+            {
+               loop_cg_285_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+         }
+      }
+
+
+//    CUDA handler for loop on line 301 
+
+      void   loop_cg_301_cuda_(DvmType *loop_ref, DvmType _x[])
+      {
+         void   *x_base;
+         DvmType d_x[4];
+         DvmType idxTypeInKernel;
+         dim3 blocks, threads;
+         cudaStream_t stream;
+         DvmType idxL[1], idxH[1], loopSteps[1];
+         DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+         DvmType device_num;
+
+// Get device number
+         device_num = loop_get_device_num_(loop_ref);
+
+// Get 'natural' bases
+         x_base = dvmh_get_natural_base(&device_num, _x);
+
+// Fill 'device' headers
+         dvmh_fill_header_(&device_num, x_base, _x, d_x);
+
+// Guess index type in CUDA kernel
+         idxTypeInKernel = loop_guess_index_type_(loop_ref);
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) 
+         {
+            idxTypeInKernel = rt_INT;
+         }
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) 
+         {
+            idxTypeInKernel = rt_LLONG;
+         }
+
+// Get CUDA configuration parameters
+         threads = dim3(0, 0, 0);
+         if (idxTypeInKernel == rt_INT) 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_301_cuda_kernel_int_regs, &threads, &stream, 0);
+         }
+         else 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_301_cuda_kernel_llong_regs, &threads, &stream, 0);
+         }
+         loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+         blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+         overallBlocks = blocksS[0];
+         restBlocks = overallBlocks;
+         addBlocks = 0;
+         blocks = dim3(1, 1, 1);
+         maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+         while (restBlocks > 0)
+         {
+            if (restBlocks <= maxBlocks) 
+            {
+               blocks = restBlocks;
+            }
+            else 
+            {
+               blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT) 
+            {
+               loop_cg_301_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)x_base, idxL[0], idxH[0], addBlocks);
+            }
+            else 
+            {
+               loop_cg_301_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)x_base, idxL[0], idxH[0], addBlocks);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+         }
+      }
+
+
+//    CUDA handler for loop on line 347 
+
+      void   loop_cg_347_cuda_(DvmType *loop_ref, DvmType _z[], DvmType _x[])
+      {
+         void   *z_base, *x_base;
+         DvmType d_z[4], d_x[4];
+         DvmType idxTypeInKernel;
+         dim3 blocks, threads;
+         cudaStream_t stream;
+         DvmType idxL[1], idxH[1], loopSteps[1];
+         DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+         void   *norm_temp2_grid;
+         double _norm_temp2;
+         void   *norm_temp1_grid;
+         double _norm_temp1;
+         DvmType red_num, num_of_red_blocks, fill_flag;
+         DvmType shared_mem;
+         DvmType device_num;
+
+// Get device number
+         device_num = loop_get_device_num_(loop_ref);
+
+// Register reduction for CUDA-execution
+         red_num = 1;
+         loop_cuda_register_red(loop_ref, red_num, &norm_temp1_grid, 0);
+         loop_red_init_(loop_ref, &red_num, &_norm_temp1, 0);
+         red_num = 2;
+         loop_cuda_register_red(loop_ref, red_num, &norm_temp2_grid, 0);
+         loop_red_init_(loop_ref, &red_num, &_norm_temp2, 0);
+
+// Get 'natural' bases
+         z_base = dvmh_get_natural_base(&device_num, _z);
+         x_base = dvmh_get_natural_base(&device_num, _x);
+
+// Fill 'device' headers
+         dvmh_fill_header_(&device_num, z_base, _z, d_z);
+         dvmh_fill_header_(&device_num, x_base, _x, d_x);
+
+// Guess index type in CUDA kernel
+         idxTypeInKernel = loop_guess_index_type_(loop_ref);
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) 
+         {
+            idxTypeInKernel = rt_INT;
+         }
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) 
+         {
+            idxTypeInKernel = rt_LLONG;
+         }
+
+// Get CUDA configuration parameters
+         threads = dim3(0, 0, 0);
+#ifdef CUDA_FERMI_ARCH
+         shared_mem = 8;
+#else
+         shared_mem = 0;
+#endif
+         if (idxTypeInKernel == rt_INT) 
+         {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_347_cuda_kernel_int_regs, &threads, &stream, &shared_mem);
+         }
+         else 
+         {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_347_cuda_kernel_llong_regs, &threads, &stream, &shared_mem);
+         }
+         loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+         blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+         overallBlocks = blocksS[0];
+         restBlocks = overallBlocks;
+         addBlocks = 0;
+         blocks = dim3(1, 1, 1);
+
+// Prepare reduction
+         num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref));
+         fill_flag = 0;
+         red_num = 1;
+         loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag);
+         red_num = 2;
+         loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag);
+         maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+         while (restBlocks > 0)
+         {
+            if (restBlocks <= maxBlocks) 
+            {
+               blocks = restBlocks;
+            }
+            else 
+            {
+               blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT) 
+            {
+               loop_cg_347_cuda_kernel_int<<<blocks, threads, shared_mem, stream>>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks);
+            }
+            else 
+            {
+               loop_cg_347_cuda_kernel_llong<<<blocks, threads, shared_mem, stream>>>((double *)z_base, (double *)x_base, _norm_temp1, (double *)norm_temp1_grid, _norm_temp2, (double *)norm_temp2_grid, idxL[0], idxH[0], addBlocks);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+         }
+
+// Finish reduction
+         red_num = 1;
+         loop_red_finish(loop_ref, red_num);
+         red_num = 2;
+         loop_red_finish(loop_ref, red_num);
+      }
+
+
+//    CUDA handler for loop on line 367 
+
+      void   loop_cg_367_cuda_(DvmType *loop_ref, DvmType _x[], DvmType _z[], double *_norm_temp2)
+      {
+         void   *x_base, *z_base;
+         DvmType d_x[4], d_z[4];
+         DvmType idxTypeInKernel;
+         dim3 blocks, threads;
+         cudaStream_t stream;
+         DvmType idxL[1], idxH[1], loopSteps[1];
+         DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+         DvmType device_num;
+
+// Get device number
+         device_num = loop_get_device_num_(loop_ref);
+
+// Get 'natural' bases
+         x_base = dvmh_get_natural_base(&device_num, _x);
+         z_base = dvmh_get_natural_base(&device_num, _z);
+
+// Fill 'device' headers
+         dvmh_fill_header_(&device_num, x_base, _x, d_x);
+         dvmh_fill_header_(&device_num, z_base, _z, d_z);
+
+// Guess index type in CUDA kernel
+         idxTypeInKernel = loop_guess_index_type_(loop_ref);
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) 
+         {
+            idxTypeInKernel = rt_INT;
+         }
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) 
+         {
+            idxTypeInKernel = rt_LLONG;
+         }
+
+// Get CUDA configuration parameters
+         threads = dim3(0, 0, 0);
+         if (idxTypeInKernel == rt_INT) 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_367_cuda_kernel_int_regs, &threads, &stream, 0);
+         }
+         else 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_367_cuda_kernel_llong_regs, &threads, &stream, 0);
+         }
+         loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+         blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+         overallBlocks = blocksS[0];
+         restBlocks = overallBlocks;
+         addBlocks = 0;
+         blocks = dim3(1, 1, 1);
+         maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+         while (restBlocks > 0)
+         {
+            if (restBlocks <= maxBlocks) 
+            {
+               blocks = restBlocks;
+            }
+            else 
+            {
+               blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT) 
+            {
+               loop_cg_367_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2);
+            }
+            else 
+            {
+               loop_cg_367_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)x_base, (double *)z_base, idxL[0], idxH[0], addBlocks, *_norm_temp2);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+         }
+      }
+
+
+//    CUDA handler for loop on line 522 
+
+      void   loop_cg_522_cuda_(DvmType *loop_ref, DvmType _p[], DvmType _r[], DvmType _x[], DvmType _z[], DvmType _q[])
+      {
+         void   *p_base, *r_base, *x_base, *z_base, *q_base;
+         DvmType d_p[4], d_r[4], d_x[4], d_z[4], d_q[4];
+         DvmType idxTypeInKernel;
+         dim3 blocks, threads;
+         cudaStream_t stream;
+         DvmType idxL[1], idxH[1], loopSteps[1];
+         DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+         DvmType device_num;
+
+// Get device number
+         device_num = loop_get_device_num_(loop_ref);
+
+// Get 'natural' bases
+         p_base = dvmh_get_natural_base(&device_num, _p);
+         r_base = dvmh_get_natural_base(&device_num, _r);
+         x_base = dvmh_get_natural_base(&device_num, _x);
+         z_base = dvmh_get_natural_base(&device_num, _z);
+         q_base = dvmh_get_natural_base(&device_num, _q);
+
+// Fill 'device' headers
+         dvmh_fill_header_(&device_num, p_base, _p, d_p);
+         dvmh_fill_header_(&device_num, r_base, _r, d_r);
+         dvmh_fill_header_(&device_num, x_base, _x, d_x);
+         dvmh_fill_header_(&device_num, z_base, _z, d_z);
+         dvmh_fill_header_(&device_num, q_base, _q, d_q);
+
+// Guess index type in CUDA kernel
+         idxTypeInKernel = loop_guess_index_type_(loop_ref);
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) 
+         {
+            idxTypeInKernel = rt_INT;
+         }
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) 
+         {
+            idxTypeInKernel = rt_LLONG;
+         }
+
+// Get CUDA configuration parameters
+         threads = dim3(0, 0, 0);
+         if (idxTypeInKernel == rt_INT) 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_522_cuda_kernel_int_regs, &threads, &stream, 0);
+         }
+         else 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_522_cuda_kernel_llong_regs, &threads, &stream, 0);
+         }
+         loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+         blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+         overallBlocks = blocksS[0];
+         restBlocks = overallBlocks;
+         addBlocks = 0;
+         blocks = dim3(1, 1, 1);
+         maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+         while (restBlocks > 0)
+         {
+            if (restBlocks <= maxBlocks) 
+            {
+               blocks = restBlocks;
+            }
+            else 
+            {
+               blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT) 
+            {
+               loop_cg_522_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)p_base, (double *)r_base, (double *)x_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks);
+            }
+            else 
+            {
+               loop_cg_522_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)p_base, (double *)r_base, (double *)x_base, (double *)z_base, (double *)q_base, idxL[0], idxH[0], addBlocks);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+         }
+      }
+
+
+//    CUDA handler for loop on line 537 
+
+      void   loop_cg_537_cuda_(DvmType *loop_ref, DvmType _r[])
+      {
+         void   *r_base;
+         DvmType d_r[4];
+         DvmType idxTypeInKernel;
+         dim3 blocks, threads;
+         cudaStream_t stream;
+         DvmType idxL[1], idxH[1], loopSteps[1];
+         DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+         void   *rho_grid;
+         double _rho;
+         DvmType red_num, num_of_red_blocks, fill_flag;
+         DvmType shared_mem;
+         DvmType device_num;
+
+// Get device number
+         device_num = loop_get_device_num_(loop_ref);
+
+// Register reduction for CUDA-execution
+         red_num = 1;
+         loop_cuda_register_red(loop_ref, red_num, &rho_grid, 0);
+         loop_red_init_(loop_ref, &red_num, &_rho, 0);
+
+// Get 'natural' bases
+         r_base = dvmh_get_natural_base(&device_num, _r);
+
+// Fill 'device' headers
+         dvmh_fill_header_(&device_num, r_base, _r, d_r);
+
+// Guess index type in CUDA kernel
+         idxTypeInKernel = loop_guess_index_type_(loop_ref);
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) 
+         {
+            idxTypeInKernel = rt_INT;
+         }
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) 
+         {
+            idxTypeInKernel = rt_LLONG;
+         }
+
+// Get CUDA configuration parameters
+         threads = dim3(0, 0, 0);
+#ifdef CUDA_FERMI_ARCH
+         shared_mem = 8;
+#else
+         shared_mem = 0;
+#endif
+         if (idxTypeInKernel == rt_INT) 
+         {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_537_cuda_kernel_int_regs, &threads, &stream, &shared_mem);
+         }
+         else 
+         {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_537_cuda_kernel_llong_regs, &threads, &stream, &shared_mem);
+         }
+         loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+         blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+         overallBlocks = blocksS[0];
+         restBlocks = overallBlocks;
+         addBlocks = 0;
+         blocks = dim3(1, 1, 1);
+
+// Prepare reduction
+         num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref));
+         fill_flag = 0;
+         red_num = 1;
+         loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag);
+         maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+         while (restBlocks > 0)
+         {
+            if (restBlocks <= maxBlocks) 
+            {
+               blocks = restBlocks;
+            }
+            else 
+            {
+               blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT) 
+            {
+               loop_cg_537_cuda_kernel_int<<<blocks, threads, shared_mem, stream>>>((double *)r_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks);
+            }
+            else 
+            {
+               loop_cg_537_cuda_kernel_llong<<<blocks, threads, shared_mem, stream>>>((double *)r_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+         }
+
+// Finish reduction
+         red_num = 1;
+         loop_red_finish(loop_ref, red_num);
+      }
+
+
+//    CUDA handler for loop on line 558 
+
+      void   loop_cg_558_cuda_(DvmType *loop_ref, DvmType _p_rma[], DvmType _q[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
+      {
+         void   *p_rma_base, *q_base, *colidx_base, *a_base, *rowstr_base;
+         DvmType d_p_rma[4], d_q[4], d_colidx[4], d_a[4], d_rowstr[4];
+         DvmType idxTypeInKernel;
+         dim3 blocks, threads;
+         cudaStream_t stream;
+         DvmType idxL[1], idxH[1], loopSteps[1];
+         DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+         DvmType device_num;
+
+// Get device number
+         device_num = loop_get_device_num_(loop_ref);
+
+// Get 'natural' bases
+         p_rma_base = dvmh_get_natural_base(&device_num, _p_rma);
+         q_base = dvmh_get_natural_base(&device_num, _q);
+         colidx_base = dvmh_get_natural_base(&device_num, _colidx);
+         a_base = dvmh_get_natural_base(&device_num, _a);
+         rowstr_base = dvmh_get_natural_base(&device_num, _rowstr);
+
+// Fill 'device' headers
+         dvmh_fill_header_(&device_num, p_rma_base, _p_rma, d_p_rma);
+         dvmh_fill_header_(&device_num, q_base, _q, d_q);
+         dvmh_fill_header_(&device_num, colidx_base, _colidx, d_colidx);
+         dvmh_fill_header_(&device_num, a_base, _a, d_a);
+         dvmh_fill_header_(&device_num, rowstr_base, _rowstr, d_rowstr);
+
+// Guess index type in CUDA kernel
+         idxTypeInKernel = loop_guess_index_type_(loop_ref);
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) 
+         {
+            idxTypeInKernel = rt_INT;
+         }
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) 
+         {
+            idxTypeInKernel = rt_LLONG;
+         }
+
+// Get CUDA configuration parameters
+         threads = dim3(0, 0, 0);
+         if (idxTypeInKernel == rt_INT) 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_558_cuda_kernel_int_regs, &threads, &stream, 0);
+         }
+         else 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_558_cuda_kernel_llong_regs, &threads, &stream, 0);
+         }
+         loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+         blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+         overallBlocks = blocksS[0];
+         restBlocks = overallBlocks * dvmh_get_warp_size(loop_ref);
+         addBlocks = 0;
+         blocks = dim3(1, 1, 1);
+         maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+         maxBlocks = maxBlocks / dvmh_get_warp_size(loop_ref) * dvmh_get_warp_size(loop_ref);
+
+// GPU execution
+         while (restBlocks > 0)
+         {
+            if (restBlocks <= maxBlocks) 
+            {
+               blocks = restBlocks;
+            }
+            else 
+            {
+               blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT) 
+            {
+               loop_cg_558_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)p_rma_base, (double *)q_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
+            }
+            else 
+            {
+               loop_cg_558_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)p_rma_base, (double *)q_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+         }
+      }
+
+
+//    CUDA handler for loop on line 567 
+
+      void   loop_cg_567_cuda_(DvmType *loop_ref, DvmType _q[], DvmType _p[])
+      {
+         void   *q_base, *p_base;
+         DvmType d_q[4], d_p[4];
+         DvmType idxTypeInKernel;
+         dim3 blocks, threads;
+         cudaStream_t stream;
+         DvmType idxL[1], idxH[1], loopSteps[1];
+         DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+         void   *d_grid;
+         double _d;
+         DvmType red_num, num_of_red_blocks, fill_flag;
+         DvmType shared_mem;
+         DvmType device_num;
+
+// Get device number
+         device_num = loop_get_device_num_(loop_ref);
+
+// Register reduction for CUDA-execution
+         red_num = 1;
+         loop_cuda_register_red(loop_ref, red_num, &d_grid, 0);
+         loop_red_init_(loop_ref, &red_num, &_d, 0);
+
+// Get 'natural' bases
+         q_base = dvmh_get_natural_base(&device_num, _q);
+         p_base = dvmh_get_natural_base(&device_num, _p);
+
+// Fill 'device' headers
+         dvmh_fill_header_(&device_num, q_base, _q, d_q);
+         dvmh_fill_header_(&device_num, p_base, _p, d_p);
+
+// Guess index type in CUDA kernel
+         idxTypeInKernel = loop_guess_index_type_(loop_ref);
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) 
+         {
+            idxTypeInKernel = rt_INT;
+         }
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) 
+         {
+            idxTypeInKernel = rt_LLONG;
+         }
+
+// Get CUDA configuration parameters
+         threads = dim3(0, 0, 0);
+#ifdef CUDA_FERMI_ARCH
+         shared_mem = 8;
+#else
+         shared_mem = 0;
+#endif
+         if (idxTypeInKernel == rt_INT) 
+         {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_567_cuda_kernel_int_regs, &threads, &stream, &shared_mem);
+         }
+         else 
+         {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_567_cuda_kernel_llong_regs, &threads, &stream, &shared_mem);
+         }
+         loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+         blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+         overallBlocks = blocksS[0];
+         restBlocks = overallBlocks;
+         addBlocks = 0;
+         blocks = dim3(1, 1, 1);
+
+// Prepare reduction
+         num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref));
+         fill_flag = 0;
+         red_num = 1;
+         loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag);
+         maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+         while (restBlocks > 0)
+         {
+            if (restBlocks <= maxBlocks) 
+            {
+               blocks = restBlocks;
+            }
+            else 
+            {
+               blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT) 
+            {
+               loop_cg_567_cuda_kernel_int<<<blocks, threads, shared_mem, stream>>>((double *)q_base, (double *)p_base, _d, (double *)d_grid, idxL[0], idxH[0], addBlocks);
+            }
+            else 
+            {
+               loop_cg_567_cuda_kernel_llong<<<blocks, threads, shared_mem, stream>>>((double *)q_base, (double *)p_base, _d, (double *)d_grid, idxL[0], idxH[0], addBlocks);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+         }
+
+// Finish reduction
+         red_num = 1;
+         loop_red_finish(loop_ref, red_num);
+      }
+
+
+//    CUDA handler for loop on line 577 
+
+      void   loop_cg_577_cuda_(DvmType *loop_ref, DvmType _q[], DvmType _r[], DvmType _p[], DvmType _z[], double *_alpha)
+      {
+         void   *q_base, *r_base, *p_base, *z_base;
+         DvmType d_q[4], d_r[4], d_p[4], d_z[4];
+         DvmType idxTypeInKernel;
+         dim3 blocks, threads;
+         cudaStream_t stream;
+         DvmType idxL[1], idxH[1], loopSteps[1];
+         DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+         void   *rho_grid;
+         double _rho;
+         DvmType red_num, num_of_red_blocks, fill_flag;
+         DvmType shared_mem;
+         DvmType device_num;
+
+// Get device number
+         device_num = loop_get_device_num_(loop_ref);
+
+// Register reduction for CUDA-execution
+         red_num = 1;
+         loop_cuda_register_red(loop_ref, red_num, &rho_grid, 0);
+         loop_red_init_(loop_ref, &red_num, &_rho, 0);
+
+// Get 'natural' bases
+         q_base = dvmh_get_natural_base(&device_num, _q);
+         r_base = dvmh_get_natural_base(&device_num, _r);
+         p_base = dvmh_get_natural_base(&device_num, _p);
+         z_base = dvmh_get_natural_base(&device_num, _z);
+
+// Fill 'device' headers
+         dvmh_fill_header_(&device_num, q_base, _q, d_q);
+         dvmh_fill_header_(&device_num, r_base, _r, d_r);
+         dvmh_fill_header_(&device_num, p_base, _p, d_p);
+         dvmh_fill_header_(&device_num, z_base, _z, d_z);
+
+// Guess index type in CUDA kernel
+         idxTypeInKernel = loop_guess_index_type_(loop_ref);
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) 
+         {
+            idxTypeInKernel = rt_INT;
+         }
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) 
+         {
+            idxTypeInKernel = rt_LLONG;
+         }
+
+// Get CUDA configuration parameters
+         threads = dim3(0, 0, 0);
+#ifdef CUDA_FERMI_ARCH
+         shared_mem = 8;
+#else
+         shared_mem = 0;
+#endif
+         if (idxTypeInKernel == rt_INT) 
+         {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_577_cuda_kernel_int_regs, &threads, &stream, &shared_mem);
+         }
+         else 
+         {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_577_cuda_kernel_llong_regs, &threads, &stream, &shared_mem);
+         }
+         loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+         blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+         overallBlocks = blocksS[0];
+         restBlocks = overallBlocks;
+         addBlocks = 0;
+         blocks = dim3(1, 1, 1);
+
+// Prepare reduction
+         num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref));
+         fill_flag = 0;
+         red_num = 1;
+         loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag);
+         maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+         while (restBlocks > 0)
+         {
+            if (restBlocks <= maxBlocks) 
+            {
+               blocks = restBlocks;
+            }
+            else 
+            {
+               blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT) 
+            {
+               loop_cg_577_cuda_kernel_int<<<blocks, threads, shared_mem, stream>>>((double *)q_base, (double *)r_base, (double *)p_base, (double *)z_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks, *_alpha);
+            }
+            else 
+            {
+               loop_cg_577_cuda_kernel_llong<<<blocks, threads, shared_mem, stream>>>((double *)q_base, (double *)r_base, (double *)p_base, (double *)z_base, _rho, (double *)rho_grid, idxL[0], idxH[0], addBlocks, *_alpha);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+         }
+
+// Finish reduction
+         red_num = 1;
+         loop_red_finish(loop_ref, red_num);
+      }
+
+
+//    CUDA handler for loop on line 588 
+
+      void   loop_cg_588_cuda_(DvmType *loop_ref, DvmType _p[], DvmType _r[], double *_beta)
+      {
+         void   *p_base, *r_base;
+         DvmType d_p[4], d_r[4];
+         DvmType idxTypeInKernel;
+         dim3 blocks, threads;
+         cudaStream_t stream;
+         DvmType idxL[1], idxH[1], loopSteps[1];
+         DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+         DvmType device_num;
+
+// Get device number
+         device_num = loop_get_device_num_(loop_ref);
+
+// Get 'natural' bases
+         p_base = dvmh_get_natural_base(&device_num, _p);
+         r_base = dvmh_get_natural_base(&device_num, _r);
+
+// Fill 'device' headers
+         dvmh_fill_header_(&device_num, p_base, _p, d_p);
+         dvmh_fill_header_(&device_num, r_base, _r, d_r);
+
+// Guess index type in CUDA kernel
+         idxTypeInKernel = loop_guess_index_type_(loop_ref);
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) 
+         {
+            idxTypeInKernel = rt_INT;
+         }
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) 
+         {
+            idxTypeInKernel = rt_LLONG;
+         }
+
+// Get CUDA configuration parameters
+         threads = dim3(0, 0, 0);
+         if (idxTypeInKernel == rt_INT) 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_588_cuda_kernel_int_regs, &threads, &stream, 0);
+         }
+         else 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_588_cuda_kernel_llong_regs, &threads, &stream, 0);
+         }
+         loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+         blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+         overallBlocks = blocksS[0];
+         restBlocks = overallBlocks;
+         addBlocks = 0;
+         blocks = dim3(1, 1, 1);
+         maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+         while (restBlocks > 0)
+         {
+            if (restBlocks <= maxBlocks) 
+            {
+               blocks = restBlocks;
+            }
+            else 
+            {
+               blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT) 
+            {
+               loop_cg_588_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)p_base, (double *)r_base, idxL[0], idxH[0], addBlocks, *_beta);
+            }
+            else 
+            {
+               loop_cg_588_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)p_base, (double *)r_base, idxL[0], idxH[0], addBlocks, *_beta);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+         }
+      }
+
+
+//    CUDA handler for loop on line 605 
+
+      void   loop_cg_605_cuda_(DvmType *loop_ref, DvmType _z_rma[], DvmType _r[], DvmType _colidx[], DvmType _a[], DvmType _rowstr[])
+      {
+         void   *z_rma_base, *r_base, *colidx_base, *a_base, *rowstr_base;
+         DvmType d_z_rma[4], d_r[4], d_colidx[4], d_a[4], d_rowstr[4];
+         DvmType idxTypeInKernel;
+         dim3 blocks, threads;
+         cudaStream_t stream;
+         DvmType idxL[1], idxH[1], loopSteps[1];
+         DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+         DvmType device_num;
+
+// Get device number
+         device_num = loop_get_device_num_(loop_ref);
+
+// Get 'natural' bases
+         z_rma_base = dvmh_get_natural_base(&device_num, _z_rma);
+         r_base = dvmh_get_natural_base(&device_num, _r);
+         colidx_base = dvmh_get_natural_base(&device_num, _colidx);
+         a_base = dvmh_get_natural_base(&device_num, _a);
+         rowstr_base = dvmh_get_natural_base(&device_num, _rowstr);
+
+// Fill 'device' headers
+         dvmh_fill_header_(&device_num, z_rma_base, _z_rma, d_z_rma);
+         dvmh_fill_header_(&device_num, r_base, _r, d_r);
+         dvmh_fill_header_(&device_num, colidx_base, _colidx, d_colidx);
+         dvmh_fill_header_(&device_num, a_base, _a, d_a);
+         dvmh_fill_header_(&device_num, rowstr_base, _rowstr, d_rowstr);
+
+// Guess index type in CUDA kernel
+         idxTypeInKernel = loop_guess_index_type_(loop_ref);
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) 
+         {
+            idxTypeInKernel = rt_INT;
+         }
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) 
+         {
+            idxTypeInKernel = rt_LLONG;
+         }
+
+// Get CUDA configuration parameters
+         threads = dim3(0, 0, 0);
+         if (idxTypeInKernel == rt_INT) 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_605_cuda_kernel_int_regs, &threads, &stream, 0);
+         }
+         else 
+         {
+            loop_cuda_get_config(loop_ref, 0, loop_cg_605_cuda_kernel_llong_regs, &threads, &stream, 0);
+         }
+         loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+         blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+         overallBlocks = blocksS[0];
+         restBlocks = overallBlocks * dvmh_get_warp_size(loop_ref);
+         addBlocks = 0;
+         blocks = dim3(1, 1, 1);
+         maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+         maxBlocks = maxBlocks / dvmh_get_warp_size(loop_ref) * dvmh_get_warp_size(loop_ref);
+
+// GPU execution
+         while (restBlocks > 0)
+         {
+            if (restBlocks <= maxBlocks) 
+            {
+               blocks = restBlocks;
+            }
+            else 
+            {
+               blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT) 
+            {
+               loop_cg_605_cuda_kernel_int<<<blocks, threads, 0, stream>>>((double *)z_rma_base, (double *)r_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
+            }
+            else 
+            {
+               loop_cg_605_cuda_kernel_llong<<<blocks, threads, 0, stream>>>((double *)z_rma_base, (double *)r_base, (int *)colidx_base, (double *)a_base, (int *)rowstr_base, idxL[0], idxH[0], addBlocks);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+         }
+      }
+
+
+//    CUDA handler for loop on line 618 
+
+      void   loop_cg_618_cuda_(DvmType *loop_ref, DvmType _r[], DvmType _x[])
+      {
+         void   *r_base, *x_base;
+         DvmType d_r[4], d_x[4];
+         DvmType idxTypeInKernel;
+         dim3 blocks, threads;
+         cudaStream_t stream;
+         DvmType idxL[1], idxH[1], loopSteps[1];
+         DvmType blocksS[1], restBlocks, maxBlocks, addBlocks, overallBlocks;
+         void   *sum_grid;
+         double _sum;
+         DvmType red_num, num_of_red_blocks, fill_flag;
+         DvmType shared_mem;
+         DvmType device_num;
+
+// Get device number
+         device_num = loop_get_device_num_(loop_ref);
+
+// Register reduction for CUDA-execution
+         red_num = 1;
+         loop_cuda_register_red(loop_ref, red_num, &sum_grid, 0);
+         loop_red_init_(loop_ref, &red_num, &_sum, 0);
+
+// Get 'natural' bases
+         r_base = dvmh_get_natural_base(&device_num, _r);
+         x_base = dvmh_get_natural_base(&device_num, _x);
+
+// Fill 'device' headers
+         dvmh_fill_header_(&device_num, r_base, _r, d_r);
+         dvmh_fill_header_(&device_num, x_base, _x, d_x);
+
+// Guess index type in CUDA kernel
+         idxTypeInKernel = loop_guess_index_type_(loop_ref);
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(int)) 
+         {
+            idxTypeInKernel = rt_INT;
+         }
+         if (idxTypeInKernel == rt_LONG && sizeof(long) == sizeof(long long)) 
+         {
+            idxTypeInKernel = rt_LLONG;
+         }
+
+// Get CUDA configuration parameters
+         threads = dim3(0, 0, 0);
+#ifdef CUDA_FERMI_ARCH
+         shared_mem = 8;
+#else
+         shared_mem = 0;
+#endif
+         if (idxTypeInKernel == rt_INT) 
+         {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_618_cuda_kernel_int_regs, &threads, &stream, &shared_mem);
+         }
+         else 
+         {
+            loop_cuda_get_config(loop_ref, shared_mem, loop_cg_618_cuda_kernel_llong_regs, &threads, &stream, &shared_mem);
+         }
+         loop_fill_bounds_(loop_ref, idxL, idxH, loopSteps);
+         blocksS[0] = (idxH[0] - idxL[0] + threads.x) / threads.x;
+         overallBlocks = blocksS[0];
+         restBlocks = overallBlocks;
+         addBlocks = 0;
+         blocks = dim3(1, 1, 1);
+
+// Prepare reduction
+         num_of_red_blocks = overallBlocks * (threads.x * threads.y * threads.z / dvmh_get_warp_size(loop_ref));
+         fill_flag = 0;
+         red_num = 1;
+         loop_cuda_red_prepare(loop_ref, red_num, num_of_red_blocks, fill_flag);
+         maxBlocks = loop_cuda_get_device_prop(loop_ref, CUDA_MAX_GRID_X);
+
+// GPU execution
+         while (restBlocks > 0)
+         {
+            if (restBlocks <= maxBlocks) 
+            {
+               blocks = restBlocks;
+            }
+            else 
+            {
+               blocks = maxBlocks;
+            }
+            if (idxTypeInKernel == rt_INT) 
+            {
+               loop_cg_618_cuda_kernel_int<<<blocks, threads, shared_mem, stream>>>((double *)r_base, (double *)x_base, _sum, (double *)sum_grid, idxL[0], idxH[0], addBlocks);
+            }
+            else 
+            {
+               loop_cg_618_cuda_kernel_llong<<<blocks, threads, shared_mem, stream>>>((double *)r_base, (double *)x_base, _sum, (double *)sum_grid, idxL[0], idxH[0], addBlocks);
+            }
+            addBlocks += blocks.x;
+            restBlocks -= blocks.x;
+         }
+
+// Finish reduction
+         red_num = 1;
+         loop_red_finish(loop_ref, red_num);
+      }
+
+}
diff --git a/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cluster/cg.fdv b/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cluster/cg.fdv
new file mode 100644
index 0000000..f077345
--- /dev/null
+++ b/dvm/tools/tester/trunk/test-suite/Performance/NPB/FDVMH.fdv/CG/cluster/cg.fdv
@@ -0,0 +1,1008 @@
+!-------------------------------------------------------------------------!
+!                                                                         !
+!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+!                                                                         !
+!                      S E R I A L     V E R S I O N                      !
+!                                                                         !
+!                                   C G                                   !
+!                                                                         !
+!-------------------------------------------------------------------------!
+!                                                                         !
+!    This benchmark is a serial version of the NPB CG code.               !
+!    Refer to NAS Technical Reports 95-020 for details.                   !
+!                                                                         !
+!    Permission to use, copy, distribute and modify this software         !
+!    for any purpose with or without fee is hereby granted.  We           !
+!    request, however, that all derived work reference the NAS            !
+!    Parallel Benchmarks 3.3. This software is provided "as is"           !
+!    without express or implied warranty.                                 !
+!                                                                         !
+!    Information on NPB 3.3, including the technical report, the          !
+!    original specifications, source code, results and information        !
+!    on how to submit new results, is available at:                       !
+!                                                                         !
+!           http://www.nas.nasa.gov/Software/NPB/                         !
+!                                                                         !
+!    Send comments or suggestions to  npb@nas.nasa.gov                    !
+!                                                                         !
+!          NAS Parallel Benchmarks Group                                  !
+!          NASA Ames Research Center                                      !
+!          Mail Stop: T27A-1                                              !
+!          Moffett Field, CA   94035-1000                                 !
+!                                                                         !
+!          E-mail:  npb@nas.nasa.gov                                      !
+!          Fax:     (650) 604-3957                                        !
+!                                                                         !
+!-------------------------------------------------------------------------!
+
+
+c---------------------------------------------------------------------
+c      NPB CG serial version      
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c Authors: M. Yarrow
+c          C. Kuszmaul
+c          A.S. Kolganov
+c
+c---------------------------------------------------------------------
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      program cg
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
+      implicit none
+
+      include 'globals.h'
+
+
+      common / main_int_mem /  colidx,     rowstr,
+     >                         iv,         arow,     acol
+      integer                  colidx(nz), rowstr(na+1),
+     >                         iv(na),  arow(na), acol(naz),
+     >                         bl_low, bl_high, blGen,gBL(2)
+
+
+      common / main_flt_mem /  aelt,     a,
+     >                         x,
+     >                         z,
+     >                         p,
+     >                         q,
+     >                         r
+      double precision         aelt(naz), a(nz),
+     >                         x(na+1),
+     >                         z(na+1),
+     >                         p(na+1),
+     >                         q(na+1),
+     >                         r(na+1)
+
+
+
+ 
+CDVM$ TEMPLATE ttt(na+2)        
+CDVM$ DISTRIBUTE ttt(BLOCK)   
+CDVM$ ALIGN  z(I) WITH ttt(I) 
+
+CDVM$ ALIGN  x(I) WITH z(I)
+CDVM$ ALIGN  r(I) WITH z(I)
+CDVM$ ALIGN  p(I) WITH z(I)
+CDVM$ ALIGN  q(I) WITH z(I)
+
+
+      integer            i, j, k, it, sumL
+
+      double precision   zeta, randlc
+      external           randlc
+      double precision   rnorm
+      double precision   norm_temp1,norm_temp2
+
+      double precision   t, mflops, tmax
+      character          class
+      logical            verified
+      double precision   zeta_verify_value, epsilon, err
+
+      integer   fstatus
+      character t_names(t_last)*8
+
+      do i = 1, T_last
+         call timer_clear( i )
+      end do
+
+      open(unit=2, file='timer.flag', status='old', iostat=fstatus)
+      if (fstatus .eq. 0) then
+         timeron = .true.
+         t_names(t_init) = 'init'
+         t_names(t_bench) = 'benchmk'
+         t_names(t_conj_grad) = 'conjgd'
+         close(2)
+      else
+         timeron = .false.
+      endif
+
+      call timer_start( T_init )
+
+      firstrow = 1
+      lastrow  = na
+      firstcol = 1
+      lastcol  = na
+
+
+      if( na .eq. 1400 .and. 
+     &    nonzer .eq. 7 .and. 
+     &    niter .eq. 15 .and.
+     &    shift .eq. 10.d0 ) then
+         class = 'S'
+         zeta_verify_value = 8.5971775078648d0
+      else if( na .eq. 7000 .and. 
+     &         nonzer .eq. 8 .and. 
+     &         niter .eq. 15 .and.
+     &         shift .eq. 12.d0 ) then
+         class = 'W'
+         zeta_verify_value = 10.362595087124d0
+      else if( na .eq. 14000 .and. 
+     &         nonzer .eq. 11 .and. 
+     &         niter .eq. 15 .and.
+     &         shift .eq. 20.d0 ) then
+         class = 'A'
+         zeta_verify_value = 17.130235054029d0
+      else if( na .eq. 75000 .and. 
+     &         nonzer .eq. 13 .and. 
+     &         niter .eq. 75 .and.
+     &         shift .eq. 60.d0 ) then
+         class = 'B'
+         zeta_verify_value = 22.712745482631d0
+      else if( na .eq. 150000 .and. 
+     &         nonzer .eq. 15 .and. 
+     &         niter .eq. 75 .and.
+     &         shift .eq. 110.d0 ) then
+         class = 'C'
+         zeta_verify_value = 28.973605592845d0
+      else if( na .eq. 1500000 .and. 
+     &         nonzer .eq. 21 .and. 
+     &         niter .eq. 100 .and.
+     &         shift .eq. 500.d0 ) then
+         class = 'D'
+         zeta_verify_value = 52.514532105794d0
+      else if( na .eq. 9000000 .and. 
+     &         nonzer .eq. 26 .and. 
+     &         niter .eq. 100 .and.
+     &         shift .eq. 1.5d3 ) then
+         class = 'E'
+         zeta_verify_value = 77.522164599383d0
+      else
+         class = 'U'
+      endif
+
+      write( *,1000 ) 
+      write( *,1001 ) na
+      write( *,1002 ) niter
+      write( *,* )
+ 1000 format(//,' NAS Parallel Benchmarks (NPB3.3-DVMH)',
+     >          ' - CG Benchmark', /)
+ 1001 format(' Size: ', i11 )
+ 1002 format(' Iterations: ', i5 )
+
+      naa = na
+      nzz = nz
+
+
+c---------------------------------------------------------------------
+c  Inialize random number generator
+c---------------------------------------------------------------------
+      tran    = 314159265.0D0
+      amult   = 1220703125.0D0
+      zeta    = randlc( tran, amult )
+
+c---------------------------------------------------------------------
+c  
+c---------------------------------------------------------------------
+      call makea(naa, nzz, a, colidx, rowstr, 
+     >           firstrow, lastrow, firstcol, lastcol, 
+     >           arow, acol, aelt, iv)
+
+
+          
+c---------------------------------------------------------------------
+c  Note: as a result of the above call to makea:
+c        values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1
+c        values of colidx which are col indexes go from firstcol --> lastcol
+c        So:
+c        Shift the col index vals from actual (firstcol --> lastcol ) 
+c        to local, i.e., (1 --> lastcol-firstcol+1)
+c---------------------------------------------------------------------
+      do j=1,lastrow-firstrow+1
+         do k=rowstr(j),rowstr(j+1)-1
+            colidx(k) = colidx(k) - firstcol + 1
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c  set starting vector to (1, 1, .... 1)
+c---------------------------------------------------------------------
+CDVM$ region
+CDVM$ parallel (i) on x(i)
+      do i = 1, na+1
+         x(i) = 1.0D0
+      enddo
+CDVM$ parallel (j) on x(j)
+      do j=1, lastcol-firstcol+1
+         q(j) = 0.0d0
+         z(j) = 0.0d0
+         r(j) = 0.0d0
+         p(j) = 0.0d0
+      enddo
+CDVM$ end region
+      zeta  = 0.0d0
+      
+c---------------------------------------------------------------------
+c---->
+c  Do one iteration untimed to init all code and data page tables
+c---->                    (then reinit, start timing, to niter its)
+c---------------------------------------------------------------------
+      do it = 1, 1
+
+c---------------------------------------------------------------------
+c  The call to the conjugate gradient routine:
+c---------------------------------------------------------------------
+         call conj_grad ( colidx,
+     >                    rowstr,
+     >                    x,
+     >                    z,
+     >                    a,
+     >                    p,
+     >                    q,
+     >                    r,
+     >                    rnorm )
+
+c---------------------------------------------------------------------
+c  zeta = shift + 1/(x.z)
+c  So, first: (x.z)
+c  Also, find norm of z
+c  So, first: (z.z)
+c---------------------------------------------------------------------
+         norm_temp1 = 0.0d0
+         norm_temp2 = 0.0d0
+CDVM$ region
+CDVM$ parallel (j) on x(j),reduction(SUM(norm_temp1),SUM(norm_temp2))
+         do j=1, lastcol-firstcol+1
+            norm_temp1 = norm_temp1 + x(j)*z(j)
+            norm_temp2 = norm_temp2 + z(j)*z(j)
+         enddo
+CDVM$ end region
+         norm_temp2 = 1.0d0 / sqrt( norm_temp2 )
+
+
+c---------------------------------------------------------------------
+c  Normalize z to obtain x
+c---------------------------------------------------------------------
+CDVM$ region
+CDVM$ parallel (j) on x(j)
+         do j=1, lastcol-firstcol+1      
+            x(j) = norm_temp2*z(j)    
+         enddo                           
+CDVM$ end region
+
+      enddo                              ! end of do one iteration untimed
+
+
+c---------------------------------------------------------------------
+c  set starting vector to (1, 1, .... 1)
+c---------------------------------------------------------------------
+c
+c  
+c
+CDVM$ region
+CDVM$ parallel (i) on x(i)
+      do i = 1, na+1
+         x(i) = 1.0D0
+      enddo
+CDVM$ end region
+      zeta  = 0.0d0
+
+      call timer_stop( T_init )
+
+      write (*, 2000) timer_read(T_init)
+ 2000 format(' Initialization time = ',f15.3,' seconds')
+
+      call timer_start( T_bench )
+
+c---------------------------------------------------------------------
+c---->
+c  Main Iteration for inverse power method
+c---->
+c---------------------------------------------------------------------
+      do it = 1, niter
+
+c---------------------------------------------------------------------
+c  The call to the conjugate gradient routine:
+c---------------------------------------------------------------------
+         if ( timeron ) call timer_start( T_conj_grad )
+         call conj_grad ( colidx,
+     >                    rowstr,
+     >                    x,
+     >                    z,
+     >                    a,
+     >                    p,
+     >                    q,
+     >                    r,
+     >                    rnorm )
+         if ( timeron ) call timer_stop( T_conj_grad )
+
+
+c---------------------------------------------------------------------
+c  zeta = shift + 1/(x.z)
+c  So, first: (x.z)
+c  Also, find norm of z
+c  So, first: (z.z)
+c---------------------------------------------------------------------
+         norm_temp1 = 0.0d0
+         norm_temp2 = 0.0d0
+CDVM$ region
+CDVM$ parallel (j) on x(j),reduction(SUM(norm_temp1),SUM(norm_temp2))
+         do j=1, lastcol-firstcol+1
+            norm_temp1 = norm_temp1 + x(j)*z(j)
+            norm_temp2 = norm_temp2 + z(j)*z(j)
+         enddo
+CDVM$ end region
+         norm_temp2 = 1.0d0 / sqrt( norm_temp2 )
+
+
+         zeta = shift + 1.0d0 / norm_temp1
+         if( it .eq. 1 ) write( *,9000 )
+         write( *,9001 ) it, rnorm, zeta
+
+ 9000    format( /,'   iteration           ||r||                 zeta' )
+ 9001    format( 4x, i5, 7x, e20.14, f20.13 )
+
+c---------------------------------------------------------------------
+c  Normalize z to obtain x
+c---------------------------------------------------------------------
+CDVM$ region
+CDVM$ parallel (j) on x(j)
+         do j=1, lastcol-firstcol+1      
+            x(j) = norm_temp2*z(j)    
+         enddo                           
+CDVM$ end region
+
+      enddo                              ! end of main iter inv pow meth
+
+      call timer_stop( T_bench )
+
+c---------------------------------------------------------------------
+c  End of timed section
+c---------------------------------------------------------------------
+
+      t = timer_read( T_bench )
+
+
+      write(*,100)
+ 100  format(' Benchmark completed ')
+
+      epsilon = 1.d-10
+      if (class .ne. 'U') then
+
+c         err = abs( zeta - zeta_verify_value)
+         err = abs( zeta - zeta_verify_value )/zeta_verify_value
+         if( err .le. epsilon .and. ( .not. isnan(err))) then
+            verified = .TRUE.
+            write(*, 200)
+            write(*, 201) zeta
+            write(*, 202) err
+ 200        format(' VERIFICATION SUCCESSFUL ')
+ 201        format(' Zeta is    ', E20.13)
+ 202        format(' Error is   ', E20.13)
+         else
+            verified = .FALSE.
+            write(*, 300) 
+            write(*, 301) zeta
+            write(*, 302) zeta_verify_value
+ 300        format(' VERIFICATION FAILED')
+ 301        format(' Zeta                ', E20.13)
+ 302        format(' The correct zeta is ', E20.13)
+         endif
+      else
+         verified = .FALSE.
+         write (*, 400)
+         write (*, 401)
+         write (*, 201) zeta
+ 400     format(' Problem size unknown')
+ 401     format(' NO VERIFICATION PERFORMED')
+      endif
+
+
+      if( t .ne. 0. ) then
+         mflops = float( 2*niter*na )
+     &               * ( 3.+float( nonzer*(nonzer+1) )
+     &                 + 25.*(5.+float( nonzer*(nonzer+1) ))
+     &                 + 3. ) / t / 1000000.0
+      else
+         mflops = 0.0
+      endif
+
+
+         call print_results('CG', class, na, 0, 0,
+     >                      niter, t,
+     >                      mflops, '          floating point', 
+     >                      verified, npbversion, compiletime,
+     >                      cs1, cs2, cs3, cs4, cs5, cs6, cs7)
+
+
+
+ 600  format( i4, 2e19.12)
+
+
+c---------------------------------------------------------------------
+c      More timers
+c---------------------------------------------------------------------
+      if (.not.timeron) goto 999
+
+      tmax = timer_read(T_bench)
+      if (tmax .eq. 0.0) tmax = 1.0
+
+      write(*,800)
+ 800  format('  SECTION   Time (secs)')
+      do i=1, t_last
+         t = timer_read(i)
+         if (i.eq.t_init) then
+            write(*,810) t_names(i), t
+         else
+            write(*,810) t_names(i), t, t*100./tmax
+            if (i.eq.t_conj_grad) then
+               t = tmax - t
+               write(*,820) 'rest', t, t*100./tmax
+            endif
+         endif
+ 810     format(2x,a8,':',f9.3:'  (',f6.2,'%)')
+ 820     format('    --> ',a8,':',f9.3,'  (',f6.2,'%)')
+      end do
+
+ 999  continue
+
+
+      end                              ! end main
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      subroutine conj_grad ( colidx,
+     >                       rowstr,
+     >                       x,
+     >                       z,
+     >                       a,
+     >                       p,
+     >                       q,
+     >                       r,
+     >                       rnorm )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c  Floaging point arrays here are named as in NPB1 spec discussion of 
+c  CG algorithm
+c---------------------------------------------------------------------
+ 
+      implicit none
+
+
+      include 'globals.h'
+
+
+      double precision   x(*),
+     >                   z(*),
+     >                   a(nzz)
+      integer            colidx(nzz), rowstr(naa+1)
+
+      double precision   p(*),
+     >                   q(*),
+     >                   r(*)
+
+
+      integer   j, k
+      integer   cgit, cgitmax, mlen,idx, idxl
+
+      double precision   d, sum, rho, rho0, alpha, beta, rnorm
+
+      data      cgitmax / 25 /
+CDVM$ INHERIT x, z, r, p, q
+
+      rho = 0.0d0
+
+c---------------------------------------------------------------------
+c  Initialize the CG algorithm:
+c---------------------------------------------------------------------
+
+CDVM$ region
+CDVM$ parallel (j) on q(j), private(d)
+      do j=1,naa+1
+         q(j) = 0.0d0
+         z(j) = 0.0d0
+         d = x(j)
+         r(j) = d
+         p(j) = d
+      enddo
+
+
+c---------------------------------------------------------------------
+c  rho = r.r
+c  Now, obtain the norm of r: First, sum squares of r elements locally...
+c---------------------------------------------------------------------
+
+CDVM$ parallel(j) on r(j), reduction(SUM(rho))
+      do j=1, lastcol-firstcol+1
+         rho = rho + r(j)*r(j)
+      enddo
+!        mlen = 128	  
+! DVM$ parallel(j) on r(j), reduction(MAX(mlen))	 
+!      do j=1,lastrow-firstrow+1
+!          mlen = max(mlen, rowstr(j+1) - rowstr(j))
+!      enddo
+CDVM$ end region
+!      write(*,*) 'maxlen = ', mlen
+c---------------------------------------------------------------------
+c---->
+c  The conj grad iteration loop
+c---->
+c---------------------------------------------------------------------
+      do cgit = 1, cgitmax
+
+         d = 0.0d0
+CDVM$ region
+	 
+CDVM$ parallel (j) on p(j), private(sum,k), remote_access(p(:))
+         do j=1,lastrow-firstrow+1
+            sum = 0.d0
+            do k=rowstr(j),rowstr(j+1)-1              
+              sum = sum + a(k)*p(colidx(k))              
+            enddo
+            q(j) = sum
+         enddo
+
+CDVM$ parallel (j) on q(j), reduction(SUM(d)) 
+         do j=1, lastcol-firstcol+1
+            d = d + p(j)*q(j)
+         enddo
+CDVM$ end region
+         alpha = rho / d
+         rho0 = rho
+
+         rho = 0.0d0
+CDVM$ region		 
+CDVM$ parallel (j) on r(j), private(d), reduction(SUM(rho))
+         do j=1, lastcol-firstcol+1
+            z(j) = z(j) + alpha*p(j)
+            d = r(j) - alpha*q(j)		
+            r(j) = d
+            rho = rho + d*d
+         enddo
+CDVM$ end region
+         beta = rho / rho0
+
+CDVM$ region
+CDVM$ parallel (j) on r(j)
+         do j=1, lastcol-firstcol+1
+            p(j) = r(j) + beta*p(j)
+         enddo
+CDVM$ end region
+
+      enddo                             ! end of do cgit=1,cgitmax
+
+
+c---------------------------------------------------------------------
+c  Compute residual norm explicitly:  ||r|| = ||x - A.z||
+c  First, form A.z
+c  The partition submatrix-vector multiply
+c---------------------------------------------------------------------
+
+      sum = 0.0d0
+CDVM$ region
+CDVM$ parallel (j) on r(j), private(d,k),remote_access(z(:))	  
+      do j=1,lastrow-firstrow+1
+         d = 0.d0
+         do k=rowstr(j),rowstr(j+1)-1
+            d = d + a(k)*z(colidx(k))
+         enddo
+         r(j) = d
+      enddo
+
+
+c---------------------------------------------------------------------
+c  At this point, r contains A.z
+c---------------------------------------------------------------------
+CDVM$ parallel (j) on r(j), private(d), reduction(SUM(sum))
+      do j=1, lastcol-firstcol+1
+         d   = x(j) - r(j)         
+         sum = sum + d*d
+      enddo
+CDVM$ end region
+      rnorm = sqrt( sum )
+
+
+
+      return
+      end                               ! end of routine conj_grad
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      subroutine makea( n, nz, a, colidx, rowstr, 
+     >                  firstrow, lastrow, firstcol, lastcol,
+     >                  arow, acol, aelt, iv )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit            none
+      include             'npbparams.h'
+      integer             n, nz
+      integer             firstrow, lastrow, firstcol, lastcol
+      integer             colidx(nz), rowstr(n+1)
+      integer             iv(n), arow(n), acol(nonzer+1,n)
+      double precision    aelt(nonzer+1,n)
+      double precision    a(nz)
+
+c---------------------------------------------------------------------
+c       generate the test problem for benchmark 6
+c       makea generates a sparse matrix with a
+c       prescribed sparsity distribution
+c
+c       parameter    type        usage
+c
+c       input
+c
+c       n            i           number of cols/rows of matrix
+c       nz           i           nonzeros as declared array size
+c       rcond        r*8         condition number
+c       shift        r*8         main diagonal shift
+c
+c       output
+c
+c       a            r*8         array for nonzeros
+c       colidx       i           col indices
+c       rowstr       i           row pointers
+c
+c       workspace
+c
+c       iv, arow, acol i
+c       aelt           r*8
+c---------------------------------------------------------------------
+
+      integer          i, iouter, ivelt, nzv, nn1
+      integer          ivc(nonzer+1)
+      double precision vc(nonzer+1)
+
+c---------------------------------------------------------------------
+c      nonzer is approximately  (int(sqrt(nnza /n)));
+c---------------------------------------------------------------------
+
+      external          sparse, sprnvc, vecset
+
+c---------------------------------------------------------------------
+c    nn1 is the smallest power of two not less than n
+c---------------------------------------------------------------------
+
+      nn1 = 1
+ 50   continue
+        nn1 = 2 * nn1
+        if (nn1 .lt. n) goto 50
+
+c---------------------------------------------------------------------
+c  Generate nonzero positions and save for the use in sparse.
+c---------------------------------------------------------------------
+
+      do iouter = 1, n
+         nzv = nonzer
+         call sprnvc( n, nzv, nn1, vc, ivc )
+         call vecset( n, vc, ivc, nzv, iouter, .5D0 )
+         arow(iouter) = nzv
+         do ivelt = 1, nzv
+            acol(ivelt, iouter) = ivc(ivelt)
+            aelt(ivelt, iouter) = vc(ivelt)
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c       ... make the sparse matrix from list of elements with duplicates
+c           (iv is used as  workspace)
+c---------------------------------------------------------------------
+      call sparse( a, colidx, rowstr, n, nz, nonzer, arow, acol, 
+     >             aelt, firstrow, lastrow,
+     >             iv, rcond, shift )
+      return
+
+      end
+c-------end   of makea------------------------------
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      subroutine sparse( a, colidx, rowstr, n, nz, nonzer, arow, acol, 
+     >                   aelt, firstrow, lastrow,
+     >                   nzloc, rcond, shift )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit           none
+      integer            colidx(*), rowstr(*)
+      integer            firstrow, lastrow
+      integer            n, nz, nonzer, arow(*), acol(nonzer+1,*)
+      double precision   a(*), aelt(nonzer+1,*), rcond, shift
+
+c---------------------------------------------------------------------
+c       rows range from firstrow to lastrow
+c       the rowstr pointers are defined for nrows = lastrow-firstrow+1 values
+c---------------------------------------------------------------------
+      integer            nzloc(n), nrows
+
+c---------------------------------------------------
+c       generate a sparse matrix from a list of
+c       [col, row, element] tri
+c---------------------------------------------------
+
+      integer            i, j, j1, j2, nza, k, kk, nzrow, jcol
+      double precision   xi, size, scale, ratio, va
+
+c---------------------------------------------------------------------
+c    how many rows of result
+c---------------------------------------------------------------------
+      nrows = lastrow - firstrow + 1
+
+c---------------------------------------------------------------------
+c     ...count the number of triples in each row
+c---------------------------------------------------------------------
+      do j = 1, nrows+1
+         rowstr(j) = 0
+      enddo
+
+      do i = 1, n
+         do nza = 1, arow(i)
+            j = acol(nza, i) + 1
+            rowstr(j) = rowstr(j) + arow(i)
+         end do
+      end do
+
+      rowstr(1) = 1
+      do j = 2, nrows+1
+         rowstr(j) = rowstr(j) + rowstr(j-1)
+      enddo
+      nza = rowstr(nrows+1) - 1
+
+c---------------------------------------------------------------------
+c     ... rowstr(j) now is the location of the first nonzero
+c           of row j of a
+c---------------------------------------------------------------------
+
+      if (nza .gt. nz) then
+         write(*,*) 'Space for matrix elements exceeded in sparse'
+         write(*,*) 'nza, nzmax = ',nza, nz
+         stop
+      endif
+
+
+c---------------------------------------------------------------------
+c     ... preload data pages
+c---------------------------------------------------------------------
+      do j = 1, nrows
+         do k = rowstr(j), rowstr(j+1)-1
+             a(k) = 0.d0
+             colidx(k) = 0
+         enddo
+         nzloc(j) = 0
+      enddo
+
+c---------------------------------------------------------------------
+c     ... generate actual values by summing duplicates
+c---------------------------------------------------------------------
+
+      size = 1.0D0
+      ratio = rcond ** (1.0D0 / dfloat(n))
+
+      do i = 1, n
+         do nza = 1, arow(i)
+            j = acol(nza, i)
+
+            scale = size * aelt(nza, i)
+            do nzrow = 1, arow(i)
+               jcol = acol(nzrow, i)
+               va = aelt(nzrow, i) * scale
+
+c---------------------------------------------------------------------
+c       ... add the identity * rcond to the generated matrix to bound
+c           the smallest eigenvalue from below by rcond
+c---------------------------------------------------------------------
+               if (jcol .eq. j .and. j .eq. i) then
+                  va = va + rcond - shift
+               endif
+
+               do k = rowstr(j), rowstr(j+1)-1
+                  if (colidx(k) .gt. jcol) then
+c---------------------------------------------------------------------
+c       ... insert colidx here orderly
+c---------------------------------------------------------------------
+                     do kk = rowstr(j+1)-2, k, -1
+                        if (colidx(kk) .gt. 0) then
+                           a(kk+1)  = a(kk)
+                           colidx(kk+1) = colidx(kk)
+                        endif
+                     enddo
+                     colidx(k) = jcol
+                     a(k)  = 0.d0
+                     goto 40
+                  else if (colidx(k) .eq. 0) then
+                     colidx(k) = jcol
+                     goto 40
+                  else if (colidx(k) .eq. jcol) then
+c---------------------------------------------------------------------
+c       ... mark the duplicated entry
+c---------------------------------------------------------------------
+                     nzloc(j) = nzloc(j) + 1
+                     goto 40
+                  endif
+               enddo
+               print *,'internal error in sparse: i=',i
+               stop
+   40          continue
+               a(k) = a(k) + va
+            enddo
+   60       continue
+         enddo
+         size = size * ratio
+      enddo
+
+
+c---------------------------------------------------------------------
+c       ... remove empty entries and generate final results
+c---------------------------------------------------------------------
+      do j = 2, nrows
+         nzloc(j) = nzloc(j) + nzloc(j-1)
+      enddo
+
+      do j = 1, nrows
+         if (j .gt. 1) then
+            j1 = rowstr(j) - nzloc(j-1)
+         else
+            j1 = 1
+         endif
+         j2 = rowstr(j+1) - nzloc(j) - 1
+         nza = rowstr(j)
+         do k = j1, j2
+            a(k) = a(nza)
+            colidx(k) = colidx(nza)
+            nza = nza + 1
+         enddo
+      enddo
+      do j = 2, nrows+1
+         rowstr(j) = rowstr(j) - nzloc(j-1)
+      enddo
+      nza = rowstr(nrows+1) - 1
+
+
+CC       write (*, 11000) nza
+      return
+11000   format ( //,'final nonzero count in sparse ',
+     1            /,'number of nonzeros       = ', i16 )
+      end
+c-------end   of sparse-----------------------------
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      subroutine sprnvc( n, nz, nn1, v, iv )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit           none
+      double precision   v(*)
+      integer            n, nz, nn1, iv(*)
+      common /urando/    amult, tran
+      double precision   amult, tran
+
+
+c---------------------------------------------------------------------
+c       generate a sparse n-vector (v, iv)
+c       having nzv nonzeros
+c
+c       mark(i) is set to 1 if position i is nonzero.
+c       mark is all zero on entry and is reset to all zero before exit
+c       this corrects a performance bug found by John G. Lewis, caused by
+c       reinitialization of mark on every one of the n calls to sprnvc
+c---------------------------------------------------------------------
+
+        integer            nzv, ii, i, icnvrt
+
+        external           randlc, icnvrt
+        double precision   randlc, vecelt, vecloc
+
+
+        nzv = 0
+
+100     continue
+        if (nzv .ge. nz) goto 110
+
+         vecelt = randlc( tran, amult )
+
+c---------------------------------------------------------------------
+c   generate an integer between 1 and n in a portable manner
+c---------------------------------------------------------------------
+         vecloc = randlc(tran, amult)
+         i = icnvrt(vecloc, nn1) + 1
+         if (i .gt. n) goto 100
+
+c---------------------------------------------------------------------
+c  was this integer generated already?
+c---------------------------------------------------------------------
+         do ii = 1, nzv
+            if (iv(ii) .eq. i) goto 100
+         enddo
+         nzv = nzv + 1
+         v(nzv) = vecelt
+         iv(nzv) = i
+         goto 100
+110     continue
+
+      return
+      end
+c-------end   of sprnvc-----------------------------
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      function icnvrt(x, ipwr2)
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit           none
+      double precision   x
+      integer            ipwr2, icnvrt
+
+c---------------------------------------------------------------------
+c    scale a double precision number x in (0,1) by a power of 2 and chop it
+c---------------------------------------------------------------------
+      icnvrt = int(ipwr2 * x)
+
+      return
+      end
+c-------end   of icnvrt-----------------------------
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      subroutine vecset(n, v, iv, nzv, i, val)
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      implicit           none
+      integer            n, iv(*), nzv, i, k
+      double precision   v(*), val
+
+c---------------------------------------------------------------------
+c       set ith element of sparse vector (v, iv) with
+c       nzv nonzeros to val
+c---------------------------------------------------------------------
+
+      logical set
+
+      set = .false.
+      do k = 1, nzv
+         if (iv(k) .eq. i) then
+            v(k) = val
+            set  = .true.
+         endif
+      enddo
+      if (.not. set) then
+         nzv     = nzv + 1
+         v(nzv)  = val
+         iv(nzv) = i
+      endif
+      return
+      end
+c-------end   of vecset-----------------------------
+
+      include 'print_results.f'
+      include 'timers.f'
+      include 'randdp.f'
+
diff --git a/sapfor/experts/Sapfor_2017/CMakeLists.txt b/sapfor/experts/Sapfor_2017/CMakeLists.txt
index 97f4aeb..c93d69d 100644
--- a/sapfor/experts/Sapfor_2017/CMakeLists.txt
+++ b/sapfor/experts/Sapfor_2017/CMakeLists.txt
@@ -94,7 +94,9 @@ set(UTILS _src/Utils/AstWrapper.h
         _src/Utils/types.h
         _src/Utils/utils.cpp
         _src/Utils/utils.h
-        _src/Utils/version.h)
+        _src/Utils/version.h
+        _src/Utils/module_utils.h
+        _src/Utils/module_utils.cpp)
         
 set(OMEGA _src/SageAnalysisTool/OmegaForSage/add-assert.cpp
         _src/SageAnalysisTool/OmegaForSage/affine.cpp
diff --git a/sapfor/experts/Sapfor_2017/_src/DirectiveProcessing/insert_directive.cpp b/sapfor/experts/Sapfor_2017/_src/DirectiveProcessing/insert_directive.cpp
index 0b58816..642a5b7 100644
--- a/sapfor/experts/Sapfor_2017/_src/DirectiveProcessing/insert_directive.cpp
+++ b/sapfor/experts/Sapfor_2017/_src/DirectiveProcessing/insert_directive.cpp
@@ -832,15 +832,22 @@ static pair<templateDir, string>
     return make_pair(retDir, lastReturn);
 }
 
-static pair<string, string> getModuleRename(const map<string, set<SgSymbol*>>& byUse, const DIST::Array* array,
-                                            const string& filename, const pair<int, int>& lineRange)
+static pair<string, string> getModuleRename(const set<SgStatement*>& allocatableStmts, const DIST::Array* array)
 {
-    auto declS = array->GetDeclSymbol(filename, lineRange, getAllFilesInProject())->GetOriginal();
-    for (auto& elem : byUse)
-        for (auto& localS : setToMapWithSortByStr(elem.second))
-            if (OriginalSymbol(localS.second) == declS)
-                return make_pair(elem.first, localS.second->identifier());
-    return make_pair("", "");
+    if (array->GetLocation().first == DIST::l_MODULE)
+    {
+        set<string> arrayNames;
+        for (auto& alloc : allocatableStmts)
+            if (alloc->variant() == ALLOCATE_STMT)
+                arrayNames.insert(getNameByUse(alloc, array->GetShortName(), array->GetLocation().second));
+
+        if (arrayNames.size() > 1 || arrayNames.size() == 0)
+            printInternalError(convertFileName(__FILE__).c_str(), __LINE__);
+
+        return make_pair(array->GetShortName(), *arrayNames.begin());
+    }
+    else
+        return make_pair("", "");
 }
 
 static pair<DIST::Array*, string>
@@ -848,10 +855,7 @@ getNewDirective(const string &fullArrayName,
                 const vector<string> &distrRules,
                 const vector<string> &alignRules,
                 const DataDirective &dataDir,
-                const map<string, set<SgSymbol*>> &byUse,
-                const string& filename,
-                const pair<int, int>& lineRange,
-                bool alignToRealign)
+                const set<SgStatement*>& allocatableStmts)
 {
     string out = "";
     DIST::Array* outA = NULL;
@@ -873,7 +877,7 @@ getNewDirective(const string &fullArrayName,
         if (dataDir.alignRules[i].alignArray->GetName() == fullArrayName)
         {
             string rule = alignRules[i];
-            if (alignToRealign)
+            if (allocatableStmts.size())
             {
                 auto it = rule.find("ALIGN");
                 while (it != string::npos)
@@ -881,8 +885,8 @@ getNewDirective(const string &fullArrayName,
                     rule = rule.replace(it, 5, "REALIGN");
                     it = rule.find("ALIGN", it + 7);
                 }
-
-                auto renamePair = getModuleRename(byUse, dataDir.alignRules[i].alignArray, filename, lineRange);
+                
+                auto renamePair = getModuleRename(allocatableStmts, dataDir.alignRules[i].alignArray);
                 if (renamePair.first != "")
                 {
                     it = rule.find(renamePair.first);
@@ -1722,29 +1726,12 @@ void insertDistributionToFile(SgFile *file, const char *fin_name, const DataDire
 
                         if (distrArrays.find(fullArrayName) != distrArrays.end())
                         {
-                            map<string, set<SgSymbol*>> byUseInFunc;
-
                             const vector<SgStatement*> &allocatableStmtsCopy = getAttributes<SgStatement*, SgStatement*>(st, set<int>{ ALLOCATE_STMT });
                             set<SgStatement*> allocatableStmts;
                             if (allocatableStmtsCopy.size())
-                            {
                                 allocatableStmts = filterAllocateStats(file, allocatableStmtsCopy, currSymb->identifier());
-
-                                
-                                for (auto &alloc : allocatableStmts)
-                                {
-                                    if (alloc->fileName() != currFilename)
-                                        if (!alloc->switchToFile())
-                                            printInternalError(convertFileName(__FILE__).c_str(), __LINE__);
-
-                                    auto byUse = moduleRefsByUseInFunction(alloc);
-                                    for (auto &byUseElem : byUse)
-                                        byUseInFunc[byUseElem.first].insert(byUseElem.second.begin(), byUseElem.second.end());
-
-                                    SgFile::switchToFile(currFilename);
-                                }
-                            }
-                            pair<DIST::Array*, string> dirWithArray = getNewDirective(fullArrayName, distrRules, alignRules, dataDir, byUseInFunc, filename, lineRange, allocatableStmts.size() != 0);
+   
+                            pair<DIST::Array*, string> dirWithArray = getNewDirective(fullArrayName, distrRules, alignRules, dataDir, allocatableStmts);
 
                             string toInsert = dirWithArray.second;
                             if (toInsert != "")
diff --git a/sapfor/experts/Sapfor_2017/_src/DvmhRegions/DvmhRegionInserter.cpp b/sapfor/experts/Sapfor_2017/_src/DvmhRegions/DvmhRegionInserter.cpp
index 5dad847..9b760c7 100644
--- a/sapfor/experts/Sapfor_2017/_src/DvmhRegions/DvmhRegionInserter.cpp
+++ b/sapfor/experts/Sapfor_2017/_src/DvmhRegions/DvmhRegionInserter.cpp
@@ -166,155 +166,6 @@ void DvmhRegionInserter::updateParallelFunctions(const map<string, vector<LoopGr
     }
 }
 
-static void findByUse(map<string, vector<pair<SgSymbol*, SgSymbol*>>> &modByUse, const string& varName, 
-                      const set<string>& locNames, vector<string> &altNames)
-{
-    for (auto& elem : modByUse)
-    {
-        if (locNames.count(elem.first))
-        {
-            for (auto& byUse : elem.second)
-            {
-                SgSymbol* toCmp = byUse.second ? byUse.second : byUse.first;
-                checkNull(toCmp, convertFileName(__FILE__).c_str(), __LINE__);
-                if (toCmp->identifier() == varName)
-                    altNames.push_back(byUse.first->identifier());
-            }
-        }
-    }
-}
-
-static void fillInfo(SgStatement *start,
-                     set<string> &useMod,
-                     map<string, vector<pair<SgSymbol*, SgSymbol*>>> &modByUse,
-                     map<string, vector<pair<SgSymbol*, SgSymbol*>>> &modByUseOnly)
-{
-    for (SgStatement* st = start; st != start->lastNodeOfStmt(); st = st->lexNext())
-    {
-        if (isSgExecutableStatement(st))
-            break;
-        if (st->variant() == CONTAINS_STMT)
-            break;
-        if (st != start && (st->variant() == PROC_HEDR || st->variant() == FUNC_HEDR))
-            break;
-        fillUseStatement(st, useMod, modByUse, modByUseOnly);
-    }
-}
-
-static SgStatement* findModWithName(const vector<SgStatement*> &modules, const string &name)
-{
-    for (auto& elem : modules)
-        if (elem->variant() == MODULE_STMT)
-            if (elem->symbol()->identifier() == name)
-                return elem;
-    return NULL;
-}
-
-static string getNameByUse(SgStatement *place, const string &varName, const string &locName)
-{
-    SgStatement* func = getFuncStat(place);
-    if (func == NULL)
-        return varName;
-    else
-    {
-        map<string, set<string>> graphUse;
-
-        set<string> useMod;
-        map<string, vector<pair<SgSymbol*, SgSymbol*>>> modByUse;
-        map<string, vector<pair<SgSymbol*, SgSymbol*>>> modByUseOnly;
-
-        fillInfo(func, useMod, modByUse, modByUseOnly);
-        SgStatement* cp = func->controlParent();
-        if (isSgProgHedrStmt(cp) || cp->variant() == MODULE_STMT) // if function in contains region
-            fillInfo(cp, useMod, modByUse, modByUseOnly);
-
-        set<string> useModDone;
-        bool needRepeat = true;
-
-        vector<SgStatement*> modules;
-        findModulesInFile(func->getFile(), modules);
-
-        while (needRepeat)
-        {
-            needRepeat = false;
-            set<string> newUseMod;
-            for (auto& useM : useMod)
-            {
-                if (useModDone.find(useM) == useModDone.end())
-                {
-                    auto modSt = findModWithName(modules, useM);
-                    if (modSt == NULL || useM == "dvmh_template_mod")
-                        continue;
-
-                    checkNull(modSt, convertFileName(__FILE__).c_str(), __LINE__);
-
-                    set<string> tmpUse;
-                    fillInfo(modSt, tmpUse, modByUse, modByUseOnly);
-                    useModDone.insert(useM);
-
-                    for (auto& use : tmpUse)
-                    {
-                        newUseMod.insert(use);
-
-                        if (use != "dvmh_template_mod")
-                            graphUse[use].insert(useM);
-                    }
-                }
-            }
-
-            for (auto& newU : newUseMod)
-            {
-                if (useModDone.find(newU) == useModDone.end())
-                {
-                    useModDone.insert(newU);
-                    needRepeat = true;
-                }
-            }
-        }
-
-        vector<string> altNames;        
-        findByUse(modByUse, varName, { locName }, altNames);
-        findByUse(modByUseOnly, varName, { locName }, altNames);
-        
-        if (altNames.size() == 0) 
-        {
-            set<string> locations = { locName };
-            bool changed = true;
-            while (changed)
-            {
-                changed = false;
-                for (auto& loc : locations)
-                {
-                    if (graphUse.find(loc) != graphUse.end())
-                    {
-                        for (auto& use : graphUse[loc])
-                        {
-                            if (locations.find(use) == locations.end())
-                            {
-                                locations.insert(use);
-                                changed = true;
-                            }
-                        }
-                    }
-                }
-            }
-
-            findByUse(modByUse, varName, locations, altNames);
-            findByUse(modByUseOnly, varName, locations, altNames);
-        }
-
-        if (altNames.size() == 0)
-            return varName;
-        else if (altNames.size() >= 1)
-        {
-            set<string> setAlt(altNames.begin(), altNames.end());
-            return *setAlt.begin();
-        }
-        else
-            printInternalError(convertFileName(__FILE__).c_str(), __LINE__);
-    }
-}
-
 static SgStatement* skipDvmhRegionInterval(SgStatement *start)
 {
     if (start->variant() != ACC_REGION_DIR)
diff --git a/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls.cpp b/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls.cpp
index 35c3dfb..f583a20 100644
--- a/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls.cpp
+++ b/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls.cpp
@@ -2296,123 +2296,6 @@ void checkForRecursion(SgFile *file, map<string, vector<FuncInfo*>> &allFuncInfo
     }
 }
 
-static void fillUseStmt(SgStatement *stat, map<string, set<SgSymbol*>> &byUse)
-{
-    if (stat->variant() != USE_STMT)
-        printInternalError(convertFileName(__FILE__).c_str(), __LINE__);
-
-    SgExpression* ex = stat->expr(0);
-    if (ex && ex->variant() == ONLY_NODE)
-    {
-        for (auto exI = ex->lhs(); exI; exI = exI->rhs())
-        {
-            if (exI->lhs()->variant() == RENAME_NODE)
-            {
-                SgExpression* ren = exI->lhs();
-                if (ren->lhs()->symbol() && ren->rhs() && ren->rhs()->symbol())
-                    byUse[ren->rhs()->symbol()->identifier()].insert(ren->lhs()->symbol());
-            }
-        }
-    }
-    else if (ex && ex->lhs())
-    {
-        for (auto exI = ex; exI; exI = exI->rhs())
-        {
-            if (exI->lhs()->variant() == RENAME_NODE)
-            {
-                SgExpression* ren = exI->lhs();
-                if (ren->lhs()->symbol() && ren->rhs() && ren->rhs()->symbol())
-                    byUse[ren->rhs()->symbol()->identifier()].insert(ren->lhs()->symbol());
-            }
-        }
-    }
-}
-
-map<string, set<SgSymbol*>> moduleRefsByUseInFunction(SgStatement *stIn)
-{
-    checkNull(stIn, convertFileName(__FILE__).c_str(), __LINE__);
-
-    map<string, set<SgSymbol*>> byUse;
-    int var = stIn->variant();
-    while (var != PROG_HEDR && var != PROC_HEDR && var != FUNC_HEDR)
-    {
-        stIn = stIn->controlParent();
-        if (stIn == NULL)
-            return byUse;
-        var = stIn->variant();
-    }
-    
-    auto mapOfUses = createMapOfModuleUses(stIn->getFile());
-    set<string> useMods;
-
-    for (SgStatement *stat = stIn->lexNext(); !isSgExecutableStatement(stat); stat = stat->lexNext())
-    {
-        if (stat->variant() == USE_STMT)
-        {
-            fillUseStmt(stat, byUse);
-            useMods.insert(stat->symbol()->identifier());
-        }
-    }
-
-    const int cpOfSt = stIn->controlParent()->variant();
-    //contains of func
-    if (cpOfSt == PROG_HEDR || cpOfSt == PROC_HEDR || cpOfSt == FUNC_HEDR)
-    {
-        for (SgStatement *stat = stIn->controlParent()->lexNext(); !isSgExecutableStatement(stat); stat = stat->lexNext())
-        {
-            if (stat->variant() == USE_STMT)
-            {
-                fillUseStmt(stat, byUse);
-                useMods.insert(stat->symbol()->identifier());
-            }
-        }
-    }
-
-    bool chages = true;
-    while (chages)
-    {
-        chages = false;
-        set<string> newUseMods(useMods);
-        for (auto &elem : useMods)
-        {
-            auto it = mapOfUses.find(elem);
-            if (it != mapOfUses.end())
-            {
-                for (auto &elem2 : it->second)
-                {
-                    if (newUseMods.find(elem2) == newUseMods.end())
-                    {
-                        newUseMods.insert(elem2);
-                        chages = true;
-                    }
-                }
-            }
-        }
-        useMods = newUseMods;
-    }
-
-    vector<SgStatement*> modules;
-    findModulesInFile(stIn->getFile(), modules);
-    for (auto &mod : modules)
-    {
-        if (useMods.find(mod->symbol()->identifier()) != useMods.end())
-        {
-            for (SgStatement *stat = mod->lexNext(); stat != mod->lastNodeOfStmt(); stat = stat->lexNext())
-            {
-                const int var = stat->variant();
-                if (var == USE_STMT)
-                {
-                    fillUseStmt(stat, byUse);
-                    useMods.insert(stat->symbol()->identifier());
-                }
-                else if (var == PROC_HEDR || var == FUNC_HEDR)
-                    break;
-            }
-        }
-    }
-    return byUse;
-}
-
 void propagateWritesToArrays(map<string, vector<FuncInfo*>> &allFuncInfo)
 {
     map<string, FuncInfo*> funcByName;
diff --git a/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls_func.h b/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls_func.h
index a4d3715..0660d67 100644
--- a/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls_func.h
+++ b/sapfor/experts/Sapfor_2017/_src/GraphCall/graph_calls_func.h
@@ -45,7 +45,6 @@ int CheckFunctionsToInline(SgProject *proj, const std::map<std::string, int> &fi
 void checkForRecursion(SgFile *file, std::map<std::string, std::vector<FuncInfo*>> &allFuncInfo, std::vector<Messages> &messagesForFile);
 bool isPassFullArray(SgExpression *ex);
 void doMacroExpand(SgFile *file, std::vector<Messages> &messages);
-std::map<std::string, std::set<SgSymbol*>> moduleRefsByUseInFunction(SgStatement *stIn);
 void propagateWritesToArrays(std::map<std::string, std::vector<FuncInfo*>> &allFuncInfo);
 void detectCopies(std::map<std::string, std::vector<FuncInfo*>> &allFuncInfo);
 void fillInterfaceBlock(std::map<std::string, std::vector<FuncInfo*>>& allFuncInfo);
diff --git a/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.cpp b/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.cpp
index eb9b0e2..677e0a9 100644
--- a/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.cpp
+++ b/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.cpp
@@ -879,46 +879,6 @@ void initTags()
 #include "tag.h"
 }
 
-
-void findModulesInFile(SgFile *file, vector<SgStatement*> &modules)
-{
-    SgStatement *first = file->firstStatement();
-    set<SgStatement*> functions;
-
-    int funcNum = file->numberOfFunctions();
-    for (int i = 0; i < funcNum; ++i)
-        functions.insert(file->functions(i));
-
-    while (first)
-    {
-        if (first->variant() == MODULE_STMT)
-        {
-            modules.push_back(first);
-            first = first->lastNodeOfStmt();
-        }
-        else
-        {
-            if (functions.size())
-            {
-                auto it = functions.find(first);
-                if (it != functions.end())
-                    first = (*it)->lastNodeOfStmt();
-            }
-        }
-
-        first = first->lexNext();
-    }
-}
-
-void getModulesAndFunctions(SgFile *file, vector<SgStatement*> &modulesAndFunctions)
-{
-    findModulesInFile(file, modulesAndFunctions);
-
-    int funcNum = file->numberOfFunctions();
-    for (int i = 0; i < funcNum; ++i)
-        modulesAndFunctions.push_back(file->functions(i));
-}
-
 void tryToFindPrivateInAttributes(SgStatement *st, set<string> &privates, bool onlyReduction, bool onlyUsers)
 {
     set<Symbol*> privatesVars;
@@ -2365,76 +2325,6 @@ objT& getObjectForFileFromMap(const char *fileName, map<string, objT> &mapObject
 template vector<SpfInterval*>& getObjectForFileFromMap(const char *fileName, map<string, vector<SpfInterval*>>&);
 template PredictorStats& getObjectForFileFromMap(const char *fileName, map<string, PredictorStats>&);
 
-SgSymbol* getFromModule(const map<string, set<SgSymbol*>> &byUse, SgSymbol *orig, bool processAsModule)
-{
-    if (!processAsModule)
-    {
-        checkNull(orig->scope(), convertFileName(__FILE__).c_str(), __LINE__);
-        if (orig->scope()->variant() != MODULE_STMT)
-            return orig;
-    }
-
-    if (byUse.size())
-    {
-        for (auto& elem : byUse) 
-        {
-            for (auto& localS : setToMapWithSortByStr(elem.second))
-                if (OriginalSymbol(localS.second)->thesymb == orig->thesymb)
-                    return localS.second;
-        }
-    }
-    return orig;
-}
-
-map<string, set<string>> createMapOfModuleUses(SgFile *file)
-{
-	map<string, set<string>> retValMap;
-
-	vector<SgStatement*> modules;
-	findModulesInFile(file, modules);
-
-	for (int z = 0; z < modules.size(); ++z)
-	{
-		SgStatement *curr = modules[z];
-        string modName = curr->symbol()->identifier();
-        for (SgStatement *st = curr->lexNext(); st != curr->lastNodeOfStmt(); st = st->lexNext())
-        {
-            if (st->variant() == USE_STMT)
-                retValMap[modName].insert(st->symbol()->identifier());
-            else if (st->variant() == PROC_HEDR || st->variant() == FUNC_HEDR)
-                break;
-        }
-	}
-
-    bool repeat = true;
-    while (repeat)
-    {
-        repeat = false;
-        for (auto &elem : retValMap)
-        {
-            set<string> toAdd(elem.second);
-            for (auto &inUse : elem.second)
-            {
-                auto it = retValMap.find(inUse);
-                if (it != retValMap.end())
-                {
-                    for (auto &inUseToAdd : it->second)
-                    {
-                        if (toAdd.find(inUseToAdd) == toAdd.end())
-                        {
-                            toAdd.insert(inUseToAdd);
-                            repeat = true;
-                        }
-                    }
-                }
-            }
-            elem.second = toAdd;
-        }
-    }
-
-    return retValMap;
-}
-
 void printSymbolTable(SgFile *file, string filter, const set<int>& vars)
 {
     for (auto s = file->firstSymbol(); s; s = s->next())
@@ -2574,96 +2464,6 @@ SgStatement* duplicateProcedure(SgStatement *toDup, const string *newName, bool
     return toMove;
 }
 
-void fillModuleUse(SgFile *file, map<string, set<string>> &moduleUses, map<string, string> &moduleDecls)
-{
-    const string currFN = file->filename();
-    for (SgStatement* st = file->firstStatement(); st; st = st->lexNext())
-    {
-        if (st->fileName() == currFN)
-        {
-            if (st->variant() == USE_STMT)
-                moduleUses[currFN].insert(st->symbol()->identifier());
-
-            if (st->variant() == MODULE_STMT)
-            {
-                string moduleN = st->symbol()->identifier();
-                auto it = moduleDecls.find(moduleN);
-                if (it != moduleDecls.end())
-                    printInternalError(convertFileName(__FILE__).c_str(), __LINE__);
-                moduleDecls[moduleN] = currFN;
-            }
-        }
-    }
-}
-
-void filterModuleUse(map<string, set<string>>& moduleUsesByFile, map<string, string>& moduleDecls)
-{
-    for (auto& elem : moduleUsesByFile)
-    {
-        set<string> newSet;
-        for (auto& setElem : elem.second)
-        {
-            auto it = moduleDecls.find(setElem);
-            if (it == moduleDecls.end())
-                newSet.insert(setElem);
-            else if (elem.first != it->second)
-                newSet.insert(setElem);
-        }
-        elem.second = newSet;
-    }
-
-    /*map<string, set<string>> modIncludeMod;
-
-    for (auto& mod : moduleDecls)
-    {
-        string name = mod.first;
-        string file = mod.second;
-
-        auto it = moduleUsesByFile.find(file);
-        if (it != moduleUsesByFile.end())
-            modIncludeMod[name] = it->second;
-    }
-
-    bool change = true;
-    while (change)
-    {
-        change = false;
-        for (auto& mod : modIncludeMod)
-        {
-            set<string> newSet = mod.second;
-            for (auto& included : mod.second)
-            {
-                auto it = modIncludeMod.find(included);
-                if (it == modIncludeMod.end())
-                    continue;
-
-                for (auto& elem : it->second)
-                {
-                    if (newSet.find(elem) == newSet.end())
-                    {
-                        newSet.insert(elem);
-                        change = true;
-                    }
-                }
-            }
-            mod.second = newSet;
-        }
-    }
-
-    for (auto& elem : moduleUsesByFile)
-    {
-        set<string> newSet = elem.second;
-        for (auto& setElem : elem.second)
-        {
-            auto it = modIncludeMod.find(setElem);
-            if (it != modIncludeMod.end())
-                for (auto& toRem : it->second)
-                    newSet.erase(toRem);
-        }
-        elem.second = newSet;
-    }*/
-}
-
 SgExpression* makeExprList(const vector<SgExpression*>& items, bool withSort)
 {
     SgExpression* list = NULL;
@@ -2847,59 +2647,6 @@ int getNextFreeLabel()
     return -1;
 }
 
-static void addUseStatements(SgStatement* currF, SgStatement* obj, vector<SgStatement*>& useStats,
-                             const vector<SgStatement*>& funcContains)
-{
-    for (auto& funcSt : funcContains)
-    {
-        if (currF == funcSt)
-        {
-            SgStatement* last = obj->lastNodeOfStmt();
-            for (SgStatement* st = obj->lexNext(); st != last; st = st->lexNext())
-            {
-                if (st->variant() == USE_STMT)
-                    useStats.push_back(st);
-                else if (st->variant() == CONTAINS_STMT)
-                    break;
-            }
-            break;
-        }
-    }
-}
-
-void fillUsedModulesInFunction(SgStatement *st, vector<SgStatement*> &useStats)
-{
-    checkNull(st, convertFileName(__FILE__).c_str(), __LINE__);
-
-    int var = st->variant();
-    while (var != PROG_HEDR && var != PROC_HEDR && var != FUNC_HEDR)
-    {
-        st = st->controlParent();
-        checkNull(st, convertFileName(__FILE__).c_str(), __LINE__);
-        var = st->variant();
-    }
-
-    for (SgStatement *stat = st->lexNext(); !isSgExecutableStatement(stat); stat = stat->lexNext())
-        if (stat->variant() == USE_STMT)
-            useStats.push_back(stat);
-
-    for (int i = 0; i < current_file->numberOfFunctions(); ++i)
-    {
-        vector<SgStatement*> funcContains;
-        findContainsFunctions(current_file->functions(i), funcContains);
-        addUseStatements(st, current_file->functions(i), useStats, funcContains);        
-    }
-
-    vector<SgStatement*> modules;
-    findModulesInFile(st->getFile(), modules);
-    for (auto &module : modules)
-    {
-        vector<SgStatement*> funcContains;
-        findContainsFunctions(module, funcContains, true);
-        addUseStatements(st, module, useStats, funcContains);
-    }
-}
-
 static void recFillUsedVars(SgExpression *exp, map<string, SgSymbol*> &vars)
 {
     if (exp)
diff --git a/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.h b/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.h
index ff889c0..5ce6d38 100644
--- a/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.h
+++ b/sapfor/experts/Sapfor_2017/_src/Utils/SgUtils.h
@@ -5,6 +5,7 @@
 #include "../Distribution/Distribution.h"
 #include "../GraphCall/graph_calls.h"
 #include "../DynamicAnalysis/gcov_info.h"
+#include "module_utils.h"
 
 SgStatement* declaratedInStmt(SgSymbol *toFind, std::vector<SgStatement*> *allDecls = NULL, bool printInternal = true, SgStatement* scope = NULL);
 
@@ -15,8 +16,7 @@ std::string removeIncludeStatsAndUnparse(SgFile *file, const char *fileName, con
 SgSymbol* findSymbolOrCreate(SgFile *file, const std::string toFind, SgType *type = NULL, SgStatement *scope = NULL);
 void recExpressionPrint(SgExpression *exp);
 void removeSubstrFromStr(std::string &str, const std::string &del);
-void getModulesAndFunctions(SgFile *file, std::vector<SgStatement*> &modulesAndFunctions);
-void findModulesInFile(SgFile *file, std::vector<SgStatement*> &modules);
+
 void tryToFindPrivateInAttributes(SgStatement* st, std::set<std::string>& privatesVars, bool onlyReduction = false, bool onlyUsers = false);
 
 void fillNonDistrArraysAsPrivate(SgStatement *st,
@@ -60,15 +60,11 @@ const CommonBlock* isArrayInCommon(const std::map<std::string, CommonBlock*> &co
 
 std::vector<DIST::Array*> fillArraysFromDir(Statement *st);
 
-SgSymbol* getFromModule(const std::map<std::string, std::set<SgSymbol*>> &byUse, SgSymbol *orig, bool processAsModule = false);
-std::map<std::string, std::set<std::string>> createMapOfModuleUses(SgFile* file);
 void printSymbolTable(SgFile* file, std::string filter = "", const std::set<int>& vars = {});
 SgStatement* getFuncStat(SgStatement *st, const std::set<int> additional = std::set<int>());
 std::map<SgStatement*, std::vector<DefUseList>> createDefUseMapByPlace();
 SgStatement* duplicateProcedure(SgStatement* toDup, const std::string* newName, bool withAttributes = false, bool withComment = false, bool withSameLines = true, bool dontInsert = false);
 
-void fillModuleUse(SgFile* file, std::map<std::string, std::set<std::string>>& moduleUses, std::map<std::string, std::string>& moduleDecls);
-void filterModuleUse(std::map<std::string, std::set<std::string>>& moduleUses, std::map<std::string, std::string>& moduleDecls);
 SgExpression* makeExprList(const std::vector<SgExpression*>& items, bool withSort = true);
 std::string unparseProjectToString(SgFile* file, const int curr_regime);
 
@@ -77,7 +73,6 @@ std::vector<SgStatement*> makeDeclaration(const std::vector<SgSymbol*>& symbolsT
 
 int getNextFreeLabel();
 
-void fillUsedModulesInFunction(SgStatement *st, std::vector<SgStatement*> &useStats);
 void fillVisibleInUseVariables(SgStatement *useSt, std::map<std::string, SgSymbol*> &vars);
 
 std::string preprocDataString(std::string data, bool full = true);
@@ -91,7 +86,13 @@ void getVariables(SgExpression* ex, std::set<SgSymbol*>& variables, const std::s
 template<typename T>
 std::set<T> getAllVariables(SgStatement* stFrom, SgStatement* stTo, const std::set<int>& variants);
 
-SgProject* createProject(const char* proj_name, std::vector<ParallelRegion*>& parallelRegions, std::vector<ParallelRegion*>& subs_parallelRegions, std::map<std::string, std::vector<SgStatement*>>& hiddenData, std::map<std::string, int>& filesNameWithoutExt, std::map<std::string, std::set<std::string>>& moduleUsesByFile, std::map<std::string, std::string>& moduleDecls, std::map<std::string, std::map<SgStatement*, std::vector<SgStatement*>>>& exctactedModuleStats, bool printSymbTable);
+SgProject* createProject(const char* proj_name, std::vector<ParallelRegion*>& parallelRegions, 
+                         std::vector<ParallelRegion*>& subs_parallelRegions,
+                         std::map<std::string, std::vector<SgStatement*>>& hiddenData, 
+                         std::map<std::string, int>& filesNameWithoutExt, 
+                         std::map<std::string, std::set<std::string>>& moduleUsesByFile,
+                         std::map<std::string, std::string>& moduleDecls, 
+                         std::map<std::string, std::map<SgStatement*, std::vector<SgStatement*>>>& exctactedModuleStats, bool printSymbTable);
 
 bool isArrayType(SgType* type);
 bool isArrayRef(SgExpression* ex);
diff --git a/sapfor/experts/Sapfor_2017/_src/Utils/module_utils.cpp b/sapfor/experts/Sapfor_2017/_src/Utils/module_utils.cpp
new file mode 100644
index 0000000..99df6ba
--- /dev/null
+++ b/sapfor/experts/Sapfor_2017/_src/Utils/module_utils.cpp
@@ -0,0 +1,676 @@
+
+#include <vector>
+#include <set>
+#include <map>
+#include <string>
+
+#include "dvm.h"
+#include "errors.h"
+#include "utils.h"
+#include "../GraphCall/graph_calls_func.h"
+
+#include "module_utils.h"
+
+using std::vector;
+using std::set;
+using std::string;
+using std::map;
+using std::pair;
+using std::make_pair;
+
+void findModulesInFile(SgFile* file, vector<SgStatement*>& modules)
+{
+    SgStatement* first = file->firstStatement();
+    set<SgStatement*> functions;
+
+    int funcNum = file->numberOfFunctions();
+    for (int i = 0; i < funcNum; ++i)
+        functions.insert(file->functions(i));
+
+    while (first)
+    {
+        if (first->variant() == MODULE_STMT)
+        {
+            modules.push_back(first);
+            first = first->lastNodeOfStmt();
+        }
+        else
+        {
+            if (functions.size())
+            {
+                auto it = functions.find(first);
+                if (it != functions.end())
+                    first = (*it)->lastNodeOfStmt();
+            }
+        }
+
+        first = first->lexNext();
+    }
+}
+
+void getModulesAndFunctions(SgFile* file, vector<SgStatement*>& modulesAndFunctions)
+{
+    findModulesInFile(file, modulesAndFunctions);
+
+    int funcNum = file->numberOfFunctions();
+    for (int i = 0; i < funcNum; ++i)
+        modulesAndFunctions.push_back(file->functions(i));
+}
+
+SgSymbol* getFromModule(const map<string, set<SgSymbol*>>& byUse, SgSymbol* orig, bool processAsModule)
+{
+    if (!processAsModule)
+    {
+        checkNull(orig->scope(), convertFileName(__FILE__).c_str(), __LINE__);
+        if (orig->scope()->variant() != MODULE_STMT)
+            return orig;
+    }
+
+    if (byUse.size())
+    {
+        for (auto& elem : byUse)
+        {
+            for (auto& localS : setToMapWithSortByStr(elem.second))
+                if (OriginalSymbol(localS.second)->thesymb == orig->thesymb)
+                    return localS.second;
+        }
+    }
+    return orig;
+}
+
+map<string, set<string>> createMapOfModuleUses(SgFile* file)
+{
+    map<string, set<string>> retValMap;
+
+    vector<SgStatement*> modules;
+    findModulesInFile(file, modules);
+
+    for (int z = 0; z < modules.size(); ++z)
+    {
+        SgStatement* curr = modules[z];
+        string modName = curr->symbol()->identifier();
+        for (SgStatement* st = curr->lexNext(); st != curr->lastNodeOfStmt(); st = st->lexNext())
+        {
+            if (st->variant() == USE_STMT)
+                retValMap[modName].insert(st->symbol()->identifier());
+            else if (st->variant() == PROC_HEDR || st->variant() == FUNC_HEDR)
+                break;
+        }
+    }
+
+    bool repeat = true;
+    while (repeat)
+    {
+        repeat = false;
+        for (auto& elem : retValMap)
+        {
+            set<string> toAdd(elem.second);
+            for (auto& inUse : elem.second)
+            {
+                auto it = retValMap.find(inUse);
+                if (it != retValMap.end())
+                {
+                    for (auto& inUseToAdd : it->second)
+                    {
+                        if (toAdd.find(inUseToAdd) == toAdd.end())
+                        {
+                            toAdd.insert(inUseToAdd);
+                            repeat = true;
+                        }
+                    }
+                }
+            }
+            elem.second = toAdd;
+        }
+    }
+
+    return retValMap;
+}
+
+void fillModuleUse(SgFile* file, map<string, set<string>>& moduleUses, map<string, string>& moduleDecls)
+{
+    const string currFN = file->filename();
+    for (SgStatement* st = file->firstStatement(); st; st = st->lexNext())
+    {
+        if (st->fileName() == currFN)
+        {
+            if (st->variant() == USE_STMT)
+                moduleUses[currFN].insert(st->symbol()->identifier());
+
+            if (st->variant() == MODULE_STMT)
+            {
+                string moduleN = st->symbol()->identifier();
+                auto it = moduleDecls.find(moduleN);
+                if (it != moduleDecls.end())
+                    printInternalError(convertFileName(__FILE__).c_str(), __LINE__);
+                moduleDecls[moduleN] = currFN;
+            }
+        }
+    }
+}
+
+void filterModuleUse(map<string, set<string>>& moduleUsesByFile, map<string, string>& moduleDecls)
+{
+    for (auto& elem : moduleUsesByFile)
+    {
+        set<string> newSet;
+        for (auto& setElem : elem.second)
+        {
+            auto it = moduleDecls.find(setElem);
+            if (it == moduleDecls.end())
+                newSet.insert(setElem);
+            else if (elem.first != it->second)
+                newSet.insert(setElem);
+        }
+        elem.second = newSet;
+    }
+
+    /*map<string, set<string>> modIncludeMod;
+
+    for (auto& mod : moduleDecls)
+    {
+        string name = mod.first;
+        string file = mod.second;
+
+        auto it = moduleUsesByFile.find(file);
+        if (it != moduleUsesByFile.end())
+            modIncludeMod[name] = it->second;
+    }
+
+    bool change = true;
+    while (change)
+    {
+        change = false;
+        for (auto& mod : modIncludeMod)
+        {
+            set<string> newSet = mod.second;
+            for (auto& included : mod.second)
+            {
+                auto it = modIncludeMod.find(included);
+                if (it == modIncludeMod.end())
+                    continue;
+
+                for (auto& elem : it->second)
+                {
+                    if (newSet.find(elem) == newSet.end())
+                    {
+                        newSet.insert(elem);
+                        change = true;
+                    }
+                }
+            }
+            mod.second = newSet;
+        }
+    }
+
+    for (auto& elem : moduleUsesByFile)
+    {
+        set<string> newSet = elem.second;
+        for (auto& setElem : elem.second)
+        {
+            auto it = modIncludeMod.find(setElem);
+            if (it != modIncludeMod.end())
+                for (auto& toRem : it->second)
+                    newSet.erase(toRem);
+        }
+        elem.second = newSet;
+    }*/
+}
+
+static void addUseStatements(SgStatement* currF, SgStatement* obj, vector<SgStatement*>& useStats,
+    const vector<SgStatement*>& funcContains)
+{
+    for (auto& funcSt : funcContains)
+    {
+        if (currF == funcSt)
+        {
+            SgStatement* last = obj->lastNodeOfStmt();
+            for (SgStatement* st = obj->lexNext(); st != last; st = st->lexNext())
+            {
+                if (st->variant() == USE_STMT)
+                    useStats.push_back(st);
+                else if (st->variant() == CONTAINS_STMT)
+                    break;
+            }
+            break;
+        }
+    }
+}
+
+void fillUsedModulesInFunction(SgStatement* st, vector<SgStatement*>& useStats)
+{
+    checkNull(st, convertFileName(__FILE__).c_str(), __LINE__);
+
+    int var = st->variant();
+    while (var != PROG_HEDR && var != PROC_HEDR && var != FUNC_HEDR)
+    {
+        st = st->controlParent();
+        checkNull(st, convertFileName(__FILE__).c_str(), __LINE__);
+        var = st->variant();
+    }
+
+    for (SgStatement* stat = st->lexNext(); !isSgExecutableStatement(stat); stat = stat->lexNext())
+        if (stat->variant() == USE_STMT)
+            useStats.push_back(stat);
+
+    for (int i = 0; i < current_file->numberOfFunctions(); ++i)
+    {
+        vector<SgStatement*> funcContains;
+        findContainsFunctions(current_file->functions(i), funcContains);
+        addUseStatements(st, current_file->functions(i), useStats, funcContains);
+    }
+
+    vector<SgStatement*> modules;
+    findModulesInFile(st->getFile(), modules);
+    for (auto& module : modules)
+    {
+        vector<SgStatement*> funcContains;
+        findContainsFunctions(module, funcContains, true);
+        addUseStatements(st, module, useStats, funcContains);
+    }
+}
+
+static void findByUse(map<string, vector<pair<SgSymbol*, SgSymbol*>>>& modByUse, const string& varName,
+    const set<string>& locNames, vector<string>& altNames)
+{
+    for (auto& elem : modByUse)
+    {
+        if (locNames.count(elem.first))
+        {
+            for (auto& byUse : elem.second)
+            {
+                SgSymbol* toCmp = byUse.second ? byUse.second : byUse.first;
+                checkNull(toCmp, convertFileName(__FILE__).c_str(), __LINE__);
+                if (toCmp->identifier() == varName)
+                    altNames.push_back(byUse.first->identifier());
+            }
+        }
+    }
+}
+
+static void fillInfo(SgStatement* start,
+    set<string>& useMod,
+    map<string, vector<pair<SgSymbol*, SgSymbol*>>>& modByUse,
+    map<string, vector<pair<SgSymbol*, SgSymbol*>>>& modByUseOnly)
+{
+    for (SgStatement* st = start; st != start->lastNodeOfStmt(); st = st->lexNext())
+    {
+        if (isSgExecutableStatement(st))
+            break;
+        if (st->variant() == CONTAINS_STMT)
+            break;
+        if (st != start && (st->variant() == PROC_HEDR || st->variant() == FUNC_HEDR))
+            break;
+        fillUseStatement(st, useMod, modByUse, modByUseOnly);
+    }
+}
+
+static SgStatement* findModWithName(const vector<SgStatement*>& modules, const string& name)
+{
+    for (auto& elem : modules)
+        if (elem->variant() == MODULE_STMT)
+            if (elem->symbol()->identifier() == name)
+                return elem;
+    return NULL;
+}
+
+string getNameByUse(SgStatement* place, const string& varName, const string& locName)
+{
+    int old_id = -1;
+    string oldFileName = "";
+    if (place->getFileId() != current_file_id)
+    {
+        old_id = current_file_id;
+        oldFileName = current_file->filename();
+        if (!place->switchToFile())
+            printInternalError(convertFileName(__FILE__).c_str(), __LINE__);
+    }
+
+    SgStatement* func = getFuncStat(place, { MODULE_STMT });
+    string returnVal = varName;
+    if (func != NULL)
+    {
+        map<string, set<string>> graphUse;
+
+        set<string> useMod;
+        map<string, vector<pair<SgSymbol*, SgSymbol*>>> modByUse;
+        map<string, vector<pair<SgSymbol*, SgSymbol*>>> modByUseOnly;
+
+        fillInfo(func, useMod, modByUse, modByUseOnly);
+        SgStatement* cp = func->controlParent();
+        if (isSgProgHedrStmt(cp) || cp->variant() == MODULE_STMT) // if function in contains region
+            fillInfo(cp, useMod, modByUse, modByUseOnly);
+
+        set<string> useModDone;
+        bool needRepeat = true;
+
+        vector<SgStatement*> modules;
+        findModulesInFile(func->getFile(), modules);
+
+        while (needRepeat)
+        {
+            needRepeat = false;
+            set<string> newUseMod;
+            for (auto& useM : useMod)
+            {
+                if (useModDone.find(useM) == useModDone.end())
+                {
+                    auto modSt = findModWithName(modules, useM);
+                    if (modSt == NULL || useM == "dvmh_template_mod")
+                        continue;
+
+                    checkNull(modSt, convertFileName(__FILE__).c_str(), __LINE__);
+
+                    set<string> tmpUse;
+                    fillInfo(modSt, tmpUse, modByUse, modByUseOnly);
+                    useModDone.insert(useM);
+
+                    for (auto& use : tmpUse)
+                    {
+                        newUseMod.insert(use);
+
+                        if (use != "dvmh_template_mod")
+                            graphUse[use].insert(useM);
+                    }
+                }
+            }
+
+            for (auto& newU : newUseMod)
+            {
+                if (useModDone.find(newU) == useModDone.end())
+                {
+                    useModDone.insert(newU);
+                    needRepeat = true;
+                }
+            }
+        }
+
+        vector<string> altNames;
+        findByUse(modByUse, varName, { locName }, altNames);
+        findByUse(modByUseOnly, varName, { locName }, altNames);
+
+        if (altNames.size() == 0)
+        {
+            set<string> locations = { locName };
+            bool changed = true;
+            while (changed)
+            {
+                changed = false;
+                for (auto& loc : locations)
+                {
+                    if (graphUse.find(loc) != graphUse.end())
+                    {
+                        for (auto& use : graphUse[loc])
+                        {
+                            if (locations.find(use) == locations.end())
+                            {
+                                locations.insert(use);
+                                changed = true;
+                            }
+                        }
+                    }
+                }
+            }
+
+            findByUse(modByUse, varName, locations, altNames);
+            findByUse(modByUseOnly, varName, locations, altNames);
+        }
+
+        if (altNames.size() == 0)
+            returnVal = varName;
+        else if (altNames.size() >= 1)
+        {
+            set<string> setAlt(altNames.begin(), altNames.end());
+            returnVal = *setAlt.begin();
+        }
+        else
+            printInternalError(convertFileName(__FILE__).c_str(), __LINE__);
+    }
+
+    if (old_id != -1)
+    {
+        if (SgFile::switchToFile(oldFileName) == -1)
+            printInternalError(convertFileName(__FILE__).c_str(), __LINE__);
+    }
+
+    return returnVal;
+}
+
+void fixUseOnlyStmt(SgFile *file, const vector<ParallelRegion*> &regs)
+{
+    for (int z = 0; z < file->numberOfFunctions(); ++z)
+    {
+        vector<SgStatement*> modules;
+        findModulesInFile(file, modules);
+        map<string, SgStatement*> mod;
+        for (auto &elem : modules)
+            mod[elem->symbol()->identifier()] = elem;
+
+        if (modules.size())
+        {
+            SgStatement *func = file->functions(z);
+            bool hasTemplateUse = false;
+            set<DIST::Array*> needToAdd;
+
+            for (auto st = func; st != func->lastNodeOfStmt(); st = st->lexNext())
+            {
+                if (isSgExecutableStatement(st))
+                    break;
+
+                if (st->variant() == USE_STMT)
+                {
+                    SgExpression *ex = st->expr(0);
+                    string modName = st->symbol()->identifier();
+
+                    auto it = mod.find(modName);
+                    if (modName == "dvmh_Template_Mod")
+                    {
+                        hasTemplateUse = true;
+                        break;
+                    }
+                        
+                    if (ex && ex->variant() == ONLY_NODE && it != mod.end())
+                    {
+                        set<string> allS;
+                        for (auto exI = ex->lhs(); exI; exI = exI->rhs())
+                        {
+                            if (exI->lhs()->variant() == RENAME_NODE)
+                            {
+                                if (exI->lhs()->lhs()->symbol())
+                                    allS.insert(exI->lhs()->lhs()->symbol()->identifier());
+                                if (exI->lhs()->rhs() && exI->lhs()->rhs()->symbol())
+                                    allS.insert(exI->lhs()->rhs()->symbol()->identifier());
+                            }
+                        }
+
+                        for (auto &parReg : regs)
+                        {
+                            const DataDirective &dataDir = parReg->GetDataDir();
+                            for (auto &rule : dataDir.distrRules)
+                            {
+                                DIST::Array *curr = rule.first;
+                                auto location = curr->GetLocation();
+                                if (location.first == 2 && location.second == modName)
+                                    needToAdd.insert(curr);
+                            }
+
+                            for (auto& rule : dataDir.alignRules)
+                            {
+                                DIST::Array* curr = rule.alignArray;
+                                auto location = curr->GetLocation();
+                                if (location.first == 2 && location.second == modName)
+                                    needToAdd.insert(curr);
+                            }
+                        }
+                    }
+                }
+            }
+
+            if (!hasTemplateUse && needToAdd.size())
+            {
+                SgStatement* useSt = new SgStatement(USE_STMT);
+                useSt->setSymbol(*findSymbolOrCreate(file, "dvmh_Template_Mod"));
+                useSt->setlineNumber(getNextNegativeLineNumber());
+
+                func->insertStmtAfter(*useSt, *func);
+            }
+        }
+    }
+}
+
+void fillUseStatement(SgStatement *st, set<string> &useMod,
+                      map<string, vector<pair<SgSymbol*, SgSymbol*>>> &modByUse,
+                      map<string, vector<pair<SgSymbol*, SgSymbol*>>> &modByUseOnly)
+{
+    if (st->variant() == USE_STMT)
+    {
+        SgExpression *ex = st->expr(0);
+        string modName = st->symbol()->identifier();
+        convertToLower(modName);
+        useMod.insert(modName);
+
+        if (ex)
+        {
+            SgExpression *start = ex;
+            bool only = false;
+            if (ex->variant() == ONLY_NODE)
+            {
+                start = ex->lhs();
+                only = true;
+            }
+
+            for (auto exI = start; exI; exI = exI->rhs())
+            {
+                if (exI->lhs()->variant() == RENAME_NODE)
+                {
+                    SgSymbol *left = NULL, *right = NULL;
+                    if (exI->lhs()->lhs()->symbol())
+                        left = exI->lhs()->lhs()->symbol();
+                    if (exI->lhs()->rhs() && exI->lhs()->rhs()->symbol())
+                        right = exI->lhs()->rhs()->symbol();
+                    if (only)
+                        modByUseOnly[modName].push_back(std::make_pair(left, right));
+                    else
+                        modByUse[modName].push_back(std::make_pair(left, right));
+                }
+            }
+        }
+    }
+}
+
+static void fillUseStmt(SgStatement* stat, map<string, set<SgSymbol*>>& byUse)
+{
+    if (stat->variant() != USE_STMT)
+        printInternalError(convertFileName(__FILE__).c_str(), __LINE__);
+
+    SgExpression* ex = stat->expr(0);
+    if (ex && ex->variant() == ONLY_NODE)
+    {
+        for (auto exI = ex->lhs(); exI; exI = exI->rhs())
+        {
+            if (exI->lhs()->variant() == RENAME_NODE)
+            {
+                SgExpression* ren = exI->lhs();
+                if (ren->lhs()->symbol() && ren->rhs() && ren->rhs()->symbol())
+                    byUse[ren->rhs()->symbol()->identifier()].insert(ren->lhs()->symbol());
+            }
+        }
+    }
+    else if (ex && ex->lhs())
+    {
+        for (auto exI = ex; exI; exI = exI->rhs())
+        {
+            if (exI->lhs()->variant() == RENAME_NODE)
+            {
+                SgExpression* ren = exI->lhs();
+                if (ren->lhs()->symbol() && ren->rhs() && ren->rhs()->symbol())
+                    byUse[ren->rhs()->symbol()->identifier()].insert(ren->lhs()->symbol());
+            }
+        }
+    }
+}
+
+map<string, set<SgSymbol*>> moduleRefsByUseInFunction(SgStatement* stIn)
+{
+    checkNull(stIn, convertFileName(__FILE__).c_str(), __LINE__);
+
+    map<string, set<SgSymbol*>> byUse;
+    int var = stIn->variant();
+    while (var != PROG_HEDR && var != PROC_HEDR && var != FUNC_HEDR)
+    {
+        stIn = stIn->controlParent();
+        if (stIn == NULL)
+            return byUse;
+        var = stIn->variant();
+    }
+
+    auto mapOfUses = createMapOfModuleUses(stIn->getFile());
+    set<string> useMods;
+
+    for (SgStatement* stat = stIn->lexNext(); !isSgExecutableStatement(stat); stat = stat->lexNext())
+    {
+        if (stat->variant() == USE_STMT)
+        {
+            fillUseStmt(stat, byUse);
+            useMods.insert(stat->symbol()->identifier());
+        }
+    }
+
+    const int cpOfSt = stIn->controlParent()->variant();
+    //contains of func
+    if (cpOfSt == PROG_HEDR || cpOfSt == PROC_HEDR || cpOfSt == FUNC_HEDR)
+    {
+        for (SgStatement* stat = stIn->controlParent()->lexNext(); !isSgExecutableStatement(stat); stat = stat->lexNext())
+        {
+            if (stat->variant() == USE_STMT)
+            {
+                fillUseStmt(stat, byUse);
+                useMods.insert(stat->symbol()->identifier());
+            }
+        }
+    }
+
+    bool chages = true;
+    while (chages)
+    {
+        chages = false;
+        set<string> newUseMods(useMods);
+        for (auto& elem : useMods)
+        {
+            auto it = mapOfUses.find(elem);
+            if (it != mapOfUses.end())
+            {
+                for (auto& elem2 : it->second)
+                {
+                    if (newUseMods.find(elem2) == newUseMods.end())
+                    {
+                        newUseMods.insert(elem2);
+                        chages = true;
+                    }
+                }
+            }
+        }
+        useMods = newUseMods;
+    }
+
+    vector<SgStatement*> modules;
+    findModulesInFile(stIn->getFile(), modules);
+    for (auto& mod : modules)
+    {
+        if (useMods.find(mod->symbol()->identifier()) != useMods.end())
+        {
+            for (SgStatement* stat = mod->lexNext(); stat != mod->lastNodeOfStmt(); stat = stat->lexNext())
+            {
+                const int var = stat->variant();
+                if (var == USE_STMT)
+                {
+                    fillUseStmt(stat, byUse);
+                    useMods.insert(stat->symbol()->identifier());
+                }
+                else if (var == PROC_HEDR || var == FUNC_HEDR)
+                    break;
+            }
+        }
+    }
+    return byUse;
+}
diff --git a/sapfor/experts/Sapfor_2017/_src/Utils/version.h b/sapfor/experts/Sapfor_2017/_src/Utils/version.h
index bbc2467..94a1911 100644
--- a/sapfor/experts/Sapfor_2017/_src/Utils/version.h
+++ b/sapfor/experts/Sapfor_2017/_src/Utils/version.h
@@ -1,3 +1,3 @@
 #pragma once
 
-#define VERSION_SPF "2388"
+#define VERSION_SPF "2389"
diff --git a/sapfor/experts/Sapfor_2017/_src/VerificationCode/CorrectVarDecl.cpp b/sapfor/experts/Sapfor_2017/_src/VerificationCode/CorrectVarDecl.cpp
index 69d2607..96625f0 100644
--- a/sapfor/experts/Sapfor_2017/_src/VerificationCode/CorrectVarDecl.cpp
+++ b/sapfor/experts/Sapfor_2017/_src/VerificationCode/CorrectVarDecl.cpp
@@ -60,128 +60,6 @@ void VarDeclCorrecter(SgFile *file)
     }
 }
 
-void fixUseOnlyStmt(SgFile *file, const vector<ParallelRegion*> &regs)
-{
-    for (int z = 0; z < file->numberOfFunctions(); ++z)
-    {
-        vector<SgStatement*> modules;
-        findModulesInFile(file, modules);
-        map<string, SgStatement*> mod;
-        for (auto &elem : modules)
-            mod[elem->symbol()->identifier()] = elem;
-
-        if (modules.size())
-        {
-            SgStatement *func = file->functions(z);
-            bool hasTemplateUse = false;
-            set<DIST::Array*> needToAdd;
-
-            for (auto st = func; st != func->lastNodeOfStmt(); st = st->lexNext())
-            {
-                if (isSgExecutableStatement(st))
-                    break;
-
-                if (st->variant() == USE_STMT)
-                {
-                    SgExpression *ex = st->expr(0);
-                    string modName = st->symbol()->identifier();
-
-                    auto it = mod.find(modName);
-                    if (modName == "dvmh_Template_Mod")
-                    {
-                        hasTemplateUse = true;
-                        break;
-                    }
-                        
-                    if (ex && ex->variant() == ONLY_NODE && it != mod.end())
-                    {
-                        set<string> allS;
-                        for (auto exI = ex->lhs(); exI; exI = exI->rhs())
-                        {
-                            if (exI->lhs()->variant() == RENAME_NODE)
-                            {
-                                if (exI->lhs()->lhs()->symbol())
-                                    allS.insert(exI->lhs()->lhs()->symbol()->identifier());
-                                if (exI->lhs()->rhs() && exI->lhs()->rhs()->symbol())
-                                    allS.insert(exI->lhs()->rhs()->symbol()->identifier());
-                            }
-                        }
-
-                        for (auto &parReg : regs)
-                        {
-                            const DataDirective &dataDir = parReg->GetDataDir();
-                            for (auto &rule : dataDir.distrRules)
-                            {
-                                DIST::Array *curr = rule.first;
-                                auto location = curr->GetLocation();
-                                if (location.first == 2 && location.second == modName)
-                                    needToAdd.insert(curr);
-                            }
-
-                            for (auto& rule : dataDir.alignRules)
-                            {
-                                DIST::Array* curr = rule.alignArray;
-                                auto location = curr->GetLocation();
-                                if (location.first == 2 && location.second == modName)
-                                    needToAdd.insert(curr);
-                            }
-                        }
-                    }
-                }
-            }
-
-            if (!hasTemplateUse && needToAdd.size())
-            {
-                SgStatement* useSt = new SgStatement(USE_STMT);
-                useSt->setSymbol(*findSymbolOrCreate(file, "dvmh_Template_Mod"));
-                useSt->setlineNumber(getNextNegativeLineNumber());
-
-                func->insertStmtAfter(*useSt, *func);
-            }
-        }
-    }
-}
-
-void fillUseStatement(SgStatement *st, set<string> &useMod,
-                      map<string, vector<pair<SgSymbol*, SgSymbol*>>> &modByUse,
-                      map<string, vector<pair<SgSymbol*, SgSymbol*>>> &modByUseOnly)
-{
-    if (st->variant() == USE_STMT)
-    {
-        SgExpression *ex = st->expr(0);
-        string modName = st->symbol()->identifier();
-        convertToLower(modName);
-        useMod.insert(modName);
-
-        if (ex)
-        {
-            SgExpression *start = ex;
-            bool only = false;
-            if (ex->variant() == ONLY_NODE)
-            {
-                start = ex->lhs();
-                only = true;
-            }
-
-            for (auto exI = start; exI; exI = exI->rhs())
-            {
-                if (exI->lhs()->variant() == RENAME_NODE)
-                {
-                    SgSymbol *left = NULL, *right = NULL;
-                    if (exI->lhs()->lhs()->symbol())
-                        left = exI->lhs()->lhs()->symbol();
-                    if (exI->lhs()->rhs() && exI->lhs()->rhs()->symbol())
-                        right = exI->lhs()->rhs()->symbol();
-                    if (only)
-                        modByUseOnly[modName].push_back(std::make_pair(left, right));
-                    else
-                        modByUse[modName].push_back(std::make_pair(left, right));
-                }
-            }
-        }
-    }
-}
-
 struct ModuleInfo
 {
     set<string> useMod;
diff --git a/sapfor/experts/Sapfor_2017/_src/VerificationCode/verifications.h b/sapfor/experts/Sapfor_2017/_src/VerificationCode/verifications.h
index 65a6fc5..ae62d4f 100644
--- a/sapfor/experts/Sapfor_2017/_src/VerificationCode/verifications.h
+++ b/sapfor/experts/Sapfor_2017/_src/VerificationCode/verifications.h
@@ -38,7 +38,6 @@ void resolveFunctionCalls(SgFile* file, const std::set<std::string>& toResolve,
 bool checkAndMoveFormatOperators(SgFile* file, std::vector<Messages> &currMessage, bool withError = true);
 
 int VerifyFile(SgFile *file);
-void fixUseOnlyStmt(SgFile *file, const std::vector<ParallelRegion*> &regs);
 void correctModuleProcNames(SgFile *file, const std::set<std::string>& globalF);
 void correctModuleSymbols(SgFile *file);
 void replaceStructuresToSimpleTypes(SgFile* file);
@@ -48,7 +47,6 @@ bool checkArgumentsDeclaration(SgProject *project, const std::map<std::string, s
 void replaceDerivedAssigns(SgFile *file, SgStatement *stToCopy, SgStatement *insertB, const std::map<std::string, SgStatement*> &derivedTypesDecl);
 bool isDerivedAssign(SgStatement *st);
 std::map<std::string, SgStatement*> createDerivedTypeDeclMap(SgStatement *forS);
-void fillUseStatement(SgStatement* st, std::set<std::string>& useMod, std::map<std::string, std::vector<std::pair<SgSymbol*, SgSymbol*>>>& modByUse, std::map<std::string, std::vector<std::pair<SgSymbol*, SgSymbol*>>>& modByUseOnly);
 void removeExecutableFromModuleDeclaration(SgFile* current, const std::set<std::string>& filesInProj, std::vector<SgStatement*>& hiddenData);
 bool needToReplaceInterfaceName(SgStatement* interf);