Files
SAPFOR/dvm/fdvm/trunk/fdvm/acc_across.cpp

5996 lines
268 KiB
C++
Raw Blame History

#include "dvm.h"
#include "aks_structs.h"
#include "acc_data.h"
using namespace std;
// all flags
#define LongT C_DvmType()
#define debugMode 0
#define kerneloff 0
// extern variables
extern reduction_operation_list *red_struct_list;
extern symb_list *shared_list, *acc_func_list;
extern symb_list *RGname_list;
extern symb_list *acc_call_list;
extern vector<SgSymbol *> loopVars;
// extern functions
extern SgStatement *Create_C_Function(SgSymbol*);
extern SgExpression *RedPost(SgSymbol*, SgSymbol*, SgSymbol*, SgSymbol*);
extern SgSymbol *GridSymbolForRedInAdapter(SgSymbol *, SgStatement *);
extern SgSymbol *GpuHeaderSymbolInAdapter(SgSymbol *, SgStatement *);
extern SgSymbol *GpuBaseSymbolInAdapter(SgSymbol *, SgStatement *);
extern SgExpression *CudaReplicate(SgSymbol *, SgSymbol *, SgSymbol *, SgSymbol *);
extern SgStatement *IncludeLine(char*);
extern void optimizeLoopBodyForOne(vector<newInfo> &allNewInfo);
extern void searchIdxs(vector<acrossInfo> &allInfo, SgExpression *st);
extern int warpSize;
// local functions
vector<ArgsForKernel> Create_C_Adapter_Function_Across_variants(SgSymbol*, SgSymbol*, const int, const int, const int, const vector<SageSymbols>&, const vector<SageSymbols>&);
vector<ArgsForKernel> Create_C_Adapter_Function_Across_OneThread(SgSymbol*, SgSymbol*, const int, const int);
symb_list* AddToSymbList(symb_list*, SgSymbol*);
symb_list* AddNewToSymbList(symb_list*, SgSymbol*);
void CreateReductionBlocksAcross(SgStatement*, int, SgExpression*, SgSymbol*);
//void CompleteStructuresForReductionInKernelAcross(void);
void DeclarationOfReductionBlockInKernelAcross(SgExpression *ered, reduction_operation_list *rsl);
void DeclarationCreateReductionBlocksAcross(int, SgExpression*);
AnalyzeReturnGpuO1 analyzeLoopBody(int type);
// local static variables
static SgSymbol *red_first = NULL;
static bool createBodyKernel = false;
static bool createConvert_XY = true;
static const int numLoopVars = 16;
static bool ifReadLvlMode = false;
static vector <stack <SgStatement*> > copyOfBody;
static vector<SgSymbol*> allRegNames;
static vector<ParamsForAllVariants> allVariants;
static const char *funcDvmhConvXYfortVer = " attributes(device) subroutine dvmh_convert_XY_int(x,y,Rx,Ry,slash,idx)\n implicit none\n integer ,value:: x\n integer ,value:: y\n integer ,value:: Rx\n integer ,value:: Ry\n integer ,value:: slash\n integer ,device:: idx \n \n if(slash .eq. 0) then\n if(Rx .eq. Ry) then\n if(x + y .lt. Rx) then\n idx = y + (1+x+y)*(x+y)/2\n else\n idx = Rx*(Rx-1)+x-(2*Rx-x-y-1)*(2*Rx-x-y-2)/2\n endif \n elseif(Rx .lt. Ry) then\n if(x + y .lt. Rx) then\n idx = y + ((1+x+y)*(x+y)) / 2\n elseif(x + y .lt. Ry) then\n idx = ((1+Rx)*Rx) / 2 + Rx - x - 1 + Rx * (x+y-Rx)\n else\n idx = Rx*Ry-Ry+y-(((Rx+Ry-y-x-1)*(Rx+Ry-y-x-2))/2)\n endif\n else\n if(x + y .lt. Ry) then\n idx = x + (1+x+y)*(x+y) / 2\n elseif(x + y .lt. Rx) then\n idx = (1+Ry)*Ry/2 + (Ry-y-1) + Ry * (x+y-Ry)\n else\n idx = Rx*Ry-Rx+x-((Rx+Ry-y-x-1)*(Rx+Ry-y-x-2)/2)\n endif\n endif\n else\n if(Rx .eq. Ry) then\n if(x + Rx-1-y .lt. Rx) then\n idx = Rx-1-y + (x+Rx-y)*(x+Rx-1-y)/2\n else\n idx = Rx*(Rx-1) + x - (Rx-x+y)*(Rx-x+y-1)/2\n endif\n elseif(Rx .lt. Ry) then\n if(x + Ry-1-y .lt. Rx) then \n idx = Ry-1-y + ((x+Ry-y)*(x+Ry-1-y)) / 2\n elseif(x + Ry-1-y .lt. Ry) then\n idx = ((1+Rx)*Rx)/2+Rx-x-1+Rx*(x+Ry-1-y-Rx)\n else\n idx = Rx*Ry-1-y-(((Rx+y-x)*(Rx+y-x-1))/2)\n endif\n else\n if(x + Ry-1-y .lt. Ry) then\n idx = x + (1+x+Ry-1-y)*(x+Ry-1-y)/2\n elseif(x + Ry-1-y .lt. Rx) then\n idx = (1+Ry)*Ry/2 + y + Ry * (x-y-1)\n else\n idx = Rx*Ry-Rx+x-((Rx+y-x)*(Rx+y-x-1)/2)\n endif\n endif\n endif\n end subroutine\n";
static const char *funcDvmhConvXYfortVerLong = " attributes(device) subroutine dvmh_convert_XY_llong(x,y,Rx,Ry,slash,idx)\n implicit none\n integer*8 ,value:: x\n integer*8 ,value:: y\n integer*8 ,value:: Rx\n integer*8 ,value:: Ry\n integer*8 ,value:: slash\n integer*8 ,device:: idx \n \n if(slash .eq. 0) then\n if(Rx .eq. Ry) then\n if(x + y .lt. Rx) then\n idx = y + (1+x+y)*(x+y)/2\n else\n idx = Rx*(Rx-1)+x-(2*Rx-x-y-1)*(2*Rx-x-y-2)/2\n endif \n elseif(Rx .lt. Ry) then\n if(x + y .lt. Rx) then\n idx = y + ((1+x+y)*(x+y)) / 2\n elseif(x + y .lt. Ry) then\n idx = ((1+Rx)*Rx) / 2 + Rx - x - 1 + Rx * (x+y-Rx)\n else\n idx = Rx*Ry-Ry+y-(((Rx+Ry-y-x-1)*(Rx+Ry-y-x-2))/2)\n endif\n else\n if(x + y .lt. Ry) then\n idx = x + (1+x+y)*(x+y) / 2\n elseif(x + y .lt. Rx) then\n idx = (1+Ry)*Ry/2 + (Ry-y-1) + Ry * (x+y-Ry)\n else\n idx = Rx*Ry-Rx+x-((Rx+Ry-y-x-1)*(Rx+Ry-y-x-2)/2)\n endif\n endif\n else\n if(Rx .eq. Ry) then\n if(x + Rx-1-y .lt. Rx) then\n idx = Rx-1-y + (x+Rx-y)*(x+Rx-1-y)/2\n else\n idx = Rx*(Rx-1) + x - (Rx-x+y)*(Rx-x+y-1)/2\n endif\n elseif(Rx .lt. Ry) then\n if(x + Ry-1-y .lt. Rx) then \n idx = Ry-1-y + ((x+Ry-y)*(x+Ry-1-y)) / 2\n elseif(x + Ry-1-y .lt. Ry) then\n idx = ((1+Rx)*Rx)/2+Rx-x-1+Rx*(x+Ry-1-y-Rx)\n else\n idx = Rx*Ry-1-y-(((Rx+y-x)*(Rx+y-x-1))/2)\n endif\n else\n if(x + Ry-1-y .lt. Ry) then\n idx = x + (1+x+Ry-1-y)*(x+Ry-1-y)/2\n elseif(x + Ry-1-y .lt. Rx) then\n idx = (1+Ry)*Ry/2 + y + Ry * (x-y-1)\n else\n idx = Rx*Ry-Rx+x-((Rx+y-x)*(Rx+y-x-1)/2)\n endif\n endif\n endif\n end subroutine\n" ;
static const char* fermiPreprocDir = "CUDA_FERMI_ARCH";
// local variables
SgStatement *kernelScope, *block;
void InitializeAcrossACC()
{
red_first = NULL;
createBodyKernel = false;
createConvert_XY = true;
ifReadLvlMode = false;
copyOfBody.clear();
allRegNames.clear();
allVariants.clear();
}
static inline int pow(int n)
{
int tmp = 1;
tmp = tmp << n;
return tmp;
}
static void setDvmDebugLvl()
{
char *s = getenv("DVMH_LOGLEVEL");
if (!ifReadLvlMode && s != NULL)
{
sscanf(s, "%d", &DVM_DEBUG_LVL);
ifReadLvlMode = true;
}
}
static inline void mywarn(const char *str)
{
#if debugMode
printf("%s\n", str);
#endif
}
static char *getLoopLine(const char *sadapter)
{
char *newLine = new char[strlen(sadapter) + 16];
newLine[0] = '\0';
strcat(newLine, "loop on line ");
int k = (int)strlen(newLine);
int i = (int)strlen(sadapter) - 1 - 6;
for (; sadapter[i] != '_'; i--);
for (i++; sadapter[i] != '_'; i++, k++)
{
newLine[k] = sadapter[i];
}
newLine[k] = '\\';
newLine[k + 1] = 'n';
newLine[k + 2] = '\0';
return newLine;
}
// generating function call (specially for across):
//loop_cuda_register_red(DvmhLoopRef *InDvmhLoop, DvmType InRedNum, void **ArrayPtr, void **LocPtr)
static SgExpression *RegisterReduction_forAcross(SgSymbol *s_loop_ref, SgSymbol *s_var_num, SgSymbol *s_red, SgSymbol *s_loc)
{
SgExpression *eloc;
SgFunctionCallExp *fe = new SgFunctionCallExp(*fdvm[RED_CUDA]);
fe->addArg(*new SgVarRefExp(s_loop_ref));
fe->addArg(*new SgVarRefExp(s_var_num));
fe->addArg(*new SgCastExp(*C_PointerType(C_PointerType(SgTypeVoid())), SgAddrOp(*new SgVarRefExp(*s_red))));
if (s_loc != NULL)
eloc = &(SgAddrOp(*new SgVarRefExp(*s_loc)));
else
eloc = new SgValueExp(0);
fe->addArg(*eloc);
return fe;
}
SgExpression *CreateBlocksThreadsSpec(SgSymbol *s_shared, SgSymbol *s_blocks, SgSymbol *s_threads, SgSymbol *s_stream)
{
SgExprListExp *el, *ell, *elm;
SgExpression *mult;
el = new SgExprListExp(*new SgVarRefExp(s_blocks));
ell = new SgExprListExp(*new SgVarRefExp(s_threads));
el->setRhs(ell);
mult = new SgVarRefExp(s_shared);
elm = new SgExprListExp(*mult);
ell->setRhs(elm);
ell = new SgExprListExp(*new SgVarRefExp(s_stream));
elm->setRhs(ell);
return ((SgExpression *)el);
}
SgExpression* CreateBlocksThreadsSpec(int size, SgSymbol *s_blocks, SgSymbol *s_threads)
{
SgExprListExp *el, *ell, *elm;
SgExpression *mult;
el = new SgExprListExp(*new SgVarRefExp(s_blocks));
ell = new SgExprListExp(*new SgVarRefExp(s_threads));
el->setRhs(ell);
//size==0 - parallel loop without reduction clause
mult = size ? &((*ThreadsGridSize(s_threads)) * (*new SgValueExp(size))) : new SgValueExp(size);
elm = new SgExprListExp(*mult);
ell->setRhs(elm);
return((SgExpression *)el);
}
SgExpression* CreateBlocksThreadsSpec(SgSymbol *s_blocks, SgSymbol *s_threads)
{
SgExprListExp *el, *ell;
el = new SgExprListExp(*new SgVarRefExp(s_blocks));
ell = new SgExprListExp(*new SgVarRefExp(s_threads));
el->setRhs(ell);
return((SgExpression *)el);
}
static void getDefaultCudaBlock(int &x, int &y, int &z, int loopDep, int loopIndep)
{
if (options.isOn(AUTO_TFM))
{
if (loopDep == 0)
{
if (loopIndep == 1) { x = 256; y = 1; z = 1; }
else if (loopIndep == 2) { x = 32; y = 14; z = 1; }
else { x = 32; y = 7; z = 2; }
}
else if (loopDep == 1)
{
if (loopIndep == 0) { x = 1; y = 1; z = 1; }
else if (loopIndep == 1) { x = 256; y = 1; z = 1; }
else if (loopIndep == 2) { x = 32; y = 5; z = 1; }
else { x = 16; y = 8; z = 2; }
}
else if (loopDep == 2)
{
if (loopIndep == 0) { x = 32; y = 1; z = 1; }
else if (loopIndep == 1) { x = 32; y = 4; z = 1; }
else { x = 16; y = 8; z = 2; }
}
else if (loopDep >= 3)
{
if (loopIndep == 0) { x = 32; y = 5; z = 1; }
else { x = 32; y = 5; z = 2; }
}
}
else
{
if (loopDep == 0)
{
if (loopIndep == 1) { x = 256; y = 1; z = 1; }
else if (loopIndep == 2) { x = 32; y = 14; z = 1; }
else { x = 32; y = 7; z = 2; }
}
else if (loopDep == 1)
{
if (loopIndep == 0) { x = 1; y = 1; z = 1; }
else if (loopIndep == 1) { x = 256; y = 1; z = 1; }
else if (loopIndep == 2) { x = 32; y = 8; z = 1; }
else { x = 16; y = 8; z = 2; }
}
else if (loopDep == 2)
{
if (loopIndep == 0) { x = 32; y = 1; z = 1; }
else if (loopIndep == 1) { x = 32; y = 4; z = 1; }
else { x = 16; y = 8; z = 2; }
}
else if (loopDep >= 3)
{
if (loopIndep == 0) { x = 8; y = 4; z = 1; }
else { x = 8; y = 4; z = 2; }
}
}
}
static const char *getKeyWordType(SgType *inType)
{
const char *ret = NULL;
if (inType->baseType()->variant() == SgTypeFloat()->variant())
ret = "float";
else if (inType->baseType()->variant() == SgTypeDouble()->variant())
ret = "double";
else if (inType->baseType()->variant() == SgTypeInt()->variant())
ret = "int";
else if (inType->baseType()->variant() == SgTypeBool()->variant())
ret = "bool";
else if (inType->baseType()->variant() == SgTypeChar()->variant())
ret = "char";
else if (inType->baseType()->variant() == SgTypeVoid()->variant())
ret = "void";
return ret;
}
static int getSizeOf()
{
int ret = 1;
for (SgExpression *er = red_list; er; er = er->rhs())
{
SgExpression *red_expr_ref = er->lhs()->rhs(); // reduction variable reference
SgType *inType = red_expr_ref->type();
SgExpression* len = inType->length();
if (len && len->isInteger())
{
ret = MAX(ret, len->valueInteger());
continue;
}
SgExpression* kind = inType->selector();
if (kind && kind->lhs())
{
SgExpression *kvalue = Calculate(kind->lhs());
if (kvalue->isInteger())
{
ret = MAX(ret, kvalue->valueInteger());
continue;
}
}
if (inType->variant() == SgTypeFloat()->variant())
ret = MAX(ret, sizeof(float));
else if (inType->variant() == SgTypeDouble()->variant())
ret = MAX(ret, sizeof(double));
else if (inType->variant() == SgTypeInt()->variant())
ret = MAX(ret, sizeof(int));
else if (inType->variant() == SgTypeBool()->variant())
ret = MAX(ret, sizeof(bool));
else if (inType->variant() == SgTypeChar()->variant())
ret = MAX(ret, sizeof(char));
}
return ret;
}
static SgStatement *CreateKernelProcedureDevice(SgSymbol *skernel)
{
SgStatement *st, *st_end;
SgExpression *e;
st = new SgStatement(PROC_HEDR);
st->setSymbol(*skernel);
e = new SgExpression(ACC_ATTRIBUTES_OP, new SgExpression(ACC_DEVICE_OP), NULL, NULL);
//e ->setRhs(new SgExpression(ACC_GLOBAL_OP));
st->setExpression(2, *e);
st_end = new SgStatement(CONTROL_END);
st_end->setSymbol(*skernel);
cur_in_mod->insertStmtAfter(*st, *mod_gpu);
st->insertStmtAfter(*st_end, *st);
st->setVariant(PROS_HEDR);
cur_in_mod = st_end;
return st;
}
static SgStatement* AssignStatement(SgExpression &lhs, SgExpression &rhs)
{
SgStatement *st;
if (options.isOn(C_CUDA))
st = new SgCExpStmt(SgAssignOp(lhs, rhs));
else
st = new SgAssignStmt(lhs, rhs);
return st;
}
static char* createName(const char* oldName, const char* variant)
{
char* correctName = new char[strlen(oldName) + strlen(variant) + 1];
correctName[0] = '\0';
strcat(correctName, oldName);
strcat(correctName, variant);
return correctName;
}
static SgSymbol *createVariantOfSAdapter(SgSymbol *sadapter, const char *variant)
{
SgSymbol *s_adapter;
const char *oldName = sadapter->identifier();
s_adapter = new SgSymbol(FUNCTION_NAME, createName(oldName, variant), *C_VoidType(), *block_C);
return s_adapter;
}
static SgSymbol *createVariantOfKernelSymbol(SgSymbol *kernel_symb, const char *variant)
{
SgSymbol *sk;
char *oldName = kernel_symb->identifier();
sk = new SgSymbol(PROCEDURE_NAME, createName(oldName, variant), *mod_gpu);
if (options.isOn(C_CUDA))
sk->setType(C_VoidType());
return sk;
}
static void createNewAdapter(SgSymbol *sadapter, ParamsForAllVariants &newVar, char *str)
{
SgSymbol *s_adapter;
char *nameOfNewSAdapter;
nameOfNewSAdapter = new char[strlen(sadapter->identifier()) + strlen(str) + 1];
nameOfNewSAdapter[0] = '\0';
strcat(nameOfNewSAdapter, sadapter->identifier());
s_adapter = createVariantOfSAdapter(sadapter, str);
strcat(nameOfNewSAdapter, str);
newVar.nameOfNewSAdapter = nameOfNewSAdapter;
newVar.s_adapter = s_adapter;
}
static void createNewKernel(SgSymbol *kernel_symb, ParamsForAllVariants &newVar, char *str)
{
SgSymbol *s_ks;
char *nameOfNewSK;
nameOfNewSK = new char[strlen(kernel_symb->identifier()) + strlen(str) + 1];
nameOfNewSK[0] = '\0';
strcat(nameOfNewSK, kernel_symb->identifier());
s_ks = createVariantOfKernelSymbol(kernel_symb, str);
strcat(nameOfNewSK, str);
newVar.nameOfNewKernelSymb = nameOfNewSK;
newVar.s_kernel_symb = s_ks;
}
static int countBit(int num)
{
int ret = 0;
while (num != 0)
{
if ((num & 1) == 1)
ret++;
num = num >> 1;
}
return ret;
}
static void generateAllBitmasks(int dep, int all, vector<int> &out)
{
if (dep == all)
out.push_back(pow(all) - 1);
else
{
int maxVar = pow(all);
for (int i = 1; i < maxVar; ++i)
{
if (countBit(i) == dep)
out.push_back(i);
}
}
}
static void GetAllCombinations2(vector<ParamsForAllVariants> &allVariants, SgSymbol *sadapter, SgSymbol *kernel_symb, int numAcr,
const vector<SageSymbols>& allSymb)
{
const unsigned sizeOfAllSymb = allSymb.size();
char *tmpstrAdapter = new char[16];
char *tmpstrKernel = new char[16];
tmpstrAdapter[0] = '\0';
tmpstrKernel[0] = '\0';
ParamsForAllVariants newVar;
newVar.allDims = sizeOfAllSymb;
newVar.loopSymb.resize(numLoopVars);
newVar.loopAcrossSymb.resize(numLoopVars);
newVar.nameOfNewSAdapter = NULL;
newVar.s_adapter = NULL;
newVar.acrossV = numAcr;
newVar.loopV = newVar.allDims - newVar.acrossV;
newVar.type = (1 << numAcr) - 1;
sprintf(tmpstrAdapter, "%d", newVar.type);
strcat(tmpstrAdapter, "_case");
sprintf(tmpstrKernel, "_%d", newVar.type);
strcat(tmpstrKernel, "_case");
createNewAdapter(sadapter, newVar, tmpstrAdapter);
createNewKernel(kernel_symb, newVar, tmpstrKernel);
int k = 0;
for (int r = 0; r < sizeOfAllSymb; ++r)
{
if (r < numAcr)
{
newVar.loopAcrossSymb[r].across_left = newVar.loopAcrossSymb[r].across_right = 0;
newVar.loopAcrossSymb[r].symb = allSymb[sizeOfAllSymb - r - 1].symb;
newVar.loopAcrossSymb[r].len = sizeOfAllSymb - r - 1;
}
else
{
newVar.loopSymb[k].across_left = newVar.loopSymb[k].across_right = 0;
newVar.loopSymb[k].symb = allSymb[sizeOfAllSymb - r - 1].symb;
newVar.loopSymb[k].len = sizeOfAllSymb - r - 1;
k++;
}
}
allVariants.push_back(newVar);
}
static void GetAllVariants2(vector<ParamsForAllVariants> &allVariants, SgSymbol *sadapter, SgSymbol *kernel_symb)
{
int acrossV = 0;
SageAcrossInfo Info = GetLoopsWithParAndAcrDir();
vector<SageSymbols> allSymb = GetSymbInParalell(dvm_parallel_dir->expr(2));
const int allDims = allSymb.size();
for (int z = 0; z < Info.idxs.size() && (acrossV < allDims); ++z)
{
SageArrayIdxs& idxInfo = Info.idxs[z];
for (int i = 0; i < idxInfo.dim && (acrossV < allDims); ++i)
if (idxInfo.symb[i].across_left != 0 || idxInfo.symb[i].across_right != 0)
acrossV++;
}
// correct dependencies lvl only for ACROSS with one dep
SgStatement *st = loop_body;
SgExpression* dvmDir = dvm_parallel_dir->expr(1);
vector<acrossInfo> allInfo;
bool nextStep = true;
loopVars.clear();
while (dvmDir)
{
SgExpression *t = dvmDir->lhs();
if (t->variant() == ACROSS_OP)
{
vector<SgExpression*> toAnalyze;
SgExpression* list = t->lhs();
while (list)
{
if (list->lhs()->variant() == ARRAY_REF)
toAnalyze.push_back(list->lhs());
else if (list->lhs()->variant() == ARRAY_OP)
{
if (list->lhs()->lhs()->variant() == ARRAY_REF)
toAnalyze.push_back(list->lhs()->lhs());
}
list = list->rhs();
}
for (int i = 0; i < toAnalyze.size(); ++i)
{
SgExpression* array = toAnalyze[i];
acrossInfo tmpI;
tmpI.nameOfArray = array->symbol()->identifier();
tmpI.symbol = array->symbol();
tmpI.allDim = 0;
tmpI.widthL = 0;
tmpI.widthR = 0;
tmpI.acrossPos = 0;
tmpI.acrossNum = 0;
SgExpression* tt = array->lhs();
int position = 0;
while (tt)
{
bool here = true;
if (tt->lhs()->lhs()->valueInteger() != 0)
{
tmpI.acrossPos = position;
tmpI.acrossNum++;
tmpI.widthL = (-1) * tt->lhs()->lhs()->valueInteger();
here = false;
}
if (tt->lhs()->rhs()->valueInteger() != 0)
{
tmpI.acrossPos = position;
if (here)
tmpI.acrossNum++;
tmpI.widthR = tt->lhs()->rhs()->valueInteger();
}
position++;
tt = tt->rhs();
}
for (int i = 0; i < position; ++i)
{
tmpI.dims.push_back(0);
tmpI.symbs.push_back(NULL);
}
allInfo.push_back(tmpI);
}
break;
}
dvmDir = dvmDir->rhs();
}
for (size_t i = 0; i < allInfo.size(); ++i)
{
if (allInfo[i].acrossNum > 1)
{
nextStep = false;
break;
}
}
if (nextStep)
{
SgExpression* dvmDir = dvm_parallel_dir->expr(2);
while (dvmDir)
{
loopVars.push_back(dvmDir->lhs()->symbol());
dvmDir = dvmDir->rhs();
}
while (st)
{
for (int i = 0; i < 3; ++i)
if (st->expr(i))
searchIdxs(allInfo, st->expr(i));
st = st->lexNext();
}
for (size_t i = 0; i < allInfo.size(); ++i)
{
if (allInfo[i].symbs[allInfo[i].acrossPos] == NULL)
{
nextStep = false;
break;
}
}
if (nextStep)
{
vector<char*> uniqSymbs;
uniqSymbs.push_back(allInfo[0].symbs[allInfo[0].acrossPos]->identifier() );
for (size_t i = 1; i < allInfo.size(); ++i)
{
bool uniq = true;
char *cmpd = allInfo[i].symbs[allInfo[i].acrossPos]->identifier();
for (size_t k = 0; k < uniqSymbs.size(); ++k)
{
if (strcmp(uniqSymbs[k], cmpd) == 0)
{
uniq = false;
break;
}
}
if (uniq)
{
uniqSymbs.push_back(cmpd);
}
}
acrossV = MIN((int)uniqSymbs.size(), allDims);
}
}
for (int i = 1; i <= acrossV; ++i)
GetAllCombinations2(allVariants, sadapter, kernel_symb, i, allSymb);
}
/*void printAllVars(vector<ParamsForAllVariants> &vectorT)
{
for (size_t i = 0; i < vectorT.size(); ++i)
{
printf("acrossV = %d loopV = %d alldims = %d\n", vectorT[i].acrossV, vectorT[i].loopV, vectorT[i].allDims);
printf("nameOfKernel = %s nameOfAdapt = %s \n", vectorT[i].nameOfNewKernelSymb, vectorT[i].nameOfNewSAdapter);
for (int k = 0; k < vectorT[i].loopV; ++k)
{
printf("%s, L = %d, R = %d, len= %d\n", vectorT[i].loopSymb[k]->symb->identifier(), vectorT[i].loopSymb[k]->across_left, vectorT[i].loopSymb[k]->across_right, vectorT[i].loopSymb[k]->len);
}
for (int k = 0; k < vectorT[i].acrossV; ++k)
{
printf("%s, L = %d, R = %d, len= %d\n", vectorT[i].loopAcrossSymb[k]->symb->identifier(), vectorT[i].loopAcrossSymb[k]->across_left, vectorT[i].loopAcrossSymb[k]->across_right, vectorT[i].loopAcrossSymb[k]->len);
}
printf("\n");
}
printf("\n");
}*/
ArgsForKernel *Create_C_Adapter_Function_Across(SgSymbol *sadapter)
{
createBodyKernel = true;
// clear information
allRegNames.clear();
SgStatement *st_hedr, *st_end, *first_exec, *stmt;
vector<SgStatement*> cuda_kernel;
SgExpression *fe, *ae, *el, *arg_list;
SgType *typ;
SgSymbol *s_loop_ref, *sarg, *s, *current_symbol;
symb_list *sl;
vector<SgSymbol *> argsForVariantFunction;
setDvmDebugLvl();
mywarn("start: getAllVars");
allVariants.clear();
GetAllVariants2(allVariants, sadapter, kernel_symb);
mywarn(" end: getAllVars");
cuda_kernel.resize(countKernels);
current_symbol = SymbMapping(current_file->filept->cur_symb); //CUR_FILE_CUR_SYMB();
if (options.isOn(ONE_THREAD))
{
const vector<SageSymbols> tmpStr = GetSymbInParalell(dvm_parallel_dir->expr(2));
int num = tmpStr.size();
vector<ArgsForKernel> retValueForKernel = Create_C_Adapter_Function_Across_OneThread(sadapter, kernel_symb, num, 0);
for (unsigned t = 0; t < countKernels; ++t)
{
loop_body = CopyOfBody.top();
CopyOfBody.pop();
currentLoop = new Loop(loop_body, options.isOn(OPT_EXP_COMP));
SgType *typeParams = indexTypeInKernel(rtTypes[t]);
for (int i = 0; i < num; ++i)
{
char *str = new char[64];
char *addL = new char[64];
str[0] = addL[0] = '\0';
retValueForKernel[t].otherVarsForOneTh.push_back(tmpStr[i].symb);
strcat(str, tmpStr[i].symb->identifier());
strcat(str, "_");
strcat(addL, str);
strcat(addL, "low");
retValueForKernel[t].otherVars.push_back(new SgSymbol(VARIABLE_NAME, addL, typeParams, kernel_symb->scope()));
addL[0] = '\0';
strcat(addL, str);
strcat(addL, "high");
retValueForKernel[t].otherVars.push_back(new SgSymbol(VARIABLE_NAME, addL, typeParams, kernel_symb->scope()));
addL[0] = '\0';
strcat(addL, str);
strcat(addL, "idx");
retValueForKernel[t].otherVars.push_back(new SgSymbol(VARIABLE_NAME, addL, typeParams, kernel_symb->scope()));
}
string kernel_symbNew = kernel_symb->identifier();
if (rtTypes[t] == rt_INT)
kernel_symbNew += "_int";
else if (rtTypes[t] == rt_LONG)
kernel_symbNew += "_long";
else if (rtTypes[t] == rt_LLONG)
kernel_symbNew += "_llong";
cuda_kernel[t] = CreateLoopKernelAcross(new SgSymbol(FUNCTION_NAME, kernel_symbNew.c_str(), *C_VoidType(), *block_C), &retValueForKernel[t], indexTypeInKernel(rtTypes[t]));
if (options.isOn(RTC))
{
acc_call_list = ACC_RTC_ExpandCallList(acc_call_list);
if (options.isOn(C_CUDA))
ACC_RTC_ConvertCudaKernel(cuda_kernel[t], kernel_symbNew.c_str());
else
ACC_RTC_AddCalledProcedureComment(kernel_symb);
RTC_FKernelArgs.push_back((SgFunctionCallExp*)cuda_kernel[t]->expr(0));
}
delete currentLoop;
currentLoop = NULL;
}
if (options.isOn(RTC))
ACC_RTC_CompleteAllParams();
}
else
{
mywarn("start: create all VARIANTS");
// if only type ~ 1 across symb
bool ifOne = true;
for (size_t i = 0; i < allVariants.size(); ++i)
{
if (allVariants[i].acrossV != 1)
ifOne = false;
}
// set global if true
if (ifOne)
dontGenConvertXY = true;
else
dontGenConvertXY = false;
for (size_t i = 0; i < allVariants.size(); ++i)
{
#if debugMode
printf("%d case\n", allVariants[i].type);
#endif
ParamsForAllVariants tmp = allVariants[i];
vector<ArgsForKernel> retValueForKernel;
for (unsigned k = 0; k < countKernels; ++k)
{
loop_body = CopyOfBody.top();
CopyOfBody.pop();
// temporary check for ON mapping
const bool contitionOfOptimization = options.isOn(AUTO_TFM);
if (contitionOfOptimization)
currentLoop = new Loop(loop_body, true);
string kernel_symb = tmp.s_kernel_symb->identifier();
if (rtTypes[k] == rt_INT)
kernel_symb += "_int";
else if (rtTypes[k] == rt_LONG)
kernel_symb += "_long";
else if (rtTypes[k] == rt_LLONG)
kernel_symb += "_llong";
if (tmp.acrossV == 1 && tmp.type == 1)
{
if (k == 0) // create CUDA handler once
retValueForKernel = Create_C_Adapter_Function_Across_variants(tmp.s_adapter, tmp.s_kernel_symb, tmp.loopV, tmp.acrossV, tmp.allDims, tmp.loopSymb, tmp.loopAcrossSymb);
cuda_kernel[k] = CreateLoopKernelAcross(new SgSymbol(FUNCTION_NAME, kernel_symb.c_str(), *C_VoidType(), *block_C), &retValueForKernel[k], tmp.acrossV, indexTypeInKernel(rtTypes[k]));
if (options.isOn(RTC))
acc_call_list = ACC_RTC_ExpandCallList(acc_call_list);
}
else if (tmp.acrossV != 1 && (tmp.type == 3 || tmp.type == 7 || tmp.type > 14))
{
// optimized loop body
if (options.isOn(GPU_O1))
analyzeLoopBody(ACROSS_TYPE);
if (k == 0) // create CUDA handler once
retValueForKernel = Create_C_Adapter_Function_Across_variants(tmp.s_adapter, tmp.s_kernel_symb, tmp.loopV, tmp.acrossV, tmp.allDims, tmp.loopSymb, tmp.loopAcrossSymb);
cuda_kernel[k] = CreateLoopKernelAcross(new SgSymbol(FUNCTION_NAME, kernel_symb.c_str(), *C_VoidType(), *block_C), &retValueForKernel[k], tmp.acrossV, indexTypeInKernel(rtTypes[k]));
if (options.isOn(RTC))
{
acc_call_list = ACC_RTC_ExpandCallList(acc_call_list);
if (!options.isOn(C_CUDA) && options.isOn(AUTO_TFM))
{
if (strstr(kernel_symb.c_str(), "_llong") != NULL)
acc_call_list = AddNewToSymbList(acc_call_list, createNewFunctionSymbol("dvmh_convert_XY_llong"));
else if (strstr(kernel_symb.c_str(), "_int") != NULL)
acc_call_list = AddNewToSymbList(acc_call_list, createNewFunctionSymbol("dvmh_convert_XY_int"));
}
}
}
if (newVars.size() != 0)
{
correctPrivateList(RESTORE);
newVars.clear();
}
if (contitionOfOptimization)
{
delete currentLoop;
currentLoop = NULL;
}
}
if (options.isOn(RTC))
{
for (unsigned diff = 0; diff < RTC_FCall.size() / countKernels; ++diff)
{
for (unsigned k = 0; k < countKernels; ++k)
RTC_FKernelArgs.push_back((SgFunctionCallExp*)cuda_kernel[k]->expr(0));
}
for (unsigned k = 0; k < countKernels; ++k)
{
string kernel_symb = tmp.s_kernel_symb->identifier();
if (rtTypes[k] == rt_INT)
kernel_symb += "_int";
else if (rtTypes[k] == rt_LONG)
kernel_symb += "_long";
else if (rtTypes[k] == rt_LLONG)
kernel_symb += "_llong";
if (options.isOn(C_CUDA))
ACC_RTC_ConvertCudaKernel(cuda_kernel[k], kernel_symb.c_str());
else
ACC_RTC_AddCalledProcedureComment(new SgSymbol(VARIABLE_NAME, kernel_symb.c_str()));
}
ACC_RTC_CompleteAllParams();
}
}
mywarn(" end: create all VARIANTS");
//create new control function
st_hedr = Create_C_Function(sadapter);
st_end = st_hedr->lexNext();
fe = st_hedr->expr(0);
st_hedr->addComment(Cuda_LoopHandlerComment());
first_exec = st_end;
mywarn("start: create dummy argument list ");
// create dummy argument list: loop_ref, <dvm-array-headers>, <uses>
typ = C_PointerType(C_Derived_Type(s_DvmhLoopRef));
s_loop_ref = new SgSymbol(VARIABLE_NAME, "loop_ref", *typ, *st_hedr);
argsForVariantFunction.push_back(s_loop_ref);
ae = new SgVarRefExp(s_loop_ref); //loop_ref
ae->setType(typ);
ae = new SgPointerDerefExp(*ae);
arg_list = new SgExprListExp(*ae);
fe->setLhs(arg_list);
for (sl = acc_array_list; sl; sl = sl->next) // <dvm-array-headers>
{
SgArrayType *typearray = new SgArrayType(*C_DvmType());
typearray->addDimension(NULL);
sarg = new SgSymbol(VARIABLE_NAME, sl->symb->identifier(), *typearray, *st_hedr);
argsForVariantFunction.push_back(sarg);
ae = new SgArrayRefExp(*sarg);
ae->setType(*typearray);
el = new SgExpression(EXPR_LIST);
el->setLhs(NULL);
ae->setLhs(*el);
arg_list->setRhs(*new SgExprListExp(*ae));
arg_list = arg_list->rhs();
}
for (el = uses_list; el; el = el->rhs()) // <uses>
{
s = el->lhs()->symbol();
typ = C_PointerType(C_Type(s->type()));
sarg = new SgSymbol(VARIABLE_NAME, s->identifier(), *typ, *st_hedr);
argsForVariantFunction.push_back(sarg);
if (isByValue(s))
SYMB_ATTR(sarg->thesymb) = SYMB_ATTR(sarg->thesymb) | USE_IN_BIT;
ae = UsedValueRef(s, sarg);
ae->setType(typ);
ae = new SgPointerDerefExp(*ae);
arg_list->setRhs(*new SgExprListExp(*ae));
arg_list = arg_list->rhs();
}
mywarn(" end: create dummy argument list ");
mywarn("start: create IF BLOCK ");
SgSymbol *which_run = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("which_run"), *C_Type(SgTypeInt()), *st_hedr);
stmt = makeSymbolDeclaration(which_run);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(which_run), *GetDependencyMask(s_loop_ref)));
st_end->insertStmtBefore(*stmt, *st_hedr);
char *str = new char[64];
str[0] = '\0';
strcat(str, "which_run in ");
strncat(str, sadapter->identifier(), strlen(sadapter->identifier()) - 6);
strcat(str, " is %d\\n");
SgFunctionCallExp *tmpF2 = new SgFunctionCallExp(*createNewFunctionSymbol("printf"));
tmpF2->addArg(*new SgValueExp(str));
tmpF2->addArg(*new SgVarRefExp(which_run));
if (DVM_DEBUG_LVL > 5)
st_end->insertStmtBefore(*new SgCExpStmt(*tmpF2), *st_hedr);
SgSymbol *s_cudaEvent = new SgSymbol(TYPE_NAME, "cudaEvent_t", *block_C);
SgSymbol *cudaEventStart = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("start"), *C_Derived_Type(s_cudaEvent), *st_hedr);
SgSymbol *cudaEventStop = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("stop"), *C_Derived_Type(s_cudaEvent), *st_hedr);
SgSymbol *gpuTime = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("gpuTime"), *SgTypeFloat(), *st_hedr);
SgSymbol *minGpuTime = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("minGpuTime"), *SgTypeFloat(), *st_hedr);
SgSymbol *s_i = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("__s_i"), *C_Type(SgTypeInt()), *st_hedr);
SgSymbol *s_k = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("__s_k"), *C_Type(SgTypeInt()), *st_hedr);
SgSymbol *min_s_i = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("min_s_i"), *C_Type(SgTypeInt()), *st_hedr);
SgSymbol *min_s_k = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("min_s_k"), *C_Type(SgTypeInt()), *st_hedr);
SgSymbol *max_cuda_block = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("__max_cuda_block"), *C_Type(SgTypeInt()), *st_hedr);
SgWhileStmt *whileSt = NULL;
SgWhileStmt *whileSt1 = NULL;
SgIfStmt *if_st;
vector<vector<int> > allVarForIfBlock;
vector<SgFunctionCallExp *> allFuncCalls;
for (size_t k = 0; k < allVariants.size(); ++k)
{
SgFunctionCallExp *funcCall;
if ((size_t)allVariants[k].acrossV > allVarForIfBlock.size() &&
(allVariants[k].type == 1 || allVariants[k].type == 3 || allVariants[k].type == 7 || allVariants[k].type > 14))
{
vector<int> tmp;
generateAllBitmasks(allVariants[k].acrossV, allVariants[k].allDims, tmp);
allVarForIfBlock.push_back(tmp);
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol(allVariants[k].nameOfNewSAdapter));
for (size_t i = 0; i < argsForVariantFunction.size(); ++i)
funcCall->addArg(*new SgVarRefExp(argsForVariantFunction[i]));
funcCall->addArg(*new SgVarRefExp(which_run));
allFuncCalls.push_back(funcCall);
}
}
if (options.isOn(SPEED_TEST_L0))
{
stmt = makeSymbolDeclarationWithInit(s_i, new SgValueExp(16));
st_hedr->insertStmtAfter(*stmt, *st_hedr);
stmt = makeSymbolDeclarationWithInit(s_k, new SgValueExp(1));
st_hedr->insertStmtAfter(*stmt, *st_hedr);
stmt = makeSymbolDeclaration(min_s_i);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
stmt = makeSymbolDeclaration(min_s_k);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
stmt = makeSymbolDeclaration(max_cuda_block);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
stmt = makeSymbolDeclarationWithInit(minGpuTime, new SgValueExp(99999));
st_hedr->insertStmtAfter(*stmt, *st_hedr);
stmt = makeSymbolDeclaration(gpuTime);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
stmt = makeSymbolDeclaration(cudaEventStart);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
stmt = makeSymbolDeclaration(cudaEventStop);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
SgFunctionCallExp *eventF = new SgFunctionCallExp(*createNewFunctionSymbol("cudaEventCreate"));
eventF->addArg(SgAddrOp(*new SgVarRefExp(cudaEventStart)));
st_end->insertStmtBefore(*new SgCExpStmt(*eventF), *st_hedr);
eventF = new SgFunctionCallExp(*createNewFunctionSymbol("cudaEventCreate"));
eventF->addArg(SgAddrOp(*new SgVarRefExp(cudaEventStop)));
st_end->insertStmtBefore(*new SgCExpStmt(*eventF), *st_hedr);
SgFunctionCallExp *tmpF = new SgFunctionCallExp(*createNewFunctionSymbol("printf"));
tmpF->addArg(*new SgValueExp(getLoopLine(sadapter->identifier())));
st_end->insertStmtBefore(*new SgCExpStmt(*tmpF), *st_hedr);
tmpF2 = new SgFunctionCallExp(*createNewFunctionSymbol("MAX"));
tmpF2->addArg(*new SgVarRefExp(allRegNames[0]));
if (allRegNames.size() == 1)
tmpF2->addArg(*new SgVarRefExp(allRegNames[0]));
else
tmpF2->addArg(*new SgVarRefExp(allRegNames[1]));
for (size_t i = 2; i < allRegNames.size(); ++i)
{
SgFunctionCallExp *tmpF1 = new SgFunctionCallExp(*createNewFunctionSymbol("MAX"));
tmpF1->addArg(*tmpF2);
tmpF1->addArg(*new SgVarRefExp(allRegNames[i]));
tmpF2 = tmpF1;
}
tmpF = new SgFunctionCallExp(*createNewFunctionSymbol("MIN"));
tmpF->addArg(*new SgValueExp(384));
tmpF->addArg(*new SgValueExp(65535) / *tmpF2);
tmpF2 = tmpF;
st_end->insertStmtBefore(*new SgCExpStmt(SgAssignOp(*new SgVarRefExp(max_cuda_block), *tmpF2)), *st_hedr);
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(s_i), *new SgVarRefExp(s_i) + *new SgValueExp(16)));
whileSt = new SgWhileStmt(*new SgVarRefExp(s_i) < *new SgValueExp(257), *stmt);
st_hedr->lastExecutable()->insertStmtAfter(*whileSt, *st_hedr);
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(s_k), *new SgVarRefExp(s_k) + *new SgValueExp(1)));
whileSt1 = new SgWhileStmt(*new SgVarRefExp(s_k) < *new SgValueExp(17), *stmt);
whileSt->insertStmtAfter(*whileSt1);
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(s_k), *new SgValueExp(1)));
whileSt->insertStmtAfter(*stmt);
}
for (size_t i = 0; i < allVarForIfBlock.size(); ++i)
{
SgExpression *e = NULL;
for (size_t k = 0; k < allVarForIfBlock[i].size(); ++k)
{
if (k == 0)
e = &(SgEqOp(*new SgVarRefExp(which_run), *new SgValueExp(allVarForIfBlock[i][k])));
else
e = &(*e || SgEqOp(*new SgVarRefExp(which_run), *new SgValueExp(allVarForIfBlock[i][k])));
}
if (options.isOn(SPEED_TEST_L0))
{
allFuncCalls[i]->addArg(*new SgVarRefExp(s_i));
allFuncCalls[i]->addArg(*new SgVarRefExp(s_k));
}
stmt = new SgCExpStmt(*allFuncCalls[i]);
if_st = new SgIfStmt(*e, *stmt);
if (!options.isOn(SPEED_TEST_L0))
st_end->insertStmtBefore(*if_st, *st_hedr);
else
{
whileSt1->lastExecutable()->insertStmtBefore(*if_st);
}
}
tmpF2 = new SgFunctionCallExp(*createNewFunctionSymbol("printf"));
tmpF2->addArg(*new SgValueExp("It may be wrong!!\\n"));
if (DVM_DEBUG_LVL > 5)
{
if_st = new SgIfStmt(SgEqOp(*new SgVarRefExp(which_run), *new SgValueExp(0)), *new SgCExpStmt(*tmpF2));
st_end->insertStmtBefore(*if_st, *st_hedr);
}
if (options.isOn(SPEED_TEST_L0))
{
SgFunctionCallExp *tmpF = new SgFunctionCallExp(*createNewFunctionSymbol("cudaEventRecord"));
tmpF->addArg(*new SgVarRefExp(cudaEventStart));
tmpF->addArg(*new SgValueExp(0));
whileSt1->insertStmtAfter(*new SgCExpStmt(*tmpF));
tmpF = new SgFunctionCallExp(*createNewFunctionSymbol("cudaEventRecord"));
tmpF->addArg(*new SgVarRefExp(cudaEventStop));
tmpF->addArg(*new SgValueExp(0));
whileSt1->lastExecutable()->insertStmtBefore(*new SgCExpStmt(*tmpF));
tmpF = new SgFunctionCallExp(*createNewFunctionSymbol("cudaEventSynchronize"));
tmpF->addArg(*new SgVarRefExp(cudaEventStop));
whileSt1->lastExecutable()->insertStmtBefore(*new SgCExpStmt(*tmpF));
tmpF = new SgFunctionCallExp(*createNewFunctionSymbol("cudaEventElapsedTime"));
tmpF->addArg(SgAddrOp(*new SgVarRefExp(gpuTime)));
tmpF->addArg(*new SgVarRefExp(cudaEventStart));
tmpF->addArg(*new SgVarRefExp(cudaEventStop));
whileSt1->lastExecutable()->insertStmtBefore(*new SgCExpStmt(*tmpF));
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(min_s_i), *new SgVarRefExp(s_i)));
if_st = new SgIfStmt(*new SgVarRefExp(gpuTime) < *new SgVarRefExp(minGpuTime), *stmt);
whileSt1->lastExecutable()->insertStmtBefore(*if_st);
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(min_s_k), *new SgVarRefExp(s_k)));
if_st->insertStmtAfter(*stmt);
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(minGpuTime), *new SgVarRefExp(gpuTime)));
if_st->insertStmtAfter(*stmt);
if (options.isOn(SPEED_TEST_L1))
{
tmpF = new SgFunctionCallExp(*createNewFunctionSymbol("printf"));
tmpF->addArg(*new SgValueExp(" cuda-block [%d, %d] with time - %f ms\\n"));
tmpF->addArg(*new SgVarRefExp(s_i));
tmpF->addArg(*new SgVarRefExp(s_k));
tmpF->addArg(*new SgVarRefExp(gpuTime));
whileSt1->lastExecutable()->insertStmtBefore(*new SgCExpStmt(*tmpF));
}
tmpF = new SgFunctionCallExp(*createNewFunctionSymbol("printf"));
tmpF->addArg(*new SgValueExp(" minimum time = %f ms, optimal cuda-block = [%d, %d]\\n\\n"));
tmpF->addArg(*new SgVarRefExp(minGpuTime));
tmpF->addArg(*new SgVarRefExp(min_s_i));
tmpF->addArg(*new SgVarRefExp(min_s_k));
st_end->insertStmtBefore(*new SgCExpStmt(*tmpF), *st_hedr);
SgFunctionCallExp *eventF = new SgFunctionCallExp(*createNewFunctionSymbol("cudaEventDestroy"));
eventF->addArg(*new SgVarRefExp(cudaEventStart));
st_end->insertStmtBefore(*new SgCExpStmt(*eventF), *st_hedr);
eventF = new SgFunctionCallExp(*createNewFunctionSymbol("cudaEventDestroy"));
eventF->addArg(*new SgVarRefExp(cudaEventStop));
st_end->insertStmtBefore(*new SgCExpStmt(*eventF), *st_hedr);
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(s_k), *new SgVarRefExp(s_k) + *new SgValueExp(1)));
SgContinueStmt *contST = new SgContinueStmt();
if_st = new SgIfStmt(*new SgVarRefExp(s_k) * *new SgVarRefExp(s_i) > *new SgVarRefExp(max_cuda_block), *contST);
whileSt1->insertStmtAfter(*if_st);
if_st->insertStmtAfter(*stmt);
}
mywarn(" end: create IF BLOCK ");
}
if (options.isOn(C_CUDA))
RenamingCudaFunctionVariables(st_hedr, s_loop_ref, 0); //(st_hedr, current_symbol->next(), 0);
return NULL;
}
vector<ArgsForKernel> Create_C_Adapter_Function_Across_OneThread(SgSymbol *sadapter, SgSymbol *kernel_symb, const int loopV, const int acrossV)
{
#if debugMode
warn("PARALLEL directive with ACROSS clause in region", 420, dvm_parallel_dir);
#endif
SgSymbol **reduction_ptr;
SgSymbol *lowI, *highI, *idxI;
symb_list *sl;
SgStatement *st_hedr, *st_end, *stmt, *first_exec;
SgExpression *fe, *ae, *arg_list, *el, *e, *espec, *er;
SgSymbol *s_loop_ref, *sarg, *s, *sb, *sg, *sdev, *h_first, *hgpu_first, *base_first, *uses_first, *scalar_first;
SgSymbol *s_blocks, *s_threads, *s_dev_num, *s_tmp_var, *idxTypeInKernel;
SgType *typ;
SgFunctionCallExp *funcCall;
vector<char*> dvm_array_headers;
int ln, num, uses_num, has_red_array, use_device_num, num_of_red_arrays = 0, nbuf = 0;
// init block
reduction_ptr = NULL;
lowI = highI = idxI = h_first = hgpu_first = base_first = red_first = uses_first = scalar_first = NULL;
s_loop_ref = sarg = s = sb = sg = sdev = h_first = s_blocks = s_threads = s_dev_num = s_tmp_var = NULL;
sl = NULL;
typ = NULL;
funcCall = NULL;
st_hedr = st_end = stmt = first_exec = NULL;
fe = ae = arg_list = el = e = espec = er = NULL;
ln = num = uses_num = has_red_array = use_device_num = num_of_red_arrays = 0;
// end of init block
mywarn("start: create fuction header ");
// create fuction header
st_hedr = Create_C_Function(sadapter);
st_hedr->addComment(Cuda_LoopHandlerComment());
st_end = st_hedr->lexNext();
fe = st_hedr->expr(0);
first_exec = st_end;
mywarn(" end: create fuction header ");
mywarn("start: create dummy argument list ");
// create dummy argument list: loop_ref, <dvm-array-headers>, <uses>
typ = C_PointerType(C_Derived_Type(s_DvmhLoopRef));
s_loop_ref = new SgSymbol(VARIABLE_NAME, "loop_ref", *typ, *st_hedr);
ae = new SgVarRefExp(s_loop_ref); //loop_ref
ae->setType(typ);
ae = new SgPointerDerefExp(*ae);
arg_list = new SgExprListExp(*ae);
fe->setLhs(arg_list);
for (sl = acc_array_list, ln = 0; sl; sl = sl->next, ++ln) // <dvm-array-headers>
{
SgArrayType *typearray = new SgArrayType(*C_DvmType());
typearray->addDimension(NULL);
sarg = new SgSymbol(VARIABLE_NAME, sl->symb->identifier(), *typearray, *st_hedr);
dvm_array_headers.push_back(sl->symb->identifier());
ae = new SgArrayRefExp(*sarg);
ae->setType(*typearray);
el = new SgExpression(EXPR_LIST);
el->setLhs(NULL);
ae->setLhs(*el);
arg_list->setRhs(*new SgExprListExp(*ae));
arg_list = arg_list->rhs();
if (!ln)
h_first = sarg;
if (IS_REMOTE_ACCESS_BUFFER(sl->symb)) // case of RTS2 interface
nbuf++;
}
for (el = uses_list, ln = 0; el; el = el->rhs(), ++ln) // <uses>
{
s = el->lhs()->symbol();
typ = C_PointerType(C_Type(s->type()));
sarg = new SgSymbol(VARIABLE_NAME, s->identifier(), *typ, *st_hedr);
if (isByValue(s))
SYMB_ATTR(sarg->thesymb) = SYMB_ATTR(sarg->thesymb) | USE_IN_BIT;
ae = UsedValueRef(s, sarg);
ae->setType(typ);
ae = new SgPointerDerefExp(*ae);
arg_list->setRhs(*new SgExprListExp(*ae));
arg_list = arg_list->rhs();
if (!ln)
uses_first = sarg;
}
uses_num = ln;
mywarn(" end: create dummy argument list ");
if (red_list) // reduction section
{
mywarn("start: in reduction section ");
s_tmp_var = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("tmpVar"), *C_DvmType(), *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
//looking through the reduction_op_list
for (er = red_list; er; er = er->rhs())
num_of_red_arrays++;
reduction_ptr = new SgSymbol*[num_of_red_arrays];
for (er = red_list, ln = 0; er; er = er->rhs(), ++ln)
{
SgExpression *ered, *ev, *en, *loc_var_ref;
SgSymbol *sred, *s_loc_var, *sgrid_loc;
int is_array;
SgType *loc_type = NULL, *btype = NULL;
loc_var_ref = NULL;
s_loc_var = NULL;
is_array = 0;
ered = er->lhs(); // reduction (variant==ARRAY_OP)
ev = ered->rhs(); // reduction variable reference for reduction operations except MINLOC,MAXLOC
if (isSgExprListExp(ev))
{
ev = ev->lhs(); // reduction variable reference
loc_var_ref = ered->rhs()->rhs()->lhs(); //location array reference
en = ered->rhs()->rhs()->rhs()->lhs(); // number of elements in location array
loc_el_num = LocElemNumber(en);
loc_type = loc_var_ref->symbol()->type();
}
else if (isSgArrayRefExp(ev) && !ev->lhs()) //whole array
is_array = 1;
s = sred = &(ev->symbol()->copy());
SYMB_SCOPE(s->thesymb) = st_hedr->thebif;
if (is_array)
{
SgArrayType *typearray = new SgArrayType(*C_Type(ev->symbol()->type()));
typearray->addRange(*ArrayLengthInElems(ev->symbol(), NULL, 0));
s->setType(*typearray);
}
else
s->setType(C_Type(ev->symbol()->type()));
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
if (!ln)
red_first = s;
s_loc_var = sgrid_loc = NULL;
if (loc_var_ref)
{
s = s_loc_var = &(loc_var_ref->symbol()->copy());
if (isSgArrayType(loc_type))
btype = loc_type->baseType();
else
btype = loc_type;
//!printf("__112\n");
SgArrayType *typearray = new SgArrayType(*C_Type(btype));
typearray->addRange(*new SgValueExp(loc_el_num));
s_loc_var->setType(*typearray);
SYMB_SCOPE(s->thesymb) = st_hedr->thebif;
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
s = sgrid_loc = GridSymbolForRedInAdapter(s, st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
}
//!printf("__113\n");
/*--- executable statements: register reductions in RTS ---*/
e = &SgAssignOp(*new SgVarRefExp(s_tmp_var), *new SgValueExp(ln+1));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
if (!ln)
{
stmt->addComment("// Register reduction for CUDA-execution");
first_exec = stmt;
}
stmt = new SgCExpStmt(*InitReduction(s_loop_ref, s_tmp_var, sred, s_loc_var));
st_end->insertStmtBefore(*stmt, *st_hedr);
}
for (er = red_list, ln = 0; er; er = er->rhs(), ++ln)
{
char *buf_tmp = new char[8];
sprintf(buf_tmp, "%d", ln);
reduction_ptr[ln] = new SgSymbol(VARIABLE_NAME, TestAndCorrectName(strcat(strcpy(new char[16], "cuda_ptr_"), buf_tmp)), *C_PointerType(C_Type(er->lhs()->rhs()->symbol()->type())), *st_hedr);
st_hedr->insertStmtAfter(*makeSymbolDeclaration(reduction_ptr[ln]), *st_hedr);
delete[]buf_tmp;
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("cudaMalloc"));
funcCall->addArg(*new SgCastExp(*C_PointerType(C_PointerType(SgTypeVoid())), SgAddrOp(*new SgVarRefExp(reduction_ptr[ln]))));
funcCall->addArg(SgSizeOfOp(*new SgKeywordValExp(getKeyWordType(reduction_ptr[ln]->type()))));
stmt = new SgCExpStmt(*funcCall);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
mywarn(" end: out reduction section ");
}
mywarn("start: create vars ");
// create type for static arrays
SgArrayType *tpArr = new SgArrayType(*LongT);
SgValueExp *dimSize = new SgValueExp(loopV + acrossV + 2);
tpArr->addDimension(dimSize);
lowI = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("lowI"), *LongT, *st_hedr);
s->setType(tpArr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
highI = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("highI"), *LongT, *st_hedr);
s->setType(tpArr);
addDeclExpList(s, stmt->expr(0));
idxI = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("idxI"), *LongT, *st_hedr);
s->setType(tpArr);
addDeclExpList(s, stmt->expr(0));
idxTypeInKernel = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("idxTypeInKernel"), *LongT, *st_hedr);
addDeclExpList(s, stmt->expr(0));
mywarn(" end: create vars ");
mywarn("start: create assigns");
s_blocks = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("blocks"), *t_dim3, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
s_threads = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("threads"), *t_dim3, *st_hedr);
addDeclExpList(s, stmt->expr(0));
for (s = uses_first, ln = 0; ln < uses_num; s = s->next(), ++ln) // uses
if (!(s->attributes() & USE_IN_BIT)) // passing to kernel scalar argument by reference
{
sdev = GpuScalarAdrSymbolInAdapter(s, st_hedr); // creating new symbol for address in device
if (!ln)
{
scalar_first = sdev;
stmt = makeSymbolDeclaration(sdev);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
}
else
addDeclExpList(sdev, stmt->expr(0));
}
for (sl = acc_array_list, ln = 0; sl; sl = sl->next, ++ln)
{
s = GpuHeaderSymbolInAdapter(sl->symb, st_hedr);
if (!ln)
{
hgpu_first = s;
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
}
else
addDeclExpList(s, stmt->expr(0));
}
for (sl = acc_array_list, ln = 0; sl; sl = sl->next, ++ln)
{
s = GpuBaseSymbolInAdapter(sl->symb, st_hedr);
if (!ln)
{
base_first = s;
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
}
else
addDeclExpList(s, stmt->expr(0));
}
num = ln;
/* -------- call dvmh_get_device_addr(long *deviceRef, void *variable) ----*/
for (s = uses_first, sdev = scalar_first, ln = 0; ln < uses_num; s = s->next(), ++ln) // uses
if (!(s->attributes() & USE_IN_BIT)) // passing to kernel scalar argument by reference
{
s_dev_num = doDeviceNumVar(st_hedr, first_exec, s_dev_num, s_loop_ref);
e = &SgAssignOp(*new SgVarRefExp(sdev), *GetDeviceAddr(s_dev_num, s));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
if (!ln)
stmt->addComment("// Get device addresses");
sdev = sdev->next();
}
/* -------- call dvmh_get_natural_base(long *deviceRef, long dvmDesc[] ) ----*/
for (sl = acc_array_list, s = h_first, sb = base_first, ln = 0; ln < num; sl = sl->next, s = s->next(), sb = sb->next(), ln++)
{
s_dev_num = doDeviceNumVar(st_hedr, first_exec, s_dev_num, s_loop_ref);
e = &SgAssignOp(*new SgVarRefExp(sb), *GetNaturalBase(s_dev_num, s));
stmt = new SgCExpStmt(*e);
SgStatement *cur = stmt;
st_end->insertStmtBefore(*stmt, *st_hedr);
if (IS_REMOTE_ACCESS_BUFFER(sl->symb)) // case of RTS2 interface
{
e = LoopGetRemoteBuf(s_loop_ref, nbuf--, s);
stmt = new SgCExpStmt(*e);
cur->insertStmtBefore(*stmt, *st_hedr);
}
if (!ln)
stmt->addComment("// Get natural bases");
}
/* -------- call dvmh_fill_header_(long *deviceRef, void *base, long dvmDesc[], long dvmhDesc[]);----*/
for (s = h_first, sg = hgpu_first, sb = base_first, ln = 0; ln < num; s = s->next(), sg = sg->next(), sb = sb->next(), ln++)
{
e = FillHeader(s_dev_num, sb, s, sg);
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
if (!ln)
stmt->addComment("// Fill device headers");
}
/* -------- call loop_fill_bounds_(loop_ref, lowI, highI, idxI); ----*/
stmt = new SgCExpStmt(*FillBounds(s_loop_ref, lowI, highI, idxI));
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt->addComment("// Get bounds");
mywarn(" end: create assigns");
stmt = new SgCExpStmt(SgAssignOp(*new SgRecordRefExp(*s_blocks, "x"), *new SgValueExp(1)));
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt->addComment("// Start counting");
stmt = new SgCExpStmt(SgAssignOp(*new SgRecordRefExp(*s_threads, "x"), *new SgValueExp(1)));
st_end->insertStmtBefore(*stmt, *st_hedr);
if (options.isOn(RTC))
{
/* -------- call loop_cuda_rtc_set_lang_(loop_ref, lang); ------------*/
if (options.isOn(C_CUDA))
stmt = new SgCExpStmt(*RtcSetLang(s_loop_ref, 1));
else
stmt = new SgCExpStmt(*RtcSetLang(s_loop_ref, 0));
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt->addComment("// Set CUDA language for launching kernels in RTC");
}
/* -------- call loop_guess_index_type_(loop_ref); ------------*/
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(idxTypeInKernel), *GuessIndexType(s_loop_ref)));
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt->addComment("// Guess index type in CUDA kernel");
SgFunctionCallExp *sizeofL = new SgFunctionCallExp(*createNewFunctionSymbol("sizeof"));
SgFunctionCallExp *sizeofLL = new SgFunctionCallExp(*createNewFunctionSymbol("sizeof"));
SgFunctionCallExp *sizeofI = new SgFunctionCallExp(*createNewFunctionSymbol("sizeof"));
sizeofL->addArg(*new SgKeywordValExp("long")); //addArg(*new SgVarRefExp(new SgSymbol(VARIABLE_NAME, "long")));
sizeofLL->addArg(*new SgKeywordValExp("long long")); //addArg(*new SgVarRefExp(new SgSymbol(VARIABLE_NAME, "long long")));
sizeofI->addArg(*new SgKeywordValExp("int")); //addArg(*new SgVarRefExp(new SgSymbol(VARIABLE_NAME, "int")));
stmt = new SgIfStmt(SgEqOp(*new SgVarRefExp(idxTypeInKernel), *new SgVarRefExp(*new SgSymbol(VARIABLE_NAME, "rt_LONG")))
&&
SgEqOp(*sizeofL, *sizeofI),
*new SgCExpStmt(SgAssignOp(*new SgVarRefExp(idxTypeInKernel), *new SgVarRefExp(*new SgSymbol(VARIABLE_NAME, "rt_INT")))));
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt = new SgIfStmt(SgEqOp(*new SgVarRefExp(idxTypeInKernel), *new SgVarRefExp(*new SgSymbol(VARIABLE_NAME, "rt_LONG")))
&&
SgEqOp(*sizeofL, *sizeofLL),
*new SgCExpStmt(SgAssignOp(*new SgVarRefExp(idxTypeInKernel), *new SgVarRefExp(*new SgSymbol(VARIABLE_NAME, "rt_LLONG")))));
st_end->insertStmtBefore(*stmt, *st_hedr);
/* args for kernel */
{
espec = CreateBlocksThreadsSpec(s_blocks, s_threads);
funcCall = CallKernel(kernel_symb, espec);
for (sg = hgpu_first, sb = base_first, sl = acc_array_list, ln = 0; ln<num; sg = sg->next(), sb = sb->next(), sl = sl->next, ln++)
{
e = new SgCastExp(*C_PointerType(options.isOn(C_CUDA) ? C_Type(sl->symb->type()) : new SgDescriptType(*SgTypeChar(), BIT_SIGNED)), *new SgVarRefExp(sb));
funcCall->addArg(*e);
for (int i = NumberOfCoeffs(sg); i>0; i--)
funcCall->addArg(*new SgArrayRefExp(*sg, *new SgValueExp(i)));
}
if (red_list)
{
reduction_operation_list *rsl;
int i = 0;
for (rsl = red_struct_list, s = red_first; rsl; rsl = rsl->next, ++i) //s!=s_blocks_info
{
if (rsl->redvar_size == 0) //reduction variable is scalar
{
if (options.isOn(RTC))
{
SgVarRefExp *toAdd = new SgVarRefExp(s);
toAdd->addAttribute(RTC_NOT_REPLACE);
funcCall->addArg(*toAdd);
}
else
funcCall->addArg(*new SgVarRefExp(s));
}
else
{
int i;
has_red_array = 1;
for (i = 0; i < rsl->redvar_size; i++)
funcCall->addArg(*new SgArrayRefExp(*s, *new SgValueExp(i)));
}
s = s->next();
if (options.isOn(C_CUDA))
funcCall->addArg(*new SgVarRefExp(reduction_ptr[i]));
else
funcCall->addArg(*new SgCastExp(*C_PointerType(new SgDescriptType(*SgTypeChar(), BIT_SIGNED)), *new SgVarRefExp(reduction_ptr[i])));
}
}
for (s = uses_first, sdev = scalar_first, ln = 0; ln < uses_num; s = s->next(), ln++) // uses
{
if (s->attributes() & USE_IN_BIT)
funcCall->addArg(SgDerefOp(*new SgVarRefExp(*s))); // passing argument by value to kernel
else
{ // passing argument by reference to kernel
SgType *tp = NULL;
if (s->type()->hasBaseType())
tp = s->type()->baseType();
else
tp = s->type();
e = new SgCastExp(*C_PointerType(options.isOn(C_CUDA) ? tp : new SgDescriptType(*SgTypeChar(), BIT_SIGNED)), *new SgVarRefExp(sdev));
funcCall->addArg(*e);
sdev = sdev->next();
}
}
for (int i = 0; i < acrossV + loopV; ++i)
{
funcCall->addArg(*new SgArrayRefExp(*lowI, *new SgValueExp(i)));
funcCall->addArg(*new SgArrayRefExp(*highI, *new SgValueExp(i)));
funcCall->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(i)));
}
}
stmt = createKernelCallsInCudaHandler(funcCall, s_loop_ref, idxTypeInKernel, s_blocks);
st_end->insertStmtBefore(*stmt, *st_hedr);
if (red_list)
{
ln = 0;
for (er = red_list; er; er = er->rhs(), ++ln)
{
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("cudaMemcpy"));
funcCall->addArg(SgAddrOp(*new SgVarRefExp(&(er->lhs()->rhs()->symbol()->copy()))));
funcCall->addArg(*new SgVarRefExp(reduction_ptr[ln]));
funcCall->addArg(SgSizeOfOp(*new SgKeywordValExp(getKeyWordType(reduction_ptr[ln]->type()))));
funcCall->addArg(*new SgVarRefExp(*new SgSymbol(VARIABLE_NAME, "cudaMemcpyDeviceToHost")));
stmt = new SgCExpStmt(*funcCall);
st_end->insertStmtBefore(*stmt, *st_hedr);
e = &SgAssignOp(*new SgVarRefExp(*s_tmp_var), *new SgValueExp(ln+1));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt = new SgCExpStmt(*RedPost(s_loop_ref, s_tmp_var, &(er->lhs()->rhs()->symbol()->copy()), NULL)); // loop_red_post_
st_end->insertStmtBefore(*stmt, *st_hedr);
}
ln = 0;
for (er = red_list; er; er = er->rhs(), ++ln)
{
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("cudaFree"));
funcCall->addArg(*new SgVarRefExp(reduction_ptr[ln]));
stmt = new SgCExpStmt(*funcCall);
st_end->insertStmtBefore(*stmt, *st_hedr);
if (ln == 0)
stmt->addComment("// Free temporary variables");
}
}
// create args for kernel and return it
vector<ArgsForKernel> argsKernel(countKernels);
for (unsigned i = 0; i < countKernels; ++i)
argsKernel[i].st_header = st_hedr;
delete[]reduction_ptr;
mywarn(" end Adapter Function");
if (options.isOn(C_CUDA))
RenamingCudaFunctionVariables(st_hedr, s_loop_ref, 0);
return argsKernel;
}
static inline void insertReductionArgs(SgSymbol **reduction_ptr, SgSymbol **reduction_loc_ptr,
SgSymbol **reduction_symb, SgSymbol **reduction_loc_symb,
SgFunctionCallExp *funcCallKernel, SgSymbol* numBlocks, int &has_red_array)
{
reduction_operation_list *rsl;
SgSymbol *s;
SgExpression *e;
for (rsl = red_struct_list, s = red_first; rsl; rsl = rsl->next) //s!=s_blocks_info
{
if (rsl->redvar_size > 0)
{
funcCallKernel->addArg(*new SgVarRefExp(*numBlocks));
break;
}
}
int i = 0;
for (rsl = red_struct_list, s = red_first; rsl; rsl = rsl->next, ++i) //s!=s_blocks_info
{
if (rsl->redvar_size == 0) //reduction variable is scalar
{
if (options.isOn(RTC))
{
SgVarRefExp *toAdd = new SgVarRefExp(reduction_symb[i]);
toAdd->addAttribute(RTC_NOT_REPLACE);
funcCallKernel->addArg(*toAdd);
}
else
funcCallKernel->addArg(*new SgVarRefExp(reduction_symb[i]));
}
else //TODO!!
{
has_red_array = 1;
for (int k = 0; k < rsl->redvar_size; ++k)
funcCallKernel->addArg(*new SgArrayRefExp(*reduction_symb[i], *new SgValueExp(k)));
}
if (options.isOn(C_CUDA))
funcCallKernel->addArg(*new SgVarRefExp(reduction_ptr[i]));
else
funcCallKernel->addArg(*new SgCastExp(*C_PointerType(new SgDescriptType(*SgTypeChar(), BIT_SIGNED)), *new SgVarRefExp(reduction_ptr[i])));
//TODO!!
if (rsl->locvar) //MAXLOC,MINLOC
{
for (int k = 0; k < rsl->number; ++k)
funcCallKernel->addArg(*new SgArrayRefExp(*reduction_loc_symb[i], *new SgValueExp(k)));
s = s->next();
e = new SgCastExp(*C_PointerType(options.isOn(C_CUDA) ? C_Type(rsl->locvar->type()) : new SgDescriptType(*SgTypeChar(), BIT_SIGNED)), *new SgVarRefExp(s));
funcCallKernel->addArg(*e);
s = s->next();
}
}
}
static void createArgsForKernelForTwoDeps(SgFunctionCallExp*& funcCallKernel, SgSymbol* kernel_symb, SgExpression* espec, SgSymbol*& sg, SgSymbol* hgpu_first,
SgSymbol*& sb, SgSymbol* base_first, symb_list*& sl, int& ln, int num, SgExpression*& e, SgSymbol** reduction_ptr,
SgSymbol** reduction_loc_ptr, SgSymbol** reduction_symb, SgSymbol** reduction_loc_symb, SgSymbol* red_blocks, int& has_red_array,
SgSymbol* diag, const int& loopV, SgSymbol** num_elems, const int& acrossV, SgSymbol* acrossBase[16], SgSymbol* loopBase[16],
SgSymbol* idxI, const vector<SageSymbols>& loopAcrossSymb, const vector<SageSymbols>& loopSymb, SgSymbol*& s, SgSymbol* uses_first,
SgSymbol*& sdev, SgSymbol* scalar_first, int uses_num, vector<char*>& dvm_array_headers,
SgSymbol** addressingParams, SgSymbol** outTypeOfTransformation, SgSymbol* type_of_run, SgSymbol* bIdxs)
{
funcCallKernel = CallKernel(kernel_symb, espec);
for (sg = hgpu_first, sb = base_first, sl = acc_array_list, ln = 0; ln < num; sg = sg->next(), sb = sb->next(), sl = sl->next, ln++)
{
e = new SgCastExp(*C_PointerType(options.isOn(C_CUDA) ? C_Type(sl->symb->type()) : new SgDescriptType(*SgTypeChar(), BIT_SIGNED)), *new SgVarRefExp(sb));
funcCallKernel->addArg(*e);
for (int i = NumberOfCoeffs(sg); i > 0; i--)
funcCallKernel->addArg(*new SgArrayRefExp(*sg, *new SgValueExp(i)));
}
if (red_list)
insertReductionArgs(reduction_ptr, reduction_loc_ptr, reduction_symb, reduction_loc_symb, funcCallKernel, red_blocks, has_red_array);
if (options.isOn(RTC)) // diag is modifiable value
{
SgVarRefExp* toAdd = new SgVarRefExp(diag);
toAdd->addAttribute(RTC_NOT_REPLACE);
funcCallKernel->addArg(*toAdd);
}
else
funcCallKernel->addArg(*new SgVarRefExp(diag));
if (loopV > 2)
for (int k = 1; k < loopV + 2; ++k)
{
if (loopV > 2 && k == 2)
continue;
funcCallKernel->addArg(*new SgVarRefExp(num_elems[k]));
}
else if (loopV > 0)
for (int k = 1; k < loopV + 1; ++k)
funcCallKernel->addArg(*new SgVarRefExp(num_elems[k]));
for (int i = 0; i < acrossV; ++i)
{
if (i <= 1 && options.isOn(RTC)) // across base is modifiable value
{
SgVarRefExp* toAdd = new SgVarRefExp(acrossBase[i]);
toAdd->addAttribute(RTC_NOT_REPLACE);
funcCallKernel->addArg(*toAdd);
}
else
funcCallKernel->addArg(*new SgVarRefExp(acrossBase[i]));
}
for (int i = 0; i < loopV; ++i)
funcCallKernel->addArg(*new SgVarRefExp(loopBase[i]));
for (int i = 0; i < acrossV; ++i)
funcCallKernel->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[i].len)));
for (int i = 0; i < loopV; ++i)
funcCallKernel->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(loopSymb[i].len)));
for (s = uses_first, sdev = scalar_first, ln = 0; ln < uses_num; s = s->next(), ln++) // uses
{
if (s->attributes() & USE_IN_BIT)
funcCallKernel->addArg(SgDerefOp(*new SgVarRefExp(*s))); // passing argument by value to kernel
else
{ // passing argument by reference to kernel
SgType* tp = NULL;
if (s->type()->hasBaseType())
tp = s->type()->baseType();
else
tp = s->type();
e = new SgCastExp(*C_PointerType(options.isOn(C_CUDA) ? tp : new SgDescriptType(*SgTypeChar(), BIT_SIGNED)), *new SgVarRefExp(sdev));
funcCallKernel->addArg(*e);
sdev = sdev->next();
}
}
if (options.isOn(AUTO_TFM))
{
for (size_t i = 0; i < dvm_array_headers.size(); ++i)
{
funcCallKernel->addArg(*new SgArrayRefExp(*addressingParams[i], *new SgValueExp(0)));
funcCallKernel->addArg(*new SgArrayRefExp(*addressingParams[i], *new SgValueExp(1)));
funcCallKernel->addArg(*new SgArrayRefExp(*addressingParams[i], *new SgValueExp(2)));
funcCallKernel->addArg(*new SgArrayRefExp(*addressingParams[i], *new SgValueExp(3)));
funcCallKernel->addArg(*new SgArrayRefExp(*addressingParams[i], *new SgValueExp(4)));
funcCallKernel->addArg(*new SgArrayRefExp(*addressingParams[i], *new SgValueExp(5)));
funcCallKernel->addArg(*new SgArrayRefExp(*addressingParams[i], *new SgValueExp(6)));
funcCallKernel->addArg(*new SgVarRefExp(*outTypeOfTransformation[i]));
}
}
funcCallKernel->addArg(*new SgVarRefExp(type_of_run));
for (int i = 0; i < acrossV + loopV; ++i)
funcCallKernel->addArg(*new SgArrayRefExp(*bIdxs, *new SgValueExp(i)));
}
vector<ArgsForKernel> Create_C_Adapter_Function_Across_variants(SgSymbol *sadapter, SgSymbol *kernel_symb, const int loopV, const int acrossV,
const int allDims, const vector<SageSymbols>& loopSymb, const vector<SageSymbols>& loopAcrossSymb)
{
#if debugMode
warn("PARALLEL directive with ACROSS clause in region", 420, dvm_parallel_dir);
#endif
SgSymbol **num_elems = new SgSymbol*[allDims + 1];
SgSymbol **reduction_ptr = NULL, **reduction_loc_ptr = NULL, **addressingParams = NULL;
SgSymbol **reduction_symb = NULL, **reduction_loc_symb = NULL;
SgSymbol *lowI, *highI, *idxI, *bIdxs;
SgSymbol *elem, *red_blocks, *shared_mem, *stream_t;
SgSymbol *M, *N, *M1, *M2, *M3, *q, *diag, *Emax, *Emin, *Allmin, *SE, *var1, *var2, *var3;
SgSymbol *acrossBase[numLoopVars], *loopBase[numLoopVars], **outTypeOfTransformation = NULL;
SgSymbol *nums[3], *steps = NULL;
const char *s_cuda_var[3] = { "x", "y", "z" };
symb_list *sl;
SgStatement *st_hedr, *st_end, *stmt, *first_exec;
SgExpression *fe, *ae, *arg_list, *el, *e, *espec, *ex, *er;
SgSymbol *s_loop_ref, *sarg, *s, *sb, *sg, *sdev, *h_first, *hgpu_first, *base_first, *uses_first, *scalar_first;
SgSymbol *s_blocks, *s_threads, *s_dev_num, *s_tmp_var, *type_of_run, *s_i = NULL, *s_k = NULL, *s_tmp_var_1;
SgSymbol *idxTypeInKernel;
SgType *typ;
SgFunctionCallExp *funcCall, *funcCallKernel;
vector<char*> dvm_array_headers;
int ln, num, uses_num, has_red_array, use_device_num, num_of_red_arrays, nbuf = 0;
// init block
lowI = highI = idxI = elem = red_blocks = shared_mem = stream_t = bIdxs = NULL;
M = N = M1 = M2 = M3 = q = diag = Emax = Emin = Allmin = SE = var1 = var2 = var3 = NULL;
s_loop_ref = sarg = s = sb = sg = sdev = h_first = NULL;
hgpu_first = base_first = uses_first = scalar_first = NULL;
s_blocks = s_threads = s_dev_num = s_tmp_var = s_tmp_var_1 = NULL;
typ = NULL;
funcCall = funcCallKernel = NULL;
sl = NULL;
type_of_run = NULL;
st_hedr = st_end = stmt = first_exec = NULL;
fe = ae = arg_list = el = e = espec = ex = er = NULL;
ln = num = uses_num = has_red_array = use_device_num = num_of_red_arrays = 0;
//end of init block
mywarn("start: create fuction header ");
// create fuction header
st_hedr = Create_C_Function(sadapter);
st_hedr->addComment(Cuda_LoopHandlerComment());
st_end = st_hedr->lexNext();
fe = st_hedr->expr(0);
first_exec = st_end;
if (declaration_cmnt == NULL)
declaration_cmnt = "#include <cstdio>\n#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))\n#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))";
mywarn(" end: create fuction header ");
mywarn("start: create dummy argument list ");
// create dummy argument list: loop_ref, <dvm-array-headers>, <uses>
typ = C_PointerType(C_Derived_Type(s_DvmhLoopRef));
s_loop_ref = new SgSymbol(VARIABLE_NAME, "loop_ref", *typ, *st_hedr);
ae = new SgVarRefExp(s_loop_ref); //loop_ref
ae->setType(typ);
ae = new SgPointerDerefExp(*ae);
arg_list = new SgExprListExp(*ae);
fe->setLhs(arg_list);
for (sl = acc_array_list, ln = 0; sl; sl = sl->next, ++ln) // <dvm-array-headers>
{
SgArrayType *typearray = new SgArrayType(*C_DvmType());
typearray->addDimension(NULL);
sarg = new SgSymbol(VARIABLE_NAME, sl->symb->identifier(), *typearray, *st_hedr);
dvm_array_headers.push_back(sl->symb->identifier());
ae = new SgArrayRefExp(*sarg);
ae->setType(*typearray);
el = new SgExpression(EXPR_LIST);
el->setLhs(NULL);
ae->setLhs(*el);
arg_list->setRhs(*new SgExprListExp(*ae));
arg_list = arg_list->rhs();
if (!ln)
h_first = sarg;
if (IS_REMOTE_ACCESS_BUFFER(sl->symb)) // case of RTS2 interface
nbuf++;
}
for (el = uses_list, ln = 0; el; el = el->rhs(), ++ln) // <uses>
{
s = el->lhs()->symbol();
typ = C_PointerType(C_Type(s->type()));
sarg = new SgSymbol(VARIABLE_NAME, s->identifier(), *typ, *st_hedr);
if (isByValue(s))
SYMB_ATTR(sarg->thesymb) = SYMB_ATTR(sarg->thesymb) | USE_IN_BIT;
ae = UsedValueRef(s, sarg);
ae->setType(typ);
ae = new SgPointerDerefExp(*ae);
arg_list->setRhs(*new SgExprListExp(*ae));
arg_list = arg_list->rhs();
if (!ln)
uses_first = sarg;
}
uses_num = ln;
type_of_run = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("type_of_run"), *LongT, *st_hedr);
ae = new SgVarRefExp(type_of_run);
ae->setType(LongT);
arg_list->setRhs(*new SgExprListExp(*ae));
arg_list = arg_list->rhs();
if (options.isOn(SPEED_TEST_L0))
{
s_i = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("__s_i"), *C_Type(SgTypeInt()), *st_hedr);
ae = new SgVarRefExp(s_i);
ae->setType(C_Type(SgTypeInt()));
arg_list->setRhs(*new SgExprListExp(*ae));
arg_list = arg_list->rhs();
s_k = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("__s_k"), *C_Type(SgTypeInt()), *st_hedr);
ae = new SgVarRefExp(s_k);
ae->setType(C_Type(SgTypeInt()));
arg_list->setRhs(*new SgExprListExp(*ae));
arg_list = arg_list->rhs();
}
mywarn(" end: create dummy argument list ");
if (red_list) // reduction section
{
mywarn("start: in reduction section ");
s_tmp_var = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("tmpVar"), *C_DvmType(), *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
s_tmp_var_1 = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("tmpVar1"), *C_DvmType(), *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
//looking through the reduction_op_list
for (er = red_list; er; er = er->rhs())
num_of_red_arrays++;
reduction_ptr = new SgSymbol*[num_of_red_arrays];
reduction_symb = new SgSymbol*[num_of_red_arrays];
reduction_loc_ptr = new SgSymbol*[num_of_red_arrays];
reduction_loc_symb = new SgSymbol*[num_of_red_arrays];
for (er = red_list, ln = 0; er; er = er->rhs(), ++ln)
{
SgExpression *ered, *ev, *en, *loc_var_ref;
SgSymbol *sred, *s_loc_var, *sgrid_loc;
int is_array;
SgType *loc_type = NULL, *btype = NULL;
loc_var_ref = NULL;
s_loc_var = NULL;
is_array = 0;
ered = er->lhs(); // reduction (variant==ARRAY_OP)
ev = ered->rhs(); // reduction variable reference for reduction operations except MINLOC,MAXLOC
if (isSgExprListExp(ev))
{
ev = ev->lhs(); // reduction variable reference
loc_var_ref = ered->rhs()->rhs()->lhs(); //location array reference
en = ered->rhs()->rhs()->rhs()->lhs(); // number of elements in location array
loc_el_num = LocElemNumber(en);
loc_type = loc_var_ref->symbol()->type();
}
else if (isSgArrayRefExp(ev) && !ev->lhs()) //whole array
is_array = 1;
s = sred = &(ev->symbol()->copy());
SYMB_SCOPE(s->thesymb) = st_hedr->thebif;
if (is_array)
{
SgArrayType *typearray = new SgArrayType(*C_Type(ev->symbol()->type()));
typearray->addRange(*ArrayLengthInElems(ev->symbol(), NULL, 0));
s->setType(*typearray);
}
else
s->setType(C_Type(ev->symbol()->type()));
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
reduction_symb[ln] = s;
if (!ln)
red_first = s;
s_loc_var = sgrid_loc = NULL;
if (loc_var_ref)
{
s = s_loc_var = &(loc_var_ref->symbol()->copy());
if (isSgArrayType(loc_type))
btype = loc_type->baseType();
else
btype = loc_type;
//!printf("__112\n");
SgArrayType *typearray = new SgArrayType(*C_Type(btype));
typearray->addRange(*new SgValueExp(loc_el_num));
s_loc_var->setType(*typearray);
SYMB_SCOPE(s->thesymb) = st_hedr->thebif;
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
reduction_loc_symb[ln] = s_loc_var;
s = sgrid_loc = GridSymbolForRedInAdapter(s, st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
}
//!printf("__113\n");
/*--- executable statements: register reductions in RTS ---*/
e = &SgAssignOp(*new SgVarRefExp(s_tmp_var), *new SgValueExp(ln+1));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
if (!ln)
{
stmt->addComment("// Register reduction for CUDA-execution");
first_exec = stmt;
}
char *buf_tmp = new char[8];
sprintf(buf_tmp, "%d", ln);
reduction_ptr[ln] = new SgSymbol(VARIABLE_NAME, TestAndCorrectName(strcat(strcpy(new char[16], "cuda_ptr_"), buf_tmp)), *C_PointerType(C_Type(ev->symbol()->type())), *st_hedr);
st_hedr->insertStmtAfter(*makeSymbolDeclaration(reduction_ptr[ln]), *st_hedr);
delete[]buf_tmp;
if (s_loc_var)
reduction_loc_ptr[ln] = sgrid_loc;
else
reduction_loc_ptr[ln] = NULL;
// create loop_cuda_register_red()
stmt = new SgCExpStmt(*RegisterReduction_forAcross(s_loop_ref, s_tmp_var, reduction_ptr[ln], reduction_loc_ptr[ln]));
st_end->insertStmtBefore(*stmt, *st_hedr);
// create loop_red_init_()
stmt = new SgCExpStmt(*InitReduction(s_loop_ref, s_tmp_var, sred, s_loc_var));
st_end->insertStmtBefore(*stmt, *st_hedr);
}
mywarn(" end: out reduction section ");
}
mywarn("start: create vars ");
// create type for static arrays
SgArrayType *tpArr = new SgArrayType(*LongT);
SgValueExp *dimSize = new SgValueExp(loopV + acrossV + 2);
tpArr->addDimension(dimSize);
if (red_list)
{
red_blocks = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("num_of_red_blocks"), *LongT, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
}
lowI = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("lowI"), *LongT, *st_hedr);
s->setType(tpArr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
highI = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("highI"), *LongT, *st_hedr);
s->setType(tpArr);
addDeclExpList(s, stmt->expr(0));
idxI = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("idxI"), *LongT, *st_hedr);
s->setType(tpArr);
addDeclExpList(s, stmt->expr(0));
idxTypeInKernel = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("idxTypeInKernel"), *LongT, *st_hedr);
addDeclExpList(s, stmt->expr(0));
if (options.isOn(GPU_O0))
{
steps = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("steps"), *LongT, *st_hedr);
s->setType(tpArr);
addDeclExpList(s, stmt->expr(0));
}
bIdxs = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("idxs"), *LongT, *st_hedr);
s->setType(tpArr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
if (options.isOn(AUTO_TFM))
{
// create type for static arrays for addresingParams, size = 5
SgArrayType *tpArr = new SgArrayType(*LongT);
SgValueExp *dimSize = new SgValueExp(7);
tpArr->addDimension(dimSize);
addressingParams = new SgSymbol*[dvm_array_headers.size()];
outTypeOfTransformation = new SgSymbol*[dvm_array_headers.size()];
char *tmpS = new char[64];
for (size_t i = 0; i < dvm_array_headers.size(); ++i)
{
tmpS[0] = '\0';
strcat(tmpS, dvm_array_headers[i]);
strcat(tmpS, "_addressingParams");
addressingParams[i] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName(tmpS), *LongT, *st_hedr);
s->setType(tpArr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
tmpS[0] = '\0';
strcat(tmpS, dvm_array_headers[i]);
strcat(tmpS, "_outTypeOfTfm");
outTypeOfTransformation[i] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName(tmpS), *LongT, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
}
}
if (acrossV == 1) // ACROSS with one dependence: create variables
{
SgStatement **stmts = new SgStatement*[MIN(loopV, 3) * 2];
for (int k = 0, k1 = MIN(loopV, 3); k < MIN(loopV, 3); ++k, ++k1)
{
nums[k] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName(strcat(strcpy(new char[16], "num_"), s_cuda_var[k])), *LongT, *st_hedr);
stmts[k] = makeSymbolDeclaration(s);
num_elems[k] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName(strcat(strcpy(new char[16], "num_elem_"), s_cuda_var[k])), *LongT, *st_hedr);
stmts[k1] = makeSymbolDeclaration(s);
}
for (int k = 0; k < MIN(loopV, 3) * 2; ++k)
st_hedr->insertStmtAfter(*stmts[k], *st_hedr);
if (loopV > 3)
{
for (int k = 0; k < loopV - 2; ++k)
{
char *tmp = new char[10];
sprintf(tmp, "%d", k);
num_elems[k + 3] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName(strcat(strcpy(new char[16], "num_elem_z_"), tmp)), *LongT, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
delete[]tmp;
}
}
delete[]stmts;
}
else if (acrossV == 2) // ACROSS with two dependence: create variables
{
M = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("M"), *LongT, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
N = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("N"), *LongT, *st_hedr);
addDeclExpList(s, stmt->expr(0));
elem = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("elem"), *LongT, *st_hedr);
addDeclExpList(s, stmt->expr(0));
diag = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("diag"), *LongT, *st_hedr);
addDeclExpList(s, stmt->expr(0));
q = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("q"), *LongT, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
SgStatement **stmts = new SgStatement*[(MIN(loopV + 1, 3) - 1) * 2];
for (int k = 1, k1 = MIN(loopV + 1, 3) - 1; k < MIN(loopV + 1, 3); ++k, ++k1)
{
nums[k] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName(strcat(strcpy(new char[16], "num_"), s_cuda_var[k])), *LongT, *st_hedr);
stmts[k - 1] = makeSymbolDeclaration(s);
num_elems[k] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName(strcat(strcpy(new char[16], "num_elem_"), s_cuda_var[k])), *LongT, *st_hedr);
stmts[k1] = makeSymbolDeclaration(s);
}
nums[0] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("num_x"), *LongT, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
for (int i = 0; i < (MIN(loopV + 1, 3) - 1) * 2; ++i)
st_hedr->insertStmtAfter(*stmts[i], *st_hedr);
delete[]stmts;
if (loopV > 2)
{
for (int k = 0; k < loopV - 1; ++k)
{
char *tmp = new char[10];
sprintf(tmp, "%d", k);
num_elems[k + 3] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName(strcat(strcpy(new char[16], "num_elem_z_"), tmp)), *LongT, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
delete[]tmp;
}
}
}
else if (acrossV >= 3) // ACROSS with three dependence: create variables
{
nums[0] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("num_x"), *LongT, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
nums[1] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("num_y"), *LongT, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
if (loopV > 0)
{
nums[2] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("num_z"), *LongT, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
for (int k = 0; k < loopV; ++k)
{
char *tmp = new char[10];
sprintf(tmp, "%d", k);
num_elems[k] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName(strcat(strcpy(new char[16], "num_elem_z_"), tmp)), *LongT, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
delete[]tmp;
}
num_elems[loopV] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("num_elem_z"), *LongT, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
}
M1 = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("Mi"), *LongT, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
M2 = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("Mj"), *LongT, *st_hedr);
addDeclExpList(s, stmt->expr(0));
M3 = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("Mk"), *LongT, *st_hedr);
addDeclExpList(s, stmt->expr(0));
Emax = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("Emax"), *LongT, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
Emin = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("Emin"), *LongT, *st_hedr);
addDeclExpList(s, stmt->expr(0));
Allmin = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("Allmin"), *LongT, *st_hedr);
addDeclExpList(s, stmt->expr(0));
SE = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("SE"), *LongT, *st_hedr);
stmt = makeSymbolDeclarationWithInit(s, new SgValueExp(1));
st_hedr->insertStmtAfter(*stmt, *st_hedr);
diag = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("diag"), *LongT, *st_hedr);
stmt = makeSymbolDeclarationWithInit(s, new SgValueExp(1));
st_hedr->insertStmtAfter(*stmt, *st_hedr);
var1 = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("var1"), *LongT, *st_hedr);
stmt = makeSymbolDeclarationWithInit(s, new SgValueExp(1));
st_hedr->insertStmtAfter(*stmt, *st_hedr);
var2 = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("var2"), *LongT, *st_hedr);
stmt = makeSymbolDeclarationWithInit(s, new SgValueExp(0));
st_hedr->insertStmtAfter(*stmt, *st_hedr);
var3 = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("var3"), *LongT, *st_hedr);
stmt = makeSymbolDeclarationWithInit(s, new SgValueExp(0));
st_hedr->insertStmtAfter(*stmt, *st_hedr);
}
// create indxs
for (int i = 0; i < acrossV; ++i)
{
acrossBase[i] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName(strcat(strcpy(new char[20], "base_"),
loopAcrossSymb[i].symb->identifier())), *LongT, *st_hedr);
if (i == 0)
{
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
}
else
addDeclExpList(s, stmt->expr(0));
}
for (int i = 0; i < loopV; ++i)
{
loopBase[i] = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName(strcat(strcpy(new char[20], "base_"),
loopSymb[i].symb->identifier())), *LongT, *st_hedr);
addDeclExpList(s, stmt->expr(0));
}
// end
mywarn(" end: create vars ");
mywarn("start: create assigns");
s_blocks = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("blocks"), *t_dim3, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
s_threads = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("threads"), *t_dim3, *st_hedr);
addDeclExpList(s, stmt->expr(0));
shared_mem = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("shared_mem"), *LongT, *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
stream_t = s = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("stream"), *C_Derived_Type(s_cudaStream), *st_hedr);
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
for (s = uses_first, ln = 0; ln < uses_num; s = s->next(), ++ln) // uses
if (!(s->attributes() & USE_IN_BIT)) // passing to kernel scalar argument by reference
{
sdev = GpuScalarAdrSymbolInAdapter(s, st_hedr); // creating new symbol for address in device
if (!scalar_first)
{
scalar_first = sdev;
stmt = makeSymbolDeclaration(sdev);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
}
else
addDeclExpList(sdev, stmt->expr(0));
}
for (sl = acc_array_list, ln = 0; sl; sl = sl->next, ++ln)
{
s = GpuHeaderSymbolInAdapter(sl->symb, st_hedr);
if (!ln)
{
hgpu_first = s;
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
}
else
addDeclExpList(s, stmt->expr(0));
}
for (sl = acc_array_list, ln = 0; sl; sl = sl->next, ++ln)
{
s = GpuBaseSymbolInAdapter(sl->symb, st_hedr);
if (!ln)
{
base_first = s;
stmt = makeSymbolDeclaration(s);
st_hedr->insertStmtAfter(*stmt, *st_hedr);
}
else
addDeclExpList(s, stmt->expr(0));
}
num = ln;
/* call DvmType loop_cuda_autotransform_(DvmhLoopRef *InDvmhLoop, DvmType dvmDesc[]); */
if (options.isOn(AUTO_TFM))
{
s = h_first;
for (size_t i = 0; i < dvm_array_headers.size(); ++i, s = s->next())
{
stmt = new SgCExpStmt(*CudaAutoTransform(s_loop_ref, s));
st_end->insertStmtBefore(*stmt, *st_hedr);
if (!i)
stmt->addComment("// Autotransform all arrays");
}
}
/* -------- call dvmh_get_device_addr(long *deviceRef, void *variable) ----*/
for (s = uses_first, sdev = scalar_first, ln = 0; ln < uses_num; s = s->next(), ++ln) // uses
if (!(s->attributes() & USE_IN_BIT)) // passing to kernel scalar argument by reference
{
s_dev_num = doDeviceNumVar(st_hedr, first_exec, s_dev_num, s_loop_ref);
e = &SgAssignOp(*new SgVarRefExp(sdev), *GetDeviceAddr(s_dev_num, s));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
if (!ln)
stmt->addComment("// Get device addresses");
sdev = sdev->next();
}
/* -------- call dvmh_get_natural_base(long *deviceRef, long dvmDesc[] ) ----*/
for (sl = acc_array_list, s = h_first, sb = base_first, ln = 0; ln < num; sl = sl->next, s = s->next(), sb = sb->next(), ln++)
{
s_dev_num = doDeviceNumVar(st_hedr, first_exec, s_dev_num, s_loop_ref);
e = &SgAssignOp(*new SgVarRefExp(sb), *GetNaturalBase(s_dev_num, s));
stmt = new SgCExpStmt(*e);
SgStatement *cur = stmt;
st_end->insertStmtBefore(*stmt, *st_hedr);
if (IS_REMOTE_ACCESS_BUFFER(sl->symb)) // case of RTS2 interface
{
e = LoopGetRemoteBuf(s_loop_ref, nbuf--, s);
stmt = new SgCExpStmt(*e);
cur->insertStmtBefore(*stmt, *st_hedr);
}
if (!ln)
stmt->addComment("// Get natural bases");
}
/* call dvmh_fill_header_ex_(DvmType *deviceRef, void *base, DvmType dvmDesc[], DvmType dvmhDesc[], DvmType *outTypeOfTransformation, DvmType extendedParams[]);*/
if (options.isOn(AUTO_TFM))
{
for (s = h_first, sg = hgpu_first, sb = base_first, ln = 0; ln < num; s = s->next(), sg = sg->next(), sb = sb->next(), ln++)
{
stmt = new SgCExpStmt(*FillHeader_Ex(s_dev_num, sb, s, sg, outTypeOfTransformation[ln], addressingParams[ln]));
st_end->insertStmtBefore(*stmt, *st_hedr);
if (!ln)
stmt->addComment("// Fill device headers");
}
}
/* -------- call dvmh_fill_header_(long *deviceRef, void *base, long dvmDesc[], long dvmhDesc[]);----*/
else
{
for (s = h_first, sg = hgpu_first, sb = base_first, ln = 0; ln < num; s = s->next(), sg = sg->next(), sb = sb->next(), ln++)
{
e = FillHeader(s_dev_num, sb, s, sg);
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
if (!ln)
stmt->addComment("// Fill device headers");
}
}
/* -------- call loop_fill_bounds_(loop_ref, lowI, highI, idxI); ----*/
stmt = new SgCExpStmt(*FillBounds(s_loop_ref, lowI, highI, idxI));
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt->addComment("// Get bounds");
/* -------- call dvmh_change_filled_bounds(low, high, idx, n, dep, type_of_run, idxs); ----*/
if (acrossV == 1 || acrossV == 2 || acrossV >= 3)
{
char *name = new char[16];
name[0] = '\0';
sprintf(name, "%d", acrossV + loopV);
SgSymbol *tmp_1 = new SgSymbol(VARIABLE_NAME, name);
name[0] = '\0';
sprintf(name, "%d", acrossV);
SgSymbol *tmp_2 = new SgSymbol(VARIABLE_NAME, name);
stmt = new SgCExpStmt(*ChangeFilledBounds(lowI, highI, idxI, tmp_1, tmp_2, type_of_run, bIdxs));
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt->addComment("// Swap bounds");
delete[]name;
}
if (options.isOn(RTC))
{
/* -------- call loop_cuda_rtc_set_lang_(loop_ref, lang); ------------*/
if (options.isOn(C_CUDA))
stmt = new SgCExpStmt(*RtcSetLang(s_loop_ref, 1));
else
stmt = new SgCExpStmt(*RtcSetLang(s_loop_ref, 0));
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt->addComment("// Set CUDA language for launching kernels in RTC");
}
/* -------- call loop_guess_index_type_(loop_ref); ------------*/
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(idxTypeInKernel), *GuessIndexType(s_loop_ref)));
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt->addComment("// Guess index type in CUDA kernel");
SgFunctionCallExp *sizeofL = new SgFunctionCallExp(*createNewFunctionSymbol("sizeof"));
SgFunctionCallExp *sizeofLL = new SgFunctionCallExp(*createNewFunctionSymbol("sizeof"));
SgFunctionCallExp *sizeofI = new SgFunctionCallExp(*createNewFunctionSymbol("sizeof"));
sizeofL->addArg(*new SgKeywordValExp("long")); //addArg(*new SgVarRefExp(new SgSymbol(VARIABLE_NAME, "long")));
sizeofLL->addArg(*new SgKeywordValExp("long long")); //addArg(*new SgVarRefExp(new SgSymbol(VARIABLE_NAME, "long long")));
sizeofI->addArg(*new SgKeywordValExp("int")); //addArg(*new SgVarRefExp(new SgSymbol(VARIABLE_NAME, "int")));
stmt = new SgIfStmt(SgEqOp(*new SgVarRefExp(idxTypeInKernel), *new SgVarRefExp(*new SgSymbol(VARIABLE_NAME, "rt_LONG")))
&&
SgEqOp(*sizeofL, *sizeofI),
*new SgCExpStmt(SgAssignOp(*new SgVarRefExp(idxTypeInKernel), *new SgVarRefExp(*new SgSymbol(VARIABLE_NAME, "rt_INT")))));
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt = new SgIfStmt(SgEqOp(*new SgVarRefExp(idxTypeInKernel), *new SgVarRefExp(*new SgSymbol(VARIABLE_NAME, "rt_LONG")))
&&
SgEqOp(*sizeofL, *sizeofLL),
*new SgCExpStmt(SgAssignOp(*new SgVarRefExp(idxTypeInKernel), *new SgVarRefExp(*new SgSymbol(VARIABLE_NAME, "rt_LLONG")))));
st_end->insertStmtBefore(*stmt, *st_hedr);
/* -------- call loop_cuda_get_config_(loop_ref, &shared_mem, &reg_per_th, &threads, &stream, &shared_mem); ------------*/
SgFunctionCallExp *tmpFunc = new SgFunctionCallExp(*createNewFunctionSymbol("dim3"));
int x = 0, y = 0, z = 0;
getDefaultCudaBlock(x, y, z, acrossV, loopV);
tmpFunc->addArg(*new SgValueExp(x));
tmpFunc->addArg(*new SgValueExp(y));
tmpFunc->addArg(*new SgValueExp(z));
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(s_threads), *tmpFunc));
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt->addComment("// Get CUDA configuration params");
if (loopV > 0 && red_list)
{
//OLD VAR
//stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*shared_mem), *new SgValueExp(getSizeOf())));
//st_end->insertStmtBefore(*stmt, *st_hedr);
int shared_mem_count = getSizeOf();
if (shared_mem_count)
{
if (!options.isOn(C_CUDA))
{
e = &SgAssignOp(*new SgVarRefExp(shared_mem), *new SgValueExp(shared_mem_count));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
else
{
std::string preproc = std::string("#ifdef ") + fermiPreprocDir;
char* tmp = new char[preproc.size() + 1];
strcpy(tmp, preproc.data());
st_end->insertStmtBefore(*PreprocessorDirective(tmp), *st_hedr);
e = &SgAssignOp(*new SgVarRefExp(shared_mem), *new SgValueExp(shared_mem_count));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
st_end->insertStmtBefore(*PreprocessorDirective("#else"), *st_hedr);
e = &SgAssignOp(*new SgVarRefExp(shared_mem), *new SgValueExp(0));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
st_end->insertStmtBefore(*PreprocessorDirective("#endif"), *st_hedr);
}
}
}
else
{
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*shared_mem), *new SgValueExp(0)));
st_end->insertStmtBefore(*stmt, *st_hedr);
}
string define_name_int = kernel_symb->identifier();
string define_name_long = kernel_symb->identifier();
define_name_int += "_int_regs";
define_name_long += "_llong_regs";
SgStatement *config_int = new SgCExpStmt(*GetConfig(s_loop_ref, shared_mem, new SgSymbol(VARIABLE_NAME, define_name_int.c_str()), s_threads, stream_t, shared_mem));
SgStatement *config_long = new SgCExpStmt(*GetConfig(s_loop_ref, shared_mem, new SgSymbol(VARIABLE_NAME, define_name_long.c_str()), s_threads, stream_t, shared_mem));
stmt = new SgIfStmt(SgEqOp(*new SgVarRefExp(*idxTypeInKernel), *new SgVarRefExp(*new SgSymbol(VARIABLE_NAME, "rt_INT"))), *config_int, *config_long);
st_end->insertStmtBefore(*stmt, *st_hedr);
// collect names, all _REGS constant
RGname_list = AddNewToSymbList(RGname_list, new SgSymbol(VARIABLE_NAME, define_name_int.c_str(), C_DvmType(), st_hedr));
allRegNames.push_back(new SgSymbol(VARIABLE_NAME, define_name_int.c_str()));
RGname_list = AddNewToSymbList(RGname_list, new SgSymbol(VARIABLE_NAME, define_name_long.c_str(), C_DvmType(), st_hedr));
allRegNames.push_back(new SgSymbol(VARIABLE_NAME, define_name_long.c_str()));
tmpFunc = new SgFunctionCallExp(*createNewFunctionSymbol("dim3"));
if (options.isOn(SPEED_TEST_L0))
{
tmpFunc->addArg(*new SgVarRefExp(s_i));
tmpFunc->addArg(*new SgVarRefExp(s_k));
tmpFunc->addArg(*new SgValueExp(z));
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(s_threads), *tmpFunc));
st_end->insertStmtBefore(*stmt, *st_hedr);
}
if (acrossV == 1) // ACROSS with one dependence: create variables
{
//SgStatement **stmts = new SgStatement*[MIN(loopV, 3) * 2];
for (int k = 0; k < MIN(loopV, 3); ++k)
{
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*nums[k]), *new SgRecordRefExp(*s_threads, (char*)s_cuda_var[k])));
st_end->insertStmtBefore(*stmt, *st_hedr);
}
}
else if (acrossV == 2) // ACROSS with two dependence: create variables
{
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*nums[0]), *new SgRecordRefExp(*s_threads, "x")));
st_end->insertStmtBefore(*stmt, *st_hedr);
for (int k = 1; k < MIN(loopV + 1, 3); ++k)
{
if (k == 1)
{
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*nums[k]), *new SgRecordRefExp(*s_threads, "y")));
st_end->insertStmtBefore(*stmt, *st_hedr);
}
else
{
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*nums[k]), *new SgRecordRefExp(*s_threads, "z")));
st_end->insertStmtBefore(*stmt, *st_hedr);
}
}
}
else if (acrossV >= 3) // ACROSS with three dependence: create variables
{
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*nums[0]), *new SgRecordRefExp(*s_threads, "x")));
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*nums[1]), *new SgRecordRefExp(*s_threads, "y")));
st_end->insertStmtBefore(*stmt, *st_hedr);
if (loopV > 0)
{
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*nums[2]), *new SgRecordRefExp(*s_threads, "z")));
st_end->insertStmtBefore(*stmt, *st_hedr);
}
}
mywarn(" end: create assigns");
espec = CreateBlocksThreadsSpec(shared_mem, s_blocks, s_threads, stream_t);
if (acrossV == 1) // ACROSS with one dependence: generate method
{
mywarn("start: in start across 1");
SgFunctionCallExp *f = new SgFunctionCallExp(*createNewFunctionSymbol("dim3"));
f->addArg(*new SgValueExp(1));
f->addArg(*new SgValueExp(1));
f->addArg(*new SgValueExp(1));
e = &SgAssignOp(*new SgVarRefExp(s_blocks), *f);
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt->addComment("//Start method");
e = &SgAssignOp(*new SgVarRefExp(acrossBase[0]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[0].len)));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
{
int *idx = new int[loopV];
SgExpression *mult_z = NULL;
for (int k = 0; k < MIN(2, loopV); ++k)
{
SgStatement *st1;
idx[k] = loopSymb[k].len;
e = &SgAssignOp(*new SgVarRefExp(loopBase[k]), *new SgArrayRefExp(*lowI, *new SgValueExp(idx[k])));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
SgFunctionCallExp *f1 = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
f1->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(idx[k])));
funcCall->addArg((*new SgArrayRefExp(*lowI, *new SgValueExp(idx[k])) - *new SgArrayRefExp(*highI, *new SgValueExp(idx[k]))));
e = &(*funcCall + *f1);
st1 = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*num_elems[k]), *e / *f1));
st_end->insertStmtBefore(*st1, *st_hedr);
st1 = new SgCExpStmt(SgAssignOp(*new SgRecordRefExp(*s_blocks, (char *)s_cuda_var[k]),
*new SgVarRefExp(*num_elems[k]) / *new SgVarRefExp(nums[k]) +
SgNeqOp(*new SgVarRefExp(*num_elems[k]) % *new SgVarRefExp(nums[k]), *new SgValueExp(0))));
st_end->insertStmtBefore(*st1, *st_hedr);
e = &SgAssignOp(*new SgRecordRefExp(*s_threads, (char *)s_cuda_var[k]), *new SgVarRefExp(*nums[k]));
st_end->insertStmtBefore(*new SgCExpStmt(*e), *st_hedr);
}
if (loopV > 3)
{
for (int k = 2; k < loopV; ++k)
{
SgStatement *st1;
idx[k] = loopSymb[k].len;
e = &SgAssignOp(*new SgVarRefExp(loopBase[k]), *new SgArrayRefExp(*lowI, *new SgValueExp(idx[k])));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
SgFunctionCallExp *f1 = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
f1->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(idx[k])));
funcCall->addArg((*new SgArrayRefExp(*lowI, *new SgValueExp(idx[k])) - *new SgArrayRefExp(*highI, *new SgValueExp(idx[k]))));
e = &(*funcCall + *f1);
st1 = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*num_elems[k + 1]), *e / *f1));
st_end->insertStmtBefore(*st1, *st_hedr);
if (k == 2)
mult_z = &(*new SgVarRefExp(*num_elems[k + 1]));
else
mult_z = &((*mult_z) * (*new SgVarRefExp(*num_elems[k + 1])));
}
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*num_elems[2]), *mult_z));
st_end->insertStmtBefore(*stmt, *st_hedr);
}
else if (loopV > 2)
{
SgStatement *st1;
int k = 2;
idx[k] = loopSymb[k].len;
e = &SgAssignOp(*new SgVarRefExp(loopBase[k]), *new SgArrayRefExp(*lowI, *new SgValueExp(idx[k])));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
SgFunctionCallExp *f1 = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
f1->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(idx[k])));
funcCall->addArg((*new SgArrayRefExp(*lowI, *new SgValueExp(idx[k])) - *new SgArrayRefExp(*highI, *new SgValueExp(idx[k]))));
e = &(*funcCall + *f1);
st1 = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(*num_elems[k]), *e / *f1));
st_end->insertStmtBefore(*st1, *st_hedr);
}
if (loopV > 2)
{
stmt = new SgCExpStmt(SgAssignOp(*new SgRecordRefExp(*s_blocks, (char *)s_cuda_var[2]),
*new SgVarRefExp(*num_elems[2]) / *new SgVarRefExp(nums[2]) +
SgNeqOp(*new SgVarRefExp(*num_elems[2]) % *new SgVarRefExp(nums[2]), *new SgValueExp(0))));
st_end->insertStmtBefore(*stmt, *st_hedr);
e = &SgAssignOp(*new SgRecordRefExp(*s_threads, (char *)s_cuda_var[2]), *new SgVarRefExp(*nums[2]));
st_end->insertStmtBefore(*new SgCExpStmt(*e), *st_hedr);
}
delete[]idx;
}
mywarn(" end: out start across 1");
if (red_list)
{
mywarn("strat: in red section");
if (loopV != 0)
{
// (blocks.x * blocks.y * blocks.z * threads.x * threads.y * threads.z) / warpSize)
e = &SgAssignOp(*new SgVarRefExp(*red_blocks),
(*new SgRecordRefExp(*s_blocks, "x") * *new SgRecordRefExp(*s_blocks, "y") * *new SgRecordRefExp(*s_blocks, "z") *
*new SgRecordRefExp(*s_threads, "x") * *new SgRecordRefExp(*s_threads, "y") * *new SgRecordRefExp(*s_threads, "z"))
/ *new SgValueExp(warpSize));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
else
{
e = &SgAssignOp(*new SgVarRefExp(*red_blocks), *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(s_tmp_var_1), *new SgValueExp(1)));
st_end->insertStmtBefore(*stmt, *st_hedr);
for (er = red_list, ln = 0; er; er = er->rhs(), ++ln)
{
e = &SgAssignOp(*new SgVarRefExp(s_tmp_var), *new SgValueExp(ln+1));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt = new SgCExpStmt(*PrepareReduction(s_loop_ref, s_tmp_var, red_blocks, s_tmp_var_1));
st_end->insertStmtBefore(*stmt, *st_hedr);
}
mywarn(" end: out red section");
}
e = &SgAssignOp(*new SgVarRefExp(acrossBase[0]), *new SgVarRefExp(acrossBase[0])
+ *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[0].len)));
stmt = new SgCExpStmt(*e);
if (options.isOn(C_CUDA) || options.isOn(GPU_O0) == false)
{
SgFunctionCallExp *f1 = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
SgFunctionCallExp *f2 = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
f1->addArg(*new SgArrayRefExp(*highI, *new SgValueExp(loopAcrossSymb[0].len)) - *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[0].len)));
f2->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[0].len)));
e = &SgAssignOp(*new SgArrayRefExp(*highI, *new SgValueExp(loopAcrossSymb[0].len)), (*f1 + *f2) / *f2);
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
if (options.isOn(GPU_O0))
{
e = &SgAssignOp(*new SgArrayRefExp(*steps, *new SgArrayRefExp(*bIdxs, *new SgValueExp(0))), *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[0].len)));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
for (int i = 0; i < loopV; ++i)
{
e = &SgAssignOp(*new SgArrayRefExp(*steps, *new SgArrayRefExp(*bIdxs, *new SgValueExp((int)(i + 1)))), *new SgValueExp(0));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
}
mywarn("start: in adding args section");
/* args for kernel */
{
funcCallKernel = CallKernel(kernel_symb, espec);
for (sg = hgpu_first, sb = base_first, sl = acc_array_list, ln = 0; ln<num; sg = sg->next(), sb = sb->next(), sl = sl->next, ln++)
{
e = new SgCastExp(*C_PointerType(options.isOn(C_CUDA) ? C_Type(sl->symb->type()) : new SgDescriptType(*SgTypeChar(), BIT_SIGNED)), *new SgVarRefExp(sb));
funcCallKernel->addArg(*e);
for (int i = NumberOfCoeffs(sg); i > 0; i--)
funcCallKernel->addArg(*new SgArrayRefExp(*sg, *new SgValueExp(i)));
}
if (red_list)
insertReductionArgs(reduction_ptr, reduction_loc_ptr, reduction_symb, reduction_loc_symb, funcCallKernel, red_blocks, has_red_array);
for (int k = 0; k < MIN(loopV, 2); ++k)
funcCallKernel->addArg(*new SgVarRefExp(num_elems[k]));
if (loopV == 3)
funcCallKernel->addArg(*new SgVarRefExp(num_elems[2]));
else if (loopV > 3)
for (int k = 3; k < loopV + 1; ++k)
funcCallKernel->addArg(*new SgVarRefExp(num_elems[k]));
for (int i = 0; i < acrossV; ++i)
{
if (i == 0 && options.isOn(RTC)) // across base is modifiable value
{
SgVarRefExp *toAdd = new SgVarRefExp(acrossBase[i]);
toAdd->addAttribute(RTC_NOT_REPLACE);
funcCallKernel->addArg(*toAdd);
}
else
funcCallKernel->addArg(*new SgVarRefExp(acrossBase[i]));
}
for (int i = 0; i < loopV; ++i)
funcCallKernel->addArg(*new SgVarRefExp(loopBase[i]));
for (int i = 0; i < acrossV; ++i)
funcCallKernel->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[i].len)));
for (int i = 0; i < loopV; ++i)
funcCallKernel->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(loopSymb[i].len)));
for (s = uses_first, sdev = scalar_first, ln = 0; ln < uses_num; s = s->next(), ln++) // uses
{
if (s->attributes() & USE_IN_BIT)
funcCallKernel->addArg(SgDerefOp(*new SgVarRefExp(*s))); // passing argument by value to kernel
else
{ // passing argument by reference to kernel
SgType *tp = NULL;
if (s->type()->hasBaseType())
tp = s->type()->baseType();
else
tp = s->type();
e = new SgCastExp(*C_PointerType(options.isOn(C_CUDA) ? tp : new SgDescriptType(*SgTypeChar(), BIT_SIGNED)), *new SgVarRefExp(sdev));
funcCallKernel->addArg(*e);
sdev = sdev->next();
}
}
funcCallKernel->addArg(*new SgVarRefExp(type_of_run));
for (int i = 0; i < acrossV + loopV; ++i)
funcCallKernel->addArg(*new SgArrayRefExp(*bIdxs, *new SgValueExp(i)));
char *cond_ = new char[strlen("cond_") + strlen(loopAcrossSymb[0].symb->identifier()) + 1];
cond_[0] = '\0';
strcat(cond_, "cond_");
strcat(cond_, loopAcrossSymb[0].symb->identifier());
if (options.isOn(GPU_O0))
{
funcCallKernel->addArg(*new SgArrayRefExp(*highI, *new SgValueExp(loopAcrossSymb[0].len)));
for (int i = loopV - 1; i >= 0; i--)
funcCallKernel->addArg(*new SgArrayRefExp(*steps, *new SgValueExp(loopSymb[i].len)));
funcCallKernel->addArg(*new SgArrayRefExp(*steps, *new SgValueExp(loopAcrossSymb[0].len)));
}
}
mywarn(" end: out adding args section");
stmt = createKernelCallsInCudaHandler(funcCallKernel, s_loop_ref, idxTypeInKernel, s_blocks);
if (options.isOn(GPU_O0))
st_end->insertStmtBefore(*stmt, *st_hedr);
else
{
SgSymbol *tmpV = new SgSymbol(VARIABLE_NAME, "int tmpV");
SgSymbol *tmpV1 = new SgSymbol(VARIABLE_NAME, "tmpV");
SgExprListExp *expr = new SgExprListExp();
expr->setLhs(SgAssignOp(*new SgVarRefExp(acrossBase[0]), *new SgVarRefExp(acrossBase[0]) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[0].len))));
expr->setRhs(new SgExprListExp());
expr->rhs()->setLhs(SgAssignOp(*new SgVarRefExp(tmpV1), *new SgVarRefExp(tmpV1) + *new SgValueExp(1)));
SgForStmt *simple;
simple = new SgForStmt(&SgAssignOp(*new SgVarRefExp(tmpV), *new SgValueExp(0)), &(*new SgVarRefExp(tmpV1) < *new SgArrayRefExp(*highI, *new SgValueExp(loopAcrossSymb[0].len))), expr, stmt);
st_end->insertStmtBefore(*simple);
}
}
else if (acrossV == 2) // ACROSS with two dependence: generate method
{
// attention!! need to add flag for support all cases
if (loopV != 0)
{
SgSymbol *tmp = nums[0];
nums[0] = nums[1];
nums[1] = tmp;
const char *tmpS = s_cuda_var[0];
s_cuda_var[0] = s_cuda_var[1];
s_cuda_var[1] = tmpS;
}
mywarn("strat: alloc mem");
{
int idx[2];
SgStatement *st1, *st2;
idx[1] = loopAcrossSymb[1].len;
idx[0] = loopAcrossSymb[0].len;
SgFunctionCallExp *f1 = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
f1->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(idx[0])));
funcCall->addArg((*new SgArrayRefExp(*lowI, *new SgValueExp(idx[0])) - *new SgArrayRefExp(*highI, *new SgValueExp(idx[0]))));
e = &(*funcCall + *new SgValueExp(1));
st1 = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(M), *e / *f1 + SgNeqOp(*e % *f1, *new SgValueExp(0))));
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
f1 = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
f1->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(idx[1])));
funcCall->addArg((*new SgArrayRefExp(*lowI, *new SgValueExp(idx[1])) - *new SgArrayRefExp(*highI, *new SgValueExp(idx[1]))));
e = &(*funcCall + *new SgValueExp(1));
st2 = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(N), *e / *f1 + SgNeqOp(*e % *f1, *new SgValueExp(0))));
st_end->insertStmtBefore(*st1, *st_hedr);
st_end->insertStmtBefore(*st2, *st_hedr);
st1->addComment("// Count used variables");
}
// count num_elem_y and num_elem_z
if (loopV > 0)
{
SgFunctionCallExp *tempF = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcCall->addArg((*new SgArrayRefExp(*lowI, *new SgValueExp(loopSymb[0].len)) - *new SgArrayRefExp(*highI, *new SgValueExp(loopSymb[0].len))));
tempF->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(loopSymb[0].len)));
e = &SgAssignOp(*new SgVarRefExp(num_elems[1]), (*funcCall + *new SgValueExp(1)) / *tempF + SgNeqOp((*funcCall + *new SgValueExp(1)) % *tempF, *new SgValueExp(0)));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
SgExpression **e_z = new SgExpression*[loopV - 1];
for (int k = 0; k < loopV - 1; ++k)
{
SgFunctionCallExp *tempF = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcCall->addArg((*new SgArrayRefExp(*lowI, *new SgValueExp(loopSymb[k + 1].len)) - *new SgArrayRefExp(*highI, *new SgValueExp(loopSymb[k + 1].len))));
tempF->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(loopSymb[k + 1].len)));
e_z[k] = &((*funcCall + *new SgValueExp(1)) / *tempF + SgNeqOp((*funcCall + *new SgValueExp(1)) % *tempF, *new SgValueExp(0)));
}
if (loopV > 2)
{
for (int k = 0; k < loopV - 1; ++k)
{
e = &SgAssignOp(*new SgVarRefExp(num_elems[k + 3]), *e_z[k]);
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
if (k == 0)
e_z[0] = new SgVarRefExp(num_elems[k + 3]);
else
e_z[0] = &(*(e_z[0]) * (*new SgVarRefExp(num_elems[k + 3])));
}
}
if (loopV > 1)
{
e = &SgAssignOp(*new SgVarRefExp(num_elems[2]), *e_z[0]);
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
delete[]e_z;
}
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("dim3"));
funcCall->addArg(*new SgVarRefExp(nums[0]));
for (int k = 1; k < MIN(loopV + 1, 3); ++k)
{
funcCall->addArg(*new SgVarRefExp(nums[k]));
}
e = &SgAssignOp(*new SgVarRefExp(s_blocks), *funcCall);
st_end->insertStmtBefore(*new SgCExpStmt(*e), *st_hedr);
for (int k = 1; k < MIN(loopV + 1, 3); ++k)
{
e = new SgExpression(NOTEQL_OP, &(*new SgVarRefExp(num_elems[k]) % *new SgVarRefExp(nums[k])), new SgValueExp(0), s);
e = &SgAssignOp(*new SgRecordRefExp(*s_blocks, (char*)s_cuda_var[k]), *new SgVarRefExp(num_elems[k]) / *new SgVarRefExp(nums[k]) + *e);
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("MIN"));
funcCall->addArg(*new SgVarRefExp(M));
funcCall->addArg(*new SgVarRefExp(N));
e = &SgAssignOp(*new SgVarRefExp(q), *funcCall);
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
mywarn(" end: alloc mem");
if (red_list)
{
mywarn("strat: in red section");
if (loopV == 0)
{
e = &SgAssignOp(*new SgVarRefExp(*red_blocks), *new SgVarRefExp(q));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
else if (loopV == 1)
{
e = &SgAssignOp(*new SgVarRefExp(*red_blocks), (*new SgVarRefExp(q) / *new SgVarRefExp(nums[0]) +
SgNeqOp(*new SgVarRefExp(q) % *new SgVarRefExp(nums[0]), *new SgValueExp(0))) *
*new SgRecordRefExp(*s_blocks, "y") *
*new SgRecordRefExp(*s_threads, "x") * *new SgRecordRefExp(*s_threads, "y") * *new SgRecordRefExp(*s_threads, "z") / *new SgValueExp(warpSize));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
else
{
e = &SgAssignOp(*new SgVarRefExp(*red_blocks), (*new SgVarRefExp(q) / *new SgVarRefExp(nums[0]) +
SgNeqOp(*new SgVarRefExp(q) % *new SgVarRefExp(nums[0]), *new SgValueExp(0))) *
*new SgRecordRefExp(*s_blocks, "y") * *new SgRecordRefExp(*s_blocks, "z") *
*new SgRecordRefExp(*s_threads, "x") * *new SgRecordRefExp(*s_threads, "y") * *new SgRecordRefExp(*s_threads, "z") / *new SgValueExp(warpSize));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
e = &SgAssignOp(*new SgVarRefExp(s_tmp_var_1), *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
for (er = red_list, ln = 0; er; er = er->rhs(), ++ln)
{
e = &SgAssignOp(*new SgVarRefExp(s_tmp_var), *new SgValueExp(ln+1));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt = new SgCExpStmt(*PrepareReduction(s_loop_ref, s_tmp_var, red_blocks, s_tmp_var_1));
st_end->insertStmtBefore(*stmt, *st_hedr);
}
mywarn(" end: out red section");
}
mywarn("strat: init bases");
// init bases
for (int i = 0; i < acrossV; ++i)
{
e = &SgAssignOp(*new SgVarRefExp(acrossBase[i]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[i].len)));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
if (i == 0)
stmt->addComment("// Start SOR method here");
}
for (int i = 0; i < loopV; ++i)
{
e = &SgAssignOp(*new SgVarRefExp(loopBase[i]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopSymb[i].len)));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
mywarn(" end: init bases");
mywarn("start: block1");
e = &SgAssignOp(*new SgVarRefExp(diag), *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
e = &SgAssignOp(*new SgVarRefExp(diag), *new SgVarRefExp(diag) + *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
SgWhileStmt *while_st = new SgWhileStmt(*new SgVarRefExp(diag) <= *new SgVarRefExp(q), *stmt);
st_end->insertStmtBefore(*while_st, *st_hedr);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[0]), *new SgVarRefExp(acrossBase[0]) +
*new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[0].len)));
stmt = new SgCExpStmt(*e);
while_st->insertStmtAfter(*stmt);
/* --------- add argument list to kernel call ----*/
createArgsForKernelForTwoDeps(funcCallKernel, kernel_symb, espec, sg, hgpu_first, sb, base_first, sl, ln, num, e,
reduction_ptr, reduction_loc_ptr, reduction_symb, reduction_loc_symb, red_blocks,
has_red_array, diag, loopV, num_elems, acrossV, acrossBase, loopBase, idxI,
loopAcrossSymb, loopSymb, s, uses_first, sdev, scalar_first, uses_num, dvm_array_headers,
addressingParams, outTypeOfTransformation, type_of_run, bIdxs);
stmt = createKernelCallsInCudaHandler(funcCallKernel, s_loop_ref, idxTypeInKernel, s_blocks);
while_st->insertStmtAfter(*stmt);
mywarn(" end: block1");
mywarn("start: block2");
ex = new SgExpression(NOTEQL_OP, &(*new SgVarRefExp(diag) % *new SgVarRefExp(nums[0])), new SgValueExp(0), s);
e = &SgAssignOp(*new SgRecordRefExp(*s_blocks, (char*)s_cuda_var[0]), *new SgVarRefExp(diag) / *new SgVarRefExp(nums[0]) + *ex);
stmt = new SgCExpStmt(*e);
while_st->insertStmtAfter(*stmt);
e = &SgAssignOp(*new SgVarRefExp(*diag), *new SgVarRefExp(*diag) + *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcCall->addArg(*new SgVarRefExp(*M) - *new SgVarRefExp(*N));
SgWhileStmt *while_st1 = new SgWhileStmt(*new SgVarRefExp(diag) < *funcCall, *stmt);
SgWhileStmt *while_st2 = new SgWhileStmt(*new SgVarRefExp(diag) < *funcCall, stmt->copy());
SgWhileStmt *while_st3 = new SgWhileStmt(*new SgVarRefExp(diag) < *new SgVarRefExp(M) + *new SgVarRefExp(N), stmt->copy());
SgWhileStmt *while_st4 = new SgWhileStmt(*new SgVarRefExp(diag) < *new SgVarRefExp(M) + *new SgVarRefExp(N), stmt->copy());
SgIfStmt *if_st = new SgIfStmt(*new SgVarRefExp(*N) < *new SgVarRefExp(*M), *while_st3, *while_st4);
st_end->insertStmtBefore(*if_st, *st_hedr);
e = &SgAssignOp(*new SgVarRefExp(*elem), *new SgVarRefExp(q) - *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
if_st->insertStmtAfter(*stmt);
if_st->falseBody()->insertStmtBefore(stmt->copy());
if_st->falseBody()->insertStmtBefore(*while_st2);
if_st->falseBody()->insertStmtBefore(*new SgCExpStmt(SgAssignOp(*new SgVarRefExp(diag), *new SgValueExp(0))));
e = &SgAssignOp(*new SgVarRefExp(acrossBase[0]), *new SgVarRefExp(acrossBase[0])
- *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[0].len)));
stmt = new SgCExpStmt(*e);
if_st->insertStmtAfter(*stmt);
if_st->falseBody()->insertStmtBefore(stmt->copy());
e = &SgAssignOp(*new SgVarRefExp(diag), *new SgVarRefExp(q) + *funcCall + *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
if_st->lexNext()->insertStmtAfter(*stmt);
if_st->falseBody()->lexNext()->lexNext()->lexNext()->insertStmtAfter(stmt->copy(), *if_st);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[1]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[1].len)) +
*new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len)));
stmt = new SgCExpStmt(*e);
if_st->insertStmtAfter(*stmt);
if_st->falseBody()->insertStmtBefore(stmt->copy());
if_st->insertStmtAfter(*while_st1);
if_st->insertStmtAfter(*new SgCExpStmt(SgAssignOp(*new SgVarRefExp(diag), *new SgValueExp(0))));
e = &SgAssignOp(*new SgVarRefExp(acrossBase[0]), *new SgVarRefExp(acrossBase[0]) +
*new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[0].len)));
stmt = new SgCExpStmt(*e);
while_st1->insertStmtAfter(*stmt);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[1]), *new SgVarRefExp(acrossBase[1]) +
*new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len)));
stmt = new SgCExpStmt(*e);
while_st2->insertStmtAfter(*stmt);
while_st3->insertStmtAfter(stmt->copy());
while_st4->insertStmtAfter(stmt->copy());
mywarn(" end: block2");
mywarn("start: block3");
e = &SgAssignOp(*new SgVarRefExp(*elem), *new SgVarRefExp(*elem) - *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
while_st3->lastExecutable()->insertStmtAfter(*stmt);
while_st4->lastExecutable()->insertStmtAfter(stmt->copy());
/* --------- add argument list to kernel call ----*/
createArgsForKernelForTwoDeps(funcCallKernel, kernel_symb, espec, sg, hgpu_first, sb, base_first, sl, ln, num, e,
reduction_ptr, reduction_loc_ptr, reduction_symb, reduction_loc_symb, red_blocks,
has_red_array, q, loopV, num_elems, acrossV, acrossBase, loopBase, idxI,
loopAcrossSymb, loopSymb, s, uses_first, sdev, scalar_first, uses_num, dvm_array_headers,
addressingParams, outTypeOfTransformation, type_of_run, bIdxs);
while_st1->insertStmtAfter(*createKernelCallsInCudaHandler(funcCallKernel, s_loop_ref, idxTypeInKernel, s_blocks));
while_st2->insertStmtAfter(*createKernelCallsInCudaHandler(funcCallKernel, s_loop_ref, idxTypeInKernel, s_blocks));
mywarn(" end: block3");
/* --------- add argument list to kernel call ----*/
createArgsForKernelForTwoDeps(funcCallKernel, kernel_symb, espec, sg, hgpu_first, sb, base_first, sl, ln, num, e,
reduction_ptr, reduction_loc_ptr, reduction_symb, reduction_loc_symb, red_blocks,
has_red_array, elem, loopV, num_elems, acrossV, acrossBase, loopBase, idxI,
loopAcrossSymb, loopSymb, s, uses_first, sdev, scalar_first, uses_num, dvm_array_headers,
addressingParams, outTypeOfTransformation, type_of_run, bIdxs);
while_st3->insertStmtAfter(*createKernelCallsInCudaHandler(funcCallKernel, s_loop_ref, idxTypeInKernel, s_blocks));
while_st4->insertStmtAfter(*createKernelCallsInCudaHandler(funcCallKernel, s_loop_ref, idxTypeInKernel, s_blocks));
ex = new SgExpression(MOD_OP, new SgVarRefExp(q), new SgVarRefExp(nums[0]), s);
ex = new SgExpression(NOTEQL_OP, ex, new SgValueExp(0), s);
e = &SgAssignOp(*new SgRecordRefExp(*s_blocks, (char*)s_cuda_var[0]), *new SgVarRefExp(q) / *new SgVarRefExp(nums[0]) + *ex);
while_st1->insertStmtAfter(*new SgCExpStmt(*e));
while_st2->insertStmtAfter(*new SgCExpStmt(*e));
SgExpression *ex1 = &(*new SgVarRefExp(*elem));
ex = new SgExpression(MOD_OP, ex1, new SgVarRefExp(nums[0]), s);
ex = new SgExpression(NOTEQL_OP, ex, new SgValueExp(0), s);
e = &SgAssignOp(*new SgRecordRefExp(*s_blocks, (char*)s_cuda_var[0]), *ex1 / *new SgVarRefExp(nums[0]) + *ex);
while_st3->insertStmtAfter(*new SgCExpStmt(*e));
while_st4->insertStmtAfter(*new SgCExpStmt(*e));
}
else if (acrossV >= 3) // ACROSS with three or more dependence: generate method
{
// attention!! need to add flag for support all cases
if (loopV != 0)
{
SgSymbol *tmp = nums[0];
nums[0] = nums[2];
nums[2] = tmp;
const char *tmpS = s_cuda_var[0];
s_cuda_var[0] = s_cuda_var[2];
s_cuda_var[2] = tmpS;
}
SgExpression* firstElem = new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[0].len));
SgExpression* secondElem = new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len));
SgIfStmt* if_stSwap = new SgIfStmt(*new SgVarRefExp(M1) > *new SgVarRefExp(M2), *new SgCExpStmt(*firstElem ^= *secondElem ^= *firstElem ^= *secondElem));
/* --------- add argument list to kernel call ----*/
{
funcCallKernel = CallKernel(kernel_symb, espec);
for (sg = hgpu_first, sb = base_first, sl = acc_array_list, ln = 0; ln<num; sg = sg->next(), sb = sb->next(), sl = sl->next, ln++)
{
e = new SgCastExp(*C_PointerType(options.isOn(C_CUDA) ? C_Type(sl->symb->type()) : new SgDescriptType(*SgTypeChar(), BIT_SIGNED)), *new SgVarRefExp(sb));
funcCallKernel->addArg(*e);
for (int i = NumberOfCoeffs(sg); i>0; i--)
funcCallKernel->addArg(*new SgArrayRefExp(*sg, *new SgValueExp(i)));
}
if (red_list)
insertReductionArgs(reduction_ptr, reduction_loc_ptr, reduction_symb, reduction_loc_symb, funcCallKernel, red_blocks, has_red_array);
for (int i = 0; i < acrossV; ++i)
{
if (options.isOn(RTC)) // across base is modifiable value
{
SgVarRefExp *toAdd = new SgVarRefExp(acrossBase[i]);
toAdd->addAttribute(RTC_NOT_REPLACE);
funcCallKernel->addArg(*toAdd);
}
else
funcCallKernel->addArg(*new SgVarRefExp(acrossBase[i]));
}
for (int i = 0; i < loopV; ++i)
funcCallKernel->addArg(*new SgVarRefExp(loopBase[i]));
for (int i = 0; i < acrossV; ++i)
funcCallKernel->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[i].len)));
for (int i = 0; i < loopV; ++i)
funcCallKernel->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(loopSymb[i].len)));
for (s = uses_first, sdev = scalar_first, ln = 0; ln < uses_num; s = s->next(), ln++) // uses
{
if (s->attributes() & USE_IN_BIT)
funcCallKernel->addArg(SgDerefOp(*new SgVarRefExp(*s))); // passing argument by value to kernel
else
{ // passing argument by reference to kernel
SgType *tp = NULL;
if (s->type()->hasBaseType())
tp = s->type()->baseType();
else
tp = s->type();
e = new SgCastExp(*C_PointerType(options.isOn(C_CUDA) ? tp : new SgDescriptType(*SgTypeChar(), BIT_SIGNED)), *new SgVarRefExp(sdev));
funcCallKernel->addArg(*e);
sdev = sdev->next();
}
}
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("MIN"));
funcCall->addArg(*new SgVarRefExp(M1));
funcCall->addArg(*new SgVarRefExp(M2));
if (options.isOn(RTC)) // diag and SE are modifiable value
{
SgVarRefExp *toAdd = new SgVarRefExp(diag);
toAdd->addAttribute(RTC_NOT_REPLACE);
funcCallKernel->addArg(*toAdd);
toAdd = new SgVarRefExp(SE);
toAdd->addAttribute(RTC_NOT_REPLACE);
funcCallKernel->addArg(*toAdd);
}
else
{
funcCallKernel->addArg(*new SgVarRefExp(diag));
funcCallKernel->addArg(*new SgVarRefExp(SE));
}
funcCallKernel->addArg(*new SgVarRefExp(var1));
funcCallKernel->addArg(*new SgVarRefExp(var2));
funcCallKernel->addArg(*new SgVarRefExp(var3));
funcCallKernel->addArg(*new SgVarRefExp(Emax));
funcCallKernel->addArg(*new SgVarRefExp(Emin));
funcCallKernel->addArg(*funcCall);
funcCallKernel->addArg(*new SgVarRefExp(M1) > *new SgVarRefExp(M2));
if (loopV > 0)
for (int i = 0; i < loopV; ++i)
funcCallKernel->addArg(*new SgVarRefExp(num_elems[i]));
if (options.isOn(AUTO_TFM))
{
for (size_t i = 0; i < dvm_array_headers.size(); ++i)
{
funcCallKernel->addArg(*new SgArrayRefExp(*addressingParams[i], *new SgValueExp(0)));
funcCallKernel->addArg(*new SgArrayRefExp(*addressingParams[i], *new SgValueExp(1)));
funcCallKernel->addArg(*new SgArrayRefExp(*addressingParams[i], *new SgValueExp(2)));
funcCallKernel->addArg(*new SgArrayRefExp(*addressingParams[i], *new SgValueExp(3)));
funcCallKernel->addArg(*new SgArrayRefExp(*addressingParams[i], *new SgValueExp(4)));
funcCallKernel->addArg(*new SgArrayRefExp(*addressingParams[i], *new SgValueExp(5)));
funcCallKernel->addArg(*new SgArrayRefExp(*addressingParams[i], *new SgValueExp(6)));
funcCallKernel->addArg(*new SgVarRefExp(*outTypeOfTransformation[i]));
}
}
funcCallKernel->addArg(*new SgVarRefExp(type_of_run));
for (int i = 0; i < acrossV + loopV; ++i)
funcCallKernel->addArg(*new SgArrayRefExp(*bIdxs, *new SgValueExp(i)));
}
{
int idx[3];
SgStatement *st1;
for (int i = 0; i < 3; ++i)
idx[i] = loopAcrossSymb[i].len;
for (int i = 0; i < 3; ++i)
{
SgFunctionCallExp *f1 = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
st1 = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(M1), *e / *f1 + SgNeqOp(*e % *f1, *new SgValueExp(0))));
f1->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(idx[i])));
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcCall->addArg((*new SgArrayRefExp(*lowI, *new SgValueExp(idx[i])) - *new SgArrayRefExp(*highI, *new SgValueExp(idx[i]))));
e = &(*funcCall + *new SgValueExp(1));
if (i == 0)
st1 = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(M1), *e / *f1 + SgNeqOp(*e % *f1, *new SgValueExp(0))));
else if (i == 1)
st1 = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(M2), *e / *f1 + SgNeqOp(*e % *f1, *new SgValueExp(0))));
else
st1 = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(M3), *e / *f1 + SgNeqOp(*e % *f1, *new SgValueExp(0))));
st_end->insertStmtBefore(*st1, *st_hedr);
if (i == 0)
st1->addComment("// Count used variables");
}
SgFunctionCallExp *f1 = new SgFunctionCallExp(*createNewFunctionSymbol("MIN"));
SgFunctionCallExp *f2 = new SgFunctionCallExp(*createNewFunctionSymbol("MIN"));
f1->addArg(*new SgVarRefExp(M1));
f1->addArg(*new SgVarRefExp(M2));
f2->addArg(*f1);
f2->addArg(*new SgVarRefExp(M3));
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(Allmin), *f2));
st_end->insertStmtBefore(*stmt, *st_hedr);
f2 = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
f2->addArg(*new SgVarRefExp(M1) - *new SgVarRefExp(M2));
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(Emin), *f1));
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(Emax), *f1 + *f2 + *new SgValueExp(1)));
st_end->insertStmtBefore(*stmt, *st_hedr);
}
// count num_elem_z
if (loopV > 0)
{
SgFunctionCallExp *tempF = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcCall->addArg((*new SgArrayRefExp(*lowI, *new SgValueExp(loopSymb[0].len)) - *new SgArrayRefExp(*highI, *new SgValueExp(loopSymb[0].len))));
tempF->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(loopSymb[0].len)));
e = &SgAssignOp(*new SgVarRefExp(num_elems[0]), (*funcCall + *new SgValueExp(1)) / *tempF + SgNeqOp((*funcCall + *new SgValueExp(1)) % *tempF, *new SgValueExp(0)));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
if (loopV > 1)
{
SgExpression **e_z = new SgExpression*[loopV - 1];
for (int k = 0; k < loopV - 1; ++k)
{
SgFunctionCallExp *tempF = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcCall->addArg((*new SgArrayRefExp(*lowI, *new SgValueExp(loopSymb[k + 1].len)) - *new SgArrayRefExp(*highI, *new SgValueExp(loopSymb[k + 1].len))));
tempF->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(loopSymb[k + 1].len)));
e_z[k] = &((*funcCall + *new SgValueExp(1)) / *tempF + SgNeqOp((*funcCall + *new SgValueExp(1)) % *tempF, *new SgValueExp(0)));
}
for (int k = 0; k < loopV - 1; ++k)
{
e = &SgAssignOp(*new SgVarRefExp(num_elems[k + 1]), *e_z[k]);
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
if (k == 0)
e_z[0] = &(*new SgVarRefExp(num_elems[0]) * (*new SgVarRefExp(num_elems[k + 1])));
else
e_z[0] = &(*(e_z[0]) * (*new SgVarRefExp(num_elems[k + 1])));
}
e = &SgAssignOp(*new SgVarRefExp(num_elems[loopV]), *e_z[0]);
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
delete[]e_z;
}
else
{
e = &SgAssignOp(*new SgVarRefExp(num_elems[loopV]), *new SgVarRefExp(num_elems[0]));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
}
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("dim3"));
if (loopV > 0)
{
funcCall->addArg(*new SgVarRefExp(num_elems[loopV]) / *new SgVarRefExp(*nums[2]) + SgNeqOp(*new SgVarRefExp(num_elems[loopV]) % *new SgVarRefExp(*nums[2]), *new SgValueExp(0)));
funcCall->addArg(*new SgVarRefExp(nums[1]));
funcCall->addArg(*new SgVarRefExp(nums[0]));
}
else
{
funcCall->addArg(*new SgVarRefExp(nums[0]));
funcCall->addArg(*new SgVarRefExp(nums[1]));
}
e = &SgAssignOp(*new SgVarRefExp(s_blocks), *funcCall);
st_end->insertStmtBefore(*new SgCExpStmt(*e), *st_hedr);
if (red_list)
{
SgFunctionCallExp* f_m1 = new SgFunctionCallExp(*createNewFunctionSymbol("MAX"));
SgFunctionCallExp* f_m2 = new SgFunctionCallExp(*createNewFunctionSymbol("MAX"));
f_m1->addArg(*new SgVarRefExp(M1));
f_m1->addArg(*new SgVarRefExp(M2));
f_m2->addArg(*f_m1);
f_m2->addArg(*new SgVarRefExp(M3));
mywarn("strat: in red section");
if (loopV == 0)
{
e = &SgAssignOp(*new SgVarRefExp(*red_blocks), *new SgVarRefExp(Emin) * *f_m2);
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
else if (loopV > 0)
{
e = &SgAssignOp(*new SgVarRefExp(*red_blocks), (*new SgVarRefExp(Emin) / *new SgVarRefExp(nums[0]) +
SgNeqOp(*new SgVarRefExp(Emin) % *new SgVarRefExp(nums[0]), *new SgValueExp(0))) *
(*f_m2 / *new SgVarRefExp(nums[1]) + SgNeqOp(*f_m2 % *new SgVarRefExp(nums[1]), *new SgValueExp(0)))
* *new SgRecordRefExp(*s_blocks, (char*)s_cuda_var[2]) *
*new SgRecordRefExp(*s_threads, "x") * *new SgRecordRefExp(*s_threads, "y") * *new SgRecordRefExp(*s_threads, "z") / *new SgValueExp(warpSize));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(s_tmp_var_1), *new SgValueExp(1)));
st_end->insertStmtBefore(*stmt, *st_hedr);
for (er = red_list, ln = 0; er; er = er->rhs(), ++ln)
{
e = &SgAssignOp(*new SgVarRefExp(s_tmp_var), *new SgValueExp(ln+1));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt = new SgCExpStmt(*PrepareReduction(s_loop_ref, s_tmp_var, red_blocks, s_tmp_var_1));
st_end->insertStmtBefore(*stmt, *st_hedr);
}
mywarn(" end: out red section");
}
int flag_comment = 0;
for (int i = 3; i < acrossV; ++i)
{
e = &SgAssignOp(*new SgVarRefExp(acrossBase[i]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[i].len)));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
if (i - 3 == 0)
{
stmt->addComment("// Start method");
flag_comment = 1;
}
}
if (acrossV == 3)
{
for (int i = 0; i < MIN(3, acrossV); ++i)
{
e = &SgAssignOp(*new SgVarRefExp(acrossBase[i]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[i].len)));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
if (i == 0 && flag_comment == 0)
stmt->addComment("// Start method");
}
for (int i = 0; i < loopV; ++i)
{
e = &SgAssignOp(*new SgVarRefExp(loopBase[i]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopSymb[i].len)));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
}
}
SgWhileStmt *main_while_st = NULL;
SgStatement *main_stmt = NULL;
bool first = true;
if (acrossV > 3)
{
SgWhileStmt *tmp;
for (int i = 3; i < acrossV; ++i)
{
e = &SgAssignOp(*new SgVarRefExp(acrossBase[i]), *new SgVarRefExp(acrossBase[i]) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[i].len)));
stmt = new SgCExpStmt(*e);
SgExpression *e1 = NULL;
SgFunctionCallExp *func = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
func->addArg(*new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[i].len)));
e1 = &(*new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[i].len)) / *func);
if (first)
{
main_while_st = new SgWhileStmt(*e1 * *new SgVarRefExp(acrossBase[i]) <= *e1 * *new SgArrayRefExp(*highI, *new SgValueExp(loopAcrossSymb[i].len)), *stmt);
first = false;
}
else
{
tmp = new SgWhileStmt(*new SgVarRefExp(acrossBase[i]) <= *new SgArrayRefExp(*highI, *new SgValueExp(loopAcrossSymb[i].len)), *stmt);
main_while_st->insertStmtAfter(*tmp);
main_while_st = tmp;
}
main_stmt = stmt;
}
st_end->insertStmtBefore(*main_while_st, *st_hedr);
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(SE), *new SgValueExp(1)));
main_stmt->insertStmtBefore(*stmt, *main_while_st);
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(diag), *new SgValueExp(1)));
main_stmt->insertStmtBefore(*stmt, *main_while_st);
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(var1), *new SgValueExp(1)));
main_stmt->insertStmtBefore(*stmt, *main_while_st);
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(var2), *new SgValueExp(0)));
main_stmt->insertStmtBefore(*stmt, *main_while_st);
stmt = new SgCExpStmt(SgAssignOp(*new SgVarRefExp(var3), *new SgValueExp(0)));
main_stmt->insertStmtBefore(*stmt, *main_while_st);
for (int i = 0; i < MIN(3, acrossV); ++i)
{
e = &SgAssignOp(*new SgVarRefExp(acrossBase[i]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[i].len)));
stmt = new SgCExpStmt(*e);
main_stmt->insertStmtBefore(*stmt, *main_while_st);
}
for (int i = 0; i < loopV; ++i)
{
e = &SgAssignOp(*new SgVarRefExp(loopBase[i]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopSymb[i].len)));
stmt = new SgCExpStmt(*e);
main_stmt->insertStmtBefore(*stmt, *main_while_st);
}
}
e = &SgAssignOp(*new SgVarRefExp(diag), *new SgVarRefExp(diag) + *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
SgWhileStmt *while_st = new SgWhileStmt(*new SgVarRefExp(diag) <= *new SgVarRefExp(Allmin), *stmt);
if (acrossV == 3)
st_end->insertStmtBefore(*while_st, *st_hedr);
else
main_stmt->insertStmtBefore(*while_st, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[2]), *new SgVarRefExp(acrossBase[2]) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[2].len)));
stmt = new SgCExpStmt(*e);
while_st->insertStmtAfter(*stmt);
while_st->insertStmtAfter(*createKernelCallsInCudaHandler(funcCallKernel, s_loop_ref, idxTypeInKernel, s_blocks));
e = &SgAssignOp(*new SgRecordRefExp(*s_blocks, (char*)s_cuda_var[1]), *new SgVarRefExp(diag) / *new SgVarRefExp(nums[1]) + SgNeqOp(*new SgVarRefExp(diag) % *new SgVarRefExp(nums[1]), *new SgValueExp(0)));
stmt = new SgCExpStmt(*e);
while_st->insertStmtAfter(*stmt);
e = &SgAssignOp(*new SgRecordRefExp(*s_blocks, (char*)s_cuda_var[0]), *new SgVarRefExp(diag) / *new SgVarRefExp(nums[0]) + SgNeqOp(*new SgVarRefExp(diag) % *new SgVarRefExp(nums[0]), *new SgValueExp(0)));
stmt = new SgCExpStmt(*e);
while_st->insertStmtAfter(*stmt);
e = &SgAssignOp(*new SgVarRefExp(var1), *new SgValueExp(0));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(var2), *new SgValueExp(0));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(var3), *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
{ // while for if block
e = &SgAssignOp(*new SgVarRefExp(diag), *new SgVarRefExp(diag) + *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
SgWhileStmt *while_st = new SgWhileStmt(SgNeqOp(*new SgVarRefExp(diag) - *new SgValueExp(1), *new SgVarRefExp(M3)), *stmt);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[2]), *new SgVarRefExp(acrossBase[2]) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[2].len)));
stmt = new SgCExpStmt(*e);
while_st->insertStmtAfter(*stmt, *while_st);
while_st->insertStmtAfter(if_stSwap->copy(), *while_st);
while_st->insertStmtAfter(*createKernelCallsInCudaHandler(funcCallKernel, s_loop_ref, idxTypeInKernel, s_blocks));
while_st->insertStmtAfter(if_stSwap->copy(), *while_st);
e = &SgAssignOp(*new SgRecordRefExp(*s_blocks, (char*)s_cuda_var[1]), *new SgVarRefExp(diag) / *new SgVarRefExp(nums[1]) + SgNeqOp(*new SgVarRefExp(diag) % *new SgVarRefExp(nums[1]), *new SgValueExp(0)));
stmt = new SgCExpStmt(*e);
while_st->insertStmtAfter(*stmt);
e = &SgAssignOp(*new SgRecordRefExp(*s_blocks, (char*)s_cuda_var[0]), *new SgVarRefExp(Emin) / *new SgVarRefExp(nums[0]) + SgNeqOp(*new SgVarRefExp(Emin) % *new SgVarRefExp(nums[0]), *new SgValueExp(0)));
stmt = new SgCExpStmt(*e);
while_st->insertStmtAfter(*stmt);
SgIfStmt *if_st = new SgIfStmt(*new SgVarRefExp(M3) > *new SgVarRefExp(Emin), *while_st);
if (acrossV == 3)
st_end->insertStmtBefore(*if_st, *st_hedr);
else
main_stmt->insertStmtBefore(*if_st, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(*diag), *new SgVarRefExp(*Allmin) + *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
if_st->insertStmtAfter(*stmt);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[1]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[1].len)) * (*new SgVarRefExp(M1) <= *new SgVarRefExp(M2)) + *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[0].len)) * (*new SgVarRefExp(M1) > *new SgVarRefExp(M2)));
stmt = new SgCExpStmt(*e);
if_st->insertStmtAfter(*stmt);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[0]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[0].len)) * (*new SgVarRefExp(M1) <= *new SgVarRefExp(M2)) + *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[1].len)) * (*new SgVarRefExp(M1) > *new SgVarRefExp(M2)));
stmt = new SgCExpStmt(*e);
if_st->insertStmtAfter(*stmt);
}
e = &SgAssignOp(*new SgVarRefExp(diag), *new SgVarRefExp(M3));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
e = &SgAssignOp(*new SgRecordRefExp(*s_blocks, (char*)s_cuda_var[1]), *new SgVarRefExp(diag) / *new SgVarRefExp(nums[1]) + SgNeqOp(*new SgVarRefExp(diag) % *new SgVarRefExp(nums[1]), *new SgValueExp(0)));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
e = &SgAssignOp(*new SgRecordRefExp(*s_blocks, (char*)s_cuda_var[0]), *new SgVarRefExp(Emin) / *new SgVarRefExp(nums[0]) + SgNeqOp(*new SgVarRefExp(Emin) % *new SgVarRefExp(nums[0]), *new SgValueExp(0)));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(SE), *new SgValueExp(2));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[0]), (*new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[0].len)) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[0].len))) * (*new SgVarRefExp(M1) <= *new SgVarRefExp(M2)) + (*new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[1].len)) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len))) * (*new SgVarRefExp(M1) > *new SgVarRefExp(M2)));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[1]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[1].len)) * (*new SgVarRefExp(M1) <= *new SgVarRefExp(M2)) + *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[0].len)) * (*new SgVarRefExp(M1) > *new SgVarRefExp(M2)));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[2]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[2].len)) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[2].len)) * (*new SgVarRefExp(M3) - *new SgValueExp(1)));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(SE), *new SgVarRefExp(SE) + *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
while_st = new SgWhileStmt(SgNeqOp(*new SgVarRefExp(M1) + *new SgVarRefExp(M2) - *new SgVarRefExp(Allmin), *new SgVarRefExp(SE) - *new SgValueExp(1)), *stmt);
if (acrossV == 3)
st_end->insertStmtBefore(*while_st, *st_hedr);
else
main_stmt->insertStmtBefore(*while_st, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[0]), *new SgVarRefExp(acrossBase[0]) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[0].len)) * (*new SgVarRefExp(M1) <= *new SgVarRefExp(M2)) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len)) * (*new SgVarRefExp(M1) > *new SgVarRefExp(M2)));
stmt = new SgCExpStmt(*e);
while_st->insertStmtAfter(*stmt, *while_st);
while_st->insertStmtAfter(if_stSwap->copy(), *while_st);
while_st->insertStmtAfter(*createKernelCallsInCudaHandler(funcCallKernel, s_loop_ref, idxTypeInKernel, s_blocks));
while_st->insertStmtAfter(if_stSwap->copy(), *while_st);
e = &SgAssignOp(*new SgVarRefExp(var1), *new SgValueExp(0));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(var2), *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(var3), *new SgValueExp(0));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(diag), *new SgVarRefExp(Allmin) - *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[0]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[0].len)) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[0].len)) * (*new SgVarRefExp(M1) - *new SgValueExp(1)));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[1]), *new SgArrayRefExp(*lowI, *new SgValueExp(loopAcrossSymb[1].len)) * (*new SgVarRefExp(M1) > *new SgVarRefExp(M2)) + *new SgVarRefExp(acrossBase[1]) * (*new SgVarRefExp(M1) <= *new SgVarRefExp(M2)));
stmt = new SgCExpStmt(*e);
if (acrossV == 3)
st_end->insertStmtBefore(*stmt, *st_hedr);
else
main_stmt->insertStmtBefore(*stmt, *main_while_st);
{ // if block
funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcCall->addArg(*new SgVarRefExp(*Emin) - *new SgVarRefExp(M3));
SgExpression *e1 = NULL, *e2 = NULL;
SgIfStmt *if_st1 = NULL;
e1 = &SgAssignOp(*new SgVarRefExp(acrossBase[1]), *new SgVarRefExp(acrossBase[1]) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len)) + *new SgVarRefExp(*Emax) - *new SgVarRefExp(*Emin) - *new SgValueExp(1));
e2 = &SgAssignOp(*new SgVarRefExp(acrossBase[1]), *new SgVarRefExp(acrossBase[1]) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len)) - *new SgVarRefExp(*Emax) + *new SgVarRefExp(*Emin) + *new SgValueExp(1));
if_st1 = new SgIfStmt(*new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len)) > *new SgValueExp(0), *new SgCExpStmt(*e1), *new SgCExpStmt(*e2));
SgIfStmt *if_st = new SgIfStmt(*new SgVarRefExp(*M1) <= *new SgVarRefExp(*M2) && *new SgVarRefExp(*M3) > *new SgVarRefExp(*Emin), *if_st1);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[1]), *new SgVarRefExp(acrossBase[1]) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len)));
stmt = new SgCExpStmt(*e);
if_st = new SgIfStmt(*new SgVarRefExp(*M1) > *new SgVarRefExp(*M2) && *new SgVarRefExp(*M3) > *new SgVarRefExp(*Emin), *stmt, *if_st);
e1 = &SgAssignOp(*new SgVarRefExp(acrossBase[1]), *new SgVarRefExp(acrossBase[1]) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len)) + *new SgVarRefExp(*Emax) - *new SgVarRefExp(*Emin) - *new SgValueExp(1) + *funcCall);
e2 = &SgAssignOp(*new SgVarRefExp(acrossBase[1]), *new SgVarRefExp(acrossBase[1]) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len)) - *new SgVarRefExp(*Emax) + *new SgVarRefExp(*Emin) + *new SgValueExp(1) + *new SgVarRefExp(M3) - *new SgVarRefExp(*Emin));
if_st1 = new SgIfStmt(*new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len)) > *new SgValueExp(0), *new SgCExpStmt(*e1), *new SgCExpStmt(*e2));
if_st = new SgIfStmt(*new SgVarRefExp(*M1) <= *new SgVarRefExp(*M2) && *new SgVarRefExp(*M3) <= *new SgVarRefExp(*Emin), *if_st1, *if_st);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[1]), *new SgVarRefExp(acrossBase[1]) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len)) + *funcCall);
stmt = new SgCExpStmt(*e);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[1]), *new SgVarRefExp(acrossBase[1]) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len)) + *funcCall * *new SgValueExp(-1));
SgStatement* stmtElse = new SgCExpStmt(*e);
if_st1 = new SgIfStmt(*new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len)) > *new SgValueExp(0), *stmt, *stmtElse);
if_st = new SgIfStmt(*new SgVarRefExp(*M1) > *new SgVarRefExp(*M2) && *new SgVarRefExp(*M3) <= *new SgVarRefExp(*Emin), *if_st1, *if_st);
if (acrossV == 3)
st_end->insertStmtBefore(*if_st, *st_hedr);
else
main_stmt->insertStmtBefore(*if_st, *main_while_st);
}
e = &SgAssignOp(*new SgVarRefExp(diag), *new SgVarRefExp(diag) - *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
while_st = new SgWhileStmt(SgNeqOp(*new SgVarRefExp(diag), *new SgValueExp(0)), *stmt);
if (acrossV == 3)
st_end->insertStmtBefore(*while_st, *st_hedr);
else
main_stmt->insertStmtBefore(*while_st, *main_while_st);
e = &SgAssignOp(*new SgVarRefExp(acrossBase[1]), *new SgVarRefExp(acrossBase[1]) + *new SgArrayRefExp(*idxI, *new SgValueExp(loopAcrossSymb[1].len)));
stmt = new SgCExpStmt(*e);
while_st->insertStmtAfter(*stmt, *while_st);
e = &SgAssignOp(*new SgVarRefExp(SE), *new SgVarRefExp(SE) + *new SgValueExp(1));
stmt = new SgCExpStmt(*e);
while_st->insertStmtAfter(*stmt, *while_st);
while_st->insertStmtAfter(*createKernelCallsInCudaHandler(funcCallKernel, s_loop_ref, idxTypeInKernel, s_blocks));
e = &SgAssignOp(*new SgRecordRefExp(*s_blocks, (char*)s_cuda_var[1]), *new SgVarRefExp(diag) / *new SgVarRefExp(nums[1]) + SgNeqOp(*new SgVarRefExp(diag) % *new SgVarRefExp(nums[1]), *new SgValueExp(0)));
stmt = new SgCExpStmt(*e);
while_st->insertStmtAfter(*stmt, *while_st);
e = &SgAssignOp(*new SgRecordRefExp(*s_blocks, (char*)s_cuda_var[0]), *new SgVarRefExp(diag) / *new SgVarRefExp(nums[0]) + SgNeqOp(*new SgVarRefExp(diag) % *new SgVarRefExp(nums[0]), *new SgValueExp(0)));
stmt = new SgCExpStmt(*e);
while_st->insertStmtAfter(*stmt, *while_st);
}
// !!! Global for all cases !!!
if (red_list)
{
ln = 0;
for (er = red_list; er; er = er->rhs(), ++ln)
{
//SgExpression *red_expr_ref = er->lhs()->rhs(); // reduction variable reference
num = RedFuncNumber(er->lhs()->lhs()); // type of reduction
e = &SgAssignOp(*new SgVarRefExp(*s_tmp_var), *new SgValueExp(ln+1));
stmt = new SgCExpStmt(*e);
st_end->insertStmtBefore(*stmt, *st_hedr);
stmt = new SgCExpStmt(*FinishReduction(s_loop_ref, s_tmp_var));
st_end->insertStmtBefore(*stmt, *st_hedr);
}
}
// create args for kernel and return it
vector<ArgsForKernel> argsKernel(countKernels);
const int rtTypes[] = { rt_INT, rt_LLONG };
for (unsigned ck = 0; ck < countKernels; ++ck)
{
argsKernel[ck].st_header = st_hedr;
argsKernel[ck].cond_ = NULL;
SgType *typeParams = indexTypeInKernel(rtTypes[ck]);
if (acrossV == 1)
{
char *cond_ = new char[strlen("cond_") + strlen(loopAcrossSymb[0].symb->identifier()) + 1];
cond_[0] = '\0';
strcat(cond_, "cond_");
strcat(cond_, loopAcrossSymb[0].symb->identifier());
argsKernel[ck].cond_ = new SgSymbol(VARIABLE_NAME, cond_, typeParams, st_hedr);
char *st = new char[strlen("steps_") + strlen(loopAcrossSymb[0].symb->identifier()) + 1];
st[0] = '\0';
strcat(st, "steps_");
strcat(st, loopAcrossSymb[0].symb->identifier());
argsKernel[ck].steps.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName(st), typeParams, st_hedr));
for (int i = 0; i < loopV; ++i)
{
st = new char[strlen("steps_") + strlen(loopSymb[i].symb->identifier()) + 1];
st[0] = '\0';
strcat(st, "steps_");
strcat(st, loopSymb[i].symb->identifier());
argsKernel[ck].steps.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName(st), typeParams, st_hedr));
}
}
if (acrossV != 1 && options.isOn(AUTO_TFM))
{
char *tmpS = new char[64];
for (size_t i = 0; i < dvm_array_headers.size(); ++i)
{
tmpS[0] = '\0';
strcat(tmpS, dvm_array_headers[i]);
strcat(tmpS, "_x_axis");
argsKernel[ck].otherVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName(tmpS), typeParams, st_hedr));
tmpS[0] = '\0';
strcat(tmpS, dvm_array_headers[i]);
strcat(tmpS, "_offset_x");
argsKernel[ck].otherVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName(tmpS), typeParams, st_hedr));
tmpS[0] = '\0';
strcat(tmpS, dvm_array_headers[i]);
strcat(tmpS, "_Rx");
argsKernel[ck].otherVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName(tmpS), typeParams, st_hedr));
tmpS[0] = '\0';
strcat(tmpS, dvm_array_headers[i]);
strcat(tmpS, "_y_axis");
argsKernel[ck].otherVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName(tmpS), typeParams, st_hedr));
tmpS[0] = '\0';
strcat(tmpS, dvm_array_headers[i]);
strcat(tmpS, "_offset_y");
argsKernel[ck].otherVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName(tmpS), typeParams, st_hedr));
tmpS[0] = '\0';
strcat(tmpS, dvm_array_headers[i]);
strcat(tmpS, "_Ry");
argsKernel[ck].otherVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName(tmpS), typeParams, st_hedr));
tmpS[0] = '\0';
strcat(tmpS, dvm_array_headers[i]);
strcat(tmpS, "_slash");
argsKernel[ck].otherVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName(tmpS), typeParams, st_hedr));
argsKernel[ck].otherVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName(outTypeOfTransformation[i]->identifier()), typeParams, st_hedr));
}
argsKernel[ck].arrayNames = dvm_array_headers;
}
if (acrossV == 2)
argsKernel[ck].sizeVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName("num_elem_across"), typeParams, st_hedr));
else if (acrossV >= 3)
{
argsKernel[ck].sizeVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName("max_z"), typeParams, st_hedr));
argsKernel[ck].sizeVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName("SE"), typeParams, st_hedr)); // SE
argsKernel[ck].sizeVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName("var1"), typeParams, st_hedr)); // var1
argsKernel[ck].sizeVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName("var2"), typeParams, st_hedr)); // var2
argsKernel[ck].sizeVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName("var3"), typeParams, st_hedr)); // var3
argsKernel[ck].sizeVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName("Emax"), typeParams, st_hedr)); // Emax
argsKernel[ck].sizeVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName("Emin"), typeParams, st_hedr)); // Emin
argsKernel[ck].sizeVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName("min_ij"), typeParams, st_hedr));
argsKernel[ck].sizeVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName("swap_ij"), typeParams, st_hedr));
}
char *str = new char[32];
for (int i = 0; i < acrossV; ++i)
{
argsKernel[ck].acrossS.push_back(new SgSymbol(VARIABLE_NAME, acrossBase[i]->identifier(), typeParams, st_hedr)); // acrossBase[i]
argsKernel[ck].symb.push_back(loopAcrossSymb[i]);
strcpy(str, "step");
strcat(str, strchr(acrossBase[i]->identifier(), '_'));
argsKernel[ck].idxAcross.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName(str), typeParams, st_hedr));
}
for (int i = 0; i < loopV; ++i)
{
argsKernel[ck].notAcrossS.push_back(new SgSymbol(VARIABLE_NAME, loopBase[i]->identifier(), typeParams, st_hedr)); // loopBase[i]
argsKernel[ck].nSymb.push_back(loopSymb[i]);
strcpy(str, "step");
strcat(str, strchr(loopBase[i]->identifier(), '_'));
argsKernel[ck].idxNotAcross.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName(str), typeParams, st_hedr));
strcpy(str, "num_elem");
strcat(str, strchr(loopBase[i]->identifier(), '_'));
argsKernel[ck].sizeVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName(str), typeParams, st_hedr));
}
if (acrossV == 1 || acrossV == 2 || acrossV >= 3)
{
argsKernel[ck].otherVars.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName("type_of_run"), typeParams, st_hedr));
char *t = new char[32];
for (int i = 0; i < acrossV + loopV; ++i)
{
char p[8];
sprintf(p, "%d", i);
t[0] = '\0';
strcat(t, "idxs_");
strcat(t, p);
argsKernel[ck].baseIdxsInKer.push_back(new SgSymbol(VARIABLE_NAME, TestAndCorrectName(t), typeParams, st_hedr));
}
delete[]t;
}
delete[]str;
}
// end of creation args for kernel
delete[]reduction_loc_ptr;
delete[]reduction_loc_symb;
delete[]reduction_ptr;
delete[]reduction_symb;
delete[]num_elems;
mywarn(" end Adapter Function");
if (options.isOn(C_CUDA))
RenamingCudaFunctionVariables(st_hedr, s_loop_ref, 0);
return argsKernel;
}
void MakeDeclarationsForKernel_On_C_Across(SgType *indexType)
{
// declare do_variables
DeclareDoVars(indexType);
// declare private(local in kernel) variables
DeclarePrivateVars();
// declare variables, used in loop and passed by reference:
// <type> &<name> = *p_<name>;
DeclareUsedVars();
}
void MakeDeclarationsForKernelAcross(SgType *indexType)
{
#if debugMode
mywarn("strat: MakeDeclarations Function");
#endif
// declare do_variables
DeclareDoVars();
// declare private(local in kernel) variables
DeclarePrivateVars();
// declare dummy arguments:
// declare reduction dummy arguments
DeclareDummyArgumentsForReductions(NULL, indexType);
// declare array coefficients
DeclareArrayCoeffsInKernel(indexType);
// declare bases for arrays
DeclareArrayBases();
// declare variables, used in loop
DeclareUsedVars();
#if debugMode
mywarn(" end: MakeDeclarations Function");
#endif
}
SgExpression *CreateKernelDummyListAcross(ArgsForKernel *argsKer, SgType *idxTypeInKernel) //SgSymbol *s_red_count_k,
{
#if debugMode
mywarn("strat: CreateKernelDummyListAcross Function");
#endif
SgExpression *arg_list, *ae;
arg_list = NULL;
arg_list = AddListToList(CreateArrayDummyList(idxTypeInKernel), CreateRedDummyList(idxTypeInKernel));
// base_ref + <array_coeffs> ...
// + <red_var[_1]> [+red_var_2+...+red_var_M] + <red>_grid [ + <loc_var_1>...<loc_var_N>]
// + 'blocks'
if (argsKer->symb.size() < 3)
{
for (int it = 0; it < argsKer->sizeVars.size(); ++it)
arg_list = AddListToList(arg_list, new SgExprListExp(*new SgVarRefExp(argsKer->sizeVars[it])));
}
for (int it = 0; it < argsKer->acrossS.size(); ++it)
arg_list = AddListToList(arg_list, new SgExprListExp(*new SgVarRefExp(argsKer->acrossS[it])));
for (int it = 0; it < argsKer->notAcrossS.size(); ++it)
arg_list = AddListToList(arg_list, new SgExprListExp(*new SgVarRefExp(argsKer->notAcrossS[it])));
for (int it = 0; it < argsKer->idxAcross.size(); ++it)
arg_list = AddListToList(arg_list, new SgExprListExp(*new SgVarRefExp(argsKer->idxAcross[it])));
for (int it = 0; it < argsKer->idxNotAcross.size(); ++it)
arg_list = AddListToList(arg_list, new SgExprListExp(*new SgVarRefExp(argsKer->idxNotAcross[it])));
if (uses_list)
arg_list = AddListToList(arg_list, CreateUsesDummyList()); //[+ <uses> ]
if (argsKer->symb.size() >= 3)
for (int it = 0; it < argsKer->sizeVars.size(); ++it)
arg_list = AddListToList(arg_list, new SgExprListExp(*new SgVarRefExp(argsKer->sizeVars[it])));
if (argsKer->acrossS.size() != 1)
{
for (size_t i = 0; i < argsKer->otherVars.size(); ++i)
{
ae = new SgExprListExp(*new SgVarRefExp(argsKer->otherVars[i]));
arg_list = AddListToList(arg_list, ae);
}
}
else if (argsKer->otherVars.size() != 0)
{
ae = new SgExprListExp(*new SgVarRefExp(argsKer->otherVars[argsKer->otherVars.size() - 1]));
arg_list = AddListToList(arg_list, ae);
}
for (size_t i = 0; i < argsKer->baseIdxsInKer.size(); ++i)
{
ae = new SgExprListExp(*new SgVarRefExp(argsKer->baseIdxsInKer[i]));
arg_list = AddListToList(arg_list, ae);
}
if (argsKer->cond_ != NULL && options.isOn(GPU_O0))
{
SgSymbol *tmp = argsKer->cond_;
arg_list = AddListToList(arg_list, new SgExprListExp(*new SgVarRefExp(tmp)));
for (size_t i = 0; i < argsKer->steps.size(); ++i)
{
SgSymbol *tmp = argsKer->steps[i];
arg_list = AddListToList(arg_list, new SgExprListExp(*new SgVarRefExp(tmp)));
}
}
#if debugMode
mywarn(" end: CreateKernelDummyListAcross Function");
#endif
return arg_list;
}
SgStatement *CreateLoopKernelAcross(SgSymbol *skernel, ArgsForKernel* argsKer, SgType *idxTypeInKernel)
{
#if debugMode
mywarn("strat: CreateLoopKernelAcross");
#endif
ACROSS_MOD_IN_KERNEL = 1;
#if kerneloff
return NULL;
#endif
int nloop = 0;
SgStatement *st = NULL, *st_end = NULL;
SgExpression *fe = NULL;
SgSymbol *tid = NULL, *s_red_count_k = NULL;
SgIfStmt *if_st = NULL;
SgType *longType = idxTypeInKernel;
if (!skernel)
return(NULL);
nloop = ParLoopRank();
// create kernel procedure for loop in Fortran-Cuda language or kernel function in C_Cuda
// creating Header and End Statement of Kernel
if (options.isOn(C_CUDA))
{
kernel_st = Create_C_Kernel_Function(skernel);
fe = kernel_st->expr(0);
}
else
kernel_st = CreateKernelProcedure(skernel);
kernel_st->addComment(LoopKernelComment());
st_end = kernel_st->lexNext();
cur_in_kernel = st = kernelScope = kernel_st;
// !!creating variables and making structures for reductions
CompleteStructuresForReductionInKernel();
if (red_list)
s_red_count_k = RedCountSymbol(kernel_st);
// create dummy argument list of kernel:
if (options.isOn(C_CUDA))
fe->setLhs(CreateKernelDummyListAcross(argsKer, longType)); //s_red_count_k,
else
// create dummy argument list and add it to kernel header statement (Fortran-Cuda)
kernel_st->setExpression(0, *CreateKernelDummyListAcross(argsKer, longType)); //s_red_count_k,
// generating block of index variables calculation
#if debugMode
mywarn("start: block4");
#endif
tid = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("id_x"), *longType, *cur_in_kernel);
if (options.isOn(C_CUDA))
st = AssignStatement(*new SgVarRefExp(*tid), (*new SgRecordRefExp(*s_blockidx, "x")) *
*new SgRecordRefExp(*s_blockdim, "x") + *new SgRecordRefExp(*s_threadidx, "x"));
else
st = AssignStatement(*new SgVarRefExp(*tid), (*new SgRecordRefExp(*s_blockidx, "x") - *new SgValueExp(1)) *
*new SgRecordRefExp(*s_blockdim, "x") + *new SgRecordRefExp(*s_threadidx, "x") - *new SgValueExp(1));
cur_in_kernel->insertStmtAfter(*st, *kernel_st);
cur_in_kernel = st;
size_t size = argsKer->otherVarsForOneTh.size();
size_t size1 = argsKer->otherVars.size();
SgForStmt *for_st = NULL, *inner_for_st = NULL;
SgFunctionCallExp *funcAbs = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcAbs->addArg(*new SgVarRefExp(argsKer->otherVars[size1 - 1]));
SgExpression *sign = &(*new SgVarRefExp(argsKer->otherVars[size1 - 1]) / *funcAbs);
if (options.isOn(C_CUDA))
for_st = new SgForStmt(&SgAssignOp(*new SgVarRefExp(argsKer->otherVarsForOneTh[size - 1]), *new SgVarRefExp(argsKer->otherVars[size1 - 3])), &(*sign * *new SgVarRefExp(argsKer->otherVarsForOneTh[size - 1]) <= *sign * *new SgVarRefExp(argsKer->otherVars[size1 - 2])), &SgAssignOp(*new SgVarRefExp(argsKer->otherVarsForOneTh[size - 1]), *new SgVarRefExp(argsKer->otherVarsForOneTh[size - 1]) + *new SgVarRefExp(argsKer->otherVars[size1 - 1])), NULL);
else
for_st = new SgForStmt(argsKer->otherVarsForOneTh[size - 1], new SgVarRefExp(argsKer->otherVars[size1 - 3]), new SgVarRefExp(argsKer->otherVars[size1 - 2]), new SgVarRefExp(argsKer->otherVars[size1 - 1]), NULL);
inner_for_st = for_st;
for (int i = size - 2; i >= 0; i--)
{
SgForStmt *tmp = for_st;
funcAbs = new SgFunctionCallExp(*createNewFunctionSymbol("abs"));
funcAbs->addArg(*new SgVarRefExp(argsKer->otherVars[3 * i + 2]));
sign = &(*new SgVarRefExp(argsKer->otherVars[3 * i + 2]) / *funcAbs);
if (options.isOn(C_CUDA))
for_st = new SgForStmt(&SgAssignOp(*new SgVarRefExp(argsKer->otherVarsForOneTh[i]), *new SgVarRefExp(argsKer->otherVars[3 * i])), &(*sign * *new SgVarRefExp(argsKer->otherVarsForOneTh[i]) <= *sign * *new SgVarRefExp(argsKer->otherVars[3 * i + 1])), &(SgAssignOp(*new SgVarRefExp(argsKer->otherVarsForOneTh[i]), *new SgVarRefExp(argsKer->otherVarsForOneTh[i]) + *new SgVarRefExp(argsKer->otherVars[3 * i + 2]))), NULL);
else
for_st = new SgForStmt(argsKer->otherVarsForOneTh[i], new SgVarRefExp(argsKer->otherVars[3 * i]), new SgVarRefExp(argsKer->otherVars[3 * i + 1]), new SgVarRefExp(argsKer->otherVars[3 * i + 2]), NULL);
for_st->insertStmtAfter(*tmp);
}
if_st = new SgIfStmt(SgEqOp(*new SgVarRefExp(*tid), *new SgValueExp(0)), *for_st);
cur_in_kernel->insertStmtAfter(*if_st, *kernel_st);
#if debugMode
mywarn(" end: block4");
mywarn("start: block5");
#endif
// generating assign statements for MAXLOC, MINLOC reduction operations
if (red_list)
Do_Assign_For_Loc_Arrays();
// inserting loop body to innermost IF statement of BlockForCalculationThreadLoopVariables
#if debugMode
mywarn(" end: block5");
mywarn("strat: inserting loop body");
#endif
vector<SgSymbol*> forDeclarationInKernel;
{
SgStatement *stk, *last;
block = CreateIfForRedBlack(loop_body, nloop);
last = inner_for_st->lastNodeOfStmt();
inner_for_st->insertStmtAfter(*block); //cur_in_kernel is innermost IF statement
if (options.isOn(C_CUDA))
{
if (block->comments() == NULL)
block->addComment("// Loop body");
}
else
block->addComment("! Loop body\n");
// correct copy of loop_body (change or extract last statement of block if it is CONTROL_END)
if (block != loop_body)
stk = last->lexPrev()->lexPrev();
else
stk = last->lexPrev();
if (stk->variant() == CONTROL_END)
{
if (stk->hasLabel() || stk == loop_body) // when body of DO_ENDDO loop is empty, stk == loop_body
stk->setVariant(CONT_STAT);
else
{
st = stk->lexPrev();
stk->extractStmt();
stk = st;
}
}
ReplaceExitCycleGoto(block, stk);
for_kernel = 1;
last = cur_st;
TranslateBlock(inner_for_st);
if (options.isOn(C_CUDA))
{
//get info of arrays in private and locvar lists
swapDimentionsInprivateList();
vector < stack < SgStatement*> > zero = vector < stack < SgStatement*> >(0);
Translate_Fortran_To_C(inner_for_st, inner_for_st->lastNodeOfStmt(), zero, 0);
}
cur_st = last;
createBodyKernel = false;
}
#if debugMode
mywarn(" end: inserting loop body");
mywarn("start: create reduction block");
#endif
if (red_list)
{
int num;
reduction_operation_list *tmp_list = red_struct_list;
for (SgExpression *er = red_list; er; er = er->rhs())
{
num = 0;
SgExpression *red_expr_ref = er->lhs()->rhs(); // reduction variable reference
num = RedFuncNumber(er->lhs()->lhs()); // type of reduction
SgSymbol *redGrid = new SgSymbol(VARIABLE_NAME, tmp_list->red_grid->identifier());
redGrid->setType(*new SgArrayType(*tmp_list->red_grid->type()));
st = AssignStatement(*new SgArrayRefExp(*redGrid, *new SgValueExp(0)), red_expr_ref->copy());
if_st->lastExecutable()->insertStmtAfter(*st);
tmp_list = tmp_list->next;
}
}
#if debugMode
mywarn(" end: create reduction block");
#endif
// make declarations
if (options.isOn(C_CUDA))
MakeDeclarationsForKernel_On_C_Across(idxTypeInKernel);
else // Fortran-Cuda
MakeDeclarationsForKernelAcross(idxTypeInKernel);
for_kernel = 0;
kernel_st->insertStmtAfter(*tid->makeVarDeclStmt());
if (!options.isOn(C_CUDA))
{
for (size_t i = 0; i < argsKer->otherVars.size(); ++i)
{
st = argsKer->otherVars[i]->makeVarDeclStmt();
st->setExpression(2, *new SgExprListExp(*new SgExpression(ACC_VALUE_OP)));
kernel_st->insertStmtAfter(*st);
}
}
#if debugMode
mywarn(" end: CreateLoopKernelAcross");
#endif
if (options.isOn(C_CUDA))
RenamingCudaFunctionVariables(kernel_st, skernel, 1);
ACROSS_MOD_IN_KERNEL = 0;
return kernel_st;
}
static SgStatement* makeBlockIdxAssigment(SgSymbol* tid, const char* XYZ)
{
SgStatement* st = NULL;
if (options.isOn(C_CUDA))
st = AssignStatement(*new SgVarRefExp(*tid), (*new SgRecordRefExp(*s_blockidx, XYZ)) *
*new SgRecordRefExp(*s_blockdim, XYZ) + *new SgRecordRefExp(*s_threadidx, XYZ));
else
st = AssignStatement(*new SgVarRefExp(*tid), (*new SgRecordRefExp(*s_blockidx, XYZ) - *new SgValueExp(1)) *
*new SgRecordRefExp(*s_blockdim, XYZ) + *new SgRecordRefExp(*s_threadidx, XYZ) - *new SgValueExp(1));
return st;
}
static void createDeclaration(SgSymbol* toDecl)
{
SgStatement* st = toDecl->makeVarDeclStmt();
st->setExpression(2, *new SgExprListExp(*new SgExpression(ACC_VALUE_OP)));
kernel_st->insertStmtAfter(*st);
}
static void createDeclaration(const vector<SgSymbol*>& toDecl)
{
for (int it = 0; it < toDecl.size(); ++it)
createDeclaration(toDecl[it]);
}
SgStatement *CreateLoopKernelAcross(SgSymbol *skernel, ArgsForKernel* argsKer, int acrossNum, SgType *idxTypeInKernel)
{
#if debugMode
mywarn("strat: CreateLoopKernelAcross");
#endif
ACROSS_MOD_IN_KERNEL = 1;
#if kerneloff
return NULL;
#endif
int nloop;
SgStatement *st = NULL, *st_end = NULL;
SgExpression *e = NULL, *fe = NULL;
SgSymbol *tid = NULL, *tid1 = NULL, *tid2 = NULL, *s_red_count_k = NULL, *coords = NULL;
SgIfStmt *if_st = NULL, *if_st1 = NULL, *if_st2 = NULL;
SgForStmt *mainFor = NULL;
SgSymbol *tmpvar1 = NULL;
SgExpression **leftExprs, **rightExprs;
SgType *longType = idxTypeInKernel;
if (!skernel)
return(NULL);
nloop = ParLoopRank();
// create kernel procedure for loop in Fortran-Cuda language or kernel function in C_Cuda
// creating Header and End Statement of Kernel
if (options.isOn(C_CUDA))
{
kernel_st = Create_C_Kernel_Function(skernel);
fe = kernel_st->expr(0);
}
else
kernel_st = CreateKernelProcedure(skernel);
if (!options.isOn(C_CUDA) && createConvert_XY && options.isOn(AUTO_TFM))
{
kernel_st->addComment("!------------- dvmh_convert_XY() function ------------\n");
kernel_st->addComment(funcDvmhConvXYfortVerLong);
kernel_st->addComment(funcDvmhConvXYfortVer);
createConvert_XY = false;
}
kernel_st->addComment(LoopKernelComment());
st_end = kernel_st->lexNext();
cur_in_kernel = st = kernelScope = kernel_st;
// !!creating variables and making structures for reductions
CompleteStructuresForReductionInKernel(); //CompleteStructuresForReductionInKernelAcross();
if (red_list)
s_red_count_k = RedCountSymbol(kernel_st);
// create dummy argument list of kernel:
if (options.isOn(C_CUDA))
fe->setLhs(CreateKernelDummyListAcross(argsKer, idxTypeInKernel)); // s_red_count_k,
else
// create dummy argument list and add it to kernel header statement (Fortran-Cuda)
kernel_st->setExpression(0, *CreateKernelDummyListAcross(argsKer, idxTypeInKernel)); // s_red_count_k,
// generating block of index variables calculation
#if debugMode
mywarn("start: block4");
#endif
SgArrayType *tpArr = new SgArrayType(*longType);
SgValueExp *dimSize = new SgValueExp((int)(argsKer->symb.size() + argsKer->nSymb.size()));
tpArr->addDimension(dimSize);
coords = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("coords"), *longType, *cur_in_kernel);
coords->setType(tpArr);
tid = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("id_x"), *longType, *cur_in_kernel);
if (argsKer->symb.size() < 3)
{
if (argsKer->nSymb.size() == 1)
tid1 = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("id_y"), *longType, *cur_in_kernel);
else if (argsKer->nSymb.size() >= 2)
{
tid1 = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("id_y"), *longType, *cur_in_kernel);
tid2 = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("id_z"), *longType, *cur_in_kernel);
}
}
else if (argsKer->symb.size() >= 3)
{
tid1 = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("id_y"), *longType, *cur_in_kernel);
if (argsKer->nSymb.size() > 0)
tid2 = new SgSymbol(VARIABLE_NAME, TestAndCorrectName("id_z"), *longType, *cur_in_kernel);
}
st = makeBlockIdxAssigment(tid, "x");
cur_in_kernel->insertStmtAfter(*st, *kernel_st);
cur_in_kernel = st;
if (argsKer->symb.size() == 1)
{
if (argsKer->nSymb.size() == 2)
{
st = makeBlockIdxAssigment(tid1, "y");
cur_in_kernel->insertStmtAfter(*st, *kernel_st);
cur_in_kernel = st;
}
else if (argsKer->nSymb.size() >= 3)
{
st = makeBlockIdxAssigment(tid1, "y");
cur_in_kernel->insertStmtAfter(*st, *kernel_st);
cur_in_kernel = st;
st = makeBlockIdxAssigment(tid2, "z");
cur_in_kernel->insertStmtAfter(*st, *kernel_st);
cur_in_kernel = st;
}
}
else if (argsKer->symb.size() == 2)
{
if (argsKer->nSymb.size() == 1)
{
st = makeBlockIdxAssigment(tid1, "y");
cur_in_kernel->insertStmtAfter(*st, *kernel_st);
cur_in_kernel = st;
}
else if (argsKer->nSymb.size() >= 2)
{
st = makeBlockIdxAssigment(tid1, "y");
cur_in_kernel->insertStmtAfter(*st, *kernel_st);
cur_in_kernel = st;
st = makeBlockIdxAssigment(tid2, "z");
cur_in_kernel->insertStmtAfter(*st, *kernel_st);
cur_in_kernel = st;
}
}
else if (argsKer->symb.size() >= 3)
{
st = makeBlockIdxAssigment(tid1, "y");
cur_in_kernel->insertStmtAfter(*st, *kernel_st);
cur_in_kernel = st;
if (argsKer->nSymb.size() > 0)
{
st = makeBlockIdxAssigment(tid2, "z");
cur_in_kernel->insertStmtAfter(*st, *kernel_st);
cur_in_kernel = st;
}
}
#if debugMode
mywarn(" end: block4");
mywarn("start: block5");
#endif
if (argsKer->symb.size() == 1) // body for 1 dependence
{
int idx_exprs = 0;
int count_of_dims = argsKer->nSymb.size() + argsKer->symb.size();
vector<SageSymbols>::iterator itAcr = argsKer->symb.begin();
vector<SageSymbols>::iterator it = argsKer->nSymb.begin();
vector<SgSymbol*>::iterator itAcrS = argsKer->acrossS.begin();
vector<SgSymbol*>::iterator itS = argsKer->notAcrossS.begin();
vector<SgSymbol*>::iterator it_sizeV = argsKer->sizeVars.begin();
vector<SgSymbol*>::iterator itIdxAcr = argsKer->idxAcross.begin();
vector<SgSymbol*>::iterator itIdx = argsKer->idxNotAcross.begin();
leftExprs = new SgExpression*[count_of_dims];
rightExprs = new SgExpression*[count_of_dims];
e = &(*new SgVarRefExp(*itAcrS));
st = AssignStatement(*new SgVarRefExp((*itAcr).symb), *e);
leftExprs[idx_exprs] = &(*new SgVarRefExp((*itAcr).symb));
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itAcrS));
idx_exprs++;
if (argsKer->nSymb.size() == 1)
{
st = AssignStatement(*new SgVarRefExp((*it).symb), *new SgVarRefExp(*itS) + *new SgVarRefExp(*tid) * *new SgVarRefExp(*itIdx));
leftExprs[idx_exprs] = &(*new SgVarRefExp((*it).symb));
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + *new SgVarRefExp(*tid) * *new SgVarRefExp(*itIdx));
idx_exprs++;
}
else if (argsKer->nSymb.size() == 2)
{
st = AssignStatement(*new SgVarRefExp((*it).symb), *new SgVarRefExp(*itS) + *new SgVarRefExp(*tid) * *new SgVarRefExp(*itIdx));
leftExprs[idx_exprs] = &(*new SgVarRefExp((*it).symb));
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + *new SgVarRefExp(*tid) * *new SgVarRefExp(*itIdx));
idx_exprs++;
it++;
itIdx++;
itS++;
st = AssignStatement(*new SgVarRefExp((*it).symb), *new SgVarRefExp(*itS) + *new SgVarRefExp(*tid1) * *new SgVarRefExp(*itIdx));
leftExprs[idx_exprs] = &(*new SgVarRefExp((*it).symb));
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + *new SgVarRefExp(*tid1) * *new SgVarRefExp(*itIdx));
idx_exprs++;
it++;
itIdx++;
itS++;
}
else if (argsKer->nSymb.size() >= 3)
{
st = AssignStatement(*new SgVarRefExp((*it).symb), *new SgVarRefExp(*itS) + *new SgVarRefExp(*tid) * *new SgVarRefExp(*itIdx));
leftExprs[idx_exprs] = &(*new SgVarRefExp((*it).symb));
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + *new SgVarRefExp(*tid) * *new SgVarRefExp(*itIdx));
idx_exprs++;
it++;
itIdx++;
itS++;
st = AssignStatement(*new SgVarRefExp((*it).symb), *new SgVarRefExp(*itS) + *new SgVarRefExp(*tid1) * *new SgVarRefExp(*itIdx));
leftExprs[idx_exprs] = &(*new SgVarRefExp((*it).symb));
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + *new SgVarRefExp(*tid1) * *new SgVarRefExp(*itIdx));
idx_exprs++;
it++;
itIdx++;
itS++;
SgExpression *e_z1, *e_z2, *tmp_exp;
it_sizeV = argsKer->sizeVars.begin();
it_sizeV++;
it_sizeV++;
if (argsKer->nSymb.size() > 3)
{
SgFunctionCallExp *funCall = new SgFunctionCallExp(*createNewFunctionSymbol("mod"));
e_z1 = new SgVarRefExp(*it_sizeV);
funCall->addArg(*new SgVarRefExp(*tid2));
funCall->addArg(*e_z1);
tmp_exp = &(*new SgVarRefExp(*itS) + *funCall * *new SgVarRefExp(*itIdx));
st = AssignStatement(*new SgVarRefExp((*it).symb), *tmp_exp);
leftExprs[idx_exprs] = &(*new SgVarRefExp((*it).symb));
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + *funCall * *new SgVarRefExp(*itIdx));
idx_exprs++;
it++;
itS++;
itIdx++;
it_sizeV++;
e_z2 = new SgVarRefExp(*it_sizeV);
it_sizeV++;
for (unsigned i = 0; i < argsKer->nSymb.size() - 3; ++i, it++, itS++, itIdx++)
{
SgFunctionCallExp *funCall = new SgFunctionCallExp(*createNewFunctionSymbol("mod"));
if (i == argsKer->nSymb.size() - 4)
tmp_exp = &(*new SgVarRefExp(*itS) + ((*new SgVarRefExp(*tid2) / *e_z1)) * *new SgVarRefExp(*itIdx));
else
{
funCall->addArg((*new SgVarRefExp(*tid2) / *e_z1));
funCall->addArg(*e_z2);
tmp_exp = &(*new SgVarRefExp(*itS) + *funCall * *new SgVarRefExp(*itIdx));
}
st = AssignStatement(*new SgVarRefExp((*it).symb), *tmp_exp);
leftExprs[idx_exprs] = &(*new SgVarRefExp((*it).symb));
rightExprs[idx_exprs] = &(tmp_exp->copy());
idx_exprs++;
e_z1 = &(*e_z1 * *e_z2);
if (i != argsKer->nSymb.size() - 4)
{
e_z2 = new SgVarRefExp(*it_sizeV);
it_sizeV++;
}
}
}
else
{
st = AssignStatement(*new SgVarRefExp((*it).symb), *new SgVarRefExp(*itS) + *new SgVarRefExp(*tid2) * *new SgVarRefExp(*itIdx));
leftExprs[idx_exprs] = &(*new SgVarRefExp((*it).symb));
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + *new SgVarRefExp(*tid2) * *new SgVarRefExp(*itIdx));
idx_exprs++;
it++;
itIdx++;
itS++;
}
}
if (options.isOn(C_CUDA))
st = AssignStatement(new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[0])), &(rightExprs[0]->copy()));
else
st = AssignStatement(new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[0]) + *new SgValueExp(1)), &(rightExprs[0]->copy()));
// main IF
it_sizeV = argsKer->sizeVars.begin();
if (argsKer->nSymb.size() == 0)
if_st = new SgIfStmt(*new SgVarRefExp(*tid) < *new SgValueExp(1), *st);
else if (argsKer->nSymb.size() == 1)
if_st = new SgIfStmt(*new SgVarRefExp(*tid) < *new SgVarRefExp(*it_sizeV), *st);
else if (argsKer->nSymb.size() == 2)
{
SgSymbol *tmp = *it_sizeV;
it_sizeV++;
SgSymbol *tmp1 = *it_sizeV;
if_st = new SgIfStmt(*new SgVarRefExp(*tid) < *new SgVarRefExp(tmp) &&
*new SgVarRefExp(*tid1) < *new SgVarRefExp(tmp1), *st);
}
else if (argsKer->nSymb.size() >= 3)
{
SgSymbol *tmp = *it_sizeV;
it_sizeV++;
SgSymbol *tmp1 = *it_sizeV;
it_sizeV++;
SgExpression *if_mult = NULL;
for (unsigned i = 0; i < argsKer->nSymb.size() - 2; ++i)
{
if (i == 0)
if_mult = new SgVarRefExp(*it_sizeV);
else
if_mult = &((*if_mult) * *new SgVarRefExp(*it_sizeV));
it_sizeV++;
}
if_st = new SgIfStmt(*new SgVarRefExp(*tid) < *new SgVarRefExp(tmp) &&
*new SgVarRefExp(*tid1) < *new SgVarRefExp(tmp1) && *new SgVarRefExp(*tid2) < *if_mult, *st);
}
for (size_t i = 1; i < argsKer->baseIdxsInKer.size(); ++i)
{
if (options.isOn(C_CUDA))
st = AssignStatement(new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[i])), &(rightExprs[i]->copy()));
else
st = AssignStatement(new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[i]) + *new SgValueExp(1)), &(rightExprs[i]->copy()));
if_st->lastExecutable()->insertStmtAfter(*st);
}
for (size_t i = 0; i < argsKer->baseIdxsInKer.size(); ++i)
{
if (options.isOn(C_CUDA))
st = AssignStatement(&(leftExprs[i]->copy()), new SgArrayRefExp(*coords, *new SgValueExp((int)(i))));
else
st = AssignStatement(&(leftExprs[i]->copy()), new SgArrayRefExp(*coords, *new SgValueExp((int)(i + 1))));
if_st->lastExecutable()->insertStmtAfter(*st);
}
if (options.isOn(GPU_O0))
{
SgSymbol *cond_s = argsKer->cond_;
tmpvar1 = new SgSymbol(VARIABLE_NAME, "tmpV");
SgExprListExp *listAss = new SgExprListExp();
SgExprListExp *tmp = listAss;
listAss->setLhs(&SgAssignOp(leftExprs[0]->copy(), (*(&leftExprs[0]->copy())) + *new SgVarRefExp(argsKer->steps[0])));
for (size_t i = 1; i < argsKer->baseIdxsInKer.size(); ++i)
{
tmp->setRhs(new SgExprListExp());
tmp = (SgExprListExp*)tmp->rhs();
tmp->setLhs(&SgAssignOp(leftExprs[i]->copy(), (*(&leftExprs[i]->copy())) + *new SgVarRefExp(argsKer->steps[i])));
}
tmp->setRhs(new SgExprListExp());
tmp = (SgExprListExp*)tmp->rhs();
tmp->setLhs(&SgAssignOp(*new SgVarRefExp(tmpvar1), *new SgVarRefExp(tmpvar1) + *new SgValueExp(1)));
if (options.isOn(C_CUDA))
mainFor = new SgForStmt(&SgAssignOp(*new SgVarRefExp(tmpvar1), *new SgValueExp(1)), &(*new SgVarRefExp(tmpvar1) <= *new SgVarRefExp(*cond_s)), listAss, NULL);
else
mainFor = new SgForStmt(tmpvar1, &(rightExprs[0]->copy()), new SgVarRefExp(cond_s), new SgVarRefExp(*itIdxAcr), NULL);
if_st->lastExecutable()->insertStmtAfter(*mainFor);
}
cur_in_kernel->insertStmtAfter(*if_st, *kernel_st);
if (options.isOn(GPU_O0))
cur_in_kernel = mainFor->lastExecutable();
else
cur_in_kernel = if_st->lastExecutable();
if (!options.isOn(C_CUDA) && options.isOn(GPU_O0))
{
for (size_t i = 0; i < argsKer->baseIdxsInKer.size(); ++i)
mainFor->lastExecutable()->insertStmtAfter(*AssignStatement(*&leftExprs[i]->copy(), (*(&leftExprs[i]->copy())) + *new SgVarRefExp(argsKer->steps[i])), *mainFor);
}
delete []leftExprs;
delete []rightExprs;
}
else if (argsKer->symb.size() == 2) // body for 2 dependence
{
// attention!! adding to support all variants!!
if (argsKer->nSymb.size() != 0)
{
SgSymbol *tmp = tid1;
tid1 = tid;
tid = tmp;
}
SgExpression **leftExprs, **rightExprs;
int idx_exprs = 0;
int count_of_dims = argsKer->nSymb.size() + argsKer->symb.size();
leftExprs = new SgExpression*[count_of_dims];
rightExprs = new SgExpression*[count_of_dims];
vector<SageSymbols>::iterator itAcr = argsKer->symb.begin();
vector<SageSymbols>::iterator it = argsKer->nSymb.begin();
vector<SgSymbol*>::iterator itAcrS = argsKer->acrossS.begin();
vector<SgSymbol*>::iterator itS = argsKer->notAcrossS.begin();
vector<SgSymbol*>::iterator it_sizeV = argsKer->sizeVars.begin();
vector<SgSymbol*>::iterator itIdxAcr = argsKer->idxAcross.begin();
vector<SgSymbol*>::iterator itIdx = argsKer->idxNotAcross.begin();
e = &(*new SgVarRefExp(*itAcrS) - *new SgVarRefExp(*tid) * *new SgVarRefExp(*itIdxAcr));
st = AssignStatement(*new SgVarRefExp((*itAcr).symb), *e);
leftExprs[idx_exprs] = new SgVarRefExp((*itAcr).symb);
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itAcrS) - *new SgVarRefExp(*tid) * *new SgVarRefExp(*itIdxAcr));
idx_exprs++;
itAcr++;
itAcrS++;
itIdxAcr++;
e = &(*new SgVarRefExp(*itAcrS) + *new SgVarRefExp(*tid) * *new SgVarRefExp(*itIdxAcr));
st = AssignStatement(*new SgVarRefExp((*itAcr).symb), *e);
leftExprs[idx_exprs] = new SgVarRefExp((*itAcr).symb);
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itAcrS) + *new SgVarRefExp(*tid) * *new SgVarRefExp(*itIdxAcr));
idx_exprs++;
itAcr++;
itAcrS++;
itIdxAcr++;
if (argsKer->nSymb.size() == 1)
{
st = AssignStatement(*new SgVarRefExp((*it).symb), *new SgVarRefExp(*itS) + *new SgVarRefExp(*tid1) *
*new SgVarRefExp(*itIdx));
leftExprs[idx_exprs] = new SgVarRefExp((*it).symb);
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + *new SgVarRefExp(*tid1) * *new SgVarRefExp(*itIdx));
idx_exprs++;
}
else if (argsKer->nSymb.size() >= 2)
{
st = AssignStatement(*new SgVarRefExp((*it).symb), *new SgVarRefExp(*itS) + *new SgVarRefExp(*tid1) *
*new SgVarRefExp(*itIdx));
leftExprs[idx_exprs] = new SgVarRefExp((*it).symb);
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + *new SgVarRefExp(*tid1) * *new SgVarRefExp(*itIdx));
idx_exprs++;
it++;
itIdx++;
itS++;
SgExpression *e_z1, *e_z2, *tmp_exp;
it_sizeV = argsKer->sizeVars.begin();
it_sizeV++;
it_sizeV++;
if (argsKer->nSymb.size() > 2)
{
SgFunctionCallExp *funCall = new SgFunctionCallExp(*createNewFunctionSymbol("mod"));
e_z1 = new SgVarRefExp(*it_sizeV);
funCall->addArg(*new SgVarRefExp(*tid2));
funCall->addArg(*e_z1);
tmp_exp = &(*new SgVarRefExp(*itS) + *funCall * *new SgVarRefExp(*itIdx));
st = AssignStatement(*new SgVarRefExp((*it).symb), *tmp_exp);
leftExprs[idx_exprs] = new SgVarRefExp((*it).symb);
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + *funCall * *new SgVarRefExp(*itIdx));
idx_exprs++;
it++;
itS++;
itIdx++;
it_sizeV++;
e_z2 = new SgVarRefExp(*it_sizeV);
it_sizeV++;
for (; it != argsKer->nSymb.end(); it++, itS++, itIdx++)
{
SgFunctionCallExp *funCall = new SgFunctionCallExp(*createNewFunctionSymbol("mod"));
it++;
if (it == argsKer->nSymb.end())
{
tmp_exp = &(*new SgVarRefExp(*itS) + ((*new SgVarRefExp(*tid2) / *e_z1)) * *new SgVarRefExp(*itIdx));
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + ((*new SgVarRefExp(*tid2) / *e_z1)) * *new SgVarRefExp(*itIdx));
}
else
{
funCall->addArg((*new SgVarRefExp(*tid2) / *e_z1));
funCall->addArg(*e_z2);
tmp_exp = &(*new SgVarRefExp(*itS) + *funCall * *new SgVarRefExp(*itIdx));
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + *funCall * *new SgVarRefExp(*itIdx));
}
it--;
st = AssignStatement(*new SgVarRefExp((*it).symb), *tmp_exp);
leftExprs[idx_exprs] = new SgVarRefExp((*it).symb);
idx_exprs++;
e_z1 = &(*e_z1 * *e_z2);
it++;
if (it != argsKer->nSymb.end())
{
e_z2 = new SgVarRefExp(*it_sizeV);
it_sizeV++;
}
it--;
}
}
else
for (; it != argsKer->nSymb.end(); it++, itS++, itIdx++)
{
st = AssignStatement(*new SgVarRefExp((*it).symb), *new SgVarRefExp(*itS) + *new SgVarRefExp(*tid2) *
*new SgVarRefExp(*itIdx));
leftExprs[idx_exprs] = new SgVarRefExp((*it).symb);
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + *new SgVarRefExp(*tid2) * *new SgVarRefExp(*itIdx));
idx_exprs++;
}
}
if (options.isOn(C_CUDA))
st = AssignStatement(new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[0])), &(rightExprs[0]->copy()));
else
st = AssignStatement(new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[0]) + *new SgValueExp(1)), &(rightExprs[0]->copy()));
// main IF
it_sizeV = argsKer->sizeVars.begin();
if (argsKer->nSymb.size() == 0)
if_st = new SgIfStmt(*new SgVarRefExp(*tid) < *new SgVarRefExp(*it_sizeV), *st);
else if (argsKer->nSymb.size() == 1)
{
SgSymbol *tmp = *it_sizeV;
it_sizeV++;
if_st = new SgIfStmt(*new SgVarRefExp(*tid) < *new SgVarRefExp(tmp) &&
*new SgVarRefExp(*tid1) < *new SgVarRefExp(*it_sizeV), *st);
}
else if (argsKer->nSymb.size() >= 2)
{
SgExpression *tmp_exp;
SgSymbol *tmp = *it_sizeV;
it_sizeV++;
SgSymbol *tmp1 = *it_sizeV;
it_sizeV++;
tmp_exp = new SgVarRefExp(*it_sizeV);
it_sizeV++;
for (; it_sizeV != argsKer->sizeVars.end(); it_sizeV++)
tmp_exp = &((*tmp_exp) * *new SgVarRefExp(*it_sizeV));
if_st = new SgIfStmt(*new SgVarRefExp(*tid) < *new SgVarRefExp(tmp) &&
*new SgVarRefExp(*tid1) < *new SgVarRefExp(tmp1) &&
*new SgVarRefExp(*tid2) < *tmp_exp, *st);
}
for (size_t i = 1; i < argsKer->baseIdxsInKer.size(); ++i)
{
if (options.isOn(C_CUDA))
st = AssignStatement(new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[i])), &(rightExprs[i]->copy()));
else
st = AssignStatement(new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[i]) + *new SgValueExp(1)), &(rightExprs[i]->copy()));
if_st->lastExecutable()->insertStmtAfter(*st);
}
for (size_t i = 0; i < argsKer->baseIdxsInKer.size(); ++i)
{
if (options.isOn(C_CUDA))
st = AssignStatement(&(leftExprs[i]->copy()), new SgArrayRefExp(*coords, *new SgValueExp((int)(i))));
else
st = AssignStatement(&(leftExprs[i]->copy()), new SgArrayRefExp(*coords, *new SgValueExp((int)(i + 1))));
if_st->lastExecutable()->insertStmtAfter(*st);
}
cur_in_kernel->insertStmtAfter(*if_st, *kernel_st);
cur_in_kernel = if_st->lastExecutable();
delete[]leftExprs;
delete[]rightExprs;
}
else if (argsKer->symb.size() >= 3) // body for >3 dependence
{
// attention!! adding to support all variants!!<21><> <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
if (argsKer->nSymb.size() >= 1)
{
SgSymbol *tmp = tid2;
tid2 = tid;
tid = tmp;
}
SgStatement *st, *st1;
SgSymbol *max_z, *se, *emax, *emin, *v1, *v2, *v3, *min_ij, *swap_ij, *i, *j;
SgSymbol **num_elems;
SgIfStmt *if_st3;
vector<SageSymbols>::iterator itAcr = argsKer->symb.begin();
vector<SageSymbols>::iterator it = argsKer->nSymb.begin();
vector<SgSymbol*>::iterator itAcrS = argsKer->acrossS.begin();
vector<SgSymbol*>::iterator itS = argsKer->notAcrossS.begin();
vector<SgSymbol*>::iterator it_sizeV = argsKer->sizeVars.begin();
vector<SgSymbol*>::iterator itIdxAcr = argsKer->idxAcross.begin();
vector<SgSymbol*>::iterator itIdx = argsKer->idxNotAcross.begin();
SgExpression **leftExprs, **rightExprs;
int idx_exprs = 0;
int count_of_dims = argsKer->nSymb.size() + argsKer->symb.size();
leftExprs = new SgExpression*[count_of_dims];
rightExprs = new SgExpression*[count_of_dims];
num_elems = new SgSymbol*[argsKer->nSymb.size()];
max_z = *it_sizeV;
it_sizeV++;
se = *it_sizeV;
it_sizeV++;
v1 = *it_sizeV;
it_sizeV++;
v2 = *it_sizeV;
it_sizeV++;
v3 = *it_sizeV;
it_sizeV++;
emax = *it_sizeV;
it_sizeV++;
emin = *it_sizeV;
it_sizeV++;
min_ij = *it_sizeV;
it_sizeV++;
swap_ij = *it_sizeV;
it_sizeV++;
for (size_t i = 0; i < argsKer->nSymb.size(); ++i)
{
num_elems[i] = *it_sizeV;
it_sizeV++;
}
e = &(*new SgVarRefExp(*itAcrS) - *new SgVarRefExp(*tid) * *new SgVarRefExp(*itIdxAcr));
st = AssignStatement(*new SgVarRefExp(*itAcrS), *new SgVarRefExp(*itAcrS) - *new SgVarRefExp(*itIdxAcr) *
(*new SgVarRefExp(*se) + *new SgVarRefExp(*tid1) - *new SgVarRefExp(*emin)));
itAcrS++;
itIdxAcr++;
st1 = AssignStatement(*new SgVarRefExp(*itAcrS), *new SgVarRefExp(*itAcrS) + *new SgVarRefExp(*itIdxAcr) *
(*new SgVarRefExp(*se) + *new SgVarRefExp(*tid1) - *new SgVarRefExp(*emin)));
if_st2 = new SgIfStmt(SgEqOp(*new SgVarRefExp(*v3), *new SgValueExp(1)) && *new SgVarRefExp(emin) < *new SgVarRefExp(tid1) + *new SgVarRefExp(se), *st1);
if_st2->insertStmtAfter(*st);
SgFunctionCallExp *funcCall = new SgFunctionCallExp(*createNewFunctionSymbol("min"));
funcCall->addArg(*new SgVarRefExp(*se) + *new SgVarRefExp(*tid1));
itAcrS--;
itIdxAcr--;
if_st = new SgIfStmt(*new SgVarRefExp(*tid) < *new SgVarRefExp((*itAcr).symb), *if_st2);
if (argsKer->nSymb.size() == 0)
if_st3 = new SgIfStmt(*new SgVarRefExp(*tid1) < *new SgVarRefExp(*max_z), *if_st);
else
{
SgExpression *tmp = new SgVarRefExp(num_elems[0]);
for (size_t i = 1; i < argsKer->nSymb.size(); ++i)
tmp = &(*tmp * *new SgVarRefExp(num_elems[i]));
if_st3 = new SgIfStmt(*new SgVarRefExp(*tid1) < *new SgVarRefExp(*max_z) && *new SgVarRefExp(*tid2) < *tmp, *if_st);
}
cur_in_kernel->insertStmtAfter(*if_st3, *kernel_st);
cur_in_kernel = if_st->lexNext();
st1 = AssignStatement(*new SgVarRefExp((*itAcr).symb), *new SgVarRefExp(*min_ij));
st = AssignStatement(*new SgVarRefExp((*itAcr).symb), *new SgValueExp(2) * *new SgVarRefExp(*min_ij) - *new SgVarRefExp(se) -
*new SgVarRefExp(tid1) + *new SgVarRefExp(emax) - *new SgVarRefExp(emin) - *new SgValueExp(1));
if_st1 = new SgIfStmt(*new SgVarRefExp(*tid1) + *new SgVarRefExp(se) < *new SgVarRefExp(*emax), *st1, *st);
st1 = AssignStatement(*new SgVarRefExp((*itAcr).symb), *new SgVarRefExp(*tid1) + *new SgVarRefExp(se));
if_st1 = new SgIfStmt(*new SgVarRefExp(*tid1) + *new SgVarRefExp(se) < *new SgVarRefExp(*emin), *st1, *if_st1);
if_st3->insertStmtAfter(*if_st1);
i = (*itAcr).symb;
st1 = AssignStatement(*new SgVarRefExp((*itAcr).symb), *new SgVarRefExp(*itAcrS) + ((*new SgVarRefExp(tid1) *
(*new SgVarRefExp(v1) + *new SgVarRefExp(v3)) - *new SgVarRefExp(tid))) * *new SgVarRefExp(*itIdxAcr));
leftExprs[idx_exprs] = new SgVarRefExp((*itAcr).symb);
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itAcrS) + ((*new SgVarRefExp(tid1) *
(*new SgVarRefExp(v1) + *new SgVarRefExp(v3)) - *new SgVarRefExp(tid))) * *new SgVarRefExp(*itIdxAcr));
idx_exprs++;
itAcrS++;
itIdxAcr++;
itAcr++;
j = (*itAcr).symb;
st1 = AssignStatement(*new SgVarRefExp((*itAcr).symb), *new SgVarRefExp(*itAcrS) + (*new SgVarRefExp(tid1) *
*new SgVarRefExp(v2) + *new SgVarRefExp(tid)) * *new SgVarRefExp(*itIdxAcr));
leftExprs[idx_exprs] = new SgVarRefExp((*itAcr).symb);
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itAcrS) + (*new SgVarRefExp(tid1) *
*new SgVarRefExp(v2) + *new SgVarRefExp(tid)) * *new SgVarRefExp(*itIdxAcr));
idx_exprs++;
itAcrS++;
itIdxAcr++;
itAcr++;
st1 = AssignStatement(*new SgVarRefExp((*itAcr).symb), *new SgVarRefExp(*itAcrS) - *new SgVarRefExp(tid1) *
*new SgVarRefExp(*itIdxAcr));
leftExprs[idx_exprs] = new SgVarRefExp((*itAcr).symb);
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itAcrS) - *new SgVarRefExp(tid1) * *new SgVarRefExp(*itIdxAcr));
idx_exprs++;
if (argsKer->symb.size() > 3)
{
for (size_t i = 0; i < argsKer->symb.size() - 3; ++i)
{
itAcrS++;
itIdxAcr++;
itAcr++;
leftExprs[idx_exprs] = new SgVarRefExp((*itAcr).symb);
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itAcrS));
idx_exprs++;
}
}
if (argsKer->nSymb.size() == 1)
{
st1 = AssignStatement(*new SgVarRefExp((*it).symb), *new SgVarRefExp(*itS) + *new SgVarRefExp(tid2) *
*new SgVarRefExp(*itIdx));
leftExprs[idx_exprs] = new SgVarRefExp((*it).symb);
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + *new SgVarRefExp(tid2) * *new SgVarRefExp(*itIdx));
idx_exprs++;
}
else if (argsKer->nSymb.size() > 1)
{
SgExpression *e_z1, *e_z2, *tmp_exp;
SgFunctionCallExp *funCall = new SgFunctionCallExp(*createNewFunctionSymbol("mod"));
e_z1 = new SgVarRefExp(num_elems[0]);
funCall->addArg(*new SgVarRefExp(*tid2));
funCall->addArg(*e_z1);
tmp_exp = &(*new SgVarRefExp(*itS) + *funCall * *new SgVarRefExp(*itIdx));
st = AssignStatement(*new SgVarRefExp((*it).symb), *tmp_exp);
leftExprs[idx_exprs] = new SgVarRefExp((*it).symb);
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + *funCall * *new SgVarRefExp(*itIdx));
idx_exprs++;
it++;
itS++;
itIdx++;
e_z2 = new SgVarRefExp(num_elems[1]);
for (int count = 2; it != argsKer->nSymb.end(); it++, itS++, itIdx++, ++count)
{
SgFunctionCallExp *funCall = new SgFunctionCallExp(*createNewFunctionSymbol("mod"));
it++;
if (it == argsKer->nSymb.end())
{
tmp_exp = &(*new SgVarRefExp(*itS) + ((*new SgVarRefExp(*tid2) / *e_z1)) * *new SgVarRefExp(*itIdx));
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + ((*new SgVarRefExp(*tid2) / *e_z1)) * *new SgVarRefExp(*itIdx));
}
else
{
funCall->addArg((*new SgVarRefExp(*tid2) / *e_z1));
funCall->addArg(*e_z2);
tmp_exp = &(*new SgVarRefExp(*itS) + *funCall * *new SgVarRefExp(*itIdx));
rightExprs[idx_exprs] = &(*new SgVarRefExp(*itS) + *funCall * *new SgVarRefExp(*itIdx));
}
it--;
st = AssignStatement(*new SgVarRefExp((*it).symb), *tmp_exp);
leftExprs[idx_exprs] = new SgVarRefExp((*it).symb);
idx_exprs++;
e_z1 = &(*e_z1 * *e_z2);
it++;
if (it != argsKer->nSymb.end())
{
e_z2 = new SgVarRefExp(num_elems[count]);
}
it--;
}
}
if (options.isOn(C_CUDA))
st = AssignStatement(new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[0])), &(rightExprs[0]->copy()));
else
st = AssignStatement(new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[0]) + *new SgValueExp(1)), &(rightExprs[0]->copy()));
// insert into MAIN If
if_st->lastExecutable()->insertStmtAfter(*st);
for (size_t i = 1; i < argsKer->baseIdxsInKer.size(); ++i)
{
if (options.isOn(C_CUDA))
st = AssignStatement(new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[i])), &(rightExprs[i]->copy()));
else
st = AssignStatement(new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[i]) + *new SgValueExp(1)), &(rightExprs[i]->copy()));
if_st->lastExecutable()->insertStmtAfter(*st);
}
//insert swap block
if (options.isOn(C_CUDA))
{
SgExpression *firstElem = new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[0]));
SgExpression *secondElem = new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[1]));
if_st2 = new SgIfStmt(*new SgVarRefExp(swap_ij) * *new SgVarRefExp(v3), *new SgCExpStmt(*firstElem ^= *secondElem ^= *firstElem ^= *secondElem));
}
else
{
st1 = AssignStatement(new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[0]) + *new SgValueExp(1)), new SgVarRefExp(v3));
if_st2 = new SgIfStmt(*new SgVarRefExp(swap_ij) * *new SgVarRefExp(v3), *st1);
st1 = AssignStatement(new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[1]) + *new SgValueExp(1)),
new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[0]) + *new SgValueExp(1)));
if_st2->insertStmtAfter(*st1);
st1 = AssignStatement(new SgVarRefExp(v3), new SgArrayRefExp(*coords, *new SgVarRefExp(argsKer->baseIdxsInKer[1]) + *new SgValueExp(1)));
if_st2->insertStmtAfter(*st1);
}
if_st->lastExecutable()->insertStmtAfter(*if_st2);
for (size_t i = 0; i < argsKer->baseIdxsInKer.size(); ++i)
{
if (options.isOn(C_CUDA))
st = AssignStatement(&(leftExprs[i]->copy()), new SgArrayRefExp(*coords, *new SgValueExp((int)(i))));
else
st = AssignStatement(&(leftExprs[i]->copy()), new SgArrayRefExp(*coords, *new SgValueExp((int)(i + 1))));
if_st->lastExecutable()->insertStmtAfter(*st);
}
delete[]leftExprs;
delete[]rightExprs;
cur_in_kernel = if_st->lastExecutable();
}
// generating assign statements for MAXLOC, MINLOC reduction operations
if (red_list)
Do_Assign_For_Loc_Arrays();
// inserting loop body to innermost IF statement of BlockForCalculationThreadLoopVariables
#if debugMode
mywarn(" end: block5");
mywarn("strat: inserting loop body");
#endif
SgStatement *currStForInsetGetXY = cur_in_kernel;
vector<SgSymbol*> forDeclarationInKernel;
set<char *> uniqueNames;
// create, insert, optimize and convert loop_body into kernel
{
SgStatement *stk, *last;
vector<newInfo> allNewInfo;
if (argsKer->symb.size() == 1)
{
if (options.isOn(GPU_O0))
optimizeLoopBodyForOne(allNewInfo);
oneCase = true;
}
else
oneCase = false;
block = CreateIfForRedBlack(loop_body, nloop);
last = cur_in_kernel->lexNext();
if (argsKer->symb.size() == 1 && allNewInfo.size() != 0 && options.isOn(GPU_O0)) //insert needed assigns
{
SgIfStmt *ifSt = new SgIfStmt(*new SgVarRefExp(argsKer->idxAcross[0]) > *new SgValueExp(0), *&allNewInfo[0].loadsBeforePlus[0]->copy(), *&allNewInfo[0].loadsBeforeMinus[0]->copy());
for (size_t i = 0; i < allNewInfo.size(); ++i)
{
if (i == 0)
{
for (size_t k = 1; k < allNewInfo[i].loadsBeforePlus.size(); ++k)
{
ifSt->insertStmtAfter(*&allNewInfo[i].loadsBeforePlus[k]->copy(), *ifSt);
ifSt->falseBody()->insertStmtBefore(*&allNewInfo[i].loadsBeforeMinus[k]->copy(), *ifSt);
}
}
else
{
for (size_t k = 0; k < allNewInfo[i].loadsBeforePlus.size(); ++k)
{
ifSt->insertStmtAfter(*&allNewInfo[i].loadsBeforePlus[k]->copy(), *ifSt);
ifSt->falseBody()->insertStmtBefore(*&allNewInfo[i].loadsBeforeMinus[k]->copy(), *ifSt);
}
}
}
mainFor->insertStmtBefore(*ifSt);
}
if (argsKer->symb.size() == 1 && options.isOn(GPU_O0))
cur_in_kernel->insertStmtAfter(*block, *mainFor); //cur_in_kernel is innermost FOR stmt
else
cur_in_kernel->insertStmtAfter(*block, *if_st); //cur_in_kernel is innermost IF statement
if (options.isOn(C_CUDA))
{
if (block->comments() == NULL)
block->addComment("// Loop body");
}
else
block->addComment("! Loop body\n");
// correct copy of loop_body (change or extract last statement of block if it is CONTROL_END)
if (block != loop_body)
stk = last->lexPrev()->lexPrev();
else
stk = last->lexPrev();
if (stk->variant() == CONTROL_END)
{
if (stk->hasLabel() || stk == loop_body) // when body of DO_ENDDO loop is empty, stk == loop_body
stk->setVariant(CONT_STAT);
else
{
st = stk->lexPrev();
stk->extractStmt();
stk = st;
}
}
ReplaceExitCycleGoto(block, stk);
for_kernel = 1;
last = cur_st;
if (argsKer->symb.size() == 1 && allNewInfo.size() != 0 && options.isOn(GPU_O0)) //insert needed assigns
{
SgIfStmt *ifSt = new SgIfStmt(*new SgVarRefExp(argsKer->idxAcross[0]) > *new SgValueExp(0), *&allNewInfo[0].loadsInForPlus[0]->copy(), *&allNewInfo[0].loadsInForMinus[0]->copy());
for (size_t i = 0; i < allNewInfo.size(); ++i)
{
size_t k;
if (i == 0)
k = 1;
else
k = 0;
for (; k < allNewInfo[i].loadsInForPlus.size(); ++k)
{
ifSt->insertStmtAfter(*&allNewInfo[i].loadsInForPlus[k]->copy(), *ifSt);
ifSt->falseBody()->insertStmtBefore(*&allNewInfo[i].loadsInForMinus[k]->copy(), *ifSt);
}
}
mainFor->insertStmtAfter(*ifSt);
for (size_t i = 0; i < allNewInfo.size(); ++i)
{
if (options.isOn(C_CUDA))
{
for (size_t k = 0; k < allNewInfo[i].stores.size(); ++k)
mainFor->lastExecutable()->insertStmtAfter(*&allNewInfo[i].stores[k]->copy());
}
else
{
for (size_t k = 0; k < allNewInfo[i].stores.size(); ++k)
mainFor->lastExecutable()->lexPrev()->lexPrev()->insertStmtBefore(*&allNewInfo[i].stores[k]->copy());
}
}
size_t k = allNewInfo[0].swapsUp.size() - 1;
ifSt = new SgIfStmt(*new SgVarRefExp(argsKer->idxAcross[0]) > *new SgValueExp(0), *&allNewInfo[0].swapsDown[k]->copy(), *&allNewInfo[0].swapsUp[k]->copy());
for (size_t i = 0; i < allNewInfo.size(); ++i)
{
size_t last;
if (i == 0)
last = allNewInfo[i].swapsUp.size() - 1;
else
last = allNewInfo[0].swapsUp.size();
for (size_t k = 0; k < last; ++k)
{
ifSt->insertStmtAfter(*&allNewInfo[i].swapsDown[last - 1 - k]->copy(), *ifSt);
ifSt->falseBody()->insertStmtBefore(*&allNewInfo[i].swapsUp[last - 1 - k]->copy(), *ifSt);
}
}
mainFor->lastExecutable()->insertStmtAfter(*ifSt);
}
// insert dvmh_convert_XY calls directly into loop_body if some array accesses depend on its definitions (inserting right before accesses)
if (options.isOn(AUTO_TFM))
{
if (acrossNum != 1)
{
map<SgSymbol*, Array*>& arrays = currentLoop->getArrays();
string funcDvmhConvXYname_type = funcDvmhConvXYname;
if (!options.isOn(C_CUDA))
{
if (strcmp(idxTypeInKernel->symbol()->identifier(), indexTypeInKernel(rt_INT)->symbol()->identifier()) == 0)
funcDvmhConvXYname_type += "_int";
else if (strcmp(idxTypeInKernel->symbol()->identifier(), indexTypeInKernel(rt_LONG)->symbol()->identifier()) == 0)
funcDvmhConvXYname_type += "_long";
else if (strcmp(idxTypeInKernel->symbol()->identifier(), indexTypeInKernel(rt_LLONG)->symbol()->identifier()) == 0)
funcDvmhConvXYname_type += "_llong";
}
for (map<SgSymbol*, Array*>::iterator it = arrays.begin(); it != arrays.end(); ++it)
{
Array* array = it->second;
set<SgSymbol*>& privateList = currentLoop->getPrivateList();
if (privateList.find(it->first) == privateList.end())
{
for (map<string, Access*>::iterator it2 = array->getAccesses().begin(); it2 != array->getAccesses().end(); ++it2)
analyzeArrayIndxs(array->getSymbol(), it2->second->getSubscripts());
int numSymb = 0;
for (size_t i1 = 0; i1 < argsKer->arrayNames.size(); ++i1)
if (strcmp(argsKer->arrayNames[i1], array->getSymbol()->identifier()) == 0)
{
numSymb = (int)i1;
break;
}
array->generateAssigns(
new SgVarRefExp(argsKer->otherVars[8 * numSymb + 1]),
new SgVarRefExp(argsKer->otherVars[8 * numSymb + 4]),
new SgVarRefExp(argsKer->otherVars[8 * numSymb + 2]),
new SgVarRefExp(argsKer->otherVars[8 * numSymb + 5]),
new SgVarRefExp(argsKer->otherVars[8 * numSymb + 6]));
SgIfStmt* ifSt = NULL, *if1case = NULL, *if2case = NULL;
TfmInfo& tfmInfo = array->getTfmInfo();
map<SgStatement*, vector<SgFunctionCallExp*> >& ifCalls = tfmInfo.ifCalls;
map<SgStatement*, vector<SgFunctionCallExp*> >& elseCalls = tfmInfo.elseCalls;
SgSymbol* x_axis = argsKer->otherVars[8 * numSymb];
SgSymbol* y_axis = argsKer->otherVars[8 * numSymb + 3];
int tfsDim1 = tfmInfo.transformDims[0];
int tfsDim2 = tfmInfo.transformDims[1];
for (map<SgStatement*, vector<SgFunctionCallExp*> >::iterator it = ifCalls.begin(); it != ifCalls.end(); ++it)
{
if (it->first == NULL)
continue;
if (ifCalls[it->first].size() > 0)
{
if (options.isOn(C_CUDA))
{
if2case = new SgIfStmt((SgEqOp(*new SgVarRefExp(x_axis->copy()), *new SgValueExp(tfsDim2)) && SgEqOp(*new SgVarRefExp(y_axis->copy()), *new SgValueExp(tfsDim1))), *new SgCExpStmt(*(elseCalls[it->first][0])));
if1case = new SgIfStmt((SgEqOp(*new SgVarRefExp(x_axis->copy()), *new SgValueExp(tfsDim1)) && SgEqOp(*new SgVarRefExp(y_axis->copy()), *new SgValueExp(tfsDim2))), *new SgCExpStmt(*(ifCalls[it->first][0])), *if2case);
ifSt = new SgIfStmt(SgEqOp(*new SgVarRefExp(argsKer->otherVars[8 * numSymb + 7]), *new SgValueExp(2)), *if1case);
}
else
{
if2case = new SgIfStmt((SgEqOp(*new SgVarRefExp(x_axis->copy()), *new SgValueExp(tfsDim2)) && SgEqOp(*new SgVarRefExp(y_axis->copy()), *new SgValueExp(tfsDim1))),
*new SgCallStmt(*createNewFunctionSymbol(funcDvmhConvXYname_type.c_str()), *(elseCalls[it->first][0]->args())));
if1case = new SgIfStmt((SgEqOp(*new SgVarRefExp(x_axis->copy()), *new SgValueExp(tfsDim1)) && SgEqOp(*new SgVarRefExp(y_axis->copy()), *new SgValueExp(tfsDim2))),
*new SgCallStmt(*createNewFunctionSymbol(funcDvmhConvXYname_type.c_str()), *(ifCalls[it->first][0]->args())), *if2case);
ifSt = new SgIfStmt(SgEqOp(*new SgVarRefExp(argsKer->otherVars[8 * numSymb + 7]), *new SgValueExp(2)), *if1case);
}
}
for (size_t k = 1; k < ifCalls[it->first].size(); ++k)
{
if (options.isOn(C_CUDA))
{
if1case->insertStmtAfter(*new SgCExpStmt(*(ifCalls[it->first][k])));
if2case->insertStmtAfter(*new SgCExpStmt(*(elseCalls[it->first][k])));
}
else
{
if1case->insertStmtAfter(*new SgCallStmt(*createNewFunctionSymbol(funcDvmhConvXYname_type.c_str()), *(ifCalls[it->first][k]->args())));
if2case->insertStmtAfter(*new SgCallStmt(*createNewFunctionSymbol(funcDvmhConvXYname_type.c_str()), *(elseCalls[it->first][k]->args())));
}
}
if (ifSt != NULL)
{
if (loop_body == it->first)
loop_body->insertStmtBefore(*ifSt);
else
{
for (SgStatement* stmt = loop_body; stmt != NULL; stmt = stmt->lexNext())
{
if (stmt->lexNext() == it->first)
{
stmt->insertStmtAfter(*ifSt);
break;
}
}
}
}
ifSt = NULL;
}
}
}
}
}
TranslateBlock(if_st);
if (options.isOn(C_CUDA))
{
//get info of arrays in private and locvar lists
swapDimentionsInprivateList();
if (argsKer->symb.size() == 1 && options.isOn(GPU_O0))
{
Translate_Fortran_To_C(mainFor->lexPrev()->controlParent());
Translate_Fortran_To_C(mainFor, mainFor->lastNodeOfStmt(), copyOfBody, 0); //countOfCopies
}
else
Translate_Fortran_To_C(if_st, if_st->lastNodeOfStmt(), copyOfBody, 0); // countOfCopies
}
cur_st = last;
if (createBodyKernel == false)
createBodyKernel = true;
}
//insert dvmh_convert_XY before loop_body if its arguments depend only on loop indices
if (options.isOn(AUTO_TFM))
{
#if debugMode
mywarn("strat: inserting transform calls");
#endif
if (acrossNum != 1)
{
map<SgSymbol*, Array*>& arrays = currentLoop->getArrays();
string funcDvmhConvXYname_type = funcDvmhConvXYname;
if (!options.isOn(C_CUDA))
{
if (strcmp(idxTypeInKernel->symbol()->identifier(), indexTypeInKernel(rt_INT)->symbol()->identifier()) == 0)
funcDvmhConvXYname_type += "_int";
else if (strcmp(idxTypeInKernel->symbol()->identifier(), indexTypeInKernel(rt_LONG)->symbol()->identifier()) == 0)
funcDvmhConvXYname_type += "_long";
else if (strcmp(idxTypeInKernel->symbol()->identifier(), indexTypeInKernel(rt_LLONG)->symbol()->identifier()) == 0)
funcDvmhConvXYname_type += "_llong";
}
for (map<SgSymbol*, Array*>::iterator it = arrays.begin(); it != arrays.end(); ++it)
{
Array *array = it->second;
set<SgSymbol*>& privateList = currentLoop->getPrivateList();
if (privateList.find(it->first) == privateList.end())
{
int numSymb = 0;
for (size_t i1 = 0; i1 < argsKer->arrayNames.size(); ++i1)
if (strcmp(argsKer->arrayNames[i1], array->getSymbol()->identifier()) == 0)
{
numSymb = (int)i1;
break;
}
SgIfStmt* ifSt = NULL, *if1case = NULL, *if2case = NULL;
TfmInfo& tfmInfo = array->getTfmInfo();
vector<SgFunctionCallExp*>& ifCalls = tfmInfo.ifCalls[NULL];
vector<SgFunctionCallExp*>& elseCalls = tfmInfo.elseCalls[NULL];
SgSymbol* x_axis = argsKer->otherVars[8 * numSymb];
SgSymbol* y_axis = argsKer->otherVars[8 * numSymb + 3];
int tfsDim1 = tfmInfo.transformDims[0];
int tfsDim2 = tfmInfo.transformDims[1];
if (ifCalls.size() > 0)
if (options.isOn(C_CUDA))
{
if2case = new SgIfStmt((SgEqOp(*new SgVarRefExp(x_axis->copy()), *new SgValueExp(tfsDim2)) && SgEqOp(*new SgVarRefExp(y_axis->copy()), *new SgValueExp(tfsDim1))), *new SgCExpStmt(*(elseCalls[0])));
if1case = new SgIfStmt((SgEqOp(*new SgVarRefExp(x_axis->copy()), *new SgValueExp(tfsDim1)) && SgEqOp(*new SgVarRefExp(y_axis->copy()), *new SgValueExp(tfsDim2))), *new SgCExpStmt(*(ifCalls[0])), *if2case);
ifSt = new SgIfStmt(SgEqOp(*new SgVarRefExp(argsKer->otherVars[8 * numSymb + 7]), *new SgValueExp(2)), *if1case);
}
else
{
if2case = new SgIfStmt((SgEqOp(*new SgVarRefExp(x_axis->copy()), *new SgValueExp(tfsDim2)) && SgEqOp(*new SgVarRefExp(y_axis->copy()), *new SgValueExp(tfsDim1))),
*new SgCallStmt(*createNewFunctionSymbol(funcDvmhConvXYname_type.c_str()), *(elseCalls[0]->args())));
if1case = new SgIfStmt((SgEqOp(*new SgVarRefExp(x_axis->copy()), *new SgValueExp(tfsDim1)) && SgEqOp(*new SgVarRefExp(y_axis->copy()), *new SgValueExp(tfsDim2))),
*new SgCallStmt(*createNewFunctionSymbol(funcDvmhConvXYname_type.c_str()), *(ifCalls[0]->args())), *if2case);
ifSt = new SgIfStmt(SgEqOp(*new SgVarRefExp(argsKer->otherVars[8 * numSymb + 7]), *new SgValueExp(2)), *if1case);
}
for (size_t k = 1; k < ifCalls.size(); ++k)
{
if (options.isOn(C_CUDA))
{
if1case->insertStmtAfter(*new SgCExpStmt(*(ifCalls[k])));
if2case->insertStmtAfter(*new SgCExpStmt(*(elseCalls[k])));
}
else
{
if1case->insertStmtAfter(*new SgCallStmt(*createNewFunctionSymbol(funcDvmhConvXYname_type.c_str()), *(ifCalls[k]->args())));
if2case->insertStmtAfter(*new SgCallStmt(*createNewFunctionSymbol(funcDvmhConvXYname_type.c_str()), *(elseCalls[k]->args())));
}
}
if (ifSt != NULL)
currStForInsetGetXY->insertStmtAfter(*ifSt);
vector<SgStatement*>& zeroSt = tfmInfo.zeroSt;
for (size_t k = 0; k < zeroSt.size(); ++k)
currStForInsetGetXY->insertStmtAfter(zeroSt[k]->copy());
vector<SgSymbol*>& coef = tfmInfo.coefficients;
for (unsigned z = 0; z < coef.size(); ++z)
forDeclarationInKernel.push_back(&(coef[z]->copy()));
}
}
}
#if debugMode
mywarn("end: inserting transform calls");
#endif
}
#if debugMode
mywarn(" end: inserting loop body");
mywarn("start: create reduction block");
#endif
if (red_list && argsKer->nSymb.size() == 0)
{
int num;
reduction_operation_list *tmp_list = red_struct_list;
int needComment = 1;
SgSymbol* overAll = OverallBlocksSymbol();
SgSymbol* freeS = *argsKer->acrossS.begin();
for (SgExpression *er = red_list; er; er = er->rhs())
{
num = 0;
int flag_func_call = 1;
SgExpression *red_expr_ref = er->lhs()->rhs(); // reduction variable reference
SgExpression *loc_var_ref = NULL, *en = NULL;
int loc_el_num = 0;
if (isSgExprListExp(red_expr_ref))
{
red_expr_ref = red_expr_ref->lhs(); // reduction variable reference
loc_var_ref = er->lhs()->rhs()->rhs()->lhs(); //location array reference
en = er->lhs()->rhs()->rhs()->rhs()->lhs(); // number of elements in location array
loc_el_num = LocElemNumber(en);
}
num = RedFuncNumber(er->lhs()->lhs()); // type of reduction
const char *str_operation = NULL;
if (num == 1)
flag_func_call = 0; // +
else if (num == 2)
flag_func_call = 0; // *
else if (num == 3)
str_operation = "max";
else if (num == 4)
str_operation = "min";
else if (num == 5)
flag_func_call = 0; // and
else if (num == 6)
flag_func_call = 0; // or
else if (num == 7)
flag_func_call = 0; // !=
else if (num == 8)
flag_func_call = 0; // ==
else if (num == 9)
flag_func_call = 0; // maxloc
else if (num == 10)
flag_func_call = 0; // minloc
if (flag_func_call == 1)
{
SgFunctionCallExp *funcCall = new SgFunctionCallExp(*createNewFunctionSymbol(str_operation));
if (argsKer->symb.size() < 3)
{
SgSymbol *redGrid = new SgSymbol(VARIABLE_NAME, tmp_list->red_grid->identifier());
redGrid->setType(*new SgArrayType(*tmp_list->red_grid->type()));
if (tmp_list->redvar_size == 0)
{
funcCall->addArg(*new SgArrayRefExp(*redGrid, *new SgVarRefExp(*tid)));
funcCall->addArg(*new SgVarRefExp(*red_expr_ref->symbol()));
st = AssignStatement(*new SgArrayRefExp(*redGrid, *new SgVarRefExp(*tid)), *funcCall);
}
else if (tmp_list->redvar_size > 0 && options.isOn(C_CUDA)) //TODO for Fortran
{
SgExpression* idx = &(*new SgVarRefExp(freeS) * *new SgVarRefExp(overAll) + *new SgVarRefExp(*tid));
funcCall->addArg(*new SgArrayRefExp(*redGrid, *idx));
funcCall->addArg(*new SgArrayRefExp(*red_expr_ref->symbol(), *new SgVarRefExp(freeS)));
SgExpression* start = new SgExpression(ASSGN_OP, new SgVarRefExp(freeS), new SgValueExp(0));
SgExpression* end = &(*new SgVarRefExp(freeS) < *new SgValueExp(tmp_list->redvar_size));
SgExpression* step = new SgExpression(ASSGN_OP, new SgVarRefExp(freeS), &(*new SgVarRefExp(freeS) + *new SgValueExp(1)));
st = new SgForStmt(start, end, step, AssignStatement(*new SgArrayRefExp(*redGrid, *idx), *funcCall));
}
else
{
//TODO
}
}
else
{
SgSymbol *redGrid = new SgSymbol(VARIABLE_NAME, tmp_list->red_grid->identifier());
redGrid->setType(*new SgArrayType(*tmp_list->red_grid->type()));
SgSymbol *emin = argsKer->sizeVars[6];
funcCall->addArg(*new SgArrayRefExp(*redGrid, *new SgVarRefExp(*tid) + *new SgVarRefExp(*tid1) * *new SgVarRefExp(*emin)));
funcCall->addArg(*new SgVarRefExp(red_expr_ref->symbol()));
st = AssignStatement(*new SgArrayRefExp(*redGrid, *new SgVarRefExp(*tid) + *new SgVarRefExp(*tid1) * *new SgVarRefExp(*emin)), *funcCall);
}
}
else
{
SgExpression *e1 = NULL;
if (argsKer->symb.size() < 3)
{
if (tmp_list->redvar_size == 0)
e1 = new SgVarRefExp(*tid);
else if (tmp_list->redvar_size > 0)
e1 = &(*new SgVarRefExp(freeS) * *new SgVarRefExp(overAll) + *new SgVarRefExp(*tid));
else
{
//TODO
}
}
else
{
SgSymbol *emin = argsKer->sizeVars[6];
e1 = &(*new SgVarRefExp(*tid) + *new SgVarRefExp(*tid1) * *new SgVarRefExp(*emin));
}
e = NULL;
SgIfStmt *ifSt = NULL;
SgSymbol *redGrid = new SgSymbol(VARIABLE_NAME, tmp_list->red_grid->identifier());
redGrid->setType(*new SgArrayType(*tmp_list->red_grid->type()));
SgExpression* red_ref = NULL;
if (tmp_list->redvar_size == 0)
red_ref = &red_expr_ref->copy();
else // TODO
red_ref = new SgArrayRefExp(*red_expr_ref->symbol(), *new SgVarRefExp(freeS));
if (num == 1)
e = &(*new SgArrayRefExp(*redGrid, *e1) + *red_ref);
else if (num == 2)
e = &(*new SgArrayRefExp(*redGrid, *e1) * *red_ref);
else if (num == 5)
e = &(*new SgArrayRefExp(*redGrid, *e1) && *red_ref);
else if (num == 6)
e = &(*new SgArrayRefExp(*redGrid, *e1) || *red_ref);
else if (num == 7)
e = &SgNeqOp(*new SgArrayRefExp(*redGrid, *e1), *red_ref);
else if (num == 8)
e = &SgEqOp(*new SgArrayRefExp(*redGrid, *e1), *red_ref);
else if (num == 9 || num == 10)
{
st = AssignStatement(*new SgArrayRefExp(*redGrid, *e1), red_expr_ref->copy());
ifSt = new SgIfStmt(red_expr_ref->copy() > *new SgArrayRefExp(*redGrid, *e1), *st);
for (int i = loc_el_num - 1; i >= 0; i--)
{
SgSymbol *locGrid = new SgSymbol(VARIABLE_NAME, tmp_list->loc_grid->identifier());
locGrid->setType(*new SgArrayType(*tmp_list->loc_grid->type()));
if (options.isOn(C_CUDA))
st = AssignStatement(*new SgArrayRefExp(*locGrid, *new SgValueExp(i), *e1), *new SgArrayRefExp(*loc_var_ref->symbol(), *new SgValueExp(i)));
else
st = AssignStatement(*new SgArrayRefExp(*locGrid, *new SgValueExp(i + 1), *e1), *new SgArrayRefExp(*loc_var_ref->symbol(), *new SgValueExp(i + 1)));
ifSt->insertStmtAfter(*st);
}
}
if (num != 9 && num != 10)
{
if (tmp_list->redvar_size == 0)
st = AssignStatement(*new SgArrayRefExp(*redGrid, *e1), *e);
else if (tmp_list->redvar_size > 0 && options.isOn(C_CUDA)) // TODO for Fortran
{
SgExpression* start = new SgExpression(ASSGN_OP, new SgVarRefExp(freeS), new SgValueExp(0));
SgExpression* end = &(*new SgVarRefExp(freeS) < *new SgValueExp(tmp_list->redvar_size));
SgExpression* step = new SgExpression(ASSGN_OP, new SgVarRefExp(freeS), &(*new SgVarRefExp(freeS) + *new SgValueExp(1)));
st = new SgForStmt(start, end, step, AssignStatement(*new SgArrayRefExp(*redGrid, *e1), *e));
}
else
{
//TODO
}
}
else
st = ifSt;
}
if (argsKer->symb.size() < 3)
if_st->lastExecutable()->insertStmtAfter(*st, *if_st);
else
if_st->lastExecutable()->insertStmtAfter(*st);
tmp_list = tmp_list->next;
if (needComment == 1)
{
if (options.isOn(C_CUDA))
st->addComment("// Reduction");
else
st->addComment("! Reduction\n");
needComment = 0;
}
}
DeclarationCreateReductionBlocksAcross(nloop, red_list);
}
else if (red_list && argsKer->nSymb.size() > 0) // generating reduction calculation blocks
CreateReductionBlocksAcross(st_end, nloop, red_list, new SgSymbol(*tid));
#if debugMode
mywarn(" end: create reduction block");
#endif
// make declarations
if (options.isOn(C_CUDA))
MakeDeclarationsForKernel_On_C_Across(idxTypeInKernel);
else // Fortran-Cuda
MakeDeclarationsForKernelAcross(idxTypeInKernel);
for_kernel = 0;
st = coords->makeVarDeclStmt();
kernel_st->insertStmtAfter(*st);
st = tid->makeVarDeclStmt();
kernel_st->insertStmtAfter(*st);
if (tmpvar1 != NULL)
addDeclExpList(tmpvar1, st->expr(0));
if (options.isOn(AUTO_TFM))
{
for (size_t i = 0; i < forDeclarationInKernel.size(); ++i)
addDeclExpList(forDeclarationInKernel[i], st->expr(0));
}
if (argsKer->symb.size() == 1)
{
if (argsKer->nSymb.size() == 2)
addDeclExpList(tid1, st->expr(0));
else if (argsKer->nSymb.size() >= 3)
{
addDeclExpList(tid1, st->expr(0));
addDeclExpList(tid2, st->expr(0));
}
}
else if (argsKer->symb.size() == 2)
{
if (argsKer->nSymb.size() == 1)
addDeclExpList(tid1, st->expr(0));
else if (argsKer->nSymb.size() >= 2)
{
addDeclExpList(tid1, st->expr(0));
addDeclExpList(tid2, st->expr(0));
}
}
else if (argsKer->symb.size() >= 3)
{
addDeclExpList(tid1, st->expr(0));
if (argsKer->nSymb.size() > 0)
addDeclExpList(tid2, st->expr(0));
}
if (!options.isOn(C_CUDA))
{
createDeclaration(argsKer->sizeVars);
createDeclaration(argsKer->acrossS);
createDeclaration(argsKer->notAcrossS);
createDeclaration(argsKer->idxAcross);
createDeclaration(argsKer->idxNotAcross);
for (size_t i = 0; i < argsKer->otherVars.size() / 8 * 8; i += 8)
{
createDeclaration(argsKer->otherVars[i]);
addDeclExpList(argsKer->otherVars[i + 3], st->expr(0));
createDeclaration(argsKer->otherVars[i + 1]);
addDeclExpList(argsKer->otherVars[i + 4], st->expr(0));
createDeclaration(argsKer->otherVars[i + 2]);
addDeclExpList(argsKer->otherVars[i + 5], st->expr(0));
createDeclaration(argsKer->otherVars[i + 6]);
addDeclExpList(argsKer->otherVars[i + 7], st->expr(0));
}
if (argsKer->otherVars.size() != 0 && argsKer->otherVars.size() % 8 != 0)
createDeclaration(argsKer->otherVars[argsKer->otherVars.size() - 1]);
for (size_t i = 0; i < argsKer->baseIdxsInKer.size(); ++i)
{
if (i == 0)
createDeclaration(argsKer->baseIdxsInKer[i]);
else
addDeclExpList(argsKer->baseIdxsInKer[i], st->expr(0));
}
if (argsKer->cond_ != NULL)
{
createDeclaration(argsKer->cond_);
for (size_t i = 0; i < argsKer->steps.size(); ++i)
addDeclExpList(argsKer->steps[i], st->expr(0));
}
}
#if debugMode
mywarn(" end: CreateLoopKernelAcross");
#endif
// inserting IMPLICIT NONE
if (!options.isOn(C_CUDA)) // Fortran-Cuda
kernel_st->insertStmtAfter(*new SgStatement(IMPL_DECL), *kernel_st);
if (options.isOn(C_CUDA))
RenamingCudaFunctionVariables(kernel_st, skernel, 1);
ACROSS_MOD_IN_KERNEL = 0;
return kernel_st;
}
// -------------------------- Reduction block for Across ---------------------------- //
SgSymbol *RedBlockSymbolInKernelAcross(SgSymbol *s, SgType *type)
{
char *name = NULL;
SgSymbol *sb = NULL;
SgValueExp M0(0);
SgExpression *MD = new SgExpression(DDOT, &M0.copy(), new SgKeywordValExp("*"), NULL);
SgArrayType *typearray;
int i = 1;
if (!type)
typearray = new SgArrayType(*s->type()->baseType());
else if (isSgArrayType(s->type()))
typearray = (SgArrayType *)&(s->type()->copy());
else
typearray = new SgArrayType(*type);
if (!options.isOn(C_CUDA))
typearray->addRange(*MD);
else
typearray->addDimension(NULL);
name = new char[strlen(s->identifier()) + 8];
sprintf(name, "%s_block", s->identifier());
while (isSameNameShared(name))
sprintf(name, "%s_block%d", s->identifier(), i++);
sb = new SgVariableSymb(name, *typearray, *kernel_st); // scope may be mod_gpu
#if 0
shared_list = AddToSymbList(shared_list, sb);
#endif
delete[]name;
return sb;
}
void DeclarationOfReductionBlockInKernelAcross(SgExpression *ered, reduction_operation_list *rsl)
{
SgStatement *ass, *newst, *current, *if_st, *while_st, *typedecl, *st, *do_st;
SgExpression *le, *re, *eatr, *cond, *ev;
SgSymbol *red_var, *red_var_k, *s_block, *loc_var, *sf;
SgType *rtype;
int i, ind;
//init block
ass = newst = current = if_st = while_st = typedecl = st = do_st = NULL;
le = re = eatr = cond = ev = NULL;
red_var = red_var_k = s_block = loc_var = sf = NULL;
rtype = NULL;
i = ind = loc_el_num = 0;
//end of init block
// analys of reduction operation
// ered - reduction operation (variant==ARRAY_OP)
ev = ered->rhs(); // reduction variable reference for reduction operations except MINLOC,MAXLOC
if (isSgExprListExp(ev)) // for MAXLOC,MINLOC
{
loc_var = ev->rhs()->lhs()->symbol(); //location array reference
ev = ev->lhs(); // reduction variable reference
}
else
loc_var = NULL;
// <red_var>_block([ k,] i) = <red_var> [k=LowerBound:UpperBound]
// or for MAXLOC,MINLOC
// <red_var>_block(i)%<red_var> = <red_var>
// <red_var>_block(i)%<loc_var>(1) = <loc_var>(1)
// [<red_var>_block(i)%<loc_var>(2) = <loc_var>(2) ]
// . . .
// create and declare array '<red_var>_block'
red_var = ev->symbol();
if (rsl->locvar)
{
newst = Declaration_Statement(rsl->locvar); //declare location variable
kernel_st->insertStmtAfter(*newst, *kernel_st);
}
if (rsl->redvar_size > 0)
{
newst = Declaration_Statement(rsl->redvar); //declare reduction variable
kernel_st->insertStmtAfter(*newst, *kernel_st);
}
else if (rsl->redvar_size < 0)
{
red_var_k = RedVariableSymbolInKernel(rsl->redvar, rsl->dimSize_arg, rsl->lowBound_arg);
newst = Declaration_Statement(red_var_k); //declare reduction variable
kernel_st->insertStmtAfter(*newst, *kernel_st);
}
rtype = (rsl->redvar_size >= 0) ? TypeOfRedBlockSymbol(ered) : red_var_k->type();
s_block = RedBlockSymbolInKernelAcross(red_var, rtype);
newst = Declaration_Statement(s_block);
if (options.isOn(C_CUDA))
newst->addDeclSpec(BIT_CUDA_SHARED | BIT_EXTERN);
else
{
eatr = new SgExprListExp(*new SgExpression(ACC_SHARED_OP));
newst->setExpression(2, *eatr);
}
kernel_st->insertStmtAfter(*newst, *kernel_st);
if (isSgExprListExp(ered->rhs())) //MAXLOC,MINLOC
{
typedecl = MakeStructDecl(rtype->symbol());
kernel_st->insertStmtAfter(*typedecl, *kernel_st);
}
}
void DeclarationCreateReductionBlocksAcross(int nloop, SgExpression *red_op_list)
{
SgStatement *newst, *dost;
SgExpression *er;
SgSymbol *i_var, *j_var;
reduction_operation_list *rsl;
int n;
formal_red_grid_list = NULL;
// index variables
dost = DoStmt(first_do_par, nloop);
i_var = dost->symbol();
if (nloop > 1)
j_var = dost->controlParent()->symbol();
else
{
j_var = IndVarInKernel(i_var);
newst = j_var->makeVarDeclStmt();
kernel_st->insertStmtAfter(*newst, *kernel_st);
}
//looking through the reduction_op_list
for (er = red_op_list, rsl = red_struct_list, n = 1; er; er = er->rhs(), rsl = rsl->next, n++)
{
DeclarationOfReductionBlockInKernelAcross(er->lhs(), rsl);
}
}
void CreateReductionBlocksAcross(SgStatement *stat, int nloop, SgExpression *red_op_list, SgSymbol *red_count_symb)
{
SgStatement *newst, *ass, *dost;
SgExpression *er, *re;
SgSymbol *i_var, *j_var;
reduction_operation_list *rsl;
int n;
formal_red_grid_list = NULL;
// index variables
dost = DoStmt(first_do_par, nloop);
i_var = dost->symbol();
if (nloop > 1)
j_var = dost->controlParent()->symbol();
else
{
j_var = IndVarInKernel(i_var);
newst = j_var->makeVarDeclStmt();
kernel_st->insertStmtAfter(*newst, *kernel_st);
}
//create symbol 'syncthreads'
// declare '<red_var>_block' array for each reduction var
// <i_var> = threadIdx%x -1 + [ (threadIdx%y - 1) * blockDim%x [ + (threadIdx%z - 1) * blockDim%x * blockDim%y ] ]
// or C_Cuda
// <i_var> = threadIdx%x + [ threadIdx%y * blockDim%x [ + threadIdx%z * blockDim%x * blockDim%y ] ]
re = ThreadIdxRefExpr("x");
if (nloop > 1)
re = &(*re + (*ThreadIdxRefExpr("y")) * (*new SgRecordRefExp(*s_blockdim, "x")));
if (nloop > 2)
re = &(*re + (*ThreadIdxRefExpr("z")) * (*new SgRecordRefExp(*s_blockdim, "x") * (*new SgRecordRefExp(*s_blockdim, "y"))));
if (options.isOn(C_CUDA)) // global cuda index
{
// gIDX = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + (blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y) * blockDim.x * blockDim.y * blockDim.z;
SgExpression& thrX = *new SgRecordRefExp(*s_threadidx, "x");
SgExpression& thrY = *new SgRecordRefExp(*s_threadidx, "y");
SgExpression& thrZ = *new SgRecordRefExp(*s_threadidx, "z");
SgExpression& blDimX = *new SgRecordRefExp(*s_blockdim, "x");
SgExpression& blDimY = *new SgRecordRefExp(*s_blockdim, "y");
SgExpression& blDimZ = *new SgRecordRefExp(*s_blockdim, "z");
SgExpression& blIdxX = *new SgRecordRefExp(*s_blockidx, "x");
SgExpression& blIdxY = *new SgRecordRefExp(*s_blockidx, "y");
SgExpression& blIdxZ = *new SgRecordRefExp(*s_blockidx, "z");
SgExpression& grX = *new SgRecordRefExp(*s_griddim, "x");
SgExpression& grY = *new SgRecordRefExp(*s_griddim, "y");
ass = new SgAssignStmt(*new SgVarRefExp(i_var), thrX + thrY * blDimX + thrZ * blDimX * blDimY + (blIdxX + blIdxY * grX + blIdxZ * grX * grY) * blDimX * blDimY * blDimZ);
}
else
ass = AssignStatement(new SgVarRefExp(i_var), re);
stat->insertStmtBefore(*ass, *stat->controlParent());
if (options.isOn(C_CUDA))
ass->addComment("// Reduction");
else
ass->addComment("! Reduction\n");
//looking through the reduction_op_list
SgIfStmt* if_st = NULL;
SgIfStmt* if_del = NULL;
SgIfStmt* if_new = NULL;
int declArrayVars = 1;
SgSymbol* s_warpsize = new SgVariableSymb("warpSize", *SgTypeInt(), *mod_gpu);
if (options.isOn(C_CUDA))
if_st = new SgIfStmt(SgEqOp(*new SgVarRefExp(i_var) % *new SgVarRefExp(s_warpsize), *new SgValueExp(0)));
for (er = red_op_list, rsl = red_struct_list, n = 1; er; er = er->rhs(), rsl = rsl->next, n++)
{
if (options.isOn(C_CUDA))
ReductionBlockInKernel_On_C_Cuda(stat, i_var, er->lhs(), rsl, if_st, if_del, if_new, declArrayVars, true, true);
else
ReductionBlockInKernel(stat, nloop, i_var, j_var, er->lhs(), rsl, red_count_symb, n);
}
if (options.isOn(C_CUDA))
stat->insertStmtBefore(*if_st, *stat->controlParent());
}
//end of Reduction block for Across
#undef LongT
#undef debugMode
#undef kerneloff