ROSE  0.11.145.0
libxomp.h
1 /*
2  * A common layer for both gomp and omni runtime library
3  * Liao 1/20/2009
4  * */
5 #ifndef LIB_XOMP_H
6 #define LIB_XOMP_H
7 
8 // Fortran outlined function uses one parameter for each variable to be passed by reference
9 // We predefine a max number of parameters to be allowed here.
10 #define MAX_OUTLINED_FUNC_PARAMETER_COUNT 256
11 #ifdef __cplusplus
12 extern "C" {
13 #endif
14 
15 #include <stdio.h>
16 #include <stdbool.h>
17 #include <stdlib.h> // for abort()
18 #include <assert.h>
19 #include <sys/time.h>
20 
21 // return the current time stamp in a double floating point number
22 extern double xomp_time_stamp(void);
23 extern int env_region_instr_val; // save the environment variable value for instrumentation support
24 //e.g. export XOMP_REGION_INSTR=0|1
25 
26 //enum omp_rtl_enum {
27 // e_gomp,
28 // e_omni,
29 // e_last_rtl
30 //};
31 //
32 //extern omp_rtl_enum rtl_type;
33 
34 //Runtime library initialization routine
35 extern void XOMP_init (int argc, char ** argv);
36 extern void xomp_init (void);
37 extern void xomp_acc_init (void);
38 
39 // Runtime library termination routine
40 extern void XOMP_terminate (int exitcode);
41 
42 // func: pointer to a function which will be run in parallel
43 // data: pointer to a data segment which will be used as the arguments of func
44 // ifClauseValue: set to if-clause-expression if if-clause exists, or default is 1.
45 // numThreadsSpecified: set to the expression of num_threads clause if the clause exists, or default is 0
46 // file_name:line_no the start source file info about this parallel region, used to pass source level info. to runtime
47 extern void XOMP_parallel_start (void (*func) (void *), void *data, unsigned ifClauseValue, unsigned numThreadsSpecified, char* file_name, int line_no);
48 extern void XOMP_parallel_end (char* file_name, int line_no);
49 
50 /* Initialize sections and return the next section id (starting from 0) to be executed by the current thread */
51 extern int XOMP_sections_init_next(int section_count);
52 
53 /* Return the next section id (starting from 0) to be executed by the current thread. Return value <0 means no sections left */
54 extern int XOMP_sections_next(void);
55 
56 /* Called after the current thread is told that all sections are executed. It synchronizes all threads also. */
57 extern void XOMP_sections_end(void);
58 
59 /* Called after the current thread is told that all sections are executed. It does not synchronizes all threads. */
60 extern void XOMP_sections_end_nowait(void);
61 
62 extern void XOMP_task (void (*) (void *), void *, void (*) (void *, void *),
63  long, long, bool, unsigned);
64 extern void XOMP_taskwait (void);
65 
66 // scheduler functions, union of runtime library functions
67 // empty body if not used by one
68 // scheduler initialization, only meaningful used for OMNI
69 
70 // Default loop scheduling, worksharing without any schedule clause, upper bounds are inclusive
71 // Kick in before all runtime libraries. We use the default loop scheduling from XOMP regardless the runtime chosen.
72 extern void XOMP_loop_default(int lower, int upper, int stride, long* n_lower, long* n_upper);
73 
75 // Non-op for gomp
76 extern void XOMP_loop_static_init(int lower, int upper, int stride, int chunk_size);
77 extern void XOMP_loop_dynamic_init(int lower, int upper, int stride, int chunk_size);
78 extern void XOMP_loop_guided_init(int lower, int upper, int stride, int chunk_size);
79 extern void XOMP_loop_runtime_init(int lower, int upper, int stride);
80 
81 // ordered case
82 extern void XOMP_loop_ordered_static_init(int lower, int upper, int stride, int chunk_size);
83 extern void XOMP_loop_ordered_dynamic_init(int lower, int upper, int stride, int chunk_size);
84 extern void XOMP_loop_ordered_guided_init(int lower, int upper, int stride, int chunk_size);
85 extern void XOMP_loop_ordered_runtime_init(int lower, int upper, int stride);
86 
87 
88 // if (start),
89 // mostly used because of gomp, omni will just call XOMP_loop_xxx_next();
90 // (long start, long end, long incr, long chunk_size,long *istart, long *iend)
91 // upper bounds are non-inclusive,
92 // bounds for inclusive loop control will need +/-1 , depending on incremental/decremental cases
93 extern bool XOMP_loop_static_start (long, long, long, long, long *, long *);
94 extern bool XOMP_loop_dynamic_start (long, long, long, long, long *, long *);
95 extern bool XOMP_loop_guided_start (long, long, long, long, long *, long *);
96 extern bool XOMP_loop_runtime_start (long, long, long, long *, long *);
97 
98 extern bool XOMP_loop_ordered_static_start (long, long, long, long, long *, long *);
99 extern bool XOMP_loop_ordered_dynamic_start (long, long, long, long, long *, long *);
100 extern bool XOMP_loop_ordered_guided_start (long, long, long, long, long *, long *);
101 extern bool XOMP_loop_ordered_runtime_start (long, long, long, long *, long *);
102 
103 // next
104 extern bool XOMP_loop_static_next (long *, long *);
105 extern bool XOMP_loop_dynamic_next (long *, long *);
106 extern bool XOMP_loop_guided_next (long *, long *);
107 extern bool XOMP_loop_runtime_next (long *, long *);
108 
109 extern bool XOMP_loop_ordered_static_next (long *, long *);
110 extern bool XOMP_loop_ordered_dynamic_next (long *, long *);
111 extern bool XOMP_loop_ordered_guided_next (long *, long *);
112 extern bool XOMP_loop_ordered_runtime_next (long *, long *);
113 
114 // simplest static even divide a range start:size into several chunks of offset:size pair
115 // This can be used to divide up loop or data (array range).
116 // original range is start_offset:orig_size
117 // chunk_id starts from 0
118 // output is the offset and size for the current chunk id
119 extern void XOMP_static_even_divide (long start, long orig_size, int chunk_count, int chunk_id, long * chunk_offset, long * chunk_size);
120 
121 //--------------end of loop functions
122 
123 extern void XOMP_barrier (void);
124 extern void XOMP_critical_start (void** data);
125 extern void XOMP_critical_end (void** data);
126 extern bool XOMP_single(void);
127 extern bool XOMP_master(void);
128 
129 extern void XOMP_atomic_start (void);
130 extern void XOMP_atomic_end (void);
131 
132 extern void XOMP_loop_end (void);
133 extern void XOMP_loop_end_nowait (void);
134  // --- end loop functions ---
135 // flush without variable list
136 extern void XOMP_flush_all (void);
137 // omp flush with variable list, flush one by one, given each's start address and size
138 extern void XOMP_flush_one (char * startAddress, int nbyte);
139 
140 
141 // omp ordered directive
142 extern void XOMP_ordered_start (void);
143 extern void XOMP_ordered_end (void);
144 
145 //--------------------- extensions to support OpenMP accelerator model experimental implementation------
146 // We only include
147 
148 // Set the device id to be used by the current task
149 extern void xomp_set_default_device (int devID);
150 //--------------------- kernel launch ------------------
151 
152 // the max number of threads per thread block of the first available device
153 extern size_t xomp_get_maxThreadsPerBlock(int devID);
154 
155 //get the max number of 1D blocks for a given input length
156 extern size_t xomp_get_max1DBlock(int devID, size_t ss);
157 
158 // Get the max number threads for one dimension (x or y) of a 2D block
159 // Two factors are considered: the total number of threads within the 2D block must<= total threads per block
160 // x * y <= maxThreadsPerBlock 512 or 1024
161 // each dimension: the number of threads must <= maximum x/y-dimension
162 // x <= maxThreadsDim[0], 1024
163 // y <= maxThreadsDim[1], 1024
164 // maxThreadsDim[0] happens to be equal to maxThreadsDim[1] so we use a single function to calculate max segments for both dimensions
165 extern size_t xomp_get_max_threads_per_dimesion_2D (int devID);
166 
167 // return the max number of segments for a dimension (either x or y) of a 2D block
168 extern size_t xomp_get_maxSegmentsPerDimensionOf2DBlock(int devID, size_t dimension_size);
169 
170 //------------------memory allocation/copy/free----------------------------------
171 //Allocate device memory and return the pointer
172 // This should be a better interface than cudaMalloc()
173 // since it mimics malloc() closely
174 /*
175 return a pointer to the allocated space
176  * upon successful completion with size not equal to 0
177 return a null pointer if
178  * size is 0
179  * failure due to any reason
180 */
181 extern void* xomp_deviceMalloc(size_t size);
182 
183 // A host version
184 extern void* xomp_hostMalloc(size_t size);
185 
186 //get the time stamp for now, up to microsecond resolution: 1e-6 , but maybe 1e-4 in practice
187 extern double xomp_time_stamp();
188 
189 
190 // memory copy from src to dest, return the pointer to dest. NULL pointer if anything is wrong
191 extern void * xomp_memcpyHostToDevice (void *dest, const void * src, size_t n_n);
192 extern void * xomp_memcpyDeviceToHost (void *dest, const void * src, size_t n_n);
193 // copy a dynamically allocated host source array to linear dest address on a GPU device. the dimension information of the source array
194 // is given by: int dimensions[dimension_size], with known element size.
195 // bytes_copied reports the total bytes copied by this function.
196 // Note: It cannot be used copy static arrays declared like type array[N][M] !!
197 extern void * xomp_memcpyDynamicHostToDevice (void *dest, const void * src, int * dimensions, size_t dimension_size, size_t element_size, size_t *bytes_copied);
198 
199 // copy linear src memory to dynamically allocated destination, with dimension information given by
200 // int dimensions[dimension_size]
201 // the source memory has total n continuous memory, with known size for each element
202 // the total bytes copied by this function is reported by bytes_copied
203 extern void * xomp_memcpyDynamicDeviceToHost (void *dest, int * dimensions, size_t dimension_size, const void * src, size_t element_size, size_t *bytes_copied);
204 
205 extern void * xomp_memcpyDeviceToDevice (void *dest, const void * src, size_t n_n);
206 extern void * xomp_memcpyHostToHost (void *dest, const void * src, size_t n_n); // same as memcpy??
207 
208 
209 // free the device memory pointed by a pointer, return false in case of failure, otherwise return true
210 extern bool xomp_freeDevice(void* devPtr);
211 // free the host memory pointed by a pointer, return false in case of failure, otherwise return true
212 extern bool xomp_freeHost(void* hostPtr);
213 
214 /* Allocation/Free functions for Host */
215 /* Allocate a multi-dimensional array
216  *
217  * Input parameters:
218  * int *dimensions: an integer array storing the size of each dimension
219  * size_t dimension_num: the number of dimensions
220  * size_t esize: the size of an array element
221  *
222  * return:
223  * the pointer to the allocated array
224  * */
225 extern void * xomp_mallocArray(int * dimensions, size_t dimension_num, size_t esize);
226 
227 extern void xomp_freeArrayPointer (void* array, int * dimensions, size_t dimension_num);
228 
229 
230 /* CUDA reduction support */
231 //------------ types for CUDA reduction support---------
232 // Reduction for regular OpenMP is supported by compiler translation. No runtime support is needed.
233 // For the accelerator model experimental implementation, we use a two-level reduction method:
234 // thread-block level within GPU + beyond-block level on CPU
235 
236 /* an internal union type to be flexible for all types associated with reduction operations
237  We don't really want to expose this to the compiler to simplify the compiler translation.
238 */
239 // We try to limit the numbers of runtime data types exposed to a compiler.
240 // A set of integers to represent reduction operations
241 #define XOMP_REDUCTION_PLUS 6
242 #define XOMP_REDUCTION_MINUS 7
243 #define XOMP_REDUCTION_MUL 8
244 #define XOMP_REDUCTION_BITAND 9 // &
245 #define XOMP_REDUCTION_BITOR 10 // |
246 #define XOMP_REDUCTION_BITXOR 11 // ^
247 #define XOMP_REDUCTION_LOGAND 12 // &&
248 #define XOMP_REDUCTION_LOGOR 13 // ||
249 
250 #if 0
251 // No linker support for device code. We have to put implementation of these device functions into the header
252 // TODO: wait until nvcc supports linker for device code.
253 //#define XOMP_INNER_BLOCK_REDUCTION_DECL(dtype)
254 //__device__ void xomp_inner_block_reduction_##dtype(dtype local_value, dtype * grid_level_results, int reduction_op);
255 //
256 // TODO declare more prototypes
257 //XOMP_INNER_BLOCK_REDUCTION_DECL(int)
258 //XOMP_INNER_BLOCK_REDUCTION_DECL(float)
259 //XOMP_INNER_BLOCK_REDUCTION_DECL(double)
260 //
261 //#undef XOMP_INNER_BLOCK_REDUCTION_DECL
262 
263 #endif
264 
265 #define XOMP_BEYOND_BLOCK_REDUCTION_DECL(dtype) \
266  dtype xomp_beyond_block_reduction_##dtype(dtype * per_block_results, int numBlocks, int reduction_op);
267 
268 XOMP_BEYOND_BLOCK_REDUCTION_DECL(int)
269 XOMP_BEYOND_BLOCK_REDUCTION_DECL(float)
270 XOMP_BEYOND_BLOCK_REDUCTION_DECL(double)
271 
272 #undef XOMP_BEYOND_BLOCK_REDUCTION_DECL
273 // Liao, 8/29/2013
274 // Support round-robin static scheduling of loop iterations running on GPUs (accelerator)
275 // Static even scheduling may cause each thread to touch too much data, which stress memory channel.
276 // NOT IN USE. We use compiler to generate the variables instead of using a runtime data structure.
278  int num; /* the thread number of this thread in team */
279  int num_thds; /* current running thread, referenced by children */
280  int in_parallel; /* current thread executes the region in parallel */
281 
282  /* used for schedule */
283  int loop_chunk_size; //************* this is the chunk size
284  int loop_end; //************* equivalent to upper limit, up
285  int loop_sched_index; //************* lb+chunk_size*tp->num (num is the thread number of this thread in team)
286  int loop_stride; //************* chunk_size * nthds /* used for static scheduling */
287 
288  /* for 'lastprivate' */
289  int is_last;
290 };
291 
292 #define XOMP_MAX_MAPPED_VARS 256 // for simplicity, we use preallocated memory for storing the mapped variable list
293 /* Test runtime support for nested device data environments */
294 /* Liao, May 2, 2013*/
295 /* A data structure to keep track of a mapped variable
296  * Right now we use memory address of the original variable and the size of the variable
297  * */
299 {
300  void * address; // original variable's address
301  //TODO: support array sections
302  int* size;
303  int* offset;
304  int* DimSize;
305  int nDim;
306  int typeSize;
307  void * dev_address; // the corresponding device variable's address
308  bool copyTo; // if this variable should be copied to the device first
309  bool copyFrom; // if this variable should be copied back to HOST when existing the data environment
310 };
311 
313 extern void copy_mapped_variable (struct XOMP_mapped_variable* desc, struct XOMP_mapped_variable* src );
314 
315 /* A doubly linked list for tracking Device Data Environment (DDE) */
316 typedef struct DDE_data {
317  // Do we need this at all? we can allocate/deallocate data without saving region ID
318  int Region_ID; // hash of the AST node? or just memory address of the AST node for now
319 
320 // Store the device ID in DDE
321  int devID;
322 
323 // array of the newly mapped variables
324  int new_variable_count;
325  struct XOMP_mapped_variable* new_variables;
326  //struct XOMP_mapped_variable new_variables[XOMP_MAX_MAPPED_VARS];
327 
328 // array of inherited mapped variable from possible upper level DDEs
329  int inherited_variable_count;
330  struct XOMP_mapped_variable* inherited_variables;
331  //struct XOMP_mapped_variable inherited_variables[XOMP_MAX_MAPPED_VARS];
332 
333  // link to its parent node
334  struct DDE_data* parent;
335  // link to its child node
336  struct DDE_data* child;
337 } DDE;
338 
339 // Internal control variables for target devices
340 extern int xomp_get_num_devices();
341 extern int xomp_get_max_devices(void);
342 extern int xomp_num_devices;
343 extern int xomp_max_num_devices;
344 
345 // The head of the list of DDE data nodes
346 extern DDE** DDE_head; //TODO. We don't really need this head pointer, it is like a stack, access the end is enough
347 // The tail of the list
348 extern DDE** DDE_tail;
349 
350 extern void** xomp_cuda_prop;
351 // create a new DDE-data node and append it to the end of the tracking list
352 // copy all variables from its parent node to be into the set of inherited variable set.
353 //void XOMP_Device_Data_Environment_Enter();
354 extern void xomp_deviceDataEnvironmentEnter(int devID);
355 
356 // A all-in-one wrapper to integrate three things: 1) get inherited variable 2) allocate if not found, 3) register,
357 // and 4) copy into GPU operations into one function
358 //
359 // Based on the CPU variable address and size, also indicate if copyin or copyback is needed.
360 // The function will first try to inherit/reuse the same variable from the parent DDE. i
361 // If not successful , it will allocate a new data on device, register it to the current DDE, and copy CPU values when needed.
362 // The allocated or found device variable address will be returned.
363 extern void* xomp_deviceDataEnvironmentPrepareVariable(int devID, void* original_variable_address, int nDim, int typeSize, int* size, int* offset, int* vDimSize, bool copyTo, bool copyFrom);
364 
365 // Check if an original variable is already mapped in enclosing data environment, return its device variable's address if yes.
366 // return NULL if not
367 //void* XOMP_Device_Data_Environment_Get_Inherited_Variable (void* original_variable_address, int size);
368 extern void* xomp_deviceDataEnvironmentGetInheritedVariable (int devID, void* original_variable_address, int typeSize, int* size);
369 
371 //void XOMP_Device_Data_Environment_Add_Variable (void* var_addr, int var_size, void * dev_addr);
372 extern void xomp_deviceDataEnvironmentAddVariable (int devID, void* var_addr, int* var_size, int* var_offset, int* var_dim, int nDim, int typeSize, void * dev_addr, bool copyTo, bool copyFrom);
373 
374 // Exit current DDE: deallocate device memory, delete the DDE-data node from the end of the tracking list
375 //void XOMP_Device_Data_Environment_Exit();
376 extern void xomp_deviceDataEnvironmentExit(int devID);
377 
378 
379 #ifdef __cplusplus
380  }
381 #endif
382 
383 #endif /* LIB_XOMP_H */
384 
385 
386 
387