/* ROSE 0.11.145.192 -- libxomp.h */
1/*
2 * A common layer for both gomp and omni runtime library
3 * Liao 1/20/2009
4 * */
5#ifndef LIB_XOMP_H
6#define LIB_XOMP_H
7
// Fortran outlined function uses one parameter for each variable to be passed by reference.
// We predefine a max number of parameters to be allowed here (fixed-size tables
// elsewhere in the runtime presumably depend on this bound -- confirm before raising it).
#define MAX_OUTLINED_FUNC_PARAMETER_COUNT 256
11#ifdef __cplusplus
12extern "C" {
13#endif
14
15#include <stdio.h>
16#include <stdbool.h>
17#include <stdlib.h> // for abort()
18#include <assert.h>
19#include <sys/time.h>
20
// Return the current wall-clock time stamp as a double floating point number
// (comment at the second declaration below says microsecond resolution).
extern double xomp_time_stamp(void);
extern int env_region_instr_val; // saved environment variable value for region-instrumentation support,
//e.g. export XOMP_REGION_INSTR=0|1
25
26//enum omp_rtl_enum {
27// e_gomp,
28// e_omni,
29// e_last_rtl
30//};
31//
32//extern omp_rtl_enum rtl_type;
33
//Runtime library initialization routine
// XOMP_init receives the program's argc/argv (exact use is defined in the implementation).
extern void XOMP_init (int argc, char ** argv);
// Lower-case variants: generic init and accelerator (device) init.
extern void xomp_init (void);
extern void xomp_acc_init (void);

// Runtime library termination routine; exitcode is passed through to the runtime.
extern void XOMP_terminate (int exitcode);
41
// func: pointer to a function which will be run in parallel
// data: pointer to a data segment which will be used as the arguments of func
// ifClauseValue: set to if-clause-expression if if-clause exists, or default is 1.
// numThreadsSpecified: set to the expression of num_threads clause if the clause exists, or default is 0
// file_name:line_no the start source file info about this parallel region, used to pass source level info. to runtime
extern void XOMP_parallel_start (void (*func) (void *), void *data, unsigned ifClauseValue, unsigned numThreadsSpecified, char* file_name, int line_no);
// End of the parallel region opened by XOMP_parallel_start; same source-position arguments.
extern void XOMP_parallel_end (char* file_name, int line_no);
49
/* Initialize sections and return the next section id (starting from 0) to be executed by the current thread */
extern int XOMP_sections_init_next(int section_count);

/* Return the next section id (starting from 0) to be executed by the current thread. Return value <0 means no sections left */
extern int XOMP_sections_next(void);

/* Called after the current thread is told that all sections are executed. It also synchronizes all threads (implied barrier). */
extern void XOMP_sections_end(void);

/* Called after the current thread is told that all sections are executed. It does not synchronize all threads (nowait). */
extern void XOMP_sections_end_nowait(void);
61
// Create an explicit task.  The parameter layout presumably mirrors libgomp's
// GOMP_task -- confirm against the runtime implementation:
//   fn        - outlined task body
//   data      - argument block passed to fn
//   cpyfn     - optional helper that copies the argument block (dest, src)
//   arg_size  - size in bytes of the argument block
//   arg_align - required alignment of the argument block
//   if_clause - value of the if() clause expression
//   untied    - task flags (e.g. untied)
// (Parameter names added for readability; names in a prototype do not change the interface.)
extern void XOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
                       long arg_size, long arg_align, bool if_clause, unsigned untied);
// Wait until all child tasks of the current task have completed.
extern void XOMP_taskwait (void);
65
66// scheduler functions, union of runtime library functions
67// empty body if not used by one
68// scheduler initialization, only meaningful used for OMNI
69
// Default loop scheduling, worksharing without any schedule clause, upper bounds are inclusive
// Kick in before all runtime libraries. We use the default loop scheduling from XOMP regardless of the runtime chosen.
// The calling thread's assigned sub-range is written to *n_lower / *n_upper.
extern void XOMP_loop_default(int lower, int upper, int stride, long* n_lower, long* n_upper);
73
// Non-op for gomp
// Schedule-specific worksharing-loop initializers (only meaningful for the OMNI
// runtime, per the comment above).  The runtime variant takes no chunk_size.
extern void XOMP_loop_static_init(int lower, int upper, int stride, int chunk_size);
extern void XOMP_loop_dynamic_init(int lower, int upper, int stride, int chunk_size);
extern void XOMP_loop_guided_init(int lower, int upper, int stride, int chunk_size);
extern void XOMP_loop_runtime_init(int lower, int upper, int stride);

// ordered case
extern void XOMP_loop_ordered_static_init(int lower, int upper, int stride, int chunk_size);
extern void XOMP_loop_ordered_dynamic_init(int lower, int upper, int stride, int chunk_size);
extern void XOMP_loop_ordered_guided_init(int lower, int upper, int stride, int chunk_size);
extern void XOMP_loop_ordered_runtime_init(int lower, int upper, int stride);
86
87
// if (start),
// mostly used because of gomp, omni will just call XOMP_loop_xxx_next();
// (long start, long end, long incr, long chunk_size, long *istart, long *iend)
// upper bounds are non-inclusive,
// bounds for inclusive loop control will need +/-1 , depending on incremental/decremental cases
// Parameter names added below, taken verbatim from the comment above; the
// runtime-scheduled variants take no chunk_size (it presumably comes from
// OMP_SCHEDULE -- confirm in the implementation).  Each returns a bool that,
// per gomp convention, indicates whether a chunk was assigned to this thread.
extern bool XOMP_loop_static_start (long start, long end, long incr, long chunk_size, long *istart, long *iend);
extern bool XOMP_loop_dynamic_start (long start, long end, long incr, long chunk_size, long *istart, long *iend);
extern bool XOMP_loop_guided_start (long start, long end, long incr, long chunk_size, long *istart, long *iend);
extern bool XOMP_loop_runtime_start (long start, long end, long incr, long *istart, long *iend);

extern bool XOMP_loop_ordered_static_start (long start, long end, long incr, long chunk_size, long *istart, long *iend);
extern bool XOMP_loop_ordered_dynamic_start (long start, long end, long incr, long chunk_size, long *istart, long *iend);
extern bool XOMP_loop_ordered_guided_start (long start, long end, long incr, long chunk_size, long *istart, long *iend);
extern bool XOMP_loop_ordered_runtime_start (long start, long end, long incr, long *istart, long *iend);

// next: fetch the next chunk into *istart / *iend; false when no iterations remain.
extern bool XOMP_loop_static_next (long *istart, long *iend);
extern bool XOMP_loop_dynamic_next (long *istart, long *iend);
extern bool XOMP_loop_guided_next (long *istart, long *iend);
extern bool XOMP_loop_runtime_next (long *istart, long *iend);

extern bool XOMP_loop_ordered_static_next (long *istart, long *iend);
extern bool XOMP_loop_ordered_dynamic_next (long *istart, long *iend);
extern bool XOMP_loop_ordered_guided_next (long *istart, long *iend);
extern bool XOMP_loop_ordered_runtime_next (long *istart, long *iend);

// simplest static even divide a range start:size into several chunks of offset:size pair
// This can be used to divide up loop or data (array range).
// original range is start_offset:orig_size
// chunk_id starts from 0
// output is the offset and size for the current chunk id
extern void XOMP_static_even_divide (long start, long orig_size, int chunk_count, int chunk_id, long * chunk_offset, long * chunk_size);
120
//--------------end of loop functions

// Thread-team barrier.
extern void XOMP_barrier (void);
// critical sections: data is the address of a per-critical-section lock pointer
// (presumably lazily initialized by the runtime -- confirm in implementation).
extern void XOMP_critical_start (void** data);
extern void XOMP_critical_end (void** data);
// Presumably return true only for the thread selected to run the single/master
// region -- confirm in implementation.
extern bool XOMP_single(void);
extern bool XOMP_master(void);

extern void XOMP_atomic_start (void);
extern void XOMP_atomic_end (void);

extern void XOMP_loop_end (void);
extern void XOMP_loop_end_nowait (void);
 // --- end loop functions ---
// flush without variable list
extern void XOMP_flush_all (void);
// omp flush with variable list, flush one by one, given each's start address and size
extern void XOMP_flush_one (char * startAddress, int nbyte);


// omp ordered directive
extern void XOMP_ordered_start (void);
extern void XOMP_ordered_end (void);
144
//--------------------- extensions to support OpenMP accelerator model experimental implementation------
// We only include

// Set the device id to be used by the current task
extern void xomp_set_default_device (int devID);
//--------------------- kernel launch ------------------

// the max number of threads per thread block of the first available device
extern size_t xomp_get_maxThreadsPerBlock(int devID);

//get the max number of 1D blocks for a given input length
extern size_t xomp_get_max1DBlock(int devID, size_t ss);

// Get the max number threads for one dimension (x or y) of a 2D block
// Two factors are considered: the total number of threads within the 2D block must<= total threads per block
// x * y <= maxThreadsPerBlock 512 or 1024
// each dimension: the number of threads must <= maximum x/y-dimension
// x <= maxThreadsDim[0], 1024
// y <= maxThreadsDim[1], 1024
// maxThreadsDim[0] happens to be equal to maxThreadsDim[1] so we use a single function to calculate max segments for both dimensions
// NOTE: "dimesion" is a long-standing typo in the public name; kept for source/ABI compatibility.
extern size_t xomp_get_max_threads_per_dimesion_2D (int devID);

// return the max number of segments for a dimension (either x or y) of a 2D block
extern size_t xomp_get_maxSegmentsPerDimensionOf2DBlock(int devID, size_t dimension_size);
169
//------------------memory allocation/copy/free----------------------------------
//Allocate device memory and return the pointer
// This should be a better interface than cudaMalloc()
// since it mimics malloc() closely
/*
return a pointer to the allocated space
 * upon successful completion with size not equal to 0
return a null pointer if
 * size is 0
 * failure due to any reason
*/
extern void* xomp_deviceMalloc(size_t size);

// A host version with the same contract (NULL on size == 0 or on failure).
extern void* xomp_hostMalloc(size_t size);
185
//get the time stamp for now, up to microsecond resolution: 1e-6 , but maybe 1e-4 in practice
// NOTE: re-declaration of xomp_time_stamp (also declared near the top of this
// header).  Fixed: the empty parentheses `()` formed an old-style declaration
// with unspecified parameters; `(void)` makes both declarations proper,
// matching prototypes.
extern double xomp_time_stamp(void);
188
189
// memory copy from src to dest, return the pointer to dest. NULL pointer if anything is wrong
extern void * xomp_memcpyHostToDevice (void *dest, const void * src, size_t n_n);
extern void * xomp_memcpyDeviceToHost (void *dest, const void * src, size_t n_n);
// copy a dynamically allocated host source array to linear dest address on a GPU device. the dimension information of the source array
// is given by: int dimensions[dimension_size], with known element size.
// bytes_copied reports the total bytes copied by this function.
// Note: It cannot be used to copy static arrays declared like type array[N][M] !!
extern void * xomp_memcpyDynamicHostToDevice (void *dest, const void * src, int * dimensions, size_t dimension_size, size_t element_size, size_t *bytes_copied);

// copy linear src memory to dynamically allocated destination, with dimension information given by
// int dimensions[dimension_size]
// the source memory is one contiguous block, with known size for each element
// the total bytes copied by this function is reported by bytes_copied
extern void * xomp_memcpyDynamicDeviceToHost (void *dest, int * dimensions, size_t dimension_size, const void * src, size_t element_size, size_t *bytes_copied);

extern void * xomp_memcpyDeviceToDevice (void *dest, const void * src, size_t n_n);
extern void * xomp_memcpyHostToHost (void *dest, const void * src, size_t n_n); // same as memcpy??


// free the device memory pointed by a pointer, return false in case of failure, otherwise return true
extern bool xomp_freeDevice(void* devPtr);
// free the host memory pointed by a pointer, return false in case of failure, otherwise return true
extern bool xomp_freeHost(void* hostPtr);
213
/* Allocation/Free functions for Host */
/* Allocate a multi-dimensional array
 *
 * Input parameters:
 * int *dimensions: an integer array storing the size of each dimension
 * size_t dimension_num: the number of dimensions
 * size_t esize: the size of an array element
 *
 * return:
 * the pointer to the allocated array
 * */
extern void * xomp_mallocArray(int * dimensions, size_t dimension_num, size_t esize);

// Free an array allocated by xomp_mallocArray; takes the same dimension info,
// presumably to release nested pointer levels -- confirm in implementation.
extern void xomp_freeArrayPointer (void* array, int * dimensions, size_t dimension_num);
228
229
/* CUDA reduction support */
//------------ types for CUDA reduction support---------
// Reduction for regular OpenMP is supported by compiler translation. No runtime support is needed.
// For the accelerator model experimental implementation, we use a two-level reduction method:
// thread-block level within GPU + beyond-block level on CPU

/* an internal union type to be flexible for all types associated with reduction operations
 We don't really want to expose this to the compiler to simplify the compiler translation.
*/
// We try to limit the numbers of runtime data types exposed to a compiler.
// A set of integers to represent reduction operations
// (values start at 6 -- presumably to stay clear of other operation codes; confirm in the compiler side)
#define XOMP_REDUCTION_PLUS 6
#define XOMP_REDUCTION_MINUS 7
#define XOMP_REDUCTION_MUL 8
#define XOMP_REDUCTION_BITAND 9 // &
#define XOMP_REDUCTION_BITOR 10 // |
#define XOMP_REDUCTION_BITXOR 11 // ^
#define XOMP_REDUCTION_LOGAND 12 // &&
#define XOMP_REDUCTION_LOGOR 13 // ||
249
#if 0
// No linker support for device code. We have to put implementation of these device functions into the header
// TODO: wait until nvcc supports linker for device code.
//#define XOMP_INNER_BLOCK_REDUCTION_DECL(dtype)
//__device__ void xomp_inner_block_reduction_##dtype(dtype local_value, dtype * grid_level_results, int reduction_op);
//
// TODO declare more prototypes
//XOMP_INNER_BLOCK_REDUCTION_DECL(int)
//XOMP_INNER_BLOCK_REDUCTION_DECL(float)
//XOMP_INNER_BLOCK_REDUCTION_DECL(double)
//
//#undef XOMP_INNER_BLOCK_REDUCTION_DECL

#endif

// Generate the CPU-side "beyond block" reduction prototype for one element type:
//   dtype xomp_beyond_block_reduction_<dtype>(dtype *per_block_results, int numBlocks, int reduction_op);
#define XOMP_BEYOND_BLOCK_REDUCTION_DECL(dtype) \
  dtype xomp_beyond_block_reduction_##dtype(dtype * per_block_results, int numBlocks, int reduction_op);

XOMP_BEYOND_BLOCK_REDUCTION_DECL(int)
XOMP_BEYOND_BLOCK_REDUCTION_DECL(float)
XOMP_BEYOND_BLOCK_REDUCTION_DECL(double)

#undef XOMP_BEYOND_BLOCK_REDUCTION_DECL
// Liao, 8/29/2013
// Support round-robin static scheduling of loop iterations running on GPUs (accelerator)
// Static even scheduling may cause each thread to touch too much data, which stress memory channel.
// NOT IN USE. We use compiler to generate the variables instead of using a runtime data structure.
// NOTE(review): the opening line of this struct (its tag, e.g. "struct <name> {")
// appears to be missing from this copy of the file -- the fields below are
// orphaned as written; recover the tag from the original source.
 int num; /* the thread number of this thread in team */
 int num_thds; /* current running thread, referenced by children */
 int in_parallel; /* current thread executes the region in parallel */

 /* used for schedule */
 int loop_chunk_size; //************* this is the chunk size
 int loop_end; //************* equivalent to upper limit, up
 int loop_sched_index; //************* lb+chunk_size*tp->num (num is the thread number of this thread in team)
 int loop_stride; //************* chunk_size * nthds /* used for static scheduling */

 /* for 'lastprivate' */
 int is_last;
};
291
#define XOMP_MAX_MAPPED_VARS 256 // for simplicity, we use preallocated memory for storing the mapped variable list
/* Test runtime support for nested device data environments */
/* Liao, May 2, 2013*/
/* A data structure to keep track of a mapped variable
 * Right now we use memory address of the original variable and the size of the variable
 * */
// Fixed: the struct tag line was missing (file corruption); the tag must be
// XOMP_mapped_variable -- later declarations in this header take
// struct XOMP_mapped_variable* (copy_mapped_variable, struct DDE_data).
struct XOMP_mapped_variable
{
  void * address; // original variable's address
  //TODO: support array sections
  int* size;      // per-dimension extents of the mapped section -- presumably nDim entries; confirm
  int* offset;    // per-dimension offsets of the mapped section -- presumably nDim entries; confirm
  int* DimSize;   // per-dimension sizes of the whole array -- presumably nDim entries; confirm
  int nDim;       // number of dimensions
  int typeSize;   // size in bytes of one element
  void * dev_address; // the corresponding device variable's address
  bool copyTo;    // if this variable should be copied to the device first
  bool copyFrom;  // if this variable should be copied back to HOST when exiting the data environment
};
311
// Copy one mapped-variable descriptor into another (desc <- src, going by the
// parameter names -- presumably a field-by-field copy; confirm in implementation).
extern void copy_mapped_variable (struct XOMP_mapped_variable* desc, struct XOMP_mapped_variable* src );
314
/* A doubly linked list for tracking Device Data Environment (DDE) */
typedef struct DDE_data {
 // Do we need this at all? we can allocate/deallocate data without saving region ID
 int Region_ID; // hash of the AST node? or just memory address of the AST node for now

// Store the device ID in DDE
 int devID;

// array of the newly mapped variables, and its element count
 int new_variable_count;
 struct XOMP_mapped_variable* new_variables;
 //struct XOMP_mapped_variable new_variables[XOMP_MAX_MAPPED_VARS];

// array of inherited mapped variables from possible upper level DDEs, and its element count
 int inherited_variable_count;
 struct XOMP_mapped_variable* inherited_variables;
 //struct XOMP_mapped_variable inherited_variables[XOMP_MAX_MAPPED_VARS];

 // link to its parent node
 struct DDE_data* parent;
 // link to its child node
 struct DDE_data* child;
} DDE;
338
// Internal control variables for target devices
// Fixed: xomp_get_num_devices was declared with empty parentheses `()`, an
// old-style declaration with unspecified parameters; `(void)` makes it a proper
// prototype, consistent with xomp_get_max_devices below.
extern int xomp_get_num_devices(void);
extern int xomp_get_max_devices(void);
extern int xomp_num_devices;
extern int xomp_max_num_devices;
344
// The head of the list of DDE data nodes
extern DDE** DDE_head; //TODO. We don't really need this head pointer, it is like a stack, access the end is enough
// The tail of the list
extern DDE** DDE_tail;

// Cached CUDA device properties -- presumably one opaque pointer per device; confirm in implementation.
extern void** xomp_cuda_prop;
// create a new DDE-data node and append it to the end of the tracking list
// copy all variables from its parent node into its set of inherited variables.
//void XOMP_Device_Data_Environment_Enter();
extern void xomp_deviceDataEnvironmentEnter(int devID);

// An all-in-one wrapper integrating: 1) get inherited variable 2) allocate if not found, 3) register,
// and 4) copy into GPU operations into one function
//
// Based on the CPU variable address and size, also indicate if copyin or copyback is needed.
// The function will first try to inherit/reuse the same variable from the parent DDE.
// If not successful, it will allocate new data on the device, register it to the current DDE, and copy CPU values when needed.
// The allocated or found device variable address will be returned.
extern void* xomp_deviceDataEnvironmentPrepareVariable(int devID, void* original_variable_address, int nDim, int typeSize, int* size, int* offset, int* vDimSize, bool copyTo, bool copyFrom);

// Check if an original variable is already mapped in the enclosing data environment; return its device variable's address if yes.
// return NULL if not
//void* XOMP_Device_Data_Environment_Get_Inherited_Variable (void* original_variable_address, int size);
extern void* xomp_deviceDataEnvironmentGetInheritedVariable (int devID, void* original_variable_address, int typeSize, int* size);

// Register a mapped variable (host address, section/type info, device address, copy flags) with the current DDE.
//void XOMP_Device_Data_Environment_Add_Variable (void* var_addr, int var_size, void * dev_addr);
extern void xomp_deviceDataEnvironmentAddVariable (int devID, void* var_addr, int* var_size, int* var_offset, int* var_dim, int nDim, int typeSize, void * dev_addr, bool copyTo, bool copyFrom);

// Exit current DDE: deallocate device memory, delete the DDE-data node from the end of the tracking list
//void XOMP_Device_Data_Environment_Exit();
extern void xomp_deviceDataEnvironmentExit(int devID);
377
378
379#ifdef __cplusplus
380 }
381#endif
382
383#endif /* LIB_XOMP_H */
384
385
386
387