StarPU Handbook - StarPU Applications
Loading...
Searching...
No Matches
2. A Vector Scaling Application

2.1 Base version

The non-StarPU version shows a basic example that we will be using to illustrate how to use StarPU. It simply allocates a vector, and calls a scaling function over it.

/* Scale the first n elements of val in place, multiplying each one by factor. */
void vector_scal_cpu(float *val, unsigned n, float factor)
{
unsigned i;
for (i = 0; i < n; i++)
val[i] *= factor;
}
/* Number of elements in the vector. */
#define NX 2048
/* Allocate a vector of NX floats, set every element to 1.0f, scale it with
 * vector_scal_cpu(), and print the first element on stderr before and after
 * the scaling. */
int main(void)
{
float *vector;
unsigned i;
/* NOTE(review): the malloc() result is used without being checked. */
vector = malloc(sizeof(vector[0]) * NX);
for (i = 0; i < NX; i++)
vector[i] = 1.0f;
fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
float factor = 3.14;
/* Scale the whole vector in place. */
vector_scal_cpu(vector, NX, factor);
fprintf(stderr, "AFTER First element is %f\n", vector[0]);
free(vector);
return 0;
}

2.2 StarPU C version

2.2.1 Computation Kernels

Here we transform the computation function vector_scal_cpu into its StarPU equivalent.

/* Original kernel: scale the first n elements of val in place by factor. */
void vector_scal_cpu(float *val, unsigned n, float factor)
{
unsigned i;
for (i = 0; i < n; i++)
val[i] *= factor;
}

The corresponding StarPU function takes as parameters a list of DSM interfaces and a pointer to the non-DSM parameters.

void vector_scal_cpu(void *buffers[], void *cl_arg)
{

The first DSM parameter is the vector and is available through buffers[0]. StarPU provides macros to access the vector interface and extract the pointer to the data and the number of elements of the vector.

struct starpu_vector_interface *vector = buffers[0];
float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
unsigned n = STARPU_VECTOR_GET_NX(vector);
#define STARPU_VECTOR_GET_NX(interface)
Definition starpu_data_interfaces.h:2100
#define STARPU_VECTOR_GET_PTR(interface)
Definition starpu_data_interfaces.h:2084
Definition starpu_data_interfaces.h:1981

The non-DSM parameters are stored in the second argument of the function, and need to be unpacked.

float factor;
starpu_codelet_unpack_args(cl_arg, &factor);
void starpu_codelet_unpack_args(void *cl_arg,...)

It is then possible to perform the vector scaling as in the original function.

unsigned i;
for (i = 0; i < n; i++)
val[i] *= factor;
Original code StarPU code
/* Original code: scale the first n elements of val in place by factor. */
void vector_scal_cpu(float *val, unsigned n, float factor)
{
unsigned i;
for (i = 0; i < n; i++)
val[i] *= factor;
}
/* StarPU code: same scaling kernel, written with the StarPU codelet signature.
 * buffers[0] is the DSM interface of the vector; cl_arg carries the packed
 * non-DSM parameter (the scaling factor). */
void vector_scal_cpu(void *buffers[], void *cl_arg)
{
/* Get the data pointer and the element count from the vector interface. */
struct starpu_vector_interface *vector = buffers[0];
float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
unsigned n = STARPU_VECTOR_GET_NX(vector);
/* Unpack the scaling factor from cl_arg. */
float factor;
starpu_codelet_unpack_args(cl_arg, &factor);
/* Scale the vector in place, exactly as the original function does. */
unsigned i;
for (i = 0; i < n; i++)
val[i] *= factor;
}

The CUDA and OpenCL implementations can be seen in FullSourceCodeVectorScal.

2.2.2 Main Code

Let's look now at the main code.

  • The cl codelet structure simply gathers pointers to the functions mentioned above, and notes that the functions take only one DSM parameter.

    /* Codelet of the scaling task: one implementation per device type (CPU,
     * CUDA, OpenCL), working on a single DSM buffer accessed in STARPU_RW mode. */
    static struct starpu_codelet cl =
    {
    .cpu_funcs = {vector_scal_cpu},
    .cuda_funcs = {vector_scal_cuda},
    .opencl_funcs = {vector_scal_opencl},
    /* Only one DSM parameter: the vector. */
    .nbuffers = 1,
    .modes = {STARPU_RW}
    };
    starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS]
    Definition starpu_task.h:414
    Definition starpu_task.h:338
    @ STARPU_RW
    Definition starpu_data.h:60
  • The main function starts with initializing StarPU with the default parameters.

    int ret = starpu_init(NULL);
    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
    int starpu_init(struct starpu_conf *conf)
    #define STARPU_CHECK_RETURN_VALUE(err, message,...)
    Definition starpu_util.h:416
  • It then allocates the vector and fills it like the original code.

    vector = malloc(sizeof(vector[0]) * NX);
    for (i = 0; i < NX; i++)
    vector[i] = 1.0f;
    fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
  • It then registers the data to StarPU, and gets back a DSM handle. From now on, the application is not supposed to access vector directly, since its content may be copied and modified by a task on a GPU, the main-memory copy then being outdated.

    starpu_data_handle_t vector_handle;
    starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
    #define STARPU_MAIN_RAM
    Definition starpu_task.h:144
    void starpu_vector_data_register(starpu_data_handle_t *handle, int home_node, uintptr_t ptr, uint32_t nx, size_t elemsize)
    struct _starpu_data_state * starpu_data_handle_t
    Definition starpu_data.h:45
  • It then submits an (asynchronous) task to StarPU.

    float factor = 3.14;
    ret = starpu_task_insert(&cl,
    STARPU_RW, vector_handle,
    STARPU_VALUE, &factor, sizeof(factor),
    0);
    STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
    int starpu_task_insert(struct starpu_codelet *cl,...)
    #define STARPU_VALUE
    Definition starpu_task_util.h:45
  • It waits for task completion, and unregisters the vector from StarPU, which brings back the modified version to main memory, so the result can be read.

    starpu_task_wait_for_all();
    starpu_data_unregister(vector_handle);
    int starpu_task_wait_for_all(void)
    void starpu_data_unregister(starpu_data_handle_t handle)
  • Finally, it shuts down StarPU:

    starpu_shutdown();
    void starpu_shutdown(void)
Original code StarPU code
/* Number of elements in the vector. */
#define NX 2048
/* Original code: allocate a vector of NX floats, set every element to 1.0f,
 * scale it with vector_scal_cpu(), and print the first element on stderr
 * before and after the scaling. */
int main(void)
{
float *vector;
unsigned i;
/* NOTE(review): the malloc() result is used without being checked. */
vector = malloc(sizeof(vector[0]) * NX);
for (i = 0; i < NX; i++)
vector[i] = 1.0f;
fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
float factor = 3.14;
/* Scale the whole vector in place. */
vector_scal_cpu(vector, NX, factor);
fprintf(stderr, "AFTER First element is %f\n", vector[0]);
free(vector);
return 0;
}
#include <starpu.h>
extern void vector_scal_cpu(void *buffers[], void *_args);
extern void vector_scal_cuda(void *buffers[], void *_args);
extern void vector_scal_opencl(void *buffers[], void *_args);
/* Codelet of the scaling task: one implementation per device type (CPU,
 * CUDA, OpenCL), working on a single DSM buffer accessed in STARPU_RW mode. */
static struct starpu_codelet cl =
{
.cpu_funcs = {vector_scal_cpu},
.cuda_funcs = {vector_scal_cuda},
.opencl_funcs = {vector_scal_opencl},
/* Only one DSM parameter: the vector. */
.nbuffers = 1,
.modes = {STARPU_RW}
};
#ifdef STARPU_USE_OPENCL
/* OpenCL program holding the scaling kernel; loaded at the start of main()
 * and unloaded before StarPU is shut down. */
struct starpu_opencl_program programs;
#endif
/* Number of elements in the vector. */
#define NX 2048
/* StarPU code: same program as the original one, except that the scaling is
 * submitted to StarPU as a task working on a registered vector instead of
 * being called directly. */
int main(void)
{
float *vector;
unsigned i;
/* Initialize StarPU with the default parameters. */
int ret = starpu_init(NULL);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
#ifdef STARPU_USE_OPENCL
/* Load the OpenCL kernel source into programs. */
starpu_opencl_load_opencl_from_file("vector_scal_opencl_kernel.cl", &programs, NULL);
#endif
/* Allocate and fill the vector like the original code. */
vector = malloc(sizeof(vector[0]) * NX);
for (i = 0; i < NX; i++)
vector[i] = 1.0f;
fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
/* Register the vector to StarPU and get back a DSM handle. From now on the
 * application must not access vector directly: a task may modify a copy on a
 * GPU, leaving the main-memory copy outdated. */
starpu_data_handle_t vector_handle;
starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
/* Submit the (asynchronous) scaling task; factor is passed by value. */
float factor = 3.14;
ret = starpu_task_insert(&cl,
STARPU_RW, vector_handle,
STARPU_VALUE, &factor, sizeof(factor),
0);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
/* Wait for task completion, then unregister the vector, which brings the
 * modified version back to main memory so the result can be read. */
starpu_task_wait_for_all();
starpu_data_unregister(vector_handle);
fprintf(stderr, "AFTER First element is %f\n", vector[0]);
free(vector);
#ifdef STARPU_USE_OPENCL
/* Release the OpenCL program loaded at startup. */
starpu_opencl_unload_opencl(&programs);
#endif
/* Shut down StarPU. */
starpu_shutdown();
return 0;
}
cl_program programs[STARPU_MAXOPENCLDEVS]
Definition starpu_opencl.h:48
int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char *build_options)
int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs)
Definition starpu_opencl.h:46

2.3 Building and Running

We will use the StarPU docker image.

$ docker run -it registry.gitlab.inria.fr/starpu/starpu-docker/starpu:latest

If your machine has GPU devices, you can use the following command to enable the GPU devices within the docker image.

$ docker run -it --gpus all registry.gitlab.inria.fr/starpu/starpu-docker/starpu:latest

From your docker image, you can then call the following commands.

$ cd src/starpu/doc/tutorial
$ make vector_scal
$ ./vector_scal

You can set the environment variable STARPU_WORKER_STATS to 1 when running your application to see the number of tasks executed by each device.

$ STARPU_WORKER_STATS=1 ./vector_scal

If your machine has GPU devices, you can force the execution on the GPU devices by setting the number of CPU workers to 0.

# to force the execution on a GPU device; by default, CUDA will be used
$ STARPU_WORKER_STATS=1 STARPU_NCPU=0 ./vector_scal

# to force the execution on an OpenCL device
$ STARPU_WORKER_STATS=1 STARPU_NCPU=0 STARPU_NCUDA=0 ./vector_scal