/*
 * Evaluates a cuDSS call once, stores its result in `status`, and prints a
 * diagnostic line when the result is not CUDSS_STATUS_SUCCESS.
 *
 *   call   - expression whose value is assigned to `status` (evaluated once).
 *   status - lvalue that receives the result of `call`.
 *   msg    - text identifying the call; it is stringified and printed verbatim
 *            after "details: ".
 *
 * The macro only reports the failure; control flow continues afterwards, so
 * the caller has to inspect `status` if it needs to stop.
 *
 * The definition deliberately does NOT end with a semicolon: the caller writes
 * `CUDSS_CALL_AND_CHECK(...);`, which keeps the expansion a single statement
 * and makes it safe inside an unbraced if/else.
 */
#define CUDSS_CALL_AND_CHECK(call, status, msg)                                \
    do {                                                                       \
        status = (call);                                                       \
        if (status != CUDSS_STATUS_SUCCESS) {                                  \
            /* msg goes through %s so a '%' in it is never parsed as a */      \
            /* conversion specifier.                                   */      \
            printf("CUDSS call ended unsuccessfully with status = %d, "        \
                   "details: %s\n",                                            \
                   (int)(status), #msg);                                       \
        }                                                                      \
    } while (0)

/*
 * Runs a cuDSS call, optionally preceded by one untimed warm-up invocation,
 * and prints the average host wall-clock time per timed invocation.
 *
 *   call      - expression whose value is assigned to `status`; evaluated once
 *               for the warm-up (when WARM_UP is non-zero) and then `nrun`
 *               times (PERF_RUN non-zero) or once (PERF_RUN zero) inside the
 *               timed region.
 *   status    - lvalue receiving the result of the most recent invocation.
 *   msg       - text identifying the call; stringified and printed verbatim
 *               after "details: " when an invocation fails.
 *   func_name - C string used as the label of the timing line.
 *   WARM_UP   - non-zero to issue one untimed invocation first.
 *   PERF_RUN  - non-zero to repeat the timed invocation `nrun` times.
 *
 * Names that must be visible where the macro is expanded:
 *   start_time - lvalue assignable from double; overwritten with the start
 *                timestamp of the timed region.
 *   nrun       - integer repetition count (read even when PERF_RUN is 0,
 *                because it appears in the expanded expression).
 *   rank       - integer; only the expansion with rank == 0 prints the time.
 *   second()   - host timer returning seconds as a double.
 *
 * The device is synchronized before the start timestamp and before the end
 * timestamp so that the measured interval covers the work issued by `call`.
 * Failures are reported with printf only; execution continues afterwards.
 *
 * The definition does not end with a semicolon; call sites supply it.
 */
#define CUDSS_CALL_AND_CHECK_TIME(call, status, msg, func_name, WARM_UP,       \
                                  PERF_RUN)                                    \
    do {                                                                       \
        /* Repetition count is read once; the reserved-looking local names */  \
        /* avoid shadowing identifiers used inside `call`.                 */  \
        const int cudss_time_iters_ = (PERF_RUN) ? (nrun) : 1;                 \
        cudaError_t cudss_time_sync_ = cudaSuccess;                            \
        if (WARM_UP) {                                                         \
            status = (call);                                                   \
            if (status != CUDSS_STATUS_SUCCESS) {                              \
                printf("CUDSS call ended unsuccessfully with status = %d, "    \
                       "details: %s\n",                                        \
                       (int)(status), #msg);                                   \
            }                                                                  \
        }                                                                      \
        cudss_time_sync_ = cudaDeviceSynchronize();                            \
        if (cudss_time_sync_ != cudaSuccess) {                                 \
            printf("cudaDeviceSynchronize before timing %s failed: %s\n",      \
                   (func_name), cudaGetErrorString(cudss_time_sync_));         \
        }                                                                      \
        start_time = second();                                                 \
        for (int cudss_time_it_ = 0; cudss_time_it_ < cudss_time_iters_;       \
             cudss_time_it_++) {                                               \
            status = (call);                                                   \
            if (status != CUDSS_STATUS_SUCCESS) {                              \
                printf("CUDSS call ended unsuccessfully with status = %d, "    \
                       "details: %s\n",                                        \
                       (int)(status), #msg);                                   \
            }                                                                  \
        }                                                                      \
        cudss_time_sync_ = cudaDeviceSynchronize();                            \
        /* Take the end timestamp before any error reporting so printing */    \
        /* never leaks into the measurement.                             */    \
        const double cudss_time_total_ = second() - start_time;                \
        if (cudss_time_sync_ != cudaSuccess) {                                 \
            printf("cudaDeviceSynchronize after timing %s failed: %s\n",       \
                   (func_name), cudaGetErrorString(cudss_time_sync_));         \
        }                                                                      \
        /* A non-positive repetition count runs nothing; report the raw */     \
        /* elapsed time instead of dividing by zero.                    */     \
        const double cudss_time_avg_ =                                         \
            cudss_time_total_ /                                                \
            (cudss_time_iters_ > 0 ? cudss_time_iters_ : 1);                   \
        if (rank == 0) {                                                       \
            printf("%s: time = %1.8f\n", (func_name), cudss_time_avg_);        \
            fflush(0);                                                         \
        }                                                                      \
    } while (0)

// Simple host-side wall-clock timer (example quality only).
// Returns the current gettimeofday() reading as seconds, with the microsecond
// field folded in as the fractional part.
static double second(void) {
    struct timeval now;
    gettimeofday(&now, NULL);
    const double whole_seconds = (double)now.tv_sec;
    const double fraction = (double)now.tv_usec / 1000000.0;
    return whole_seconds + fraction;
}

// ...

cudssStatus_t status = CUDSS_STATUS_SUCCESS;

// Timing parameters. They are declared before the first timed call because
// CUDSS_CALL_AND_CHECK_TIME names `nrun` in its expansion even when PERF_RUN
// is 0, so `nrun` has to be in scope for the analysis call as well.
int nrun = 10;  // for smaller matrices, one should use a larger value, say 1000;
                // for bigger ones, 1 is often enough
int warmup = 1; // at least one warm-up iteration is recommended for
                // factorization and solve

// Analysis (single timed invocation, no warm-up)
CUDSS_CALL_AND_CHECK_TIME(
    cudssExecute(handle, CUDSS_PHASE_ANALYSIS, solverConfig, solverData, A, sol, rhs),
    status, "cudssExecute for analysis", "ANALYSIS", 0, 0);

// Factorization (optional warm-up, then averaged over nrun invocations)
CUDSS_CALL_AND_CHECK_TIME(cudssExecute(handle, CUDSS_PHASE_FACTORIZATION,
                                       solverConfig, solverData, A, sol, rhs),
                          status, "cudssExecute for factor", "FACTOR", warmup, 1);

// Note: depending on the application, it might also make sense to measure the
// performance of other phases, e.g., CUDSS_PHASE_REFACTORIZATION or the solve
// sub-phases.

// Solve (optional warm-up, then averaged over nrun invocations)
CUDSS_CALL_AND_CHECK_TIME(
    cudssExecute(handle, CUDSS_PHASE_SOLVE, solverConfig, solverData, A, sol, rhs),
    status, "cudssExecute for solve", "SOLVE", warmup, 1);