Examples#

Library initialization#

#include <custabilizer.h>

int main() {
    custabilizerHandle_t handle;
    custabilizerCreate(&handle);

    custabilizerDestroy(handle);
    return 0;
}

Simple circuit simulation#

This example demonstrates how to create a circuit and apply it using custabilizerFrameSimulatorApplyCircuit().

#include <custabilizer.h>
#include <cuda_runtime.h>

int main() {
    // Create library handle
    custabilizerHandle_t handle;
    custabilizerCreate(&handle);

    // Define a simple circuit
    const char* circuitString =
        "X_ERROR(0.01) 0 1\n"
        "H 0\n"
        "CNOT 0 1\n"
        "M 0 1\n";

    // Create circuit
    int64_t bufferSize;
    custabilizerCircuitSizeFromString(handle, circuitString, &bufferSize);
    void* bufferDevice;
    cudaMalloc(&bufferDevice, bufferSize);

    custabilizerCircuit_t circuit;
    custabilizerCreateCircuitFromString(handle, circuitString, bufferDevice,
                                       bufferSize, &circuit);

    // Set up frame simulator parameters
    int64_t numQubits = 2;
    int64_t numShots = 1017;
    int64_t numMeasurements = 2;
    int64_t stride = ((numShots + 7) / 8 + 3) & ~3;  // Pad to multiple of 4

    // Create frame simulator
    custabilizerFrameSimulator_t frameSimulator;
    custabilizerCreateFrameSimulator(handle, numQubits, numShots,
                                    numMeasurements, stride, &frameSimulator);

    // Allocate and initialize bit tables
    int64_t bitTableBytes = numQubits * stride;
    int64_t mTableBytes = numMeasurements * stride;

    custabilizerBitInt_t* xTableDevice;
    custabilizerBitInt_t* zTableDevice;
    custabilizerBitInt_t* mTableDevice;

    cudaMalloc(&xTableDevice, bitTableBytes);
    cudaMalloc(&zTableDevice, bitTableBytes);
    cudaMalloc(&mTableDevice, mTableBytes);

    cudaMemset(xTableDevice, 0, bitTableBytes);
    cudaMemset(zTableDevice, 0, bitTableBytes);
    cudaMemset(mTableDevice, 0, mTableBytes);

    // Apply circuit to the Pauli frames
    uint64_t seed = 42;
    int randomizeFrameAfterMeasurement = 1;
    cudaStream_t stream = 0;

    custabilizerFrameSimulatorApplyCircuit(handle, frameSimulator, circuit,
                                          randomizeFrameAfterMeasurement, seed,
                                          xTableDevice, zTableDevice, mTableDevice,
                                          stream);

    cudaStreamSynchronize(stream);

    // Clean up
    cudaFree(xTableDevice);
    cudaFree(zTableDevice);
    cudaFree(mTableDevice);
    custabilizerDestroyFrameSimulator(frameSimulator);
    custabilizerDestroyCircuit(circuit);
    cudaFree(bufferDevice);
    custabilizerDestroy(handle);
    return 0;
}

DEM sampling pipeline#

This example demonstrates Bernoulli sampling from a probability array followed by a GF(2) sparse-dense matrix multiplication to compute detector outcomes. This is the building-block workflow behind DEMSampler.

#include <custabilizer.h>
#include <cuda_runtime.h>
#include <stdlib.h>

int main() {
    custabilizerHandle_t handle;
    custabilizerCreate(&handle);

    int64_t numErrors  = 4;
    int64_t numDets    = 3;
    int64_t numShots   = 1024;
    int64_t numDetsPad = ((numDets + 31) / 32) * 32;  // round up to 32

    // Error probabilities
    double probsHost[] = {0.01, 0.02, 0.005, 0.03};
    double* probsDev;
    cudaMalloc(&probsDev, numErrors * sizeof(double));
    cudaMemcpy(probsDev, probsHost, numErrors * sizeof(double),
               cudaMemcpyHostToDevice);

    // Step 1: Sparse Bernoulli sampling
    size_t wsSize;
    custabilizerSampleProbArraySparsePrepare(handle, numShots, numErrors,
                                             &wsSize);
    void* wsDev;
    cudaMalloc(&wsDev, wsSize);

    int64_t capacity = numShots;  // initial guess
    uint64_t* colDev;
    uint64_t* rowDev;
    cudaMalloc(&colDev, capacity * sizeof(uint64_t));
    cudaMalloc(&rowDev, (numShots + 1) * sizeof(uint64_t));

    uint64_t nnz = (uint64_t)capacity;
    custabilizerStatus_t st = custabilizerSampleProbArraySparseCompute(
        handle, numShots, numErrors, probsDev, 42,
        &nnz, colDev, rowDev, wsDev, wsSize, 0);

    if (st == CUSTABILIZER_STATUS_INSUFFICIENT_SPARSE_STORAGE) {
        // nnz now holds the required capacity; reallocate and retry
        cudaFree(colDev);
        cudaMalloc(&colDev, nnz * sizeof(uint64_t));
        custabilizerSampleProbArraySparseCompute(
            handle, numShots, numErrors, probsDev, 42,
            &nnz, colDev, rowDev, wsDev, wsSize, 0);
    }

    // Step 2: Detector matrix B (dense, bit-packed)
    int64_t wordsPerRow = numDetsPad / 32;
    int64_t bBytes = numErrors * wordsPerRow * sizeof(custabilizerBitInt_t);
    custabilizerBitInt_t* bHost = (custabilizerBitInt_t*)calloc(
        numErrors * wordsPerRow, sizeof(custabilizerBitInt_t));

    // error 0 -> det 0,1  |  error 1 -> det 1
    // error 2 -> det 2    |  error 3 -> det 0,2
    bHost[0 * wordsPerRow] = 0x3;  // bits 0,1
    bHost[1 * wordsPerRow] = 0x2;  // bit 1
    bHost[2 * wordsPerRow] = 0x4;  // bit 2
    bHost[3 * wordsPerRow] = 0x5;  // bits 0,2

    custabilizerBitInt_t* bDev;
    cudaMalloc(&bDev, bBytes);
    cudaMemcpy(bDev, bHost, bBytes, cudaMemcpyHostToDevice);

    // Step 3: GF(2) matmul  C = S @ B
    int64_t cWords = numShots * wordsPerRow;
    custabilizerBitInt_t* cDev;
    cudaMalloc(&cDev, cWords * sizeof(custabilizerBitInt_t));

    custabilizerGF2SparseDenseMatrixMultiply(
        handle, numShots, numDetsPad, numErrors, nnz,
        colDev, rowDev, bDev, 0, cDev, 0);

    cudaDeviceSynchronize();

    // Clean up
    free(bHost);
    cudaFree(cDev);
    cudaFree(bDev);
    cudaFree(colDev);
    cudaFree(rowDev);
    cudaFree(wsDev);
    cudaFree(probsDev);
    custabilizerDestroy(handle);
    return 0;
}