Get and set slice modules

The following is just going to be a big code dump and there is no need to think about this too deeply.


type DeviceGetSliceModule() = 
    let block_size = 256

    let kernel_code = "
        //Kernel code:
        extern \"C\" {
            __global__ void getSliceKernel(const "+FloatTypeCpp+"* matrix, "+FloatTypeCpp+"* out_matrix, const int start_row, const int end_row, const int num_rows, const int start_col, const int end_col, const int num_cols, const unsigned col_major){
                const int stride = blockDim.x * gridDim.x;
                if (col_major){
                    int i = threadIdx.x+blockIdx.x*blockDim.x;
                    const int row_stride = end_row-start_row+1;
                    const int col_stride = end_col-start_col+1;
                    while (1) {
                        const int row_i = i % row_stride;
                        const int col_i = i / row_stride;
                        const int row = start_row+row_i;
                        const int col = start_col+col_i;
                        const int idx = row+col*num_rows;
                        if (row_i < row_stride && col_i < col_stride) {
                            out_matrix[i] = matrix[idx];
                            i += stride;
                        } else return;
                    }
                }
                else{
                    int i = threadIdx.x+blockIdx.x*blockDim.x;
                    const int row_stride = end_row-start_row+1;
                    const int col_stride = end_col-start_col+1;
                    while (1) {
                        const int row_i = i / col_stride;
                        const int col_i = i % col_stride;
                        const int row = start_row+row_i;
                        const int col = start_col+col_i;
                        const int idx = col+row*num_cols;
                        if (row_i < row_stride && col_i < col_stride) {
                            out_matrix[i] = matrix[idx];
                            i += stride;
                        } else return;
                    }
                }
            }
        }

        "
    let k = new ManagedCuda.NVRTC.CudaRuntimeCompiler(kernel_code,"getSliceKernel")
    do  
        try k.Compile([|"-arch=compute_30"|])
        with 
        | 😕 NVRTCException as x -> 
            printfn "%s" (k.GetLogAsString())
            reraise()

    let kernel = ctx.LoadKernelPTX(k.GetPTX(),"getSliceKernel")

    /// For matrices stored in row major order.
    /// Zero based indexing.
    member t.AR(x: dMatrix, start_row, end_row, start_col, end_col) =
        if (start_row < 0 || start_col < 0) then failwith "start_row < 0 || start_col < 0"
        if (end_row >= x.num_rows || start_col >= x.num_cols) then failwith "end_row >= x.num_rows || start_col >= x.num_cols"
        let order = 0u
        let row_stride = end_row-start_row+1
        let col_stride = end_col-start_col+1
        let y = dMatrix.create(row_stride, col_stride)
        let n = row_stride*col_stride
        let gridSize = divup n block_size
        kernel.GridDimensions <- dim3(gridSize)
        kernel.BlockDimensions <- dim3(block_size)
        kernel.RunAsync(str.Stream, x.dArray.DevicePointer,y.dArray.DevicePointer,start_row, end_row, x.num_rows, start_col, end_col, x.num_cols, order) |> ignore
        y

    /// For matrices stored in column major order.
    /// Zero based indexing.
    member t.AC(x: dMatrix, start_row, end_row, start_col, end_col) =
        if (start_row < 0 || start_col < 0) then failwith "start_row < 0 || start_col < 0"
        if (end_row >= x.num_rows || start_col >= x.num_cols) then failwith "end_row >= x.num_rows || start_col >= x.num_cols"
        let order = 1u
        let row_stride = end_row-start_row+1
        let col_stride = end_col-start_col+1
        let y = dMatrix.create(row_stride, col_stride)
        let n = row_stride*col_stride
        let gridSize = divup n block_size
        kernel.GridDimensions <- dim3(gridSize)
        kernel.BlockDimensions <- dim3(block_size)
        kernel.RunAsync(str.Stream, x.dArray.DevicePointer,y.dArray.DevicePointer,start_row, end_row, x.num_rows, start_col, end_col, x.num_cols, order) |> ignore
        y

type DeviceSetSliceModule() = 
    let block_size = 256

    let kernel_code = "
        //Kernel code:
        extern \"C\" {
            __global__ void setSliceKernel("+FloatTypeCpp+"* matrix, const "+FloatTypeCpp+"* out_matrix, const int start_row, const int end_row, const int num_rows, const int start_col, const int end_col, const int num_cols, const unsigned col_major){
                const int stride = blockDim.x * gridDim.x;
                if (col_major){
                    int i = threadIdx.x+blockIdx.x*blockDim.x;
                    const int row_stride = end_row-start_row+1;
                    const int col_stride = end_col-start_col+1;
                    while (1) {
                        const int row_i = i % row_stride;
                        const int col_i = i / row_stride;
                        const int row = start_row+row_i;
                        const int col = start_col+col_i;
                        const int idx = row+col*num_rows;
                        if (row_i < row_stride && col_i < col_stride) {
                            matrix[idx] = out_matrix[i];
                            i += stride;
                        } else return;
                    }
                }
                else{
                    int i = threadIdx.x+blockIdx.x*blockDim.x;
                    const int row_stride = end_row-start_row+1;
                    const int col_stride = end_col-start_col+1;
                    while (1) {
                        const int row_i = i / col_stride;
                        const int col_i = i % col_stride;
                        const int row = start_row+row_i;
                        const int col = start_col+col_i;
                        const int idx = col+row*num_cols;
                        if (row_i < row_stride && col_i < col_stride) {
                            matrix[idx] = out_matrix[i];
                            i += stride;
                        } else return;
                    }
                }
            }
        }

        "
    let k = new ManagedCuda.NVRTC.CudaRuntimeCompiler(kernel_code,"setSliceKernel")
    do  
        try k.Compile([|"-arch=compute_30"|])
        with 
        | 😕 NVRTCException as x -> 
            printfn "%s" (k.GetLogAsString())
            reraise()

    let kernel = ctx.LoadKernelPTX(k.GetPTX(),"setSliceKernel")

    /// For matrices stored in row major order.
    /// Zero based indexing.
    member t.AR(x: dMatrix, y: dMatrix, start_row, end_row, start_col, end_col) =
        if (start_row < 0 || start_col < 0) then failwith "start_row < 0 || start_col < 0"
        if (end_row >= x.num_rows || start_col >= x.num_cols) then failwith "end_row >= x.num_rows || start_col >= x.num_cols"
        let order = 0u
        let row_stride = end_row-start_row+1
        let col_stride = end_col-start_col+1
        if y.rc <> (row_stride,col_stride) then failwith "y.rc <> row_stride,col_stride"
        let n = row_stride*col_stride
        let gridSize = divup n block_size
        kernel.GridDimensions <- dim3(gridSize)
        kernel.BlockDimensions <- dim3(block_size)
        kernel.RunAsync(str.Stream, x.dArray.DevicePointer,y.dArray.DevicePointer,start_row, end_row, x.num_rows, start_col, end_col, x.num_cols, order) |> ignore

    /// For matrices stored in column major order.
    /// Zero based indexing.
    member t.AC(x: dMatrix, y: dMatrix, start_row, end_row, start_col, end_col) =
        if (start_row < 0 || start_col < 0) then failwith "start_row < 0 || start_col < 0"
        if (end_row >= x.num_rows || start_col >= x.num_cols) then failwith "end_row >= x.num_rows || start_col >= x.num_cols"
        let order = 1u
        let row_stride = end_row-start_row+1
        let col_stride = end_col-start_col+1
        if y.rc <> (row_stride,col_stride) then failwith "y.rc <> row_stride,col_stride"
        let n = row_stride*col_stride
        let gridSize = divup n block_size
        kernel.GridDimensions <- dim3(gridSize)
        kernel.BlockDimensions <- dim3(block_size)
        kernel.RunAsync(str.Stream, x.dArray.DevicePointer,y.dArray.DevicePointer,start_row, end_row, x.num_rows, start_col, end_col, x.num_cols, order) |> ignore

// The Item and GetSlice operators. Column major
let setsliceModule = DeviceSetSliceModule()
let getsliceModule = DeviceGetSliceModule()

type dMatrix with
    member t.GetSlice(rowStart: int option, rowFinish : int option,
                         colStart: int option, colFinish : int option) =
        let rowStart = defaultArg rowStart 0
        let rowFinish = defaultArg rowFinish (t.num_rows-1)
        let colStart = defaultArg colStart 0
        let colFinish = defaultArg colFinish (t.num_cols-1)
        getsliceModule.AC(t,rowStart,rowFinish,colStart,colFinish)

    member t.GetSlice(row: int, colStart: int option, colFinish: int option) =
            let colStart = defaultArg colStart 0
            let colFinish = defaultArg colFinish t.num_cols-1
            getsliceModule.AC(t,row,row,colStart,colFinish)

    member t.GetSlice(rowStart: int option, rowFinish: int option, col: int) =
            let rowStart = defaultArg rowStart 0
            let rowFinish = defaultArg rowFinish t.num_rows-1
            getsliceModule.AC(t,rowStart,rowFinish,col,col)

    member t.SetSlice(rowStart: int option, rowFinish : int option,
                         colStart: int option, colFinish : int option, y) =
        let rowStart = defaultArg rowStart 0
        let rowFinish = defaultArg rowFinish (t.num_rows-1)
        let colStart = defaultArg colStart 0
        let colFinish = defaultArg colFinish (t.num_cols-1)
        setsliceModule.AC(t,y,rowStart,rowFinish,colStart,colFinish)

    member t.SetSlice(row: int, colStart: int option, colFinish: int option,y) =
            let colStart = defaultArg colStart 0
            let colFinish = defaultArg colFinish t.num_cols-1
            setsliceModule.AC(t,y,row,row,colStart,colFinish)

    member t.SetSlice(rowStart: int option, rowFinish: int option, col: int,y) =
            let rowStart = defaultArg rowStart 0
            let rowFinish = defaultArg rowFinish t.num_rows-1
            setsliceModule.AC(t,y,rowStart,rowFinish,col,col)

Even though it is 200 lines long, all the above does is lets us access matrix like a 2D array. With this extension it can be read and set using .[1..3,2..5] or something to that effect. These kernels could also be helpful in isolation. There are both column major and row major versions inside the function.

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s