src/ngfx/computeOps/MatrixMultiplyCPUOp.cpp

Defines

Name
VEC4_LOAD(src, j)

Macro Documentation

define VEC4_LOAD

#define VEC4_LOAD(
    src,
    j
)
vec4(src.data[j], src.data[j + 1], src.data[j + 2], src.data[j + 3])

Source code

/*
 * Copyright 2020 GoPro Inc.
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
#include "ngfx/computeOps/MatrixMultiplyCPUOp.h"
#include "ngfx/core/DebugUtil.h"
#include "ngfx/core/Timer.h"
#include <glm/glm.hpp>
using namespace ngfx;
using namespace glm;

// Constructs the CPU matrix-multiply op.
// The base MatrixMultiplyOp is constructed with nullptr, the destination
// descriptor is stored as-is, and both source descriptors are handed to
// update(), which also prepares the transposed copy of src1.
MatrixMultiplyCPUOp::MatrixMultiplyCPUOp(MatrixParam src0, MatrixParam src1,
                                         MatrixParam dst)
    : MatrixMultiplyOp(nullptr), dst(dst) {
  this->update(src0, src1);
}

// Nothing to release explicitly; members clean up after themselves.
MatrixMultiplyCPUOp::~MatrixMultiplyCPUOp() = default;
// Runs the multiplication on the CPU. Both arguments are ignored by this
// implementation; the work is done entirely by matrixMultiply().
void MatrixMultiplyCPUOp::apply(CommandBuffer * /*commandBuffer*/,
                                Graphics * /*graphics*/) {
  this->matrixMultiply();
}

// Replaces the two source matrices and rebuilds the transposed copy of src1.
//   src0: left-hand operand descriptor (stored as-is).
//   src1: right-hand operand descriptor (stored as-is, and transposed into
//         the internally owned src1t_data buffer).
void MatrixMultiplyCPUOp::update(MatrixParam src0, MatrixParam src1) {
  // Size the backing storage for the transpose before describing it, so the
  // data pointer captured below refers to the resized buffer.
  src1t_data.resize(src1.w * src1.h);
  auto *const transposed_storage = src1t_data.data();

  // The transposed descriptor swaps the two dimensions of src1.
  src1t = {src1.h, src1.w, transposed_storage};

  this->src0 = src0;
  this->src1 = src1;

  transpose(src1, src1t);
}

// Writes the transpose of `src` into `dst` and logs the elapsed time.
//   src: matrix to read; element (row, col) is read from
//        src.data[row * src.w + col].
//   dst: matrix to fill; its dst.h * dst.w elements are written
//        consecutively, row after row, starting at dst.data.
void MatrixMultiplyCPUOp::transpose(MatrixParam &src, MatrixParam &dst) {
  Timer timer;
  float *const out = dst.data;
  size_t write_pos = 0; // linear position of the next element to write
  for (uint32_t r = 0; r < dst.h; ++r) {
    for (uint32_t c = 0; c < dst.w; ++c) {
      // dst(r, c) takes the value of src(c, r).
      out[write_pos] = src.data[c * src.w + r];
      ++write_pos;
    }
  }
  timer.update();
  NGFX_LOG("transpose elapsed: %f", timer.elapsed);
}

// Builds a vec4 from four consecutive floats of `src.data`, starting at
// index `j`. Both parameters are parenthesized so that compound arguments
// (e.g. `cond ? a : b`) index correctly. Note that `src` and `j` are each
// evaluated four times, so arguments must not have side effects.
#define VEC4_LOAD(src, j)                                                      \
  vec4((src).data[(j)], (src).data[(j) + 1], (src).data[(j) + 2],              \
       (src).data[(j) + 3])

void MatrixMultiplyCPUOp::matrixMultiply() {
  Timer timer;
  float *dst_data = dst.data;
  for (uint32_t dst_row = 0; dst_row < dst.h; dst_row++) {
    uint32_t src0_offset = dst_row * src0.w;
    for (uint32_t dst_col = 0; dst_col < dst.w; dst_col++) {
      uint32_t src1t_offset = dst_col * src1t.h;
      float c = 0.0f;
      for (uint32_t j = 0; j < src0.w; j += 4) {
        vec4 a0 = VEC4_LOAD(src0, src0_offset + j),
             b0 = VEC4_LOAD(src1t, src1t_offset + j);
        c += dot(a0, b0);
      }
      *dst_data++ = c;
    }
  }
  timer.update();
  NGFX_LOG("CPU matrix multiply elapsed: %f", timer.elapsed);
}

Updated on 3 April 2021 at 20:21:51 PDT