code sample

Code Samples

Catalogue

Rasterization: Texture Mapping

Modeling: Mesh Upsampling
Path Tracing: BVH (Bounding Volume Hierarchy) Acceleration
Path Tracing: Monte Carlo Integration and Global Illumination
Cloth Simulation
Cloth Simulation: Diffuse Fragment Shader
Cloth Simulation: Blinn-Phong Fragment Shader
Cloth Simulation: Texture Mapping Fragment Shader
Cloth Simulation: Bump Mapping Vertex Shader
Cloth Simulation: Bump Mapping Fragment Shader
Cloth Simulation: Displacement Mapping Vertex Shader
Cloth Simulation: Displacement Mapping Fragment Shader
Cloth Simulation: Mirror Fragment Shader
Image Filtering Acceleration by Pthread
Image Filtering Acceleration by CUDA
Image Filtering Acceleration by OpenACC
Matrix Multiplication Acceleration by Memory Locality, SIMD, OpenMP and MPI

Rasterization: Texture Mapping


x
void RasterizerImp::rasterize_textured_triangle(float x0, float y0, float u0, float v0,
                                                    float x1, float y1, float u1, float v1,
                                                    float x2, float y2, float u2, float v2,
                                                    Texture &tex)
    {
        // Ensure that these points are arranged in counterclockwise order. If not, exchange them
        if ((x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) < 0)
        {
            swap(x1, x2);
            swap(y1, y2);
            swap(u1, u2);
            swap(v1, v2);
        }

        // Get the boundary and use ceil() / floor() to improve the accuracy

        // Find the boundary of the triangle
        int xmax = ceil(std::max({x0, x1, x2}));
        int xmin = floor(std::min({x0, x1, x2}));
        int ymax = ceil(std::max({y0, y1, y2}));
        int ymin = floor(std::min({y0, y1, y2}));

        int scale = (int)sqrt(sample_rate);
        // Initialize the SampleParams
        SampleParams sp;
        sp.lsm = lsm;
        sp.psm = psm;
        Color c;

        // Initialize the transition matrix
        Matrix3x3 M(x0, x1, x2, y0, y1, y2, 1, 1, 1);
        Matrix3x3 inv_M = M.inv();

        // Convert the point into Vector form
        Vector2D X0 = Vector2D(x0, y0);
        Vector2D X1 = Vector2D(x1, y1);
        Vector2D X2 = Vector2D(x2, y2);
        // Triangle line
        Vector2D l0 = X1 - X0;
        Vector2D l1 = X2 - X1;
        Vector2D l2 = X0 - X2;
        // Normal vector
        Vector2D N0 = Vector2D(-l0.y, l0.x);
        Vector2D N1 = Vector2D(-l1.y, l1.x);
        Vector2D N2 = Vector2D(-l2.y, l2.x);

        for (int y = ymin; y < ymax; y++)
        {
            for (int x = xmin; x < xmax; x++)
            {
                for (int j = 0; j < scale; j++)
                {
                    for (int i = 0; i < scale; i++)
                    {
                        float px = (float)x + ((float)i + 0.5) / (float)scale;
                        float py = (float)y + ((float)j + 0.5) / (float)scale;
                        Vector2D p0 = Vector2D(px, py);
                        Vector2D V1 = p0 - X0;
                        Vector2D V2 = p0 - X1;
                        Vector2D V3 = p0 - X2;
                        // Use L = V * N to detect whether the point is inside the triangle
                        if ((dot(V1, N0) >= 0) && (dot(V2, N1) >= 0) && (dot(V3, N2) >= 0))
                        {
                            Vector3D p = Vector3D(px, py, 1);
                            // Use M^-1 * (x, y, 1) to get the barycentric coordinate
                            Vector3D weight_p = inv_M * p;
                            weight_p.z = 1 - weight_p.x - weight_p.y;
                            // Use the weight to calculate the uv coordinate of the sampled point
                            sp.p_uv = Vector2D(dot(weight_p, Vector3D(u0, u1, u2)), dot(weight_p, Vector3D(v0, v1, v2)));

                            Vector3D weight_p_dx = inv_M * Vector3D(px + 1., py, 1);
                            weight_p_dx.z = 1 - weight_p_dx.x - weight_p_dx.y;
                            // Since dx=1, then we just need to use subtraction to get d(u,v)/dx
                            sp.p_dx_uv = Vector2D(dot(weight_p_dx, Vector3D(u0, u1, u2)), dot(weight_p_dx, Vector3D(v0, v1, v2))) - sp.p_uv;

                            Vector3D weight_p_dy = inv_M * Vector3D(px, py + 1., 1);
                            weight_p_dy.z = 1 - weight_p_dy.x - weight_p_dy.y;
                            // Since dy=1, then we just need to use subtraction to get d(u,v)/dy
                            sp.p_dy_uv = Vector2D(dot(weight_p_dy, Vector3D(u0, u1, u2)), dot(weight_p_dy, Vector3D(v0, v1, v2))) - sp.p_uv;

                            // Check the texture image and get the color
                            c = tex.sample(sp);
                            super_sample_fill_pixel(x, y, i + j * scale, c);
                        }
                    }
                }
            }
        }
    }

float Texture::get_level(const SampleParams &sp)
    {
        // Scale the derivatives according to the texture size
        Vector2D p_tx_uv = Vector2D(sp.p_dx_uv.x * (width - 1), sp.p_dx_uv.y * (height - 1));
        Vector2D p_ty_uv = Vector2D(sp.p_dy_uv.x * (width - 1), sp.p_dy_uv.y * (height - 1));
        // Get the bigger norm
        float L = std::max(p_tx_uv.norm(), p_ty_uv.norm());
        float D = log2(L);
        return D;
    }

Catalogue

Modeling: Mesh Upsampling


xxxxxxxxxx
void MeshResampler::upsample(HalfedgeMesh &mesh)
    {
    
        // 1. Compute new positions for all the old vertices
        for (VertexIter v = mesh.verticesBegin(); v != mesh.verticesEnd(); v++)
        {
            v->isNew = false; // set it to be false
            HalfedgeIter h = v->halfedge();
            Vector3D original_position = v->position;
            Vector3D original_neighbor_position_sum(0, 0, 0);
            int n = 0;
            float u;

            // Find all the neighbors of the vertex
            do
            {
                n++;
                original_neighbor_position_sum += h->twin()->vertex()->position;
                h = h->twin()->next();
            } while (h != v->halfedge());

            // Use the formula to compute the final result
            if (n == 3)
            {
                u = 3.0 / 16.0;
            }
            else
            {
                u = 3.0 / (8.0 * n);
            }

            v->newPosition = (1 - n * u) * original_position + u * original_neighbor_position_sum;
        }

        // 2. Compute the updated vertex positions associated with edges
        for (EdgeIter e = mesh.edgesBegin(); e != mesh.edgesEnd(); e++)
        {
            // Set isNew to false for all old edges
            e->isNew = false;

            /* C
              /|\
             A | D
              \|/
               B  */

            HalfedgeIter h0 = e->halfedge();
            HalfedgeIter h1 = h0->twin();
            HalfedgeIter CA = h0->next();
            HalfedgeIter AB = CA->next();
            HalfedgeIter BD = h1->next();
            HalfedgeIter DC = BD->next();

            // Find the position of the four vertices
            Vector3D pB = h0->vertex()->position;
            Vector3D pC = CA->vertex()->position;
            Vector3D pA = AB->vertex()->position;
            Vector3D pD = DC->vertex()->position;

            // Use the formula to compute the final result
            e->newPosition = 3.0 / 8.0 * (pB + pC) + 1.0 / 8.0 * (pA + pD);
        }

        // 3. Split every edge in the mesh
        for (EdgeIter e = mesh.edgesBegin(); e != mesh.edgesEnd(); e++)
        {
            /* C
              /|\
             A | D
              \|/
               B  */
            HalfedgeIter h0 = e->halfedge();
            HalfedgeIter h1 = h0->twin();
            VertexIter B = h0->vertex();
            VertexIter C = h1->vertex();

            // Check whether both vertices are new
            if (B->isNew == false && C->isNew == false)
            {
                VertexIter new_v = mesh.splitEdge(e);
                new_v->newPosition = e->newPosition;
            }
        }

        // 4. Flip any new edge that connects an old and new vertex
        for (EdgeIter e = mesh.edgesBegin(); e != mesh.edgesEnd(); e++)
        {
            /* C
              /|\
             A | D
              \|/
               B  */
            HalfedgeIter h0 = e->halfedge();
            HalfedgeIter h1 = h0->twin();
            VertexIter B = h0->vertex();
            VertexIter C = h1->vertex();

            // Check whether one vertex is new, the other is old
            if ((B->isNew ^ C->isNew) && e->isNew == true)
            {
                mesh.flipEdge(e);
            }
        }

        // 5. Copy the new vertex positions into the final Vertex::position
        for (VertexIter v = mesh.verticesBegin(); v != mesh.verticesEnd(); v++)
        {
            v->position = v->newPosition;
        }
    }

Catalogue

Path Tracing: BVH (Bounding Volume Hierarchy) Acceleration


xxxxxxxxxx
BVHNode *BVHAccel::construct_bvh(std::vector<Primitive *>::iterator start,
                                     std::vector<Primitive *>::iterator end,
                                     size_t max_leaf_size)
    {

      // Construct a BVH from the given vector of primitives and maximum leaf
      // size configuration. The starter code builds a BVH aggregate with a
      // single leaf node (which is also the root) that encloses all the
      // primitives.

      int count = end - start;

      if (count > max_leaf_size)
      {
        Vector3D avg_centrd = Vector3D(0, 0, 0);

        BBox bbox;

        for (auto p = start; p != end; p++)
        {
          BBox bb = (*p)->get_bbox();
          bbox.expand(bb);
          avg_centrd += bb.centroid();
        }

        avg_centrd /= count;

        auto smaller_than_centrd = [avg_centrd](Primitive *p)
        {
          return p->get_bbox().centroid().x < avg_centrd.x;
        };

        auto mid = std::partition(start, end, smaller_than_centrd);

        BVHNode *node = new BVHNode(bbox);

        if (mid == start || mid == end)
        {

          std::sort(start, end, cmp);

          node->l = construct_bvh(start, start + max_leaf_size, max_leaf_size);
          node->r = construct_bvh(start + max_leaf_size, end, max_leaf_size);
        }
        else
        {
          node->l = construct_bvh(start, mid, max_leaf_size);
          node->r = construct_bvh(mid, end, max_leaf_size);
        }

        return node;
      }
      else
      {
        BBox bbox;

        for (auto p = start; p != end; p++)
        {
          BBox bb = (*p)->get_bbox();
          bbox.expand(bb);
        }

        BVHNode *node = new BVHNode(bbox);
        node->start = start;
        node->end = end;

        return node;
      }
    }

    bool BVHAccel::has_intersection(const Ray &ray, BVHNode *node) const
    {
      // Take note that this function has a short-circuit that the
      // Intersection version cannot since it returns as soon as it finds
      // a hit, it doesn't actually have to find the closest hit.

      double t0 = ray.min_t;
      double t1 = ray.max_t;
      if (!node->bb.intersect(ray, t0, t1))
        return false;

      if (node->l == NULL && node->r == NULL)
      {
        for (auto p = node->start; p != node->end; p++)
        {
          total_isects++;
          if ((*p)->has_intersection(ray))
            return true;
        }
        return false;
      }
      return has_intersection(ray, node->l) || has_intersection(ray, node->r);
    }

    bool BVHAccel::intersect(const Ray &ray, Intersection *i, BVHNode *node) const
    {

      bool hit = false;

      double t0 = ray.min_t;
      double t1 = ray.max_t;
      if (!node->bb.intersect(ray, t0, t1))
        return hit;
      
      if (node->l == NULL && node->r == NULL)
      {
        for (auto p = node->start; p != node->end; p++)
        {
          total_isects++;
          hit = (*p)->intersect(ray, i) || hit;
        }
        return hit;
      }
      hit = intersect(ray, i, node->l) || hit;
      hit = intersect(ray, i, node->r) || hit;
      return hit;
    }

Catalogue

Path Tracing: Monte Carlo Integration and Global Illumination


xxxxxxxxxx
Vector3D PathTracer::estimate_direct_lighting_hemisphere(const Ray &r,
                                                             const Intersection &isect)
    {
        // Estimate the lighting from this intersection coming directly from a light.
        // For this function, sample uniformly in a hemisphere.

        // Note: When comparing Cornel Box (CBxxx.dae) results to importance sampling, you may find the "glow" around the light source is gone.
        // This is totally fine: the area lights in importance sampling have directionality, however, in hemisphere sampling, we don't model this behavior.

        // Make a coordinate system for a hit point
        // with N aligned with the Z direction.
        Matrix3x3 o2w;
        make_coord_space(o2w, isect.n);
        Matrix3x3 w2o = o2w.T();

        // w_out points towards the source of the ray (e.g.,
        // toward the camera if this is a primary ray)
        const Vector3D hit_p = r.o + r.d * isect.t;
        const Vector3D w_out = w2o * (-r.d);

        // This is the same number of total samples as
        // estimate_direct_lighting_importance (outside of delta lights). We keep the
        // same number of samples for clarity of comparison.
        int num_samples = scene->lights.size() * ns_area_light;
        Vector3D L_out;

        // Define the direction of the sample light emitted from the light source as w_in
        Vector3D w_in;
        double pdf; // f sampling pdf
        for (int i = 0; i < num_samples; i++)
        {
            Vector3D f = isect.bsdf->sample_f(w_out, &w_in, &pdf); // Get f, w_in, pdf
            Vector3D n = Vector3D(0, 0, 1);                        // Normal vector
            double cos_theta = dot(w_in, n);                       // Angle between two vectors
            // When calculating, w_in is converted to a vector in the local coordinate system at the intersection.
            Ray ray_in = Ray(hit_p, o2w * w_in); // Define the outgoing light
            ray_in.min_t = EPS_F;
            Intersection light_src;
            if (bvh->intersect(ray_in, &light_src))
            {                                                // Check whether it hits
                Vector3D L = light_src.bsdf->get_emission(); // Get the radiance directly
                L_out += f * L * cos_theta / pdf;
            }
        }
        L_out /= num_samples;
        return L_out;
    }


    Vector3D PathTracer::estimate_direct_lighting_importance(const Ray &r,
                                                             const Intersection &isect)
    {
        // Estimate the lighting from this intersection coming directly from a light.
        // To implement importance sampling, sample only from lights, not uniformly in
        // a hemisphere.

        //Make a coordinate system for a hit point
        // with N aligned with the Z direction.
        Matrix3x3 o2w;
        make_coord_space(o2w, isect.n);
        Matrix3x3 w2o = o2w.T();

        // w_out points towards the source of the ray (e.g.,
        // toward the camera if this is a primary ray)
        const Vector3D hit_p = r.o + r.d * isect.t;
        const Vector3D w_out = w2o * (-r.d);
        Vector3D L_out;

        Vector3D w_in;
        double distToLight;
        double pdf;
        for (SceneLight *light : scene->lights)
        {
            int num_samples = 0;
            Vector3D L_unit = Vector3D(0, 0, 0);
            for (int i = 0; i < ns_area_light; i++)
            {
                Vector3D L = light->sample_L(hit_p, &w_in, &distToLight, &pdf); // Get the radiance by sampling
                double cos_theta = dot(w_in, isect.n);
                Ray ray_in = Ray(hit_p, w_in, distToLight - EPS_F);
                ray_in.min_t = EPS_F;
                Intersection object;
                if (!bvh->intersect(ray_in, &object))
                {
                    Vector3D f = isect.bsdf->f(w_out, w_in);
                    L_unit += f * L * cos_theta / pdf;
                }
                num_samples++;
                if (light->is_delta_light())
                    break;
            }
            L_out += L_unit / num_samples;
        }

        return L_out;
    }

Vector3D PathTracer::at_least_one_bounce_radiance(const Ray &r,
                                                      const Intersection &isect)
    {
        Matrix3x3 o2w;
        make_coord_space(o2w, isect.n);
        Matrix3x3 w2o = o2w.T();

        Vector3D hit_p = r.o + r.d * isect.t;
        Vector3D w_out = w2o * (-r.d);

        Vector3D L_out(0, 0, 0);

        // Returns the one bounce radiance + radiance from extra bounces at this point.
        // Should be called recursively to simulate extra bounces.

        // Direct illumination
        L_out = one_bounce_radiance(r, isect);

        // Importance sampling
        Vector3D w_in;
        double pdf;
        Vector3D f = isect.bsdf->sample_f(w_out, &w_in, &pdf);

        // Russian Roulette ray continuation probability
        double cpdf = 0.7;

        // Recursive estimation of indirect illumination
        Intersection i_next;
        Ray r_next = Ray(hit_p, o2w * w_in);
        r_next.depth = r.depth - 1;
        r_next.min_t = EPS_F;
        double costheta = dot(w_in, Vector3D(0, 0, 1));
        // condition: 1. depth limitation 2. Russian roulette condition 3. intersection
        if (coin_flip(cpdf) && bvh->intersect(r_next, &i_next) && r.depth > 1)
        {
            L_out += at_least_one_bounce_radiance(r_next, i_next) * f * costheta / pdf / cpdf;
        }

        return L_out;
    }

Catalogue

Cloth Simulation


xxxxxxxxxx
void Cloth::simulate(double frames_per_sec, double simulation_steps, ClothParameters *cp,
                     vector<Vector3D> external_accelerations,
                     vector<CollisionObject *> *collision_objects)
{
  double mass = width * height * cp->density / num_width_points / num_height_points;
  double delta_t = 1.0f / frames_per_sec / simulation_steps;

  //Compute the total force acting on each point mass.
  Vector3D total_external_force(0, 0, 0);
  for (Vector3D &external_acceleration : external_accelerations)
  {
    total_external_force += external_acceleration * mass;
  }
  for (PointMass &pm : point_masses)
    pm.forces = total_external_force;
  for (Spring &s : springs)
  {
    Vector3D pa = s.pm_a->position;
    Vector3D pb = s.pm_b->position;
    Vector3D a_b = pb - pa;
    double l = s.rest_length;
    double F = cp->ks * (a_b.norm() - l);
    if (s.spring_type == STRUCTURAL && cp->enable_structural_constraints)
    {
      s.pm_a->forces += F * (a_b).unit();
      s.pm_b->forces += F * (-a_b).unit();
    }
    if (s.spring_type == SHEARING && cp->enable_shearing_constraints)
    {
      s.pm_a->forces += F * (a_b).unit();
      s.pm_b->forces += F * (-a_b).unit();
    }
    if (s.spring_type == BENDING && cp->enable_bending_constraints)
    {
      s.pm_a->forces += 0.2 * F * (a_b).unit();
      s.pm_b->forces += 0.2 * F * (-a_b).unit();
    }
  }

  //Use Verlet integration to compute new point mass positions
  for (PointMass &pm : point_masses)
  {
    if (!pm.pinned)
    {
      Vector3D new_position = pm.position + (1 - cp->damping / 100) * (pm.position - pm.last_position) + pm.forces / mass * delta_t * delta_t;
      pm.last_position = pm.position;
      pm.position = new_position;
    }
  }

  //Handle self-collisions.
  build_spatial_map();
  for (PointMass &pm : point_masses)
  {

    self_collide(pm, simulation_steps);
  }

  //Handle collisions with other primitives.
  for (PointMass &pm : point_masses)
    for (CollisionObject *obj : *collision_objects)
      obj->collide(pm);

  //Constrain the changes to be such that the spring does not change
  // in length more than 10% per timestep [Provot 1995].
  for (Spring &s : springs)
  {
    Vector3D pa = s.pm_a->position;
    Vector3D pb = s.pm_b->position;
    Vector3D a_b = pb - pa;
    double l = s.rest_length;
    double L = (a_b.norm() - 1.1 * l);
    if (s.spring_type == STRUCTURAL && cp->enable_structural_constraints)
    {
      if (a_b.norm() > 1.1 * l)
      {
        if (!s.pm_a->pinned && !s.pm_b->pinned)
        {
          s.pm_a->position += (L / 2) * (a_b).unit();
          s.pm_b->position += (L / 2) * (-a_b).unit();
        }
        else if (!s.pm_a->pinned && s.pm_b->pinned)
        {
          s.pm_a->position += (a_b).unit() * L;
        }
        else if (s.pm_a->pinned && !s.pm_b->pinned)
        {
          s.pm_b->position += (-a_b).unit() * L;
        }
      }
    }
    if (s.spring_type == SHEARING && cp->enable_shearing_constraints)
    {
      if (a_b.norm() > 1.1 * l)
      {
        if (!s.pm_a->pinned && !s.pm_b->pinned)
        {
          s.pm_a->position += (L / 2) * (a_b).unit();
          s.pm_b->position += (L / 2) * (-a_b).unit();
        }
        else if (!s.pm_a->pinned && s.pm_b->pinned)
        {
          s.pm_a->position += (a_b).unit() * L;
        }
        else if (s.pm_a->pinned && !s.pm_b->pinned)
        {
          s.pm_b->position += (-a_b).unit() * L;
        }
      }
    }
    if (s.spring_type == BENDING && cp->enable_bending_constraints)
    {
      if (a_b.norm() > 1.1 * l)
      {
        if (!s.pm_a->pinned && !s.pm_b->pinned)
        {
          s.pm_a->position += (L / 2) * (a_b).unit();
          s.pm_b->position += (L / 2) * (-a_b).unit();
        }
        else if (!s.pm_a->pinned && s.pm_b->pinned)
        {
          s.pm_a->position += (a_b).unit() * L;
        }
        else if (s.pm_a->pinned && !s.pm_b->pinned)
        {
          s.pm_b->position += (-a_b).unit() * L;
        }
      }
    }
  }
}

Catalogue

Cloth Simulation: Diffuse Fragment Shader


xxxxxxxxxx
#version 330

// The camera's position in world-space
uniform vec3 u_cam_pos;

// Color
uniform vec4 u_color;

// Properties of the single point light
uniform vec3 u_light_pos;
uniform vec3 u_light_intensity;

// We also get the uniform texture we want to use.
uniform sampler2D u_texture_1;

// These are the inputs which are the outputs of the vertex shader.
in vec4 v_position;
in vec4 v_normal;

// This is where the final pixel color is output.
// Here, we are only interested in the first 3 dimensions (xyz).
// The 4th entry in this vector is for "alpha blending" which we
// do not require you to know about. For now, just set the alpha
// to 1.
out vec4 out_color;

void main() {
    
  vec3 kd = vec3(1, 1, 1);
  vec3 point2light = u_light_pos - v_position.xyz;
  float r2 = length(point2light) * length(point2light);
  vec3 n = normalize(v_normal.xyz);
  vec3 l = normalize(point2light);

  vec3 Ld = kd * (u_light_intensity / r2) * max(0, dot(n, l));
  out_color = vec4(Ld, 1);
  
}

Catalogue

Cloth Simulation: Blinn-Phong Fragment Shader


xxxxxxxxxx
#version 330

uniform vec4 u_color;
uniform vec3 u_cam_pos;
uniform vec3 u_light_pos;
uniform vec3 u_light_intensity;

in vec4 v_position;
in vec4 v_normal;
in vec2 v_uv;

out vec4 out_color;

void main() {
    
  //only ambient
//  vec3 ka = vec3(0.1, 0.1, 0.1);
//  vec3 kd = vec3(0, 0, 0);
//  vec3 ks = vec3(0, 0, 0);
//  vec3 Ia = vec3(1, 1, 1);
//  int p = 100;

  //only diffuse
//  vec3 ka = vec3(0, 0, 0);
//  vec3 kd = u_color.xyz;
//  vec3 ks = vec3(0, 0, 0);
//  vec3 Ia = vec3(1, 1, 1);
//  int p = 100;

  //only specular
//  vec3 ka = vec3(0, 0, 0);
//  vec3 kd = vec3(0, 0, 0);
//  vec3 ks = vec3(0.5, 0.5, 0.5);
//  vec3 Ia = vec3(1, 1, 1);
//  int p = 100;

  //entire
  vec3 ka = vec3(0.1, 0.1, 0.1);
  vec3 kd = u_color.xyz;
  vec3 ks = vec3(0.5, 0.5, 0.5);
  vec3 Ia = vec3(1, 1, 1);
  int p = 100;

  vec3 point2light = u_light_pos - v_position.xyz;
  vec3 point2cam = u_cam_pos - v_position.xyz;
  float r2 = length(point2light) * length(point2light);
  vec3 n = normalize(v_normal.xyz);
  vec3 l = normalize(point2light);
  vec3 v = normalize(point2cam);
  vec3 h = normalize(v + l);
  
  vec3 La = ka * Ia;
  vec3 Ld = kd * (u_light_intensity / r2) * max(0, dot(n, l));
  vec3 Ls = ks * (u_light_intensity / r2) * pow(max(0, dot(n, h)), p);
  out_color = vec4(La + Ld + Ls, 1);
  
}

Catalogue

Cloth Simulation: Texture Mapping Fragment Shader


xxxxxxxxxx
#version 330

uniform vec3 u_cam_pos;
uniform vec3 u_light_pos;
uniform vec3 u_light_intensity;

uniform sampler2D u_texture_5;

in vec4 v_position;
in vec4 v_normal;
in vec2 v_uv;

out vec4 out_color;

void main() {
  
  out_color = texture(u_texture_5, v_uv);
}

Catalogue

Cloth Simulation: Bump Mapping Vertex Shader


xxxxxxxxxx
#version 330

uniform mat4 u_view_projection;
uniform mat4 u_model;

uniform sampler2D u_texture_4;
uniform vec2 u_texture_4_size;

uniform float u_normal_scaling;
uniform float u_height_scaling;

in vec4 in_position;
in vec4 in_normal;
in vec4 in_tangent;
in vec2 in_uv;

out vec4 v_position;
out vec4 v_normal;
out vec2 v_uv;
out vec4 v_tangent;

void main() {
  v_position = u_model * in_position;
  v_normal = normalize(u_model * in_normal);
  v_uv = in_uv;
  v_tangent = normalize(u_model * in_tangent);
  gl_Position = u_view_projection * u_model * in_position;
}

Catalogue

Cloth Simulation: Bump Mapping Fragment Shader


xxxxxxxxxx
#version 330

uniform vec3 u_cam_pos;
uniform vec3 u_light_pos;
uniform vec3 u_light_intensity;

uniform vec4 u_color;

uniform sampler2D u_texture_4;
uniform vec2 u_texture_4_size;

uniform float u_normal_scaling;
uniform float u_height_scaling;

in vec4 v_position;
in vec4 v_normal;
in vec4 v_tangent;
in vec2 v_uv;

out vec4 out_color;

float h(vec2 uv) {
  vec4 height = texture(u_texture_4, uv);
  return height.r;
}

void main() {
    
  vec3 n = normalize(v_normal.xyz);
  vec3 t = normalize(v_tangent.xyz);
  vec3 b = cross(n, t);
  mat3 o2m = mat3(t, b, n);

  vec2 u1v = vec2(v_uv.x + 1.0 / u_texture_4_size.x, v_uv.y);
  vec2 uv1 = vec2(v_uv.x, v_uv.y + 1.0 / u_texture_4_size.y);

  float dU = (h(u1v) - h(v_uv)) * u_height_scaling * u_normal_scaling;
  float dV = (h(uv1) - h(v_uv)) * u_height_scaling * u_normal_scaling;

  vec3 no = vec3(-dU, -dV, 1);
  vec3 nd = normalize(o2m * no);

  vec3 ka = vec3(0.1, 0.1, 0.1);
  vec3 kd = vec3(0.8, 0.8, 0.8);
  vec3 ks = vec3(0.5, 0.5, 0.5);
  vec3 Ia = vec3(1, 1, 1);
  int p = 100;

  vec3 point2light = u_light_pos - v_position.xyz;
  vec3 point2cam = u_cam_pos - v_position.xyz;
  float r2 = length(point2light) * length(point2light);
  vec3 l = normalize(point2light);
  vec3 v = normalize(point2cam);
  vec3 h = normalize(v + l);
  
  vec3 La = ka * Ia;
  vec3 Ld = kd * (u_light_intensity / r2) * max(0, dot(nd, l));
  vec3 Ls = ks * (u_light_intensity / r2) * pow(max(0, dot(nd, h)), p);
  out_color = vec4(La + Ld + Ls, 1);

}

Catalogue

Cloth Simulation: Displacement Mapping Vertex Shader


xxxxxxxxxx
#version 330

uniform mat4 u_view_projection;
uniform mat4 u_model;

uniform sampler2D u_texture_4;
uniform vec2 u_texture_4_size;

uniform float u_normal_scaling;
uniform float u_height_scaling;

in vec4 in_position;
in vec4 in_normal;
in vec4 in_tangent;
in vec2 in_uv;

out vec4 v_position;
out vec4 v_normal;
out vec2 v_uv;
out vec4 v_tangent;

float h(vec2 uv) {
  vec4 height = texture(u_texture_4, uv);
  return height.g;
}

void main() {
  
  v_normal = normalize(u_model * in_normal);
  v_tangent = normalize(u_model * in_tangent);
  v_uv = in_uv;

  vec4 disp = v_normal * h(v_uv) * u_height_scaling;
  v_position = u_model * in_position + disp;

  gl_Position = u_view_projection * v_position;
}

Catalogue

Cloth Simulation: Displacement Mapping Fragment Shader


xxxxxxxxxx
#version 330

uniform vec3 u_cam_pos;
uniform vec3 u_light_pos;
uniform vec3 u_light_intensity;

uniform vec4 u_color;

uniform sampler2D u_texture_4;
uniform vec2 u_texture_4_size;

uniform float u_normal_scaling;
uniform float u_height_scaling;

in vec4 v_position;
in vec4 v_normal;
in vec4 v_tangent;
in vec2 v_uv;

out vec4 out_color;

float h(vec2 uv) {
  vec4 height = texture(u_texture_4, uv);
  return height.r;
}

void main() {
  
  vec3 n = normalize(v_normal.xyz);
  vec3 t = normalize(v_tangent.xyz);
  vec3 b = cross(n, t);
  mat3 o2m = mat3(t, b, n);

  vec2 u1v = vec2(v_uv.x + 1.0 / u_texture_4_size.x, v_uv.y);
  vec2 uv1 = vec2(v_uv.x, v_uv.y + 1.0 / u_texture_4_size.y);

  float dU = (h(u1v) - h(v_uv)) * u_height_scaling * u_normal_scaling;
  float dV = (h(uv1) - h(v_uv)) * u_height_scaling * u_normal_scaling;

  vec3 no = vec3(-dU, -dV, 1);
  vec3 nd = normalize(o2m * no);

  vec3 ka = vec3(0.1, 0.1, 0.1);
  vec3 kd = vec3(0.8, 0.8, 0.8);
  vec3 ks = vec3(0.5, 0.5, 0.5);
  vec3 Ia = vec3(1, 1, 1);
  int p = 100;

  vec3 point2light = u_light_pos - v_position.xyz;
  vec3 point2cam = u_cam_pos - v_position.xyz;
  float r2 = length(point2light) * length(point2light);
  vec3 l = normalize(point2light);
  vec3 v = normalize(point2cam);
  vec3 h = normalize(v + l);
  
  vec3 La = ka * Ia;
  vec3 Ld = kd * (u_light_intensity / r2) * max(0, dot(nd, l));
  vec3 Ls = ks * (u_light_intensity / r2) * pow(max(0, dot(nd, h)), p);
  out_color = vec4(La + Ld + Ls, 1);
}

Catalogue

Cloth Simulation: Mirror Fragment Shader


xxxxxxxxxx
#version 330


uniform vec3 u_cam_pos;

uniform samplerCube u_texture_cubemap;

in vec4 v_position;
in vec4 v_normal;
in vec4 v_tangent;

out vec4 out_color;

void main() {
  
    vec3 wo = normalize(u_cam_pos - v_position.xyz);
    vec3 n = normalize(v_normal.xyz);
    vec3 wi = normalize(2 * dot(wo, n) * n - wo);
    
    out_color = texture(u_texture_cubemap, wi);

}

Catalogue

Image Filtering Acceleration by Pthread


xxxxxxxxxx
//
// Pthread implementation of filtering a JPEG image
//

#include <iostream>
#include <chrono>
#include <pthread.h>
#include "utils.hpp"
#include <cstring>
#include <cmath>

const int FILTER_SIZE = 3;
const int HALF_SIZE = (FILTER_SIZE - 1) / 2;

// Structure to pass data to each thread
struct ThreadData
{
    unsigned char* input_red;
    unsigned char* input_green;
    unsigned char* input_blue;
    unsigned char* output_buffer;
    int start;
    int end;
    int origin_width;
    int padded_width;
};

void* filterImage(void* arg)
{
    ThreadData* data = reinterpret_cast<ThreadData*>(arg);

    for (int i = data->start; i < data->end; i++)
    {
        unsigned char sum_r = 0, sum_g = 0, sum_b = 0;
        int start = (i / data->origin_width + 1) * data->padded_width +
                    (i % data->origin_width + 1);
        int width = data->padded_width;
        int indices[9] = {start - 1 * width - 1,
                          start - 1 * width,
                          start - 1 * width + 1,
                          start - 1,
                          start,
                          start + 1,
                          start + 1 * width - 1,
                          start + 1 * width,
                          start + 1 * width + 1};

        sum_r += data->input_red[indices[0]] / 9;
        sum_g += data->input_green[indices[0]] / 9;
        sum_b += data->input_blue[indices[0]] / 9;

        sum_r += data->input_red[indices[1]] / 9;
        sum_g += data->input_green[indices[1]] / 9;
        sum_b += data->input_blue[indices[1]] / 9;

        sum_r += data->input_red[indices[2]] / 9;
        sum_g += data->input_green[indices[2]] / 9;
        sum_b += data->input_blue[indices[2]] / 9;

        sum_r += data->input_red[indices[3]] / 9;
        sum_g += data->input_green[indices[3]] / 9;
        sum_b += data->input_blue[indices[3]] / 9;

        sum_r += data->input_red[indices[4]] / 9;
        sum_g += data->input_green[indices[4]] / 9;
        sum_b += data->input_blue[indices[4]] / 9;

        sum_r += data->input_red[indices[5]] / 9;
        sum_g += data->input_green[indices[5]] / 9;
        sum_b += data->input_blue[indices[5]] / 9;

        sum_r += data->input_red[indices[6]] / 9;
        sum_g += data->input_green[indices[6]] / 9;
        sum_b += data->input_blue[indices[6]] / 9;

        sum_r += data->input_red[indices[7]] / 9;
        sum_g += data->input_green[indices[7]] / 9;
        sum_b += data->input_blue[indices[7]] / 9;

        sum_r += data->input_red[indices[8]] / 9;
        sum_g += data->input_green[indices[8]] / 9;
        sum_b += data->input_blue[indices[8]] / 9;

        data->output_buffer[i * 3] = sum_r;
        data->output_buffer[i * 3 + 1] = sum_g;
        data->output_buffer[i * 3 + 2] = sum_b;
    }

    return nullptr;
}

int main(int argc, char** argv)
{
    // Verify input argument format
    if (argc != 4)
    {
        std::cerr << "Invalid argument, should be: ./executable "
                     "/path/to/input/jpeg /path/to/output/jpeg num_threads\n";
        return -1;
    }

    int num_threads = std::stoi(argv[3]); // User-specified thread count

    // Read from input JPEG
    const char* input_filepath = argv[1];
    std::cout << "Input file from: " << input_filepath << "\n";
    auto input_jpeg = read_from_jpeg(input_filepath);

    auto filteredImage =
        new unsigned char[input_jpeg.width * input_jpeg.height *
                          input_jpeg.num_channels];

    for (int i = 0;
         i < input_jpeg.width * input_jpeg.height * input_jpeg.num_channels;
         ++i)
        filteredImage[i] = 0;

    int origin_width = input_jpeg.width;
    int origin_height = input_jpeg.height;
    int padded_width = input_jpeg.width + HALF_SIZE * 2;
    int padded_height = input_jpeg.height + HALF_SIZE * 2;

    unsigned char* padded_buffer =
        new unsigned char[padded_width * padded_height *
                          input_jpeg.num_channels];

    memset(padded_buffer, 0,
           padded_width * padded_height * input_jpeg.num_channels);

    for (int y = 0; y < input_jpeg.height; y++)
    {
        for (int x = 0; x < input_jpeg.width; x++)
        {
            for (int c = 0; c < input_jpeg.num_channels; c++)
            {
                padded_buffer[(y + HALF_SIZE) * padded_width *
                                  input_jpeg.num_channels +
                              (x + HALF_SIZE) * input_jpeg.num_channels + c] =
                    input_jpeg
                        .buffer[y * input_jpeg.width * input_jpeg.num_channels +
                                x * input_jpeg.num_channels + c];
            }
        }
    }

    delete[] input_jpeg.buffer;

    input_jpeg.width = padded_width;
    input_jpeg.height = padded_height;
    input_jpeg.buffer = padded_buffer;

    // Prepross, store reds, greens and blues separately
    auto reds = new unsigned char[input_jpeg.width * input_jpeg.height];
    auto greens = new unsigned char[input_jpeg.width * input_jpeg.height];
    auto blues = new unsigned char[input_jpeg.width * input_jpeg.height];

    for (int i = 0; i < input_jpeg.width * input_jpeg.height; i++)
    {
        reds[i] = input_jpeg.buffer[i * input_jpeg.num_channels];
        greens[i] = input_jpeg.buffer[i * input_jpeg.num_channels + 1];
        blues[i] = input_jpeg.buffer[i * input_jpeg.num_channels + 2];
    }

    pthread_t threads[num_threads];
    ThreadData thread_data[num_threads];

    auto start_time = std::chrono::high_resolution_clock::now();

    int chunk_size = origin_width * origin_height / num_threads;
    for (int i = 0; i < num_threads; i++)
    {
        thread_data[i].input_red = reds;
        thread_data[i].input_green = greens;
        thread_data[i].input_blue = blues;
        thread_data[i].output_buffer = filteredImage;
        thread_data[i].start = i * chunk_size;
        thread_data[i].end = (i == num_threads - 1)
                                 ? origin_width * origin_height
                                 : (i + 1) * chunk_size;
        thread_data[i].origin_width = origin_width;
        thread_data[i].padded_width = padded_width;

        pthread_create(&threads[i], nullptr, filterImage, &thread_data[i]);
    }

    // Wait for all threads to finish
    for (int i = 0; i < num_threads; i++)
    {
        pthread_join(threads[i], nullptr);
    }

    auto end_time = std::chrono::high_resolution_clock::now();
    auto elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(
        end_time - start_time);

    // Write filteredImage to output JPEG
    const char* output_filepath = argv[2];
    std::cout << "Output file to: " << output_filepath << "\n";
    JPEGMeta output_jpeg{filteredImage, origin_width, origin_height, 3,
                         input_jpeg.color_space};
    if (write_to_jpeg(output_jpeg, output_filepath))
    {
        std::cerr << "Failed to write output JPEG\n";
        return -1;
    }

    // Release allocated memory
    delete[] input_jpeg.buffer;
    delete[] filteredImage;
    delete[] reds;
    delete[] greens;
    delete[] blues;

    std::cout << "Transformation Complete!" << std::endl;
    std::cout << "Execution Time: " << elapsed_time.count()
              << " milliseconds\n";

    return 0;
}

Catalogue

Image Filtering Acceleration by CUDA


xxxxxxxxxx
//
// CUDA implementation of filtering a JPEG image
//

#include <iostream>

#include <cuda_runtime.h> // CUDA Header

#include "utils.hpp"

#include <cstring>

const int FILTER_SIZE = 3;
const int HALF_SIZE = (FILTER_SIZE - 1) / 2;

// CUDA kernel functon
__global__ void filterImage(const unsigned char* input, unsigned char* output,
                            int width, int height, int num_channels)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int origin_width = width - 2;
    int origin_height = height - 2;
    int padded_width = width;
    if (idx < origin_width * origin_height)
    {
        unsigned char sum_r = 0, sum_g = 0, sum_b = 0;
        int start =
            (idx / origin_width + 1) * padded_width + (idx % origin_width + 1);
        int indices[9] = {start - 1 * width - 1,
                          start - 1 * width,
                          start - 1 * width + 1,
                          start - 1,
                          start,
                          start + 1,
                          start + 1 * width - 1,
                          start + 1 * width,
                          start + 1 * width + 1};

        sum_r += input[indices[0] * num_channels] * 1 / 9;
        sum_g += input[indices[0] * num_channels + 1] * 1 / 9;
        sum_b += input[indices[0] * num_channels + 2] * 1 / 9;

        sum_r += input[indices[1] * num_channels] * 1 / 9;
        sum_g += input[indices[1] * num_channels + 1] * 1 / 9;
        sum_b += input[indices[1] * num_channels + 2] * 1 / 9;

        sum_r += input[indices[2] * num_channels] * 1 / 9;
        sum_g += input[indices[2] * num_channels + 1] * 1 / 9;
        sum_b += input[indices[2] * num_channels + 2] * 1 / 9;

        sum_r += input[indices[3] * num_channels] * 1 / 9;
        sum_g += input[indices[3] * num_channels + 1] * 1 / 9;
        sum_b += input[indices[3] * num_channels + 2] * 1 / 9;

        sum_r += input[indices[4] * num_channels] * 1 / 9;
        sum_g += input[indices[4] * num_channels + 1] * 1 / 9;
        sum_b += input[indices[4] * num_channels + 2] * 1 / 9;

        sum_r += input[indices[5] * num_channels] * 1 / 9;
        sum_g += input[indices[5] * num_channels + 1] * 1 / 9;
        sum_b += input[indices[5] * num_channels + 2] * 1 / 9;

        sum_r += input[indices[6] * num_channels] * 1 / 9;
        sum_g += input[indices[6] * num_channels + 1] * 1 / 9;
        sum_b += input[indices[6] * num_channels + 2] * 1 / 9;

        sum_r += input[indices[7] * num_channels] * 1 / 9;
        sum_g += input[indices[7] * num_channels + 1] * 1 / 9;
        sum_b += input[indices[7] * num_channels + 2] * 1 / 9;

        sum_r += input[indices[8] * num_channels] * 1 / 9;
        sum_g += input[indices[8] * num_channels + 1] * 1 / 9;
        sum_b += input[indices[8] * num_channels + 2] * 1 / 9;

        int output_index = idx * num_channels;
        output[output_index] = sum_r;
        output[output_index + 1] = sum_g;
        output[output_index + 2] = sum_b;
    }
}

int main(int argc, char** argv)
{
    // Verify input argument format
    if (argc != 3)
    {
        std::cerr << "Invalid argument, should be: ./executable "
                     "/path/to/input/jpeg /path/to/output/jpeg\n";
        return -1;
    }
    // Read from input JPEG
    const char* input_filepath = argv[1];
    std::cout << "Input file from: " << input_filepath << "\n";
    auto input_jpeg = read_from_jpeg(input_filepath);

    auto filteredImage =
        new unsigned char[input_jpeg.width * input_jpeg.height *
                          input_jpeg.num_channels];

    for (int i = 0;
         i < input_jpeg.width * input_jpeg.height * input_jpeg.num_channels;
         ++i)
        filteredImage[i] = 0;

    int origin_width = input_jpeg.width;
    int origin_height = input_jpeg.height;
    int padded_width = input_jpeg.width + HALF_SIZE * 2;
    int padded_height = input_jpeg.height + HALF_SIZE * 2;

    unsigned char* padded_buffer =
        new unsigned char[padded_width * padded_height *
                          input_jpeg.num_channels];

    memset(padded_buffer, 0,
           padded_width * padded_height * input_jpeg.num_channels);

    for (int y = 0; y < input_jpeg.height; y++)
    {
        for (int x = 0; x < input_jpeg.width; x++)
        {
            for (int c = 0; c < input_jpeg.num_channels; c++)
            {
                padded_buffer[(y + HALF_SIZE) * padded_width *
                                  input_jpeg.num_channels +
                              (x + HALF_SIZE) * input_jpeg.num_channels + c] =
                    input_jpeg
                        .buffer[y * input_jpeg.width * input_jpeg.num_channels +
                                x * input_jpeg.num_channels + c];
            }
        }
    }

    delete[] input_jpeg.buffer;

    input_jpeg.width = padded_width;
    input_jpeg.height = padded_height;
    input_jpeg.buffer = padded_buffer;

    // Allocate memory on host (CPU)

    // Allocate memory on device (GPU)
    unsigned char* d_input;
    unsigned char* d_output;
    cudaMalloc((void**)&d_input, input_jpeg.width * input_jpeg.height *
                                     input_jpeg.num_channels *
                                     sizeof(unsigned char));
    cudaMalloc((void**)&d_output, origin_width * origin_height *
                                      input_jpeg.num_channels *
                                      sizeof(unsigned char));
    // Copy input data from host to device
    cudaMemcpy(d_input, input_jpeg.buffer,
               input_jpeg.width * input_jpeg.height * input_jpeg.num_channels *
                   sizeof(unsigned char),
               cudaMemcpyHostToDevice);

    cudaEvent_t start, stop;
    float gpuDuration;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int blockSize = 512;
    int numBlocks = (origin_width * origin_height) / blockSize + 1;
    cudaEventRecord(start, 0); // GPU start time
    filterImage<<<numBlocks, blockSize>>>(d_input, d_output, input_jpeg.width,
                                          input_jpeg.height,
                                          input_jpeg.num_channels);
    cudaEventRecord(stop, 0); // GPU end time
    cudaEventSynchronize(stop);
    // Print the result of the GPU computation
    cudaEventElapsedTime(&gpuDuration, start, stop);
    // Copy output data from device to host
    cudaMemcpy(filteredImage, d_output,
               origin_width * origin_height * input_jpeg.num_channels *
                   sizeof(unsigned char),
               cudaMemcpyDeviceToHost);

    const char* output_filepath = argv[2];
    std::cout << "Output file to: " << output_filepath << "\n";
    JPEGMeta output_jpeg{filteredImage, origin_width, origin_height, 3,
                         JCS_RGB};
    if (write_to_jpeg(output_jpeg, output_filepath))
    {
        std::cerr << "Failed to write output JPEG\n";
        return -1;
    }
    // Release allocated memory on device and host
    cudaFree(d_input);
    cudaFree(d_output);
    delete[] input_jpeg.buffer;
    delete[] filteredImage;

    std::cout << "Transformation Complete!" << std::endl;
    std::cout << "GPU Execution Time: " << gpuDuration << " milliseconds"
              << std::endl;
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}

Catalogue

Image Filtering Acceleration by OpenACC


xxxxxxxxxx
//
// OpenACC implementation of filtering a JPEG image
//

#include <iostream>
#include <chrono>

#include "utils.hpp"
// #include <openacc.h> // OpenACC Header
#include <cstring>

const int FILTER_SIZE = 3;
const int HALF_SIZE = (FILTER_SIZE - 1) / 2;

int main(int argc, char **argv)
{
    // Verify input argument format
    if (argc != 3)
    {
        std::cerr << "Invalid argument, should be: ./executable "
                     "/path/to/input/jpeg /path/to/output/jpeg\n";
        return -1;
    }
    // Read from input JPEG
    const char *input_filepath = argv[1];
    std::cout << "Input file from: " << input_filepath << "\n";
    JPEGMeta input_jpeg = read_from_jpeg(input_filepath);

    auto filteredImage =
        new unsigned char[input_jpeg.width * input_jpeg.height *
                          input_jpeg.num_channels];

    for (int i = 0;
         i < input_jpeg.width * input_jpeg.height * input_jpeg.num_channels;
         ++i)
        filteredImage[i] = 0;

    int origin_width = input_jpeg.width;
    int origin_height = input_jpeg.height;
    int padded_width = input_jpeg.width + HALF_SIZE * 2;
    int padded_height = input_jpeg.height + HALF_SIZE * 2;

    unsigned char *padded_buffer =
        new unsigned char[padded_width * padded_height *
                          input_jpeg.num_channels];

    memset(padded_buffer, 0,
           padded_width * padded_height * input_jpeg.num_channels);

    for (int y = 0; y < input_jpeg.height; y++)
    {
        for (int x = 0; x < input_jpeg.width; x++)
        {
            for (int c = 0; c < input_jpeg.num_channels; c++)
            {
                padded_buffer[(y + HALF_SIZE) * padded_width *
                                  input_jpeg.num_channels +
                              (x + HALF_SIZE) * input_jpeg.num_channels + c] =
                    input_jpeg
                        .buffer[y * input_jpeg.width * input_jpeg.num_channels +
                                x * input_jpeg.num_channels + c];
            }
        }
    }

    delete[] input_jpeg.buffer;

    input_jpeg.width = padded_width;
    input_jpeg.height = padded_height;
    input_jpeg.buffer = padded_buffer;

    int width = input_jpeg.width;
    int height = input_jpeg.height;
    int num_channels = input_jpeg.num_channels;

    unsigned char *buffer = new unsigned char[width * height * num_channels];
    for (int i = 0; i < width * height * num_channels; i++)
    {
        buffer[i] = input_jpeg.buffer[i];
    }
#pragma acc enter data copyin(                                      \
    filteredImage[0 : origin_width * origin_height * num_channels], \
    buffer[0 : width * height * num_channels])

#pragma acc update device(                                          \
    filteredImage[0 : origin_width * origin_height * num_channels], \
    buffer[0 : width * height * num_channels])

    auto start_time = std::chrono::high_resolution_clock::now();
#pragma acc parallel present(                                       \
    filteredImage[0 : origin_width * origin_height * num_channels], \
    buffer[0 : width * height * num_channels]) num_gangs(1024)
    {
#pragma acc loop independent
        for (int i = 0; i < origin_width * origin_height; i++)
        {
            unsigned char sum_r = 0, sum_g = 0, sum_b = 0;
            int start =
                (i / origin_width + 1) * padded_width + (i % origin_width + 1);
            int width = padded_width;

            int indices[9] = {start - 1 * width - 1,
                              start - 1 * width,
                              start - 1 * width + 1,
                              start - 1,
                              start,
                              start + 1,
                              start + 1 * width - 1,
                              start + 1 * width,
                              start + 1 * width + 1};

            sum_r += buffer[indices[0] * num_channels] / 9;
            sum_g += buffer[indices[0] * num_channels + 1] / 9;
            sum_b += buffer[indices[0] * num_channels + 2] / 9;

            sum_r += buffer[indices[1] * num_channels] / 9;
            sum_g += buffer[indices[1] * num_channels + 1] / 9;
            sum_b += buffer[indices[1] * num_channels + 2] / 9;

            sum_r += buffer[indices[2] * num_channels] / 9;
            sum_g += buffer[indices[2] * num_channels + 1] / 9;
            sum_b += buffer[indices[2] * num_channels + 2] / 9;

            sum_r += buffer[indices[3] * num_channels] / 9;
            sum_g += buffer[indices[3] * num_channels + 1] / 9;
            sum_b += buffer[indices[3] * num_channels + 2] / 9;

            sum_r += buffer[indices[4] * num_channels] / 9;
            sum_g += buffer[indices[4] * num_channels + 1] / 9;
            sum_b += buffer[indices[4] * num_channels + 2] / 9;

            sum_r += buffer[indices[5] * num_channels] / 9;
            sum_g += buffer[indices[5] * num_channels + 1] / 9;
            sum_b += buffer[indices[5] * num_channels + 2] / 9;

            sum_r += buffer[indices[6] * num_channels] / 9;
            sum_g += buffer[indices[6] * num_channels + 1] / 9;
            sum_b += buffer[indices[6] * num_channels + 2] / 9;

            sum_r += buffer[indices[7] * num_channels] / 9;
            sum_g += buffer[indices[7] * num_channels + 1] / 9;
            sum_b += buffer[indices[7] * num_channels + 2] / 9;

            sum_r += buffer[indices[8] * num_channels] / 9;
            sum_g += buffer[indices[8] * num_channels + 1] / 9;
            sum_b += buffer[indices[8] * num_channels + 2] / 9;

            filteredImage[i * 3] = sum_r;
            filteredImage[i * 3 + 1] = sum_g;
            filteredImage[i * 3 + 2] = sum_b;
        }
    }
    auto end_time = std::chrono::high_resolution_clock::now();
#pragma acc update self(                                            \
    filteredImage[0 : origin_width * origin_height * num_channels], \
    buffer[0 : width * height * num_channels])

#pragma acc exit data copyout( \
    filteredImage[0 : origin_width * origin_height * num_channels])

    auto elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(
        end_time - start_time);

    const char *output_filepath = argv[2];
    std::cout << "Output file to: " << output_filepath << "\n";
    JPEGMeta output_jpeg{filteredImage, origin_width, origin_height, 3,
                         input_jpeg.color_space};
    if (write_to_jpeg(output_jpeg, output_filepath))
    {
        std::cerr << "Failed to write output JPEG\n";
        return -1;
    }
    // Release allocated memory
    delete[] input_jpeg.buffer;
    delete[] buffer;
    delete[] filteredImage;

    std::cout << "Transformation Complete!" << std::endl;
    std::cout << "Execution Time: " << elapsed_time.count()
              << " milliseconds\n";
    return 0;
}

Catalogue

Matrix Multiplication Acceleration by Memory Locality, SIMD, OpenMP and MPI


xxxxxxxxxx
//
// MPI + OpenMp + SIMD + Reordering Matrix Multiplication
//

#include <mpi.h> // MPI Header
#include <omp.h>
#include <immintrin.h>
#include <stdexcept>
#include <chrono>
#include "matrix.hpp"

#define MASTER 0
#define TAG_GATHER 0

Matrix matrix_multiply_mpi(const Matrix& matrix1, const Matrix& matrix2,
                           const size_t start_row, const size_t end_row)
{
    if (matrix1.getCols() != matrix2.getRows())
    {
        throw std::invalid_argument(
            "Matrix dimensions are not compatible for multiplication.");
    }

    size_t M = matrix1.getRows(), K = matrix1.getCols(), N = matrix2.getCols();

    Matrix result(end_row - start_row, N);

#pragma omp parallel for
    for (size_t i = start_row; i < end_row; i++)
    {
        int* result_i = result[i - start_row];
        const int* matrix1_i = matrix1[i];

        __m256i result_vecs[(N + 7) / 8];

        for (size_t j = 0; j < (N + 7) / 8; j++)
        {
            result_vecs[j] = _mm256_setzero_si256();
        }

        for (size_t k = 0; k < K; k++)
        {
            int matrix1_ik = matrix1_i[k];
            const int* matrix2_k = matrix2[k];

            for (size_t j = 0; j < N; j += 8)
            {
                __m256i m1_vec = _mm256_set1_epi32(matrix1_ik);
                __m256i m2_vec = _mm256_load_si256((__m256i*)&matrix2_k[j]);

                __m256i prod_vec = _mm256_mullo_epi32(m1_vec, m2_vec);

                result_vecs[j / 8] =
                    _mm256_add_epi32(result_vecs[j / 8], prod_vec);
            }
        }

        for (size_t j = 0; j < N; j += 8)
        {
            _mm256_store_si256((__m256i*)&result_i[j], result_vecs[j / 8]);
        }
    }

    return result;
}

int main(int argc, char** argv)
{
    // Verify input argument format
    if (argc != 5)
    {
        throw std::invalid_argument(
            "Invalid argument, should be: ./executable thread_num "
            "/path/to/matrix1 /path/to/matrix2 /path/to/multiply_result\n");
    }

    // Start the MPI
    MPI_Init(&argc, &argv);
    // How many processes are running
    int numtasks;
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    // What's my rank?
    int taskid;
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    // Which node am I running on?
    int len;
    char hostname[MPI_MAX_PROCESSOR_NAME];
    MPI_Get_processor_name(hostname, &len);
    MPI_Status status;

    int thread_num = atoi(argv[1]);
    omp_set_num_threads(thread_num);

    // Read Matrix
    const std::string matrix1_path = argv[2];

    const std::string matrix2_path = argv[3];

    const std::string result_path = argv[4];

    Matrix matrix1 = Matrix::loadFromFile(matrix1_path);

    Matrix matrix2 = Matrix::loadFromFile(matrix2_path);

    size_t M = matrix1.getRows();
    size_t N = matrix2.getCols();

    int row_num_per_task = M / numtasks;
    int left_row_num = M % numtasks;
    std::vector<int> rows(numtasks + 1, 0);
    int divided_left_row_num = 0;

    for (int i = 0; i < numtasks; i++)
    {
        if (divided_left_row_num < left_row_num)
        {
            rows[i + 1] = rows[i] + row_num_per_task + 1;
            divided_left_row_num++;
        }
        else
            rows[i + 1] = rows[i] + row_num_per_task;
    }

    auto start_time = std::chrono::high_resolution_clock::now();
    if (taskid == MASTER)
    {
        Matrix result = matrix_multiply_mpi(matrix1, matrix2, rows[MASTER],
                                            rows[MASTER + 1]);

        Matrix final_result(M, N);

        // Your Code Here for Synchronization!

        for (int i = rows[MASTER]; i < rows[MASTER + 1]; i++)
        {
            for (int j = 0; j < N; j++)
            {
                final_result[i][j] = result[i][j];
            }
        }

        for (int i = MASTER + 1; i < numtasks; i++)
        {
            for (size_t j = rows[i]; j < rows[i + 1]; j++)
            {
                int* start_pos = final_result[j];

                MPI_Recv(start_pos, N, MPI_INT, i, TAG_GATHER, MPI_COMM_WORLD,
                         &status);
            }
        }

        auto end_time = std::chrono::high_resolution_clock::now();
        auto elapsed_time =
            std::chrono::duration_cast<std::chrono::milliseconds>(end_time -
                                                                  start_time);

        final_result.saveToFile(result_path);

        std::cout << "Output file to: " << result_path << std::endl;

        std::cout << "Multiplication Complete!" << std::endl;
        std::cout << "Execution Time: " << elapsed_time.count()
                  << " milliseconds" << std::endl;
    }
    else
    {
        Matrix result = matrix_multiply_mpi(matrix1, matrix2, rows[taskid],
                                            rows[taskid + 1]);

        // Your Code Here for Synchronization!
        for (size_t i = rows[taskid]; i < rows[taskid + 1]; i++)
        {
            MPI_Send(result[i - rows[taskid]], N, MPI_INT, MASTER, TAG_GATHER,
                     MPI_COMM_WORLD);
        }
    }

    MPI_Finalize();
    return 0;
}

Catalogue