Path Tracing: Monte Carlo Integration and Global Illumination
Matrix Multiplication Acceleration by Memory Locality, SIMD, OpenMP and MPI
xvoid RasterizerImp::rasterize_textured_triangle(float x0, float y0, float u0, float v0, float x1, float y1, float u1, float v1, float x2, float y2, float u2, float v2, Texture &tex) { // Ensure that these points are arranged in counterclockwise order. If not, exchange them if ((x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) < 0) { swap(x1, x2); swap(y1, y2); swap(u1, u2); swap(v1, v2); }
// Get the boundary and use ceil() / floor() to improve the accuracy
// Find the boundary of the triangle int xmax = ceil(std::max({x0, x1, x2})); int xmin = floor(std::min({x0, x1, x2})); int ymax = ceil(std::max({y0, y1, y2})); int ymin = floor(std::min({y0, y1, y2}));
int scale = (int)sqrt(sample_rate); // Initialize the SampleParams SampleParams sp; sp.lsm = lsm; sp.psm = psm; Color c;
// Initialize the transition matrix Matrix3x3 M(x0, x1, x2, y0, y1, y2, 1, 1, 1); Matrix3x3 inv_M = M.inv();
// Convert the point into Vector form Vector2D X0 = Vector2D(x0, y0); Vector2D X1 = Vector2D(x1, y1); Vector2D X2 = Vector2D(x2, y2); // Triangle line Vector2D l0 = X1 - X0; Vector2D l1 = X2 - X1; Vector2D l2 = X0 - X2; // Normal vector Vector2D N0 = Vector2D(-l0.y, l0.x); Vector2D N1 = Vector2D(-l1.y, l1.x); Vector2D N2 = Vector2D(-l2.y, l2.x);
for (int y = ymin; y < ymax; y++) { for (int x = xmin; x < xmax; x++) { for (int j = 0; j < scale; j++) { for (int i = 0; i < scale; i++) { float px = (float)x + ((float)i + 0.5) / (float)scale; float py = (float)y + ((float)j + 0.5) / (float)scale; Vector2D p0 = Vector2D(px, py); Vector2D V1 = p0 - X0; Vector2D V2 = p0 - X1; Vector2D V3 = p0 - X2; // Use L = V * N to detect whether the point is inside the triangle if ((dot(V1, N0) >= 0) && (dot(V2, N1) >= 0) && (dot(V3, N2) >= 0)) { Vector3D p = Vector3D(px, py, 1); // Use M^-1 * (x, y, 1) to get the barycentric coordinate Vector3D weight_p = inv_M * p; weight_p.z = 1 - weight_p.x - weight_p.y; // Use the weight to calculate the uv coordinate of the sampled point sp.p_uv = Vector2D(dot(weight_p, Vector3D(u0, u1, u2)), dot(weight_p, Vector3D(v0, v1, v2)));
Vector3D weight_p_dx = inv_M * Vector3D(px + 1., py, 1); weight_p_dx.z = 1 - weight_p_dx.x - weight_p_dx.y; // Since dx=1, then we just need to use subtraction to get d(u,v)/dx sp.p_dx_uv = Vector2D(dot(weight_p_dx, Vector3D(u0, u1, u2)), dot(weight_p_dx, Vector3D(v0, v1, v2))) - sp.p_uv;
Vector3D weight_p_dy = inv_M * Vector3D(px, py + 1., 1); weight_p_dy.z = 1 - weight_p_dy.x - weight_p_dy.y; // Since dy=1, then we just need to use subtraction to get d(u,v)/dy sp.p_dy_uv = Vector2D(dot(weight_p_dy, Vector3D(u0, u1, u2)), dot(weight_p_dy, Vector3D(v0, v1, v2))) - sp.p_uv;
// Check the texture image and get the color c = tex.sample(sp); super_sample_fill_pixel(x, y, i + j * scale, c); } } } } } }
float Texture::get_level(const SampleParams &sp) { // Scale the derivatives according to the texture size Vector2D p_tx_uv = Vector2D(sp.p_dx_uv.x * (width - 1), sp.p_dx_uv.y * (height - 1)); Vector2D p_ty_uv = Vector2D(sp.p_dy_uv.x * (width - 1), sp.p_dy_uv.y * (height - 1)); // Get the bigger norm float L = std::max(p_tx_uv.norm(), p_ty_uv.norm()); float D = log2(L); return D; }
xxxxxxxxxxvoid MeshResampler::upsample(HalfedgeMesh &mesh) { // 1. Compute new positions for all the old vertices for (VertexIter v = mesh.verticesBegin(); v != mesh.verticesEnd(); v++) { v->isNew = false; // set it to be false HalfedgeIter h = v->halfedge(); Vector3D original_position = v->position; Vector3D original_neighbor_position_sum(0, 0, 0); int n = 0; float u;
// Find all the neighbors of the vertex do { n++; original_neighbor_position_sum += h->twin()->vertex()->position; h = h->twin()->next(); } while (h != v->halfedge());
// Use the formula to compute the final result if (n == 3) { u = 3.0 / 16.0; } else { u = 3.0 / (8.0 * n); }
v->newPosition = (1 - n * u) * original_position + u * original_neighbor_position_sum; }
// 2. Compute the updated vertex positions associated with edges for (EdgeIter e = mesh.edgesBegin(); e != mesh.edgesEnd(); e++) { // Set isNew to false for all old edges e->isNew = false;
/* C /|\ A | D \|/ B */
HalfedgeIter h0 = e->halfedge(); HalfedgeIter h1 = h0->twin(); HalfedgeIter CA = h0->next(); HalfedgeIter AB = CA->next(); HalfedgeIter BD = h1->next(); HalfedgeIter DC = BD->next();
// Find the position of the four vertices Vector3D pB = h0->vertex()->position; Vector3D pC = CA->vertex()->position; Vector3D pA = AB->vertex()->position; Vector3D pD = DC->vertex()->position;
// Use the formula to compute the final result e->newPosition = 3.0 / 8.0 * (pB + pC) + 1.0 / 8.0 * (pA + pD); }
// 3. Split every edge in the mesh for (EdgeIter e = mesh.edgesBegin(); e != mesh.edgesEnd(); e++) { /* C /|\ A | D \|/ B */ HalfedgeIter h0 = e->halfedge(); HalfedgeIter h1 = h0->twin(); VertexIter B = h0->vertex(); VertexIter C = h1->vertex();
// Check whether both vertices are new if (B->isNew == false && C->isNew == false) { VertexIter new_v = mesh.splitEdge(e); new_v->newPosition = e->newPosition; } }
// 4. Flip any new edge that connects an old and new vertex for (EdgeIter e = mesh.edgesBegin(); e != mesh.edgesEnd(); e++) { /* C /|\ A | D \|/ B */ HalfedgeIter h0 = e->halfedge(); HalfedgeIter h1 = h0->twin(); VertexIter B = h0->vertex(); VertexIter C = h1->vertex();
// Check whether one vertex is new, the other is old if ((B->isNew ^ C->isNew) && e->isNew == true) { mesh.flipEdge(e); } }
// 5. Copy the new vertex positions into the final Vertex::position for (VertexIter v = mesh.verticesBegin(); v != mesh.verticesEnd(); v++) { v->position = v->newPosition; } }
xxxxxxxxxxBVHNode *BVHAccel::construct_bvh(std::vector<Primitive *>::iterator start, std::vector<Primitive *>::iterator end, size_t max_leaf_size) {
// Construct a BVH from the given vector of primitives and maximum leaf // size configuration. The starter code builds a BVH aggregate with a // single leaf node (which is also the root) that encloses all the // primitives.
int count = end - start;
if (count > max_leaf_size) { Vector3D avg_centrd = Vector3D(0, 0, 0);
BBox bbox;
for (auto p = start; p != end; p++) { BBox bb = (*p)->get_bbox(); bbox.expand(bb); avg_centrd += bb.centroid(); }
avg_centrd /= count;
auto smaller_than_centrd = [avg_centrd](Primitive *p) { return p->get_bbox().centroid().x < avg_centrd.x; };
auto mid = std::partition(start, end, smaller_than_centrd);
BVHNode *node = new BVHNode(bbox);
if (mid == start || mid == end) {
std::sort(start, end, cmp);
node->l = construct_bvh(start, start + max_leaf_size, max_leaf_size); node->r = construct_bvh(start + max_leaf_size, end, max_leaf_size); } else { node->l = construct_bvh(start, mid, max_leaf_size); node->r = construct_bvh(mid, end, max_leaf_size); }
return node; } else { BBox bbox;
for (auto p = start; p != end; p++) { BBox bb = (*p)->get_bbox(); bbox.expand(bb); }
BVHNode *node = new BVHNode(bbox); node->start = start; node->end = end;
return node; } }
bool BVHAccel::has_intersection(const Ray &ray, BVHNode *node) const { // Take note that this function has a short-circuit that the // Intersection version cannot since it returns as soon as it finds // a hit, it doesn't actually have to find the closest hit.
double t0 = ray.min_t; double t1 = ray.max_t; if (!node->bb.intersect(ray, t0, t1)) return false;
if (node->l == NULL && node->r == NULL) { for (auto p = node->start; p != node->end; p++) { total_isects++; if ((*p)->has_intersection(ray)) return true; } return false; } return has_intersection(ray, node->l) || has_intersection(ray, node->r); }
bool BVHAccel::intersect(const Ray &ray, Intersection *i, BVHNode *node) const {
bool hit = false;
double t0 = ray.min_t; double t1 = ray.max_t; if (!node->bb.intersect(ray, t0, t1)) return hit; if (node->l == NULL && node->r == NULL) { for (auto p = node->start; p != node->end; p++) { total_isects++; hit = (*p)->intersect(ray, i) || hit; } return hit; } hit = intersect(ray, i, node->l) || hit; hit = intersect(ray, i, node->r) || hit; return hit; }
xxxxxxxxxxVector3D PathTracer::estimate_direct_lighting_hemisphere(const Ray &r, const Intersection &isect) { // Estimate the lighting from this intersection coming directly from a light. // For this function, sample uniformly in a hemisphere.
// Note: When comparing Cornel Box (CBxxx.dae) results to importance sampling, you may find the "glow" around the light source is gone. // This is totally fine: the area lights in importance sampling have directionality, however, in hemisphere sampling, we don't model this behavior.
// Make a coordinate system for a hit point // with N aligned with the Z direction. Matrix3x3 o2w; make_coord_space(o2w, isect.n); Matrix3x3 w2o = o2w.T();
// w_out points towards the source of the ray (e.g., // toward the camera if this is a primary ray) const Vector3D hit_p = r.o + r.d * isect.t; const Vector3D w_out = w2o * (-r.d);
// This is the same number of total samples as // estimate_direct_lighting_importance (outside of delta lights). We keep the // same number of samples for clarity of comparison. int num_samples = scene->lights.size() * ns_area_light; Vector3D L_out;
// Define the direction of the sample light emitted from the light source as w_in Vector3D w_in; double pdf; // f sampling pdf for (int i = 0; i < num_samples; i++) { Vector3D f = isect.bsdf->sample_f(w_out, &w_in, &pdf); // Get f, w_in, pdf Vector3D n = Vector3D(0, 0, 1); // Normal vector double cos_theta = dot(w_in, n); // Angle between two vectors // When calculating, w_in is converted to a vector in the local coordinate system at the intersection. Ray ray_in = Ray(hit_p, o2w * w_in); // Define the outgoing light ray_in.min_t = EPS_F; Intersection light_src; if (bvh->intersect(ray_in, &light_src)) { // Check whether it hits Vector3D L = light_src.bsdf->get_emission(); // Get the radiance directly L_out += f * L * cos_theta / pdf; } } L_out /= num_samples; return L_out; }
Vector3D PathTracer::estimate_direct_lighting_importance(const Ray &r, const Intersection &isect) { // Estimate the lighting from this intersection coming directly from a light. // To implement importance sampling, sample only from lights, not uniformly in // a hemisphere.
//Make a coordinate system for a hit point // with N aligned with the Z direction. Matrix3x3 o2w; make_coord_space(o2w, isect.n); Matrix3x3 w2o = o2w.T();
// w_out points towards the source of the ray (e.g., // toward the camera if this is a primary ray) const Vector3D hit_p = r.o + r.d * isect.t; const Vector3D w_out = w2o * (-r.d); Vector3D L_out;
Vector3D w_in; double distToLight; double pdf; for (SceneLight *light : scene->lights) { int num_samples = 0; Vector3D L_unit = Vector3D(0, 0, 0); for (int i = 0; i < ns_area_light; i++) { Vector3D L = light->sample_L(hit_p, &w_in, &distToLight, &pdf); // Get the radiance by sampling double cos_theta = dot(w_in, isect.n); Ray ray_in = Ray(hit_p, w_in, distToLight - EPS_F); ray_in.min_t = EPS_F; Intersection object; if (!bvh->intersect(ray_in, &object)) { Vector3D f = isect.bsdf->f(w_out, w_in); L_unit += f * L * cos_theta / pdf; } num_samples++; if (light->is_delta_light()) break; } L_out += L_unit / num_samples; }
return L_out; }
Vector3D PathTracer::at_least_one_bounce_radiance(const Ray &r, const Intersection &isect) { Matrix3x3 o2w; make_coord_space(o2w, isect.n); Matrix3x3 w2o = o2w.T();
Vector3D hit_p = r.o + r.d * isect.t; Vector3D w_out = w2o * (-r.d);
Vector3D L_out(0, 0, 0);
// Returns the one bounce radiance + radiance from extra bounces at this point. // Should be called recursively to simulate extra bounces.
// Direct illumination L_out = one_bounce_radiance(r, isect);
// Importance sampling Vector3D w_in; double pdf; Vector3D f = isect.bsdf->sample_f(w_out, &w_in, &pdf);
// Russian Roulette ray continuation probability double cpdf = 0.7;
// Recursive estimation of indirect illumination Intersection i_next; Ray r_next = Ray(hit_p, o2w * w_in); r_next.depth = r.depth - 1; r_next.min_t = EPS_F; double costheta = dot(w_in, Vector3D(0, 0, 1)); // condition: 1. depth limitation 2. Russian roulette condition 3. intersection if (coin_flip(cpdf) && bvh->intersect(r_next, &i_next) && r.depth > 1) { L_out += at_least_one_bounce_radiance(r_next, i_next) * f * costheta / pdf / cpdf; }
return L_out; }
xxxxxxxxxxvoid Cloth::simulate(double frames_per_sec, double simulation_steps, ClothParameters *cp, vector<Vector3D> external_accelerations, vector<CollisionObject *> *collision_objects){ double mass = width * height * cp->density / num_width_points / num_height_points; double delta_t = 1.0f / frames_per_sec / simulation_steps;
//Compute the total force acting on each point mass. Vector3D total_external_force(0, 0, 0); for (Vector3D &external_acceleration : external_accelerations) { total_external_force += external_acceleration * mass; } for (PointMass &pm : point_masses) pm.forces = total_external_force; for (Spring &s : springs) { Vector3D pa = s.pm_a->position; Vector3D pb = s.pm_b->position; Vector3D a_b = pb - pa; double l = s.rest_length; double F = cp->ks * (a_b.norm() - l); if (s.spring_type == STRUCTURAL && cp->enable_structural_constraints) { s.pm_a->forces += F * (a_b).unit(); s.pm_b->forces += F * (-a_b).unit(); } if (s.spring_type == SHEARING && cp->enable_shearing_constraints) { s.pm_a->forces += F * (a_b).unit(); s.pm_b->forces += F * (-a_b).unit(); } if (s.spring_type == BENDING && cp->enable_bending_constraints) { s.pm_a->forces += 0.2 * F * (a_b).unit(); s.pm_b->forces += 0.2 * F * (-a_b).unit(); } }
//Use Verlet integration to compute new point mass positions for (PointMass &pm : point_masses) { if (!pm.pinned) { Vector3D new_position = pm.position + (1 - cp->damping / 100) * (pm.position - pm.last_position) + pm.forces / mass * delta_t * delta_t; pm.last_position = pm.position; pm.position = new_position; } }
//Handle self-collisions. build_spatial_map(); for (PointMass &pm : point_masses) {
self_collide(pm, simulation_steps); }
//Handle collisions with other primitives. for (PointMass &pm : point_masses) for (CollisionObject *obj : *collision_objects) obj->collide(pm);
//Constrain the changes to be such that the spring does not change // in length more than 10% per timestep [Provot 1995]. for (Spring &s : springs) { Vector3D pa = s.pm_a->position; Vector3D pb = s.pm_b->position; Vector3D a_b = pb - pa; double l = s.rest_length; double L = (a_b.norm() - 1.1 * l); if (s.spring_type == STRUCTURAL && cp->enable_structural_constraints) { if (a_b.norm() > 1.1 * l) { if (!s.pm_a->pinned && !s.pm_b->pinned) { s.pm_a->position += (L / 2) * (a_b).unit(); s.pm_b->position += (L / 2) * (-a_b).unit(); } else if (!s.pm_a->pinned && s.pm_b->pinned) { s.pm_a->position += (a_b).unit() * L; } else if (s.pm_a->pinned && !s.pm_b->pinned) { s.pm_b->position += (-a_b).unit() * L; } } } if (s.spring_type == SHEARING && cp->enable_shearing_constraints) { if (a_b.norm() > 1.1 * l) { if (!s.pm_a->pinned && !s.pm_b->pinned) { s.pm_a->position += (L / 2) * (a_b).unit(); s.pm_b->position += (L / 2) * (-a_b).unit(); } else if (!s.pm_a->pinned && s.pm_b->pinned) { s.pm_a->position += (a_b).unit() * L; } else if (s.pm_a->pinned && !s.pm_b->pinned) { s.pm_b->position += (-a_b).unit() * L; } } } if (s.spring_type == BENDING && cp->enable_bending_constraints) { if (a_b.norm() > 1.1 * l) { if (!s.pm_a->pinned && !s.pm_b->pinned) { s.pm_a->position += (L / 2) * (a_b).unit(); s.pm_b->position += (L / 2) * (-a_b).unit(); } else if (!s.pm_a->pinned && s.pm_b->pinned) { s.pm_a->position += (a_b).unit() * L; } else if (s.pm_a->pinned && !s.pm_b->pinned) { s.pm_b->position += (-a_b).unit() * L; } } } }}
xxxxxxxxxx#version 330
// The camera's position in world-spaceuniform vec3 u_cam_pos;
// Coloruniform vec4 u_color;
// Properties of the single point lightuniform vec3 u_light_pos;uniform vec3 u_light_intensity;
// We also get the uniform texture we want to use.uniform sampler2D u_texture_1;
// These are the inputs which are the outputs of the vertex shader.in vec4 v_position;in vec4 v_normal;
// This is where the final pixel color is output.// Here, we are only interested in the first 3 dimensions (xyz).// The 4th entry in this vector is for "alpha blending" which we// do not require you to know about. For now, just set the alpha// to 1.out vec4 out_color;
void main() { vec3 kd = vec3(1, 1, 1); vec3 point2light = u_light_pos - v_position.xyz; float r2 = length(point2light) * length(point2light); vec3 n = normalize(v_normal.xyz); vec3 l = normalize(point2light);
vec3 Ld = kd * (u_light_intensity / r2) * max(0, dot(n, l)); out_color = vec4(Ld, 1); }
xxxxxxxxxx#version 330
uniform vec4 u_color;uniform vec3 u_cam_pos;uniform vec3 u_light_pos;uniform vec3 u_light_intensity;
in vec4 v_position;in vec4 v_normal;in vec2 v_uv;
out vec4 out_color;
void main() { //only ambient// vec3 ka = vec3(0.1, 0.1, 0.1);// vec3 kd = vec3(0, 0, 0);// vec3 ks = vec3(0, 0, 0);// vec3 Ia = vec3(1, 1, 1);// int p = 100;
//only diffuse// vec3 ka = vec3(0, 0, 0);// vec3 kd = u_color.xyz;// vec3 ks = vec3(0, 0, 0);// vec3 Ia = vec3(1, 1, 1);// int p = 100;
//only specular// vec3 ka = vec3(0, 0, 0);// vec3 kd = vec3(0, 0, 0);// vec3 ks = vec3(0.5, 0.5, 0.5);// vec3 Ia = vec3(1, 1, 1);// int p = 100;
//entire vec3 ka = vec3(0.1, 0.1, 0.1); vec3 kd = u_color.xyz; vec3 ks = vec3(0.5, 0.5, 0.5); vec3 Ia = vec3(1, 1, 1); int p = 100;
vec3 point2light = u_light_pos - v_position.xyz; vec3 point2cam = u_cam_pos - v_position.xyz; float r2 = length(point2light) * length(point2light); vec3 n = normalize(v_normal.xyz); vec3 l = normalize(point2light); vec3 v = normalize(point2cam); vec3 h = normalize(v + l); vec3 La = ka * Ia; vec3 Ld = kd * (u_light_intensity / r2) * max(0, dot(n, l)); vec3 Ls = ks * (u_light_intensity / r2) * pow(max(0, dot(n, h)), p); out_color = vec4(La + Ld + Ls, 1); }
xxxxxxxxxx#version 330
uniform vec3 u_cam_pos;uniform vec3 u_light_pos;uniform vec3 u_light_intensity;
uniform sampler2D u_texture_5;
in vec4 v_position;in vec4 v_normal;in vec2 v_uv;
out vec4 out_color;
void main() { out_color = texture(u_texture_5, v_uv);}
xxxxxxxxxx#version 330
uniform mat4 u_view_projection;uniform mat4 u_model;
uniform sampler2D u_texture_4;uniform vec2 u_texture_4_size;
uniform float u_normal_scaling;uniform float u_height_scaling;
in vec4 in_position;in vec4 in_normal;in vec4 in_tangent;in vec2 in_uv;
out vec4 v_position;out vec4 v_normal;out vec2 v_uv;out vec4 v_tangent;
void main() { v_position = u_model * in_position; v_normal = normalize(u_model * in_normal); v_uv = in_uv; v_tangent = normalize(u_model * in_tangent); gl_Position = u_view_projection * u_model * in_position;}
xxxxxxxxxx#version 330
uniform vec3 u_cam_pos;uniform vec3 u_light_pos;uniform vec3 u_light_intensity;
uniform vec4 u_color;
uniform sampler2D u_texture_4;uniform vec2 u_texture_4_size;
uniform float u_normal_scaling;uniform float u_height_scaling;
in vec4 v_position;in vec4 v_normal;in vec4 v_tangent;in vec2 v_uv;
out vec4 out_color;
float h(vec2 uv) { vec4 height = texture(u_texture_4, uv); return height.r;}
void main() { vec3 n = normalize(v_normal.xyz); vec3 t = normalize(v_tangent.xyz); vec3 b = cross(n, t); mat3 o2m = mat3(t, b, n);
vec2 u1v = vec2(v_uv.x + 1.0 / u_texture_4_size.x, v_uv.y); vec2 uv1 = vec2(v_uv.x, v_uv.y + 1.0 / u_texture_4_size.y);
float dU = (h(u1v) - h(v_uv)) * u_height_scaling * u_normal_scaling; float dV = (h(uv1) - h(v_uv)) * u_height_scaling * u_normal_scaling;
vec3 no = vec3(-dU, -dV, 1); vec3 nd = normalize(o2m * no);
vec3 ka = vec3(0.1, 0.1, 0.1); vec3 kd = vec3(0.8, 0.8, 0.8); vec3 ks = vec3(0.5, 0.5, 0.5); vec3 Ia = vec3(1, 1, 1); int p = 100;
vec3 point2light = u_light_pos - v_position.xyz; vec3 point2cam = u_cam_pos - v_position.xyz; float r2 = length(point2light) * length(point2light); vec3 l = normalize(point2light); vec3 v = normalize(point2cam); vec3 h = normalize(v + l); vec3 La = ka * Ia; vec3 Ld = kd * (u_light_intensity / r2) * max(0, dot(nd, l)); vec3 Ls = ks * (u_light_intensity / r2) * pow(max(0, dot(nd, h)), p); out_color = vec4(La + Ld + Ls, 1);
}
xxxxxxxxxx#version 330
uniform mat4 u_view_projection;uniform mat4 u_model;
uniform sampler2D u_texture_4;uniform vec2 u_texture_4_size;
uniform float u_normal_scaling;uniform float u_height_scaling;
in vec4 in_position;in vec4 in_normal;in vec4 in_tangent;in vec2 in_uv;
out vec4 v_position;out vec4 v_normal;out vec2 v_uv;out vec4 v_tangent;
float h(vec2 uv) { vec4 height = texture(u_texture_4, uv); return height.g;}
void main() { v_normal = normalize(u_model * in_normal); v_tangent = normalize(u_model * in_tangent); v_uv = in_uv;
vec4 disp = v_normal * h(v_uv) * u_height_scaling; v_position = u_model * in_position + disp;
gl_Position = u_view_projection * v_position;}
xxxxxxxxxx#version 330
uniform vec3 u_cam_pos;uniform vec3 u_light_pos;uniform vec3 u_light_intensity;
uniform vec4 u_color;
uniform sampler2D u_texture_4;uniform vec2 u_texture_4_size;
uniform float u_normal_scaling;uniform float u_height_scaling;
in vec4 v_position;in vec4 v_normal;in vec4 v_tangent;in vec2 v_uv;
out vec4 out_color;
float h(vec2 uv) { vec4 height = texture(u_texture_4, uv); return height.r;}
void main() { vec3 n = normalize(v_normal.xyz); vec3 t = normalize(v_tangent.xyz); vec3 b = cross(n, t); mat3 o2m = mat3(t, b, n);
vec2 u1v = vec2(v_uv.x + 1.0 / u_texture_4_size.x, v_uv.y); vec2 uv1 = vec2(v_uv.x, v_uv.y + 1.0 / u_texture_4_size.y);
float dU = (h(u1v) - h(v_uv)) * u_height_scaling * u_normal_scaling; float dV = (h(uv1) - h(v_uv)) * u_height_scaling * u_normal_scaling;
vec3 no = vec3(-dU, -dV, 1); vec3 nd = normalize(o2m * no);
vec3 ka = vec3(0.1, 0.1, 0.1); vec3 kd = vec3(0.8, 0.8, 0.8); vec3 ks = vec3(0.5, 0.5, 0.5); vec3 Ia = vec3(1, 1, 1); int p = 100;
vec3 point2light = u_light_pos - v_position.xyz; vec3 point2cam = u_cam_pos - v_position.xyz; float r2 = length(point2light) * length(point2light); vec3 l = normalize(point2light); vec3 v = normalize(point2cam); vec3 h = normalize(v + l); vec3 La = ka * Ia; vec3 Ld = kd * (u_light_intensity / r2) * max(0, dot(nd, l)); vec3 Ls = ks * (u_light_intensity / r2) * pow(max(0, dot(nd, h)), p); out_color = vec4(La + Ld + Ls, 1);}
xxxxxxxxxx#version 330
uniform vec3 u_cam_pos;
uniform samplerCube u_texture_cubemap;
in vec4 v_position;in vec4 v_normal;in vec4 v_tangent;
out vec4 out_color;
void main() { vec3 wo = normalize(u_cam_pos - v_position.xyz); vec3 n = normalize(v_normal.xyz); vec3 wi = normalize(2 * dot(wo, n) * n - wo); out_color = texture(u_texture_cubemap, wi);
}
xxxxxxxxxx//// Pthread implementation of filtering a JPEG image//
const int FILTER_SIZE = 3;const int HALF_SIZE = (FILTER_SIZE - 1) / 2;
// Structure to pass data to each threadstruct ThreadData{ unsigned char* input_red; unsigned char* input_green; unsigned char* input_blue; unsigned char* output_buffer; int start; int end; int origin_width; int padded_width;};
void* filterImage(void* arg){ ThreadData* data = reinterpret_cast<ThreadData*>(arg);
for (int i = data->start; i < data->end; i++) { unsigned char sum_r = 0, sum_g = 0, sum_b = 0; int start = (i / data->origin_width + 1) * data->padded_width + (i % data->origin_width + 1); int width = data->padded_width; int indices[9] = {start - 1 * width - 1, start - 1 * width, start - 1 * width + 1, start - 1, start, start + 1, start + 1 * width - 1, start + 1 * width, start + 1 * width + 1};
sum_r += data->input_red[indices[0]] / 9; sum_g += data->input_green[indices[0]] / 9; sum_b += data->input_blue[indices[0]] / 9;
sum_r += data->input_red[indices[1]] / 9; sum_g += data->input_green[indices[1]] / 9; sum_b += data->input_blue[indices[1]] / 9;
sum_r += data->input_red[indices[2]] / 9; sum_g += data->input_green[indices[2]] / 9; sum_b += data->input_blue[indices[2]] / 9;
sum_r += data->input_red[indices[3]] / 9; sum_g += data->input_green[indices[3]] / 9; sum_b += data->input_blue[indices[3]] / 9;
sum_r += data->input_red[indices[4]] / 9; sum_g += data->input_green[indices[4]] / 9; sum_b += data->input_blue[indices[4]] / 9;
sum_r += data->input_red[indices[5]] / 9; sum_g += data->input_green[indices[5]] / 9; sum_b += data->input_blue[indices[5]] / 9;
sum_r += data->input_red[indices[6]] / 9; sum_g += data->input_green[indices[6]] / 9; sum_b += data->input_blue[indices[6]] / 9;
sum_r += data->input_red[indices[7]] / 9; sum_g += data->input_green[indices[7]] / 9; sum_b += data->input_blue[indices[7]] / 9;
sum_r += data->input_red[indices[8]] / 9; sum_g += data->input_green[indices[8]] / 9; sum_b += data->input_blue[indices[8]] / 9;
data->output_buffer[i * 3] = sum_r; data->output_buffer[i * 3 + 1] = sum_g; data->output_buffer[i * 3 + 2] = sum_b; }
return nullptr;}
int main(int argc, char** argv){ // Verify input argument format if (argc != 4) { std::cerr << "Invalid argument, should be: ./executable " "/path/to/input/jpeg /path/to/output/jpeg num_threads\n"; return -1; }
int num_threads = std::stoi(argv[3]); // User-specified thread count
// Read from input JPEG const char* input_filepath = argv[1]; std::cout << "Input file from: " << input_filepath << "\n"; auto input_jpeg = read_from_jpeg(input_filepath);
auto filteredImage = new unsigned char[input_jpeg.width * input_jpeg.height * input_jpeg.num_channels];
for (int i = 0; i < input_jpeg.width * input_jpeg.height * input_jpeg.num_channels; ++i) filteredImage[i] = 0;
int origin_width = input_jpeg.width; int origin_height = input_jpeg.height; int padded_width = input_jpeg.width + HALF_SIZE * 2; int padded_height = input_jpeg.height + HALF_SIZE * 2;
unsigned char* padded_buffer = new unsigned char[padded_width * padded_height * input_jpeg.num_channels];
memset(padded_buffer, 0, padded_width * padded_height * input_jpeg.num_channels);
for (int y = 0; y < input_jpeg.height; y++) { for (int x = 0; x < input_jpeg.width; x++) { for (int c = 0; c < input_jpeg.num_channels; c++) { padded_buffer[(y + HALF_SIZE) * padded_width * input_jpeg.num_channels + (x + HALF_SIZE) * input_jpeg.num_channels + c] = input_jpeg .buffer[y * input_jpeg.width * input_jpeg.num_channels + x * input_jpeg.num_channels + c]; } } }
delete[] input_jpeg.buffer;
input_jpeg.width = padded_width; input_jpeg.height = padded_height; input_jpeg.buffer = padded_buffer;
// Prepross, store reds, greens and blues separately auto reds = new unsigned char[input_jpeg.width * input_jpeg.height]; auto greens = new unsigned char[input_jpeg.width * input_jpeg.height]; auto blues = new unsigned char[input_jpeg.width * input_jpeg.height];
for (int i = 0; i < input_jpeg.width * input_jpeg.height; i++) { reds[i] = input_jpeg.buffer[i * input_jpeg.num_channels]; greens[i] = input_jpeg.buffer[i * input_jpeg.num_channels + 1]; blues[i] = input_jpeg.buffer[i * input_jpeg.num_channels + 2]; }
pthread_t threads[num_threads]; ThreadData thread_data[num_threads];
auto start_time = std::chrono::high_resolution_clock::now();
int chunk_size = origin_width * origin_height / num_threads; for (int i = 0; i < num_threads; i++) { thread_data[i].input_red = reds; thread_data[i].input_green = greens; thread_data[i].input_blue = blues; thread_data[i].output_buffer = filteredImage; thread_data[i].start = i * chunk_size; thread_data[i].end = (i == num_threads - 1) ? origin_width * origin_height : (i + 1) * chunk_size; thread_data[i].origin_width = origin_width; thread_data[i].padded_width = padded_width;
pthread_create(&threads[i], nullptr, filterImage, &thread_data[i]); }
// Wait for all threads to finish for (int i = 0; i < num_threads; i++) { pthread_join(threads[i], nullptr); }
auto end_time = std::chrono::high_resolution_clock::now(); auto elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>( end_time - start_time);
// Write filteredImage to output JPEG const char* output_filepath = argv[2]; std::cout << "Output file to: " << output_filepath << "\n"; JPEGMeta output_jpeg{filteredImage, origin_width, origin_height, 3, input_jpeg.color_space}; if (write_to_jpeg(output_jpeg, output_filepath)) { std::cerr << "Failed to write output JPEG\n"; return -1; }
// Release allocated memory delete[] input_jpeg.buffer; delete[] filteredImage; delete[] reds; delete[] greens; delete[] blues;
std::cout << "Transformation Complete!" << std::endl; std::cout << "Execution Time: " << elapsed_time.count() << " milliseconds\n";
return 0;}
xxxxxxxxxx//// CUDA implementation of filtering a JPEG image//
// CUDA Header
const int FILTER_SIZE = 3;const int HALF_SIZE = (FILTER_SIZE - 1) / 2;
// CUDA kernel functon__global__ void filterImage(const unsigned char* input, unsigned char* output, int width, int height, int num_channels){ int idx = blockIdx.x * blockDim.x + threadIdx.x; int origin_width = width - 2; int origin_height = height - 2; int padded_width = width; if (idx < origin_width * origin_height) { unsigned char sum_r = 0, sum_g = 0, sum_b = 0; int start = (idx / origin_width + 1) * padded_width + (idx % origin_width + 1); int indices[9] = {start - 1 * width - 1, start - 1 * width, start - 1 * width + 1, start - 1, start, start + 1, start + 1 * width - 1, start + 1 * width, start + 1 * width + 1};
sum_r += input[indices[0] * num_channels] * 1 / 9; sum_g += input[indices[0] * num_channels + 1] * 1 / 9; sum_b += input[indices[0] * num_channels + 2] * 1 / 9;
sum_r += input[indices[1] * num_channels] * 1 / 9; sum_g += input[indices[1] * num_channels + 1] * 1 / 9; sum_b += input[indices[1] * num_channels + 2] * 1 / 9;
sum_r += input[indices[2] * num_channels] * 1 / 9; sum_g += input[indices[2] * num_channels + 1] * 1 / 9; sum_b += input[indices[2] * num_channels + 2] * 1 / 9;
sum_r += input[indices[3] * num_channels] * 1 / 9; sum_g += input[indices[3] * num_channels + 1] * 1 / 9; sum_b += input[indices[3] * num_channels + 2] * 1 / 9;
sum_r += input[indices[4] * num_channels] * 1 / 9; sum_g += input[indices[4] * num_channels + 1] * 1 / 9; sum_b += input[indices[4] * num_channels + 2] * 1 / 9;
sum_r += input[indices[5] * num_channels] * 1 / 9; sum_g += input[indices[5] * num_channels + 1] * 1 / 9; sum_b += input[indices[5] * num_channels + 2] * 1 / 9;
sum_r += input[indices[6] * num_channels] * 1 / 9; sum_g += input[indices[6] * num_channels + 1] * 1 / 9; sum_b += input[indices[6] * num_channels + 2] * 1 / 9;
sum_r += input[indices[7] * num_channels] * 1 / 9; sum_g += input[indices[7] * num_channels + 1] * 1 / 9; sum_b += input[indices[7] * num_channels + 2] * 1 / 9;
sum_r += input[indices[8] * num_channels] * 1 / 9; sum_g += input[indices[8] * num_channels + 1] * 1 / 9; sum_b += input[indices[8] * num_channels + 2] * 1 / 9;
int output_index = idx * num_channels; output[output_index] = sum_r; output[output_index + 1] = sum_g; output[output_index + 2] = sum_b; }}
int main(int argc, char** argv){ // Verify input argument format if (argc != 3) { std::cerr << "Invalid argument, should be: ./executable " "/path/to/input/jpeg /path/to/output/jpeg\n"; return -1; } // Read from input JPEG const char* input_filepath = argv[1]; std::cout << "Input file from: " << input_filepath << "\n"; auto input_jpeg = read_from_jpeg(input_filepath);
auto filteredImage = new unsigned char[input_jpeg.width * input_jpeg.height * input_jpeg.num_channels];
for (int i = 0; i < input_jpeg.width * input_jpeg.height * input_jpeg.num_channels; ++i) filteredImage[i] = 0;
int origin_width = input_jpeg.width; int origin_height = input_jpeg.height; int padded_width = input_jpeg.width + HALF_SIZE * 2; int padded_height = input_jpeg.height + HALF_SIZE * 2;
unsigned char* padded_buffer = new unsigned char[padded_width * padded_height * input_jpeg.num_channels];
memset(padded_buffer, 0, padded_width * padded_height * input_jpeg.num_channels);
for (int y = 0; y < input_jpeg.height; y++) { for (int x = 0; x < input_jpeg.width; x++) { for (int c = 0; c < input_jpeg.num_channels; c++) { padded_buffer[(y + HALF_SIZE) * padded_width * input_jpeg.num_channels + (x + HALF_SIZE) * input_jpeg.num_channels + c] = input_jpeg .buffer[y * input_jpeg.width * input_jpeg.num_channels + x * input_jpeg.num_channels + c]; } } }
delete[] input_jpeg.buffer;
input_jpeg.width = padded_width; input_jpeg.height = padded_height; input_jpeg.buffer = padded_buffer;
// Allocate memory on host (CPU)
// Allocate memory on device (GPU) unsigned char* d_input; unsigned char* d_output; cudaMalloc((void**)&d_input, input_jpeg.width * input_jpeg.height * input_jpeg.num_channels * sizeof(unsigned char)); cudaMalloc((void**)&d_output, origin_width * origin_height * input_jpeg.num_channels * sizeof(unsigned char)); // Copy input data from host to device cudaMemcpy(d_input, input_jpeg.buffer, input_jpeg.width * input_jpeg.height * input_jpeg.num_channels * sizeof(unsigned char), cudaMemcpyHostToDevice);
cudaEvent_t start, stop; float gpuDuration; cudaEventCreate(&start); cudaEventCreate(&stop); int blockSize = 512; int numBlocks = (origin_width * origin_height) / blockSize + 1; cudaEventRecord(start, 0); // GPU start time filterImage<<<numBlocks, blockSize>>>(d_input, d_output, input_jpeg.width, input_jpeg.height, input_jpeg.num_channels); cudaEventRecord(stop, 0); // GPU end time cudaEventSynchronize(stop); // Print the result of the GPU computation cudaEventElapsedTime(&gpuDuration, start, stop); // Copy output data from device to host cudaMemcpy(filteredImage, d_output, origin_width * origin_height * input_jpeg.num_channels * sizeof(unsigned char), cudaMemcpyDeviceToHost);
const char* output_filepath = argv[2]; std::cout << "Output file to: " << output_filepath << "\n"; JPEGMeta output_jpeg{filteredImage, origin_width, origin_height, 3, JCS_RGB}; if (write_to_jpeg(output_jpeg, output_filepath)) { std::cerr << "Failed to write output JPEG\n"; return -1; } // Release allocated memory on device and host cudaFree(d_input); cudaFree(d_output); delete[] input_jpeg.buffer; delete[] filteredImage;
std::cout << "Transformation Complete!" << std::endl; std::cout << "GPU Execution Time: " << gpuDuration << " milliseconds" << std::endl; cudaEventDestroy(start); cudaEventDestroy(stop); return 0;}
xxxxxxxxxx//// OpenACC implementation of filtering a JPEG image//
// #include <openacc.h> // OpenACC Header
const int FILTER_SIZE = 3;const int HALF_SIZE = (FILTER_SIZE - 1) / 2;
int main(int argc, char **argv){ // Verify input argument format if (argc != 3) { std::cerr << "Invalid argument, should be: ./executable " "/path/to/input/jpeg /path/to/output/jpeg\n"; return -1; } // Read from input JPEG const char *input_filepath = argv[1]; std::cout << "Input file from: " << input_filepath << "\n"; JPEGMeta input_jpeg = read_from_jpeg(input_filepath);
auto filteredImage = new unsigned char[input_jpeg.width * input_jpeg.height * input_jpeg.num_channels];
for (int i = 0; i < input_jpeg.width * input_jpeg.height * input_jpeg.num_channels; ++i) filteredImage[i] = 0;
int origin_width = input_jpeg.width; int origin_height = input_jpeg.height; int padded_width = input_jpeg.width + HALF_SIZE * 2; int padded_height = input_jpeg.height + HALF_SIZE * 2;
unsigned char *padded_buffer = new unsigned char[padded_width * padded_height * input_jpeg.num_channels];
memset(padded_buffer, 0, padded_width * padded_height * input_jpeg.num_channels);
for (int y = 0; y < input_jpeg.height; y++) { for (int x = 0; x < input_jpeg.width; x++) { for (int c = 0; c < input_jpeg.num_channels; c++) { padded_buffer[(y + HALF_SIZE) * padded_width * input_jpeg.num_channels + (x + HALF_SIZE) * input_jpeg.num_channels + c] = input_jpeg .buffer[y * input_jpeg.width * input_jpeg.num_channels + x * input_jpeg.num_channels + c]; } } }
delete[] input_jpeg.buffer;
input_jpeg.width = padded_width; input_jpeg.height = padded_height; input_jpeg.buffer = padded_buffer;
int width = input_jpeg.width; int height = input_jpeg.height; int num_channels = input_jpeg.num_channels;
unsigned char *buffer = new unsigned char[width * height * num_channels]; for (int i = 0; i < width * height * num_channels; i++) { buffer[i] = input_jpeg.buffer[i]; }
auto start_time = std::chrono::high_resolution_clock::now(); { for (int i = 0; i < origin_width * origin_height; i++) { unsigned char sum_r = 0, sum_g = 0, sum_b = 0; int start = (i / origin_width + 1) * padded_width + (i % origin_width + 1); int width = padded_width;
int indices[9] = {start - 1 * width - 1, start - 1 * width, start - 1 * width + 1, start - 1, start, start + 1, start + 1 * width - 1, start + 1 * width, start + 1 * width + 1};
sum_r += buffer[indices[0] * num_channels] / 9; sum_g += buffer[indices[0] * num_channels + 1] / 9; sum_b += buffer[indices[0] * num_channels + 2] / 9;
sum_r += buffer[indices[1] * num_channels] / 9; sum_g += buffer[indices[1] * num_channels + 1] / 9; sum_b += buffer[indices[1] * num_channels + 2] / 9;
sum_r += buffer[indices[2] * num_channels] / 9; sum_g += buffer[indices[2] * num_channels + 1] / 9; sum_b += buffer[indices[2] * num_channels + 2] / 9;
sum_r += buffer[indices[3] * num_channels] / 9; sum_g += buffer[indices[3] * num_channels + 1] / 9; sum_b += buffer[indices[3] * num_channels + 2] / 9;
sum_r += buffer[indices[4] * num_channels] / 9; sum_g += buffer[indices[4] * num_channels + 1] / 9; sum_b += buffer[indices[4] * num_channels + 2] / 9;
sum_r += buffer[indices[5] * num_channels] / 9; sum_g += buffer[indices[5] * num_channels + 1] / 9; sum_b += buffer[indices[5] * num_channels + 2] / 9;
sum_r += buffer[indices[6] * num_channels] / 9; sum_g += buffer[indices[6] * num_channels + 1] / 9; sum_b += buffer[indices[6] * num_channels + 2] / 9;
sum_r += buffer[indices[7] * num_channels] / 9; sum_g += buffer[indices[7] * num_channels + 1] / 9; sum_b += buffer[indices[7] * num_channels + 2] / 9;
sum_r += buffer[indices[8] * num_channels] / 9; sum_g += buffer[indices[8] * num_channels + 1] / 9; sum_b += buffer[indices[8] * num_channels + 2] / 9;
filteredImage[i * 3] = sum_r; filteredImage[i * 3 + 1] = sum_g; filteredImage[i * 3 + 2] = sum_b; } } auto end_time = std::chrono::high_resolution_clock::now();
auto elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>( end_time - start_time);
const char *output_filepath = argv[2]; std::cout << "Output file to: " << output_filepath << "\n"; JPEGMeta output_jpeg{filteredImage, origin_width, origin_height, 3, input_jpeg.color_space}; if (write_to_jpeg(output_jpeg, output_filepath)) { std::cerr << "Failed to write output JPEG\n"; return -1; } // Release allocated memory delete[] input_jpeg.buffer; delete[] buffer; delete[] filteredImage;
std::cout << "Transformation Complete!" << std::endl; std::cout << "Execution Time: " << elapsed_time.count() << " milliseconds\n"; return 0;}
xxxxxxxxxx//// MPI + OpenMp + SIMD + Reordering Matrix Multiplication//
// MPI Header
Matrix matrix_multiply_mpi(const Matrix& matrix1, const Matrix& matrix2, const size_t start_row, const size_t end_row){ if (matrix1.getCols() != matrix2.getRows()) { throw std::invalid_argument( "Matrix dimensions are not compatible for multiplication."); }
size_t M = matrix1.getRows(), K = matrix1.getCols(), N = matrix2.getCols();
Matrix result(end_row - start_row, N);
for (size_t i = start_row; i < end_row; i++) { int* result_i = result[i - start_row]; const int* matrix1_i = matrix1[i];
__m256i result_vecs[(N + 7) / 8];
for (size_t j = 0; j < (N + 7) / 8; j++) { result_vecs[j] = _mm256_setzero_si256(); }
for (size_t k = 0; k < K; k++) { int matrix1_ik = matrix1_i[k]; const int* matrix2_k = matrix2[k];
for (size_t j = 0; j < N; j += 8) { __m256i m1_vec = _mm256_set1_epi32(matrix1_ik); __m256i m2_vec = _mm256_load_si256((__m256i*)&matrix2_k[j]);
__m256i prod_vec = _mm256_mullo_epi32(m1_vec, m2_vec);
result_vecs[j / 8] = _mm256_add_epi32(result_vecs[j / 8], prod_vec); } }
for (size_t j = 0; j < N; j += 8) { _mm256_store_si256((__m256i*)&result_i[j], result_vecs[j / 8]); } }
return result;}
int main(int argc, char** argv){ // Verify input argument format if (argc != 5) { throw std::invalid_argument( "Invalid argument, should be: ./executable thread_num " "/path/to/matrix1 /path/to/matrix2 /path/to/multiply_result\n"); }
// Start the MPI MPI_Init(&argc, &argv); // How many processes are running int numtasks; MPI_Comm_size(MPI_COMM_WORLD, &numtasks); // What's my rank? int taskid; MPI_Comm_rank(MPI_COMM_WORLD, &taskid); // Which node am I running on? int len; char hostname[MPI_MAX_PROCESSOR_NAME]; MPI_Get_processor_name(hostname, &len); MPI_Status status;
int thread_num = atoi(argv[1]); omp_set_num_threads(thread_num);
// Read Matrix const std::string matrix1_path = argv[2];
const std::string matrix2_path = argv[3];
const std::string result_path = argv[4];
Matrix matrix1 = Matrix::loadFromFile(matrix1_path);
Matrix matrix2 = Matrix::loadFromFile(matrix2_path);
size_t M = matrix1.getRows(); size_t N = matrix2.getCols();
int row_num_per_task = M / numtasks; int left_row_num = M % numtasks; std::vector<int> rows(numtasks + 1, 0); int divided_left_row_num = 0;
for (int i = 0; i < numtasks; i++) { if (divided_left_row_num < left_row_num) { rows[i + 1] = rows[i] + row_num_per_task + 1; divided_left_row_num++; } else rows[i + 1] = rows[i] + row_num_per_task; }
auto start_time = std::chrono::high_resolution_clock::now(); if (taskid == MASTER) { Matrix result = matrix_multiply_mpi(matrix1, matrix2, rows[MASTER], rows[MASTER + 1]);
Matrix final_result(M, N);
// Your Code Here for Synchronization!
for (int i = rows[MASTER]; i < rows[MASTER + 1]; i++) { for (int j = 0; j < N; j++) { final_result[i][j] = result[i][j]; } }
for (int i = MASTER + 1; i < numtasks; i++) { for (size_t j = rows[i]; j < rows[i + 1]; j++) { int* start_pos = final_result[j];
MPI_Recv(start_pos, N, MPI_INT, i, TAG_GATHER, MPI_COMM_WORLD, &status); } }
auto end_time = std::chrono::high_resolution_clock::now(); auto elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
final_result.saveToFile(result_path);
std::cout << "Output file to: " << result_path << std::endl;
std::cout << "Multiplication Complete!" << std::endl; std::cout << "Execution Time: " << elapsed_time.count() << " milliseconds" << std::endl; } else { Matrix result = matrix_multiply_mpi(matrix1, matrix2, rows[taskid], rows[taskid + 1]);
// Your Code Here for Synchronization! for (size_t i = rows[taskid]; i < rows[taskid + 1]; i++) { MPI_Send(result[i - rows[taskid]], N, MPI_INT, MASTER, TAG_GATHER, MPI_COMM_WORLD); } }
MPI_Finalize(); return 0;}