Skip to content
This repository has been archived by the owner on Dec 5, 2023. It is now read-only.

Commit

Permalink
version finale
Browse files Browse the repository at this point in the history
  • Loading branch information
[jérémy morlier] committed Mar 24, 2023
1 parent ce4a6fe commit a9c796b
Show file tree
Hide file tree
Showing 6 changed files with 222 additions and 197 deletions.
2 changes: 1 addition & 1 deletion src/Display/Display_SDL2/Display_SDL2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ ::update(bool& done)
for (int i = 0; i < particles.x.size(); i++)
{
glBegin (GL_POINTS);
glColor3f (1.0f, 1.0f, 1.0f);
glColor3f (0.0f, 1.0f, 1.0f);
glVertex3f(particles.x[i], particles.y[i], particles.z[i]);
glEnd();
}
Expand Down
313 changes: 161 additions & 152 deletions src/Model/Model_CPU/Model_CPU_fast/Model_CPU_fast.cpp
Original file line number Diff line number Diff line change
@@ -1,71 +1,5 @@
// #ifdef GALAX_MODEL_CPU_FAST

// #include <cmath>

// #include "Model_CPU_fast.hpp"

// #include <xsimd/xsimd.hpp>
// #include <omp.h>

// namespace xs = xsimd;
// using b_type = xs::batch<float, xs::avx2>;

// Model_CPU_fast
// ::Model_CPU_fast(const Initstate& initstate, Particles& particles)
// : Model_CPU(initstate, particles)
// {
// }

// void Model_CPU_fast
// ::step()
// {
// std::fill(accelerationsx.begin(), accelerationsx.end(), 0);
// std::fill(accelerationsy.begin(), accelerationsy.end(), 0);
// std::fill(accelerationsz.begin(), accelerationsz.end(), 0);

// // // OMP version
// // #pragma omp parallel for collapse(2)
// // for (int i = 0; i < n_particles; i ++)
// // {
// // for (int j = 0; j < n_particles; j++)
// // {
// // if(i != j)
// // {
// // const float diffx = particles.x[j] - particles.x[i];
// // const float diffy = particles.y[j] - particles.y[i];
// // const float diffz = particles.z[j] - particles.z[i];

// // float dij = diffx * diffx + diffy * diffy + diffz * diffz;

// // if (dij < 1.0)  // NOTE(review): stray "b_type::size" scrape garble removed; the branch body (presumably a clamp such as dij = 10.0) was lost

// // accelerationsx[i] += diffx * dij * initstate.masses[j];
// // accelerationsy[i] += diffy * dij * initstate.masses[j];
// // accelerationsz[i] += diffz * dij * initstate.masses[j];
// // }
// // }
// // }
// // #pragma omp parallel for
// // for (int i = 0; i < n_particles; i++)
// // {
// // velocitiesx[i] += accelerationsx[i] * 2.0f;
// // velocitiesy[i] += accelerationsy[i] * 2.0f;
// // velocitiesz[i] += accelerationsz[i] * 2.0f;
// // particles.x[i] += velocitiesx [i] * 0.1f;
// // particles.y[i] += velocitiesy [i] * 0.1f;
// // particles.z[i] += velocitiesz [i] * 0.1f;
// // }


// // OMP + xsimd version
// // std::size_t all_size = accelerationsx.size();
// // std::size_t vec_size = all_size - all_size % b_type::size;

// }

//#endif // GALAX_MODEL_CPU_FAST
#ifdef GALAX_MODEL_CPU_FAST

#include <iostream>
#include <cmath>

#include "Model_CPU_fast.hpp"
Expand All @@ -76,98 +10,173 @@
namespace xs = xsimd;
using b_type = xs::batch<float, xs::avx2>;

// Constructor: forwards the initial state and the particle storage to the
// Model_CPU base class; no extra state is set up here.
// NOTE(review): this text comes from a flattened diff view — BOTH the old
// split-line signature and the new one-line signature appear below; only one
// of them exists in the real file. Reconcile before compiling.
Model_CPU_fast
::Model_CPU_fast(const Initstate& initstate, Particles& particles)
: Model_CPU(initstate, particles)
Model_CPU_fast ::Model_CPU_fast(const Initstate &initstate, Particles &particles)
: Model_CPU(initstate, particles)
{
}

void Model_CPU_fast
::step()
// Compile-time index generator for xsimd::make_batch_constant: entry i of the
// resulting constant batch is (i - 1) modulo n, i.e. a rotate-down-by-one lane
// pattern used to cycle SIMD registers through every lane pairing.
struct Rot
{
    // Source lane for destination lane `i` in an `n`-lane batch.
    static constexpr unsigned get(unsigned i, unsigned n)
    {
        const unsigned shifted = i + n - 1u; // n - 1 == -1 (mod n), avoids signed math
        return shifted % n;
    }
};

constexpr auto mask = xsimd::make_batch_constant<xsimd::batch<uint32_t, xsimd::avx2>, Rot>();

// Advance the N-body simulation by one step (xsimd AVX2 batches + OpenMP).
// NOTE(review): this block was recovered from a flattened diff view, so it
// appears to contain BOTH the removed and the added implementations in
// sequence (e.g. `tens_batch` is declared twice, and positions are advanced by
// two separate loops near the end). Reconcile against the real file before
// trusting any one path below.
void Model_CPU_fast ::step()
{
// Zero the per-body acceleration accumulators for this step.
std::fill(accelerationsx.begin(), accelerationsx.end(), 0);
std::fill(accelerationsy.begin(), accelerationsy.end(), 0);
std::fill(accelerationsz.begin(), accelerationsz.end(), 0);



// OMP + xsimd version
// std::size_t all_size = accelerationsx.size();
// std::size_t vec_size = all_size - all_size % b_type::size;
// NOTE(review): the i/j loop below looks like the superseded implementation
// kept by the diff view: it accumulates accelerations with a force kernel
// clamped to 10 when the squared distance is below 1.
const b_type ones_batch = 1.0;
const b_type tens_batch = 10.0;
//#pragma omp parallel for
for (int i = 0; i < n_particles; i += b_type::size)
{
// load registers body i
const b_type rposx_i = b_type::load_unaligned(&particles.x[i]);
const b_type rposy_i = b_type::load_unaligned(&particles.y[i]);
const b_type rposz_i = b_type::load_unaligned(&particles.z[i]);
b_type raccx_i = b_type::load_unaligned(&accelerationsx[i]);
b_type raccy_i = b_type::load_unaligned(&accelerationsy[i]);
b_type raccz_i = b_type::load_unaligned(&accelerationsz[i]);
// #pragma omp parallel for
for (int j = 0; j < n_particles; j += b_type::size)
{
//load registers body j
const b_type rposx_j = b_type::load_unaligned(&particles.x[j]);
const b_type rposy_j = b_type::load_unaligned(&particles.y[j]);
const b_type rposz_j = b_type::load_unaligned(&particles.z[j]);

const b_type rmasses = b_type::load_unaligned(&initstate.masses[j]);

const auto diffx = rposx_j - rposx_i;
const auto diffy = rposy_j - rposy_i;
const auto diffz = rposz_j - rposz_i;

// Lane-wise squared distance between bodies i and j.
auto dij = diffx * diffx + diffy * diffy + diffz * diffz;

// cond is true where dij < 1: close pairs get the clamped value instead.
auto cond = xs::gt(ones_batch, dij);

// 10 / d^3 softening. NOTE(review): `10.0` is a double literal mixed into
// float batch math, and sqrt(dij) is computed three times — something like
// `10.0f / (dij * xs::sqrt(dij))` was probably intended.
dij = xs::select(cond, tens_batch, 10.0/(xs::sqrt(dij)*xs::sqrt(dij)*xs::sqrt(dij)));

raccx_i += diffx * dij * rmasses;
raccy_i += diffy * dij * rmasses;
raccz_i += diffz * dij * rmasses;

// NOTE(review): these stores sit inside the j loop; one store after the
// loop would suffice and avoid redundant memory traffic.
raccx_i.store_unaligned(&accelerationsx[i]);
raccy_i.store_unaligned(&accelerationsy[i]);
raccz_i.store_unaligned(&accelerationsz[i]);

}

}
// std::fill(accelerationsx.begin(), accelerationsx.end(), 0);
// std::fill(accelerationsy.begin(), accelerationsy.end(), 0);
// std::fill(accelerationsz.begin(), accelerationsz.end(), 0);

//OMP + xsimd version

const b_type zeros_batch = 0.0;
// NOTE(review): duplicate of the `tens_batch` declared above — a flattened
// diff artifact; the real file presumably declares it only once.
const b_type tens_batch = 10.0;

// NOTE(review): the commented-out region below is corrupted scrape text
// (debug prints, a pasted screenshot file name, truncated statements); it is
// kept verbatim and should not be taken as a coherent earlier version.
// #pragma omp parallel for
// for (int i = 0; i < n_particles; i += b_type::size)
// {
// // load registers body i
// const b_type rposx_i = b_type::load_unaligned(&particles.x[i]);
// const b_type rposy_i = b_type::load_unaligned(&particles.y[i]);
// const b_type rposz_i = b_type::load_unaligned(&particles.z[i]);
// // b_type raccx_i = b_type::load_unaligned(&accelerationsx[i]);
// // b_type raccy_i = b_type::load_unaligned(&accelerationsy[i]);
// // b_type racc // b_type test = b_type::load_unaligned(&particles.x[0]);
// for(int i = 0; i <b_type::size; i++ )
// {
// std::cout<<test.get(i) << " ";
// }Capture d’écran du 2023-03-24 11-57-00
// std::cout<< std::endl;
// xsimd::swizzle(test, mask);
// for(int i = 0; i <b_type::size; i++ )
// {
// std::cout<<test.get(i) << " ";
// }
// std::cout<< std::endl;ype::load_unaligned(&velocitiesy[i]);
// b_type rvelz_i = b_type::load_unaligned(&velocitiesz[i]);
// //#pragma omp parallel forrmasses_j
// for (int j = 0; j < test = ters body j
// const b_type rposx_j = particles.x[j];
// const b_type rposy_j = particles.y[j];
// const b_type rposz_j = particles.z[j];
// const b_type rmasses = initstate.masses[j];
// b_type test = b_type::load_unaligned(&particles.x[0]);
// for(int i = 0; i <b_type::size; i++ )
// {
// std::cout<<test.get(i) << " ";
// }
// std::cout<< std::endl;
// xsimd::swizzle(test, mask);
// for(int i = 0; i <b_type::size; i++ )rmasses_j
// std::cout<< std::endl;/ (dij_s * dij_s * dij_s);
// dij = 10 / (xs::sqrt(dij) * dij);
// dij = xs::min(dij, tens_batch);

// //dij = xs::select(cond, zeros_batch, dij);
// dij = dij * rmasses * 2.0f;

// rvelx_i += diffx * dij;
// rvely_i += diffy * dij;
// rvelz_i += diffz * dij;
// }
// // raccx_i.store_unaligned(&accelerationsx[i]);
// // raccy_i.store_unaligned(&accelerationsy[i]);
// // raccz_i.store_unaligned(&accelerationsz[i]);

// rvelx_i.store_unaligned(&velocitiesx[i]);
// rvely_i.store_unaligned(&velocitiesy[i]);
// rvelz_i.store_unaligned(&velocitiesz[i]);
// }
// Pairwise pass exploiting symmetry: each (i, j >= i) block pair is visited
// once, and both blocks' velocities receive their contributions directly
// (accelerations are bypassed in this path).
#pragma omp parallel for
for (int i = 0; i < n_particles; i += b_type::size)
{
// load registers body i
const b_type rposx_i = b_type::load_unaligned(&particles.x[i]);
const b_type rposy_i = b_type::load_unaligned(&particles.y[i]);
const b_type rposz_i = b_type::load_unaligned(&particles.z[i]);
b_type rvelx_i = b_type::load_unaligned(&velocitiesx[i]);
b_type rvely_i = b_type::load_unaligned(&velocitiesy[i]);
b_type rvelz_i = b_type::load_unaligned(&velocitiesz[i]);
const b_type rmasses_i = b_type::load_unaligned(&initstate.masses[i]);

// NOTE(review): a second `parallel for` nested inside an already-parallel i
// loop; moreover, different i iterations read-modify-write velocities[j]
// for overlapping j ranges, which looks like a data race under the outer
// parallel for. TODO confirm intent / add reduction or privatization.
#pragma omp parallel for
for (int j = i; j < n_particles; j += b_type::size)
{
b_type rvelx_j = b_type::load_unaligned(&velocitiesx[j]);
b_type rvely_j = b_type::load_unaligned(&velocitiesy[j]);
b_type rvelz_j = b_type::load_unaligned(&velocitiesz[j]);
b_type rposx_j = b_type::load_unaligned(&particles.x[j]);
b_type rposy_j = b_type::load_unaligned(&particles.y[j]);
b_type rposz_j = b_type::load_unaligned(&particles.z[j]);

b_type rmasses_j = b_type::load_unaligned(&initstate.masses[j]);

// Rotate the j-side registers b_type::size times so that every i lane is
// paired with every j lane exactly once.
for (int k = 0; k < b_type::size; k++)
{

const b_type diffx = rposx_j - rposx_i;
const b_type diffy = rposy_j - rposy_i;
const b_type diffz = rposz_j - rposz_i;
b_type dij = diffx * diffx + diffy * diffy + diffz * diffz;

// Lanes where dij == 0 (a body paired with itself when j == i) must
// contribute no force; remembered here and zeroed after the kernel.
auto cond = xs::eq(zeros_batch, dij);

// Softened kernel 10 / d^3 (via 10 / (sqrt(d2) * d2)), clamped to 10.
dij = 10 / (xs::sqrt(dij) * dij);
dij = xs::min(dij, tens_batch);

dij = xs::select(cond, zeros_batch, dij);
// Contribution to block i; the 2.0f factor is presumably the timestep
// scaling otherwise applied via accelerations — TODO confirm.
auto dij_a = dij * rmasses_j * 2.0f;
rvelx_i += diffx * dij_a;
rvely_i += diffy * dij_a;
rvelz_i += diffz * dij_a;

// Newton's third law: equal and opposite contribution to block j.
auto dij_b = - dij * rmasses_i * 2.0f;
rvelx_j += diffx * dij_b;
rvely_j += diffy * dij_b;
rvelz_j += diffz * dij_b;

// Rotate every j-side register down one lane for the next k iteration.
rposx_j = xsimd::swizzle(rposx_j, mask);
rposy_j = xsimd::swizzle(rposy_j, mask);
rposz_j = xsimd::swizzle(rposz_j, mask);
rmasses_j = xsimd::swizzle(rmasses_j, mask);
rvelx_j = xsimd::swizzle(rvelx_j, mask);
rvely_j = xsimd::swizzle(rvely_j, mask);
rvelz_j = xsimd::swizzle(rvelz_j, mask);
}
rvelx_j.store_unaligned(&velocitiesx[j]);
rvely_j.store_unaligned(&velocitiesy[j]);
rvelz_j.store_unaligned(&velocitiesz[j]);
}

rvelx_i.store_unaligned(&velocitiesx[i]);
rvely_i.store_unaligned(&velocitiesy[i]);
rvelz_i.store_unaligned(&velocitiesz[i]);
}

// Integration: vel += acc * 2.0f, pos += vel * 0.1f.
// NOTE(review): in the active path above accelerations stay zero-filled, so
// the acc term is a no-op here; and the loop after this one advances the
// positions a second time — one of the two position updates is presumably a
// diff leftover. TODO confirm which loop the final file keeps.
#pragma omp parallel for
for (int i = 0; i < n_particles; i += b_type::size)
{
b_type rposx_i = b_type::load_unaligned(&particles.x[i]);
b_type rposy_i = b_type::load_unaligned(&particles.y[i]);
b_type rposz_i = b_type::load_unaligned(&particles.z[i]);
b_type rvelx_i = b_type::load_unaligned(&velocitiesx[i]);
b_type rvely_i = b_type::load_unaligned(&velocitiesy[i]);
b_type rvelz_i = b_type::load_unaligned(&velocitiesz[i]);
const b_type raccx_i = b_type::load_unaligned(&accelerationsx[i]);
const b_type raccy_i = b_type::load_unaligned(&accelerationsy[i]);
const b_type raccz_i = b_type::load_unaligned(&accelerationsz[i]);

rvelx_i += raccx_i * 2.0f;
rvely_i += raccy_i * 2.0f;
rvelz_i += raccz_i * 2.0f;

rposx_i += rvelx_i * 0.1f;
rposy_i += rvely_i * 0.1f;
rposz_i += rvelz_i * 0.1f;

rvelx_i.store_unaligned(&velocitiesx[i]);
rvely_i.store_unaligned(&velocitiesy[i]);
rvelz_i.store_unaligned(&velocitiesz[i]);

rposx_i.store_unaligned(&particles.x[i]);
rposy_i.store_unaligned(&particles.y[i]);
rposz_i.store_unaligned(&particles.z[i]);
}
// Position-only integration: pos += vel * 0.1f (no OpenMP pragma here).
for (int i = 0; i < n_particles; i += b_type::size)
{
b_type rposx_i = b_type::load_unaligned(&particles.x[i]);
b_type rposy_i = b_type::load_unaligned(&particles.y[i]);
b_type rposz_i = b_type::load_unaligned(&particles.z[i]);

const b_type rvelx_i = b_type::load_unaligned(&velocitiesx[i]);
const b_type rvely_i = b_type::load_unaligned(&velocitiesy[i]);
const b_type rvelz_i = b_type::load_unaligned(&velocitiesz[i]);


rposx_i += rvelx_i * 0.1f;
rposy_i += rvely_i * 0.1f;
rposz_i += rvelz_i * 0.1f;

rposx_i.store_unaligned(&particles.x[i]);
rposy_i.store_unaligned(&particles.y[i]);
rposz_i.store_unaligned(&particles.z[i]);
}
}

#endif // GALAX_MODEL_CPU_FAST
7 changes: 4 additions & 3 deletions src/Model/Model_GPU/Model_GPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ inline bool cuda_memcpy(void * dst, const void * src, size_t count, enum cudaMem
return true;
}

void update_position_gpu(float4* positionsGPU, float3* velocitiesGPU, float3* accelerationsGPU, int n_particles)
void update_position_gpu(float4* positionsGPU, float3* velocitiesGPU, float4* accelerationsGPU, int n_particles)
{
update_position_cu(positionsGPU, velocitiesGPU, accelerationsGPU, n_particles);
cudaError_t cudaStatus;
Expand Down Expand Up @@ -68,16 +68,17 @@ ::Model_GPU(const Initstate& initstate, Particles& particles)
accelerationsf3[i].x = 0;
accelerationsf3[i].y = 0;
accelerationsf3[i].z = 0;
accelerationsf3[i].w = 0;
}

cuda_malloc((void**)&positionsGPU, n_particles * sizeof(float4));
cuda_malloc((void**)&velocitiesGPU, n_particles * sizeof(float3));
cuda_malloc((void**)&accelerationsGPU, n_particles * sizeof(float3));
cuda_malloc((void**)&accelerationsGPU, n_particles * sizeof(float4));


cuda_memcpy(positionsGPU, positionsf3.data() , n_particles * sizeof(float4), cudaMemcpyHostToDevice);
cuda_memcpy(velocitiesGPU, velocitiesf3.data() , n_particles * sizeof(float3), cudaMemcpyHostToDevice);
cuda_memcpy(accelerationsGPU, accelerationsf3.data() , n_particles * sizeof(float3), cudaMemcpyHostToDevice);
cuda_memcpy(accelerationsGPU, accelerationsf3.data() , n_particles * sizeof(float4), cudaMemcpyHostToDevice);

}

Expand Down
4 changes: 2 additions & 2 deletions src/Model/Model_GPU/Model_GPU.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ class Model_GPU : public Model

std::vector<float4> positionsf3 ;
std::vector<float3> velocitiesf3 ;
std::vector<float3> accelerationsf3;
std::vector<float4> accelerationsf3;
std::vector<float> massesf;

float4* positionsGPU;
float3* velocitiesGPU;
float3* accelerationsGPU;
float4* accelerationsGPU;

public:
Model_GPU(const Initstate& initstate, Particles& particles);
Expand Down
Loading

0 comments on commit a9c796b

Please sign in to comment.