Skip to content
This repository has been archived by the owner on Dec 5, 2023. It is now read-only.

Commit

Permalink
version finale
Browse files Browse the repository at this point in the history
  • Loading branch information
[jérémy morlier] committed Mar 24, 2023
1 parent ce4a6fe commit a9c796b
Show file tree
Hide file tree
Showing 6 changed files with 222 additions and 197 deletions.
2 changes: 1 addition & 1 deletion src/Display/Display_SDL2/Display_SDL2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ ::update(bool& done)
for (int i = 0; i < particles.x.size(); i++)
{
glBegin (GL_POINTS);
glColor3f (1.0f, 1.0f, 1.0f);
glColor3f (0.0f, 1.0f, 1.0f);
glVertex3f(particles.x[i], particles.y[i], particles.z[i]);
glEnd();
}
Expand Down
313 changes: 161 additions & 152 deletions src/Model/Model_CPU/Model_CPU_fast/Model_CPU_fast.cpp
Original file line number Diff line number Diff line change
@@ -1,71 +1,5 @@
// #ifdef GALAX_MODEL_CPU_FAST

// #include <cmath>

// #include "Model_CPU_fast.hpp"

// #include <xsimd/xsimd.hpp>
// #include <omp.h>

// namespace xs = xsimd;
// using b_type = xs::batch<float, xs::avx2>;

// Model_CPU_fast
// ::Model_CPU_fast(const Initstate& initstate, Particles& particles)
// : Model_CPU(initstate, particles)
// {
// }

// void Model_CPU_fast
// ::step()
// {
// std::fill(accelerationsx.begin(), accelerationsx.end(), 0);
// std::fill(accelerationsy.begin(), accelerationsy.end(), 0);
// std::fill(accelerationsz.begin(), accelerationsz.end(), 0);

// // // OMP version
// // #pragma omp parallel for collapse(2)
// // for (int i = 0; i < n_particles; i ++)
// // {
// // for (int j = 0; j < n_particles; j++)
// // {
// // if(i != j)
// // {
// // const float diffx = particles.x[j] - particles.x[i];
// // const float diffy = particles.y[j] - particles.y[i];
// // const float diffz = particles.z[j] - particles.z[i];

// // float dij = diffx * diffx + diffy * diffy + diffz * diffz;

// // if (dij < 1.0)  // NOTE(review): stray "b_type::size" scrape garble removed; the branch body (presumably a clamp such as dij = 10.0) was lost

// // accelerationsx[i] += diffx * dij * initstate.masses[j];
// // accelerationsy[i] += diffy * dij * initstate.masses[j];
// // accelerationsz[i] += diffz * dij * initstate.masses[j];
// // }
// // }
// // }
// // #pragma omp parallel for
// // for (int i = 0; i < n_particles; i++)
// // {
// // velocitiesx[i] += accelerationsx[i] * 2.0f;
// // velocitiesy[i] += accelerationsy[i] * 2.0f;
// // velocitiesz[i] += accelerationsz[i] * 2.0f;
// // particles.x[i] += velocitiesx [i] * 0.1f;
// // particles.y[i] += velocitiesy [i] * 0.1f;
// // particles.z[i] += velocitiesz [i] * 0.1f;
// // }


// // OMP + xsimd version
// // std::size_t all_size = accelerationsx.size();
// // std::size_t vec_size = all_size - all_size % b_type::size;

// }

//#endif // GALAX_MODEL_CPU_FAST
#ifdef GALAX_MODEL_CPU_FAST

#include <iostream>
#include <cmath>

#include "Model_CPU_fast.hpp"
Expand All @@ -76,98 +10,173 @@
namespace xs = xsimd;
using b_type = xs::batch<float, xs::avx2>;

// Constructor: forwards the initial state and the particle storage to the
// Model_CPU base class; no extra state is set up here.
// NOTE(review): this text comes from a flattened diff view — BOTH the old
// split-line signature and the new one-line signature appear below; only one
// of them exists in the real file. Reconcile before compiling.
Model_CPU_fast
::Model_CPU_fast(const Initstate& initstate, Particles& particles)
: Model_CPU(initstate, particles)
Model_CPU_fast ::Model_CPU_fast(const Initstate &initstate, Particles &particles)
: Model_CPU(initstate, particles)
{
}

void Model_CPU_fast
::step()
// Compile-time index generator for xsimd::make_batch_constant: entry i of the
// resulting constant batch is (i - 1) modulo n, i.e. a rotate-down-by-one lane
// pattern used to cycle SIMD registers through every lane pairing.
struct Rot
{
    // Source lane for destination lane `i` in an `n`-lane batch.
    static constexpr unsigned get(unsigned i, unsigned n)
    {
        const unsigned shifted = i + n - 1u; // n - 1 == -1 (mod n), avoids signed math
        return shifted % n;
    }
};

constexpr auto mask = xsimd::make_batch_constant<xsimd::batch<uint32_t, xsimd::avx2>, Rot>();

// Advance the N-body simulation by one step (xsimd AVX2 batches + OpenMP).
// NOTE(review): this block was recovered from a flattened diff view, so it
// appears to contain BOTH the removed and the added implementations in
// sequence (e.g. `tens_batch` is declared twice, and positions are advanced by
// two separate loops near the end). Reconcile against the real file before
// trusting any one path below.
void Model_CPU_fast ::step()
{
// Zero the per-body acceleration accumulators for this step.
std::fill(accelerationsx.begin(), accelerationsx.end(), 0);
std::fill(accelerationsy.begin(), accelerationsy.end(), 0);
std::fill(accelerationsz.begin(), accelerationsz.end(), 0);



// OMP + xsimd version
// std::size_t all_size = accelerationsx.size();
// std::size_t vec_size = all_size - all_size % b_type::size;
// NOTE(review): the i/j loop below looks like the superseded implementation
// kept by the diff view: it accumulates accelerations with a force kernel
// clamped to 10 when the squared distance is below 1.
const b_type ones_batch = 1.0;
const b_type tens_batch = 10.0;
//#pragma omp parallel for
for (int i = 0; i < n_particles; i += b_type::size)
{
// load registers body i
const b_type rposx_i = b_type::load_unaligned(&particles.x[i]);
const b_type rposy_i = b_type::load_unaligned(&particles.y[i]);
const b_type rposz_i = b_type::load_unaligned(&particles.z[i]);
b_type raccx_i = b_type::load_unaligned(&accelerationsx[i]);
b_type raccy_i = b_type::load_unaligned(&accelerationsy[i]);
b_type raccz_i = b_type::load_unaligned(&accelerationsz[i]);
// #pragma omp parallel for
for (int j = 0; j < n_particles; j += b_type::size)
{
//load registers body j
const b_type rposx_j = b_type::load_unaligned(&particles.x[j]);
const b_type rposy_j = b_type::load_unaligned(&particles.y[j]);
const b_type rposz_j = b_type::load_unaligned(&particles.z[j]);

const b_type rmasses = b_type::load_unaligned(&initstate.masses[j]);

const auto diffx = rposx_j - rposx_i;
const auto diffy = rposy_j - rposy_i;
const auto diffz = rposz_j - rposz_i;

// Lane-wise squared distance between bodies i and j.
auto dij = diffx * diffx + diffy * diffy + diffz * diffz;

// cond is true where dij < 1: close pairs get the clamped value instead.
auto cond = xs::gt(ones_batch, dij);

// 10 / d^3 softening. NOTE(review): `10.0` is a double literal mixed into
// float batch math, and sqrt(dij) is computed three times — something like
// `10.0f / (dij * xs::sqrt(dij))` was probably intended.
dij = xs::select(cond, tens_batch, 10.0/(xs::sqrt(dij)*xs::sqrt(dij)*xs::sqrt(dij)));

raccx_i += diffx * dij * rmasses;
raccy_i += diffy * dij * rmasses;
raccz_i += diffz * dij * rmasses;

// NOTE(review): these stores sit inside the j loop; one store after the
// loop would suffice and avoid redundant memory traffic.
raccx_i.store_unaligned(&accelerationsx[i]);
raccy_i.store_unaligned(&accelerationsy[i]);
raccz_i.store_unaligned(&accelerationsz[i]);

}

}
// std::fill(accelerationsx.begin(), accelerationsx.end(), 0);
// std::fill(accelerationsy.begin(), accelerationsy.end(), 0);
// std::fill(accelerationsz.begin(), accelerationsz.end(), 0);

//OMP + xsimd version

const b_type zeros_batch = 0.0;
// NOTE(review): duplicate of the `tens_batch` declared above — a flattened
// diff artifact; the real file presumably declares it only once.
const b_type tens_batch = 10.0;

// NOTE(review): the commented-out region below is corrupted scrape text
// (debug prints, a pasted screenshot file name, truncated statements); it is
// kept verbatim and should not be taken as a coherent earlier version.
// #pragma omp parallel for
// for (int i = 0; i < n_particles; i += b_type::size)
// {
// // load registers body i
// const b_type rposx_i = b_type::load_unaligned(&particles.x[i]);
// const b_type rposy_i = b_type::load_unaligned(&particles.y[i]);
// const b_type rposz_i = b_type::load_unaligned(&particles.z[i]);
// // b_type raccx_i = b_type::load_unaligned(&accelerationsx[i]);
// // b_type raccy_i = b_type::load_unaligned(&accelerationsy[i]);
// // b_type racc // b_type test = b_type::load_unaligned(&particles.x[0]);
// for(int i = 0; i <b_type::size; i++ )
// {
// std::cout<<test.get(i) << " ";
// }Capture d’écran du 2023-03-24 11-57-00
// std::cout<< std::endl;
// xsimd::swizzle(test, mask);
// for(int i = 0; i <b_type::size; i++ )
// {
// std::cout<<test.get(i) << " ";
// }
// std::cout<< std::endl;ype::load_unaligned(&velocitiesy[i]);
// b_type rvelz_i = b_type::load_unaligned(&velocitiesz[i]);
// //#pragma omp parallel forrmasses_j
// for (int j = 0; j < test = ters body j
// const b_type rposx_j = particles.x[j];
// const b_type rposy_j = particles.y[j];
// const b_type rposz_j = particles.z[j];
// const b_type rmasses = initstate.masses[j];
// b_type test = b_type::load_unaligned(&particles.x[0]);
// for(int i = 0; i <b_type::size; i++ )
// {
// std::cout<<test.get(i) << " ";
// }
// std::cout<< std::endl;
// xsimd::swizzle(test, mask);
// for(int i = 0; i <b_type::size; i++ )rmasses_j
// std::cout<< std::endl;/ (dij_s * dij_s * dij_s);
// dij = 10 / (xs::sqrt(dij) * dij);
// dij = xs::min(dij, tens_batch);

// //dij = xs::select(cond, zeros_batch, dij);
// dij = dij * rmasses * 2.0f;

// rvelx_i += diffx * dij;
// rvely_i += diffy * dij;
// rvelz_i += diffz * dij;
// }
// // raccx_i.store_unaligned(&accelerationsx[i]);
// // raccy_i.store_unaligned(&accelerationsy[i]);
// // raccz_i.store_unaligned(&accelerationsz[i]);

// rvelx_i.store_unaligned(&velocitiesx[i]);
// rvely_i.store_unaligned(&velocitiesy[i]);
// rvelz_i.store_unaligned(&velocitiesz[i]);
// }
// Pairwise pass exploiting symmetry: each (i, j >= i) block pair is visited
// once, and both blocks' velocities receive their contributions directly
// (accelerations are bypassed in this path).
#pragma omp parallel for
for (int i = 0; i < n_particles; i += b_type::size)
{
// load registers body i
const b_type rposx_i = b_type::load_unaligned(&particles.x[i]);
const b_type rposy_i = b_type::load_unaligned(&particles.y[i]);
const b_type rposz_i = b_type::load_unaligned(&particles.z[i]);
b_type rvelx_i = b_type::load_unaligned(&velocitiesx[i]);
b_type rvely_i = b_type::load_unaligned(&velocitiesy[i]);
b_type rvelz_i = b_type::load_unaligned(&velocitiesz[i]);
const b_type rmasses_i = b_type::load_unaligned(&initstate.masses[i]);

// NOTE(review): a second `parallel for` nested inside an already-parallel i
// loop; moreover, different i iterations read-modify-write velocities[j]
// for overlapping j ranges, which looks like a data race under the outer
// parallel for. TODO confirm intent / add reduction or privatization.
#pragma omp parallel for
for (int j = i; j < n_particles; j += b_type::size)
{
b_type rvelx_j = b_type::load_unaligned(&velocitiesx[j]);
b_type rvely_j = b_type::load_unaligned(&velocitiesy[j]);
b_type rvelz_j = b_type::load_unaligned(&velocitiesz[j]);
b_type rposx_j = b_type::load_unaligned(&particles.x[j]);
b_type rposy_j = b_type::load_unaligned(&particles.y[j]);
b_type rposz_j = b_type::load_unaligned(&particles.z[j]);

b_type rmasses_j = b_type::load_unaligned(&initstate.masses[j]);

// Rotate the j-side registers b_type::size times so that every i lane is
// paired with every j lane exactly once.
for (int k = 0; k < b_type::size; k++)
{

const b_type diffx = rposx_j - rposx_i;
const b_type diffy = rposy_j - rposy_i;
const b_type diffz = rposz_j - rposz_i;
b_type dij = diffx * diffx + diffy * diffy + diffz * diffz;

// Lanes where dij == 0 (a body paired with itself when j == i) must
// contribute no force; remembered here and zeroed after the kernel.
auto cond = xs::eq(zeros_batch, dij);

// Softened kernel 10 / d^3 (via 10 / (sqrt(d2) * d2)), clamped to 10.
dij = 10 / (xs::sqrt(dij) * dij);
dij = xs::min(dij, tens_batch);

dij = xs::select(cond, zeros_batch, dij);
// Contribution to block i; the 2.0f factor is presumably the timestep
// scaling otherwise applied via accelerations — TODO confirm.
auto dij_a = dij * rmasses_j * 2.0f;
rvelx_i += diffx * dij_a;
rvely_i += diffy * dij_a;
rvelz_i += diffz * dij_a;

// Newton's third law: equal and opposite contribution to block j.
auto dij_b = - dij * rmasses_i * 2.0f;
rvelx_j += diffx * dij_b;
rvely_j += diffy * dij_b;
rvelz_j += diffz * dij_b;

// Rotate every j-side register down one lane for the next k iteration.
rposx_j = xsimd::swizzle(rposx_j, mask);
rposy_j = xsimd::swizzle(rposy_j, mask);
rposz_j = xsimd::swizzle(rposz_j, mask);
rmasses_j = xsimd::swizzle(rmasses_j, mask);
rvelx_j = xsimd::swizzle(rvelx_j, mask);
rvely_j = xsimd::swizzle(rvely_j, mask);
rvelz_j = xsimd::swizzle(rvelz_j, mask);
}
rvelx_j.store_unaligned(&velocitiesx[j]);
rvely_j.store_unaligned(&velocitiesy[j]);
rvelz_j.store_unaligned(&velocitiesz[j]);
}

rvelx_i.store_unaligned(&velocitiesx[i]);
rvely_i.store_unaligned(&velocitiesy[i]);
rvelz_i.store_unaligned(&velocitiesz[i]);
}

// Integration: vel += acc * 2.0f, pos += vel * 0.1f.
// NOTE(review): in the active path above accelerations stay zero-filled, so
// the acc term is a no-op here; and the loop after this one advances the
// positions a second time — one of the two position updates is presumably a
// diff leftover. TODO confirm which loop the final file keeps.
#pragma omp parallel for
for (int i = 0; i < n_particles; i += b_type::size)
{
b_type rposx_i = b_type::load_unaligned(&particles.x[i]);
b_type rposy_i = b_type::load_unaligned(&particles.y[i]);
b_type rposz_i = b_type::load_unaligned(&particles.z[i]);
b_type rvelx_i = b_type::load_unaligned(&velocitiesx[i]);
b_type rvely_i = b_type::load_unaligned(&velocitiesy[i]);
b_type rvelz_i = b_type::load_unaligned(&velocitiesz[i]);
const b_type raccx_i = b_type::load_unaligned(&accelerationsx[i]);
const b_type raccy_i = b_type::load_unaligned(&accelerationsy[i]);
const b_type raccz_i = b_type::load_unaligned(&accelerationsz[i]);

rvelx_i += raccx_i * 2.0f;
rvely_i += raccy_i * 2.0f;
rvelz_i += raccz_i * 2.0f;

rposx_i += rvelx_i * 0.1f;
rposy_i += rvely_i * 0.1f;
rposz_i += rvelz_i * 0.1f;

rvelx_i.store_unaligned(&velocitiesx[i]);
rvely_i.store_unaligned(&velocitiesy[i]);
rvelz_i.store_unaligned(&velocitiesz[i]);

rposx_i.store_unaligned(&particles.x[i]);
rposy_i.store_unaligned(&particles.y[i]);
rposz_i.store_unaligned(&particles.z[i]);
}
// Position-only integration: pos += vel * 0.1f (no OpenMP pragma here).
for (int i = 0; i < n_particles; i += b_type::size)
{
b_type rposx_i = b_type::load_unaligned(&particles.x[i]);
b_type rposy_i = b_type::load_unaligned(&particles.y[i]);
b_type rposz_i = b_type::load_unaligned(&particles.z[i]);

const b_type rvelx_i = b_type::load_unaligned(&velocitiesx[i]);
const b_type rvely_i = b_type::load_unaligned(&velocitiesy[i]);
const b_type rvelz_i = b_type::load_unaligned(&velocitiesz[i]);


rposx_i += rvelx_i * 0.1f;
rposy_i += rvely_i * 0.1f;
rposz_i += rvelz_i * 0.1f;

rposx_i.store_unaligned(&particles.x[i]);
rposy_i.store_unaligned(&particles.y[i]);
rposz_i.store_unaligned(&particles.z[i]);
}
}

#endif // GALAX_MODEL_CPU_FAST
7 changes: 4 additions & 3 deletions src/Model/Model_GPU/Model_GPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ inline bool cuda_memcpy(void * dst, const void * src, size_t count, enum cudaMem
return true;
}

void update_position_gpu(float4* positionsGPU, float3* velocitiesGPU, float3* accelerationsGPU, int n_particles)
void update_position_gpu(float4* positionsGPU, float3* velocitiesGPU, float4* accelerationsGPU, int n_particles)
{
update_position_cu(positionsGPU, velocitiesGPU, accelerationsGPU, n_particles);
cudaError_t cudaStatus;
Expand Down Expand Up @@ -68,16 +68,17 @@ ::Model_GPU(const Initstate& initstate, Particles& particles)
accelerationsf3[i].x = 0;
accelerationsf3[i].y = 0;
accelerationsf3[i].z = 0;
accelerationsf3[i].w = 0;
}

cuda_malloc((void**)&positionsGPU, n_particles * sizeof(float4));
cuda_malloc((void**)&velocitiesGPU, n_particles * sizeof(float3));
cuda_malloc((void**)&accelerationsGPU, n_particles * sizeof(float3));
cuda_malloc((void**)&accelerationsGPU, n_particles * sizeof(float4));


cuda_memcpy(positionsGPU, positionsf3.data() , n_particles * sizeof(float4), cudaMemcpyHostToDevice);
cuda_memcpy(velocitiesGPU, velocitiesf3.data() , n_particles * sizeof(float3), cudaMemcpyHostToDevice);
cuda_memcpy(accelerationsGPU, accelerationsf3.data() , n_particles * sizeof(float3), cudaMemcpyHostToDevice);
cuda_memcpy(accelerationsGPU, accelerationsf3.data() , n_particles * sizeof(float4), cudaMemcpyHostToDevice);

}

Expand Down
4 changes: 2 additions & 2 deletions src/Model/Model_GPU/Model_GPU.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ class Model_GPU : public Model

std::vector<float4> positionsf3 ;
std::vector<float3> velocitiesf3 ;
std::vector<float3> accelerationsf3;
std::vector<float4> accelerationsf3;
std::vector<float> massesf;

float4* positionsGPU;
float3* velocitiesGPU;
float3* accelerationsGPU;
float4* accelerationsGPU;

public:
Model_GPU(const Initstate& initstate, Particles& particles);
Expand Down
Loading

0 comments on commit a9c796b

Please sign in to comment.