[Getfem-commits] [getfem-commits] branch master updated: Some more BLAS

getfem-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Getfem-commits] [getfem-commits] branch master updated: Some more BLAS

From:	Konstantinos Poulios
Subject:	[Getfem-commits] [getfem-commits] branch master updated: Some more BLAS alternative implementations in ga_instructions and fixes for the previous commit
Date:	Wed, 06 Dec 2023 14:57:40 -0500
This is an automated email from the git hooks/post-receive script.

logari81 pushed a commit to branch master
in repository getfem.

The following commit(s) were added to refs/heads/master by this push:
     new a2054d89 Some more BLAS alternative implementations in ga_instructions 
and fixes for the previous commit
a2054d89 is described below

commit a2054d896bd8da7e877e03376154b1a0bc890b88
Author: Konstantinos Poulios <logari81@gmail.com>
AuthorDate: Wed Dec 6 20:57:10 2023 +0100

    Some more BLAS alternative implementations in ga_instructions and fixes for 
the previous commit
---
 src/getfem_generic_assembly_compile_and_exec.cc | 291 ++++++++++++++++--------
 1 file changed, 191 insertions(+), 100 deletions(-)

diff --git a/src/getfem_generic_assembly_compile_and_exec.cc 
b/src/getfem_generic_assembly_compile_and_exec.cc
index 191e79a1..d6c398d1 100644
--- a/src/getfem_generic_assembly_compile_and_exec.cc
+++ b/src/getfem_generic_assembly_compile_and_exec.cc
@@ -2455,8 +2455,8 @@ namespace getfem {
 #if defined(GA_USES_BLAS)
       if (M*J*K > 27) {
         const BLAS_INT M_=BLAS_INT(M), J_=BLAS_INT(J), K_=BLAS_INT(K);
-        char notrans = 'N';
-        static const scalar_type one(1), zero(0);
+        constexpr char notrans = 'N';
+        constexpr scalar_type one(1), zero(0);
         gmm::dgemm_(&notrans, &notrans, &M_, &K_, &J_, &one,
                     &(tc1[0]), &M_, &(tc2[0]), &J_, &zero, &(t[0]), &M_);
       } else
@@ -2507,6 +2507,20 @@ namespace getfem {
                     "(dot product or matrix multiplication)");
       const size_type MI = tc1.size() / J,    M = MI / I,
                       NJ = tc2.size() / K,    N = NJ / J;
+#if defined(GA_USES_BLAS)
+      const BLAS_INT J_ = BLAS_INT(J), M_ = BLAS_INT(M), N_ = BLAS_INT(N),
+                     MI_ = BLAS_INT(MI);
+      constexpr char notrans = 'N', trans = 'T';
+      constexpr scalar_type one(1), zero(0);
+      size_type MN = M*N;
+      auto it = t.begin();
+      for (size_type k = 0; k < K; ++k)
+        for (size_type i = 0; i < I; ++i, it += MN)  // => t[M*N*(i+I*k)]
+          gmm::dgemm_(&notrans, &trans, &M_, &N_, &J_, &one,
+                      &(tc1[M*i]), &MI_, &(tc2[NJ*k]), &N_, &zero,
+                      &(*it), &M_);
+      GA_DEBUG_ASSERT(it == t.end(), "Wrong sizes");
+#else
       auto it = t.begin();
       for (size_type k = 0; k < K; ++k)
         for (size_type i = 0; i < I; ++i)
@@ -2517,6 +2531,7 @@ namespace getfem {
                 *it += tc1[m+M*i+MI*j] * tc2[n+N*j+NJ*k];
             }
       GA_DEBUG_ASSERT(it == t.end(), "Wrong sizes");
+#endif
       return 0;
     }
     ga_instruction_matrix_mult_spec(base_tensor &t_,
@@ -2537,6 +2552,18 @@ namespace getfem {
                     "(dot product or matrix multiplication)");
       const size_type MI = tc1.size() / J,
                       NJ = tc2.size() / K,    N = NJ / J;
+#if defined(GA_USES_BLAS)
+      const BLAS_INT J_ = BLAS_INT(J), MI_ = BLAS_INT(MI), N_ = BLAS_INT(N);
+      constexpr char notrans = 'N', trans = 'T';
+      constexpr scalar_type one(1), zero(0);
+      size_type NMI = N*MI;
+      auto it = t.begin();
+      for (size_type k = 0; k < K; ++k, it += NMI) // => it[N*M*I*k]
+        gmm::dgemm_(&notrans, &trans, &N_, &MI_, &J_, &one,
+                    &(tc2[NJ*k]), &N_, &(tc1[0]), &MI_, &zero,
+                    &(*it), &N_);
+      GA_DEBUG_ASSERT(it == t.end(), "Wrong sizes");
+#else
       auto it = t.begin();
       for (size_type k = 0; k < K; ++k)
         for (size_type mi = 0; mi < MI; ++mi)
@@ -2546,6 +2573,7 @@ namespace getfem {
               *it += tc1[mi+MI*j] * tc2[n+N*j+NJ*k];
           }
       GA_DEBUG_ASSERT(it == t.end(), "Wrong sizes");
+#endif
       return 0;
     }
     ga_instruction_matrix_mult_spec2(base_tensor &t_,
@@ -2924,6 +2952,88 @@ namespace getfem {
 
 
 
+  template<int I> inline void dax__(base_tensor::iterator &it,
+                                    base_tensor::const_iterator &itx,
+                                    const scalar_type &a) {
+    constexpr int I1 = I/8;
+    constexpr int I2 = I - I1*8;
+    for (int i=0; i < I1; ++i)
+      dax__<8>(it, itx , a);
+    dax__<I2>(it, itx , a);
+  }
+  template<> inline void dax__<8>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+  }
+  template<> inline void dax__<7>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+  }
+  template<> inline void dax__<6>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+  }
+  template<> inline void dax__<5>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+  }
+  template<> inline void dax__<4>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+  }
+  template<> inline void dax__<3>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+  }
+  template<> inline void dax__<2>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+  }
+  template<> inline void dax__<1>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+  }
+  template<> inline void dax__<0>(base_tensor::iterator &,
+                                  base_tensor::const_iterator &,
+                                  const scalar_type &) {}
+
+
   template<int I> inline
   void reduc_elem_unrolled__(base_tensor::iterator &it,
                              base_tensor::const_iterator &it1, 
base_tensor::const_iterator &it2,
@@ -3046,6 +3156,61 @@ namespace getfem {
       : t(t_), tc1(tc1_), tc2(tc2_) {}
   };
 
+  // Performs An Bm -> Cmn. Unrolled operation.
+  template<>
+  struct ga_instruction_contraction_unrolled<1> : public ga_instruction {
+    base_tensor &t;
+    const base_tensor &tc1, &tc2;
+    virtual int exec() {
+      GA_DEBUG_INFO("Instruction: unrolled contraction operation of size 1");
+      size_type N = tc1.size(), M = tc2.size();
+      GA_DEBUG_ASSERT(t.size() == N*M, "Internal error, " << t.size()
+                      << " != " << N << "*" << M);
+
+      base_tensor::iterator it = t.begin();
+      base_tensor::const_iterator it1 = tc1.cbegin();
+      switch(M) {
+      case(1):
+        for (size_type n = 0; n < N; ++n, ++it1)
+          *it++ = tc2[0] * (*it1);
+        break;
+      case(2):
+        for (size_type n = 0; n < N; ++n, ++it1) {
+          base_tensor::const_iterator it2 = tc2.cbegin();
+          dax__<2>(it, it2, *it1);
+        }
+        break;
+      case(3):
+        for (size_type n = 0; n < N; ++n, ++it1) {
+          base_tensor::const_iterator it2 = tc2.cbegin();
+          dax__<4>(it, it2, *it1);
+        }
+        break;
+      case(4):
+        for (size_type n = 0; n < N; ++n, ++it1) {
+          base_tensor::const_iterator it2 = tc2.cbegin();
+          dax__<4>(it, it2, *it1);
+        }
+        break;
+      default:
+        const int M1 = int(M)/4;
+        const int M2 = int(M) - M1*4;
+        for (size_type n = 0; n < N; ++n, ++it1) {
+          base_tensor::const_iterator it2 = tc2.cbegin();
+          for (int mm=0; mm < M1; ++mm)
+            dax__<4>(it, it2, *it1);
+          for (int mm=0; mm < M2; ++mm)
+            *it++ = (*it2++) * (*it1);
+        }
+      }
+      return 0;
+    }
+    ga_instruction_contraction_unrolled(base_tensor &t_,
+                                        const base_tensor &tc1_,
+                                        const base_tensor &tc2_)
+      : t(t_), tc1(tc1_), tc2(tc2_) {}
+  };
+
   template<int N, int S2>
   inline void reduc_elem_d_unrolled__(base_tensor::iterator &it,
                                       base_tensor::const_iterator &it1,
@@ -3310,7 +3475,8 @@ namespace getfem {
     }
 
     switch(n) {
-    case  1 : GMM_ASSERT1(false, "n=1 should not happen");
+    case  1 : return std::make_shared<ga_instruction_contraction_unrolled< 1>>
+                     (t, tc1, tc2);
     case  2 : return std::make_shared<ga_instruction_contraction_unrolled< 2>>
                      (t, tc1, tc2);
     case  3 : return std::make_shared<ga_instruction_contraction_unrolled< 3>>
@@ -3673,86 +3839,6 @@ namespace getfem {
       : t(t_), tc1(tc1_), tc2(tc2_) {}
   };
 
-  template<int I> inline void dax__(base_tensor::iterator &it,
-                                    base_tensor::const_iterator &itx,
-                                    const scalar_type &a) {
-    constexpr int I1 = I/8;
-    constexpr int I2 = I - I1*8;
-    for (int i=0; i < I1; ++i)
-      dax__<8>(it, itx , a);
-    dax__<I2>(it, itx , a);
-  }
-  template<> inline void dax__<8>(base_tensor::iterator &it,
-                                  base_tensor::const_iterator &itx,
-                                  const scalar_type &a) {
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-  }
-  template<> inline void dax__<7>(base_tensor::iterator &it,
-                                  base_tensor::const_iterator &itx,
-                                  const scalar_type &a) {
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-  }
-  template<> inline void dax__<6>(base_tensor::iterator &it,
-                                  base_tensor::const_iterator &itx,
-                                  const scalar_type &a) {
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-  }
-  template<> inline void dax__<5>(base_tensor::iterator &it,
-                                  base_tensor::const_iterator &itx,
-                                  const scalar_type &a) {
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-  }
-  template<> inline void dax__<4>(base_tensor::iterator &it,
-                                  base_tensor::const_iterator &itx,
-                                  const scalar_type &a) {
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-  }
-  template<> inline void dax__<3>(base_tensor::iterator &it,
-                                  base_tensor::const_iterator &itx,
-                                  const scalar_type &a) {
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-  }
-  template<> inline void dax__<2>(base_tensor::iterator &it,
-                                  base_tensor::const_iterator &itx,
-                                  const scalar_type &a) {
-    *it++ = *itx++ * a;
-    *it++ = *itx++ * a;
-  }
-  template<> inline void dax__<1>(base_tensor::iterator &it,
-                                  base_tensor::const_iterator &itx,
-                                  const scalar_type &a) {
-    *it++ = *itx++ * a;
-  }
-  template<> inline void dax__<0>(base_tensor::iterator &,
-                                  base_tensor::const_iterator &,
-                                  const scalar_type &) {}
 
   // Performs Aij Bkl -> Cijkl, partially unrolled version
   template<int IJ> struct ga_instruction_simple_tmult_unrolled
@@ -3842,12 +3928,17 @@ namespace getfem {
       auto it = t.begin();
 #if 1 // there could be a smarter way to implement this, but this hardcoded 
version is fast and robust
       switch(M) {
-      case 1: GMM_ASSERT1(false, "M=1 should not happen");
+      case 1:
+        for (size_type j = 0; j < J; ++j)
+          for (auto it1 = tc1.cbegin(); it1 != tc1.end(); ++it1)
+            for (size_type n = 0; n < N; ++n)
+              *it++ = (*it1) * tc2[n+N*j];
+        break;
       case 2:
         for (size_type j = 0; j < J; ++j)
           for (size_type i = 0; i < I; ++i)
             for (size_type n = 0; n < N; ++n) {
-              auto it1 = tc1.begin() + M*i;
+              auto it1 = tc1.cbegin() + M*i;
               dax__<2>(it, it1, tc2[n+N*j]);
             }
         break;
@@ -3855,7 +3946,7 @@ namespace getfem {
         for (size_type j = 0; j < J; ++j)
           for (size_type i = 0; i < I; ++i)
             for (size_type n = 0; n < N; ++n) {
-              auto it1 = tc1.begin() + M*i;
+              auto it1 = tc1.cbegin() + M*i;
               dax__<3>(it, it1, tc2[n+N*j]);
             }
         break;
@@ -3863,7 +3954,7 @@ namespace getfem {
         for (size_type j = 0; j < J; ++j)
           for (size_type i = 0; i < I; ++i)
             for (size_type n = 0; n < N; ++n) {
-              auto it1 = tc1.begin() + M*i;
+              auto it1 = tc1.cbegin() + M*i;
               dax__<4>(it, it1, tc2[n+N*j]);
             }
         break;
@@ -3871,7 +3962,7 @@ namespace getfem {
         for (size_type j = 0; j < J; ++j)
           for (size_type i = 0; i < I; ++i)
             for (size_type n = 0; n < N; ++n) {
-              auto it1 = tc1.begin() + M*i;
+              auto it1 = tc1.cbegin() + M*i;
               dax__<5>(it, it1, tc2[n+N*j]);
             }
         break;
@@ -3879,7 +3970,7 @@ namespace getfem {
         for (size_type j = 0; j < J; ++j)
           for (size_type i = 0; i < I; ++i)
             for (size_type n = 0; n < N; ++n) {
-              auto it1 = tc1.begin() + M*i;
+              auto it1 = tc1.cbegin() + M*i;
               dax__<6>(it, it1, tc2[n+N*j]);
             }
         break;
@@ -3887,7 +3978,7 @@ namespace getfem {
         for (size_type j = 0; j < J; ++j)
           for (size_type i = 0; i < I; ++i)
             for (size_type n = 0; n < N; ++n) {
-              auto it1 = tc1.begin() + M*i;
+              auto it1 = tc1.cbegin() + M*i;
               dax__<7>(it, it1, tc2[n+N*j]);
             }
         break;
@@ -3895,7 +3986,7 @@ namespace getfem {
         for (size_type j = 0; j < J; ++j)
           for (size_type i = 0; i < I; ++i)
             for (size_type n = 0; n < N; ++n) {
-              auto it1 = tc1.begin() + M*i;
+              auto it1 = tc1.cbegin() + M*i;
               dax__<8>(it, it1, tc2[n+N*j]);
             }
         break;
@@ -3907,7 +3998,7 @@ namespace getfem {
           for (size_type j = 0; j < J; ++j)
             for (size_type i = 0; i < I; ++i)
               for (size_type n = 0; n < N; ++n) {
-                auto it1 = tc1.begin() + M*i;
+                auto it1 = tc1.cbegin() + M*i;
                 for (int mm=0; mm < M1; ++mm)
                   dax__<8>(it, it1, tc2[n+N*j]);
               }
@@ -3916,7 +4007,7 @@ namespace getfem {
           for (size_type j = 0; j < J; ++j)
             for (size_type i = 0; i < I; ++i)
               for (size_type n = 0; n < N; ++n) {
-                auto it1 = tc1.begin() + M*i;
+                auto it1 = tc1.cbegin() + M*i;
                 for (int mm=0; mm < M1; ++mm)
                   dax__<8>(it, it1, tc2[n+N*j]);
                 dax__<1>(it, it1, tc2[n+N*j]);
@@ -3926,7 +4017,7 @@ namespace getfem {
           for (size_type j = 0; j < J; ++j)
             for (size_type i = 0; i < I; ++i)
               for (size_type n = 0; n < N; ++n) {
-                auto it1 = tc1.begin() + M*i;
+                auto it1 = tc1.cbegin() + M*i;
                 for (int mm=0; mm < M1; ++mm)
                   dax__<8>(it, it1, tc2[n+N*j]);
                 dax__<2>(it, it1, tc2[n+N*j]);
@@ -3936,7 +4027,7 @@ namespace getfem {
           for (size_type j = 0; j < J; ++j)
             for (size_type i = 0; i < I; ++i)
               for (size_type n = 0; n < N; ++n) {
-                auto it1 = tc1.begin() + M*i;
+                auto it1 = tc1.cbegin() + M*i;
                 for (int mm=0; mm < M1; ++mm)
                   dax__<8>(it, it1, tc2[n+N*j]);
                 dax__<3>(it, it1, tc2[n+N*j]);
@@ -3946,7 +4037,7 @@ namespace getfem {
           for (size_type j = 0; j < J; ++j)
             for (size_type i = 0; i < I; ++i)
               for (size_type n = 0; n < N; ++n) {
-                auto it1 = tc1.begin() + M*i;
+                auto it1 = tc1.cbegin() + M*i;
                 for (int mm=0; mm < M1; ++mm)
                   dax__<8>(it, it1, tc2[n+N*j]);
                 dax__<4>(it, it1, tc2[n+N*j]);
@@ -3956,7 +4047,7 @@ namespace getfem {
           for (size_type j = 0; j < J; ++j)
             for (size_type i = 0; i < I; ++i)
               for (size_type n = 0; n < N; ++n) {
-                auto it1 = tc1.begin() + M*i;
+                auto it1 = tc1.cbegin() + M*i;
                 for (int mm=0; mm < M1; ++mm)
                   dax__<8>(it, it1, tc2[n+N*j]);
                 dax__<5>(it, it1, tc2[n+N*j]);
@@ -3966,7 +4057,7 @@ namespace getfem {
           for (size_type j = 0; j < J; ++j)
             for (size_type i = 0; i < I; ++i)
               for (size_type n = 0; n < N; ++n) {
-                auto it1 = tc1.begin() + M*i;
+                auto it1 = tc1.cbegin() + M*i;
                 for (int mm=0; mm < M1; ++mm)
                   dax__<8>(it, it1, tc2[n+N*j]);
                 dax__<6>(it, it1, tc2[n+N*j]);
@@ -3976,14 +4067,14 @@ namespace getfem {
           for (size_type j = 0; j < J; ++j)
             for (size_type i = 0; i < I; ++i)
               for (size_type n = 0; n < N; ++n) {
-                auto it1 = tc1.begin() + M*i;
+                auto it1 = tc1.cbegin() + M*i;
                 for (int mm=0; mm < M1; ++mm)
                   dax__<8>(it, it1, tc2[n+N*j]);
                 dax__<7>(it, it1, tc2[n+N*j]);
               }
           break;
         default:
-          GMM_ASSERT1(false, "M=1 should not happen");
+          GMM_ASSERT1(false, "should not happen");
         }
       }
       GA_DEBUG_ASSERT(it == t.end(), "Wrong sizes");
[Prev in Thread]
Current Thread
[Next in Thread]
[Getfem-commits] [getfem-commits] branch master updated: Some more BLAS alternative implementations in ga_instructions and fixes for the previous commit, Konstantinos Poulios <=