Fix the bug in using the NEON instruction fmla for data type half

commit: d194167149a2f18d51ea07ed6e87d2d51744c537 [log] [tgz]
author: Lianhuang Li <lilianhuang@bytedance.com> Thu Dec 01 17:28:57 2022 +0000
committer: Antonio Sánchez <cantonios@google.com> Thu Dec 01 17:28:57 2022 +0000
tree: a9d2e0317c1091ba90d783ead16ef2a892d1eb8a
parent: 31ab62d347824489a048c3423ea586625612013f [diff]
diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
index e49e394..00bdb9b 100644
--- a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h

@@ -183,7 +183,11 @@
   }
 };
 
-#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+// The register at operand 3 of fmla for data type half must be v0~v15, but the compiler may
+// not allocate a suitable register for the '%2' of inline asm 'fmla %0.8h, %1.8h, %2.h[id]'.
+// Inline assembly therefore can't be used here to avoid the gcc problem that vfmaq_lane_f16
+// is implemented through a costly dup.
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
 
 template<>
 struct gebp_traits <half,half,false,false,Architecture::NEON>
@@ -240,19 +244,10 @@
   template<int LaneID>
   EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
   {
-    #if EIGEN_COMP_GNUC_STRICT
-    // 1. vfmaq_lane_f16 is implemented through a costly dup
-    // 2. workaround the gcc register split problem on arm64-neon
-         if(LaneID==0)  asm("fmla %0.8h, %1.8h, %2.h[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==1)  asm("fmla %0.8h, %1.8h, %2.h[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==2)  asm("fmla %0.8h, %1.8h, %2.h[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==3)  asm("fmla %0.8h, %1.8h, %2.h[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    #else
     c = vfmaq_lane_f16(c, a, b, LaneID);
-    #endif
   }
 };
-#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
 #endif // EIGEN_ARCH_ARM64
 
 }  // namespace internal
commit	d194167149a2f18d51ea07ed6e87d2d51744c537	[log] [tgz]
author	Lianhuang Li <lilianhuang@bytedance.com>	Thu Dec 01 17:28:57 2022 +0000
committer	Antonio Sánchez <cantonios@google.com>	Thu Dec 01 17:28:57 2022 +0000
tree	a9d2e0317c1091ba90d783ead16ef2a892d1eb8a
parent	31ab62d347824489a048c3423ea586625612013f [diff]