dev-cpp/eigen: properly guard VSX use

Fixes build on VSX-less ppc* hardware. Closes: https://bugs.gentoo.org/936107 Thanks-to: jonys <vidra.jonas@seznam.cz> Signed-off-by: Sam James <sam@gentoo.org>
2026-01-06 02:17:34 -08:00 · 2024-07-17 00:03:41 +01:00 · 2024-07-17 00:03:41 +01:00 · c7a9b7d8cd
commit c7a9b7d8cd
parent 2fb52613e7
2 changed files with 653 additions and 0 deletions
--- a/dev-cpp/eigen/eigen-3.4.0-r3.ebuild
+++ b/dev-cpp/eigen/eigen-3.4.0-r3.ebuild
@@ -0,0 +1,458 @@
+# Copyright 1999-2024 Gentoo Authors
+# Distributed under the terms of the GNU General Public License v2
+
+EAPI=8
+
+FORTRAN_NEEDED="test"
+inherit cmake cuda fortran-2 llvm toolchain-funcs
+
+DESCRIPTION="C++ template library for linear algebra"
+HOMEPAGE="https://eigen.tuxfamily.org/index.php?title=Main_Page"
+
+if [[ ${PV} = *9999* ]] ; then
+	inherit git-r3
+	EGIT_REPO_URI="https://gitlab.com/lib${PN}/${PN}.git"
+	if [[ ${PV} = 3.4.9999* ]] ; then
+		EGIT_COMMIT="3.4"
+	fi
+else
+	SRC_URI="
+		https://gitlab.com/lib${PN}/${PN}/-/archive/${PV}/${P}.tar.bz2
+		test? ( lapack? ( https://downloads.tuxfamily.org/${PN}/lapack_addons_3.4.1.tgz -> ${PN}-lapack_addons-3.4.1.tgz ) )
+	"
+	KEYWORDS="~alpha ~amd64 ~arm arm64 ~hppa ~ia64 ~loong ~ppc ~ppc64 ~riscv ~s390 sparc ~x86 ~amd64-linux ~x86-linux ~arm64-macos ~x64-macos"
+fi
+
+LICENSE="MPL-2.0"
+SLOT="3"
+
+# The following lines are shamelessly stolen from ffmpeg-9999.ebuild with modifications
+ARM_CPU_FEATURES=(
+	neon:NEON
+)
+PPC_CPU_FEATURES=(
+	altivec:ALTIVEC
+	vsx:VSX
+)
+X86_CPU_FEATURES=(
+	avx:AVX
+	avx2:AVX2
+	avx512f:AVX512
+	avx512dq:AVX512DQ
+	f16c:FP16C
+	fma3:FMA
+	popcnt:POPCNT
+	sse:SSE
+	sse2:SSE2
+	sse3:SSE3
+	ssse3:SSSE3
+	sse4_1:SSE4_1
+	sse4_2:SSE4_2
+)
+# MIPS_CPU_FEATURES=(
+# 	msa:MSA
+# )
+# S390_CPU_FEATURES=(
+# 	z13:Z13
+# 	z14:Z14
+# )
+
+CPU_FEATURES_MAP=(
+	"${ARM_CPU_FEATURES[@]/#/cpu_flags_arm_}"
+	"${PPC_CPU_FEATURES[@]/#/cpu_flags_ppc_}"
+	"${X86_CPU_FEATURES[@]/#/cpu_flags_x86_}"
+	# "${MIPS_CPU_FEATURES[@]/#/cpu_flags_mips_}"
+	# "${S390_CPU_FEATURES[@]/#/cpu_flags_s390_}"
+)
+
+IUSE_TEST_BACKENDS=(
+	"adolc"
+	"boost"
+	"cholmod"
+	"fftw"
+	"klu"
+	"opengl"
+	"openmp"
+	"pastix"
+	"sparsehash"
+	"spqr"
+	"superlu"
+	"umfpack"
+)
+
+IUSE="${CPU_FEATURES_MAP[*]%:*} clang cuda hip debug doc lapack mathjax test ${IUSE_TEST_BACKENDS[*]}" #zvector
+
+# Tests failing again because of compiler issues
+RESTRICT="!test? ( test )"
+
+BDEPEND="
+	doc? (
+		app-text/doxygen[dot]
+		dev-texlive/texlive-bibtexextra
+		dev-texlive/texlive-fontsextra
+		dev-texlive/texlive-fontutils
+		dev-texlive/texlive-latex
+		dev-texlive/texlive-latexextra
+		mathjax? ( dev-libs/mathjax )
+	)
+	test? ( virtual/pkgconfig )
+"
+
+# METIS
+# MPREAL
+# dev-libs/mpfr:0
+# dev-libs/gmp:0
+
+TEST_BACKENDS="
+		boost? ( dev-libs/boost )
+		adolc? ( sci-libs/adolc[sparse] )
+		cholmod? ( sci-libs/cholmod:= )
+		fftw? ( sci-libs/fftw )
+		spqr? ( sci-libs/spqr )
+		klu? ( sci-libs/klu )
+		opengl? (
+			media-libs/freeglut
+			media-libs/glew
+			media-libs/libglvnd
+		)
+		pastix? ( sci-libs/pastix[-mpi] )
+		sparsehash? (
+			amd64? ( dev-cpp/sparsehash )
+			arm64? ( dev-cpp/sparsehash )
+			ppc64? ( dev-cpp/sparsehash )
+			x86?   ( dev-cpp/sparsehash )
+		)
+		superlu? ( sci-libs/superlu )
+		umfpack? ( sci-libs/umfpack )
+"
+DEPEND="
+	test? (
+		cuda? (
+			!clang? (
+				dev-util/nvidia-cuda-toolkit
+			)
+			clang? (
+				sys-devel/clang[llvm_targets_NVPTX]
+				openmp? ( sys-libs/libomp[llvm_targets_NVPTX,offload] )
+			)
+		)
+		hip? ( dev-util/hip )
+		lapack? ( virtual/lapacke )
+		${TEST_BACKENDS}
+	)
+"
+
+REQUIRED_USE="
+	test? ( !lapack )
+	|| ( ${IUSE_TEST_BACKENDS[*]} )
+"
+
+PATCHES=(
+	"${FILESDIR}/${PN}-3.3.9-max-macro.patch"
+	"${FILESDIR}/${PN}-3.4.0-doc-nocompress.patch" # bug 830064
+	"${FILESDIR}/${PN}-3.4.0-buildstring.patch"
+	"${FILESDIR}/${PN}-3.4.0-noansi.patch"
+	"${FILESDIR}/${PN}-3.4.0-cxxstandard.patch"
+	"${FILESDIR}/${PN}-3.4.0-ppc-no-vsx.patch" # bug 936107
+)
+
+# TODO should be in cuda.eclass
+#
+# Pick a host compiler directory for nvcc and export it as CUDAHOSTCXX.
+#
+# The compiler family (gcc or clang) is taken from tc-is-gcc / tc-is-clang;
+# the function dies when neither matches. Installed versions of
+# sys-devel/<compiler> are then walked from best_version downwards
+# (each round narrows the atom to "<previous version"), and for each one
+# the directory holding the resolved "<compiler>-<major>" binary is
+# computed. Dies if no installed version is found at all.
+#
+# NOTE(review): the nvcc probe sits in the loop *body* and the condition
+# list ends with a plain assignment, so the probe's exit status does not
+# appear to influence the loop. As written the loop seems to run until
+# best_version returns nothing, leaving CUDAHOSTCXX set to the last
+# candidate examined -- confirm whether the probe was meant to stop it.
+# NOTE(review): "-ccbin <dir>" is handed to nvcc as a single word --
+# confirm nvcc accepts that form.
+cuda_set_CUDAHOSTCXX() {
+	local compiler
+	tc-is-gcc && compiler="gcc"
+	tc-is-clang && compiler="clang"
+	[[ -z "$compiler" ]] && die "no compiler specified"
+
+	local package="sys-devel/${compiler}"
+	# starts as the bare package atom, later "<category/package-version"
+	local version="${package}"
+	local CUDAHOSTCXX_test
+	while
+		# carry over the candidate computed in the previous round (empty at first)
+		CUDAHOSTCXX="${CUDAHOSTCXX_test}"
+		version=$(best_version "${version}")
+		if [[ -z "${version}" ]]; then
+			# no (further) installed version: fail only if nothing was ever found
+			if [[ -z "${CUDAHOSTCXX}" ]]; then
+				die "could not find supported version of ${package}"
+			fi
+			break
+		fi
+		CUDAHOSTCXX_test="$(
+			dirname "$(
+				realpath "$(
+					which "${compiler}-$(echo "${version}" | grep -oP "(?<=${package}-)[0-9]*")"
+				)"
+			)"
+		)"
+		# next round: only consider versions older than the one just examined
+		version="<${version}"
+	do ! echo "int main(){}" | nvcc "-ccbin ${CUDAHOSTCXX_test}" - -x cu &>/dev/null; done
+
+	export CUDAHOSTCXX
+}
+
+pkg_setup() {
+	# llvm_pkg_setup is only needed for the clang-based CUDA test build.
+	# A bare `return` hands back the status of the failed USE check.
+	use test && use cuda && use clang || return
+	llvm_pkg_setup
+}
+
+src_unpack() {
+	# Live versions come straight from git; there is no tarball to unpack.
+	if [[ ${PV} == *9999* ]]; then
+		git-r3_src_unpack
+		return
+	fi
+
+	unpack "${P}.tar.bz2"
+
+	# The LAPACK test add-ons are unpacked inside the lapack/ subdirectory.
+	if use test && use lapack; then
+		cd "${S}/lapack" || die
+		unpack "${PN}-lapack_addons-3.4.1.tgz"
+	fi
+}
+
+src_prepare() {
+	cmake_src_prepare
+
+	sed \
+		-e "/add_subdirectory(bench\/spbench/s/^/#DONOTCOMPILE /g" \
+		-e "/add_subdirectory(demos/s/^/#DONOTCOMPILE /g" \
+		-i CMakeLists.txt || die
+
+	if ! use test; then
+		sed \
+			-e "/add_subdirectory(test/s/^/#DONOTCOMPILE /g" \
+			-e "/add_subdirectory(scripts/s/^/#DONOTCOMPILE /g" \
+			-e "/add_subdirectory(failtest/s/^/#DONOTCOMPILE /g" \
+			-e "/add_subdirectory(blas/s/^/#DONOTCOMPILE /g" \
+			-e "/add_subdirectory(lapack/s/^/#DONOTCOMPILE /g" \
+			-i CMakeLists.txt || die
+	fi
+}
+
+# Assemble the CMake arguments in mycmakeargs and run cmake_src_configure.
+#
+# Base options are always set; documentation options are added with USE=doc
+# and everything else (optional backends, per-arch SIMD test toggles,
+# CUDA/HIP) only with USE=test.
+src_configure() {
+	local mycmakeargs=(
+		-DBUILD_SHARED_LIBS="yes"
+		-DBUILD_TESTING="$(usex test)"
+
+		-DEIGEN_BUILD_DOC="$(usex doc)" # Enable creation of Eigen documentation
+		-DEIGEN_BUILD_PKGCONFIG="yes" # Build pkg-config .pc file for Eigen
+	)
+	if use doc || use test; then
+		mycmakeargs+=(
+			# needs Qt4
+			-DEIGEN_TEST_NOQT="yes" # Disable Qt support in unit tests
+		)
+	fi
+
+	if use doc; then
+		mycmakeargs+=(
+			-DEIGEN_DOC_USE_MATHJAX="$(usex mathjax)" # Use MathJax for rendering math in HTML docs
+			-DEIGEN_INTERNAL_DOCUMENTATION=no # Build internal documentation
+		)
+	fi
+
+	if use test; then
+		mycmakeargs+=(
+			# the OpenGL testsuite is extremely brittle, bug #712808
+			-DOpenGL_GL_PREFERENCE="GLVND"
+			-DEIGEN_TEST_OPENGL="$(usex opengl)" # Enable OpenGL support in unit tests
+			-DEIGEN_TEST_OPENMP="$(usex openmp)" # Enable/Disable OpenMP in tests/examples
+
+			-DCMAKE_DISABLE_FIND_PACKAGE_MPREAL=ON
+
+			-DEIGEN_TEST_CXX11=yes
+
+			# -DEIGEN_TEST_CUSTOM_CXX_FLAGS= # Additional compiler flags when compiling unit tests.
+			# -DEIGEN_TEST_CUSTOM_LINKER_FLAGS= # Additional linker flags when linking unit tests.
+			# -DEIGEN_TEST_BUILD_FLAGS= # Options passed to the build command of unit tests
+
+			# -DEIGEN_BUILD_BTL=yes # Build benchmark suite
+
+			-DEIGEN_TEST_BUILD_DOCUMENTATION="$(usex doc)" # Test building the doxygen documentation
+
+			# -DEIGEN_COVERAGE_TESTING=no # Enable/disable gcov
+			# -DEIGEN_CTEST_ERROR_EXCEPTION= # Regular expression for build error messages to be filtered out
+			# -DEIGEN_DEBUG_ASSERTS=no # Enable advanced debugging of assertions
+			# -DEIGEN_NO_ASSERTION_CHECKING=no # Disable checking of assertions using exceptions
+			# -DEIGEN_TEST_NO_EXCEPTIONS=no # Disables C++ exceptions
+			# -DEIGEN_TEST_NO_EXPLICIT_ALIGNMENT=no # Disable explicit alignment (hence vectorization) in tests/examples
+			# -DEIGEN_TEST_NO_EXPLICIT_VECTORIZATION=no # Disable explicit vectorization in tests/examples
+
+			# -DEIGEN_DASHBOARD_BUILD_TARGET=buildtests # Target to be built in dashboard mode, default is buildtests
+
+			# -DEIGEN_DEFAULT_TO_ROW_MAJOR=no # Use row-major as default matrix storage order
+
+			# -DEIGEN_TEST_MATRIX_DIR=yes # Enable testing of realword sparse matrices contained in the specified path
+			# -DEIGEN_TEST_MAX_SIZE=320 # Maximal matrix/vector size, default is 320
+			# -DEIGEN_SPLIT_LARGE_TESTS=no # Split large tests into smaller executables
+		)
+
+		# For every backend whose USE flag is off, stop CMake from looking
+		# for the corresponding package at all.
+		use !adolc      && mycmakeargs+=( -DCMAKE_DISABLE_FIND_PACKAGE_Adolc="TRUE" )
+		use !boost      && mycmakeargs+=( -DCMAKE_DISABLE_FIND_PACKAGE_Boost="TRUE" )
+		use !cholmod    && mycmakeargs+=( -DCMAKE_DISABLE_FIND_PACKAGE_CHOLMOD="TRUE" )
+		use !fftw       && mycmakeargs+=( -DCMAKE_DISABLE_FIND_PACKAGE_FFTW="TRUE" )
+		use !sparsehash && mycmakeargs+=( -DCMAKE_DISABLE_FIND_PACKAGE_GoogleHash="TRUE" )
+		use !klu        && mycmakeargs+=( -DCMAKE_DISABLE_FIND_PACKAGE_KLU="TRUE" )
+		use !opengl     && mycmakeargs+=( -DCMAKE_DISABLE_FIND_PACKAGE_OpenGL="TRUE" )
+		use !openmp     && mycmakeargs+=( -DCMAKE_DISABLE_FIND_PACKAGE_OpenMP="TRUE" )
+		use !pastix     && mycmakeargs+=( -DCMAKE_DISABLE_FIND_PACKAGE_PASTIX="TRUE" )
+		use !spqr       && mycmakeargs+=( -DCMAKE_DISABLE_FIND_PACKAGE_SPQR="TRUE" )
+		use !superlu    && mycmakeargs+=( -DCMAKE_DISABLE_FIND_PACKAGE_SuperLU="TRUE" )
+		use !umfpack    && mycmakeargs+=( -DCMAKE_DISABLE_FIND_PACKAGE_UMFPACK="TRUE" )
+
+		# NOTE(review): REQUIRED_USE contains "test? ( !lapack )", so this
+		# branch looks unreachable inside "if use test" -- confirm.
+		if use lapack; then
+			mycmakeargs+=(
+				-DEIGEN_ENABLE_LAPACK_TESTS=yes
+				-DEIGEN_TEST_EXTERNAL_BLAS=yes # Use external BLAS library for testsuite
+				-DCMAKE_DISABLE_FIND_PACKAGE_SuperLU=ON
+			)
+		fi
+
+		# Per-architecture SIMD toggles for the tests, driven by cpu_flags_*.
+		if use arm; then
+			mycmakeargs+=(
+				-DEIGEN_TEST_NEON="$(usex cpu_flags_arm_neon)"
+			)
+		fi
+
+		if use arm64; then
+			mycmakeargs+=(
+				-DEIGEN_TEST_NEON64="$(usex cpu_flags_arm_neon)"
+			)
+		fi
+
+		if use ppc || use ppc64; then
+			mycmakeargs+=(
+				-DEIGEN_TEST_ALTIVEC="$(usex cpu_flags_ppc_altivec)"
+				-DEIGEN_TEST_VSX="$(usex cpu_flags_ppc_vsx)"
+			)
+		fi
+
+		if use amd64 || use x86; then
+			mycmakeargs+=(
+				# -DEIGEN_TEST_32BIT=no # Force generating 32bit code.
+				# -DEIGEN_TEST_X87=no # Force using X87 instructions. Implies no vectorization.
+				-DEIGEN_TEST_SSE2="$(usex cpu_flags_x86_sse2)"
+				-DEIGEN_TEST_SSE3="$(usex cpu_flags_x86_sse3)"
+				-DEIGEN_TEST_SSSE3="$(usex cpu_flags_x86_ssse3)"
+				-DEIGEN_TEST_FMA="$(usex cpu_flags_x86_fma3)"
+				-DEIGEN_TEST_SSE4_1="$(usex cpu_flags_x86_sse4_1)"
+				-DEIGEN_TEST_SSE4_2="$(usex cpu_flags_x86_sse4_2)"
+				-DEIGEN_TEST_AVX="$(usex cpu_flags_x86_avx)"
+				-DEIGEN_TEST_F16C="$(usex cpu_flags_x86_f16c)"
+				-DEIGEN_TEST_AVX2="$(usex cpu_flags_x86_avx2)"
+				-DEIGEN_TEST_AVX512="$(usex cpu_flags_x86_avx512f)"
+				-DEIGEN_TEST_AVX512DQ="$(usex cpu_flags_x86_avx512dq)"
+			)
+		fi
+
+		# The mips and s390 branches currently add nothing: their only
+		# entries are commented out.
+		if use mips; then
+			mycmakeargs+=(
+				# -DEIGEN_TEST_MSA=no # Enable/Disable MSA in tests/examples
+			)
+		fi
+
+		if use s390; then
+			mycmakeargs+=(
+				# -DEIGEN_TEST_Z13=no # Enable/Disable S390X(zEC13) ZVECTOR in tests/examples
+				# -DEIGEN_TEST_Z14=no # Enable/Disable S390X(zEC14) ZVECTOR in tests/examples
+			)
+		fi
+
+		mycmakeargs+=(
+			-DEIGEN_TEST_CUDA="$(usex cuda)" # Enable CUDA support in unit tests
+			-DEIGEN_TEST_CUDA_CLANG="$(usex cuda "$(usex clang)")" # Use clang instead of nvcc to compile the CUDA tests
+
+			-DEIGEN_TEST_HIP="$(usex hip)" # Add HIP support.
+
+			# -DEIGEN_TEST_SYCL=no # Add Sycl support.
+			# -DEIGEN_SYCL_TRISYCL=no # Use the triSYCL Sycl implementation (ComputeCPP by default).
+		)
+
+		if use cuda; then
+			cuda_add_sandbox -w
+			if use clang; then
+				# Build with the clang/clang++ found under the LLVM prefix.
+				local llvm_prefix
+				llvm_prefix="$(get_llvm_prefix -b)"
+				export CC="${llvm_prefix}/bin/clang"
+				export CXX="${llvm_prefix}/bin/clang++"
+				export LIBRARY_PATH="${ESYSROOT}/usr/$(get_libdir)"
+			else
+				# nvcc build: select a host compiler and pass it on to CMake.
+				cuda_set_CUDAHOSTCXX
+				mycmakeargs+=(
+					-DCUDA_HOST_COMPILER="${CUDAHOSTCXX}"
+				)
+			fi
+			if [[ "${CUDA_VERBOSE}" == true ]]; then
+				mycmakeargs+=(
+					-DCUDA_VERBOSE_BUILD=yes
+				)
+				NVCCFLAGS+=" -v"
+			fi
+
+			export CUDAFLAGS="${NVCCFLAGS}"
+
+			# Fall back to __nvcc_device_query when CUDAARCHS is unset or empty.
+			[[ -z "${CUDAARCHS}" ]] && einfo "trying to determine host CUDAARCHS"
+			: "${CUDAARCHS:=$(__nvcc_device_query)}"
+			export CUDAARCHS
+
+			mycmakeargs+=(
+				-DEIGEN_CUDA_COMPUTE_ARCH="${CUDAARCHS}"
+			)
+		fi
+	fi
+
+	cmake_src_configure
+}
+
+src_compile() {
+	# CMake targets to build; stays empty unless docs or tests are enabled.
+	local build_targets=()
+
+	if use doc; then
+		build_targets+=( doc )
+		HTML_DOCS=( "${BUILD_DIR}"/doc/html/. )
+	fi
+
+	if use test; then
+		build_targets+=( buildtests )
+		use lapack || build_targets+=( blas )
+
+		# tests generate random data, which
+		# obviously fails for some seeds
+		export EIGEN_SEED=712808
+	fi
+
+	# Nothing is compiled when neither docs nor tests are requested.
+	if [[ ${#build_targets[@]} -gt 0 ]]; then
+		cmake_src_compile "${build_targets[@]}"
+	fi
+}
+
+src_test() {
+	local myctestargs=(
+		-j1 # otherwise breaks due to cmake reruns
+	)
+
+	# Tests that are skipped; the test number and failure mode are noted
+	# next to each entry.
+	CMAKE_SKIP_TESTS=(
+		product_small_32           #  143 (Subprocess aborted)
+		product_small_33           #  144 (Subprocess aborted)
+
+		eigensolver_selfadjoint_13 #  452 (Subprocess aborted)
+
+		cholmod_support_21         #  726 (Subprocess aborted)
+		cholmod_support_22         #  727 (Subprocess aborted)
+
+		NonLinearOptimization      #  930 (Subprocess aborted)
+		openglsupport              #  990 (Failed)
+		levenberg_marquardt        # 1020 (Subprocess aborted)
+	)
+
+	# CUDA tests get write access through the sandbox and two more skips.
+	if use cuda; then
+		cuda_add_sandbox -w
+		CMAKE_SKIP_TESTS+=( cxx11_tensor_cast_float16_gpu cxx11_tensor_gpu_5 )
+	fi
+
+	# With USE=lapack every test whose name starts with "LAPACK-" is skipped.
+	use lapack && CMAKE_SKIP_TESTS+=( "^LAPACK-.*$" )
+
+	cmake_src_test
+}
--- a/dev-cpp/eigen/files/eigen-3.4.0-ppc-no-vsx.patch
+++ b/dev-cpp/eigen/files/eigen-3.4.0-ppc-no-vsx.patch
@@ -0,0 +1,195 @@
+https://gitlab.com/libeigen/eigen/-/merge_requests/1028
+https://bugs.gentoo.org/936107
+--- a/Eigen/Core
+++ b/Eigen/Core
+@@ -346,7 +346,7 @@
+ #include "src/Core/CoreIterators.h"
+ #include "src/Core/ConditionEstimator.h"
+ 
+-#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
++#if defined(EIGEN_VECTORIZE_VSX)
+   #include "src/Core/arch/AltiVec/MatrixProduct.h"
+ #elif defined EIGEN_VECTORIZE_NEON
+   #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
+--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
+@@ -100,6 +100,7 @@
+     HasAbs2   = 0,
+     HasMin    = 0,
+     HasMax    = 0,
++   HasSqrt   = 1,
+ #ifdef __VSX__
+     HasBlend  = 1,
+ #endif
+@@ -320,6 +321,7 @@
+     HasAbs2   = 0,
+     HasMin    = 0,
+     HasMax    = 0,
++   HasSqrt   = 1,
+     HasSetLinear = 0
+   };
+ };
+--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+@@ -40,16 +40,14 @@
+   return pcos_float(_x);
+ }
+ 
++#ifdef __VSX__
+ #ifndef EIGEN_COMP_CLANG
+ template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+ Packet4f prsqrt<Packet4f>(const Packet4f& x)
+ {
+   return  vec_rsqrt(x);
+ }
+-#endif
+ 
+-#ifdef __VSX__
+-#ifndef EIGEN_COMP_CLANG
+ template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+ Packet2d prsqrt<Packet2d>(const Packet2d& x)
+ {
+@@ -74,6 +72,26 @@
+ {
+   return pexp_double(_x);
+ }
++
++template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){
++  BF16_TO_F32_UNARY_OP_WRAPPER(psqrt<Packet4f>, a);
++}
++
++#ifndef EIGEN_COMP_CLANG
++template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
++  BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
++}
++#endif
++#else
++template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
++Packet4f psqrt<Packet4f>(const Packet4f& x)
++{
++  Packet4f a;
++  for (Index i = 0; i < packet_traits<float>::size; i++) {
++    a[i] = numext::sqrt(x[i]);
++  }
++  return a;
++}
+ #endif
+ 
+ // Hyperbolic Tangent function.
+--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
+@@ -175,16 +175,19 @@
+ #else
+     HasRsqrt = 0,
+ #endif
++    HasTanh = EIGEN_FAST_MATH,
++    HasErf = EIGEN_FAST_MATH,
++    HasRint = 1,
+ #else
+     HasSqrt = 0,
+     HasRsqrt = 0,
+-    HasTanh = EIGEN_FAST_MATH,
+-    HasErf = EIGEN_FAST_MATH,
++    HasTanh = 0,
++    HasErf = 0,
++    HasRint = 0,
+ #endif
+     HasRound = 1,
+     HasFloor = 1,
+     HasCeil = 1,
+-    HasRint = 1,
+     HasNegate = 1,
+     HasBlend = 1
+   };
+@@ -217,16 +220,17 @@
+ #else
+     HasRsqrt = 0,
+ #endif
++    HasRint = 1,
+ #else
+     HasSqrt = 0,
+     HasRsqrt = 0,
+-    HasTanh = EIGEN_FAST_MATH,
+-    HasErf = EIGEN_FAST_MATH,
++    HasRint = 0,
+ #endif
++    HasTanh = 0,
++    HasErf = 0,
+     HasRound = 1,
+     HasFloor = 1,
+     HasCeil = 1,
+-    HasRint = 1,
+     HasNegate = 1,
+     HasBlend = 1
+   };
+@@ -872,19 +876,29 @@
+   return vec_nor(c,c);
+ }
+ 
++#ifdef __VSX__
+ template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); }
++#endif
+ template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); }
+ template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
++#ifdef __VSX__
+ template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); }
++#endif
+ template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); }
+ template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); }
++#ifdef __VSX__
+ template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); }
++#endif
+ template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); }
+ template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); }
++#ifdef __VSX__
+ template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); }
++#endif
+ template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); }
+ template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); }
++#ifdef __VSX__
+ template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); }
++#endif
+ template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); }
+ template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmpeq(a,b)); }
+ 
+@@ -937,6 +951,7 @@
+ }
+ template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a) { return vec_ceil(a); }
+ template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
++#ifdef __VSX__
+ template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
+ {
+     Packet4f res;
+@@ -947,6 +962,7 @@
+ 
+     return res;
+ }
++#endif
+ 
+ template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
+ {
+@@ -1341,12 +1357,6 @@
+   BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
+ }
+ 
+-template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){
+-  BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a);
+-}
+-template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
+-  BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
+-}
+ template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
+   BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
+ }
+@@ -1390,9 +1400,11 @@
+ template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){
+   BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
+ }
++#ifdef __VSX__
+ template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a){
+   BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
+ }
++#endif
+ template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+   Packet4f a_even = Bf16ToF32Even(a);
+   Packet4f a_odd = Bf16ToF32Odd(a);