Cleaned up the Tridiagonalization implementation and improved its performance: replaced the large commented-out experimental variants with a single path that uses ei_product_selfadjoint_vector for the symmetric matrix-vector product and a column-wise symmetric rank-2 update on the lower triangle.
diff --git a/Eigen/src/QR/Tridiagonalization.h b/Eigen/src/QR/Tridiagonalization.h index 5997863..54d705f 100644 --- a/Eigen/src/QR/Tridiagonalization.h +++ b/Eigen/src/QR/Tridiagonalization.h
@@ -220,100 +220,22 @@ // i.e., A = H' A H where H = I - h v v' and v = matA.col(i).end(n-i-1) matA.col(i).coeffRef(i+1) = 1; + Scalar* EIGEN_RESTRICT t = &hCoeffs.coeffRef(-1); - /* This is the initial algorithm which minimize operation counts and maximize - * the use of Eigen's expression. Unfortunately, the first matrix-vector product - * using Part<LowerTriangular|Selfadjoint> is very very slow */ - #ifdef EIGEN_NEVER_DEFINED - // matrix - vector product - hCoeffs.end(n-i-1) = (matA.corner(BottomRight,n-i-1,n-i-1).template part<LowerTriangular|SelfAdjoint>() - * (h * matA.col(i).end(n-i-1))).lazy(); - // simple axpy - hCoeffs.end(n-i-1) += (h * Scalar(-0.5) * matA.col(i).end(n-i-1).dot(hCoeffs.end(n-i-1))) - * matA.col(i).end(n-i-1); - // rank-2 update - //Block<MatrixType,Dynamic,1> B(matA,i+1,i,n-i-1,1); - matA.corner(BottomRight,n-i-1,n-i-1).template part<LowerTriangular>() -= - (matA.col(i).end(n-i-1) * hCoeffs.end(n-i-1).adjoint()).lazy() - + (hCoeffs.end(n-i-1) * matA.col(i).end(n-i-1).adjoint()).lazy(); - #endif - /* end initial algorithm */ + // hCoeffs.end(n-i-1) = (matA.corner(BottomRight,n-i-1,n-i-1).template part<LowerTriangular|SelfAdjoint>() + // * matA.col(i).end(n-i-1)).lazy(); + // TODO map the above code to the function call below: + ei_product_selfadjoint_vector<Scalar,MatrixType::Flags&RowMajorBit,LowerTriangularBit> + (n-i-1,matA.corner(BottomRight,n-i-1,n-i-1).data(), matA.stride(), matA.col(i).end(n-i-1).data(), const_cast<Scalar*>(hCoeffs.end(n-i-1).data())); - /* If we still want to minimize operation count (i.e., perform operation on the lower part only) - * then we could provide the following algorithm for selfadjoint - vector product. However, a full - * matrix-vector product is still faster (at least for dynamic size, and not too small, did not check - * small matrices). The algo performs block matrix-vector and transposed matrix vector products. 
*/ - #ifdef EIGEN_NEVER_DEFINED - int n4 = (std::max(0,n-4)/4)*4; - hCoeffs.end(n-i-1).setZero(); - for (int b=i+1; b<n4; b+=4) - { - // the ?x4 part: - hCoeffs.end(b-4) += - Block<MatrixType,Dynamic,4>(matA,b+4,b,n-b-4,4) * matA.template block<4,1>(b,i); - // the respective transposed part: - Block<CoeffVectorType,4,1>(hCoeffs, b, 0, 4,1) += - Block<MatrixType,Dynamic,4>(matA,b+4,b,n-b-4,4).adjoint() * Block<MatrixType,Dynamic,1>(matA,b+4,i,n-b-4,1); - // the 4x4 block diagonal: - Block<CoeffVectorType,4,1>(hCoeffs, b, 0, 4,1) += - (Block<MatrixType,4,4>(matA,b,b,4,4).template part<LowerTriangular|SelfAdjoint>() - * (h * Block<MatrixType,4,1>(matA,b,i,4,1))).lazy(); - } - #endif - // todo: handle the remaining part - /* end optimized selfadjoint - vector product */ + hCoeffs.end(n-i-1) = hCoeffs.end(n-i-1)*h + + (h*ei_conj(h)*Scalar(-0.5)*matA.col(i).end(n-i-1).dot(hCoeffs.end(n-i-1))) * + matA.col(i).end(n-i-1); - /* Another interesting note: the above rank-2 update is much slower than the following hand written loop. - * After an analyze of the ASM, it seems GCC (4.2) generate poor code because of the Block. Moreover, - * if we remove the specialization of Block for Matrix then it is even worse, much worse ! */ - #ifdef EIGEN_NEVER_DEFINED + // symmetric rank-2 update for (int j1=i+1; j1<n; ++j1) - for (int i1=j1; i1<n; ++i1) - matA.coeffRef(i1,j1) -= matA.coeff(i1,i)*ei_conj(hCoeffs.coeff(j1-1)) - + hCoeffs.coeff(i1-1)*ei_conj(matA.coeff(j1,i)); - #endif - /* end hand writen partial rank-2 update */ - - /* The current fastest implementation: the full matrix is used, no "optimization" to use/compute - * only half of the matrix. Custom vectorization of the inner col -= alpha X + beta Y such that access - * to col are always aligned. Once we support that in Assign, then the algorithm could be rewriten as - * a single compact expression. This code is therefore a good benchmark when will do that. 
*/ - - // let's use the end of hCoeffs to store temporary values: - hCoeffs.end(n-i-1) = (matA.corner(BottomRight,n-i-1,n-i-1) * (h * matA.col(i).end(n-i-1))).lazy(); - // FIXME in the above expr a temporary is created because of the scalar multiple by h - - hCoeffs.end(n-i-1) += (h * Scalar(-0.5) * matA.col(i).end(n-i-1).dot(hCoeffs.end(n-i-1))) - * matA.col(i).end(n-i-1); - - const Scalar* EIGEN_RESTRICT pb = &matA.coeffRef(0,i); - const Scalar* EIGEN_RESTRICT pa = (&hCoeffs.coeffRef(0)) - 1; - for (int j1=i+1; j1<n; ++j1) - { - int starti = i+1; - int alignedEnd = starti; - if (PacketSize>1) - { - int alignedStart = (starti) + ei_alignmentOffset(&matA.coeffRef(starti,j1), n-starti); - alignedEnd = alignedStart + ((n-alignedStart)/PacketSize)*PacketSize; - - for (int i1=starti; i1<alignedStart; ++i1) - matA.coeffRef(i1,j1) -= matA.coeff(i1,i)*ei_conj(hCoeffs.coeff(j1-1)) - + hCoeffs.coeff(i1-1)*ei_conj(matA.coeff(j1,i)); - - Packet tmp0 = ei_pset1(hCoeffs.coeff(j1-1)); - Packet tmp1 = ei_pset1(matA.coeff(j1,i)); - Scalar* pc = &matA.coeffRef(0,j1); - for (int i1=alignedStart ; i1<alignedEnd; i1+=PacketSize) - ei_pstore(pc+i1,ei_psub(ei_pload(pc+i1), - ei_padd(ei_pmul(tmp0, ei_ploadu(pb+i1)), - ei_pmul(tmp1, ei_ploadu(pa+i1))))); - } - for (int i1=alignedEnd; i1<n; ++i1) - matA.coeffRef(i1,j1) -= matA.coeff(i1,i)*ei_conj(hCoeffs.coeff(j1-1)) - + hCoeffs.coeff(i1-1)*ei_conj(matA.coeff(j1,i)); - } - /* end optimized implementation */ + matA.col(j1).end(n-j1) -= matA.col(i).end(n-j1) * ei_conj(hCoeffs.coeff(j1-1)) + + hCoeffs.end(n-j1) * ei_conj(matA.coeff(j1,i)); // note: at that point matA(i+1,i+1) is the (i+1)-th element of the final diagonal // note: the sequence of the beta values leads to the subdiagonal entries