| // This file is part of Eigen, a lightweight C++ template library |
| // for linear algebra. |
| // |
| // Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr> |
| // |
| // This Source Code Form is subject to the terms of the Mozilla |
| // Public License v. 2.0. If a copy of the MPL was not distributed |
| // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| // SPDX-License-Identifier: MPL-2.0 |
| |
| #ifndef EIGEN_SPARSEDENSEPRODUCT_H |
| #define EIGEN_SPARSEDENSEPRODUCT_H |
| |
| // IWYU pragma: private |
| #include "./InternalHeaderCheck.h" |
| |
| namespace Eigen { |
| |
| namespace internal { |
| |
| template <> |
| struct product_promote_storage_type<Sparse, Dense, OuterProduct> { |
| typedef Sparse ret; |
| }; |
| template <> |
| struct product_promote_storage_type<Dense, Sparse, OuterProduct> { |
| typedef Sparse ret; |
| }; |
| |
| // Type trait to detect if a sparse type supports direct compressed storage access |
| // (i.e., has valuePtr(), innerIndexPtr(), outerIndexPtr(), isCompressed()). |
| // All types deriving from SparseCompressedBase provide these methods. |
| template <typename T> |
| struct has_compressed_storage : std::is_base_of<SparseCompressedBase<T>, T> {}; |
| |
| template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType, |
| int LhsStorageOrder = ((SparseLhsType::Flags & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor, |
| bool ColPerCol = ((DenseRhsType::Flags & RowMajorBit) == 0) || DenseRhsType::ColsAtCompileTime == 1> |
| struct sparse_time_dense_product_impl; |
| |
| // RowMajor, single column (ColPerCol=true): CSR SpMV |
| template <typename SparseLhsType, typename DenseRhsType, typename DenseResType> |
| struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar, |
| RowMajor, true> { |
| typedef internal::remove_all_t<SparseLhsType> Lhs; |
| typedef internal::remove_all_t<DenseResType> Res; |
| typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator; |
| typedef evaluator<Lhs> LhsEval; |
| typedef typename Res::Scalar ResScalar; |
| |
| static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, |
| const typename Res::Scalar& alpha) { |
| LhsEval lhsEval(lhs); |
| Index n = lhs.outerSize(); |
| |
| for (Index c = 0; c < rhs.cols(); ++c) { |
| runCol(lhsEval, lhs, rhs, res, alpha, n, c, std::integral_constant<bool, has_compressed_storage<Lhs>::value>()); |
| } |
| } |
| |
| // Direct pointer path: works for both compressed and non-compressed storage. |
| static void runCol(const LhsEval& /*lhsEval*/, const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, |
| const ResScalar& alpha, Index n, Index c, std::true_type /* has_compressed_storage */) { |
| runColImpl(lhs, rhs, res, alpha, n, c, std::integral_constant<bool, bool(DenseRhsType::Flags & DirectAccessBit)>()); |
| } |
| |
| template <typename RhsT> |
| static void runColImpl(const SparseLhsType& lhs, const RhsT& rhs, DenseResType& res, const ResScalar& alpha, Index n, |
| Index c, std::true_type) { |
| const Lhs& mat = lhs; |
| const auto* vals = mat.valuePtr(); |
| const auto* inds = mat.innerIndexPtr(); |
| // Sparse vectors don't store outer indices. |
| const auto* outer = mat.outerIndexPtr(); |
| const auto* innerNnz = mat.innerNonZeroPtr(); |
| // The fast rhs pointer path requires unit inner stride (common case: VectorXd, contiguous matrix column). |
| if (rhs.innerStride() == 1) { |
| const auto* x = rhs.data() + c * rhs.outerStride(); |
| #ifdef EIGEN_HAS_OPENMP |
| Index threads = Eigen::nbThreads(); |
| if (threads > 1 && mat.nonZeros() > 20000) { |
| #pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads) |
| for (Index i = 0; i < n; ++i) { |
| Index k = outer ? outer[i] : 0; |
| const Index end = innerNnz ? (outer ? outer[i] : 0) + innerNnz[i] : (outer ? outer[i + 1] : mat.nonZeros()); |
| ResScalar sum0(0), sum1(0); |
| for (; k < end; ++k) { |
| sum0 += vals[k] * x[inds[k]]; |
| ++k; |
| if (k < end) { |
| sum1 += vals[k] * x[inds[k]]; |
| } |
| } |
| res.coeffRef(i, c) += alpha * (sum0 + sum1); |
| } |
| } else |
| #endif |
| { |
| for (Index i = 0; i < n; ++i) { |
| Index k = outer ? outer[i] : 0; |
| const Index end = innerNnz ? (outer ? outer[i] : 0) + innerNnz[i] : (outer ? outer[i + 1] : mat.nonZeros()); |
| // Two independent accumulators to break the dependency chain |
| ResScalar sum0(0), sum1(0); |
| for (; k < end; ++k) { |
| sum0 += vals[k] * x[inds[k]]; |
| ++k; |
| if (k < end) { |
| sum1 += vals[k] * x[inds[k]]; |
| } |
| } |
| res.coeffRef(i, c) += alpha * (sum0 + sum1); |
| } |
| } |
| } else { |
| runColImpl(lhs, rhs, res, alpha, n, c, std::false_type()); |
| } |
| } |
| |
| // Use fall-back path without direct access to rhs. |
| template <typename RhsT> |
| static void runColImpl(const SparseLhsType& lhs, const RhsT& rhs, DenseResType& res, const ResScalar& alpha, Index n, |
| Index c, std::false_type) { |
| const Lhs& mat = lhs; |
| const auto* vals = mat.valuePtr(); |
| const auto* inds = mat.innerIndexPtr(); |
| const auto* outer = mat.outerIndexPtr(); |
| const auto* innerNnz = mat.innerNonZeroPtr(); |
| // Non-unit rhs stride (or no direct access): use direct pointers for sparse side, coeff() for rhs |
| for (Index i = 0; i < n; ++i) { |
| Index k = outer ? outer[i] : 0; |
| const Index end = innerNnz ? (outer ? outer[i] : 0) + innerNnz[i] : (outer ? outer[i + 1] : mat.nonZeros()); |
| ResScalar sum0(0), sum1(0); |
| for (; k < end; ++k) { |
| sum0 += vals[k] * rhs.coeff(inds[k], c); |
| ++k; |
| if (k < end) { |
| sum1 += vals[k] * rhs.coeff(inds[k], c); |
| } |
| } |
| res.coeffRef(i, c) += alpha * (sum0 + sum1); |
| } |
| } |
| |
| // Iterator fallback path |
| static void runCol(const LhsEval& lhsEval, const SparseLhsType& /*lhs*/, const DenseRhsType& rhs, DenseResType& res, |
| const ResScalar& alpha, Index n, Index c, std::false_type /* has_compressed_storage */) { |
| #ifdef EIGEN_HAS_OPENMP |
| Index threads = Eigen::nbThreads(); |
| if (threads > 1 && lhsEval.nonZerosEstimate() > 20000) { |
| #pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads) |
| for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c); |
| } else |
| #endif |
| { |
| for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c); |
| } |
| } |
| |
| static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res, const ResScalar& alpha, |
| Index i, Index col) { |
| ResScalar tmp_a(0); |
| ResScalar tmp_b(0); |
| for (LhsInnerIterator it(lhsEval, i); it; ++it) { |
| tmp_a += it.value() * rhs.coeff(it.index(), col); |
| ++it; |
| if (it) { |
| tmp_b += it.value() * rhs.coeff(it.index(), col); |
| } |
| } |
| res.coeffRef(i, col) += alpha * (tmp_a + tmp_b); |
| } |
| }; |
| |
| // ColMajor lhs, rhs handled one column at a time (ColPerCol=true): CSC SpMV that adds alpha * lhs * rhs into res by scattering each lhs column j, scaled by alpha * rhs(j, c), into column c of res. |
| template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType> |
| struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, AlphaType, ColMajor, true> { |
| typedef internal::remove_all_t<SparseLhsType> Lhs; |
| typedef internal::remove_all_t<DenseRhsType> Rhs; |
| typedef internal::remove_all_t<DenseResType> Res; |
| typedef evaluator<Lhs> LhsEval; |
| typedef typename LhsEval::InnerIterator LhsInnerIterator; |
| |
| static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha) { |
| runImpl(lhs, rhs, res, alpha, std::integral_constant<bool, has_compressed_storage<Lhs>::value>()); |
| } |
| |
| // Direct pointer path: works for both compressed and non-compressed storage (a non-null innerNonZeroPtr() supplies per-column entry counts; otherwise consecutive outer indices bound each column). |
| static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha, |
| std::true_type /* has_compressed_storage */) { |
| typedef typename Lhs::Scalar LhsScalar; |
| typedef typename Lhs::StorageIndex StorageIndex; |
| const Lhs& mat = lhs; |
| const LhsScalar* vals = mat.valuePtr(); |
| const StorageIndex* inds = mat.innerIndexPtr(); |
| // Sparse vectors don't store outer indices, so `outer` may be null; every use below is guarded accordingly. |
| const auto* outer = mat.outerIndexPtr(); |
| const auto* innerNnz = mat.innerNonZeroPtr(); |
| // The fast result pointer path requires contiguous ColMajor result layout. |
| // Transpose<ColMajor> reports innerStride()==1 but is actually RowMajor, so check both. |
| EIGEN_IF_CONSTEXPR (!(Res::Flags & RowMajorBit)) { |
| if (res.innerStride() == 1) { |
| const Index n = lhs.outerSize(); |
| // The threaded scatter+reduce path relies on a thread_local scratch buffer for |
| // host-thread safety (see below), so it is only available where thread_local is |
| // usable; under EIGEN_AVOID_THREAD_LOCAL fall through to the serial scatter. |
| #if defined(EIGEN_HAS_OPENMP) && !defined(EIGEN_AVOID_THREAD_LOCAL) |
| typedef typename Res::Scalar ResScalar; |
| const Index m = res.rows(); |
| const Index threads = Eigen::nbThreads(); |
| // Per-thread scratch + reduction: the natural per-column partition would |
| // race on the output (writes to y[inds[k]] are scattered across rows), |
| // so each thread accumulates into its own m-sized output buffer and the |
| // results are summed at the end. Activated above the same 20000-nnz |
| // threshold as the RowMajor kernel, plus a second gate on per-thread |
| // scratch size: `threads * m` scalars are touched by the reduction, |
| // and on tall / very-sparse matrices that can dwarf the SpMV cost -- |
| // require avg nnz per row >= threads so the reduction can't dominate. |
| // `outer` is null for SparseVector lhs; the nnz-balanced partition needs |
| // the outer-index array, so fall back to the serial scatter below. |
| if (outer && threads > 1 && mat.nonZeros() > 20000 && mat.nonZeros() >= Index(threads) * m) { |
| // Per-calling-thread persistent scratch (per template instantiation). |
| // Grows monotonically; reused across calls. The buffer is left at |
| // all-zeros after each call by folding the zero-out into the reduction |
| // step, which avoids a separate init pass on every SpMV. `thread_local` |
| // is required: two unrelated host threads concurrently calling |
| // y = A*x would otherwise race on this static buffer (and a reallocating |
| // grow on one would dangle the other's scratch_ptr). |
| thread_local static std::vector<ResScalar> scratch_buf; |
| const std::size_t need = static_cast<std::size_t>(threads) * static_cast<std::size_t>(m); |
| if (scratch_buf.size() < need) scratch_buf.assign(need, ResScalar(0)); |
| ResScalar* scratch_ptr = scratch_buf.data(); |
| // nnz-balanced column partition: each thread t owns the contiguous |
| // column range [part[t], part[t+1]). Deterministic mapping of j to |
| // thread (required for bit-reproducible reduction below) AND |
| // nnz-balanced load (dynamic scheduling would balance but break |
| // determinism; static round-robin would be deterministic but |
| // imbalanced on skewed matrices). |
| std::vector<Index> part(static_cast<std::size_t>(threads) + 1); |
| part[threads] = n; |
| part[0] = 0; |
| // Targets are monotonically increasing in t, so each lower_bound starts |
| // from the previous result and only searches the remaining suffix of the |
| // outer-index array rather than the whole array every time. |
| const StorageIndex* const part_last = outer + n + 1; |
| const StorageIndex* part_lo = outer; |
| for (Index t = 1; t < threads; ++t) { |
| const Index target = (t * mat.nonZeros()) / threads; |
| part_lo = std::lower_bound(part_lo, part_last, StorageIndex(target)); |
| part[t] = part_lo - outer; |
| } |
| for (Index c = 0; c < rhs.cols(); ++c) { |
| typename Res::Scalar* y = res.data() + c * res.outerStride(); |
| #pragma omp parallel for schedule(static, 1) num_threads(threads) |
| for (Index t = 0; t < threads; ++t) { |
| ResScalar* yt = scratch_ptr + t * m; |
| const Index j_lo = part[t], j_hi = part[t + 1]; |
| for (Index j = j_lo; j < j_hi; ++j) { |
| typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * |
| rhs.coeff(j, c)); |
| const Index start = outer ? outer[j] : 0; |
| const Index end = innerNnz ? start + innerNnz[j] : (outer ? outer[j + 1] : mat.nonZeros()); |
| Index k = start; |
| for (; k + 3 < end; k += 4) { |
| yt[inds[k]] += vals[k] * rhs_j; |
| yt[inds[k + 1]] += vals[k + 1] * rhs_j; |
| yt[inds[k + 2]] += vals[k + 2] * rhs_j; |
| yt[inds[k + 3]] += vals[k + 3] * rhs_j; |
| } |
| for (; k < end; ++k) yt[inds[k]] += vals[k] * rhs_j; |
| } |
| } |
| // Reduce per-thread buffers into y AND zero them, so the next call |
| // doesn't have to re-init. Process rows in cache-resident blocks: the |
| // natural [t*m+i] scratch layout makes the cross-thread read for a |
| // single row a stride-m gather (one cache line per thread, ~threads*m |
| // bytes apart, unvectorizable). Blocking lets each thread's stripe be |
| // swept as a unit-stride, vectorizable stream into a small per-block |
| // accumulator. Each thread owns a contiguous range of row blocks |
| // (static schedule) so the zero stores stay independent, and the |
| // accumulator is summed in the exact t = 0..threads-1 order, keeping |
| // the result bit-identical to the scalar reduction it replaces. |
| constexpr Index kReduceBlock = 512; |
| #pragma omp parallel for schedule(static) num_threads(threads) |
| for (Index i0 = 0; i0 < m; i0 += kReduceBlock) { |
| const Index i1 = numext::mini(i0 + kReduceBlock, m); |
| const Index len = i1 - i0; |
| EIGEN_ALIGN_MAX ResScalar acc[kReduceBlock]; |
| for (Index ii = 0; ii < len; ++ii) acc[ii] = ResScalar(0); |
| for (Index t = 0; t < threads; ++t) { |
| ResScalar* row = scratch_ptr + t * m + i0; |
| for (Index ii = 0; ii < len; ++ii) { |
| acc[ii] += row[ii]; |
| row[ii] = ResScalar(0); |
| } |
| } |
| for (Index ii = 0; ii < len; ++ii) y[i0 + ii] += acc[ii]; |
| } |
| } |
| } else |
| #endif |
| { |
| for (Index c = 0; c < rhs.cols(); ++c) { |
| typename Res::Scalar* y = res.data() + c * res.outerStride(); |
| for (Index j = 0; j < n; ++j) { |
| typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c)); |
| const Index start = outer ? outer[j] : 0; |
| const Index end = innerNnz ? start + innerNnz[j] : (outer ? outer[j + 1] : mat.nonZeros()); |
| Index k = start; |
| // 4-way unrolled scatter-add (no SIMD: writes are scattered); the loop after it handles the remaining 0-3 entries. |
| for (; k + 3 < end; k += 4) { |
| y[inds[k]] += vals[k] * rhs_j; |
| y[inds[k + 1]] += vals[k + 1] * rhs_j; |
| y[inds[k + 2]] += vals[k + 2] * rhs_j; |
| y[inds[k + 3]] += vals[k + 3] * rhs_j; |
| } |
| for (; k < end; ++k) y[inds[k]] += vals[k] * rhs_j; |
| } |
| } |
| } |
| return; |
| } |
| } |
| // RowMajor result or non-unit result stride: use coeffRef() for result access |
| for (Index c = 0; c < rhs.cols(); ++c) { |
| for (Index j = 0; j < lhs.outerSize(); ++j) { |
| typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c)); |
| const Index start = outer ? outer[j] : 0; |
| const Index end = innerNnz ? start + innerNnz[j] : (outer ? outer[j + 1] : mat.nonZeros()); |
| for (Index k = start; k < end; ++k) res.coeffRef(inds[k], c) += vals[k] * rhs_j; |
| } |
| } |
| } |
| |
| // Iterator-based fallback for lhs types without has_compressed_storage: the same scatter-add, driven by the evaluator's InnerIterator and coeffRef() on the result. |
| static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha, |
| std::false_type /* has_compressed_storage */) { |
| LhsEval lhsEval(lhs); |
| for (Index c = 0; c < rhs.cols(); ++c) { |
| for (Index j = 0; j < lhs.outerSize(); ++j) { |
| typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c)); |
| for (LhsInnerIterator it(lhsEval, j); it; ++it) res.coeffRef(it.index(), c) += it.value() * rhs_j; |
| } |
| } |
| } |
| }; |
| |
| // RowMajor, multiple columns (ColPerCol=false): sparse * dense_matrix |
| template <typename SparseLhsType, typename DenseRhsType, typename DenseResType> |
| struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar, |
| RowMajor, false> { |
| typedef internal::remove_all_t<SparseLhsType> Lhs; |
| typedef internal::remove_all_t<DenseResType> Res; |
| typedef evaluator<Lhs> LhsEval; |
| typedef typename LhsEval::InnerIterator LhsInnerIterator; |
| |
| static constexpr bool IsCompressedLhs = has_compressed_storage<Lhs>::value; |
| |
| static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, |
| const typename Res::Scalar& alpha) { |
| Index n = lhs.rows(); |
| LhsEval lhsEval(lhs); |
| |
| #ifdef EIGEN_HAS_OPENMP |
| Index threads = Eigen::nbThreads(); |
| // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems. |
| // It basically represents the minimal amount of work to be done to be worth it. |
| if (threads > 1 && lhsEval.nonZerosEstimate() * rhs.cols() > 20000) { |
| #pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads) |
| for (Index i = 0; i < n; ++i) |
| processRow(lhsEval, lhs, rhs, res, alpha, i, std::integral_constant<bool, IsCompressedLhs>()); |
| } else |
| #endif |
| { |
| for (Index i = 0; i < n; ++i) |
| processRow(lhsEval, lhs, rhs, res, alpha, i, std::integral_constant<bool, IsCompressedLhs>()); |
| } |
| } |
| |
| // Direct pointer path: works for both compressed and non-compressed storage. |
| static void processRow(const LhsEval& /*lhsEval*/, const SparseLhsType& lhs, const DenseRhsType& rhs, Res& res, |
| const typename Res::Scalar& alpha, Index i, std::true_type /* has_compressed_storage */) { |
| typedef typename Lhs::Scalar LhsScalar; |
| typedef typename Lhs::StorageIndex StorageIndex; |
| const Lhs& mat = lhs; |
| const LhsScalar* vals = mat.valuePtr(); |
| const StorageIndex* inds = mat.innerIndexPtr(); |
| // Sparse vectors don't store outer indices. |
| const Index start = mat.outerIndexPtr() ? mat.outerIndexPtr()[i] : 0; |
| const auto* innerNnz = mat.innerNonZeroPtr(); |
| const Index end = |
| innerNnz ? start + innerNnz[i] : (mat.outerIndexPtr() ? mat.outerIndexPtr()[i + 1] : mat.nonZeros()); |
| typename Res::RowXpr res_i(res.row(i)); |
| for (Index k = start; k < end; ++k) res_i += (alpha * vals[k]) * rhs.row(inds[k]); |
| } |
| |
| static void processRow(const LhsEval& lhsEval, const SparseLhsType& /*lhs*/, const DenseRhsType& rhs, Res& res, |
| const typename Res::Scalar& alpha, Index i, std::false_type /* has_compressed_storage */) { |
| typename Res::RowXpr res_i(res.row(i)); |
| for (LhsInnerIterator it(lhsEval, i); it; ++it) res_i += (alpha * it.value()) * rhs.row(it.index()); |
| } |
| }; |
| |
| // ColMajor, multiple columns (ColPerCol=false): sparse * dense_matrix |
| template <typename SparseLhsType, typename DenseRhsType, typename DenseResType> |
| struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar, |
| ColMajor, false> { |
| typedef internal::remove_all_t<SparseLhsType> Lhs; |
| typedef internal::remove_all_t<DenseRhsType> Rhs; |
| typedef internal::remove_all_t<DenseResType> Res; |
| typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator; |
| |
| static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, |
| const typename Res::Scalar& alpha) { |
| runImpl(lhs, rhs, res, alpha, std::integral_constant<bool, has_compressed_storage<Lhs>::value>()); |
| } |
| |
| // Direct pointer path: works for both compressed and non-compressed storage. |
| static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, |
| const typename Res::Scalar& alpha, std::true_type /* has_compressed_storage */) { |
| typedef typename Lhs::Scalar LhsScalar; |
| typedef typename Lhs::StorageIndex StorageIndex; |
| const Lhs& mat = lhs; |
| const LhsScalar* vals = mat.valuePtr(); |
| const StorageIndex* inds = mat.innerIndexPtr(); |
| // Sparse vectors don't store outer indices. |
| const auto* outer = mat.outerIndexPtr(); |
| const auto* innerNnz = mat.innerNonZeroPtr(); |
| for (Index j = 0; j < lhs.outerSize(); ++j) { |
| typename Rhs::ConstRowXpr rhs_j(rhs.row(j)); |
| const Index start = outer ? outer[j] : 0; |
| const Index end = innerNnz ? start + innerNnz[j] : (outer ? outer[j + 1] : mat.nonZeros()); |
| for (Index k = start; k < end; ++k) res.row(inds[k]) += (alpha * vals[k]) * rhs_j; |
| } |
| } |
| |
| static void runImpl(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, |
| const typename Res::Scalar& alpha, std::false_type /* has_compressed_storage */) { |
| evaluator<Lhs> lhsEval(lhs); |
| for (Index j = 0; j < lhs.outerSize(); ++j) { |
| typename Rhs::ConstRowXpr rhs_j(rhs.row(j)); |
| for (LhsInnerIterator it(lhsEval, j); it; ++it) res.row(it.index()) += (alpha * it.value()) * rhs_j; |
| } |
| } |
| }; |
| |
| template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType> |
| inline void sparse_time_dense_product(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, |
| const AlphaType& alpha) { |
| sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, AlphaType>::run(lhs, rhs, res, alpha); |
| } |
| |
| } // end namespace internal |
| |
| namespace internal { |
| |
| template <typename Lhs, typename Rhs, int ProductType> |
| struct generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType> |
| : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType> > { |
| typedef typename Product<Lhs, Rhs>::Scalar Scalar; |
| |
| template <typename Dest> |
| static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { |
| typedef typename nested_eval<Lhs, ((Rhs::Flags & RowMajorBit) == 0) ? 1 : Rhs::ColsAtCompileTime>::type LhsNested; |
| typedef typename nested_eval<Rhs, ((Lhs::Flags & RowMajorBit) == 0) ? 1 : Dynamic>::type RhsNested; |
| LhsNested lhsNested(lhs); |
| RhsNested rhsNested(rhs); |
| internal::sparse_time_dense_product(lhsNested, rhsNested, dst, alpha); |
| } |
| }; |
| |
| template <typename Lhs, typename Rhs, int ProductType> |
| struct generic_product_impl<Lhs, Rhs, SparseTriangularShape, DenseShape, ProductType> |
| : generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType> {}; |
| |
| template <typename Lhs, typename Rhs, int ProductType> |
| struct generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType> |
| : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType> > { |
| typedef typename Product<Lhs, Rhs>::Scalar Scalar; |
| |
| template <typename Dst> |
| static void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { |
| typedef typename nested_eval<Lhs, ((Rhs::Flags & RowMajorBit) == 0) ? Dynamic : 1>::type LhsNested; |
| typedef typename nested_eval<Rhs, ((Lhs::Flags & RowMajorBit) == RowMajorBit) ? 1 : Lhs::RowsAtCompileTime>::type |
| RhsNested; |
| LhsNested lhsNested(lhs); |
| RhsNested rhsNested(rhs); |
| |
| // transpose everything |
| Transpose<Dst> dstT(dst); |
| internal::sparse_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha); |
| } |
| }; |
| |
| template <typename Lhs, typename Rhs, int ProductType> |
| struct generic_product_impl<Lhs, Rhs, DenseShape, SparseTriangularShape, ProductType> |
| : generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType> {}; |
| |
| template <typename LhsT, typename RhsT, bool NeedToTranspose> |
| struct sparse_dense_outer_product_evaluator { |
| protected: |
| typedef std::conditional_t<NeedToTranspose, RhsT, LhsT> Lhs1; |
| typedef std::conditional_t<NeedToTranspose, LhsT, RhsT> ActualRhs; |
| typedef Product<LhsT, RhsT, DefaultProduct> ProdXprType; |
| |
| // if the actual left-hand side is a dense vector, |
| // then build a sparse-view so that we can seamlessly iterate over it. |
| typedef std::conditional_t<std::is_same<typename internal::traits<Lhs1>::StorageKind, Sparse>::value, Lhs1, |
| SparseView<Lhs1> > |
| ActualLhs; |
| typedef std::conditional_t<std::is_same<typename internal::traits<Lhs1>::StorageKind, Sparse>::value, Lhs1 const&, |
| SparseView<Lhs1> > |
| LhsArg; |
| |
| typedef evaluator<ActualLhs> LhsEval; |
| typedef evaluator<ActualRhs> RhsEval; |
| typedef typename evaluator<ActualLhs>::InnerIterator LhsIterator; |
| typedef typename ProdXprType::Scalar Scalar; |
| |
| public: |
| enum { Flags = NeedToTranspose ? RowMajorBit : 0, CoeffReadCost = HugeCost }; |
| |
| class InnerIterator : public LhsIterator { |
| public: |
| InnerIterator(const sparse_dense_outer_product_evaluator& xprEval, Index outer) |
| : LhsIterator(xprEval.m_lhsXprImpl, 0), |
| m_outer(outer), |
| m_empty(false), |
| m_factor(get(xprEval.m_rhsXprImpl, outer, typename internal::traits<ActualRhs>::StorageKind())) {} |
| |
| EIGEN_STRONG_INLINE Index outer() const { return m_outer; } |
| EIGEN_STRONG_INLINE Index row() const { return NeedToTranspose ? m_outer : LhsIterator::index(); } |
| EIGEN_STRONG_INLINE Index col() const { return NeedToTranspose ? LhsIterator::index() : m_outer; } |
| |
| EIGEN_STRONG_INLINE Scalar value() const { return LhsIterator::value() * m_factor; } |
| EIGEN_STRONG_INLINE operator bool() const { return LhsIterator::operator bool() && (!m_empty); } |
| |
| protected: |
| Scalar get(const RhsEval& rhs, Index outer, Dense = Dense()) const { return rhs.coeff(outer); } |
| |
| Scalar get(const RhsEval& rhs, Index outer, Sparse = Sparse()) { |
| typename RhsEval::InnerIterator it(rhs, outer); |
| if (it && it.index() == 0 && it.value() != Scalar(0)) return it.value(); |
| m_empty = true; |
| return Scalar(0); |
| } |
| |
| Index m_outer; |
| bool m_empty; |
| Scalar m_factor; |
| }; |
| |
| sparse_dense_outer_product_evaluator(const Lhs1& lhs, const ActualRhs& rhs) |
| : m_lhs(lhs), m_lhsXprImpl(m_lhs), m_rhsXprImpl(rhs) { |
| EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); |
| } |
| |
| // transpose case |
| sparse_dense_outer_product_evaluator(const ActualRhs& rhs, const Lhs1& lhs) |
| : m_lhs(lhs), m_lhsXprImpl(m_lhs), m_rhsXprImpl(rhs) { |
| EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); |
| } |
| |
| protected: |
| const LhsArg m_lhs; |
| evaluator<ActualLhs> m_lhsXprImpl; |
| evaluator<ActualRhs> m_rhsXprImpl; |
| }; |
| |
| // sparse * dense outer product |
| template <typename Lhs, typename Rhs> |
| struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, OuterProduct, SparseShape, DenseShape> |
| : sparse_dense_outer_product_evaluator<Lhs, Rhs, Lhs::IsRowMajor> { |
| typedef sparse_dense_outer_product_evaluator<Lhs, Rhs, Lhs::IsRowMajor> Base; |
| |
| typedef Product<Lhs, Rhs> XprType; |
| typedef typename XprType::PlainObject PlainObject; |
| |
| explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs()) {} |
| }; |
| |
| template <typename Lhs, typename Rhs> |
| struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, OuterProduct, DenseShape, SparseShape> |
| : sparse_dense_outer_product_evaluator<Lhs, Rhs, Rhs::IsRowMajor> { |
| typedef sparse_dense_outer_product_evaluator<Lhs, Rhs, Rhs::IsRowMajor> Base; |
| |
| typedef Product<Lhs, Rhs> XprType; |
| typedef typename XprType::PlainObject PlainObject; |
| |
| explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs()) {} |
| }; |
| |
| } // end namespace internal |
| |
| } // end namespace Eigen |
| |
| #endif // EIGEN_SPARSEDENSEPRODUCT_H |