// This file is part of Eigen, a lightweight C++ template library
// for linear algebra. Eigen itself is part of the KDE project.
//
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.
#ifndef EIGEN_CACHE_FRIENDLY_PRODUCT_H
#define EIGEN_CACHE_FRIENDLY_PRODUCT_H
#ifndef EIGEN_EXTERN_INSTANTIATIONS
template<typename Scalar>
static void ei_cache_friendly_product(
int _rows, int _cols, int depth,
bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride,
bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride,
bool resRowMajor, Scalar* res, int resStride)
{
const Scalar* __restrict__ lhs;
const Scalar* __restrict__ rhs;
int lhsStride, rhsStride, rows, cols;
bool lhsRowMajor;
if (resRowMajor)
{
lhs = _rhs;
rhs = _lhs;
lhsStride = _rhsStride;
rhsStride = _lhsStride;
cols = _rows;
rows = _cols;
lhsRowMajor = !_rhsRowMajor;
ei_assert(_lhsRowMajor);
}
else
{
lhs = _lhs;
rhs = _rhs;
lhsStride = _lhsStride;
rhsStride = _rhsStride;
rows = _rows;
cols = _cols;
lhsRowMajor = _lhsRowMajor;
ei_assert(!_rhsRowMajor);
}
typedef typename ei_packet_traits<Scalar>::type PacketType;
enum {
PacketSize = sizeof(PacketType)/sizeof(Scalar),
#if (defined __i386__)
// i386 architecture provides only 8 xmm registers,
// so let's reduce the max number of rows processed at once.
MaxBlockRows = 4,
MaxBlockRows_ClampingMask = 0xFFFFFC,
#else
MaxBlockRows = 8,
MaxBlockRows_ClampingMask = 0xFFFFF8,
#endif
// maximal size of the blocks that fit in the L2 cache
MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar)
};
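// Concretely (an illustrative instantiation, not a new constraint): for float with
// 16-byte SSE packets, PacketSize==4, MaxBlockRows==8 (4 on i386), and
// MaxL2BlockSize is the tuned L2 cache size in bytes divided by sizeof(float).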
const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));
const int remainingSize = depth % PacketSize;
const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries
const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize;
const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize;
const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
Scalar* __restrict__ block = 0;
const int allocBlockSize = sizeof(Scalar)*l2BlockRows*size;
if (allocBlockSize>16000000)
block = (Scalar*)malloc(allocBlockSize);
else
block = (Scalar*)alloca(allocBlockSize);
Scalar* __restrict__ rhsCopy = (Scalar*)alloca(sizeof(Scalar)*l2BlockSize);
// loop over each L2-cache-friendly block of the result
for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
{
const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows);
const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask; // end of the rows aligned to bw
const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW; // number of remaining rows
//const int l2blockRowEndBWPlusOne = l2blockRowEndBW + (l2blockRemainingRows?0:MaxBlockRows);
// build a cache-friendly packed copy of the current block of lhs
int count = 0;
// copy an l2BlockRows x l2BlockSize block of lhs into contiguous sub-blocks of MaxBlockRows x PacketSize coefficients
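// Packed layout (illustrative, assuming MaxBlockRows==4 and PacketSize==4): for a
// tile starting at row i and depth k, 'block' stores lhs(i+0, k..k+3), then
// lhs(i+1, k..k+3), lhs(i+2, k..k+3), lhs(i+3, k..k+3) contiguously, so the inner
// kernel below can read one aligned packet per row of the tile.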
asm("#eigen begin buildblocks");
for(int l2k=0; l2k<size; l2k+=l2BlockSize)
{
const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
for (int i = l2i; i<l2blockRowEndBW/*PlusOne*/; i+=MaxBlockRows)
{
// TODO merge the if l2blockRemainingRows
// const int blockRows = std::min(i+MaxBlockRows, rows) - i;
for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
{
// TODO write these loops using meta unrolling
// negligible for large matrices but useful for small ones
if (lhsRowMajor)
{
for (int w=0; w<MaxBlockRows; ++w)
for (int s=0; s<PacketSize; ++s)
block[count++] = lhs[(i+w)*lhsStride + (k+s)];
}
else
{
for (int w=0; w<MaxBlockRows; ++w)
for (int s=0; s<PacketSize; ++s)
block[count++] = lhs[(i+w) + (k+s)*lhsStride];
}
}
}
if (l2blockRemainingRows>0)
{
for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
{
if (lhsRowMajor)
{
for (int w=0; w<l2blockRemainingRows; ++w)
for (int s=0; s<PacketSize; ++s)
block[count++] = lhs[(l2blockRowEndBW+w)*lhsStride + (k+s)];
}
else
{
for (int w=0; w<l2blockRemainingRows; ++w)
for (int s=0; s<PacketSize; ++s)
block[count++] = lhs[(l2blockRowEndBW+w) + (k+s)*lhsStride];
}
}
}
}
asm("#eigen end buildblocks");
for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
{
int l2blockColEnd = std::min(l2j+l2BlockCols, cols);
for(int l2k=0; l2k<size; l2k+=l2BlockSize)
{
// accumulate MaxBlockRows rows of lhs times a single column of rhs into a MaxBlockRows x 1 block of res
int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
// for each MaxBlockRows x 1 block of the result
for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows)
{
for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
{
int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows;
const Scalar* __restrict__ localB = &block[offsetblock];
const Scalar* __restrict__ rhsColumn = &(rhs[l1j*rhsStride]);
// copy unaligned rhs data
// YES, it seems to be faster to copy some parts of rhs multiple times
// to aligned memory rather than using unaligned loads.
// Moreover, this avoids an "if" in the innermost loop :)
if (PacketSize>1 && size_t(rhsColumn)%16)
{
int count = 0;
for (int k = l2k; k<l2blockSizeEnd; ++k)
{
rhsCopy[count++] = rhsColumn[k];
}
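// rebase the pointer so that the rhsColumn[k] indexing below, with k starting
// at l2k, still addresses the freshly copied (and aligned) data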
rhsColumn = &(rhsCopy[-l2k]);
}
PacketType dst[MaxBlockRows];
dst[0] = ei_pset1(Scalar(0.));
dst[1] = dst[0];
dst[2] = dst[0];
dst[3] = dst[0];
if (MaxBlockRows==8)
{
dst[4] = dst[0];
dst[5] = dst[0];
dst[6] = dst[0];
dst[7] = dst[0];
}
PacketType tmp;
asm("#eigen begincore");
for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
{
tmp = ei_pload(&rhsColumn[k]);
dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows ])), dst[0]);
dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+ PacketSize])), dst[1]);
dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+2*PacketSize])), dst[2]);
dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+3*PacketSize])), dst[3]);
if (MaxBlockRows==8)
{
dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+4*PacketSize])), dst[4]);
dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+5*PacketSize])), dst[5]);
dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+6*PacketSize])), dst[6]);
dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+7*PacketSize])), dst[7]);
}
}
Scalar* __restrict__ localRes = &(res[l1i + l1j*resStride]);
if (PacketSize>1 && resIsAligned)
{
ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(dst)));
if (PacketSize==2)
ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2]))));
if (MaxBlockRows==8)
{
ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4]))));
if (PacketSize==2)
ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6]))));
}
}
else
{
localRes[0] += ei_predux(dst[0]);
localRes[1] += ei_predux(dst[1]);
localRes[2] += ei_predux(dst[2]);
localRes[3] += ei_predux(dst[3]);
if (MaxBlockRows==8)
{
localRes[4] += ei_predux(dst[4]);
localRes[5] += ei_predux(dst[5]);
localRes[6] += ei_predux(dst[6]);
localRes[7] += ei_predux(dst[7]);
}
}
asm("#eigen endcore");
}
}
if (l2blockRemainingRows>0)
{
int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows;
const Scalar* localB = &block[offsetblock];
asm("#eigen begin dynkernel");
for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
{
const Scalar* __restrict__ rhsColumn = &(rhs[l1j*rhsStride]);
// copy unaligned rhs data
if (PacketSize>1 && size_t(rhsColumn)%16)
{
int count = 0;
for (int k = l2k; k<l2blockSizeEnd; ++k)
{
rhsCopy[count++] = rhsColumn[k];
}
rhsColumn = &(rhsCopy[-l2k]);
}
PacketType dst[MaxBlockRows];
dst[0] = ei_pset1(Scalar(0.));
dst[1] = dst[0];
dst[2] = dst[0];
dst[3] = dst[0];
if (MaxBlockRows>4)
{
dst[4] = dst[0];
dst[5] = dst[0];
dst[6] = dst[0];
dst[7] = dst[0];
}
// let's declare a few other temporary registers
PacketType tmp;
for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
{
tmp = ei_pload(&rhsColumn[k]);
dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows ])), dst[0]);
if (l2blockRemainingRows>=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+ PacketSize])), dst[1]);
if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]);
if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]);
if (MaxBlockRows>4)
{
if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]);
if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]);
if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]);
if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]);
}
}
Scalar* __restrict__ localRes = &(res[l2blockRowEndBW + l1j*resStride]);
// process the remaining rows one at a time
localRes[0] += ei_predux(dst[0]);
if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]);
if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]);
if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]);
if (MaxBlockRows>4)
{
if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]);
if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]);
if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]);
if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]);
}
asm("#eigen end dynkernel");
}
}
}
}
}
if (PacketSize>1 && remainingSize)
{
if (lhsRowMajor)
{
for (int j=0; j<cols; ++j)
for (int i=0; i<rows; ++i)
{
Scalar tmp = lhs[i*lhsStride+size] * rhs[j*rhsStride+size];
for (int k=1; k<remainingSize; ++k)
tmp += lhs[i*lhsStride+size+k] * rhs[j*rhsStride+size+k];
res[i+j*resStride] += tmp;
}
}
else
{
for (int j=0; j<cols; ++j)
for (int i=0; i<rows; ++i)
{
Scalar tmp = lhs[i+size*lhsStride] * rhs[j*rhsStride+size];
for (int k=1; k<remainingSize; ++k)
tmp += lhs[i+(size+k)*lhsStride] * rhs[j*rhsStride+size+k];
res[i+j*resStride] += tmp;
}
}
}
if (allocBlockSize>16000000)
free(block);
}
#endif // EIGEN_EXTERN_INSTANTIATIONS
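// Illustrative usage sketch of the kernel above (guarded out, hypothetical helper,
// not part of Eigen's API): a plain column-major product C += A * B with float
// scalars and square matrices of size N. Note that the kernel accumulates into res,
// so the result must be initialized by the caller.
#if 0
static void ei_cache_friendly_product_example(int N, const float* A, const float* B, float* C)
{
  // A, B, C are column-major N x N matrices with leading dimension N;
  // C must already hold the values to accumulate onto (e.g. zeros).
  ei_cache_friendly_product<float>(N, N, N,
                                   false, A, N,   // lhs: column-major, stride N
                                   false, B, N,   // rhs: column-major, stride N (see ei_assert above)
                                   false, C, N);  // res: column-major, stride N
}
#endif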
/* Optimized col-major matrix * vector product:
* This algorithm processes 4 columns at once, which both reduces the
* number of loads/stores of the result by a factor of 4 and reduces
* the instruction dependencies. Moreover, we know that all bands have the
* same alignment pattern.
* TODO: since rhs gets evaluated only once, no need to evaluate it
*/
template<typename Scalar, typename RhsType>
EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector(
int size,
const Scalar* lhs, int lhsStride,
const RhsType& rhs,
Scalar* res)
{
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2,OFFSET) \
ei_pstore(&res[j OFFSET], \
ei_padd(ei_pload(&res[j OFFSET]), \
ei_padd( \
ei_padd(ei_pmul(ptmp0,ei_pload ## A0(&lhs0[j OFFSET])),ei_pmul(ptmp1,ei_pload ## A13(&lhs1[j OFFSET]))), \
ei_padd(ei_pmul(ptmp2,ei_pload ## A2(&lhs2[j OFFSET])),ei_pmul(ptmp3,ei_pload ## A13(&lhs3[j OFFSET]))) )))
asm("#begin matrix_vector_product");
typedef typename ei_packet_traits<Scalar>::type Packet;
const int PacketSize = sizeof(Packet)/sizeof(Scalar);
enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
const int columnsAtOnce = 4;
const int peels = 2;
const int PacketAlignedMask = PacketSize-1;
const int PeelAlignedMask = PacketSize*peels-1;
const bool Vectorized = sizeof(Packet) != sizeof(Scalar);
// How many coefficients of the result do we have to skip to be aligned?
// Here we assume the data is at least aligned on the base scalar type, which is mandatory anyway.
const int alignedStart = Vectorized
? std::min<int>( (PacketSize - ((size_t(res)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size)
: 0;
const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask);
const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
const int alignmentStep = (PacketSize - lhsStride % PacketSize) & PacketAlignedMask;
int alignmentPattern = alignmentStep==0 ? AllAligned
: alignmentStep==2 ? EvenAligned
: FirstAligned;
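// e.g., with float and 16-byte packets (PacketSize==4): alignmentStep==0 means all
// columns of lhs share the same alignment offset (AllAligned), alignmentStep==2 means
// every second column does (EvenAligned), and otherwise only the first column of each
// group of 4 is treated as aligned (FirstAligned).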
// find how many columns we have to skip to be aligned with the result (if possible)
int skipColumns=0;
for (; skipColumns<PacketSize && alignedStart != alignmentStep*skipColumns; ++skipColumns)
{}
if (skipColumns==PacketSize)
{
// nothing can be aligned, no need to skip any column
alignmentPattern = NoneAligned;
skipColumns = 0;
}
else
{
skipColumns = std::min(skipColumns,rhs.size());
// note that the skipped columns are processed later.
}
int columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
for (int i=skipColumns; i<columnBound; i+=columnsAtOnce)
{
Packet ptmp0 = ei_pset1(rhs[i]), ptmp1 = ei_pset1(rhs[i+1]),
ptmp2 = ei_pset1(rhs[i+2]), ptmp3 = ei_pset1(rhs[i+3]);
const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+1)*lhsStride,
*lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+3)*lhsStride;
// process initial unaligned coeffs
for (int j=0; j<alignedStart; j++)
res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
if (alignedSize>alignedStart)
{
switch(alignmentPattern)
{
case AllAligned:
for (int j = alignedStart; j<alignedSize; j+=PacketSize)
_EIGEN_ACCUMULATE_PACKETS(,,,);
break;
case EvenAligned:
for (int j = alignedStart; j<alignedSize; j+=PacketSize)
_EIGEN_ACCUMULATE_PACKETS(,u,,);
break;
case FirstAligned:
if(peels>1)
{
// NOTE peeling with two _EIGEN_ACCUMULATE_PACKETS() is much less efficient
// than the following code
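// The hand-peeled loop below handles 2*PacketSize result coefficients per iteration
// using two independent multiply-add chains (A00... and A10...), which keeps more
// packet registers busy and hides the ei_pmadd latency better than issuing two
// dependent _EIGEN_ACCUMULATE_PACKETS() back to back.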
asm("#mybegin");
Packet A00, A01, A02, A03, A10, A11, A12, A13;
for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
{
A01 = ei_ploadu(&lhs1[j]); A11 = ei_ploadu(&lhs1[j+PacketSize]);
A02 = ei_ploadu(&lhs2[j]); A12 = ei_ploadu(&lhs2[j+PacketSize]);
A00 = ei_pload (&lhs0[j]); A10 = ei_pload (&lhs0[j+PacketSize]);
A00 = ei_pmadd(ptmp0, A00, ei_pload(&res[j]));
A10 = ei_pmadd(ptmp0, A10, ei_pload(&res[j+PacketSize]));
A00 = ei_pmadd(ptmp1, A01, A00); A10 = ei_pmadd(ptmp1, A11, A10);
A03 = ei_ploadu(&lhs3[j]); A13 = ei_ploadu(&lhs3[j+PacketSize]);
A00 = ei_pmadd(ptmp2, A02, A00); A10 = ei_pmadd(ptmp2, A12, A10);
A00 = ei_pmadd(ptmp3, A03, A00); A10 = ei_pmadd(ptmp3, A13, A10);
ei_pstore(&res[j],A00); ei_pstore(&res[j+PacketSize],A10);
}
asm("#myend");
}
for (int j = peeledSize; j<alignedSize; j+=PacketSize)
_EIGEN_ACCUMULATE_PACKETS(,u,u,);
break;
default:
for (int j = alignedStart; j<alignedSize; j+=PacketSize)
_EIGEN_ACCUMULATE_PACKETS(u,u,u,);
break;
}
}
// process remaining coeffs
for (int j=alignedSize; j<size; j++)
res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
}
// process the remaining columns: the skipped leading ones and the trailing ones (at most columnsAtOnce-1)
int end = rhs.size();
int start = columnBound;
do
{
for (int i=start; i<end; i++)
{
Packet ptmp0 = ei_pset1(rhs[i]);
const Scalar* lhs0 = lhs + i*lhsStride;
// process the first unaligned coeffs of the result
for (int j=0; j<alignedStart; j++)
res[j] += ei_pfirst(ptmp0) * lhs0[j];
// process aligned result's coeffs
if (((i*lhsStride) % PacketSize+alignedStart) == 0)
for (int j = alignedStart;j<alignedSize;j+=PacketSize)
ei_pstore(&res[j], ei_pmadd(ptmp0,ei_pload(&lhs0[j]),ei_pload(&res[j])));
else
for (int j = alignedStart;j<alignedSize;j+=PacketSize)
ei_pstore(&res[j], ei_pmadd(ptmp0,ei_ploadu(&lhs0[j]),ei_pload(&res[j])));
// process remaining scalars
for (int j=alignedSize; j<size; j++)
res[j] += ei_pfirst(ptmp0) * lhs0[j];
}
if (skipColumns)
{
start = 0;
end = skipColumns;
skipColumns = 0;
}
else
break;
} while(true);
asm("#end matrix_vector_product");
#undef _EIGEN_ACCUMULATE_PACKETS
}
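// Illustrative usage sketch of the column-major matrix * vector kernel above
// (guarded out, hypothetical helper, not part of Eigen's API). The rhs argument
// only needs size() and operator[], so a trivial wrapper over a raw array is enough.
#if 0
struct ei_example_vector_ref
{
  const float* data; int n;
  int size() const { return n; }
  const float& operator[](int i) const { return data[i]; }
};
static void ei_cache_friendly_colmajor_gemv_example(int rows, int cols,
                                                    const float* A, const float* x, float* y)
{
  // computes y += A * x with A column-major (lhsStride == rows);
  // y must already hold the values to accumulate onto (e.g. zeros)
  ei_example_vector_ref xref = { x, cols };
  ei_cache_friendly_product_colmajor_times_vector(rows, A, rows, xref, y);
}
#endif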
// TODO add peeling to mask unaligned load/stores
template<typename Scalar, typename ResType>
EIGEN_DONT_INLINE static void ei_cache_friendly_product_rowmajor_times_vector(
const Scalar* lhs, int lhsStride,
const Scalar* rhs, int rhsSize,
ResType& res)
{
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2,OFFSET) {\
Packet b = ei_pload(&rhs[j]); \
ptmp0 = ei_pmadd(b, ei_pload##A0 (&lhs0[j]), ptmp0); \
ptmp1 = ei_pmadd(b, ei_pload##A13(&lhs1[j]), ptmp1); \
ptmp2 = ei_pmadd(b, ei_pload##A2 (&lhs2[j]), ptmp2); \
ptmp3 = ei_pmadd(b, ei_pload##A13(&lhs3[j]), ptmp3); }
asm("#begin matrix_vector_product");
typedef typename ei_packet_traits<Scalar>::type Packet;
const int PacketSize = sizeof(Packet)/sizeof(Scalar);
enum { AllAligned, EvenAligned, FirstAligned, NoneAligned };
const int rowsAtOnce = 4;
// const int peels = 2;
const int PacketAlignedMask = PacketSize-1;
// const int PeelAlignedMask = PacketSize*peels-1;
const bool Vectorized = sizeof(Packet) != sizeof(Scalar);
const int size = rhsSize;
// How many coefficients of rhs do we have to skip to be aligned?
// Here we assume the data is at least aligned on the base scalar type, which is mandatory anyway.
const int alignedStart = Vectorized
? std::min<int>( (PacketSize - ((size_t(rhs)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size)
: 0;
const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask);
//const int peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0;
const int alignmentStep = (PacketSize - lhsStride % PacketSize) & PacketAlignedMask;
int alignmentPattern = alignmentStep==0 ? AllAligned
: alignmentStep==2 ? EvenAligned
: FirstAligned;
// find how many rows we have to skip to be aligned with rhs (if possible)
int skipRows=0;
for (; skipRows<PacketSize && alignedStart != alignmentStep*skipRows; ++skipRows)
{}
if (skipRows==PacketSize)
{
// nothing can be aligned, no need to skip any row
alignmentPattern = NoneAligned;
skipRows = 0;
}
else
{
skipRows = std::min(skipRows,res.size());
// note that the skipped rows are processed later.
}
int rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
for (int i=skipRows; i<rowBound; i+=rowsAtOnce)
{
Scalar tmp0 = Scalar(0), tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0);
Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0));
const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+1)*lhsStride,
*lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+3)*lhsStride;
// process initial unaligned coeffs
for (int j=0; j<alignedStart; j++)
{
Scalar b = rhs[j];
tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j];
}
if (alignedSize>alignedStart)
{
switch(alignmentPattern)
{
case AllAligned:
for (int j = alignedStart; j<alignedSize; j+=PacketSize)
_EIGEN_ACCUMULATE_PACKETS(,,,);
break;
case EvenAligned:
for (int j = alignedStart; j<alignedSize; j+=PacketSize)
_EIGEN_ACCUMULATE_PACKETS(,u,,);
break;
case FirstAligned:
for (int j = alignedStart; j<alignedSize; j+=PacketSize)
_EIGEN_ACCUMULATE_PACKETS(,u,u,);
break;
default:
for (int j = alignedStart; j<alignedSize; j+=PacketSize)
_EIGEN_ACCUMULATE_PACKETS(u,u,u,);
break;
}
tmp0 += ei_predux(ptmp0);
tmp1 += ei_predux(ptmp1);
tmp2 += ei_predux(ptmp2);
tmp3 += ei_predux(ptmp3);
}
// process remaining coeffs
for (int j=alignedSize; j<size; j++)
{
Scalar b = rhs[j];
tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j];
}
res[i] += tmp0; res[i+1] += tmp1; res[i+2] += tmp2; res[i+3] += tmp3;
}
// process the remaining rows: the skipped leading ones and the trailing ones (at most rowsAtOnce-1)
int end = res.size();
int start = rowBound;
do
{
for (int i=start; i<end; i++)
{
Scalar tmp0 = Scalar(0);
Packet ptmp0 = ei_pset1(tmp0);
const Scalar* lhs0 = lhs + i*lhsStride;
// process the first unaligned rhs coeffs
for (int j=0; j<alignedStart; j++)
tmp0 += rhs[j] * lhs0[j];
if (alignedSize>alignedStart)
{
// process aligned rhs coeffs
if ((i*lhsStride+alignedStart) % PacketSize==0)
for (int j = alignedStart;j<alignedSize;j+=PacketSize)
ptmp0 = ei_pmadd(ei_pload(&rhs[j]), ei_pload(&lhs0[j]), ptmp0);
else
for (int j = alignedStart;j<alignedSize;j+=PacketSize)
ptmp0 = ei_pmadd(ei_pload(&rhs[j]), ei_ploadu(&lhs0[j]), ptmp0);
tmp0 += ei_predux(ptmp0);
}
// process remaining scalars
for (int j=alignedSize; j<size; j++)
tmp0 += rhs[j] * lhs0[j];
res[i] += tmp0;
}
if (skipRows)
{
start = 0;
end = skipRows;
skipRows = 0;
}
else
break;
} while(true);
asm("#end matrix_vector_product");
#undef _EIGEN_ACCUMULATE_PACKETS
}
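// Illustrative usage sketch of the row-major matrix * vector kernel above
// (guarded out, hypothetical helper, not part of Eigen's API). Here the res
// argument is the one that needs size() and operator[].
#if 0
struct ei_example_mutable_vector_ref
{
  float* data; int n;
  int size() const { return n; }
  float& operator[](int i) const { return data[i]; }
};
static void ei_cache_friendly_rowmajor_gemv_example(int rows, int cols,
                                                    const float* A, const float* x, float* y)
{
  // computes y += A * x with A row-major (lhsStride == cols == rhsSize);
  // y must already hold the values to accumulate onto (e.g. zeros)
  ei_example_mutable_vector_ref yref = { y, rows };
  ei_cache_friendly_product_rowmajor_times_vector(A, cols, x, cols, yref);
}
#endif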
#endif // EIGEN_CACHE_FRIENDLY_PRODUCT_H