|  |  | 
|  | #include <iostream> | 
|  | #include <Eigen/Core> | 
|  | #include <bench/BenchTimer.h> | 
|  | using namespace Eigen; | 
|  |  | 
|  | #ifndef SIZE | 
|  | #define SIZE 50 | 
|  | #endif | 
|  |  | 
|  | #ifndef REPEAT | 
|  | #define REPEAT 10000 | 
|  | #endif | 
|  |  | 
|  | typedef float Scalar; | 
|  |  | 
|  | __attribute__((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size); | 
|  | __attribute__((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c); | 
|  | __attribute__((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c); | 
|  |  | 
|  | int main(int argc, char* argv[]) { | 
|  | int size = SIZE * 8; | 
|  | int size2 = size * size; | 
|  | Scalar* a = internal::aligned_new<Scalar>(size2); | 
|  | Scalar* b = internal::aligned_new<Scalar>(size2 + 4) + 1; | 
|  | Scalar* c = internal::aligned_new<Scalar>(size2); | 
|  |  | 
|  | for (int i = 0; i < size; ++i) { | 
|  | a[i] = b[i] = c[i] = 0; | 
|  | } | 
|  |  | 
|  | BenchTimer timer; | 
|  |  | 
|  | timer.reset(); | 
|  | for (int k = 0; k < 10; ++k) { | 
|  | timer.start(); | 
|  | benchVec(a, b, c, size2); | 
|  | timer.stop(); | 
|  | } | 
|  | std::cout << timer.value() << "s  " << (double(size2 * REPEAT) / timer.value()) / (1024. * 1024. * 1024.) | 
|  | << " GFlops\n"; | 
|  | return 0; | 
|  | for (int innersize = size; innersize > 2; --innersize) { | 
|  | if (size2 % innersize == 0) { | 
|  | int outersize = size2 / innersize; | 
|  | MatrixXf ma = Map<MatrixXf>(a, innersize, outersize); | 
|  | MatrixXf mb = Map<MatrixXf>(b, innersize, outersize); | 
|  | MatrixXf mc = Map<MatrixXf>(c, innersize, outersize); | 
|  | timer.reset(); | 
|  | for (int k = 0; k < 3; ++k) { | 
|  | timer.start(); | 
|  | benchVec(ma, mb, mc); | 
|  | timer.stop(); | 
|  | } | 
|  | std::cout << innersize << " x " << outersize << "  " << timer.value() << "s   " | 
|  | << (double(size2 * REPEAT) / timer.value()) / (1024. * 1024. * 1024.) << " GFlops\n"; | 
|  | } | 
|  | } | 
|  |  | 
|  | VectorXf va = Map<VectorXf>(a, size2); | 
|  | VectorXf vb = Map<VectorXf>(b, size2); | 
|  | VectorXf vc = Map<VectorXf>(c, size2); | 
|  | timer.reset(); | 
|  | for (int k = 0; k < 3; ++k) { | 
|  | timer.start(); | 
|  | benchVec(va, vb, vc); | 
|  | timer.stop(); | 
|  | } | 
|  | std::cout << timer.value() << "s   " << (double(size2 * REPEAT) / timer.value()) / (1024. * 1024. * 1024.) | 
|  | << " GFlops\n"; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c) { | 
|  | for (int k = 0; k < REPEAT; ++k) a = a + b; | 
|  | } | 
|  |  | 
|  | void benchVec(VectorXf& a, VectorXf& b, VectorXf& c) { | 
|  | for (int k = 0; k < REPEAT; ++k) a = a + b; | 
|  | } | 
|  |  | 
|  | void benchVec(Scalar* a, Scalar* b, Scalar* c, int size) { | 
|  | typedef internal::packet_traits<Scalar>::type PacketScalar; | 
|  | const int PacketSize = internal::packet_traits<Scalar>::size; | 
|  | PacketScalar a0, a1, a2, a3, b0, b1, b2, b3; | 
|  | for (int k = 0; k < REPEAT; ++k) | 
|  | for (int i = 0; i < size; i += PacketSize * 8) { | 
|  | //             a0 = internal::pload(&a[i]); | 
|  | //             b0 = internal::pload(&b[i]); | 
|  | //             a1 = internal::pload(&a[i+1*PacketSize]); | 
|  | //             b1 = internal::pload(&b[i+1*PacketSize]); | 
|  | //             a2 = internal::pload(&a[i+2*PacketSize]); | 
|  | //             b2 = internal::pload(&b[i+2*PacketSize]); | 
|  | //             a3 = internal::pload(&a[i+3*PacketSize]); | 
|  | //             b3 = internal::pload(&b[i+3*PacketSize]); | 
|  | //             internal::pstore(&a[i], internal::padd(a0, b0)); | 
|  | //             a0 = internal::pload(&a[i+4*PacketSize]); | 
|  | //             b0 = internal::pload(&b[i+4*PacketSize]); | 
|  | // | 
|  | //             internal::pstore(&a[i+1*PacketSize], internal::padd(a1, b1)); | 
|  | //             a1 = internal::pload(&a[i+5*PacketSize]); | 
|  | //             b1 = internal::pload(&b[i+5*PacketSize]); | 
|  | // | 
|  | //             internal::pstore(&a[i+2*PacketSize], internal::padd(a2, b2)); | 
|  | //             a2 = internal::pload(&a[i+6*PacketSize]); | 
|  | //             b2 = internal::pload(&b[i+6*PacketSize]); | 
|  | // | 
|  | //             internal::pstore(&a[i+3*PacketSize], internal::padd(a3, b3)); | 
|  | //             a3 = internal::pload(&a[i+7*PacketSize]); | 
|  | //             b3 = internal::pload(&b[i+7*PacketSize]); | 
|  | // | 
|  | //             internal::pstore(&a[i+4*PacketSize], internal::padd(a0, b0)); | 
|  | //             internal::pstore(&a[i+5*PacketSize], internal::padd(a1, b1)); | 
|  | //             internal::pstore(&a[i+6*PacketSize], internal::padd(a2, b2)); | 
|  | //             internal::pstore(&a[i+7*PacketSize], internal::padd(a3, b3)); | 
|  |  | 
|  | internal::pstore(&a[i + 2 * PacketSize], internal::padd(internal::ploadu(&a[i + 2 * PacketSize]), | 
|  | internal::ploadu(&b[i + 2 * PacketSize]))); | 
|  | internal::pstore(&a[i + 3 * PacketSize], internal::padd(internal::ploadu(&a[i + 3 * PacketSize]), | 
|  | internal::ploadu(&b[i + 3 * PacketSize]))); | 
|  | internal::pstore(&a[i + 4 * PacketSize], internal::padd(internal::ploadu(&a[i + 4 * PacketSize]), | 
|  | internal::ploadu(&b[i + 4 * PacketSize]))); | 
|  | internal::pstore(&a[i + 5 * PacketSize], internal::padd(internal::ploadu(&a[i + 5 * PacketSize]), | 
|  | internal::ploadu(&b[i + 5 * PacketSize]))); | 
|  | internal::pstore(&a[i + 6 * PacketSize], internal::padd(internal::ploadu(&a[i + 6 * PacketSize]), | 
|  | internal::ploadu(&b[i + 6 * PacketSize]))); | 
|  | internal::pstore(&a[i + 7 * PacketSize], internal::padd(internal::ploadu(&a[i + 7 * PacketSize]), | 
|  | internal::ploadu(&b[i + 7 * PacketSize]))); | 
|  | } | 
|  | } |