| |
| #include <iostream> |
| #include <Eigen/Core> |
| #include <bench/BenchTimer.h> |
| using namespace Eigen; |
| |
// Benchmark tuning knobs. Each default only applies when the macro has not
// already been defined (for instance on the compiler command line).

// Problem-size factor: main() works on buffers of (SIZE * 8)^2 elements.
#if !defined(SIZE)
#  define SIZE 50
#endif

// Number of times each benchVec() kernel repeats the addition per call.
#if !defined(REPEAT)
#  define REPEAT 10000
#endif
| |
| typedef float Scalar; |
| |
| __attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size); |
| __attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c); |
| __attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c); |
| |
| int main(int argc, char* argv[]) |
| { |
| int size = SIZE * 8; |
| int size2 = size * size; |
| Scalar* a = internal::aligned_new<Scalar>(size2); |
| Scalar* b = internal::aligned_new<Scalar>(size2+4)+1; |
| Scalar* c = internal::aligned_new<Scalar>(size2); |
| |
| for (int i=0; i<size; ++i) |
| { |
| a[i] = b[i] = c[i] = 0; |
| } |
| |
| BenchTimer timer; |
| |
| timer.reset(); |
| for (int k=0; k<10; ++k) |
| { |
| timer.start(); |
| benchVec(a, b, c, size2); |
| timer.stop(); |
| } |
| std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n"; |
| return 0; |
| for (int innersize = size; innersize>2 ; --innersize) |
| { |
| if (size2%innersize==0) |
| { |
| int outersize = size2/innersize; |
| MatrixXf ma = Map<MatrixXf>(a, innersize, outersize ); |
| MatrixXf mb = Map<MatrixXf>(b, innersize, outersize ); |
| MatrixXf mc = Map<MatrixXf>(c, innersize, outersize ); |
| timer.reset(); |
| for (int k=0; k<3; ++k) |
| { |
| timer.start(); |
| benchVec(ma, mb, mc); |
| timer.stop(); |
| } |
| std::cout << innersize << " x " << outersize << " " << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n"; |
| } |
| } |
| |
| VectorXf va = Map<VectorXf>(a, size2); |
| VectorXf vb = Map<VectorXf>(b, size2); |
| VectorXf vc = Map<VectorXf>(c, size2); |
| timer.reset(); |
| for (int k=0; k<3; ++k) |
| { |
| timer.start(); |
| benchVec(va, vb, vc); |
| timer.stop(); |
| } |
| std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n"; |
| |
| return 0; |
| } |
| |
| void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c) |
| { |
| for (int k=0; k<REPEAT; ++k) |
| a = a + b; |
| } |
| |
| void benchVec(VectorXf& a, VectorXf& b, VectorXf& c) |
| { |
| for (int k=0; k<REPEAT; ++k) |
| a = a + b; |
| } |
| |
| void benchVec(Scalar* a, Scalar* b, Scalar* c, int size) |
| { |
| typedef internal::packet_traits<Scalar>::type PacketScalar; |
| const int PacketSize = internal::packet_traits<Scalar>::size; |
| PacketScalar a0, a1, a2, a3, b0, b1, b2, b3; |
| for (int k=0; k<REPEAT; ++k) |
| for (int i=0; i<size; i+=PacketSize*8) |
| { |
| // a0 = internal::pload(&a[i]); |
| // b0 = internal::pload(&b[i]); |
| // a1 = internal::pload(&a[i+1*PacketSize]); |
| // b1 = internal::pload(&b[i+1*PacketSize]); |
| // a2 = internal::pload(&a[i+2*PacketSize]); |
| // b2 = internal::pload(&b[i+2*PacketSize]); |
| // a3 = internal::pload(&a[i+3*PacketSize]); |
| // b3 = internal::pload(&b[i+3*PacketSize]); |
| // internal::pstore(&a[i], internal::padd(a0, b0)); |
| // a0 = internal::pload(&a[i+4*PacketSize]); |
| // b0 = internal::pload(&b[i+4*PacketSize]); |
| // |
| // internal::pstore(&a[i+1*PacketSize], internal::padd(a1, b1)); |
| // a1 = internal::pload(&a[i+5*PacketSize]); |
| // b1 = internal::pload(&b[i+5*PacketSize]); |
| // |
| // internal::pstore(&a[i+2*PacketSize], internal::padd(a2, b2)); |
| // a2 = internal::pload(&a[i+6*PacketSize]); |
| // b2 = internal::pload(&b[i+6*PacketSize]); |
| // |
| // internal::pstore(&a[i+3*PacketSize], internal::padd(a3, b3)); |
| // a3 = internal::pload(&a[i+7*PacketSize]); |
| // b3 = internal::pload(&b[i+7*PacketSize]); |
| // |
| // internal::pstore(&a[i+4*PacketSize], internal::padd(a0, b0)); |
| // internal::pstore(&a[i+5*PacketSize], internal::padd(a1, b1)); |
| // internal::pstore(&a[i+6*PacketSize], internal::padd(a2, b2)); |
| // internal::pstore(&a[i+7*PacketSize], internal::padd(a3, b3)); |
| |
| internal::pstore(&a[i+2*PacketSize], internal::padd(internal::ploadu(&a[i+2*PacketSize]), internal::ploadu(&b[i+2*PacketSize]))); |
| internal::pstore(&a[i+3*PacketSize], internal::padd(internal::ploadu(&a[i+3*PacketSize]), internal::ploadu(&b[i+3*PacketSize]))); |
| internal::pstore(&a[i+4*PacketSize], internal::padd(internal::ploadu(&a[i+4*PacketSize]), internal::ploadu(&b[i+4*PacketSize]))); |
| internal::pstore(&a[i+5*PacketSize], internal::padd(internal::ploadu(&a[i+5*PacketSize]), internal::ploadu(&b[i+5*PacketSize]))); |
| internal::pstore(&a[i+6*PacketSize], internal::padd(internal::ploadu(&a[i+6*PacketSize]), internal::ploadu(&b[i+6*PacketSize]))); |
| internal::pstore(&a[i+7*PacketSize], internal::padd(internal::ploadu(&a[i+7*PacketSize]), internal::ploadu(&b[i+7*PacketSize]))); |
| } |
| } |