various update of of BTL
diff --git a/bench/btl/README b/bench/btl/README
index 787002f..f3f5fb3 100644
--- a/bench/btl/README
+++ b/bench/btl/README
@@ -43,10 +43,10 @@
   BTL_CONFIG="-a axpy:vector_matrix:trisolve:ata --overwrite" ctest -V -R eigen2
 
 4 : Analyze the result. different data files (.dat) are produced in each libs directories.
- If gnuplot is available, choose a directory name in the data directory to store the results and type
-	cd data
-	mkdir my_directory
-        cp ../libs/*/*.dat my_directory
+ If gnuplot is available, choose a directory name in the data directory to store the results and type:
+        $ cd data
+        $ mkdir my_directory
+        $ cp ../libs/*/*.dat my_directory
  Build the data utilities in this (data) directory
         make
  Then you can look the raw data,
diff --git a/bench/btl/actions/basic_actions.hh b/bench/btl/actions/basic_actions.hh
index 1e6e420..a23e580 100644
--- a/bench/btl/actions/basic_actions.hh
+++ b/bench/btl/actions/basic_actions.hh
@@ -12,7 +12,7 @@
 #include "action_trisolve.hh"
 
 #include "action_symv.hh"
-#include "action_symm.hh"
+// #include "action_symm.hh"
 #include "action_syr2.hh"
 
 // #include "action_lu_solve.hh"
diff --git a/bench/btl/cmake/FindATLAS.cmake b/bench/btl/cmake/FindATLAS.cmake
index bba350b..b4a984a 100644
--- a/bench/btl/cmake/FindATLAS.cmake
+++ b/bench/btl/cmake/FindATLAS.cmake
@@ -15,23 +15,25 @@
 find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
 find_library(ATLAS_LIB atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
 
-# find_file(ATLAS_CBLAS libcblas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-# find_library(ATLAS_CBLAS cblas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_file(ATLAS_CBLAS libcblas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_library(ATLAS_CBLAS cblas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
 
-# find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-# find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
 
-# find_file(ATLAS_LAPACK liblapack.so.3 PATHS /usr/lib/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-# find_library(ATLAS_LAPACK lapack PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+if(NOT ATLAS_LAPACK)
+  find_file(ATLAS_LAPACK liblapack.so.3 PATHS /usr/lib/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+  find_library(ATLAS_LAPACK lapack PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+endif(NOT ATLAS_LAPACK)
 
-# find_file(ATLAS_F77BLAS libf77blas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
-# find_library(ATLAS_F77BLAS f77blas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_file(ATLAS_F77BLAS libf77blas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
+find_library(ATLAS_F77BLAS f77blas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
 
-# if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
-set(ATLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_LAPACK}
-# ${ATLAS_CBLAS} ${ATLAS_LAPACK} ${ATLAS_F77BLAS}
-)
-# endif(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
+if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
+  
+  set(ATLAS_LIBRARIES ${ATLAS_LAPACK} ${ATLAS_CBLAS}  ${ATLAS_F77BLAS} ${ATLAS_LIB})
+  
+endif(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
 
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(ATLAS DEFAULT_MSG
diff --git a/bench/btl/cmake/FindGOTO.cmake b/bench/btl/cmake/FindGOTO.cmake
index b2b648b..ad7eb32 100644
--- a/bench/btl/cmake/FindGOTO.cmake
+++ b/bench/btl/cmake/FindGOTO.cmake
@@ -15,6 +15,10 @@
 find_file(GOTO_LIBRARIES libgotoblas.so PATHS /usr/lib $ENV{GOTODIR} ${LIB_INSTALL_DIR})
 find_library(GOTO_LIBRARIES gotoblas PATHS $ENV{GOTODIR} ${LIB_INSTALL_DIR})
 
+if(GOTO_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
+  set(GOTO_LIBRARIES ${GOTO_LIBRARIES} "-lpthread")
+endif(GOTO_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
+
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(GOTO DEFAULT_MSG
                                   GOTO_INCLUDES GOTO_LIBRARIES)
diff --git a/bench/btl/data/action_settings.txt b/bench/btl/data/action_settings.txt
index 2655727..5e88cee 100644
--- a/bench/btl/data/action_settings.txt
+++ b/bench/btl/data/action_settings.txt
@@ -1,12 +1,14 @@
-aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:1024
-ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:1024
-atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:1024
-axpby ; "{/*1.5 Y = alpha * X + beta * Y}" ; "vector size" ; 5:1000000
-axpy ; "{/*1.5 Y += alpha * X}" ; "vector size" ; 5:1000000
-matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:1024
-matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:1024
-trisolve ; "{/*1.5 triangular solver (X = inv(L) * X)}" ; "size" ; 4:1024
-cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:1024
-lu_decomp ; "{/*1.5 LU decomposition}" ; "matrix size" ; 4:1024
-tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:1024
-hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:1024 
\ No newline at end of file
+aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:2048
+ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:2048
+atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:2048
+axpby ; "{/*1.5 Y = alpha X + beta Y}" ; "vector size" ; 5:1000000
+axpy ; "{/*1.5 Y += alpha X}" ; "vector size" ; 5:1000000
+matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:2048
+matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:2048
+trisolve ; "{/*1.5 triangular solver (X = inv(L) X)}" ; "size" ; 4:2048
+cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:2048
+lu_decomp ; "{/*1.5 LU decomposition}" ; "matrix size" ; 4:2048
+tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:2048
+hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:2048 
+symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:2048 
+syr2 ; "{/*1.5 symmetric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:2048 
\ No newline at end of file
diff --git a/bench/btl/data/go_mean b/bench/btl/data/go_mean
index f8edf43..71cca61 100755
--- a/bench/btl/data/go_mean
+++ b/bench/btl/data/go_mean
@@ -1,7 +1,20 @@
 #! /bin/bash
+
+if [ $# < 1 ]; then
+  echo "Usage: $0 working_directory [tiny|large [prefix]]"
+else
+
 mkdir -p $1
 ##cp ../libs/*/*.dat $1
 
+mode=large
+if [ $# > 2 ]; then
+  mode=$2
+fi
+if [ $# > 3 ]; then
+  prefix=$3
+fi
+
 EIGENDIR=`cat eigen_root_dir.txt`
 
 webpagefilename=$1/index.html
@@ -18,19 +31,22 @@
   '</ul>' \
   '</p>'  >> $webpagefilename
 
-source mk_mean_script.sh axpy $1 11 2500 100000 250000  $2
-source mk_mean_script.sh axpby $1 11 2500 100000 250000 $2
-source mk_mean_script.sh matrix_vector $1 11 50 300 1000 $2
-source mk_mean_script.sh atv $1 11 50 300 1000 $2
-source mk_mean_script.sh matrix_matrix $1 11 100 300 1000 $2
-source mk_mean_script.sh aat $1 11 100 300 1000 $2
-source mk_mean_script.sh ata $1 11 100 300 1000 $2
-source mk_mean_script.sh trisolve $1 11 100 300 1000 $2
-source mk_mean_script.sh cholesky $1 11 100 300 1000 $2
-source mk_mean_script.sh lu_decomp $1 11 100 300 1000 $2
-source mk_mean_script.sh tridiagonalization $1 11 100 300 1000 $2
-source mk_mean_script.sh hessenberg $1 11 100 300 1000 $2
+source mk_mean_script.sh axpy $1 11 2500 100000 250000  $mode $prefix
+source mk_mean_script.sh axpby $1 11 2500 100000 250000 $mode $prefix
+source mk_mean_script.sh matrix_vector $1 11 50 300 1000 $mode $prefix
+source mk_mean_script.sh atv $1 11 50 300 1000 $mode $prefix
+source mk_mean_script.sh matrix_matrix $1 11 100 300 1000 $mode $prefix
+source mk_mean_script.sh aat $1 11 100 300 1000 $mode $prefix
+source mk_mean_script.sh ata $1 11 100 300 1000 $mode $prefix
+source mk_mean_script.sh trisolve $1 11 100 300 1000 $mode $prefix
+source mk_mean_script.sh cholesky $1 11 100 300 1000 $mode $prefix
+source mk_mean_script.sh lu_decomp $1 11 100 300 1000 $mode $prefix
+source mk_mean_script.sh tridiagonalization $1 11 100 300 1000 $mode $prefix
+source mk_mean_script.sh hessenberg $1 11 100 300 1000 $mode $prefix
+source mk_mean_script.sh symv $1 11 50 300 1000 $mode $prefix
+source mk_mean_script.sh syr2 $1 11 50 300 1000 $mode $prefix
 
+fi
 
 ## compile the web page ##
 
diff --git a/bench/btl/data/mk_mean_script.sh b/bench/btl/data/mk_mean_script.sh
index baa0fd9..43bab55 100644
--- a/bench/btl/data/mk_mean_script.sh
+++ b/bench/btl/data/mk_mean_script.sh
@@ -5,6 +5,7 @@
 MAXIC=$4
 MINOC=$5
 MAXOC=$6
+prefix=$8
 
 meanstatsfilename=$2/mean.html
 
@@ -37,7 +38,7 @@
 
 webpagefilename=$2/index.html
 # echo '<h3>'${WHAT}'</h3>'  >> $webpagefilename
-echo '<hr/><a href="/btl/'$1'.pdf"><img src="/btl/'$1'.png" alt="'${WHAT}'" /></a><br/>'  >> $webpagefilename
+echo '<hr/><a href="'$prefix$1'.pdf"><img src="'$prefix$1'.png" alt="'${WHAT}'" /></a><br/>'  >> $webpagefilename
 
 
 
diff --git a/bench/btl/generic_bench/bench_parameter.hh b/bench/btl/generic_bench/bench_parameter.hh
index 1295b37..08fea80 100644
--- a/bench/btl/generic_bench/bench_parameter.hh
+++ b/bench/btl/generic_bench/bench_parameter.hh
@@ -37,11 +37,11 @@
 // min matrix size for matrix matrix product bench
 #define MIN_MM 5
 // max matrix size for matrix matrix product bench
-#define MAX_MM 2048
+#define MAX_MM MAX_MV
 // min matrix size for LU bench
 #define MIN_LU 5
 // max matrix size for LU bench
-#define MAX_LU 1024
+#define MAX_LU 2048
 // max size for tiny vector and matrix
 #define TINY_MV_MAX_SIZE 16
 // default nb_sample for x86 timer
diff --git a/bench/btl/generic_bench/btl.hh b/bench/btl/generic_bench/btl.hh
index 38e2c5f..fdc0992 100644
--- a/bench/btl/generic_bench/btl.hh
+++ b/bench/btl/generic_bench/btl.hh
@@ -169,7 +169,7 @@
 {
 public:
   BtlConfig()
-    : overwriteResults(false)
+    : overwriteResults(false), checkResults(true)
   {
     char * _config;
     _config = getenv ("BTL_CONFIG");
@@ -193,6 +193,10 @@
         {
           Instance.overwriteResults = true;
         }
+        else if (config[i].beginsWith("--nocheck"))
+        {
+          Instance.checkResults = false;
+        }
       }
     }
 
@@ -214,6 +218,7 @@
 
   static BtlConfig Instance;
   bool overwriteResults;
+  bool checkResults;
 
 protected:
   std::vector<BtlString> m_selectedActionNames;
diff --git a/bench/btl/generic_bench/timers/portable_perf_analyzer.hh b/bench/btl/generic_bench/timers/portable_perf_analyzer.hh
index d0fe95c..67d3378 100644
--- a/bench/btl/generic_bench/timers/portable_perf_analyzer.hh
+++ b/bench/btl/generic_bench/timers/portable_perf_analyzer.hh
@@ -65,9 +65,12 @@
     time_action = time_action / (double(_nb_calc));
 
     // check
-    action.initialize();
-    action.calculate();
-    action.check_result();
+    if (BtlConfig::Instance.checkResults)
+    {
+      action.initialize();
+      action.calculate();
+      action.check_result();
+    }
     return action.nb_op_base()/(time_action*1000000.0);
   }
 
diff --git a/bench/btl/libs/C_BLAS/C_BLAS_interface.hh b/bench/btl/libs/C_BLAS/C_BLAS_interface.hh
index 319658c..a726fa8 100644
--- a/bench/btl/libs/C_BLAS/C_BLAS_interface.hh
+++ b/bench/btl/libs/C_BLAS/C_BLAS_interface.hh
@@ -132,7 +132,7 @@
 static char trans = 'T';
 static char nonunit = 'N';
 static char lower = 'L';
-static blasint intone = 1;
+static int intone = 1;
 
 template<>
 class C_BLAS_interface<float> : public f77_interface_base<float>
@@ -160,6 +160,14 @@
     cblas_ssymv(CblasColMajor,CblasLower,N,1.0,A,N,B,1,0.0,X,1);
     #endif
   }
+  
+  static inline void syr2(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
+    #ifdef PUREBLAS
+    ssyr2_(&lower,&N,&fone,B,&intone,X,&intone,A,&N);
+    #else
+    cblas_ssyr2(CblasColMajor,CblasLower,N,1.0,B,1,X,1,A,N);
+    #endif
+  }
 
   static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
     #ifdef PUREBLAS
diff --git a/bench/btl/libs/C_BLAS/main.cpp b/bench/btl/libs/C_BLAS/main.cpp
index 1eee550..57cb993 100644
--- a/bench/btl/libs/C_BLAS/main.cpp
+++ b/bench/btl/libs/C_BLAS/main.cpp
@@ -41,6 +41,7 @@
   bench<Action_matrix_vector_product<C_BLAS_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
   bench<Action_atv_product<C_BLAS_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
   bench<Action_symv<C_BLAS_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
+  bench<Action_syr2<C_BLAS_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
 
   bench<Action_matrix_matrix_product<C_BLAS_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
   bench<Action_ata_product<C_BLAS_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
diff --git a/bench/btl/libs/STL/STL_interface.hh b/bench/btl/libs/STL/STL_interface.hh
index 9dd9b8e..3958d4a 100644
--- a/bench/btl/libs/STL/STL_interface.hh
+++ b/bench/btl/libs/STL/STL_interface.hh
@@ -146,6 +146,15 @@
       X[j] += t2;
     }
   }
+  
+  static inline void syr2(gene_matrix & A, gene_vector & B, gene_vector & X, int N)
+  {
+    for (int j=0; j<N; ++j)
+    {
+      for (int i=j; i<N; ++i)
+        A[j][i] += B[i]*X[j] + B[j]*X[i];
+    }
+  }
 
   static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N)
   {
diff --git a/bench/btl/libs/STL/main.cpp b/bench/btl/libs/STL/main.cpp
index 619504e..4e73328 100644
--- a/bench/btl/libs/STL/main.cpp
+++ b/bench/btl/libs/STL/main.cpp
@@ -31,6 +31,7 @@
   bench<Action_matrix_vector_product<STL_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
   bench<Action_atv_product<STL_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
   bench<Action_symv<STL_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
+  bench<Action_syr2<STL_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
   bench<Action_matrix_matrix_product<STL_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
   bench<Action_ata_product<STL_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
   bench<Action_aat_product<STL_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
diff --git a/bench/btl/libs/eigen2/CMakeLists.txt b/bench/btl/libs/eigen2/CMakeLists.txt
index f061a27..beb37f7 100644
--- a/bench/btl/libs/eigen2/CMakeLists.txt
+++ b/bench/btl/libs/eigen2/CMakeLists.txt
@@ -7,7 +7,7 @@
   btl_add_bench(btl_eigen2_vecmat main_vecmat.cpp)
   btl_add_bench(btl_eigen2_matmat main_matmat.cpp)
   btl_add_bench(btl_eigen2_adv main_adv.cpp)
-
+  
   IF(NOT BTL_NOVEC)
     btl_add_bench(btl_eigen2_novec_linear main_linear.cpp)
     btl_add_bench(btl_eigen2_novec_vecmat main_vecmat.cpp)
diff --git a/bench/btl/libs/eigen2/eigen2_interface.hh b/bench/btl/libs/eigen2/eigen2_interface.hh
index 2b463f0..92a5677 100644
--- a/bench/btl/libs/eigen2/eigen2_interface.hh
+++ b/bench/btl/libs/eigen2/eigen2_interface.hh
@@ -18,7 +18,7 @@
 #ifndef EIGEN2_INTERFACE_HH
 #define EIGEN2_INTERFACE_HH
 // #include <cblas.h>
-#include <Eigen/Core>
+#include <Eigen/Array>
 #include <Eigen/Cholesky>
 #include <Eigen/LU>
 #include <Eigen/QR>
@@ -45,7 +45,9 @@
 
   static inline std::string name( void )
   {
-    #if defined(EIGEN_VECTORIZE_SSE)
+    #if defined(EIGEN_USE_NEW_PRODUCT)
+    if (SIZE==Dynamic) return "eigen2_newprod"; else return "tiny_eigen2";
+    #elif defined(EIGEN_VECTORIZE_SSE)
     if (SIZE==Dynamic) return "eigen2"; else return "tiny_eigen2";
     #elif defined(EIGEN_VECTORIZE_ALTIVEC)
     if (SIZE==Dynamic) return "eigen2"; else return "tiny_eigen2";
@@ -114,7 +116,57 @@
   }
 
   static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){
-    X = (A.template marked<SelfAdjoint|LowerTriangular>() * B)/*.lazy()*/;
+    //X = (A.template marked<SelfAdjoint|LowerTriangular>() * B)/*.lazy()*/;
+    ei_product_selfadjoint_vector<real,0,LowerTriangularBit>(N,A.data(),N, B.data(), X.data());
+  }
+  
+  template<typename Dest, typename Src> static void triassign(Dest& dst, const Src& src)
+  {
+    typedef typename Dest::Scalar Scalar;
+    typedef typename ei_packet_traits<Scalar>::type Packet;
+    const int PacketSize = sizeof(Packet)/sizeof(Scalar);
+    int size = dst.cols();
+    for(int j=0; j<size; j+=1)
+    {
+//       const int alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
+      Scalar* A0 = dst.data() + j*dst.stride();
+      int starti = j;
+      int alignedEnd = starti;
+      int alignedStart = (starti) + ei_alignmentOffset(&A0[starti], size-starti);
+      alignedEnd = alignedStart + ((size-alignedStart)/(2*PacketSize))*(PacketSize*2);
+
+      // do the non-vectorizable part of the assignment
+      for (int index = starti; index<alignedStart ; ++index)
+      {
+        if(Dest::Flags&RowMajorBit)
+          dst.copyCoeff(j, index, src);
+        else
+          dst.copyCoeff(index, j, src);
+      }
+
+      // do the vectorizable part of the assignment
+      for (int index = alignedStart; index<alignedEnd; index+=PacketSize)
+      {
+        if(Dest::Flags&RowMajorBit)
+          dst.template copyPacket<Src, Aligned, Unaligned>(j, index, src);
+        else
+          dst.template copyPacket<Src, Aligned, Unaligned>(index, j, src);
+      }
+
+      // do the non-vectorizable part of the assignment
+      for (int index = alignedEnd; index<size; ++index)
+      {
+        if(Dest::Flags&RowMajorBit)
+          dst.copyCoeff(j, index, src);
+        else
+          dst.copyCoeff(index, j, src);
+      }
+      //dst.col(j).end(N-j) = src.col(j).end(N-j);
+    }
+  }
+  
+  static EIGEN_DONT_INLINE void syr2(gene_matrix & A,  gene_vector & X, gene_vector & Y, int N){
+    // ei_product_selfadjoint_rank2_update<real,0,LowerTriangularBit>(N,A.data(),N, X.data(), 1, Y.data(), 1, -1);
   }
 
   static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
@@ -126,7 +178,9 @@
   }
 
   static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int N){
+  asm("#begin axpby");
     Y = a*X + b*Y;
+  asm("#end axpby");
   }
 
   static inline void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){
@@ -158,7 +212,10 @@
   }
 
   static inline void tridiagonalization(const gene_matrix & X, gene_matrix & C, int N){
-    C = Tridiagonalization<gene_matrix>(X).packedMatrix();
+    typename Tridiagonalization<gene_matrix>::CoeffVectorType aux(N-1);
+    C = X;
+    Tridiagonalization<gene_matrix>::_compute(C, aux);
+//     C = Tridiagonalization<gene_matrix>(X).packedMatrix();
   }
 
   static inline void hessenberg(const gene_matrix & X, gene_matrix & C, int N){
diff --git a/bench/btl/libs/eigen2/main_vecmat.cpp b/bench/btl/libs/eigen2/main_vecmat.cpp
index 881d90e..fb00d6f 100644
--- a/bench/btl/libs/eigen2/main_vecmat.cpp
+++ b/bench/btl/libs/eigen2/main_vecmat.cpp
@@ -19,7 +19,6 @@
 #include "eigen2_interface.hh"
 #include "bench.hh"
 #include "basic_actions.hh"
-#include "action_symv.hh"
 
 BTL_MAIN;
 
@@ -28,6 +27,7 @@
   bench<Action_matrix_vector_product<eigen2_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
   bench<Action_atv_product<eigen2_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
   bench<Action_symv<eigen2_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
+  bench<Action_syr2<eigen2_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
 
   return 0;
 }
diff --git a/bench/btl/libs/hand_vec/hand_vec_interface.hh b/bench/btl/libs/hand_vec/hand_vec_interface.hh
index 4e7d549..6080b24 100755
--- a/bench/btl/libs/hand_vec/hand_vec_interface.hh
+++ b/bench/btl/libs/hand_vec/hand_vec_interface.hh
@@ -38,16 +38,16 @@
   typedef typename f77_interface_base<real>::gene_vector gene_vector;
 
   static void free_matrix(gene_matrix & A, int N){
-    ei_aligned_delete(A);
+    ei_aligned_free(A);
   }
 
   static void free_vector(gene_vector & B){
-    ei_aligned_delete(B);
+    ei_aligned_free(B);
   }
 
   static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
     int N = A_stl.size();
-    A = ei_aligned_new<real>(N*N);
+    A = (real*)ei_aligned_malloc(N*N*sizeof(real));
     for (int j=0;j<N;j++)
       for (int i=0;i<N;i++)
         A[i+N*j] = A_stl[j][i];
@@ -55,7 +55,7 @@
 
   static inline void vector_from_stl(gene_vector & B, stl_vector & B_stl){
     int N = B_stl.size();
-    B = ei_aligned_new<real>(N);
+    B = (real*)ei_aligned_malloc(N*sizeof(real));
     for (int i=0;i<N;i++)
       B[i] = B_stl[i];
   }
@@ -236,6 +236,129 @@
     }
     asm("#end matrix_vector_product");
   }
+  
+  static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
+  {
+    
+//     int AN = (N/PacketSize)*PacketSize;
+//     int ANP = (AN/(2*PacketSize))*2*PacketSize;
+//     int bound = (N/4)*4;
+    for (int i=0;i<N;i++)
+      X[i] = 0;
+    
+    int bound = std::max(0,N-8) & 0xfffffffE;
+
+    for (int j=0;j<bound;j+=2)
+    {
+      register real* __restrict__ A0 = A + j*N;
+      register real* __restrict__ A1 = A + (j+1)*N;
+      
+      real t0 = B[j];
+      Packet ptmp0 = ei_pset1(t0);
+      real t1 = B[j+1];
+      Packet ptmp1 = ei_pset1(t1);
+      
+      real t2 = 0;
+      Packet ptmp2 = ei_pset1(t2);
+      real t3 = 0;
+      Packet ptmp3 = ei_pset1(t3);
+      
+      int starti = j+2;
+      int alignedEnd = starti;
+      int alignedStart = (starti) + ei_alignmentOffset(&X[starti], N-starti);
+      alignedEnd = alignedStart + ((N-alignedStart)/(PacketSize))*(PacketSize);
+
+      X[j]   += t0 * A0[j];
+      X[j+1] += t1 * A1[j];
+      
+      X[j+1] += t0 * A0[j+1];
+      t2 += A0[j+1] * B[j+1];
+      
+//       alignedStart = alignedEnd;
+      for (int i=starti; i<alignedStart; ++i) {
+        X[i] += t0 * A0[i] + t1 * A1[i];
+        t2 += A0[i] * B[i];
+        t3 += A1[i] * B[i];
+      }
+      asm("#begin symv");
+      for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize) {
+        Packet A0i = ei_ploadu(&A0[i]);
+        Packet A1i = ei_ploadu(&A1[i]);
+//         Packet A0i1 = ei_ploadu(&A0[i+PacketSize]);
+        Packet Xi = ei_pload(&X[i]);
+        Packet Bi = ei_pload/*u*/(&B[i]);
+//         Packet Xi1 = ei_pload(&X[i+PacketSize]);
+//         Packet Bi1 = ei_pload/*u*/(&B[i+PacketSize]);
+        Xi = ei_padd(ei_padd(Xi, ei_pmul(ptmp0, A0i)), ei_pmul(ptmp1, A1i));
+        ptmp2 = ei_padd(ptmp2, ei_pmul(A0i, Bi));
+        ptmp3 = ei_padd(ptmp3, ei_pmul(A1i, Bi));
+//         Xi1 = ei_padd(Xi1, ei_pmul(ptmp1, A0i1));
+//         ptmp2 = ei_padd(ptmp2, ei_pmul(A0i1, Bi1));
+//         
+        ei_pstore(&X[i],Xi);
+//         ei_pstore(&X[i+PacketSize],Xi1);
+//         asm(
+//           "prefetchnta   64(%[A0],%[i],4)   \n\t"
+//           //"movups     (%[A0],%[i],4), %%xmm8  \n\t"
+//           "movsd       (%[A0],%[i],4), %%xmm8  \n\t"
+//           "movhps     8(%[A0],%[i],4), %%xmm8  \n\t"
+// //           "movups   16(%[A0],%[i],4), %%xmm9  \n\t"
+// //           "movups   64(%[A0],%[i],4), %%xmm15  \n\t"
+//           "movaps     (%[B], %[i],4), %%xmm12 \n\t"
+// //           "movaps   16(%[B], %[i],4), %%xmm13 \n\t"
+//           "movaps     (%[X], %[i],4), %%xmm10 \n\t"
+// //           "movaps   16(%[X], %[i],4), %%xmm11 \n\t"
+//           
+//           "mulps %%xmm8, %%xmm12  \n\t"
+// //           "mulps %%xmm9, %%xmm13  \n\t"
+//           
+//           "mulps %[ptmp1], %%xmm8  \n\t"
+//           "addps %%xmm12, %[ptmp2]  \n\t"
+//           "addps %%xmm8, %%xmm10  \n\t"
+//           
+//           
+//           
+//           
+// //           "mulps %[ptmp1], %%xmm9  \n\t"
+//           
+// //           "addps %%xmm9, %%xmm11  \n\t"
+// //           "addps %%xmm13, %[ptmp2]  \n\t"
+//           
+//           "movaps %%xmm10,   (%[X],%[i],4) \n\t"
+// //           "movaps %%xmm11, 16(%[X],%[i],4) \n\t"
+//           : 
+//           : [X] "r" (X), [i] "r" (i), [A0] "r" (A0),
+//             [B] "r" (B),
+//             [ptmp1] "x" (ptmp1),
+//             [ptmp2] "x" (ptmp2)
+//           : "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm15");
+      }
+      asm("#end symv");
+      for (int i=alignedEnd; i<N; i++) {
+        X[i] += t0 * A0[i] + t1 * A1[i];
+        t2 += A0[i] * B[i];
+        t3 += A1[i] * B[i];
+      }
+      
+      
+      X[j]   += t2 + ei_predux(ptmp2);
+      X[j+1] += t3 + ei_predux(ptmp3);
+    }
+    for (int j=bound;j<N;j++)
+    {
+      register real* __restrict__ A0 = A + j*N;
+      
+      real t1 = B[j];
+      real t2 = 0;
+      X[j] += t1 * A0[j];
+      for (int i=j+1; i<N; i+=PacketSize) {
+        X[i] += t1 * A0[i];
+        t2 += A0[i] * B[i];
+      }
+      X[j] += t2;
+    }
+    
+  }
 
 //   static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
 //   {
diff --git a/bench/btl/libs/hand_vec/main.cpp b/bench/btl/libs/hand_vec/main.cpp
index eea8cb8..8bde4c8 100644
--- a/bench/btl/libs/hand_vec/main.cpp
+++ b/bench/btl/libs/hand_vec/main.cpp
@@ -26,6 +26,7 @@
 #include "action_axpy.hh"
 #include "action_ata_product.hh"
 #include "action_aat_product.hh"
+#include "basic_actions.hh"
 //#include "action_lu_solve.hh"
 // #include "timers/mixed_perf_analyzer.hh"
 
@@ -39,6 +40,7 @@
 //   bench<Action_matrix_matrix_product<hand_vec_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
 //   bench<Action_aat_product<hand_vec_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
 //   bench<Action_ata_product<hand_vec_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_symv<hand_vec_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
   bench<Action_axpy<hand_vec_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);