Implement functors for rank-1 and rank-2 update.
diff --git a/blas/GeneralRank1Update.h b/blas/GeneralRank1Update.h
new file mode 100644
index 0000000..a3301ed
--- /dev/null
+++ b/blas/GeneralRank1Update.h
@@ -0,0 +1,30 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GENERAL_RANK1UPDATE_H
+#define EIGEN_GENERAL_RANK1UPDATE_H
+
+namespace internal {
+
+/* Optimized matrix += alpha * uv' */
+template<typename Scalar, typename Index, bool ConjRhs>
+struct general_rank1_update
+{
+  static void run(Index rows, Index cols, Scalar* mat, Index stride, const Scalar* u, const Scalar* v, Scalar alpha)
+  {
+    typedef Matrix<Scalar,Dynamic,1> PlainVector;
+    internal::conj_if<ConjRhs> cj;
+    for (Index i=0; i<cols; ++i)
+      Map<PlainVector>(mat+stride*i,rows) += alpha * cj(v[i]) * Map<const PlainVector>(u,rows);
+  }
+};
+
+} // end namespace internal
+
+#endif // EIGEN_GENERAL_RANK1UPDATE_H
diff --git a/blas/Rank2Update.h b/blas/Rank2Update.h
new file mode 100644
index 0000000..e7a5eea
--- /dev/null
+++ b/blas/Rank2Update.h
@@ -0,0 +1,57 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RANK2UPDATE_H
+#define EIGEN_RANK2UPDATE_H
+
+namespace internal {
+
+/* Optimized selfadjoint matrix += alpha * uv' + conj(alpha)*vu'
+ * This is the low-level version of SelfadjointRank2Update.h
+ */
+template<typename Scalar, typename Index, int UpLo>
+struct rank2_update_selector;
+
+template<typename Scalar, typename Index>
+struct rank2_update_selector<Scalar,Index,Upper>
+{
+  static void run(Index size, Scalar* mat, Index stride, const Scalar* _u, const Scalar* _v, Scalar alpha)
+  {
+    typedef Matrix<Scalar,Dynamic,1> PlainVector;
+    Map<const PlainVector> u(_u, size), v(_v, size);
+
+    for (Index i=0; i<size; ++i)
+    {
+      Map<PlainVector>(mat+stride*i, i+1) +=
+		  conj(alpha) * conj(_u[i]) * v.head(i+1)
+		+ alpha * conj(_v[i]) * u.head(i+1);
+    }
+  }
+};
+
+template<typename Scalar, typename Index>
+struct rank2_update_selector<Scalar,Index,Lower>
+{
+  static void run(Index size, Scalar* mat, Index stride, const Scalar* _u, const Scalar* _v, Scalar alpha)
+  {
+    typedef Matrix<Scalar,Dynamic,1> PlainVector;
+    Map<const PlainVector> u(_u, size), v(_v, size);
+
+    for (Index i=0; i<size; ++i)
+    {
+      Map<PlainVector>(mat+(stride+1)*i, size-i) +=
+		  conj(alpha) * conj(_u[i]) * v.tail(size-i)
+		+ alpha * conj(_v[i]) * u.tail(size-i);
+    }
+  }
+};
+
+} // end namespace internal
+
+#endif // EIGEN_RANK2UPDATE_H
diff --git a/blas/common.h b/blas/common.h
index cd78391..26b4ed5 100644
--- a/blas/common.h
+++ b/blas/common.h
@@ -74,6 +74,8 @@
 
 namespace Eigen {
 #include "BandTriangularSolver.h"
+#include "GeneralRank1Update.h"
+#include "Rank2Update.h"
 }
 
 using namespace Eigen;
diff --git a/blas/level2_cplx_impl.h b/blas/level2_cplx_impl.h
index 7878f2a..477f6d6 100644
--- a/blas/level2_cplx_impl.h
+++ b/blas/level2_cplx_impl.h
@@ -117,6 +117,21 @@
   */
 int EIGEN_BLAS_FUNC(her)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *pa, int *lda)
 {
+  typedef void (*functype)(int, Scalar*, int, const Scalar*, Scalar);
+  static functype func[2];
+
+  static bool init = false;
+  if(!init)
+  {
+    for(int k=0; k<2; ++k)
+      func[k] = 0;
+
+    func[UP] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run);
+    func[LO] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run);
+
+    init = true;
+  }
+
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* a = reinterpret_cast<Scalar*>(pa);
   RealScalar alpha = *reinterpret_cast<RealScalar*>(palpha);
@@ -134,16 +149,11 @@
 
   Scalar* x_cpy = get_compact_vector(x, *n, *incx);
 
-  // TODO perform direct calls to underlying implementation
-//   if(UPLO(*uplo)==LO)       matrix(a,*n,*n,*lda).selfadjointView<Lower>().rankUpdate(vector(x_cpy,*n), alpha);
-//   else if(UPLO(*uplo)==UP)  matrix(a,*n,*n,*lda).selfadjointView<Upper>().rankUpdate(vector(x_cpy,*n), alpha);
+  int code = UPLO(*uplo);
+  if(code>=2 || func[code]==0)
+    return 0;
 
-  if(UPLO(*uplo)==LO)
-    for(int j=0;j<*n;++j)
-      matrix(a,*n,*n,*lda).col(j).tail(*n-j) += alpha * internal::conj(x_cpy[j]) * vector(x_cpy+j,*n-j);
-  else
-    for(int j=0;j<*n;++j)
-      matrix(a,*n,*n,*lda).col(j).head(j+1) += alpha * internal::conj(x_cpy[j]) * vector(x_cpy,j+1);
+  func[code](*n, a, *lda, x_cpy, alpha);
 
   matrix(a,*n,*n,*lda).diagonal().imag().setZero();
 
@@ -161,6 +171,21 @@
   */
 int EIGEN_BLAS_FUNC(her2)(char *uplo, int *n, RealScalar *palpha, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pa, int *lda)
 {
+  typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar);
+  static functype func[2];
+
+  static bool init = false;
+  if(!init)
+  {
+    for(int k=0; k<2; ++k)
+      func[k] = 0;
+
+    func[UP] = (internal::rank2_update_selector<Scalar,int,Upper>::run);
+    func[LO] = (internal::rank2_update_selector<Scalar,int,Lower>::run);
+
+    init = true;
+  }
+
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
   Scalar* a = reinterpret_cast<Scalar*>(pa);
@@ -181,9 +206,11 @@
   Scalar* x_cpy = get_compact_vector(x, *n, *incx);
   Scalar* y_cpy = get_compact_vector(y, *n, *incy);
 
-  // TODO perform direct calls to underlying implementation
-  if(UPLO(*uplo)==LO)       matrix(a,*n,*n,*lda).selfadjointView<Lower>().rankUpdate(vector(x_cpy,*n),vector(y_cpy,*n),alpha);
-  else if(UPLO(*uplo)==UP)  matrix(a,*n,*n,*lda).selfadjointView<Upper>().rankUpdate(vector(x_cpy,*n),vector(y_cpy,*n),alpha);
+  int code = UPLO(*uplo);
+  if(code>=2 || func[code]==0)
+    return 0;
+
+  func[code](*n, a, *lda, x_cpy, y_cpy, alpha);
 
   matrix(a,*n,*n,*lda).diagonal().imag().setZero();
 
@@ -222,8 +249,7 @@
   Scalar* x_cpy = get_compact_vector(x,*m,*incx);
   Scalar* y_cpy = get_compact_vector(y,*n,*incy);
 
-  // TODO perform direct calls to underlying implementation
-  matrix(a,*m,*n,*lda) += alpha * vector(x_cpy,*m) * vector(y_cpy,*n).transpose();
+  internal::general_rank1_update<Scalar,int,false>::run(*m, *n, a, *lda, x_cpy, y_cpy, alpha);
 
   if(x_cpy!=x)  delete[] x_cpy;
   if(y_cpy!=y)  delete[] y_cpy;
@@ -260,8 +286,7 @@
   Scalar* x_cpy = get_compact_vector(x,*m,*incx);
   Scalar* y_cpy = get_compact_vector(y,*n,*incy);
 
-  // TODO perform direct calls to underlying implementation
-  matrix(a,*m,*n,*lda) += alpha * vector(x_cpy,*m) * vector(y_cpy,*n).adjoint();
+  internal::general_rank1_update<Scalar,int,Conj>::run(*m, *n, a, *lda, x_cpy, y_cpy, alpha);
 
   if(x_cpy!=x)  delete[] x_cpy;
   if(y_cpy!=y)  delete[] y_cpy;
diff --git a/blas/level2_impl.h b/blas/level2_impl.h
index 7099cf9..f1f7371 100644
--- a/blas/level2_impl.h
+++ b/blas/level2_impl.h
@@ -49,7 +49,8 @@
 
   int actual_m = *m;
   int actual_n = *n;
-  if(OP(*opa)!=NOTR)
+  int code = OP(*opa);
+  if(code!=NOTR)
     std::swap(actual_m,actual_n);
 
   Scalar* actual_b = get_compact_vector(b,actual_n,*incb);
@@ -61,7 +62,9 @@
     else                vector(actual_c, actual_m) *= beta;
   }
 
-  int code = OP(*opa);
+  if(code>=4 || func[code]==0)
+    return 0;
+
   func[code](actual_m, actual_n, a, *lda, actual_b, 1, actual_c, 1, alpha);
 
   if(actual_b!=b) delete[] actual_b;
@@ -416,42 +419,3 @@
 //   return 1;
 // }
 
-/**  DGER   performs the rank 1 operation
-  *
-  *     A := alpha*x*y' + A,
-  *
-  *  where alpha is a scalar, x is an m element vector, y is an n element
-  *  vector and A is an m by n matrix.
-  */
-int EIGEN_BLAS_FUNC(ger)(int *m, int *n, Scalar *palpha, Scalar *px, int *incx, Scalar *py, int *incy, Scalar *pa, int *lda)
-{
-  Scalar* x = reinterpret_cast<Scalar*>(px);
-  Scalar* y = reinterpret_cast<Scalar*>(py);
-  Scalar* a = reinterpret_cast<Scalar*>(pa);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
-
-  int info = 0;
-       if(*m<0)                                                       info = 1;
-  else if(*n<0)                                                       info = 2;
-  else if(*incx==0)                                                   info = 5;
-  else if(*incy==0)                                                   info = 7;
-  else if(*lda<std::max(1,*m))                                        info = 9;
-  if(info)
-    return xerbla_(SCALAR_SUFFIX_UP"GER  ",&info,6);
-
-  if(alpha==Scalar(0))
-    return 1;
-
-  Scalar* x_cpy = get_compact_vector(x,*m,*incx);
-  Scalar* y_cpy = get_compact_vector(y,*n,*incy);
-
-  // TODO perform direct calls to underlying implementation
-  matrix(a,*m,*n,*lda) += alpha * vector(x_cpy,*m) * vector(y_cpy,*n).adjoint();
-
-  if(x_cpy!=x)  delete[] x_cpy;
-  if(y_cpy!=y)  delete[] y_cpy;
-
-  return 1;
-}
-
-
diff --git a/blas/level2_real_impl.h b/blas/level2_real_impl.h
index cd83329..06da283 100644
--- a/blas/level2_real_impl.h
+++ b/blas/level2_real_impl.h
@@ -68,6 +68,20 @@
 
 //     init = true;
 //   }
+  typedef void (*functype)(int, Scalar*, int, const Scalar*, Scalar);
+  static functype func[2];
+
+  static bool init = false;
+  if(!init)
+  {
+    for(int k=0; k<2; ++k)
+      func[k] = 0;
+
+    func[UP] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Upper,false,Conj>::run);
+    func[LO] = (selfadjoint_rank1_update<Scalar,int,ColMajor,Lower,false,Conj>::run);
+
+    init = true;
+  }
 
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
@@ -86,18 +100,11 @@
   // if the increment is not 1, let's copy it to a temporary vector to enable vectorization
   Scalar* x_cpy = get_compact_vector(x,*n,*incx);
 
-  Matrix<Scalar,Dynamic,Dynamic> m2(matrix(c,*n,*n,*ldc));
-  
-  // TODO check why this is not accurate enough for lapack tests
-//   if(UPLO(*uplo)==LO)       matrix(c,*n,*n,*ldc).selfadjointView<Lower>().rankUpdate(vector(x_cpy,*n), alpha);
-//   else if(UPLO(*uplo)==UP)  matrix(c,*n,*n,*ldc).selfadjointView<Upper>().rankUpdate(vector(x_cpy,*n), alpha);
+  int code = UPLO(*uplo);
+  if(code>=2 || func[code]==0)
+    return 0;
 
-  if(UPLO(*uplo)==LO)
-    for(int j=0;j<*n;++j)
-      matrix(c,*n,*n,*ldc).col(j).tail(*n-j) += alpha * x_cpy[j] * vector(x_cpy+j,*n-j);
-  else
-    for(int j=0;j<*n;++j)
-      matrix(c,*n,*n,*ldc).col(j).head(j+1) += alpha * x_cpy[j] * vector(x_cpy,j+1);
+  func[code](*n, c, *ldc, x_cpy, alpha);
 
   if(x_cpy!=x)  delete[] x_cpy;
 
@@ -121,6 +128,20 @@
 //
 //     init = true;
 //   }
+  typedef void (*functype)(int, Scalar*, int, const Scalar*, const Scalar*, Scalar);
+  static functype func[2];
+
+  static bool init = false;
+  if(!init)
+  {
+    for(int k=0; k<2; ++k)
+      func[k] = 0;
+
+    func[UP] = (internal::rank2_update_selector<Scalar,int,Upper>::run);
+    func[LO] = (internal::rank2_update_selector<Scalar,int,Lower>::run);
+
+    init = true;
+  }
 
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
@@ -141,10 +162,12 @@
 
   Scalar* x_cpy = get_compact_vector(x,*n,*incx);
   Scalar* y_cpy = get_compact_vector(y,*n,*incy);
+  
+  int code = UPLO(*uplo);
+  if(code>=2 || func[code]==0)
+    return 0;
 
-  // TODO perform direct calls to underlying implementation
-  if(UPLO(*uplo)==LO)       matrix(c,*n,*n,*ldc).selfadjointView<Lower>().rankUpdate(vector(x_cpy,*n), vector(y_cpy,*n), alpha);
-  else if(UPLO(*uplo)==UP)  matrix(c,*n,*n,*ldc).selfadjointView<Upper>().rankUpdate(vector(x_cpy,*n), vector(y_cpy,*n), alpha);
+  func[code](*n, c, *ldc, x_cpy, y_cpy, alpha);
 
   if(x_cpy!=x)  delete[] x_cpy;
   if(y_cpy!=y)  delete[] y_cpy;
@@ -208,3 +231,41 @@
 //   return 1;
 // }
 
+/**  DGER   performs the rank 1 operation
+  *
+  *     A := alpha*x*y' + A,
+  *
+  *  where alpha is a scalar, x is an m element vector, y is an n element
+  *  vector and A is an m by n matrix.
+  */
+int EIGEN_BLAS_FUNC(ger)(int *m, int *n, Scalar *palpha, Scalar *px, int *incx, Scalar *py, int *incy, Scalar *pa, int *lda)
+{
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+  Scalar* a = reinterpret_cast<Scalar*>(pa);
+  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
+
+  int info = 0;
+       if(*m<0)                                                       info = 1;
+  else if(*n<0)                                                       info = 2;
+  else if(*incx==0)                                                   info = 5;
+  else if(*incy==0)                                                   info = 7;
+  else if(*lda<std::max(1,*m))                                        info = 9;
+  if(info)
+    return xerbla_(SCALAR_SUFFIX_UP"GER  ",&info,6);
+
+  if(alpha==Scalar(0))
+    return 1;
+
+  Scalar* x_cpy = get_compact_vector(x,*m,*incx);
+  Scalar* y_cpy = get_compact_vector(y,*n,*incy);
+
+  internal::general_rank1_update<Scalar,int,false>::run(*m, *n, a, *lda, x_cpy, y_cpy, alpha);
+
+  if(x_cpy!=x)  delete[] x_cpy;
+  if(y_cpy!=y)  delete[] y_cpy;
+
+  return 1;
+}
+
+
diff --git a/blas/level3_impl.h b/blas/level3_impl.h
index 2371f25..84c9f4f 100644
--- a/blas/level3_impl.h
+++ b/blas/level3_impl.h
@@ -305,6 +305,7 @@
 int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pbeta, RealScalar *pc, int *ldc)
 {
 //   std::cerr << "in syrk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n";
+  #if !ISCOMPLEX
   typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, Scalar);
   static functype func[8];
 
@@ -324,6 +325,7 @@
 
     init = true;
   }
+  #endif
 
   Scalar* a = reinterpret_cast<Scalar*>(pa);
   Scalar* c = reinterpret_cast<Scalar*>(pc);