added FFT inverse complex-to-scalar interface (not yet optimized)

diff --git a/bench/benchFFT.cpp b/bench/benchFFT.cpp
index 041576b..84cc49f 100644
--- a/bench/benchFFT.cpp
+++ b/bench/benchFFT.cpp

@@ -31,34 +31,67 @@
 using namespace Eigen;
 using namespace std;
 
-#ifndef NFFT
-#define NFFT 1024
-#endif
+
+template <typename T>
+string nameof();
+
+template <> string nameof<float>() {return "float";}
+template <> string nameof<double>() {return "double";}
+template <> string nameof<long double>() {return "long double";}
 
 #ifndef TYPE
 #define TYPE float
 #endif
 
-#ifndef NITS 
-#define NITS (10000000/NFFT)
+#ifndef NFFT
+#define NFFT 1024
+#endif
+#ifndef NDATA
+#define NDATA 1000000
 #endif
 
-int main() 
+using namespace Eigen;
+
+template <typename T>
+void bench(int nfft)
 {
-  vector<complex<TYPE> > inbuf(NFFT);
-  vector<complex<TYPE> > outbuf(NFFT);
-  Eigen::FFT<TYPE> fft;
+    typedef typename NumTraits<T>::Real Scalar;
+    typedef typename std::complex<Scalar> Complex;
+    int nits = NDATA/nfft;
+    vector<T> inbuf(nfft);
+    vector<Complex > outbuf(nfft);
+    FFT< Scalar > fft;
 
-  fft.fwd( outbuf , inbuf);
+    fft.fwd( outbuf , inbuf);
 
-  BenchTimer timer;
-  timer.reset();
-  for (int k=0;k<8;++k) {
-      timer.start();
-      for(int i = 0; i < NITS; i++)
-          fft.fwd( outbuf , inbuf);
-      timer.stop();
-  }
-  double mflops = 5.*NFFT*log2((double)NFFT) / (1e6 * timer.value() / (double)NITS );
-  cout << "NFFT=" << NFFT << "  " << (double(1e-6*NFFT*NITS)/timer.value()) << " MS/s  " << mflops << "MFLOPS\n";
+    BenchTimer timer;
+    timer.reset();
+    for (int k=0;k<8;++k) {
+        timer.start();
+        for(int i = 0; i < nits; i++)
+            fft.fwd( outbuf , inbuf);
+        timer.stop();
+    }
+
+    cout << nameof<Scalar>() << " ";
+    double mflops = 5.*nfft*log2((double)nfft) / (1e6 * timer.value() / (double)nits );
+    if ( NumTraits<T>::IsComplex ) {
+        cout << "complex";
+    }else{
+        cout << "real   ";
+        mflops /= 2;
+    }
+
+    cout << " NFFT=" << nfft << "  " << (double(1e-6*nfft*nits)/timer.value()) << " MS/s  " << mflops << "MFLOPS\n";
+}
+
+int main(int argc,char ** argv)
+{
+    bench<complex<float> >(NFFT);
+    bench<float>(NFFT);
+    bench<complex<double> >(NFFT);
+    bench<double>(NFFT);
+    bench<complex<long double> >(NFFT);
+    bench<long double>(NFFT);
+    return 0;
 }

diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/simple_fft_traits.h
index f7dd2b9..1e2be8f 100644
--- a/unsupported/Eigen/src/FFT/simple_fft_traits.h
+++ b/unsupported/Eigen/src/FFT/simple_fft_traits.h

@@ -54,28 +54,14 @@
             work(0, dst, src, 1,1);
         }else{
             int ncfft = nfft>>1;
+            int ncfft2 = nfft>>2;
             // use optimized mode for even real
             fwd( dst, reinterpret_cast<const Complex*> (src),ncfft);
             make_real_twiddles(nfft);
-            std::cerr << "dst[0] = " << dst[0] << "\n";
             Complex dc = dst[0].real() +  dst[0].imag();
             Complex nyquist = dst[0].real() -  dst[0].imag();
             int k;
-#if 0
-            using namespace std;
-            cerr << "desired:\n";
-            for ( k=1;k <= (ncfft>>1) ; ++k ) {
-                Complex x = exp( Complex(0,-3.14159265358979323846264338327 * ((double) (k) / ncfft + .5) ) );
-                cerr << k << " " << x << "angle(x):" << arg(x) << "\n";
-            }
-            dc=0;
-            cerr << "twiddles:\n";
-            for (k=0;k<ncfft;++k) {
-                Complex x = m_twiddles[k];
-                cerr << k << " " << x << "angle(-x):" << arg(-x) << "\n";
-            }
-#endif
-            for ( k=1;k <= (ncfft>>1) ; ++k ) {
+            for ( k=1;k <= ncfft2 ; ++k ) {
                 Complex fpk = dst[k];
                 Complex fpnk = conj(dst[ncfft-k]);
                 Complex f1k = fpk + fpnk;
@@ -87,7 +73,6 @@
                 dst[ncfft-k] =  conj(f1k -tw)*Scalar(.5);
             }
  
-
             // place conjugate-symmetric half at the end for completeness
             // TODO: make this configurable ( opt-out )
             for ( k=1;k < ncfft ; ++k )
@@ -98,6 +83,16 @@
         }
     }
 
+    // half-complex to scalar
+    void inv( Scalar * dst,const Complex * src,int nfft) 
+    {
+        // TODO add optimized version for even numbers
+        std::vector<Complex> tmp(nfft);
+        inv(&tmp[0],src,nfft);
+        for (int k=0;k<nfft;++k)
+            dst[k] = tmp[k].real();
+    }
+
     void inv(Complex * dst,const Complex  *src,int nfft)
     {
         prepare(nfft,true);
@@ -156,7 +151,7 @@
                         default: p += 2; break;
                     }
                     if (p*p>n)
-                        p=n;// no more factors
+                        p=n;// impossible to have a factor > sqrt(n)
                 }
                 n /= p;
                 m_stageRadix.push_back(p);

diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index 41c7fed..75c3327 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp

@@ -28,6 +28,10 @@
 
 using namespace std;
 
+float norm(float x) {return x*x;}
+double norm(double x) {return x*x;}
+long double norm(long double x) {return x*x;}
+
 template < typename T>
 complex<long double>  promote(complex<T> x) { return complex<long double>(x.real(),x.imag()); }
 
@@ -83,7 +87,11 @@
     for (int k=0;k<nfft;++k)
         inbuf[k]= (T)(rand()/(double)RAND_MAX - .5);
     fft.fwd( outbuf,inbuf);
-    VERIFY( fft_rmse(outbuf,inbuf) < 1e-5 );// gross check
+    VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>()  );// gross check
+
+    vector<Scalar> buf3;
+    fft.inv( buf3 , outbuf);
+    VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
 }
 
 template <class T>
@@ -100,18 +108,18 @@
         inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
     fft.fwd( outbuf , inbuf);
 
-    VERIFY( fft_rmse(outbuf,inbuf) < 1e-5 );// gross check
+    VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>()  );// gross check
 
     fft.inv( buf3 , outbuf);
 
-    VERIFY( dif_rmse(inbuf,buf3) < 1e-5 );// gross check
+    VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>()  );// gross check
 }
 
 void test_FFT()
 {
-#if 0
+#if 1
   CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); CALL_SUBTEST( test_complex<long double>(32) );
-  CALL_SUBTEST( test_complex<float>(1024) ); CALL_SUBTEST( test_complex<double>(1024) ); CALL_SUBTEST( test_complex<long double>(1024) );
+  CALL_SUBTEST( test_complex<float>(256) ); CALL_SUBTEST( test_complex<double>(256) ); CALL_SUBTEST( test_complex<long double>(256) );
   CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); CALL_SUBTEST( test_complex<long double>(3*8) );
   CALL_SUBTEST( test_complex<float>(5*32) ); CALL_SUBTEST( test_complex<double>(5*32) ); CALL_SUBTEST( test_complex<long double>(5*32) );
   CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); CALL_SUBTEST( test_complex<long double>(2*3*4) );
@@ -120,8 +128,9 @@
 #endif
 
 #if 1
+  CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); CALL_SUBTEST( test_scalar<long double>(45) );
   CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) );
-  CALL_SUBTEST( test_scalar<float>(1024) ); CALL_SUBTEST( test_scalar<double>(1024) ); CALL_SUBTEST( test_scalar<long double>(1024) );
+  CALL_SUBTEST( test_scalar<float>(256) ); CALL_SUBTEST( test_scalar<double>(256) ); CALL_SUBTEST( test_scalar<long double>(256) );
   CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
 #endif
 }