Added a non-optimized forward FFT for real (scalar) input; no real-output inverse yet
diff --git a/unsupported/Eigen/FFT.h b/unsupported/Eigen/FFT.h
index 03490d2..a1f87a6 100644
--- a/unsupported/Eigen/FFT.h
+++ b/unsupported/Eigen/FFT.h
@@ -57,21 +57,36 @@
 
     FFT(const traits_type & traits=traits_type() ) :m_traits(traits) { }
 
-    void fwd( Complex * dst, const Complex * src, int nfft)
+    template <typename _Input> // _Input: element type of src (the unit test passes both Scalar and Complex buffers)
+    void fwd( Complex * dst, const _Input * src, int nfft) // forward transform: prepare (inverse=false), exec, postprocess on m_traits
     {
       m_traits.prepare(nfft,false,dst,src);
       m_traits.exec(dst,src);
       m_traits.postprocess(dst);
     }
 
-    void inv( Complex * dst, const Complex * src, int nfft)
+    template <typename _Input> // vector convenience overload: sizes dst to match src, then runs the pointer overload
+    void fwd( std::vector<Complex> & dst, const std::vector<_Input> & src)
     {
-      m_traits.prepare(nfft,true,dst,src);
-      m_traits.exec(dst,src);
-      m_traits.postprocess(dst);
+        dst.resize( src.size() );
+        if ( !src.empty() ) fwd( &dst[0],&src[0],static_cast<int>(src.size()) ); // skip empty input: &v[0] on an empty vector is undefined
     }
 
-    // TODO: fwd,inv for Scalar
+    template <typename _Output> // NOTE(review): prepare()/exec() take a Complex* destination, so _Output is presumably Complex for now
+    void inv( _Output * dst, const Complex * src, int nfft) // same three steps as fwd(), but prepare() is called with inverse=true
+    {
+        m_traits.prepare(nfft,true,dst,src);
+        m_traits.exec(dst,src);
+        m_traits.postprocess(dst);
+    }
+
+    template <typename _Output> // vector convenience overload: sizes dst to match src, then runs the pointer overload
+    void inv( std::vector<_Output> & dst, const std::vector<Complex> & src)
+    {
+        dst.resize( src.size() );
+        if ( !src.empty() ) inv( &dst[0],&src[0],static_cast<int>(src.size()) ); // skip empty input: &v[0] on an empty vector is undefined
+    }
+
     // TODO: multi-dimensional FFTs
     // TODO: handle Eigen MatrixBase
 
diff --git a/unsupported/Eigen/src/FFT/simple_fft_traits.h b/unsupported/Eigen/src/FFT/simple_fft_traits.h
index 6fbbeac..5a910dd 100644
--- a/unsupported/Eigen/src/FFT/simple_fft_traits.h
+++ b/unsupported/Eigen/src/FFT/simple_fft_traits.h
@@ -34,7 +34,8 @@
     typedef std::complex<Scalar> Complex;
     simple_fft_traits() : m_nfft(0) {} 
 
-    void prepare(int nfft,bool inverse,Complex * dst,const Complex *src)
+    template <typename _Src>
+    void prepare(int nfft,bool inverse,Complex * dst,const _Src *src)
     {
       if (m_nfft == nfft) {
         // reuse the twiddles, conjugate if necessary
@@ -73,7 +74,8 @@
       }while(n>1);
     }
 
-    void exec(Complex * dst, const Complex * src)
+    template <typename _Src> // _Src: element type of the input buffer, passed through unchanged to work()
+    void exec(Complex * dst, const _Src * src) // starts work() at stage 0 with fstride=1 and in_stride=1
     {
       work(0, dst, src, 1,1);
     }
@@ -89,7 +91,9 @@
 
     private:
 
-    void work( int stage,Complex * Fout, const Complex * f, size_t fstride,size_t in_stride)
+   
+    template <typename _Src>
+    void work( int stage,Complex * Fout, const _Src * f, size_t fstride,size_t in_stride)
     {
       int p = m_stageRadix[stage];
       int m = m_stageRemainder[stage];
diff --git a/unsupported/test/FFT.cpp b/unsupported/test/FFT.cpp
index 8347bb7..ef03359 100644
--- a/unsupported/test/FFT.cpp
+++ b/unsupported/test/FFT.cpp
@@ -25,55 +25,98 @@
 #include "main.h"
 #include <unsupported/Eigen/FFT.h>
 
+
 using namespace std;
 
+template < typename T> // promote(): widen a real or complex sample to complex<long double>
+complex<long double>  promote(complex<T> x) { return complex<long double>( real(x), imag(x) ); }
+
+complex<long double>  promote(float x) { return complex<long double>( x, 0 ); }
+complex<long double>  promote(double x) { return complex<long double>( x, 0 ); }
+complex<long double>  promote(long double x) { return complex<long double>( x, 0 ); }
+    
+
+    // Relative RMS error of fftbuf against a brute-force O(N^2) DFT of timebuf, with every sample
+    // widened to complex<long double> via promote().  The result is 0/0 (NaN) if both power sums are zero.
+    template <typename T1,typename T2>
+    long double fft_rmse( const vector<T1> & fftbuf,const vector<T2> & timebuf)
+    {
+        long double totalpower=0;
+        long double difpower=0;
+        for (size_t k0=0;k0<fftbuf.size();++k0) {
+            complex<long double> acc = 0;
+            long double phinc = -2.*k0* M_PIl / timebuf.size();
+            for (size_t k1=0;k1<timebuf.size();++k1) {
+                acc +=  promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) );
+            }
+            totalpower += norm(acc);
+            difpower += norm( acc - promote(fftbuf[k0]) );
+        }
+        const long double rmse = sqrt(difpower/totalpower);
+        cerr << "rmse:" << rmse << endl;
+        return rmse;
+    }
+
+    template <typename T1,typename T2> // relative RMS difference of two buffers over their common prefix
+    long double dif_rmse( const vector<T1> & buf1,const vector<T2> & buf2) // by reference: no per-call buffer copies
+    {
+        long double totalpower=0;
+        long double difpower=0;
+        const size_t n = min( buf1.size(),buf2.size() );
+        for (size_t k=0;k<n;++k) {
+            totalpower += (norm( buf1[k] ) + norm(buf2[k]) )/2.; // reference power: mean of the two buffers' powers
+            difpower += norm(buf1[k] - buf2[k]);
+        }
+        return sqrt(difpower/totalpower);
+    }
+
 template <class T>
-void test_fft(int nfft)
+void test_scalar(int nfft)
+{
+    // Forward-transform nfft random real samples in [-0.5,0.5] and check the spectrum with fft_rmse().
+    typedef typename Eigen::FFT<T>::Scalar Scalar;
+    typedef typename Eigen::FFT<T>::Complex Complex;
+    vector<Scalar> inbuf(nfft);
+    for (typename vector<Scalar>::iterator it=inbuf.begin(); it!=inbuf.end(); ++it)
+        *it = static_cast<T>( rand()/static_cast<double>(RAND_MAX) - .5 );
+    vector<Complex> outbuf;
+    FFT<T> fft;
+    fft.fwd( outbuf,inbuf);
+    VERIFY( fft_rmse(outbuf,inbuf) < 1e-5 );// gross check
+}
+
+template <class T>
+void test_complex(int nfft) // complex round trip: fwd() checked with fft_rmse(), then inv() checked against the input with dif_rmse()
 {
     typedef typename Eigen::FFT<T>::Complex Complex;
 
     FFT<T> fft;
 
     vector<Complex> inbuf(nfft);
-    vector<Complex> buf3(nfft);
-    vector<Complex> outbuf(nfft);
+    vector<Complex> outbuf; // left empty: the vector overloads of fwd()/inv() resize their destination
+    vector<Complex> buf3;
     for (int k=0;k<nfft;++k)
-        inbuf[k]= Complex( 
-                (T)(rand()/(double)RAND_MAX - .5),
-                (T)(rand()/(double)RAND_MAX - .5) );
-    fft.fwd( &outbuf[0] , &inbuf[0] ,nfft);
-    fft.inv( &buf3[0] , &outbuf[0] ,nfft);
+        inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
+    fft.fwd( outbuf , inbuf);
 
-    long double totalpower=0;
-    long double difpower=0;
-    for (int k0=0;k0<nfft;++k0) {
-        complex<long double> acc = 0;
-        long double phinc = 2*k0* M_PIl / nfft;
-        for (int k1=0;k1<nfft;++k1) {
-            complex<long double> x(inbuf[k1].real(),inbuf[k1].imag()); 
-            acc += x * exp( complex<long double>(0,-k1*phinc) );
-        }
-        totalpower += norm(acc);
-        complex<long double> x(outbuf[k0].real(),outbuf[k0].imag()); 
-        complex<long double> dif = acc - x;
-        difpower += norm(dif);
-    }
-    long double rmse = sqrt(difpower/totalpower);
-    VERIFY( rmse < 1e-5 );// gross check
+    VERIFY( fft_rmse(outbuf,inbuf) < 1e-5 );// gross check
 
-    totalpower=0;
-    difpower=0;
-    for (int k=0;k<nfft;++k) {
-        totalpower += norm( inbuf[k] );
-        difpower += norm(inbuf[k] - buf3[k]);
-    }
-    rmse = sqrt(difpower/totalpower);
-    VERIFY( rmse < 1e-5 );// gross check
+    fft.inv( buf3 , outbuf);
+
+    VERIFY( dif_rmse(inbuf,buf3) < 1e-5 );// gross check
 }
 
 void test_FFT()
 {
-  CALL_SUBTEST(( test_fft<float>(32) )); CALL_SUBTEST(( test_fft<double>(32) )); CALL_SUBTEST(( test_fft<long double>(32) ));
-  CALL_SUBTEST(( test_fft<float>(1024) )); CALL_SUBTEST(( test_fft<double>(1024) )); CALL_SUBTEST(( test_fft<long double>(1024) ));
-  CALL_SUBTEST(( test_fft<float>(2*3*4*5*7) )); CALL_SUBTEST(( test_fft<double>(2*3*4*5*7) )); CALL_SUBTEST(( test_fft<long double>(2*3*4*5*7) ));
+  CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); CALL_SUBTEST( test_complex<long double>(32) ); // complex input, power-of-two lengths
+  CALL_SUBTEST( test_complex<float>(1024) ); CALL_SUBTEST( test_complex<double>(1024) ); CALL_SUBTEST( test_complex<long double>(1024) );
+  CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); CALL_SUBTEST( test_complex<long double>(3*8) ); // complex input, composite lengths mixing factors 2, 3, 5 and 7
+  CALL_SUBTEST( test_complex<float>(5*32) ); CALL_SUBTEST( test_complex<double>(5*32) ); CALL_SUBTEST( test_complex<long double>(5*32) );
+  CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); CALL_SUBTEST( test_complex<long double>(2*3*4) );
+  CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
+  CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
+
+  CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); CALL_SUBTEST( test_scalar<long double>(32) ); // real (Scalar) input, forward transform only
+  CALL_SUBTEST( test_scalar<float>(1024) ); CALL_SUBTEST( test_scalar<double>(1024) ); CALL_SUBTEST( test_scalar<long double>(1024) );
+  CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
 }