merge: add EIGEN_DEVICE_FUNC annotations to the evaluator-based assignment code and the core evaluators
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index 5d5095a..f481a7a 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -166,7 +166,7 @@
     inner = Index % DstXprType::InnerSizeAtCompileTime
   };
 
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     kernel.assignCoeffByOuterInner(outer, inner);
     copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
@@ -176,13 +176,13 @@
 template<typename Kernel, int Stop>
 struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel&) { }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
 };
 
 template<typename Kernel, int Index, int Stop>
 struct copy_using_evaluator_DefaultTraversal_InnerUnrolling
 {
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel, typename Kernel::Index outer)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, typename Kernel::Index outer)
   {
     kernel.assignCoeffByOuterInner(outer, Index);
     copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index+1, Stop>::run(kernel, outer);
@@ -192,7 +192,7 @@
 template<typename Kernel, int Stop>
 struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel&, typename Kernel::Index) { }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, typename Kernel::Index) { }
 };
 
 /***********************
@@ -202,7 +202,7 @@
 template<typename Kernel, int Index, int Stop>
 struct copy_using_evaluator_LinearTraversal_CompleteUnrolling
 {
-  static EIGEN_STRONG_INLINE void run(Kernel& kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel)
   {
     kernel.assignCoeff(Index);
     copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
@@ -212,7 +212,7 @@
 template<typename Kernel, int Stop>
 struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel&) { }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
 };
 
 /**************************
@@ -232,7 +232,7 @@
     JointAlignment = Kernel::AssignmentTraits::JointAlignment
   };
 
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     kernel.template assignPacketByOuterInner<Aligned, JointAlignment>(outer, inner);
     enum { NextIndex = Index + packet_traits<typename DstXprType::Scalar>::size };
@@ -243,13 +243,13 @@
 template<typename Kernel, int Stop>
 struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel&) { }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
 };
 
 template<typename Kernel, int Index, int Stop>
 struct copy_using_evaluator_innervec_InnerUnrolling
 {
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel, typename Kernel::Index outer)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, typename Kernel::Index outer)
   {
     kernel.template assignPacketByOuterInner<Aligned, Aligned>(outer, Index);
     enum { NextIndex = Index + packet_traits<typename Kernel::Scalar>::size };
@@ -260,7 +260,7 @@
 template<typename Kernel, int Stop>
 struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel &, typename Kernel::Index) { }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, typename Kernel::Index) { }
 };
 
 /***************************************************************************
@@ -281,7 +281,7 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>
 {
-  static void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static void run(Kernel &kernel)
   {
     typedef typename Kernel::Index Index;
     
@@ -296,7 +296,7 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
     copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
@@ -307,7 +307,7 @@
 struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling>
 {
   typedef typename Kernel::Index Index;
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
 
@@ -330,7 +330,7 @@
 {
   // if IsAligned = true, then do nothing
   template <typename Kernel>
-  static EIGEN_STRONG_INLINE void run(Kernel&, typename Kernel::Index, typename Kernel::Index) {}
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, typename Kernel::Index, typename Kernel::Index) {}
 };
 
 template <>
@@ -346,7 +346,7 @@
                                     typename Kernel::Index end)
 #else
   template <typename Kernel>
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel,
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel,
                                       typename Kernel::Index start,
                                       typename Kernel::Index end)
 #endif
@@ -359,7 +359,7 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::Index Index;
 
@@ -387,7 +387,7 @@
 struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling>
 {
   typedef typename Kernel::Index Index;
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
     
@@ -407,7 +407,7 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
 {
-  static inline void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
   {
     typedef typename Kernel::Index Index;
 
@@ -423,7 +423,7 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
     copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
@@ -434,7 +434,7 @@
 struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
 {
   typedef typename Kernel::Index Index;
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
     const Index outerSize = kernel.outerSize();
@@ -450,7 +450,7 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling>
 {
-  static inline void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
   {
     typedef typename Kernel::Index Index;
     const Index size = kernel.size();
@@ -462,7 +462,7 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
 {
-  static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
   {
     typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
     copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
@@ -476,7 +476,7 @@
 template<typename Kernel>
 struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
 {
-  static inline void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
   {
     typedef typename Kernel::Index Index;
     typedef packet_traits<typename Kernel::Scalar> PacketTraits;
@@ -537,7 +537,7 @@
   typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
   
   
-  generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
     : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
   {
     #ifdef EIGEN_DEBUG_ASSIGN
@@ -545,33 +545,33 @@
     #endif
   }
   
-  Index size() const        { return m_dstExpr.size(); }
-  Index innerSize() const   { return m_dstExpr.innerSize(); }
-  Index outerSize() const   { return m_dstExpr.outerSize(); }
-  Index rows() const        { return m_dstExpr.rows(); }
-  Index cols() const        { return m_dstExpr.cols(); }
-  Index outerStride() const { return m_dstExpr.outerStride(); }
+  EIGEN_DEVICE_FUNC Index size() const        { return m_dstExpr.size(); }
+  EIGEN_DEVICE_FUNC Index innerSize() const   { return m_dstExpr.innerSize(); }
+  EIGEN_DEVICE_FUNC Index outerSize() const   { return m_dstExpr.outerSize(); }
+  EIGEN_DEVICE_FUNC Index rows() const        { return m_dstExpr.rows(); }
+  EIGEN_DEVICE_FUNC Index cols() const        { return m_dstExpr.cols(); }
+  EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }
   
   // TODO get rid of this one:
-  DstXprType& dstExpression() const { return m_dstExpr; }
+  EIGEN_DEVICE_FUNC DstXprType& dstExpression() const { return m_dstExpr; }
   
-  DstEvaluatorType& dstEvaluator() { return m_dst; }
-  const SrcEvaluatorType& srcEvaluator() const { return m_src; }
+  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
+  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }
   
   /// Assign src(row,col) to dst(row,col) through the assignment functor.
-  void assignCoeff(Index row, Index col)
+  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
   {
     m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
   }
   
   /// \sa assignCoeff(Index,Index)
-  void assignCoeff(Index index)
+  EIGEN_DEVICE_FUNC void assignCoeff(Index index)
   {
     m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
   }
   
   /// \sa assignCoeff(Index,Index)
-  void assignCoeffByOuterInner(Index outer, Index inner)
+  EIGEN_DEVICE_FUNC void assignCoeffByOuterInner(Index outer, Index inner)
   {
     Index row = rowIndexByOuterInner(outer, inner); 
     Index col = colIndexByOuterInner(outer, inner); 
@@ -580,26 +580,26 @@
   
   
   template<int StoreMode, int LoadMode>
-  void assignPacket(Index row, Index col)
+  EIGEN_DEVICE_FUNC void assignPacket(Index row, Index col)
   {
     m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode>(row,col));
   }
   
   template<int StoreMode, int LoadMode>
-  void assignPacket(Index index)
+  EIGEN_DEVICE_FUNC void assignPacket(Index index)
   {
     m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode>(index));
   }
   
   template<int StoreMode, int LoadMode>
-  void assignPacketByOuterInner(Index outer, Index inner)
+  EIGEN_DEVICE_FUNC void assignPacketByOuterInner(Index outer, Index inner)
   {
     Index row = rowIndexByOuterInner(outer, inner); 
     Index col = colIndexByOuterInner(outer, inner);
     assignPacket<StoreMode,LoadMode>(row, col);
   }
   
-  static Index rowIndexByOuterInner(Index outer, Index inner)
+  EIGEN_DEVICE_FUNC static Index rowIndexByOuterInner(Index outer, Index inner)
   {
     typedef typename DstEvaluatorType::ExpressionTraits Traits;
     return int(Traits::RowsAtCompileTime) == 1 ? 0
@@ -608,7 +608,7 @@
       : inner;
   }
 
-  static Index colIndexByOuterInner(Index outer, Index inner)
+  EIGEN_DEVICE_FUNC static Index colIndexByOuterInner(Index outer, Index inner)
   {
     typedef typename DstEvaluatorType::ExpressionTraits Traits;
     return int(Traits::ColsAtCompileTime) == 1 ? 0
@@ -630,7 +630,7 @@
 ***************************************************************************/
 
 template<typename DstXprType, typename SrcXprType, typename Functor>
-void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
+EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
 {
   eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
   
@@ -647,7 +647,7 @@
 }
 
 template<typename DstXprType, typename SrcXprType>
-void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
+EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
 {
   call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar>());
 }
@@ -681,26 +681,26 @@
-// does not has to bother about these annoying details.
+// does not have to bother about these annoying details.
 
 template<typename Dst, typename Src>
-void call_assignment(Dst& dst, const Src& src)
+EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src)
 {
   call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
 }
 template<typename Dst, typename Src>
-void call_assignment(const Dst& dst, const Src& src)
+EIGEN_DEVICE_FUNC void call_assignment(const Dst& dst, const Src& src)
 {
   call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
 }
                      
 // Deal with AssumeAliasing
 template<typename Dst, typename Src, typename Func>
-void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==1, void*>::type = 0)
+EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==1, void*>::type = 0)
 {
   typename plain_matrix_type<Src>::type tmp(src);
   call_assignment_no_alias(dst, tmp, func);
 }
 
 template<typename Dst, typename Src, typename Func>
-void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==0, void*>::type = 0)
+EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==0, void*>::type = 0)
 {
   call_assignment_no_alias(dst, src, func);
 }
@@ -709,19 +709,19 @@
 // FIXME the const version should probably not be needed
 // When there is no aliasing, we require that 'dst' has been properly resized
 template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
-void call_assignment(const NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
+EIGEN_DEVICE_FUNC void call_assignment(const NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
 {
   call_assignment_no_alias(dst.expression(), src, func);
 }
 template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
-void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
+EIGEN_DEVICE_FUNC void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
 {
   call_assignment_no_alias(dst.expression(), src, func);
 }
 
 
 template<typename Dst, typename Src, typename Func>
-void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
+EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
 {
   enum {
     NeedToTranspose = (  (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
@@ -752,19 +752,19 @@
   Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
 }
 template<typename Dst, typename Src>
-void call_assignment_no_alias(Dst& dst, const Src& src)
+EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src)
 {
   call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar>());
 }
 
-// forxard declaration
+// forward declaration
 template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);
 
 // Generic Dense to Dense assignment
 template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
 struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>
 {
-  static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
     
@@ -781,7 +781,7 @@
 template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
 struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Scalar>
 {
-  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
   {
     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
     
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 737e5dc..9cf9d54 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -178,7 +178,7 @@
     EIGEN_DENSE_PUBLIC_INTERFACE(BlockType)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense)
 
-    class InnerIterator;
+    // class InnerIterator; // FIXME apparently never used
 
     /** Column or Row constructor
       */
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 2980d7a..836d25b 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -89,7 +89,7 @@
 struct evaluator : public unary_evaluator<T>
 {
   typedef unary_evaluator<T> Base;
-  explicit evaluator(const T& xpr) : Base(xpr) {}
+  EIGEN_DEVICE_FUNC explicit evaluator(const T& xpr) : Base(xpr) {}
 };
 
 
@@ -145,18 +145,18 @@
                                             Derived::Options,Derived::MaxRowsAtCompileTime,Derived::MaxColsAtCompileTime>::ret
   };
   
-  evaluator()
+  EIGEN_DEVICE_FUNC evaluator()
     : m_data(0),
       m_outerStride(IsVectorAtCompileTime  ? 0 
                                            : int(IsRowMajor) ? ColsAtCompileTime 
                                            : RowsAtCompileTime)
   {}
   
-  explicit evaluator(const PlainObjectType& m)
+  EIGEN_DEVICE_FUNC explicit evaluator(const PlainObjectType& m)
     : m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 0 : m.outerStride()) 
   { }
 
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     if (IsRowMajor)
       return m_data[row * m_outerStride.value() + col];
@@ -164,12 +164,12 @@
       return m_data[row + col * m_outerStride.value()];
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_data[index];
   }
 
-  Scalar& coeffRef(Index row, Index col)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
   {
     if (IsRowMajor)
       return const_cast<Scalar*>(m_data)[row * m_outerStride.value() + col];
@@ -177,7 +177,7 @@
       return const_cast<Scalar*>(m_data)[row + col * m_outerStride.value()];
   }
 
-  Scalar& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
   {
     return const_cast<Scalar*>(m_data)[index];
   }
@@ -231,7 +231,7 @@
   
   evaluator() {}
 
-  explicit evaluator(const XprType& m)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
     : evaluator<PlainObjectBase<XprType> >(m) 
   { }
 };
@@ -244,7 +244,7 @@
 
   evaluator() {}
   
-  explicit evaluator(const XprType& m)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
     : evaluator<PlainObjectBase<XprType> >(m) 
   { }
 };
@@ -262,7 +262,7 @@
     Flags = evaluator<ArgType>::Flags ^ RowMajorBit
   };
 
-  explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
 
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
@@ -270,22 +270,22 @@
   typedef typename XprType::PacketScalar PacketScalar;
   typedef typename XprType::PacketReturnType PacketReturnType;
 
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     return m_argImpl.coeff(col, row);
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_argImpl.coeff(index);
   }
 
-  Scalar& coeffRef(Index row, Index col)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
   {
     return m_argImpl.coeffRef(col, row);
   }
 
-  typename XprType::Scalar& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC typename XprType::Scalar& coeffRef(Index index)
   {
     return m_argImpl.coeffRef(index);
   }
@@ -339,7 +339,7 @@
           | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit) // FIXME EvalBeforeNestingBit should be needed anymore
   };
 
-  explicit evaluator(const XprType& n)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& n)
     : m_functor(n.functor()) 
   { }
 
@@ -347,12 +347,12 @@
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketScalar PacketScalar;
 
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     return m_functor(row, col);
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_functor(index);
   }
@@ -389,7 +389,7 @@
             | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0))
   };
 
-  explicit unary_evaluator(const XprType& op)
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
     : m_functor(op.functor()), 
       m_argImpl(op.nestedExpression()) 
   { }
@@ -398,12 +398,12 @@
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketScalar PacketScalar;
 
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     return m_functor(m_argImpl.coeff(row, col));
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_functor(m_argImpl.coeff(index));
   }
@@ -435,7 +435,7 @@
   typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
   typedef binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > Base;
   
-  explicit evaluator(const XprType& xpr) : Base(xpr) {}
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 
 template<typename BinaryOp, typename Lhs, typename Rhs>
@@ -463,7 +463,7 @@
     Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit)
   };
 
-  explicit binary_evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr)
     : m_functor(xpr.functor()),
       m_lhsImpl(xpr.lhs()), 
       m_rhsImpl(xpr.rhs())  
@@ -473,12 +473,12 @@
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketScalar PacketScalar;
 
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col));
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index));
   }
@@ -517,7 +517,7 @@
     Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit))
   };
 
-  explicit unary_evaluator(const XprType& op)
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
     : m_unaryOp(op.functor()), 
       m_argImpl(op.nestedExpression()) 
   { }
@@ -526,22 +526,22 @@
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
 
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     return m_unaryOp(m_argImpl.coeff(row, col));
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_unaryOp(m_argImpl.coeff(index));
   }
 
-  Scalar& coeffRef(Index row, Index col)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
   {
     return m_unaryOp(m_argImpl.coeffRef(row, col));
   }
 
-  Scalar& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
   {
     return m_unaryOp(m_argImpl.coeffRef(index));
   }
@@ -575,7 +575,7 @@
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
   
-  explicit mapbase_evaluator(const XprType& map)
+  EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
     : m_data(const_cast<PointerType>(map.data())),  
       m_xpr(map)
   {
@@ -583,22 +583,22 @@
                         PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
   }
  
-  CoeffReturnType coeff(Index row, Index col) const 
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
   }
   
-  CoeffReturnType coeff(Index index) const 
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_data[index * m_xpr.innerStride()];
   }
 
-  Scalar& coeffRef(Index row, Index col) 
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
   {
     return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
   }
   
-  Scalar& coeffRef(Index index) 
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
   {
     return m_data[index * m_xpr.innerStride()];
   }
@@ -665,7 +665,7 @@
     Flags = KeepsPacketAccess ? int(Flags2) : (int(Flags2) & ~PacketAccessBit)
   };
 
-  explicit evaluator(const XprType& map)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map)
     : mapbase_evaluator<XprType, PlainObjectType>(map) 
   { }
 };
@@ -682,7 +682,7 @@
     Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Flags
   };
 
-  explicit evaluator(const XprType& ref)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref)
     : mapbase_evaluator<XprType, PlainObjectType>(ref) 
   { }
 };
@@ -733,7 +733,7 @@
     Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit
   };
   typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
-  explicit evaluator(const XprType& block) : block_evaluator_type(block) {}
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block) {}
 };
 
 // no direct-access => dispatch to a unary evaluator
@@ -743,7 +743,7 @@
 {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
 
-  explicit block_evaluator(const XprType& block)
+  EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
     : unary_evaluator<XprType>(block) 
   {}
 };
@@ -754,7 +754,7 @@
 {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
 
-  explicit unary_evaluator(const XprType& block)
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
     : m_argImpl(block.nestedExpression()), 
       m_startRow(block.startRow()), 
       m_startCol(block.startCol()) 
@@ -770,22 +770,22 @@
     RowsAtCompileTime = XprType::RowsAtCompileTime
   };
  
-  CoeffReturnType coeff(Index row, Index col) const 
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   { 
     return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); 
   }
   
-  CoeffReturnType coeff(Index index) const 
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   { 
     return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
   }
 
-  Scalar& coeffRef(Index row, Index col) 
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
   { 
     return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); 
   }
   
-  Scalar& coeffRef(Index index) 
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
   { 
     return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
   }
@@ -833,7 +833,7 @@
 {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
 
-  explicit block_evaluator(const XprType& block)
+  EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
     : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) 
   {
     // FIXME this should be an internal assertion
@@ -859,7 +859,7 @@
     Flags = (unsigned int)evaluator<ThenMatrixType>::Flags & evaluator<ElseMatrixType>::Flags & HereditaryBits
   };
 
-  explicit evaluator(const XprType& select)
+  EIGEN_DEVICE_FUNC  explicit evaluator(const XprType& select)
     : m_conditionImpl(select.conditionMatrix()),
       m_thenImpl(select.thenMatrix()),
       m_elseImpl(select.elseMatrix())
@@ -868,7 +868,7 @@
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
 
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     if (m_conditionImpl.coeff(row, col))
       return m_thenImpl.coeff(row, col);
@@ -876,7 +876,7 @@
       return m_elseImpl.coeff(row, col);
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     if (m_conditionImpl.coeff(index))
       return m_thenImpl.coeff(index);
@@ -913,14 +913,14 @@
     Flags = (evaluator<ArgTypeNestedCleaned>::Flags & HereditaryBits & ~RowMajorBit) | (traits<XprType>::Flags & RowMajorBit)
   };
 
-  explicit unary_evaluator(const XprType& replicate)
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate)
     : m_arg(replicate.nestedExpression()),
       m_argImpl(m_arg),
       m_rows(replicate.nestedExpression().rows()),
       m_cols(replicate.nestedExpression().cols())
   {}
  
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     // try to avoid using modulo; this is a pure optimization strategy
     const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
@@ -977,19 +977,19 @@
     Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&HereditaryBits)
   };
 
-  explicit evaluator(const XprType expr)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType expr)
     : m_expr(expr)
   {}
 
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
  
-  CoeffReturnType coeff(Index row, Index col) const 
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   { 
     return m_expr.coeff(row, col);
   }
   
-  CoeffReturnType coeff(Index index) const 
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   { 
     return m_expr.coeff(index);
   }
@@ -1014,7 +1014,7 @@
     Flags = evaluator<ArgType>::Flags
   };
 
-  explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
+  EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
 
   typedef typename ArgType::Index Index;
   typedef typename ArgType::Scalar Scalar;
@@ -1022,22 +1022,22 @@
   typedef typename ArgType::PacketScalar PacketScalar;
   typedef typename ArgType::PacketReturnType PacketReturnType;
 
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     return m_argImpl.coeff(row, col);
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_argImpl.coeff(index);
   }
 
-  Scalar& coeffRef(Index row, Index col)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
   {
     return m_argImpl.coeffRef(row, col);
   }
 
-  Scalar& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
   {
     return m_argImpl.coeffRef(index);
   }
@@ -1076,7 +1076,7 @@
 {
   typedef MatrixWrapper<TArgType> XprType;
 
-  explicit unary_evaluator(const XprType& wrapper)
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper)
     : evaluator_wrapper_base<MatrixWrapper<TArgType> >(wrapper.nestedExpression())
   { }
 };
@@ -1087,7 +1087,7 @@
 {
   typedef ArrayWrapper<TArgType> XprType;
 
-  explicit unary_evaluator(const XprType& wrapper)
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper)
     : evaluator_wrapper_base<ArrayWrapper<TArgType> >(wrapper.nestedExpression())
   { }
 };
@@ -1133,30 +1133,30 @@
   };
   typedef internal::reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;
 
-  explicit unary_evaluator(const XprType& reverse)
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse)
     : m_argImpl(reverse.nestedExpression()),
       m_rows(ReverseRow ? reverse.nestedExpression().rows() : 0),
       m_cols(ReverseCol ? reverse.nestedExpression().cols() : 0)
   { }
  
-  CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
   {
     return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row,
                            ReverseCol ? m_cols.value() - col - 1 : col);
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_argImpl.coeff(m_rows.value() * m_cols.value() - index - 1);
   }
 
-  Scalar& coeffRef(Index row, Index col)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
   {
     return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row,
                               ReverseCol ? m_cols.value() - col - 1 : col);
   }
 
-  Scalar& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
   {
     return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1);
   }
@@ -1214,7 +1214,7 @@
     Flags = (unsigned int)evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit) & ~RowMajorBit
   };
 
-  explicit evaluator(const XprType& diagonal)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal)
     : m_argImpl(diagonal.nestedExpression()),
       m_index(diagonal.index())
   { }
@@ -1223,22 +1223,22 @@
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
 
-  CoeffReturnType coeff(Index row, Index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const
   {
     return m_argImpl.coeff(row + rowOffset(), row + colOffset());
   }
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_argImpl.coeff(index + rowOffset(), index + colOffset());
   }
 
-  Scalar& coeffRef(Index row, Index)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index)
   {
     return m_argImpl.coeffRef(row + rowOffset(), row + colOffset());
   }
 
-  Scalar& coeffRef(Index index)
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
   {
     return m_argImpl.coeffRef(index + rowOffset(), index + colOffset());
   }
@@ -1248,8 +1248,8 @@
   const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
 
 private:
-  EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); }
-  EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; }
 };
 
 
@@ -1311,7 +1311,7 @@
   typedef evaluator type;
   typedef evaluator nestedType;
 
-  explicit evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
     : m_result(xpr.rows(), xpr.cols())
   {
     ::new (static_cast<Base*>(this)) Base(m_result);
@@ -1320,7 +1320,7 @@
   }
 
   // This constructor is used when nesting an EvalTo evaluator in another evaluator
-  evaluator(const ArgType& arg) 
+  EIGEN_DEVICE_FUNC evaluator(const ArgType& arg)
     : m_result(arg.rows(), arg.cols())
   {
     ::new (static_cast<Base*>(this)) Base(m_result);
diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h
index 6384dfd..6680f32 100644
--- a/Eigen/src/Core/CwiseUnaryView.h
+++ b/Eigen/src/Core/CwiseUnaryView.h
@@ -110,15 +110,15 @@
     EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl)
     
-    inline Scalar* data() { return &(this->coeffRef(0)); }
-    inline const Scalar* data() const { return &(this->coeff(0)); }
+    EIGEN_DEVICE_FUNC inline Scalar* data() { return &(this->coeffRef(0)); }
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeff(0)); }
 
-    inline Index innerStride() const
+    EIGEN_DEVICE_FUNC inline Index innerStride() const
     {
       return derived().nestedExpression().innerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
     }
 
-    inline Index outerStride() const
+    EIGEN_DEVICE_FUNC inline Index outerStride() const
     {
       return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
     }
diff --git a/Eigen/src/Core/Flagged.h b/Eigen/src/Core/Flagged.h
index 6ce11ed..2e2a50b 100644
--- a/Eigen/src/Core/Flagged.h
+++ b/Eigen/src/Core/Flagged.h
@@ -50,37 +50,37 @@
 
     explicit inline Flagged(const ExpressionType& matrix) : m_matrix(matrix) {}
 
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    inline Index innerStride() const { return m_matrix.innerStride(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_matrix.outerStride(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_matrix.innerStride(); }
 
-    inline CoeffReturnType coeff(Index row, Index col) const
+    EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index row, Index col) const
     {
       return m_matrix.coeff(row, col);
     }
 
-    inline CoeffReturnType coeff(Index index) const
+    EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index index) const
     {
       return m_matrix.coeff(index);
     }
     
-    inline const Scalar& coeffRef(Index row, Index col) const
+    EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index row, Index col) const
     {
       return m_matrix.const_cast_derived().coeffRef(row, col);
     }
 
-    inline const Scalar& coeffRef(Index index) const
+    EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const
     {
       return m_matrix.const_cast_derived().coeffRef(index);
     }
 
-    inline Scalar& coeffRef(Index row, Index col)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
     {
       return m_matrix.const_cast_derived().coeffRef(row, col);
     }
 
-    inline Scalar& coeffRef(Index index)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
     {
       return m_matrix.const_cast_derived().coeffRef(index);
     }
@@ -109,13 +109,13 @@
       m_matrix.const_cast_derived().template writePacket<LoadMode>(index, x);
     }
 
-    const ExpressionType& _expression() const { return m_matrix; }
+    EIGEN_DEVICE_FUNC const ExpressionType& _expression() const { return m_matrix; }
 
     template<typename OtherDerived>
-    typename ExpressionType::PlainObject solveTriangular(const MatrixBase<OtherDerived>& other) const;
+    EIGEN_DEVICE_FUNC typename ExpressionType::PlainObject solveTriangular(const MatrixBase<OtherDerived>& other) const;
 
     template<typename OtherDerived>
-    void solveTriangularInPlace(const MatrixBase<OtherDerived>& other) const;
+    EIGEN_DEVICE_FUNC void solveTriangularInPlace(const MatrixBase<OtherDerived>& other) const;
 
   protected:
     ExpressionTypeNested m_matrix;
diff --git a/Eigen/src/Core/ForceAlignedAccess.h b/Eigen/src/Core/ForceAlignedAccess.h
index 065acfa..7b08b45 100644
--- a/Eigen/src/Core/ForceAlignedAccess.h
+++ b/Eigen/src/Core/ForceAlignedAccess.h
@@ -39,29 +39,29 @@
     typedef typename internal::dense_xpr_base<ForceAlignedAccess>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(ForceAlignedAccess)
 
-    explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
+    EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
 
-    inline Index rows() const { return m_expression.rows(); }
-    inline Index cols() const { return m_expression.cols(); }
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    inline Index innerStride() const { return m_expression.innerStride(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
 
-    inline const CoeffReturnType coeff(Index row, Index col) const
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
     {
       return m_expression.coeff(row, col);
     }
 
-    inline Scalar& coeffRef(Index row, Index col)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
     {
       return m_expression.const_cast_derived().coeffRef(row, col);
     }
 
-    inline const CoeffReturnType coeff(Index index) const
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
     {
       return m_expression.coeff(index);
     }
 
-    inline Scalar& coeffRef(Index index)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
     {
       return m_expression.const_cast_derived().coeffRef(index);
     }
@@ -90,7 +90,7 @@
       m_expression.const_cast_derived().template writePacket<Aligned>(index, x);
     }
 
-    operator const ExpressionType&() const { return m_expression; }
+    EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
 
   protected:
     const ExpressionType& m_expression;
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index 3e68b1e..3c67eda 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -85,7 +85,7 @@
       *
       * \sa innerStride(), outerStride()
       */
-    inline const Scalar* data() const { return m_data; }
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_data; }
 
     EIGEN_DEVICE_FUNC
     inline const Scalar& coeff(Index rowId, Index colId) const
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 048060e..0015131 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -180,7 +180,7 @@
 #ifdef __CUDACC__
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    const typename LazyProductReturnType<Derived,OtherDerived>::Type
+    const Product<Derived,OtherDerived,LazyProduct>
     operator*(const MatrixBase<OtherDerived> &other) const
     { return this->lazyProduct(other); }
 #else
diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h
index 248dd8e..9aeaf8d 100644
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@@ -40,29 +40,29 @@
     typedef typename internal::dense_xpr_base<NestByValue>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(NestByValue)
 
-    explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
+    EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
 
-    inline Index rows() const { return m_expression.rows(); }
-    inline Index cols() const { return m_expression.cols(); }
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    inline Index innerStride() const { return m_expression.innerStride(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
 
-    inline const CoeffReturnType coeff(Index row, Index col) const
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
     {
       return m_expression.coeff(row, col);
     }
 
-    inline Scalar& coeffRef(Index row, Index col)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
     {
       return m_expression.const_cast_derived().coeffRef(row, col);
     }
 
-    inline const CoeffReturnType coeff(Index index) const
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
     {
       return m_expression.coeff(index);
     }
 
-    inline Scalar& coeffRef(Index index)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
     {
       return m_expression.const_cast_derived().coeffRef(index);
     }
@@ -91,7 +91,7 @@
       m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
     }
 
-    operator const ExpressionType&() const { return m_expression; }
+    EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
 
   protected:
     const ExpressionType m_expression;
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index ec7621d..58fe0f6 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -221,11 +221,11 @@
     }
 
     /** \returns a const pointer to the data array of this matrix */
-    EIGEN_STRONG_INLINE const Scalar *data() const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const
     { return m_storage.data(); }
 
     /** \returns a pointer to the data array of this matrix */
-    EIGEN_STRONG_INLINE Scalar *data()
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data()
     { return m_storage.data(); }
 
     /** Resizes \c *this to a \a rows x \a cols matrix.
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index ae64d52..cb79543 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -68,7 +68,7 @@
                                                 typename RhsTraits::StorageKind,
                                                 internal::product_type<Lhs,Rhs>::ret>::ret StorageKind;
   typedef typename promote_index_type<typename LhsTraits::Index,
-                                         typename RhsTraits::Index>::type Index;
+                                      typename RhsTraits::Index>::type Index;
   
   enum {
     RowsAtCompileTime    = LhsTraits::RowsAtCompileTime,
@@ -113,18 +113,18 @@
     typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
     typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
 
-    Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
+    EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
     {
       eigen_assert(lhs.cols() == rhs.rows()
         && "invalid matrix product"
         && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
     }
 
-    inline Index rows() const { return m_lhs.rows(); }
-    inline Index cols() const { return m_rhs.cols(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
 
-    const LhsNestedCleaned& lhs() const { return m_lhs; }
-    const RhsNestedCleaned& rhs() const { return m_rhs; }
+    EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
+    EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
 
   protected:
 
@@ -186,7 +186,7 @@
     
   public:
   
-    Scalar coeff(Index row, Index col) const
+    EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const
     {
       EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
       eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
@@ -194,7 +194,7 @@
       return typename internal::evaluator<Derived>::type(derived()).coeff(row,col);
     }
 
-    Scalar coeff(Index i) const
+    EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
     {
       EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
       eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index c944ec9..3cebbbd 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -35,7 +35,7 @@
   typedef evaluator type;
   typedef evaluator nestedType;
   
-  explicit evaluator(const XprType& xpr) : Base(xpr) {}
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
  
 // Catch scalar * ( A * B ) and transform it to (A*scalar) * B
@@ -50,7 +50,7 @@
   typedef evaluator type;
   typedef evaluator nestedType;
   
-  explicit evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
     : Base(xpr.functor().m_other * xpr.nestedExpression().lhs() * xpr.nestedExpression().rhs())
   {}
 };
@@ -66,7 +66,7 @@
   typedef evaluator type;
   typedef evaluator nestedType;
 
-  explicit evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
     : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
         Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
         xpr.index() ))
@@ -104,7 +104,7 @@
 //     CoeffReadCost = 0 // FIXME why is it needed? (this was already the case before the evaluators, see traits<ProductBase>)
   };
 
-  explicit product_evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : m_result(xpr.rows(), xpr.cols())
   {
     ::new (static_cast<Base*>(this)) Base(m_result);
@@ -378,7 +378,7 @@
   typedef typename XprType::PacketScalar PacketScalar;
   typedef typename XprType::PacketReturnType PacketReturnType;
 
-  explicit product_evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : m_lhs(xpr.lhs()),
       m_rhs(xpr.rhs()),
       m_lhsImpl(m_lhs),     // FIXME the creation of the evaluator objects should result in a no-op, but check that!
@@ -461,7 +461,7 @@
                         && (InnerSize % packet_traits<Scalar>::size == 0)
   };
   
-  const CoeffReturnType coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index row, Index col) const
   {
     // TODO check performance regression wrt to Eigen 3.2 which has special handling of this function
     return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
@@ -471,7 +471,7 @@
    * which is why we don't set the LinearAccessBit.
    * TODO: this seems possible when the result is a vector
    */
-  const CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const
   {
     const Index row = RowsAtCompileTime == 1 ? 0 : index;
     const Index col = RowsAtCompileTime == 1 ? index : 0;
@@ -512,7 +512,7 @@
   enum {
     Flags = Base::Flags | EvalBeforeNestingBit
   };
-  explicit product_evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : Base(BaseProduct(xpr.lhs(),xpr.rhs()))
   {}
 };
@@ -694,7 +694,7 @@
   {
   }
   
-  EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
   {
     return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
   }
@@ -743,19 +743,21 @@
     StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor
   };
 
-  explicit product_evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : Base(xpr.rhs(), xpr.lhs().diagonal())
   {
   }
   
-  EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
   {
     return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
   }
   
+#ifndef __CUDACC__
   template<int LoadMode>
   EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
   {
+    // NVCC complains about template keyword, so we disable this function in CUDA mode
     return this->template packet_impl<LoadMode>(row,col, row,
                                  typename internal::conditional<int(StorageOrder)==RowMajor, internal::true_type, internal::false_type>::type());
   }
@@ -765,7 +767,7 @@
   {
     return packet<LoadMode>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
   }
-  
+#endif
 };
 
 // dense * diagonal
@@ -787,16 +789,17 @@
   
   enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor };
 
-  explicit product_evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : Base(xpr.lhs(), xpr.rhs().diagonal())
   {
   }
   
-  EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
   {
     return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
   }
   
+#ifndef __CUDACC__
   template<int LoadMode>
   EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
   {
@@ -809,7 +812,7 @@
   {
     return packet<LoadMode>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
   }
-  
+#endif
 };
 
 /***************************************************************************
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 14a2671..f654691 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -277,7 +277,7 @@
   typedef typename packet_traits<Scalar>::type PacketScalar;
   typedef typename Derived::Index Index;
 
-  static Scalar run(const Derived &mat, const Func& func)
+  EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
   {
     eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
     const Index innerSize = mat.innerSize();
@@ -319,7 +319,7 @@
     Size = Derived::SizeAtCompileTime,
     VectorizedSize = (Size / PacketSize) * PacketSize
   };
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
   {
     eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
     if (VectorizedSize > 0) {
@@ -340,7 +340,7 @@
 {
 public:
   typedef _XprType XprType;
-  explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
   
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
@@ -359,15 +359,17 @@
     CoeffReadCost = evaluator<XprType>::CoeffReadCost
   };
   
-  Index rows() const { return m_xpr.rows(); }
-  Index cols() const { return m_xpr.cols(); }
-  Index size() const { return m_xpr.size(); }
-  Index innerSize() const { return m_xpr.innerSize(); }
-  Index outerSize() const { return m_xpr.outerSize(); }
+  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
+  EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
+  EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
 
+  EIGEN_DEVICE_FUNC
   CoeffReturnType coeff(Index row, Index col) const
   { return m_evaluator.coeff(row, col); }
 
+  EIGEN_DEVICE_FUNC
   CoeffReturnType coeff(Index index) const
   { return m_evaluator.coeff(index); }
 
@@ -379,6 +381,7 @@
   PacketReturnType packet(Index index) const
   { return m_evaluator.template packet<LoadMode>(index); }
   
+  EIGEN_DEVICE_FUNC
   CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
   { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
   
diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h
index 2653f2b..27fa178 100644
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@@ -127,12 +127,12 @@
   typedef MapBase<Derived> Base;
   EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
 
-  inline Index innerStride() const
+  EIGEN_DEVICE_FUNC inline Index innerStride() const
   {
     return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
   }
 
-  inline Index outerStride() const
+  EIGEN_DEVICE_FUNC inline Index outerStride() const
   {
     return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
          : IsVectorAtCompileTime ? this->size()
@@ -140,7 +140,7 @@
          : this->rows();
   }
 
-  RefBase()
+  EIGEN_DEVICE_FUNC RefBase()
     : Base(0,RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime),
       // Stride<> does not allow default ctor for Dynamic strides, so let' initialize it with dummy values:
       m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime,
@@ -154,7 +154,7 @@
   typedef Stride<StrideType::OuterStrideAtCompileTime,StrideType::InnerStrideAtCompileTime> StrideBase;
 
   template<typename Expression>
-  void construct(Expression& expr)
+  EIGEN_DEVICE_FUNC void construct(Expression& expr)
   {
     if(PlainObjectType::RowsAtCompileTime==1)
     {
@@ -192,13 +192,13 @@
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename Derived>
-    inline Ref(PlainObjectBase<Derived>& expr,
+    EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase<Derived>& expr,
                typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
     {
       Base::construct(expr);
     }
     template<typename Derived>
-    inline Ref(const DenseBase<Derived>& expr,
+    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
                typename internal::enable_if<bool(internal::is_lvalue<Derived>::value&&bool(Traits::template match<Derived>::MatchAtCompileTime)),Derived>::type* = 0,
                int = Derived::ThisConstantIsPrivateInPlainObjectBase)
     #else
@@ -224,7 +224,7 @@
     EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
 
     template<typename Derived>
-    inline Ref(const DenseBase<Derived>& expr)
+    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr)
     {
 //      std::cout << match_helper<Derived>::HasDirectAccess << "," << match_helper<Derived>::OuterStrideMatch << "," << match_helper<Derived>::InnerStrideMatch << "\n";
 //      std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n";
@@ -232,25 +232,25 @@
       construct(expr.derived(), typename Traits::template match<Derived>::type());
     }
 
-    inline Ref(const Ref& other) : Base(other) {
+    EIGEN_DEVICE_FUNC inline Ref(const Ref& other) : Base(other) {
       // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
     }
 
     template<typename OtherRef>
-    inline Ref(const RefBase<OtherRef>& other) {
+    EIGEN_DEVICE_FUNC inline Ref(const RefBase<OtherRef>& other) {
       construct(other.derived(), typename Traits::template match<OtherRef>::type());
     }
 
   protected:
 
     template<typename Expression>
-    void construct(const Expression& expr,internal::true_type)
+    EIGEN_DEVICE_FUNC void construct(const Expression& expr,internal::true_type)
     {
       Base::construct(expr);
     }
 
     template<typename Expression>
-    void construct(const Expression& expr, internal::false_type)
+    EIGEN_DEVICE_FUNC void construct(const Expression& expr, internal::false_type)
     {
       internal::call_assignment_no_alias(m_object,expr,internal::assign_op<Scalar>());
       Base::construct(m_object);
diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h
index 5fcd9e3..4e2a81b 100644
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -103,7 +103,7 @@
   typedef evaluator type;
   typedef evaluator nestedType;
 
-  explicit evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
     : m_result(xpr.rows(), xpr.cols())
   {
     ::new (static_cast<Base*>(this)) Base(m_result);
diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h
index 9ba6ea2..291300a 100644
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@@ -89,47 +89,47 @@
     typedef internal::reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;
   public:
 
-    explicit inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }
+    EIGEN_DEVICE_FUNC explicit inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
 
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
 
-    inline Index innerStride() const
+    EIGEN_DEVICE_FUNC inline Index innerStride() const
     {
       return -m_matrix.innerStride();
     }
 
-    inline Scalar& operator()(Index row, Index col)
+    EIGEN_DEVICE_FUNC inline Scalar& operator()(Index row, Index col)
     {
       eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
       return coeffRef(row, col);
     }
 
-    inline Scalar& coeffRef(Index row, Index col)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
     {
       return m_matrix.const_cast_derived().coeffRef(ReverseRow ? m_matrix.rows() - row - 1 : row,
                                                     ReverseCol ? m_matrix.cols() - col - 1 : col);
     }
 
-    inline CoeffReturnType coeff(Index row, Index col) const
+    EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index row, Index col) const
     {
       return m_matrix.coeff(ReverseRow ? m_matrix.rows() - row - 1 : row,
                             ReverseCol ? m_matrix.cols() - col - 1 : col);
     }
 
-    inline CoeffReturnType coeff(Index index) const
+    EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index index) const
     {
       return m_matrix.coeff(m_matrix.size() - index - 1);
     }
 
-    inline Scalar& coeffRef(Index index)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
     {
       return m_matrix.const_cast_derived().coeffRef(m_matrix.size() - index - 1);
     }
 
-    inline Scalar& operator()(Index index)
+    EIGEN_DEVICE_FUNC inline Scalar& operator()(Index index)
     {
       eigen_assert(index >= 0 && index < m_matrix.size());
       return coeffRef(index);
@@ -164,7 +164,7 @@
       m_matrix.const_cast_derived().template writePacket<LoadMode>(m_matrix.size() - index - PacketSize, internal::preverse(x));
     }
 
-    const typename internal::remove_all<typename MatrixType::Nested>::type& 
+    EIGEN_DEVICE_FUNC const typename internal::remove_all<typename MatrixType::Nested>::type&
     nestedExpression() const 
     {
       return m_matrix;
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index 1c44d9c..b785e8e 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -228,11 +228,11 @@
   typedef typename Base::AssignmentTraits AssignmentTraits;
   
   
-  triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
     : Base(dst, src, func, dstExpr)
   {}
   
-  void assignCoeff(Index row, Index col)
+  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
   {
     eigen_internal_assert(row!=col);
     Scalar tmp = m_src.coeff(row,col);
@@ -240,12 +240,12 @@
     m_functor.assignCoeff(m_dst.coeffRef(col,row), numext::conj(tmp));
   }
   
-  void assignDiagonalCoeff(Index id)
+  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
   {
     Base::assignCoeff(id,id);
   }
   
-  void assignOppositeCoeff(Index, Index)
+  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index)
   { eigen_internal_assert(false && "should never be called"); }
 };
 
diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
index 641ffa2..3905cd6 100644
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@@ -121,7 +121,7 @@
   typedef evaluator type;
   typedef evaluator nestedType;
 
-  explicit evaluator(const SolveType& solve)
+  EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve)
     : m_result(solve.rows(), solve.cols())
   {
     ::new (static_cast<Base*>(this)) Base(m_result);
diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h
index 3277cb5..5531932 100644
--- a/Eigen/src/Core/Swap.h
+++ b/Eigen/src/Core/Swap.h
@@ -32,7 +32,7 @@
   typedef typename Base::DstXprType DstXprType;
   typedef swap_assign_op<Scalar> Functor;
   
-  generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
+  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
     : Base(dst, src, func, dstExpr)
   {}
   
diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 57d6fd2..a3b9525 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@@ -129,8 +129,8 @@
                        const Scalar
                      >::type ScalarWithConstIfNotLvalue;
 
-    inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
-    inline const Scalar* data() const { return derived().nestedExpression().data(); }
+    EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); }
 
     // FIXME: shall we keep the const version of coeffRef?
     EIGEN_DEVICE_FUNC
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index 055ed75..defe29c 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -692,12 +692,12 @@
   typedef typename Base::AssignmentTraits AssignmentTraits;
   
   
-  triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
     : Base(dst, src, func, dstExpr)
   {}
   
 #ifdef EIGEN_INTERNAL_DEBUGGING
-  void assignCoeff(Index row, Index col)
+  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
   {
     eigen_internal_assert(row!=col);
     Base::assignCoeff(row,col);
@@ -706,14 +706,14 @@
   using Base::assignCoeff;
 #endif
   
-  void assignDiagonalCoeff(Index id)
+  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
   {
          if(Mode==UnitDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(1));
     else if(Mode==ZeroDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(0));
     else if(Mode==0)                       Base::assignCoeff(id,id);
   }
   
-  void assignOppositeCoeff(Index row, Index col)
+  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index row, Index col)
   { 
     eigen_internal_assert(row!=col);
     if(SetOpposite)
@@ -722,7 +722,7 @@
 };
 
 template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType, typename Functor>
-void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
+EIGEN_DEVICE_FUNC void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
 {
   eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
   
@@ -746,7 +746,7 @@
 }
 
 template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType>
-void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src)
+EIGEN_DEVICE_FUNC void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src)
 {
   call_triangular_assignment_loop<Mode,SetOpposite>(dst, src, internal::assign_op<typename DstXprType::Scalar>());
 }
@@ -759,7 +759,7 @@
 template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
 struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular, Scalar>
 {
-  static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
     eigen_assert(int(DstXprType::Mode) == int(SrcXprType::Mode));
     
@@ -770,7 +770,7 @@
 template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
 struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense, Scalar>
 {
-  static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
     call_triangular_assignment_loop<SrcXprType::Mode, (SrcXprType::Mode&SelfAdjoint)==0>(dst, src, func);  
   }
@@ -779,7 +779,7 @@
 template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
 struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular, Scalar>
 {
-  static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
     call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);  
   }
diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h
index d4d85a1..161b0aa 100644
--- a/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -123,7 +123,7 @@
 
 
 /** \internal
-  * \brief Template functor for scalar/packet assignment with swaping
+  * \brief Template functor for scalar/packet assignment with swapping
   *
   * It works as follow. For a non-vectorized evaluation loop, we have:
   *   for(i) func(A.coeffRef(i), B.coeff(i));
@@ -142,8 +142,13 @@
   EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
   {
+#ifdef __CUDACC__
+    // FIXME: manual swap, presumably because std::swap is not callable from CUDA device code; is there a device-side swap?
+    Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
+#else
     using std::swap;
     swap(a,const_cast<Scalar&>(b));
+#endif
   }
   
   template<int LhsAlignment, int RhsAlignment, typename Packet>
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index 65bb294..3ed002a 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -489,20 +489,12 @@
 endmacro(ei_set_build_string)
 
 macro(ei_is_64bit_env VAR)
-
-  file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/is64.cpp"
-      "int main() { return (sizeof(int*) == 8 ? 1 : 0); }
-      ")
-  try_run(run_res compile_res
-         ${CMAKE_CURRENT_BINARY_DIR} "${CMAKE_CURRENT_BINARY_DIR}/is64.cpp"
-          RUN_OUTPUT_VARIABLE run_output)
-
-  if(compile_res AND run_res)
-    set(${VAR} ${run_res})
-  elseif(CMAKE_CL_64)
+  if(CMAKE_SIZEOF_VOID_P EQUAL 8)
     set(${VAR} 1)
-  elseif("$ENV{Platform}" STREQUAL "X64") # nmake 64 bit
-    set(${VAR} 1)
+  elseif(CMAKE_SIZEOF_VOID_P EQUAL 4)
+    set(${VAR} 0)
+  else()
+    message(WARNING "Unsupported pointer size. Please contact the authors.")
   endif()
 endmacro(ei_is_64bit_env)
 
diff --git a/test/cuda_basic.cu b/test/cuda_basic.cu
index 4c7e96c..300bced 100644
--- a/test/cuda_basic.cu
+++ b/test/cuda_basic.cu
@@ -65,7 +65,7 @@
 };
 
 template<typename T1, typename T2>
-struct prod {
+struct prod_test {
   EIGEN_DEVICE_FUNC
   void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
   {
@@ -125,8 +125,8 @@
   CALL_SUBTEST( run_and_compare_to_cuda(redux<Array4f>(), nthreads, in, out) );
   CALL_SUBTEST( run_and_compare_to_cuda(redux<Matrix3f>(), nthreads, in, out) );
   
-  CALL_SUBTEST( run_and_compare_to_cuda(prod<Matrix3f,Matrix3f>(), nthreads, in, out) );
-  CALL_SUBTEST( run_and_compare_to_cuda(prod<Matrix4f,Vector4f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_cuda(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
+  CALL_SUBTEST( run_and_compare_to_cuda(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );
   
   CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
   CALL_SUBTEST( run_and_compare_to_cuda(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );