Fix a race in async tensor evaluation: don't run on_done() until device.deallocate() / evaluator.cleanup() have completed, since the device may be destroyed as soon as on_done() runs, which would leave the later cleanup calls touching a destroyed device.

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 4f72156..0fb0a92 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h

@@ -592,8 +592,8 @@
         : evaluator(expr, thread_pool), on_done(std::move(done)) {}
 
     ~TensorAsyncExecutorContext() {
-      on_done();
       evaluator.cleanup();
+      on_done();
     }
 
     Evaluator evaluator;
@@ -674,9 +674,9 @@
           on_done(std::move(done)) {}
 
     ~TensorAsyncExecutorContext() {
-      on_done();
       device.deallocate(tiling.buffer);
       evaluator.cleanup();
+      on_done();
     }
 
     const ThreadPoolDevice& device;
@@ -755,9 +755,9 @@
           on_done(std::move(done)) {}
 
     ~TensorAsyncExecutorContext() {
-      on_done();
       device.deallocate(tiling.buffer);
       evaluator.cleanup();
+      on_done();
     }
 
     const ThreadPoolDevice& device;