bugfix
diff --git a/bench/tensors/benchmark.h b/bench/tensors/benchmark.h
index d8b4fd4..2c06075 100644
--- a/bench/tensors/benchmark.h
+++ b/bench/tensors/benchmark.h
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <stddef.h>
 #include <stdint.h>
 #include <vector>
 
@@ -45,4 +46,5 @@
 void StartBenchmarkTiming();
 #define BENCHMARK(f) \
     static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \
-        (new ::testing::Benchmark(#f, f))
\ No newline at end of file
+        (new ::testing::Benchmark(#f, f))
+
diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc
index 0fc1296..b2f457c 100644
--- a/bench/tensors/benchmark_main.cc
+++ b/bench/tensors/benchmark_main.cc
@@ -17,6 +17,7 @@
 #include <regex.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include <string>
 #include <inttypes.h>
 #include <time.h>
@@ -27,8 +28,14 @@
 static int64_t g_benchmark_start_time_ns;
 typedef std::map<std::string, ::testing::Benchmark*> BenchmarkMap;
 typedef BenchmarkMap::iterator BenchmarkMapIt;
-static BenchmarkMap g_benchmarks;
+
+static BenchmarkMap& gBenchmarks() {  // file-local accessor for the name -> Benchmark registry
+  static BenchmarkMap g_benchmarks;  // constructed on first call, so inserts made during static initialization (see BENCHMARK) always find a live map
+  return g_benchmarks;
+}
+
 static int g_name_column_width = 20;
+
 static int Round(int n) {
   int base = 1;
   while (base*10 < n) {
@@ -101,7 +108,7 @@
     fprintf(stderr, "%s: missing function\n", name_);
     exit(EXIT_FAILURE);
   }
-  g_benchmarks.insert(std::make_pair(name, this));
+  gBenchmarks().insert(std::make_pair(name, this));
 }
 void Benchmark::Run() {
   if (fn_ != NULL) {
@@ -183,16 +190,16 @@
   }
 }
 int main(int argc, char* argv[]) {
-  if (g_benchmarks.empty()) {
+  if (gBenchmarks().empty()) {
     fprintf(stderr, "No benchmarks registered!\n");
     exit(EXIT_FAILURE);
   }
-  for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) {
+  for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
     int name_width = static_cast<int>(strlen(it->second->Name()));
     g_name_column_width = std::max(g_name_column_width, name_width);
   }
   bool need_header = true;
-  for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) {
+  for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
     ::testing::Benchmark* b = it->second;
     if (b->ShouldRun(argc, argv)) {
       if (need_header) {
@@ -206,10 +213,10 @@
   if (need_header) {
     fprintf(stderr, "No matching benchmarks!\n");
     fprintf(stderr, "Available benchmarks:\n");
-    for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) {
+    for (BenchmarkMapIt it = gBenchmarks().begin(); it != gBenchmarks().end(); ++it) {
       fprintf(stderr, "  %s\n", it->second->Name());
     }
     exit(EXIT_FAILURE);
   }
   return 0;
-}
\ No newline at end of file
+}
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index a1696af..071326a 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -10,13 +10,6 @@
 #define BENCHMARK_RANGE(bench, lo, hi) \
   BENCHMARK(bench)->Range(lo, hi)
 
-template <typename... Args>
-std::string StrCat(const Args... args) {
-  std::stringstream ss;
-  StrCatRecursive(ss, args...);
-  return ss.str();
-}
-
 using Eigen::Tensor;
 using Eigen::TensorMap;
 
@@ -305,9 +298,9 @@
   }
 
 
-  size_t m_;
-  size_t k_;
-  size_t n_;
+  TensorIndex m_;
+  TensorIndex k_;
+  TensorIndex n_;
   float* a_;
   float* b_;
   float* c_;
diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cu
similarity index 80%
rename from bench/tensors/tensor_benchmarks_gpu.cc
rename to bench/tensors/tensor_benchmarks_gpu.cu
index 9fe8f84..fbb486e 100644
--- a/bench/tensors/tensor_benchmarks_gpu.cc
+++ b/bench/tensors/tensor_benchmarks_gpu.cu
@@ -10,13 +10,11 @@
 #define BM_FuncGPU(FUNC)                                                       \
   static void BM_##FUNC(int iters, int N) {                                    \
     StopBenchmarkTiming();                                                     \
-    cudaStream_t stream;                                                       \
-    cudaStreamCreate(&stream);                                                 \
+    Eigen::CudaStreamDevice stream;                                            \
     Eigen::GpuDevice device(&stream);                                          \
     BenchmarkSuite<Eigen::GpuDevice> suite(device, N);                         \
     cudaDeviceSynchronize();                                                   \
     suite.FUNC(iters);                                                         \
-    cudaStreamDestroy(stream);                                                 \
   }                                                                            \
   BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
 
@@ -35,13 +33,11 @@
 #define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                              \
   static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {               \
     StopBenchmarkTiming();                                                     \
-    cudaStream_t stream;                                                       \
-    cudaStreamCreate(&stream);                                                 \
+    Eigen::CudaStreamDevice stream;                                            \
     Eigen::GpuDevice device(&stream);                                          \
     BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3);                \
     cudaDeviceSynchronize();                                                   \
     suite.FUNC(iters);                                                         \
-    cudaStreamDestroy(stream);                                                 \
   }                                                                            \
   BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
 
@@ -55,13 +51,11 @@
 #define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)                             \
   static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {                  \
     StopBenchmarkTiming();                                                     \
-    cudaStream_t stream;                                                       \
-    cudaStreamCreate(&stream);                                                 \
+    Eigen::CudaStreamDevice stream;                                            \
     Eigen::GpuDevice device(&stream);                                          \
     BenchmarkSuite<Eigen::GpuDevice> suite(device, N);                         \
     cudaDeviceSynchronize();                                                   \
     suite.FUNC(iters, DIM1, DIM2);                                             \
-    cudaStreamDestroy(stream);                                                 \
   }                                                                            \
   BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);