GPU: Fix size_t/int mismatch in NPP stream context attribute query

cudaDeviceGetAttribute writes its result through an int*, but
NppStreamContext::nSharedMemPerBlock is size_t, so passing its address
directly is a pointer-type mismatch. Query into a local int and cast the
result to size_t. The previous commit failed CI on all five GPU builds
(cuda-11.5 gcc-10/clang-14, cuda-12.6 gcc-13/clang-19, msvc-14.29).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/unsupported/Eigen/src/GPU/DeviceScalarOps.h b/unsupported/Eigen/src/GPU/DeviceScalarOps.h
index cd1171d..d6bc46a 100644
--- a/unsupported/Eigen/src/GPU/DeviceScalarOps.h
+++ b/unsupported/Eigen/src/GPU/DeviceScalarOps.h
@@ -36,7 +36,9 @@
   cudaDeviceGetAttribute(&ctx.nMultiProcessorCount, cudaDevAttrMultiProcessorCount, ctx.nCudaDeviceId);
   cudaDeviceGetAttribute(&ctx.nMaxThreadsPerMultiProcessor, cudaDevAttrMaxThreadsPerMultiProcessor, ctx.nCudaDeviceId);
   cudaDeviceGetAttribute(&ctx.nMaxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, ctx.nCudaDeviceId);
-  cudaDeviceGetAttribute(&ctx.nSharedMemPerBlock, cudaDevAttrMaxSharedMemoryPerBlock, ctx.nCudaDeviceId);
+  int shared_mem_per_block = 0;
+  cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, ctx.nCudaDeviceId);
+  ctx.nSharedMemPerBlock = static_cast<size_t>(shared_mem_per_block);
   cudaStreamGetFlags(stream, &ctx.nStreamFlags);
   return ctx;
 }