[BUG] [stdlib] `llvm.masked.scatter` causes `CUDA_ERROR_ILLEGAL_ADDRESS` in shared memory
Bug description
`scatter` works on pointers into global memory, but fails with `CUDA_ERROR_ILLEGAL_ADDRESS` on pointers into shared memory. We need to either provide a fallback for shared memory or constrain `scatter` to `AddressSpace.GENERIC`.
Steps to reproduce
from gpu.host import DeviceContext
from layout import Layout, LayoutTensor
from layout.tensor_builder import LayoutTensorBuild as tb
from testing import assert_equal


fn kernel(tensor: LayoutTensor[mut=False, DType.float32, Layout.row_major(4)]):
    """Scatter four float32 values into a freshly allocated shared tile.

    The tile is built with the `.shared()` tensor builder, and the scatter
    on its pointer is the operation this report is about.

    NOTE(review): `tensor` is accepted but never read or written here.
    """
    var smem = tb[DType.float32]().row_major[4]().shared().alloc()

    # Reversed offsets: value k is scattered to element (3 - k).
    var offsets = SIMD[DType.uint, 4](3, 2, 1, 0)
    var values = SIMD[DType.float32, 4](1, 2, 3, 4)
    smem.ptr.scatter(offsets, values)


fn main() raises:
    """Launch `kernel` once (one block, one thread) and check the buffer."""
    var host_values = List[Float32](18, 7, 29, 27)

    with DeviceContext() as device_ctx:
        # Stage the four host values into a device buffer.
        var device_buffer = device_ctx.enqueue_create_buffer[DType.float32](4)
        device_ctx.enqueue_copy(device_buffer, host_values.unsafe_ptr())

        # Read-only 1-D view over the device buffer, passed to the kernel.
        var input_tensor = LayoutTensor[
            mut=False, DType.float32, Layout.row_major(4)
        ](device_buffer.unsafe_ptr())

        device_ctx.enqueue_function[kernel](
            input_tensor, grid_dim=1, block_dim=1
        )
        device_ctx.synchronize()

        # NOTE(review): the kernel only scatters into its own shared tile
        # and never writes to `tensor`, so these expectations presumably
        # cannot hold against the device buffer as written; confirm intent.
        with device_buffer.map_to_host() as host_view:
            assert_equal(host_view[0], 4)
            assert_equal(host_view[1], 3)
            assert_equal(host_view[2], 2)
            assert_equal(host_view[3], 1)