[BUG] [stdlib] `llvm.masked.scatter` causes `CUDA_ERROR_ILLEGAL_ADDRESS` in shared memory

Bug description

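Calling `scatter` on a pointer (backed by `llvm.masked.scatter`, per the title) works when the pointer refers to global memory, but fails with `CUDA_ERROR_ILLEGAL_ADDRESS` when the pointer refers to shared memory. We need to either provide a fallback implementation for shared memory or constrain `scatter` to `AddressSpace.GENERIC`.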

Steps to reproduce

from gpu.host import DeviceContext
from layout import Layout, LayoutTensor
from layout.tensor_builder import LayoutTensorBuild as tb
from testing import assert_equal


# GPU kernel that reproduces the reported failure: it allocates a 4-element
# float32 buffer through the `.shared()` tensor builder and scatters four
# values into it via the buffer's pointer.
#
# `tensor` is declared immutable (`mut=False`) and is neither read nor written
# anywhere in this body.
fn kernel(tensor: LayoutTensor[mut=False, DType.float32, Layout.row_major(4)]):
    # 4-element, row-major float32 buffer requested with `.shared()`.
    var shared_mem = tb[DType.float32]().row_major[4]().shared().alloc()
    # Scatter the values (1, 2, 3, 4) using the offsets (3, 2, 1, 0).
    # Presumably offset i receives value i, which would leave the buffer as
    # [4, 3, 2, 1] -- TODO confirm against the `scatter` signature.
    # Per the report, this is the call that triggers
    # CUDA_ERROR_ILLEGAL_ADDRESS.
    shared_mem.ptr.scatter(
        SIMD[DType.uint, 4](3, 2, 1, 0), SIMD[DType.float32, 4](1, 2, 3, 4)
    )


# Host-side driver for the reproduction: copies four floats into a device
# buffer, launches `kernel` once, waits for completion, then checks the
# buffer contents from the host.
fn main() raises:
    var data = List[Float32](18, 7, 29, 27)

    with DeviceContext() as ctx:
        # Device buffer of 4 float32 elements, filled from `data`.
        var dev_buf = ctx.enqueue_create_buffer[DType.float32](4)
        ctx.enqueue_copy(dev_buf, data.unsafe_ptr())
        # Immutable row-major view over the device buffer; this is the only
        # argument handed to the kernel.
        var tensor = LayoutTensor[
            mut=False, DType.float32, Layout.row_major(4)
        ](dev_buf.unsafe_ptr())
        # Launch with grid_dim=1 and block_dim=1.
        ctx.enqueue_function[kernel](tensor, grid_dim=1, block_dim=1)
        # Block until the enqueued work finishes; presumably this is where the
        # illegal-address error is reported -- TODO confirm.
        ctx.synchronize()
        # NOTE(review): `kernel` never writes to `tensor` (it is `mut=False`
        # and unused there), so `dev_buf` should still hold 18, 7, 29, 27 and
        # these assertions would not pass even if the scatter succeeded.
        # Confirm whether a copy from the shared buffer back to the output was
        # trimmed from the reproduction.
        with dev_buf.map_to_host() as out_host:
            assert_equal(out_host[0], 4)
            assert_equal(out_host[1], 3)
            assert_equal(out_host[2], 2)
            assert_equal(out_host[3], 1)