[Ascend] add qwen3next support by Yanguan619 · Pull Request #55 · flagos-ai/vllm-plugin-FL
PR Category
Vendor
PR Types
New Model
PR Description
image: quay.nju.edu.cn/ascend/vllm-ascend:v0.13.0-a3
We use this base image so that we do not need to compile the additional operators from csrc or the other dependency libraries. We will clean up the dependency libraries and submit the csrc code in subsequent updates.
import os

import torch

# Environment configuration for running vLLM through the FlagOS ("fl")
# plugin on the Ascend NPU platform. These must be set before vLLM is
# imported, which is why the vllm import lives inside the test function.
os.environ["USE_FLAGGEMS"] = "0"
os.environ["VLLM_PLUGINS"] = "fl"
os.environ["VLLM_FL_PLATFORM"] = "ascend"
# Allow the NPU allocator to grow segments instead of pre-reserving them.
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TRITON_ALL_BLOCKS_PARALLEL"] = "1"
os.environ["ENABLE_UNPUBLISHED_FEATURE"] = "1"
# Qwen3-Next is served with a 262144-token context; opt out of vLLM's
# max_model_len sanity guard so that length is accepted.
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"


def test_qwen3_next():
    """Smoke-test greedy generation of Qwen3-Next-80B on 4 Ascend NPUs.

    Loads the model with tensor parallelism, generates a short greedy
    completion for a single prompt, prints it, and checks that one output
    is produced per prompt.
    """
    from vllm import LLM, SamplingParams

    prompts = [
        "Hello, my name is",
    ]
    # Greedy decoding (temperature=0.0) keeps the run deterministic.
    greedy = SamplingParams(max_tokens=10, temperature=0.0)

    engine = LLM(
        model="/mnt/weights/Qwen3-Next-80B-A3B-Instruct/",
        tensor_parallel_size=4,
        max_model_len=262144,
        enforce_eager=True,
        gpu_memory_utilization=0.8,
        block_size=384,
        enable_prefix_caching=False,
    )

    results = engine.generate(prompts, greedy)
    for result in results:
        prompt = result.prompt
        generated_text = result.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    # Release the engine and its NPU memory before the assertion so a
    # failure does not leave the device allocated.
    del engine
    torch.npu.empty_cache()

    assert len(results) == len(prompts)
pytest test.py -sv ... Prompt: 'Hello, my name is', Generated text: ' [Your Name], and I am a 2'