[Ascend] add qwen3next support by Yanguan619 · Pull Request #55 · flagos-ai/vllm-plugin-FL

PR Category

Vendor

PR Types

New Model

PR Description

image: quay.nju.edu.cn/ascend/vllm-ascend:v0.13.0-a3

We used the base image, which let us avoid compiling the additional operators in csrc as well as the other dependency libraries. We will clean up the dependency libraries and submit the csrc code in follow-up updates.

import os

import torch

# Environment must be configured before vllm is imported so the plugin
# machinery and the NPU allocator pick these values up at load time.
# NOTE(review): these are vendor/plugin-specific switches — semantics
# assumed from their names; confirm against the vllm-plugin-FL docs.
os.environ.update(
    {
        "USE_FLAGGEMS": "0",
        "VLLM_PLUGINS": "fl",
        "VLLM_FL_PLATFORM": "ascend",
        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
        "TRITON_ALL_BLOCKS_PARALLEL": "1",
        "ENABLE_UNPUBLISHED_FEATURE": "1",
        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1",
    }
)


def test_qwen3_next():
    """Smoke-test Qwen3-Next-80B-A3B-Instruct generation on Ascend NPU.

    Loads the model from a local path with TP=4, generates a short
    completion for one prompt, prints the results, and asserts that one
    output was produced per prompt.
    """
    # Imported lazily so the env vars set at module import time take effect.
    from vllm import LLM, SamplingParams

    prompts = ["Hello, my name is"]

    # Greedy decoding, short completion — keeps the smoke test deterministic.
    params = SamplingParams(max_tokens=10, temperature=0.0)

    # Engine configuration mirrors the PR's validated setup; eager mode and
    # prefix caching disabled presumably to sidestep unsupported paths on
    # this platform — TODO confirm.
    llm = LLM(
        model="/mnt/weights/Qwen3-Next-80B-A3B-Instruct/",
        tensor_parallel_size=4,
        max_model_len=262144,
        enforce_eager=True,
        gpu_memory_utilization=0.8,
        block_size=384,
        enable_prefix_caching=False,
    )

    outputs = llm.generate(prompts, params)

    for result in outputs:
        print(
            f"Prompt: {result.prompt!r}, "
            f"Generated text: {result.outputs[0].text!r}"
        )

    # Drop the engine and reclaim NPU memory before pytest tears down.
    del llm
    torch.npu.empty_cache()
    assert len(outputs) == len(prompts)
pytest test.py -sv
...
Prompt: 'Hello, my name is', Generated text: ' [Your Name], and I am a 2'