# fix some ctypes in mtmd_cpp.py and adapt Llava15ChatHandler for vision by flamingrickpat · Pull Request #10 · JamePeng/llama-cpp-python

from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Path to the quantized Gemma 3 12B instruction-tuned model weights (GGUF format).
MODEL_PATH = r"./gemma-3-12b-it-q4_0.gguf"
# Path to the multimodal projector (mmproj) weights; passed to the chat handler
# as clip_model_path below to enable image input.
MMPROJ_PATH = r"./mmproj-model-f16-12B.gguf"

class Gemma3Handler(Llava15ChatHandler):
    """Vision chat handler for Gemma 3 models.

    Subclasses ``Llava15ChatHandler``, overriding the chat template with the
    Gemma turn format (``<start_of_turn>…<end_of_turn>``) and clearing llama
    and handler state before each call so repeated completions with the same
    ``Llama`` instance start from a clean context.
    """

    DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant."

    # Jinja2 chat template. Each message becomes one turn: role 'user' opens a
    # user turn, any other role opens a model turn. String content is emitted
    # verbatim; list content is walked, emitting text parts directly and image
    # parts as an <start_of_image> marker followed by the image URL.
    # NOTE: the original template wrapped the content handling in a dead
    # always-true "{% if 1 == 1 %}" conditional; it has been removed — the
    # rendered output is byte-identical.
    CHAT_FORMAT = (
        "{% for message in messages %}"
        "{% if message['role'] == 'user' %}"
        "<start_of_turn>user\n"
        "{% else %}"
        "<start_of_turn>model\n"
        "{% endif %}"
        "{% if message['content'] is string %}"
        "{{ message['content'] }}"
        "{% else %}"
        "{% for content in message['content'] %}"
        "{% if content['type'] == 'text' %}"
        "{{ content['text'] }}"
        "{% elif content['type'] == 'image_url' %}"
        "{% if content.image_url is string %}"
        "{{ content.image_url }}"
        "{% else %}"
        "<start_of_image>{{ content.image_url.url }}"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "{% endif %}"
        "<end_of_turn>\n"
        "{% endfor %}"
        "<start_of_turn>model\n"
    )

    def __call__(self, **kwargs):
        """Reset llama and handler state, then delegate to the parent handler.

        Expects ``kwargs['llama']`` (the ``Llama`` instance) to be present;
        all keyword arguments are forwarded unchanged to the parent
        ``__call__``, whose return value is returned as-is.
        """
        llama = kwargs['llama']

        # Clear the evaluation state so back-to-back completions on the same
        # Llama instance do not reuse stale KV-cache / token state.
        llama.reset()
        llama.n_tokens = 0

        # Zero the token buffer as well, if this llama version exposes it.
        if hasattr(llama, 'input_ids'):
            llama.input_ids.fill(0)

        # Drop the cached image embedding so a new image is always re-encoded.
        if hasattr(self, '_last_image_embed'):
            self._last_image_embed = None
            self._last_image_hash = None

        if self.verbose:
            messages = kwargs.get('messages', [])
            image_count = len(self.get_image_urls(messages))
            print(f"Minimal - Cleared state, processing {image_count} images")

        # Use parent implementation for the actual prompt build and generation.
        return super().__call__(**kwargs)

# Build the model with the vision-capable chat handler wired in.
llm = Llama(
    model_path=MODEL_PATH,
    chat_handler=Gemma3Handler(clip_model_path=MMPROJ_PATH),
    n_gpu_layers=-1,  # -1 offloads all layers to the GPU
    n_ctx=2048,  # context window size in tokens
)

# Run one multimodal chat completion: a text question plus a remote image URL.
# The handler's CHAT_FORMAT turns the image_url part into an <start_of_image>
# marker in the prompt; the image itself is fetched and embedded via the
# mmproj model.
res = llm.create_chat_completion(
    messages = [
        {"role": "system", "content": "You are an assistant who perfectly describes images."},
        {
            "role": "user",
            "content": [
                {"type" : "text", "text": "What's in this image?"},
                {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/0/0d/20250510-Hannah_Hampton_%28cropped_-_portrait_-_2%29.jpg" } }
            ]
        }
    ]
)
print(res)