Source code for common.vlm

import logging
from abc import ABC
from typing import List, Union

import httpx
import openai
import tenacity

from cicada.common import llm
from cicada.common.utils import colorstring, cprint

logger = logging.getLogger(__name__)


class VisionLanguageModel(llm.LanguageModel, ABC):
    def __init__(
        self,
        api_key: str,
        api_base_url: str,
        model_name: str,
        org_id: str,
        **model_kwargs,
    ):
        """
        Initialize the VisionLanguageModel.

        :param api_key: The API key for the OpenAI service.
        :param api_base_url: The base URL for the OpenAI API.
        :param model_name: The name of the model to use.
        :param org_id: The organization ID for the OpenAI service.
        :param model_kwargs: Additional keyword arguments for the model.
        """
        super().__init__(
            api_key,
            api_base_url,
            model_name,
            org_id,
            **model_kwargs,
        )

    def _prepare_prompt(
        self,
        images_with_text: List[Union[str, bytes]] | None = None,
        prompt: str | None = None,
        images: bytes | List[bytes] | None = None,
        max_items_per_message: int = 4,  # Adjust as needed
    ) -> List[dict]:
        """
        Prepare the prompt for the API by splitting the content into multiple
        messages if needed.

        :param images_with_text: A list of mixed text (str) and image (bytes) data.
        :param prompt: Optional user prompt text.
        :param images: Optional image data (single or list of bytes).
        :param max_items_per_message: Maximum items per message.
        :return: A list of messages with prepared content.
        """
        content = []
        messages = []

        if prompt:
            messages.append({"role": "user", "content": prompt})

        # either images or images_with_text
        # Handle images
        if images:
            if not isinstance(images, list):
                images = [images]
            content = []
            for image_data in images:
                content.append(
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
                    }
                )
                if len(content) >= max_items_per_message:
                    messages.append({"role": "user", "content": content})
                    content = []
            if content:  # Add remaining items
                messages.append({"role": "user", "content": content})

        # Handle images_with_text
        elif images_with_text:
            content = []
            for item in images_with_text:
                if isinstance(item, str):
                    content.append({"type": "text", "text": item})
                elif isinstance(item, bytes):
                    content.append(
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{item}"},
                        }
                    )
                else:
                    raise ValueError(
                        f"Unsupported type in images_with_text: {type(item)}"
                    )
                if len(content) >= max_items_per_message:
                    messages.append({"role": "user", "content": content})
                    content = []
            if content:  # Add remaining items
                messages.append({"role": "user", "content": content})

        return messages

    @tenacity.retry(
        stop=tenacity.stop_after_attempt(3)
        | tenacity.stop_after_delay(30),  # Stop after 3 attempts or 30 seconds
        wait=tenacity.wait_random_exponential(multiplier=1, min=2, max=10),
        retry=tenacity.retry_if_exception_type(
            (openai.APIError, httpx.ReadTimeout, httpx.ConnectTimeout)
        ),  # Retry on API errors or network timeouts
        before_sleep=tenacity.before_sleep_log(
            logger, logging.WARNING
        ),  # Log before retrying
        reraise=True,
    )
    def query_with_image(
        self,
        prompt: str | None = None,
        images: bytes | List[bytes] | None = None,
        images_with_text: List[Union[str, bytes]] | None = None,
        system_prompt: str | None = None,
    ) -> str:
        """
        Query the VisionLanguageModel with mixed text and image data.

        :param prompt: Optional user prompt text.
        :param images: Optional image data (single or list of bytes).
        :param images_with_text: Optional list of mixed text (str) and image (bytes) data.
        :param system_prompt: Optional system prompt text.
        :return: Generated response from the model.
        """
        full_prompt = self._prepare_prompt(
            images_with_text=images_with_text, prompt=prompt, images=images
        )
        logger.info(colorstring(len(full_prompt), "white"))
        if system_prompt:
            full_prompt = [
                {"role": "system", "content": system_prompt},
            ] + full_prompt

        # Use stream from configuration
        stream = self.stream
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=full_prompt,
            stream=stream,
            **self.model_kwargs,
        )

        if stream:
            complete_response = ""
            for chunk in response:
                chunk_content = chunk.choices[0].delta.content
                if chunk_content:
                    cprint(chunk_content, "white", end="", flush=True)
                    complete_response += chunk_content
            print()  # Add a newline after the response
            return complete_response.strip()
        else:
            return response.choices[0].message.content.strip()
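
# Sketch of how _prepare_prompt splits mixed content across user messages.
# The inputs below are illustrative placeholders: the bytes literal stands in
# for a real base64-encoded JPEG, and `vlm` is assumed to be constructed as in
# the example usage further down.
#
#   msgs = vlm._prepare_prompt(
#       images_with_text=["caption", b"<base64 image>", "follow-up question"],
#       max_items_per_message=2,
#   )
#   # msgs holds two user messages: the first carries the caption and the image
#   # (the chunk fills up at 2 items), the second carries the follow-up question.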

# Example usage
if __name__ == "__main__":
    import argparse

    from cicada.common.utils import image_to_base64, load_config, setup_logging

    parser = argparse.ArgumentParser(description="Vision Language Model")
    parser.add_argument(
        "--config", default="config.yaml", help="Path to the configuration YAML file"
    )
    parser.add_argument("--image", required=True, help="Path to the testing image")
    args = parser.parse_args()

    setup_logging()
    vlm_config = load_config(args.config, "vlm")

    image_path = args.image
    image_data = image_to_base64(image_path)

    vlm = VisionLanguageModel(
        vlm_config["api_key"],
        vlm_config.get("api_base_url"),
        vlm_config.get("model_name", "gpt-4o"),
        vlm_config.get("org_id"),
        **vlm_config.get("model_kwargs", {}),
    )

    response = vlm.query_with_image(
        "Describe this image.",
        image_data,
        system_prompt="You are a great visual describer.",
    )
    if not vlm.stream:
        logger.info(colorstring(response, "white"))

    response = vlm.query("Who made you?")
    if not vlm.stream:
        logger.info(colorstring(response, "white"))
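
# Illustrative "vlm" section of config.yaml that the example above reads via
# load_config(args.config, "vlm"). The keys mirror the constructor arguments;
# the values (and the temperature entry under model_kwargs) are assumptions,
# not a documented schema:
#
#   vlm:
#     api_key: <your API key>
#     api_base_url: https://api.openai.com/v1
#     model_name: gpt-4o
#     org_id: <your org id>
#     model_kwargs:
#       temperature: 0.2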