{
  "name": "SmolVLM2 Captioner",
  "description": "Applies the SmolVLM2-2.2B-Instruct multimodal model to video frames selected by input TimeFrame annotations for prompt-driven captioning / scene description. Each invocation runs a single `prompt` against the TimeFrames selected by `tfLabels`; to apply different prompts to different label subsets (e.g. one prompt for slates, another for chyrons), run the app once per (`prompt`, `tfLabels`) combination. Per-TimeFrame captioning is composite: every frame sampled from a TF is fed to the model in a single prompt and yields one caption per TF. This app ships only the 2.2B-Instruct variant -- the largest and most general-purpose model in the SmolVLM2 family. The smaller (256M and 500M) SmolVLM2 releases are post-trained specifically for video-QA tasks and we do not expect them to generalize well, given their size.",
  "app_version": "v1.0",
  "mmif_version": "1.2.0",
  "analyzer_versions": {
    "HuggingFaceTB/SmolVLM2-2.2B-Instruct": "482adb5"
  },
  "app_license": "Apache 2.0",
  "analyzer_license": "Apache 2.0",
  "identifier": "http://apps.clams.ai/smolvlm2-captioner/v1.0",
  "url": "https://github.com/clamsproject/app-smolvlm2-captioner",
  "input": [
    {
      "@type": "http://clams.ai/vocabulary/type/VideoDocument/v2",
      "required": true
    },
    {
      "@type": "http://clams.ai/vocabulary/type/TimeFrame/v6",
      "description": "Labeled TimeFrame annotations selecting which video segments to caption. Frame selection within each segment is controlled by the universal `tfSamplingMode` parameter (see SDK docs). When present, the `representatives` property is consumed for representative-based sampling modes. Filter by label with the `tfLabels` parameter.",
      "properties": {
        "representatives": "?",
        "label": "*"
      },
      "required": true
    }
  ],
  "output": [
    {
      "@type": "http://clams.ai/vocabulary/type/TextDocument/v2",
      "description": "Caption text generated by the SmolVLM2 model for each processed image. The `origins` property points to the `TimePoint` anchoring the image (an existing TimePoint reused when one already backs the image, or a TimePoint newly created in this view when the image was sampled from a TimeFrame interval without a backing TimePoint).",
      "properties": {
        "origins": "*",
        "origination": "derived"
      }
    },
    {
      "@type": "http://clams.ai/vocabulary/type/Alignment/v1",
      "description": "Alignment between each parent TimeFrame and the TextDocument(s) derived from it."
    },
    {
      "@type": "http://clams.ai/vocabulary/type/TimePoint/v5",
      "description": "Optional output. Newly-created TimePoint annotations for images that were sampled from a TimeFrame interval without an existing backing TimePoint (see `tfSamplingMode`). When every sampled image came from an existing TimePoint, no TimePoints are created",
      "properties": {
        "timeUnit": "milliseconds",
        "timePoint": "*"
      }
    }
  ],
  "parameters": [
    {
      "name": "tfLabels",
      "description": "Label(s) of input TimeFrame annotations to caption. By default (`[]`), all TimeFrames are processed regardless of label. To restrict to specific labels, pass this parameter one or more times.",
      "type": "string",
      "default": [],
      "multivalued": true
    },
    {
      "name": "prompt",
      "description": "User prompt(s) sent to the model. A single value runs as a one-shot generation. A multi-value list is interpreted as a multi-turn static prompt; see ``promptMode`` for how turns are assembled.",
      "type": "string",
      "default": "You are looking at one or more frames sampled from a single segment of a news video. Describe what is shown, the purpose of this segment in the broader news video, and transcribe any visible text. Produce one consolidated caption across all provided frames.",
      "multivalued": true
    },
    {
      "name": "systemPrompt",
      "description": "Optional system-role text prepended to the conversation. Empty by default.",
      "type": "string",
      "default": "",
      "multivalued": false
    },
    {
      "name": "promptMode",
      "description": "How to interpret a multi-value ``prompt`` list. Has no effect when ``prompt`` has a single value. For semantics of each choice and worked examples, see https://clams.ai/clams-python/app-baseclasses.html#promptable-multiturn",
      "type": "string",
      "choices": [
        "user-only",
        "turn-taking"
      ],
      "default": "turn-taking",
      "multivalued": false
    },
    {
      "name": "maxNewTokens",
      "description": "Maximum number of new tokens generated per inference call. Forwarded to the backend's ``generate``-equivalent. Larger values grow the KV cache linearly and increase GPU memory usage; reduce if VRAM is constrained.",
      "type": "integer",
      "default": 200,
      "multivalued": false
    },
    {
      "name": "temperature",
      "description": "Sampling temperature. The default ``0.0`` selects deterministic / greedy decoding for maximum reproducibility; override for sampled generation.",
      "type": "number",
      "default": 0,
      "multivalued": false
    },
    {
      "name": "topP",
      "description": "Nucleus-sampling cumulative probability cutoff. Only meaningful when ``temperature`` is greater than 0.",
      "type": "number",
      "default": 1,
      "multivalued": false
    },
    {
      "name": "topK",
      "description": "Top-K sampling cutoff. Only meaningful when ``temperature`` is greater than 0.",
      "type": "integer",
      "default": 50,
      "multivalued": false
    },
    {
      "name": "parallelPrompts",
      "description": "Number of independent prompts the app runs in parallel (stacks into a single forward pass). The *size* of each prompt (how many images, how long the system/user text is, etc.) is NOT regulated by this parameter; that is each app's responsibility. Prompt count and per-prompt content size combine multiplicatively for GPU memory, so the two can blow up together. Catastrophic example: ``tfSamplingMode=all`` on a TimeFrame without ``targets`` expands that TF into one image per native-FPS frame (300 images for a 10-second TF at 30fps); ``parallelPrompts=4`` then runs 4 such prompts in one forward pass (~1200 images), guaranteed OOM. Keep at ``1`` on memory-tight setups; raise only when per-prompt content is small and bounded.",
      "type": "integer",
      "default": 1,
      "multivalued": false
    },
    {
      "name": "model",
      "description": "HuggingFace model identifier to use for this request. Must be one of the model ids declared in this app's ``analyzer_versions``; the SDK pins the corresponding commit hash at load time. When the app ships a single model (the typical case), this parameter defaults to that one model and can be omitted. Pass the full HF model id (e.g. ``org/repo-name``); URL-encoding the ``/`` is optional.",
      "type": "string",
      "choices": [
        "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
      ],
      "default": "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
      "multivalued": false
    },
    {
      "name": "pretty",
      "description": "The JSON body of the HTTP response will be re-formatted with 2-space indentation",
      "type": "boolean",
      "default": false,
      "multivalued": false
    },
    {
      "name": "runningTime",
      "description": "The running time of the app will be recorded in the view metadata",
      "type": "boolean",
      "default": true,
      "multivalued": false
    },
    {
      "name": "hwFetch",
      "description": "The hardware information (architecture, GPU and vRAM) will be recorded in the view metadata",
      "type": "boolean",
      "default": false,
      "multivalued": false
    },
    {
      "name": "tfSamplingMode",
      "description": "Sampling mode for TimeFrame annotations. Has no effect when the app does not process TimeFrames. \"representatives\" uses all representative timepoints if present, otherwise skips the TimeFrame. \"single\" uses the middle representative if present, otherwise extracts an image from the midpoint of the start/end interval (midpoint is calculated by floor division of the sum of start and end). \"all\" uses all target timepoints if present, otherwise extracts all images from the time interval.",
      "type": "string",
      "choices": [
        "representatives",
        "single",
        "all"
      ],
      "default": "representatives",
      "multivalued": false
    }
  ],
  "est_gpu_mem_min": 5000,
  "est_gpu_mem_typ": 7000
}