{
  "name": "Whisper Wrapper",
  "description": "A CLAMS wrapper for Whisper-based ASR software originally developed by OpenAI.",
  "app_version": "v16.0",
  "mmif_version": "1.2.0",
  "analyzer_versions": {
    "tiny.en": "d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03",
    "tiny": "65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9",
    "base.en": "25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead",
    "base": "ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e",
    "small.en": "f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872",
    "small": "9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794",
    "medium.en": "d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f",
    "medium": "345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1",
    "large-v1": "e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a",
    "large-v2": "81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524",
    "large-v3": "e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb",
    "large-v3-turbo": "aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a"
  },
  "app_license": "Apache 2.0",
  "analyzer_license": "MIT",
  "identifier": "http://apps.clams.ai/whisper-wrapper/v16.0",
  "url": "https://github.com/clamsproject/app-whisper-wrapper",
  "input": [
    [
      {
        "@type": "http://clams.ai/vocabulary/type/AudioDocument/v2",
        "required": true
      },
      {
        "@type": "http://clams.ai/vocabulary/type/VideoDocument/v2",
        "required": true
      }
    ]
  ],
  "output": [
    {
      "@type": "http://clams.ai/vocabulary/type/TextDocument/v2"
    },
    {
      "@type": "http://clams.ai/vocabulary/type/TimeFrame/v6",
      "properties": {
        "timeUnit": "milliseconds"
      }
    },
    {
      "@type": "http://clams.ai/vocabulary/type/Alignment/v1"
    },
    {
      "@type": "http://vocab.lappsgrid.org/Token"
    },
    {
      "@type": "http://vocab.lappsgrid.org/Sentence"
    }
  ],
  "parameters": [
    {
      "name": "model",
      "description": "(from openai-whisper CLI) name of the Whisper model to use. Canonical names are the keys of this app's `analyzer_versions`; short aliases (e.g. `tu`/`turbo` for `large-v3-turbo`) are also accepted.",
      "type": "string",
      "choices": [
        "tiny.en",
        "tiny",
        "base.en",
        "base",
        "small.en",
        "small",
        "medium.en",
        "medium",
        "large-v1",
        "large-v2",
        "large-v3",
        "large-v3-turbo",
        "t",
        "b",
        "s",
        "m",
        "l",
        "l2",
        "l3",
        "tu",
        "large",
        "turbo"
      ],
      "default": "turbo",
      "multivalued": false
    },
    {
      "name": "language",
      "description": "(from openai-whisper CLI) language spoken in the audio, specify None to perform language detection. For the list of supported language codes, see https://github.com/openai/whisper/blob/04f449b8a437f1bbd3dba5c9f826aca972e7709a/whisper/tokenizer.py",
      "type": "string",
      "default": "",
      "multivalued": false
    },
    {
      "name": "task",
      "description": "(from openai-whisper CLI) whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')",
      "type": "string",
      "choices": [
        "transcribe",
        "translate"
      ],
      "default": "transcribe",
      "multivalued": false
    },
    {
      "name": "initialPrompt",
      "description": "(from openai-whisper CLI) optional text to provide as a prompt for the first window.",
      "type": "string",
      "default": "",
      "multivalued": false
    },
    {
      "name": "conditionOnPreviousText",
      "description": "(from openai-whisper CLI) if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop",
      "type": "boolean",
      "default": true,
      "multivalued": false
    },
    {
      "name": "noSpeechThreshold",
      "description": "(from openai-whisper CLI) if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence",
      "type": "number",
      "default": 0.6,
      "multivalued": false
    },
    {
      "name": "pretty",
      "description": "The JSON body of the HTTP response will be re-formatted with 2-space indentation",
      "type": "boolean",
      "default": false,
      "multivalued": false
    },
    {
      "name": "runningTime",
      "description": "The running time of the app will be recorded in the view metadata",
      "type": "boolean",
      "default": true,
      "multivalued": false
    },
    {
      "name": "hwFetch",
      "description": "The hardware information (architecture, GPU and vRAM) will be recorded in the view metadata",
      "type": "boolean",
      "default": false,
      "multivalued": false
    },
    {
      "name": "tfSamplingMode",
      "description": "Sampling mode for TimeFrame annotations. Has no effect when the app does not process TimeFrames. \"representatives\" uses all representative timepoints if present, otherwise skips the TimeFrame. \"single\" uses the middle representative if present, otherwise extracts an image from the midpoint of the start/end interval (midpoint is calculated by floor division of the sum of start and end). \"all\" uses all target timepoints if present, otherwise extracts all images from the time interval.",
      "type": "string",
      "choices": [
        "representatives",
        "single",
        "all"
      ],
      "default": "representatives",
      "multivalued": false
    }
  ],
  "est_gpu_mem_min": 1500,
  "est_gpu_mem_typ": 6000
}