feat: Add social media video import (YouTube, TikTok, Instagram) (#6764)

Co-authored-by: Maxime Louward <61564950+mlouward@users.noreply.github.com>
Co-authored-by: Michael Genson <genson.michael@gmail.com>
Co-authored-by: Michael Genson <71845777+michael-genson@users.noreply.github.com>
2026-06-07 17:40:13 -04:00 · 2026-03-09 21:44:27 +01:00
parent 5a223aa92d
commit 1344f1674d
25 changed files with 563 additions and 45 deletions
--- a/mealie/core/exceptions.py
+++ b/mealie/core/exceptions.py
@@ -46,6 +46,30 @@ class NoEntryFound(Exception):
    pass


+class OpenAIServiceError(Exception):
+    """
+    Raised when a request to an OpenAI service fails or comes back without usable content.
+    """
+
+
+class RateLimitError(Exception):
+    """
+    Raised when an external API responds with a rate limit (429) error.
+    """
+
+
+class VideoDownloadError(Exception):
+    """
+    Raised when a video cannot be downloaded or processed.
+    """
+
+
 def mealie_registered_exceptions(t: Translator) -> dict:
    """
    This function returns a dictionary of all the globally registered exceptions in the Mealie application.
--- a/mealie/core/settings/settings.py
+++ b/mealie/core/settings/settings.py
@@ -393,12 +393,16 @@ class AppSettings(AppLoggingSettings):
    """Your OpenAI API key. Required to enable OpenAI features"""
    OPENAI_MODEL: str = "gpt-4o"
    """Which OpenAI model to send requests to. Leave this unset for most usecases"""
+    OPENAI_AUDIO_MODEL: str = "whisper-1"
+    """Which OpenAI model to use for audio transcription. Leave this unset for most usecases"""
    OPENAI_CUSTOM_HEADERS: dict[str, str] = {}
    """Custom HTTP headers to send with each OpenAI request"""
    OPENAI_CUSTOM_PARAMS: dict[str, Any] = {}
    """Custom HTTP parameters to send with each OpenAI request"""
    OPENAI_ENABLE_IMAGE_SERVICES: bool = True
    """Whether to enable image-related features in OpenAI"""
+    OPENAI_ENABLE_TRANSCRIPTION_SERVICES: bool = True
+    """Whether to enable audio transcription features in OpenAI"""
    OPENAI_WORKERS: int = 2
    """
    Number of OpenAI workers per request. Higher values may increase
--- a/mealie/routes/admin/admin_about.py
+++ b/mealie/routes/admin/admin_about.py
@@ -38,6 +38,8 @@ class AdminAboutController(BaseAdminController):
            oidc_provider_name=settings.OIDC_PROVIDER_NAME,
            enable_openai=settings.OPENAI_ENABLED,
            enable_openai_image_services=settings.OPENAI_ENABLED and settings.OPENAI_ENABLE_IMAGE_SERVICES,
+            enable_openai_transcription_services=settings.OPENAI_ENABLED
+            and settings.OPENAI_ENABLE_TRANSCRIPTION_SERVICES,
        )

    @router.get("/statistics", response_model=AppStatistics)
--- a/mealie/routes/admin/admin_debug.py
+++ b/mealie/routes/admin/admin_debug.py
@@ -34,14 +34,14 @@ class AdminDebugController(BaseAdminController):

            try:
                openai_service = OpenAIService()
-                prompt = openai_service.get_prompt("debug")
+                prompt = openai_service.get_prompt("general.debug")

                message = "Hello, checking to see if I can reach you."
                if local_images:
                    message = f"{message} Here is an image to test with:"

                response = await openai_service.get_response(
-                    prompt, message, response_schema=OpenAIText, images=local_images
+                    prompt, message, response_schema=OpenAIText, attachments=local_images
                )

                if not response:
--- a/mealie/routes/app/app_about.py
+++ b/mealie/routes/app/app_about.py
@@ -43,6 +43,7 @@ def get_app_info(session: Session = Depends(generate_session)):
        oidc_provider_name=settings.OIDC_PROVIDER_NAME,
        enable_openai=settings.OPENAI_ENABLED,
        enable_openai_image_services=settings.OPENAI_ENABLED and settings.OPENAI_ENABLE_IMAGE_SERVICES,
+        enable_openai_transcription_services=settings.OPENAI_ENABLED and settings.OPENAI_ENABLE_TRANSCRIPTION_SERVICES,
        allow_password_login=settings.ALLOW_PASSWORD_LOGIN,
        token_time=settings.TOKEN_TIME,
    )
--- a/mealie/schema/admin/about.py
+++ b/mealie/schema/admin/about.py
@@ -23,6 +23,7 @@ class AppInfo(MealieModel):
    oidc_provider_name: str
    enable_openai: bool
    enable_openai_image_services: bool
+    enable_openai_transcription_services: bool
    token_time: int


--- a/mealie/services/openai/openai.py
+++ b/mealie/services/openai/openai.py
@@ -7,14 +7,16 @@ from pathlib import Path
 from textwrap import dedent
 from typing import TypeVar

+import openai
 from openai import AsyncOpenAI
 from openai.types.chat import ChatCompletion
 from pydantic import BaseModel, field_validator

-from mealie.core import root_logger
+from mealie.core import exceptions, root_logger
 from mealie.core.config import get_app_settings
 from mealie.pkgs import img
 from mealie.schema.openai._base import OpenAIBase
+from mealie.schema.openai.general import OpenAIText

 from .._base_service import BaseService

@@ -48,7 +50,12 @@ class OpenAIDataInjection(BaseModel):
            return value


-class OpenAIImageBase(BaseModel, ABC):
+class OpenAIAttachment(BaseModel, ABC):
+    """Base class for non-text content that is appended to the user message sent to OpenAI."""
+
+    @abstractmethod
+    def build_message(self) -> dict:
+        """Return this attachment as a content part for the user message."""
+
+
+class OpenAIImageBase(OpenAIAttachment):
    @abstractmethod
    def get_image_url(self) -> str: ...

@@ -79,6 +86,17 @@ class OpenAILocalImage(OpenAIImageBase):
        return f"data:image/jpeg;base64,{b64content}"


+class OpenAILocalAudio(OpenAIAttachment):
+    """Audio attachment sent inline as an ``input_audio`` content part."""
+
+    # Audio payload as text; `transcribe_audio` passes the base64-encoded file bytes
+    data: str
+    # Audio format name; `transcribe_audio` passes the lowercased file extension
+    format: str
+
+    def build_message(self) -> dict:
+        """Return the ``input_audio`` content part carrying the audio data and its format."""
+        audio_payload = {"data": self.data, "format": self.format}
+        return {"type": "input_audio", "input_audio": audio_payload}
+
+
 class OpenAIService(BaseService):
    PROMPTS_DIR = Path(os.path.dirname(os.path.abspath(__file__))) / "prompts"

@@ -88,9 +106,9 @@ class OpenAIService(BaseService):
            raise ValueError("OpenAI is not enabled")

        self.model = settings.OPENAI_MODEL
+        self.audio_model = settings.OPENAI_AUDIO_MODEL
        self.workers = settings.OPENAI_WORKERS
        self.send_db_data = settings.OPENAI_SEND_DATABASE_DATA
-        self.enable_image_services = settings.OPENAI_ENABLE_IMAGE_SERVICES
        self.custom_prompt_dir = settings.OPENAI_CUSTOM_PROMPT_DIR

        self.get_client = lambda: AsyncOpenAI(
@@ -215,17 +233,14 @@ class OpenAIService(BaseService):
        message: str,
        *,
        response_schema: type[T],
-        images: list[OpenAIImageBase] | None = None,
+        attachments: list[OpenAIAttachment] | None = None,
    ) -> T | None:
        """Send data to OpenAI and return the response message content"""
-        if images and not self.enable_image_services:
-            self.logger.warning("OpenAI image services are disabled, ignoring images")
-            images = None

        try:
            user_messages = [{"type": "text", "text": message}]
-            for image in images or []:
-                user_messages.append(image.build_message())
+            for attachment in attachments or []:
+                user_messages.append(attachment.build_message())

            response = await self._get_raw_response(prompt, user_messages, response_schema)
            if not response.choices:
@@ -233,5 +248,41 @@ class OpenAIService(BaseService):

            response_text = response.choices[0].message.content
            return response_schema.parse_openai_response(response_text)
+        except openai.RateLimitError as e:
+            raise exceptions.RateLimitError(str(e)) from e
        except Exception as e:
            raise Exception(f"OpenAI Request Failed. {e.__class__.__name__}: {e}") from e
+
+    async def transcribe_audio(self, audio_file_path: Path) -> str | None:
+        """
+        Transcribe the audio file at `audio_file_path` and return the transcript text.
+
+        The transcription endpoint is tried first with the configured audio model. If that fails
+        for any reason other than a rate limit, the failure is logged and the audio is sent,
+        base64-encoded, as an attachment to a regular chat completion instead.
+
+        Returns `None` when the chat completion fallback produces no response.
+        Raises `exceptions.RateLimitError` when OpenAI reports a rate limit; other errors raised
+        by the fallback propagate to the caller.
+        """
+        openai_client = self.get_client()
+
+        # First choice: the dedicated transcription endpoint
+        try:
+            with open(audio_file_path, "rb") as audio_stream:
+                transcription = await openai_client.audio.transcriptions.create(
+                    file=audio_stream,
+                    model=self.audio_model,
+                )
+            return transcription.text
+        except openai.RateLimitError as rate_limit_error:
+            raise exceptions.RateLimitError(str(rate_limit_error)) from rate_limit_error
+        except Exception as e:
+            self.logger.warning(
+                f"Failed to create audio transcription, falling back to chat completion ({e.__class__.__name__}: {e})"
+            )
+
+        # Second choice: hand the raw audio to the chat model and ask it to transcribe
+        audio_path = Path(audio_file_path)
+        encoded_audio = base64.b64encode(audio_path.read_bytes()).decode("utf-8")
+        audio_format = audio_path.suffix.lstrip(".").lower()
+        attachment = OpenAILocalAudio(data=encoded_audio, format=audio_format)
+
+        chat_response = await self.get_response(
+            self.get_prompt("general.transcribe-audio"),
+            "Attached is the audio data.",
+            response_schema=OpenAIText,
+            attachments=[attachment],
+        )
+        if not chat_response:
+            return None
+        return chat_response.text
--- a/mealie/services/openai/prompts/general/debug.txt
+++ b/mealie/services/openai/prompts/general/debug.txt
--- a/mealie/services/openai/prompts/general/transcribe-audio.txt
+++ b/mealie/services/openai/prompts/general/transcribe-audio.txt
@@ -0,0 +1 @@
+Transcribe any audio data provided to you. You should respond only with the audio transcription and nothing else.
--- a/mealie/services/openai/prompts/recipes/parse-recipe-video.txt
+++ b/mealie/services/openai/prompts/recipes/parse-recipe-video.txt
@@ -0,0 +1,7 @@
+You will receive a video transcript and the video's original caption text. Analyze BOTH inputs to generate a single, accurate recipe in schema.org Recipe format. Reference: https://schema.org/Recipe
+
+Do not create or make up any information. If insufficient data is found, return an empty object.
+
+- The video transcript is the primary source for instructions.
+- The caption text is the primary source for the ingredient list and title.
+- If there is a conflict (e.g., caption says "1 cup" but transcript says "1.5 cups"), trust the video transcript.
--- a/mealie/services/recipe/recipe_service.py
+++ b/mealie/services/recipe/recipe_service.py
@@ -611,7 +611,7 @@ class OpenAIRecipeService(RecipeServiceBase):
                prompt,
                message,
                response_schema=OpenAIRecipe,
-                images=openai_images,
+                attachments=openai_images,
            )
            if not response:
                raise ValueError("Received empty response from OpenAI")
--- a/mealie/services/scraper/recipe_scraper.py
+++ b/mealie/services/scraper/recipe_scraper.py
@@ -7,6 +7,7 @@ from mealie.services.scraper.scraped_extras import ScrapedExtras
 from .scraper_strategies import (
    ABCScraperStrategy,
    RecipeScraperOpenAI,
+    RecipeScraperOpenAITranscription,
    RecipeScraperOpenGraph,
    RecipeScraperPackage,
    safe_scrape_html,
@@ -14,6 +15,7 @@ from .scraper_strategies import (

 DEFAULT_SCRAPER_STRATEGIES: list[type[ABCScraperStrategy]] = [
    RecipeScraperPackage,
+    RecipeScraperOpenAITranscription,
    RecipeScraperOpenAI,
    RecipeScraperOpenGraph,
 ]
@@ -42,8 +44,11 @@ class RecipeScraper:
        """

        raw_html = html or await safe_scrape_html(url)
-        for scraper_type in self.scrapers:
-            scraper = scraper_type(url, self.translator, raw_html=raw_html)
+        for ScraperClass in self.scrapers:
+            scraper = ScraperClass(url, self.translator, raw_html=raw_html)
+            if not scraper.can_scrape():
+                self.logger.debug(f"Skipping {scraper.__class__.__name__}")
+                continue

            try:
                result = await scraper.parse()
--- a/mealie/services/scraper/scraper_strategies.py
+++ b/mealie/services/scraper/scraper_strategies.py
@@ -1,22 +1,33 @@
+import asyncio
+import functools
+import re
 import time
 from abc import ABC, abstractmethod
 from collections.abc import Callable
-from typing import Any
+from pathlib import Path
+from typing import Any, TypedDict

 import bs4
 import extruct
+import yt_dlp
 from fastapi import HTTPException, status
 from httpx import AsyncClient, Response
 from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, scrape_html
 from slugify import slugify
 from w3lib.html import get_base_url
+from yt_dlp.extractor.generic import GenericIE

+from mealie.core import exceptions
 from mealie.core.config import get_app_settings
+from mealie.core.dependencies.dependencies import get_temporary_path
 from mealie.core.root_logger import get_logger
 from mealie.lang.providers import Translator
 from mealie.pkgs import safehttp
 from mealie.schema.openai.general import OpenAIText
+from mealie.schema.openai.recipe import OpenAIRecipe
 from mealie.schema.recipe.recipe import Recipe, RecipeStep
+from mealie.schema.recipe.recipe_ingredient import RecipeIngredient
+from mealie.schema.recipe.recipe_notes import RecipeNote
 from mealie.services.openai import OpenAIService
 from mealie.services.scraper.scraped_extras import ScrapedExtras

@@ -27,6 +38,12 @@ SCRAPER_TIMEOUT = 15
 logger = get_logger()


+@functools.cache
+def _get_yt_dlp_extractors() -> list:
+    """
+    Return the working, non-generic yt-dlp extractors.
+
+    The result is cached, so the extractor list is only built once per process.
+    """
+    extractors = []
+    for extractor in yt_dlp.extractor.gen_extractors():
+        # GenericIE is left out; presumably because it accepts almost any URL, which would make
+        # a `suitable()` check against this list meaningless
+        if not extractor.working() or isinstance(extractor, GenericIE):
+            continue
+        extractors.append(extractor)
+    return extractors
+
+
 class ForceTimeoutException(Exception):
    pass

@@ -115,6 +132,9 @@ class ABCScraperStrategy(ABC):
        self.raw_html = raw_html
        self.translator = translator

+    @abstractmethod
+    def can_scrape(self) -> bool:
+        """
+        Return whether this strategy should be attempted.
+
+        `RecipeScraper` skips any strategy whose `can_scrape()` returns False.
+        """
+
    @abstractmethod
    async def get_html(self, url: str) -> str: ...

@@ -132,6 +152,9 @@ class ABCScraperStrategy(ABC):


 class RecipeScraperPackage(ABCScraperStrategy):
+    def can_scrape(self) -> bool:
+        """Return True when there is a URL or raw HTML to work with."""
+        has_url = bool(self.url)
+        return has_url or bool(self.raw_html)
+
    @staticmethod
    def ld_json_to_html(ld_json: str) -> str:
        return (
@@ -271,6 +294,10 @@ class RecipeScraperOpenAI(RecipeScraperPackage):
    rather than trying to scrape it directly.
    """

+    def can_scrape(self) -> bool:
+        """Return a truthy value only when OpenAI is enabled and the parent strategy can scrape."""
+        openai_enabled = get_app_settings().OPENAI_ENABLED
+        return openai_enabled and super().can_scrape()
+
    def extract_json_ld_data_from_html(self, soup: bs4.BeautifulSoup) -> str:
        data_parts: list[str] = []
        for script in soup.find_all("script", type="application/ld+json"):
@@ -350,7 +377,178 @@ class RecipeScraperOpenAI(RecipeScraperPackage):
            return ""


+class TranscribedAudio(TypedDict):
+    """Downloaded video data and the transcript derived from it."""
+
+    # Video metadata taken from the yt-dlp info dict
+    title: str
+    description: str
+    thumbnail_url: str | None
+
+    # Files written to the temporary download directory
+    audio: Path
+    subtitle: Path | None
+
+    # Transcript text; `_download_audio` leaves it empty and `parse` fills it in
+    transcription: str
+
+
+class RecipeScraperOpenAITranscription(ABCScraperStrategy):
+    """
+    Scraper strategy that builds a recipe from a video's spoken content.
+
+    yt-dlp downloads the audio track and any subtitles for the URL. Subtitles are used as the
+    transcript when one is found; otherwise the audio is transcribed through OpenAI. The transcript
+    is then sent to OpenAI, together with the video's title and description, to extract the recipe.
+    """
+
+    # Subtitle languages requested from yt-dlp; the first one found on disk is used
+    SUBTITLE_LANGS = ["en", "fr", "es", "de", "it"]
+
+    def can_scrape(self) -> bool:
+        """Return True when transcription is enabled and a non-generic yt-dlp extractor accepts the URL."""
+        if not self.url:
+            return False
+
+        settings = get_app_settings()
+        if not (settings.OPENAI_ENABLED and settings.OPENAI_ENABLE_TRANSCRIPTION_SERVICES):
+            return False
+
+        # Check if we can actually download something to transcribe
+        return any(ie.suitable(self.url) for ie in _get_yt_dlp_extractors())
+
+    @staticmethod
+    def _parse_subtitle_content(subtitle_content: str) -> str:
+        """
+        Reduce WebVTT subtitle content to its spoken text, joined into a single line.
+
+        The header block, cue numbers, cue timings and inline markup tags are dropped.
+        Consecutive identical cue lines are collapsed so that captions which repeat the
+        previous line in the next cue do not duplicate text in the transcript.
+        """
+        lines: list[str] = []
+        in_header = False
+
+        # splitlines() also handles CRLF files, which would otherwise leave "\r" on every line
+        for raw_line in subtitle_content.splitlines():
+            # A byte order mark is not whitespace, so it has to be removed explicitly
+            line = raw_line.lstrip("\ufeff").strip()
+            if not line:
+                in_header = False  # a blank line ends the header block
+                continue
+            if in_header:
+                continue
+            if line.startswith("WEBVTT"):
+                # The file header may carry metadata lines (e.g. "Kind: captions") up to the next
+                # blank line; only treat it as a header before any caption text has been seen
+                in_header = not lines
+                continue
+            if "-->" in line or line.isdigit():
+                continue
+
+            text = re.sub(r"<[^>]+>", "", line).strip()
+            if text and (not lines or lines[-1] != text):
+                lines.append(text)
+
+        return " ".join(lines)
+
+    def _download_audio(self, temp_path: Path) -> TranscribedAudio:
+        """
+        Download the audio track and subtitles for `self.url` into `temp_path`.
+
+        The audio is converted to a mono mp3. The returned `transcription` is always empty;
+        it is filled in by `parse`.
+
+        Raises `exceptions.VideoDownloadError` when yt-dlp fails, returns no video information,
+        or produces neither an audio file nor a subtitle file.
+        """
+        output_template = temp_path / "mealie"  # No extension here
+
+        ydl_opts = {
+            "format": "bestaudio/best",
+            "outtmpl": str(output_template) + ".%(ext)s",
+            "quiet": True,
+            "writesubtitles": True,
+            "writeautomaticsub": True,
+            "subtitleslangs": self.SUBTITLE_LANGS,
+            "skip_download": False,
+            "ignoreerrors": True,
+            "postprocessors": [
+                {
+                    "key": "FFmpegExtractAudio",
+                    "preferredcodec": "mp3",
+                    "preferredquality": "32",
+                }
+            ],
+            "postprocessor_args": ["-ac", "1"],
+        }
+
+        try:
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info = ydl.extract_info(self.url, download=True)
+
+                if info is None:
+                    raise exceptions.VideoDownloadError(
+                        "Failed to extract video information. The video may be unavailable or the URL is invalid."
+                    )
+
+                audio_path = output_template.with_suffix(".mp3")
+                subtitle_candidates = (output_template.with_suffix(f".{lang}.vtt") for lang in self.SUBTITLE_LANGS)
+                sub_path = next((path for path in subtitle_candidates if path.exists()), None)
+
+                # "ignoreerrors" lets yt-dlp carry on after a failed download, so make sure there
+                # is something to build a transcript from before handing the result back
+                if sub_path is None and not audio_path.exists():
+                    raise exceptions.VideoDownloadError(
+                        "Failed to download video: no audio or subtitles were produced."
+                    )
+
+                return {
+                    "audio": audio_path,
+                    "subtitle": sub_path,
+                    # These keys can be present with a null value, so `.get(key, "")` is not enough
+                    "title": info.get("title") or "",
+                    "description": info.get("description") or "",
+                    "thumbnail_url": info.get("thumbnail") or None,
+                    "transcription": "",
+                }
+        except exceptions.VideoDownloadError:
+            raise
+        except Exception as e:
+            raise exceptions.VideoDownloadError(f"Failed to download video: {e}") from e
+
+    async def get_html(self, url: str) -> str:
+        """Return the raw HTML passed to the scraper, if any; nothing is fetched."""
+        return self.raw_html or ""  # we don't use HTML with this scraper since we use ytdlp
+
+    async def parse(self) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
+        """
+        Download the video, obtain a transcript and ask OpenAI to extract a recipe from it.
+
+        Raises `exceptions.VideoDownloadError` when the download fails,
+        `exceptions.RateLimitError` when OpenAI reports a rate limit, and
+        `exceptions.OpenAIServiceError` for any other transcription or extraction failure.
+        """
+        openai_service = OpenAIService()
+
+        # The downloaded files are read inside this block; presumably the temporary
+        # directory is removed when the context exits
+        with get_temporary_path() as temp_path:
+            # yt-dlp is blocking, so run it in a worker thread
+            video_data = await asyncio.to_thread(self._download_audio, temp_path)
+
+            if video_data["subtitle"]:
+                try:
+                    with open(video_data["subtitle"], encoding="utf-8") as f:
+                        subtitle_content = f.read()
+                    video_data["transcription"] = self._parse_subtitle_content(subtitle_content)
+                    self.logger.info("Using subtitles from video instead of transcription")
+                except Exception:
+                    self.logger.exception("Failed to read subtitles, falling back to transcription")
+                    video_data["transcription"] = ""
+
+            if not video_data["transcription"]:
+                try:
+                    transcription = await openai_service.transcribe_audio(video_data["audio"])
+                except exceptions.RateLimitError:
+                    raise
+                except Exception as e:
+                    raise exceptions.OpenAIServiceError(f"Failed to transcribe audio: {e}") from e
+                if not transcription:
+                    raise exceptions.OpenAIServiceError("No transcription returned from OpenAI")
+                video_data["transcription"] = transcription
+
+        # From here on the transcription is never empty: either the subtitles produced
+        # text or the transcription step above succeeded (it raises otherwise)
+        self.logger.debug(f"Transcription: {video_data['transcription'][:200]}...")
+        prompt = openai_service.get_prompt("recipes.parse-recipe-video")
+
+        message_parts = [
+            f"Title: {video_data['title']}",
+            f"Description: {video_data['description']}",
+            f"Transcription: {video_data['transcription']}",
+        ]
+
+        try:
+            response = await openai_service.get_response(prompt, "\n".join(message_parts), response_schema=OpenAIRecipe)
+        except exceptions.RateLimitError:
+            raise
+        except Exception as e:
+            raise exceptions.OpenAIServiceError(f"Failed to extract recipe from video: {e}") from e
+
+        if not response:
+            raise exceptions.OpenAIServiceError("OpenAI returned an empty response when extracting recipe")
+
+        recipe = Recipe(
+            name=response.name,
+            slug="",
+            description=response.description,
+            recipe_yield=response.recipe_yield,
+            total_time=response.total_time,
+            prep_time=response.prep_time,
+            perform_time=response.perform_time,
+            recipe_ingredient=[
+                RecipeIngredient(title=ingredient.title, note=ingredient.text)
+                for ingredient in response.ingredients
+                if ingredient.text
+            ],
+            recipe_instructions=[
+                RecipeStep(title=instruction.title, text=instruction.text)
+                for instruction in response.instructions
+                if instruction.text
+            ],
+            notes=[RecipeNote(title=note.title or "", text=note.text) for note in response.notes if note.text],
+            image=video_data["thumbnail_url"] or None,
+            org_url=self.url,
+        )
+
+        self.logger.info(f"Successfully extracted recipe from video: {video_data['title']}")
+        return recipe, ScrapedExtras()
+
+
 class RecipeScraperOpenGraph(ABCScraperStrategy):
+    def can_scrape(self) -> bool:
+        """Return True when there is a URL or raw HTML to read Open Graph data from."""
+        has_url = bool(self.url)
+        return has_url or bool(self.raw_html)
+
    async def get_html(self, url: str) -> str:
        return self.raw_html or await safe_scrape_html(url)
				`@@ -0,0 +1 @@`
				`Transcribe any audio data provided to you. You should respond only with the audio transcription and nothing else.`