security: multiple reported CVE fixes (#1515)

* update out-of-date license

* update typing / refactor

* fix arbitrary path injection

* use markdown sanitizer to prevent XSS (CWE-79); see the sketch after this list

* fix CWE-918 SSRF by validating URL and MIME type

* add security docs

* update recipe-scrapers

* resolve DoS from arbitrary URL

* update changelog

* bump version

* add ref to #1506

* add #1511 to changelog

* use requests decoder

* actually fix encoding issue
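
The markdown sanitizer change referenced above is not part of the hunks shown below. As a rough sketch of the CWE-79 mitigation, server-side sanitization might look like the following; it assumes the `markdown` and `bleach` packages, and `render_recipe_notes` is a hypothetical helper, not code from this commit:

import bleach
import markdown

ALLOWED_TAGS = ["p", "ul", "ol", "li", "strong", "em", "a", "code", "pre"]
ALLOWED_ATTRIBUTES = {"a": ["href", "title"]}

def render_recipe_notes(raw_markdown: str) -> str:
    # Hypothetical helper: render user-supplied markdown, then sanitize the
    # resulting HTML so embedded <script> tags or event handlers cannot run.
    html = markdown.markdown(raw_markdown)
    return bleach.clean(html, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, strip=True)
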
Author: Hayden
Date: 2022-07-31 13:10:20 -08:00
Committed by: GitHub
parent 483f789b8e
commit 13850cda1f
23 changed files with 401 additions and 118 deletions

View File

@@ -1,6 +1,6 @@
 from pathlib import Path

-APP_VERSION = "v1.0.0beta-3"
+APP_VERSION = "v1.0.0beta-4"

 CWD = Path(__file__).parent
 BASE_DIR = CWD.parent.parent.parent

View File

@@ -29,13 +29,13 @@ from mealie.schema.response.responses import ErrorResponse
 from mealie.services import urls
 from mealie.services.event_bus_service.event_bus_service import EventBusService, EventSource
 from mealie.services.event_bus_service.message_types import EventTypes
-from mealie.services.recipe.recipe_data_service import RecipeDataService
+from mealie.services.recipe.recipe_data_service import InvalidDomainError, NotAnImageError, RecipeDataService
 from mealie.services.recipe.recipe_service import RecipeService
 from mealie.services.recipe.template_service import TemplateService
 from mealie.services.scraper.recipe_bulk_scraper import RecipeBulkScraperService
 from mealie.services.scraper.scraped_extras import ScraperContext
 from mealie.services.scraper.scraper import create_from_url
-from mealie.services.scraper.scraper_strategies import RecipeScraperPackage
+from mealie.services.scraper.scraper_strategies import ForceTimeoutException, RecipeScraperPackage

 class BaseRecipeController(BaseUserController):
@@ -139,7 +139,12 @@ class RecipeController(BaseRecipeController):
     @router.post("/create-url", status_code=201, response_model=str)
     def parse_recipe_url(self, req: ScrapeRecipe):
         """Takes in a URL and attempts to scrape data and load it into the database"""
-        recipe, extras = create_from_url(req.url)
+        try:
+            recipe, extras = create_from_url(req.url)
+        except ForceTimeoutException as e:
+            raise HTTPException(
+                status_code=408, detail=ErrorResponse.respond(message="Recipe Scraping Timed Out")
+            ) from e

         if req.include_tags:
             ctx = ScraperContext(self.user.id, self.group_id, self.repos)
@@ -176,8 +181,13 @@ class RecipeController(BaseRecipeController):
     @router.post("/test-scrape-url")
     def test_parse_recipe_url(self, url: ScrapeRecipeTest):
         # Debugger should produce the same result as the scraper sees before cleaning
-        if scraped_data := RecipeScraperPackage(url.url).scrape_url():
-            return scraped_data.schema.data
+        try:
+            if scraped_data := RecipeScraperPackage(url.url).scrape_url():
+                return scraped_data.schema.data
+        except ForceTimeoutException as e:
+            raise HTTPException(
+                status_code=408, detail=ErrorResponse.respond(message="Recipe Scraping Timed Out")
+            ) from e

         return "recipe_scrapers was unable to scrape this URL"
@@ -314,7 +324,19 @@ class RecipeController(BaseRecipeController):
     def scrape_image_url(self, slug: str, url: ScrapeRecipe):
         recipe = self.mixins.get_one(slug)
         data_service = RecipeDataService(recipe.id)
-        data_service.scrape_image(url.url)
+        try:
+            data_service.scrape_image(url.url)
+        except NotAnImageError as e:
+            raise HTTPException(
+                status_code=400,
+                detail=ErrorResponse.respond("URL is not an image"),
+            ) from e
+        except InvalidDomainError as e:
+            raise HTTPException(
+                status_code=400,
+                detail=ErrorResponse.respond("URL is not from an allowed domain"),
+            ) from e

         recipe.image = cache.cache_key.new_key()
         self.service.update_one(recipe.slug, recipe)
@@ -338,13 +360,27 @@ class RecipeController(BaseRecipeController):
         file: UploadFile = File(...),
     ):
         """Upload a file to store as a recipe asset"""
-        file_name = f"{slugify(name)}.{extension}"
+        if "." in extension:
+            extension = extension.split(".")[-1]
+
+        file_slug = slugify(name)
+        if not extension or not file_slug:
+            raise HTTPException(status_code=400, detail="Missing required fields")
+
+        file_name = f"{file_slug}.{extension}"
         asset_in = RecipeAsset(name=name, icon=icon, file_name=file_name)
         recipe = self.mixins.get_one(slug)
         dest = recipe.asset_dir / file_name
+
+        # Ensure the destination path resolves inside the recipe's asset directory
+        if dest.absolute().parent != recipe.asset_dir:
+            raise HTTPException(
+                status_code=400,
+                detail=f"File name {file_name} or extension {extension} not valid",
+            )

         with dest.open("wb") as buffer:
             copyfileobj(file.file, buffer)
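
To see how the parent-directory guard above stops traversal, here is a small worked example; the paths and inputs are made up for illustration and are not part of the commit:

from pathlib import Path

from slugify import slugify

asset_dir = Path("/app/data/recipes/pasta/assets")  # hypothetical asset directory

name = "notes"
extension = "../../../etc/evil"       # attacker-controlled form field
extension = extension.split(".")[-1]  # -> "/etc/evil"

file_name = f"{slugify(name)}.{extension}"  # -> "notes./etc/evil"
dest = asset_dir / file_name

# The slash smuggled in through the extension moves dest out of the asset
# directory, so the parent comparison fails and the request is rejected.
assert dest.absolute().parent != asset_dir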

View File

@@ -11,6 +11,14 @@ from mealie.services._base_service import BaseService
 _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"

+class NotAnImageError(Exception):
+    pass
+
+
+class InvalidDomainError(Exception):
+    pass

 class RecipeDataService(BaseService):
     minifier: img.ABCMinifier
@@ -56,8 +64,26 @@ class RecipeDataService(BaseService):
         return image_path

+    @staticmethod
+    def _validate_image_url(url: str) -> bool:
+        # sourcery skip: invert-any-all, use-any
+        """
+        Validates that the URL comes from an allowed source and blocks certain
+        hosts to prevent malicious images from being downloaded.
+        """
+        invalid_domains = {"127.0.0.1", "localhost"}
+        for domain in invalid_domains:
+            if domain in url:
+                return False
+        return True
+
     def scrape_image(self, image_url) -> None:
-        self.logger.info(f"Image URL: {image_url}")
+        self.logger.debug(f"Image URL: {image_url}")
+
+        if not self._validate_image_url(image_url):
+            self.logger.error(f"Invalid image URL: {image_url}")
+            raise InvalidDomainError(f"Invalid domain: {image_url}")

         if isinstance(image_url, str):  # Handles String Types
             pass
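
The substring check in `_validate_image_url` is deliberately broad: it also catches forms like `http://127.0.0.1:8080/img.png`, though it will reject harmless URLs that merely contain the string, such as `https://example.com/localhost.png`. A stricter variant (an assumption for illustration, not part of this commit) would compare the parsed hostname instead:

from urllib.parse import urlparse

def validate_image_url_strict(url: str) -> bool:
    # Hypothetical stricter variant: compare the parsed hostname exactly
    # rather than substring-matching the whole URL.
    blocked_hosts = {"127.0.0.1", "localhost", "::1", "0.0.0.0"}
    hostname = urlparse(url).hostname or ""
    return hostname not in blocked_hosts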
@@ -74,7 +100,7 @@ class RecipeDataService(BaseService):
             try:
                 r = requests.get(url, stream=True, headers={"User-Agent": _FIREFOX_UA})
             except Exception:
-                self.logger.exception("Image {url} could not be requested")
+                self.logger.exception(f"Image {url} could not be requested")
                 continue

             if r.status_code == 200:
                 all_image_requests.append((url, r))
@@ -100,9 +126,19 @@ class RecipeDataService(BaseService):
             self.logger.exception("Fatal Image Request Exception")
             return None

-        if r.status_code == 200:
-            r.raw.decode_content = True
-            self.logger.info(f"File Name Suffix {file_path.suffix}")
-            self.write_image(r.raw, file_path.suffix)
-
-            file_path.unlink(missing_ok=True)
+        if r.status_code != 200:
+            # TODO: Probably should throw an exception in this case as well, but before
+            # these changes we were returning None on failure anyway.
+            return None
+
+        content_type = r.headers.get("content-type", "")
+
+        if "image" not in content_type:
+            self.logger.error(f"Content-Type: {content_type} is not an image")
+            raise NotAnImageError(f"Content-Type {content_type} is not an image")
+
+        r.raw.decode_content = True
+        self.logger.info(f"File Name Suffix {file_path.suffix}")
+        self.write_image(r.raw, file_path.suffix)
+
+        file_path.unlink(missing_ok=True)
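
One caveat on the Content-Type check above: the header is set by the remote server and can lie. A belt-and-braces variant (a sketch under that assumption, not code from this commit) would also sniff well-known image signatures from the payload itself:

def looks_like_image(first_bytes: bytes) -> bool:
    # Hypothetical extra check: sniff magic numbers instead of trusting
    # the server-supplied Content-Type header alone.
    signatures = (
        b"\xff\xd8\xff",        # JPEG
        b"\x89PNG\r\n\x1a\n",   # PNG
        b"GIF87a",              # GIF (1987 variant)
        b"GIF89a",              # GIF (1989 variant)
        b"RIFF",                # WEBP lives inside a RIFF container
    )
    return first_bytes.startswith(signatures)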

View File

@@ -1,10 +1,11 @@
+import time
 from abc import ABC, abstractmethod
 from typing import Any, Callable

 import extruct
 import requests
 from fastapi import HTTPException, status
-from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, WebsiteNotImplementedError, scrape_me
+from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, scrape_html
 from slugify import slugify
 from w3lib.html import get_base_url
@@ -14,6 +15,59 @@ from mealie.services.scraper.scraped_extras import ScrapedExtras
 from . import cleaner

+SCRAPER_TIMEOUT = 15
+
+
+class ForceTimeoutException(Exception):
+    pass
+
+
+def safe_scrape_html(url: str) -> str:
+    """
+    Fetches the HTML from a URL, but cancels the request if it takes longer
+    than SCRAPER_TIMEOUT seconds. This mitigates denial-of-service attacks
+    where a user supplies a URL whose response body is arbitrarily large.
+    """
+    resp = requests.get(url, timeout=SCRAPER_TIMEOUT, stream=True)
+
+    html_bytes = b""
+
+    start_time = time.time()
+
+    for chunk in resp.iter_content(chunk_size=1024):
+        html_bytes += chunk
+
+        if time.time() - start_time > SCRAPER_TIMEOUT:
+            raise ForceTimeoutException()
+
+    # =====================================
+    # Copied from requests' text property
+
+    # Try charset from content-type
+    content = None
+    encoding = resp.encoding
+
+    if not html_bytes:
+        return ""
+
+    # Fall back to auto-detected encoding.
+    if encoding is None:
+        encoding = resp.apparent_encoding
+
+    # Decode unicode from the given encoding.
+    try:
+        content = str(html_bytes, encoding, errors="replace")
+    except (LookupError, TypeError):
+        # A LookupError is raised if the encoding was not found, which could
+        # indicate a misspelling or similar mistake.
+        #
+        # A TypeError can be raised if encoding is None.
+        #
+        # So we try decoding blindly.
+        content = str(html_bytes, errors="replace")
+
+    return content

 class ABCScraperStrategy(ABC):
     """
@@ -103,14 +157,13 @@ class RecipeScraperPackage(ABCScraperStrategy):
         return recipe, extras

     def scrape_url(self) -> SchemaScraperFactory.SchemaScraper | Any | None:
+        recipe_html = safe_scrape_html(self.url)
+
         try:
-            scraped_schema = scrape_me(self.url)
-        except (WebsiteNotImplementedError, AttributeError):
-            try:
-                scraped_schema = scrape_me(self.url, wild_mode=True)
-            except (NoSchemaFoundInWildMode, AttributeError):
-                self.logger.error("Recipe Scraper was unable to extract a recipe.")
-                return None
+            scraped_schema = scrape_html(recipe_html, org_url=self.url)
+        except (NoSchemaFoundInWildMode, AttributeError):
+            self.logger.error("Recipe Scraper was unable to extract a recipe.")
+            return None
         except ConnectionError as e:
             raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": "CONNECTION_ERROR"}) from e
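
A note on why `safe_scrape_html` needs the manual wall-clock check: the `timeout` argument to `requests.get` bounds connection setup and each individual socket read, not the total download, so a server that trickles bytes indefinitely would never trip it. Callers handle the forced timeout the same way the controllers above do; a hypothetical usage sketch, not code from this commit:

from mealie.services.scraper.scraper_strategies import ForceTimeoutException, safe_scrape_html

def fetch_or_empty(url: str) -> str:
    # Hypothetical helper: degrade to an empty document instead of letting
    # an attacker-controlled URL hold the worker indefinitely.
    try:
        return safe_scrape_html(url)
    except ForceTimeoutException:
        return ""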
@@ -150,7 +203,7 @@ class RecipeScraperOpenGraph(ABCScraperStrategy):
     """

     def get_html(self) -> str:
-        return requests.get(self.url).text
+        return safe_scrape_html(self.url)

     def get_recipe_fields(self, html) -> dict | None:
         """