security: multiple reported CVE fixes (#1515)

* update out of date license

* update typing / refactor

* fix arbitrary path injection

* use markdown sanitizer to prevent XSS (CWE-79)

* fix CWE-918 SSRF by validating URL and MIME type

* add security docs

* update recipe-scrapers

* resolve DoS from arbitrary URL

* update changelog

* bump version

* add ref to #1506

* add #1511 to changelog

* use requests decoder

* actually fix encoding issue
Author: Hayden
Date: 2022-07-31 13:10:20 -08:00
Committed by: GitHub
Commit: 13850cda1f (parent: 483f789b8e)
23 changed files with 401 additions and 118 deletions
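Only two of the 23 changed files are excerpted below; the markdown sanitizer fix for the XSS issue (CWE-79) is not among them. As a rough illustration of that class of fix only — the `bleach` helper below is an assumption, not code from this commit — server-side sanitization typically strips rendered HTML down to an allow-list:

```python
# Illustrative allow-list sanitization for user-supplied markdown/HTML.
# Assumption: this is NOT the code from this commit, just a minimal sketch
# of the CWE-79 mitigation the changelog bullet describes.
import bleach

ALLOWED_TAGS = ["p", "em", "strong", "ul", "ol", "li", "a", "code", "pre"]
ALLOWED_ATTRIBUTES = {"a": ["href", "title"]}


def sanitize_rendered_markdown(html: str) -> str:
    # Drop any tag or attribute outside the allow-list instead of escaping it.
    return bleach.clean(html, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, strip=True)
```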


@@ -11,6 +11,14 @@ from mealie.services._base_service import BaseService
 _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
 
+
+class NotAnImageError(Exception):
+    pass
+
+
+class InvalidDomainError(Exception):
+    pass
+
 
 class RecipeDataService(BaseService):
     minifier: img.ABCMinifier
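The two new exception types let callers distinguish a rejected URL from a payload that is not an image. A hypothetical route-level handler — the actual route code is not in this excerpt — might translate both into a 400 response:

```python
# Hypothetical caller-side handling; not part of this diff.
from fastapi import HTTPException, status


def scrape_image_endpoint(service: RecipeDataService, image_url: str) -> None:
    try:
        service.scrape_image(image_url)
    except (InvalidDomainError, NotAnImageError) as e:
        # Bad user input, not a server fault: surface it as a 400.
        raise HTTPException(status.HTTP_400_BAD_REQUEST, detail=str(e)) from e
```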
@@ -56,8 +64,26 @@ class RecipeDataService(BaseService):
         return image_path
 
+    @staticmethod
+    def _validate_image_url(url: str) -> bool:
+        # sourcery skip: invert-any-all, use-any
+        """
+        Validates that the URL is of an allowed source and restricts certain sources to prevent
+        malicious images from being downloaded.
+        """
+        invalid_domains = {"127.0.0.1", "localhost"}
+        for domain in invalid_domains:
+            if domain in url:
+                return False
+        return True
+
     def scrape_image(self, image_url) -> None:
-        self.logger.info(f"Image URL: {image_url}")
+        self.logger.debug(f"Image URL: {image_url}")
+
+        if not self._validate_image_url(image_url):
+            self.logger.error(f"Invalid image URL: {image_url}")
+            raise InvalidDomainError(f"Invalid domain: {image_url}")
 
         if isinstance(image_url, str):  # Handles String Types
             pass
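The substring check blocks the common loopback spellings, but a plain `in` test can be sidestepped, e.g. by `http://0x7f000001/` or a DNS name that resolves to an internal host. A stricter sketch — the helper below is hypothetical, not part of this commit — resolves the hostname and rejects non-public addresses:

```python
# Hypothetical hardening sketch for SSRF (CWE-918): resolve the URL's host
# and reject private, loopback, link-local, and reserved addresses.
import ipaddress
import socket
from urllib.parse import urlparse


def is_public_http_url(url: str) -> bool:
    parsed = urlparse(url)
    if parsed.scheme not in {"http", "https"} or not parsed.hostname:
        return False
    try:
        # Check every address the hostname resolves to, not just the first.
        infos = socket.getaddrinfo(parsed.hostname, None, proto=socket.IPPROTO_TCP)
    except socket.gaierror:
        return False
    for info in infos:
        ip = ipaddress.ip_address(info[4][0])
        if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
            return False
    return True
```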
@@ -74,7 +100,7 @@ class RecipeDataService(BaseService):
            try:
                r = requests.get(url, stream=True, headers={"User-Agent": _FIREFOX_UA})
            except Exception:
-               self.logger.exception("Image {url} could not be requested")
+               self.logger.exception(f"Image {url} could not be requested")
                continue
 
            if r.status_code == 200:
                all_image_requests.append((url, r))
@@ -100,9 +126,19 @@ class RecipeDataService(BaseService):
            self.logger.exception("Fatal Image Request Exception")
            return None
 
-       if r.status_code == 200:
-           r.raw.decode_content = True
-           self.logger.info(f"File Name Suffix {file_path.suffix}")
-           self.write_image(r.raw, file_path.suffix)
+       if r.status_code != 200:
+           # TODO: Probably should throw an exception in this case as well, but before these changes
+           # we were returning None if it failed anyways.
+           return None
 
-           file_path.unlink(missing_ok=True)
+       content_type = r.headers.get("content-type", "")
+
+       if "image" not in content_type:
+           self.logger.error(f"Content-Type: {content_type} is not an image")
+           raise NotAnImageError(f"Content-Type {content_type} is not an image")
+
+       r.raw.decode_content = True
+       self.logger.info(f"File Name Suffix {file_path.suffix}")
+       self.write_image(r.raw, file_path.suffix)
+
+       file_path.unlink(missing_ok=True)
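Since Content-Type is supplied by the remote server, the header check is a cheap filter rather than proof of an image. If stronger validation were wanted, the downloaded bytes could be verified with Pillow — an assumption here, not something this diff does:

```python
# Hedged sketch: verify that downloaded bytes actually parse as an image.
# Assumes Pillow is available; not part of the diff above.
import io

from PIL import Image, UnidentifiedImageError


def looks_like_image(data: bytes) -> bool:
    try:
        with Image.open(io.BytesIO(data)) as im:
            im.verify()  # cheap structural check; does not decode full pixel data
        return True
    except (UnidentifiedImageError, OSError):
        return False
```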


@@ -1,10 +1,11 @@
+import time
 from abc import ABC, abstractmethod
 from typing import Any, Callable
 
 import extruct
 import requests
 from fastapi import HTTPException, status
-from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, WebsiteNotImplementedError, scrape_me
+from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, scrape_html
 from slugify import slugify
 from w3lib.html import get_base_url
@@ -14,6 +15,59 @@ from mealie.services.scraper.scraped_extras import ScrapedExtras
 from . import cleaner
 
+SCRAPER_TIMEOUT = 15
+
+
+class ForceTimeoutException(Exception):
+    pass
+
+
+def safe_scrape_html(url: str) -> str:
+    """
+    Scrapes the html from a url but will cancel the request
+    if the request takes longer than 15 seconds. This is used to mitigate
+    DoS attacks from users providing a url with arbitrarily large content.
+    """
+    resp = requests.get(url, timeout=SCRAPER_TIMEOUT, stream=True)
+
+    html_bytes = b""
+
+    start_time = time.time()
+
+    for chunk in resp.iter_content(chunk_size=1024):
+        html_bytes += chunk
+
+        if time.time() - start_time > SCRAPER_TIMEOUT:
+            raise ForceTimeoutException()
+
+    # =====================================
+    # Copied from the requests text property
+
+    # Try charset from content-type
+    content = None
+    encoding = resp.encoding
+
+    if not html_bytes:
+        return ""
+
+    # Fallback to auto-detected encoding.
+    if encoding is None:
+        encoding = resp.apparent_encoding
+
+    # Decode unicode from given encoding.
+    try:
+        content = str(html_bytes, encoding, errors="replace")
+    except (LookupError, TypeError):
+        # A LookupError is raised if the encoding was not found which could
+        # indicate a misspelling or similar mistake.
+        #
+        # A TypeError can be raised if encoding is None
+        #
+        # So we try blindly decoding.
+        content = str(html_bytes, errors="replace")
+
+    return content
+
+
 class ABCScraperStrategy(ABC):
     """
@@ -103,14 +157,13 @@ class RecipeScraperPackage(ABCScraperStrategy):
         return recipe, extras
 
     def scrape_url(self) -> SchemaScraperFactory.SchemaScraper | Any | None:
+        recipe_html = safe_scrape_html(self.url)
+
         try:
-            scraped_schema = scrape_me(self.url)
-        except (WebsiteNotImplementedError, AttributeError):
-            try:
-                scraped_schema = scrape_me(self.url, wild_mode=True)
-            except (NoSchemaFoundInWildMode, AttributeError):
-                self.logger.error("Recipe Scraper was unable to extract a recipe.")
-                return None
+            scraped_schema = scrape_html(recipe_html, org_url=self.url)
+        except (NoSchemaFoundInWildMode, AttributeError):
+            self.logger.error("Recipe Scraper was unable to extract a recipe.")
+            return None
         except ConnectionError as e:
             raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": "CONNECTION_ERROR"}) from e
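With the library no longer fetching the page itself, every scrape goes through the guarded downloader exactly once and recipe_scrapers only parses local HTML. Assuming a recipe_scrapers release that exposes scrape_html, the flow looks like this:

```python
# Usage sketch of the new flow; assumes a recipe_scrapers version with scrape_html.
from recipe_scrapers import scrape_html


def fetch_recipe_name(url: str) -> str:
    html = safe_scrape_html(url)  # single, time-bounded network fetch
    scraper = scrape_html(html, org_url=url)  # offline parse of the fetched HTML
    return scraper.title()
```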
@@ -150,7 +203,7 @@ class RecipeScraperOpenGraph(ABCScraperStrategy):
     """
 
     def get_html(self) -> str:
-        return requests.get(self.url).text
+        return safe_scrape_html(self.url)
 
     def get_recipe_fields(self, html) -> dict | None:
         """