security: gh security recs (#3368)

* change ALLOW_SIGNUP to default to false

* add 1.4.0 tag for OIDC docs

* new notes on security in line with security/policy review

* safer transport for external requests

* fix linter errors

* docs: Tidy up wording/formatting

* fix request errors

* whoops

* fix implementation with std lib

* format

* Remove check on netloc_parts. It only includes the URL after any @

---------

Co-authored-by: boc-the-git <3479092+boc-the-git@users.noreply.github.com>
Co-authored-by: Brendan <b.oconnell14@gmail.com>
Author: Hayden
Committed: 2024-04-02 10:04:42 -05:00 (committed by GitHub)
Parent: 737a370874
Commit: 2a3463b746
11 changed files with 180 additions and 54 deletions
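
Note on the main change: the diff below routes every outbound httpx request through an AsyncSafeTransport. The transport class itself is not part of the hunks shown here; as a rough sketch (not the project's actual implementation), a "safe" transport for httpx typically resolves the target host and refuses to dispatch requests that land on loopback, link-local, or private address space, which is what blocks SSRF-style fetches of internal services:

import ipaddress
import socket

import httpx


class AsyncSafeTransport(httpx.AsyncBaseTransport):
    """Illustrative sketch only: reject requests to non-public addresses."""

    def __init__(self, **kwargs):
        self._wrapped = httpx.AsyncHTTPTransport(**kwargs)

    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
        host = request.url.host
        # Resolve the hostname (a blocking call, acceptable for a sketch) and
        # inspect every address it maps to before letting the request through.
        for *_rest, sockaddr in socket.getaddrinfo(host, None):
            ip = ipaddress.ip_address(sockaddr[0].split("%")[0])
            if ip.is_loopback or ip.is_link_local or ip.is_private or ip.is_reserved:
                raise httpx.ConnectError(f"Blocked request to non-public address: {host}")
        return await self._wrapped.handle_async_request(request)

In the hunks below, the real transport is plugged in as AsyncClient(transport=AsyncSafeTransport()) (or safehttp.AsyncSafeTransport()), so image downloads and page scrapes all go through the same address check.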

View File

@@ -5,7 +5,8 @@ from pathlib import Path
 from httpx import AsyncClient, Response
 from pydantic import UUID4
-from mealie.pkgs import img
+from mealie.pkgs import img, safehttp
+from mealie.pkgs.safehttp.transport import AsyncSafeTransport
 from mealie.schema.recipe.recipe import Recipe
 from mealie.services._base_service import BaseService
@@ -29,12 +30,14 @@ async def largest_content_len(urls: list[str]) -> tuple[str, int]:
     largest_url = ""
     largest_len = 0
+    max_concurrency = 10

     async def do(client: AsyncClient, url: str) -> Response:
         return await client.head(url, headers={"User-Agent": _FIREFOX_UA})

-    async with AsyncClient() as client:
+    async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
         tasks = [do(client, url) for url in urls]
-        responses: list[Response] = await gather_with_concurrency(10, *tasks, ignore_exceptions=True)
+        responses: list[Response] = await gather_with_concurrency(max_concurrency, *tasks, ignore_exceptions=True)

         for response in responses:
             len_int = int(response.headers.get("Content-Length", 0))
             if len_int > largest_len:
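
gather_with_concurrency is not part of this diff; the hunk above only threads the new max_concurrency constant into the existing call. As a minimal sketch of what such a helper usually looks like (the semaphore approach and the ignore_exceptions handling are assumptions inferred from the call site, not the project's code):

import asyncio
from collections.abc import Coroutine
from typing import Any


async def gather_with_concurrency(n: int, *coros: Coroutine, ignore_exceptions: bool = False) -> list[Any]:
    # Allow at most `n` coroutines to run at once.
    semaphore = asyncio.Semaphore(n)

    async def sem_coro(coro: Coroutine) -> Any:
        async with semaphore:
            return await coro

    results = await asyncio.gather(*(sem_coro(c) for c in coros), return_exceptions=ignore_exceptions)
    if ignore_exceptions:
        # Drop failures so callers only iterate successful responses.
        results = [r for r in results if not isinstance(r, BaseException)]
    return results

With this shape, the code above issues up to ten concurrent HEAD requests and simply skips any URL whose request failed.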
@@ -101,42 +104,29 @@ class RecipeDataService(BaseService):
         return image_path

-    @staticmethod
-    def _validate_image_url(url: str) -> bool:
-        # sourcery skip: invert-any-all, use-any
-        """
-        Validates that the URL is of an allowed source and restricts certain sources to prevent
-        malicious images from being downloaded.
-        """
-        invalid_domains = {"127.0.0.1", "localhost"}
-        for domain in invalid_domains:
-            if domain in url:
-                return False
-        return True
-
-    async def scrape_image(self, image_url) -> None:
+    async def scrape_image(self, image_url: str | dict[str, str] | list[str]) -> None:
         self.logger.info(f"Image URL: {image_url}")
-        if not self._validate_image_url(image_url):
-            self.logger.error(f"Invalid image URL: {image_url}")
-            raise InvalidDomainError(f"Invalid domain: {image_url}")
+        image_url_str = ""

         if isinstance(image_url, str):  # Handles String Types
-            pass
+            image_url_str = image_url

         elif isinstance(image_url, list):  # Handles List Types
             # Multiple images have been defined in the schema - usually different resolutions
             # Typically would be in smallest->biggest order, but can't be certain so test each.
             # 'Google will pick the best image to display in Search results based on the aspect ratio and resolution.'
-            image_url, _ = await largest_content_len(image_url)
+            image_url_str, _ = await largest_content_len(image_url)

         elif isinstance(image_url, dict):  # Handles Dictionary Types
             for key in image_url:
                 if key == "url":
-                    image_url = image_url.get("url")
+                    image_url_str = image_url.get("url", "")

-        ext = image_url.split(".")[-1]
+        if not image_url_str:
+            raise ValueError(f"image url could not be parsed from input: {image_url}")
+
+        ext = image_url_str.split(".")[-1]

         if ext not in img.IMAGE_EXTENSIONS:
             ext = "jpg"  # Guess the extension
@@ -144,9 +134,9 @@ class RecipeDataService(BaseService):
         file_name = f"{str(self.recipe_id)}.{ext}"
         file_path = Recipe.directory_from_id(self.recipe_id).joinpath("images", file_name)

-        async with AsyncClient() as client:
+        async with AsyncClient(transport=AsyncSafeTransport()) as client:
             try:
-                r = await client.get(image_url, headers={"User-Agent": _FIREFOX_UA})
+                r = await client.get(image_url_str, headers={"User-Agent": _FIREFOX_UA})
             except Exception:
                 self.logger.exception("Fatal Image Request Exception")
                 return None
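
For context on the branches above: the image value scraped from a recipe page can arrive in several shapes, and the rewrite normalizes each into image_url_str before downloading. A few hypothetical examples of those inputs (URLs made up for illustration):

# str: a single image URL, used as-is
image_a = "https://example.com/recipe.jpg"

# list: multiple candidate images (often different resolutions);
# largest_content_len() HEADs each and keeps the largest one
image_b = [
    "https://example.com/recipe-1x1.jpg",
    "https://example.com/recipe-16x9.jpg",
]

# dict: an ImageObject-style mapping; only the "url" key is used
image_c = {"@type": "ImageObject", "url": "https://example.com/recipe.jpg"}

# e.g. await RecipeDataService(recipe_id).scrape_image(image_b)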

View File

@@ -43,7 +43,7 @@ async def create_from_url(url: str, translator: Translator) -> tuple[Recipe, Scr
     recipe_data_service = RecipeDataService(new_recipe.id)

     try:
-        await recipe_data_service.scrape_image(new_recipe.image)
+        await recipe_data_service.scrape_image(new_recipe.image)  # type: ignore

         if new_recipe.name is None:
             new_recipe.name = "Untitled"

View File

@@ -12,6 +12,7 @@ from w3lib.html import get_base_url
 from mealie.core.root_logger import get_logger
 from mealie.lang.providers import Translator
+from mealie.pkgs import safehttp
 from mealie.schema.recipe.recipe import Recipe, RecipeStep
 from mealie.services.scraper.scraped_extras import ScrapedExtras
@@ -31,7 +32,7 @@ async def safe_scrape_html(url: str) -> str:
     if the request takes longer than 15 seconds. This is used to mitigate
     DDOS attacks from users providing a url with arbitrary large content.
     """
-    async with AsyncClient() as client:
+    async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
         html_bytes = b""
         async with client.stream("GET", url, timeout=SCRAPER_TIMEOUT, headers={"User-Agent": _FIREFOX_UA}) as resp:
             start_time = time.time()
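
The hunk is cut off at start_time; the streaming loop that follows (unchanged by this commit, so not shown) is what enforces the time budget described in the docstring. A rough, self-contained sketch of that kind of guard, with a size cap added as an extra assumption rather than something the docstring promises:

import time

from httpx import AsyncClient

SCRAPER_TIMEOUT = 15  # seconds, per the docstring above
MAX_HTML_BYTES = 2 * 1024 * 1024  # size cap: an assumption for this sketch


async def stream_html_with_guard(url: str) -> str:
    # Pull the body down in small chunks and abort as soon as the download
    # exceeds the time budget or the size cap, instead of buffering an
    # unbounded response in memory.
    html_bytes = b""
    async with AsyncClient() as client:
        async with client.stream("GET", url, timeout=SCRAPER_TIMEOUT) as resp:
            start_time = time.time()
            async for chunk in resp.aiter_bytes(chunk_size=1024):
                html_bytes += chunk
                if time.time() - start_time > SCRAPER_TIMEOUT:
                    raise TimeoutError(f"scrape of {url} exceeded {SCRAPER_TIMEOUT}s")
                if len(html_bytes) > MAX_HTML_BYTES:
                    raise ValueError(f"scrape of {url} exceeded {MAX_HTML_BYTES} bytes")
    return html_bytes.decode("utf-8", errors="ignore")

Paired with the AsyncSafeTransport change in this hunk, the scraper both refuses to talk to internal addresses and refuses to keep reading an attacker-controlled response indefinitely.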