Mirror of https://github.com/mealie-recipes/mealie.git, synced 2025-12-30 05:47:09 -05:00
security: multiple reported CVE fixes (#1515)
* update out-of-date license
* update typing / refactor
* fix arbitrary path injection
* use a Markdown sanitizer to prevent XSS (CWE-79)
* fix CWE-918 SSRF by validating the URL and MIME type
* add security docs
* update recipe-scrapers
* resolve DoS from arbitrary URLs
* update changelog
* bump version
* add ref to #1506
* add #1511 to changelog
* use the requests decoder
* actually fix the encoding issue
```diff
@@ -11,6 +11,14 @@ from mealie.services._base_service import BaseService
 _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
 
 
+class NotAnImageError(Exception):
+    pass
+
+
+class InvalidDomainError(Exception):
+    pass
+
+
 class RecipeDataService(BaseService):
     minifier: img.ABCMinifier
 
```
```diff
@@ -56,8 +64,26 @@ class RecipeDataService(BaseService):
 
         return image_path
 
+    @staticmethod
+    def _validate_image_url(url: str) -> bool:
+        # sourcery skip: invert-any-all, use-any
+        """
+        Validates that the URL is of an allowed source and restricts certain sources to prevent
+        malicious images from being downloaded.
+        """
+        invalid_domains = {"127.0.0.1", "localhost"}
+        for domain in invalid_domains:
+            if domain in url:
+                return False
+
+        return True
+
     def scrape_image(self, image_url) -> None:
-        self.logger.info(f"Image URL: {image_url}")
+        self.logger.debug(f"Image URL: {image_url}")
+
+        if not self._validate_image_url(image_url):
+            self.logger.error(f"Invalid image URL: {image_url}")
+            raise InvalidDomainError(f"Invalid domain: {image_url}")
 
         if isinstance(image_url, str):  # Handles String Types
             pass
```
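The denylist above matches substrings of the raw URL, so a URL that merely contains a blocked string elsewhere, for example https://example.com/localhost.png, is also rejected. A variant that compares the parsed hostname exactly (a sketch, not part of this commit; only the denylist values come from the diff):

```python
# Sketch only: hostname-based version of the SSRF check above.
# INVALID_HOSTS mirrors the commit's denylist; the rest is illustrative.
from urllib.parse import urlparse

INVALID_HOSTS = {"127.0.0.1", "localhost"}


def validate_image_url(url: str) -> bool:
    """Return False when the URL's parsed hostname is a blocked local source."""
    hostname = urlparse(url).hostname or ""
    return hostname not in INVALID_HOSTS
```

With this variant, https://example.com/localhost.png passes while http://localhost:9000/img.png is still rejected.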
```diff
@@ -74,7 +100,7 @@ class RecipeDataService(BaseService):
             try:
                 r = requests.get(url, stream=True, headers={"User-Agent": _FIREFOX_UA})
             except Exception:
-                self.logger.exception("Image {url} could not be requested")
+                self.logger.exception(f"Image {url} could not be requested")
                 continue
             if r.status_code == 200:
                 all_image_requests.append((url, r))
```
```diff
@@ -100,9 +126,19 @@ class RecipeDataService(BaseService):
             self.logger.exception("Fatal Image Request Exception")
             return None
 
-        if r.status_code == 200:
-            r.raw.decode_content = True
-            self.logger.info(f"File Name Suffix {file_path.suffix}")
-            self.write_image(r.raw, file_path.suffix)
+        if r.status_code != 200:
+            # TODO: Probably should throw an exception in this case as well, but before these changes
+            # we were returning None if it failed anyways.
+            return None
 
-        file_path.unlink(missing_ok=True)
+        content_type = r.headers.get("content-type", "")
+
+        if "image" not in content_type:
+            self.logger.error(f"Content-Type: {content_type} is not an image")
+            raise NotAnImageError(f"Content-Type {content_type} is not an image")
+
+        r.raw.decode_content = True
+        self.logger.info(f"File Name Suffix {file_path.suffix}")
+        self.write_image(r.raw, file_path.suffix)
+
+        file_path.unlink(missing_ok=True)
```
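After this hunk, scrape_image can fail in two new ways in addition to returning None. A caller-side sketch (the data_service instance and the FastAPI route are assumed here, not shown in this diff):

```python
# Sketch: mapping the new exceptions to API errors in a hypothetical route.
# `data_service` is an assumed RecipeDataService; HTTPException and status
# are fastapi imports, as used elsewhere in this commit.
try:
    data_service.scrape_image(image_url)
except InvalidDomainError:
    # The SSRF guard rejected the URL (e.g. it pointed at localhost).
    raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": "INVALID_DOMAIN"})
except NotAnImageError:
    # The server answered 200 but without an image/* Content-Type.
    raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": "NOT_AN_IMAGE"})
```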
The remaining hunks touch the scraper strategies module, where the line numbers restart at the top of the file:

```diff
@@ -1,10 +1,11 @@
+import time
 from abc import ABC, abstractmethod
 from typing import Any, Callable
 
 import extruct
 import requests
 from fastapi import HTTPException, status
-from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, WebsiteNotImplementedError, scrape_me
+from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, scrape_html
 from slugify import slugify
 from w3lib.html import get_base_url
```
```diff
@@ -14,6 +15,59 @@ from mealie.services.scraper.scraped_extras import ScrapedExtras
 
 from . import cleaner
 
+SCRAPER_TIMEOUT = 15
+
+
+class ForceTimeoutException(Exception):
+    pass
+
+
+def safe_scrape_html(url: str) -> str:
+    """
+    Scrapes the html from a url but will cancel the request
+    if the request takes longer than 15 seconds. This is used to mitigate
+    DDOS attacks from users providing a url with arbitrary large content.
+    """
+    resp = requests.get(url, timeout=SCRAPER_TIMEOUT, stream=True)
+
+    html_bytes = b""
+
+    start_time = time.time()
+
+    for chunk in resp.iter_content(chunk_size=1024):
+        html_bytes += chunk
+
+        if time.time() - start_time > SCRAPER_TIMEOUT:
+            raise ForceTimeoutException()
+
+    # =====================================
+    # Coppied from requests text property
+
+    # Try charset from content-type
+    content = None
+    encoding = resp.encoding
+
+    if not html_bytes:
+        return ""
+
+    # Fallback to auto-detected encoding.
+    if encoding is None:
+        encoding = resp.apparent_encoding
+
+    # Decode unicode from given encoding.
+    try:
+        content = str(html_bytes, encoding, errors="replace")
+    except (LookupError, TypeError):
+        # A LookupError is raised if the encoding was not found which could
+        # indicate a misspelling or similar mistake.
+        #
+        # A TypeError can be raised if encoding is None
+        #
+        # So we try blindly encoding.
+        content = str(html_bytes, errors="replace")
+
+    return content
+
+
 class ABCScraperStrategy(ABC):
     """
```
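safe_scrape_html bounds how long a download may run, not how much it may send: a fast hostile server can still push a lot of data inside the 15-second window. A complementary byte cap (a sketch; MAX_HTML_BYTES is an assumed constant, not mealie code) fits in the same loop:

```python
# Sketch only: the commit's time guard plus an explicit size guard.
MAX_HTML_BYTES = 5 * 1024 * 1024  # assumed 5 MiB budget


def fetch_html_capped(url: str) -> bytes:
    resp = requests.get(url, timeout=SCRAPER_TIMEOUT, stream=True)
    html_bytes = b""
    start_time = time.time()
    for chunk in resp.iter_content(chunk_size=1024):
        html_bytes += chunk
        # Same wall-clock check as safe_scrape_html above.
        if time.time() - start_time > SCRAPER_TIMEOUT:
            raise ForceTimeoutException()
        # Extra guard: abort once the body exceeds the byte budget.
        if len(html_bytes) > MAX_HTML_BYTES:
            raise ForceTimeoutException()
    return html_bytes
```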
```diff
@@ -103,14 +157,13 @@ class RecipeScraperPackage(ABCScraperStrategy):
         return recipe, extras
 
     def scrape_url(self) -> SchemaScraperFactory.SchemaScraper | Any | None:
+        recipe_html = safe_scrape_html(self.url)
+
         try:
-            scraped_schema = scrape_me(self.url)
-        except (WebsiteNotImplementedError, AttributeError):
-            try:
-                scraped_schema = scrape_me(self.url, wild_mode=True)
-            except (NoSchemaFoundInWildMode, AttributeError):
-                self.logger.error("Recipe Scraper was unable to extract a recipe.")
-                return None
+            scraped_schema = scrape_html(recipe_html, org_url=self.url)
+        except (NoSchemaFoundInWildMode, AttributeError):
+            self.logger.error("Recipe Scraper was unable to extract a recipe.")
+            return None
 
         except ConnectionError as e:
             raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": "CONNECTION_ERROR"}) from e
```
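The rewritten scrape_url separates fetching from parsing: network I/O goes through the bounded safe_scrape_html, and recipe_scrapers only parses HTML that has already been downloaded. Roughly (the URL and the title() accessor are illustrative, not from the diff):

```python
# Sketch: the fetch/parse split introduced above. scrape_html parses
# pre-fetched HTML; org_url tells it where the page came from.
url = "https://example.com/best-brownies"  # placeholder URL
html = safe_scrape_html(url)
scraper = scrape_html(html, org_url=url)
print(scraper.title())
```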
```diff
@@ -150,7 +203,7 @@ class RecipeScraperOpenGraph(ABCScraperStrategy):
     """
 
     def get_html(self) -> str:
-        return requests.get(self.url).text
+        return safe_scrape_html(self.url)
 
     def get_recipe_fields(self, html) -> dict | None:
         """
```
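Both strategies, RecipeScraperPackage and RecipeScraperOpenGraph, now fetch through safe_scrape_html, so a slow or unbounded origin raises instead of pinning a worker. A toy illustration (placeholder URL, not a test fixture from the commit):

```python
# Sketch only: callers can treat an over-long download as "no page".
try:
    html = safe_scrape_html("http://slow.example.invalid/huge-page")
except ForceTimeoutException:
    html = ""
```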