security: multiple reported CVE fixes (#1515)

* update out of date license

* update typing / refactor

* fix arbitrary path injection

* use markdown sanitizer to prevent XSS (CWE-79)

* fix CWE-918 SSRF by validating URL and MIME type

* add security docs

* update recipe-scrapers

* resolve DoS from arbitrary URL

* update changelog

* bump version

* add ref to #1506

* add #1511 to changelog

* use requests decoder

* actually fix encoding issue
Author: Hayden
Date: 2022-07-31 13:10:20 -08:00
Committed by: GitHub
Commit: 13850cda1f (parent: 483f789b8e)
23 changed files with 401 additions and 118 deletions
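Only two of the 23 changed files are excerpted below; the markdown sanitizer fix for the XSS issue (CWE-79) is not among them. As a rough illustration of that class of fix only — the `bleach` helper below is an assumption, not code from this commit — server-side sanitization typically strips rendered HTML down to an allow-list:

```python
# Illustrative allow-list sanitization for user-supplied markdown/HTML.
# Assumption: this is NOT the code from this commit, just a minimal sketch
# of the CWE-79 mitigation the changelog bullet describes.
import bleach

ALLOWED_TAGS = ["p", "em", "strong", "ul", "ol", "li", "a", "code", "pre"]
ALLOWED_ATTRIBUTES = {"a": ["href", "title"]}


def sanitize_rendered_markdown(html: str) -> str:
    # Drop any tag or attribute outside the allow-list instead of escaping it.
    return bleach.clean(html, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, strip=True)
```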


@@ -11,6 +11,14 @@ from mealie.services._base_service import BaseService
 _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
 
+
+class NotAnImageError(Exception):
+    pass
+
+
+class InvalidDomainError(Exception):
+    pass
+
 
 class RecipeDataService(BaseService):
     minifier: img.ABCMinifier
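The two new exception types let callers distinguish a rejected URL from a payload that is not an image. A hypothetical route-level handler — the actual route code is not in this excerpt — might translate both into a 400 response:

```python
# Hypothetical caller-side handling; not part of this diff.
from fastapi import HTTPException, status


def scrape_image_endpoint(service: RecipeDataService, image_url: str) -> None:
    try:
        service.scrape_image(image_url)
    except (InvalidDomainError, NotAnImageError) as e:
        # Bad user input, not a server fault: surface it as a 400.
        raise HTTPException(status.HTTP_400_BAD_REQUEST, detail=str(e)) from e
```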
@@ -56,8 +64,26 @@ class RecipeDataService(BaseService):
         return image_path
 
+    @staticmethod
+    def _validate_image_url(url: str) -> bool:
+        # sourcery skip: invert-any-all, use-any
+        """
+        Validates that the URL is of an allowed source and restricts certain sources to prevent
+        malicious images from being downloaded.
+        """
+        invalid_domains = {"127.0.0.1", "localhost"}
+        for domain in invalid_domains:
+            if domain in url:
+                return False
+        return True
+
     def scrape_image(self, image_url) -> None:
-        self.logger.info(f"Image URL: {image_url}")
+        self.logger.debug(f"Image URL: {image_url}")
+
+        if not self._validate_image_url(image_url):
+            self.logger.error(f"Invalid image URL: {image_url}")
+            raise InvalidDomainError(f"Invalid domain: {image_url}")
 
         if isinstance(image_url, str):  # Handles String Types
             pass
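The substring check blocks the common loopback spellings, but a plain `in` test can be sidestepped, e.g. by `http://0x7f000001/` or a DNS name that resolves to an internal host. A stricter sketch — the helper below is hypothetical, not part of this commit — resolves the hostname and rejects non-public addresses:

```python
# Hypothetical hardening sketch for SSRF (CWE-918): resolve the URL's host
# and reject private, loopback, link-local, and reserved addresses.
import ipaddress
import socket
from urllib.parse import urlparse


def is_public_http_url(url: str) -> bool:
    parsed = urlparse(url)
    if parsed.scheme not in {"http", "https"} or not parsed.hostname:
        return False
    try:
        # Check every address the hostname resolves to, not just the first.
        infos = socket.getaddrinfo(parsed.hostname, None, proto=socket.IPPROTO_TCP)
    except socket.gaierror:
        return False
    for info in infos:
        ip = ipaddress.ip_address(info[4][0])
        if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
            return False
    return True
```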
@@ -74,7 +100,7 @@ class RecipeDataService(BaseService):
            try:
                r = requests.get(url, stream=True, headers={"User-Agent": _FIREFOX_UA})
            except Exception:
-               self.logger.exception("Image {url} could not be requested")
+               self.logger.exception(f"Image {url} could not be requested")
                continue
 
            if r.status_code == 200:
                all_image_requests.append((url, r))
@@ -100,9 +126,19 @@ class RecipeDataService(BaseService):
            self.logger.exception("Fatal Image Request Exception")
            return None
 
-       if r.status_code == 200:
-           r.raw.decode_content = True
-           self.logger.info(f"File Name Suffix {file_path.suffix}")
-           self.write_image(r.raw, file_path.suffix)
+       if r.status_code != 200:
+           # TODO: Probably should throw an exception in this case as well, but before these changes
+           # we were returning None if it failed anyways.
+           return None
 
-           file_path.unlink(missing_ok=True)
+       content_type = r.headers.get("content-type", "")
+
+       if "image" not in content_type:
+           self.logger.error(f"Content-Type: {content_type} is not an image")
+           raise NotAnImageError(f"Content-Type {content_type} is not an image")
+
+       r.raw.decode_content = True
+       self.logger.info(f"File Name Suffix {file_path.suffix}")
+       self.write_image(r.raw, file_path.suffix)
+
+       file_path.unlink(missing_ok=True)
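Since Content-Type is supplied by the remote server, the header check is a cheap filter rather than proof of an image. If stronger validation were wanted, the downloaded bytes could be verified with Pillow — an assumption here, not something this diff does:

```python
# Hedged sketch: verify that downloaded bytes actually parse as an image.
# Assumes Pillow is available; not part of the diff above.
import io

from PIL import Image, UnidentifiedImageError


def looks_like_image(data: bytes) -> bool:
    try:
        with Image.open(io.BytesIO(data)) as im:
            im.verify()  # cheap structural check; does not decode full pixel data
        return True
    except (UnidentifiedImageError, OSError):
        return False
```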


@@ -1,10 +1,11 @@
+import time
 from abc import ABC, abstractmethod
 from typing import Any, Callable
 
 import extruct
 import requests
 from fastapi import HTTPException, status
-from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, WebsiteNotImplementedError, scrape_me
+from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, scrape_html
 from slugify import slugify
 from w3lib.html import get_base_url
@@ -14,6 +15,59 @@ from mealie.services.scraper.scraped_extras import ScrapedExtras
 from . import cleaner
 
+SCRAPER_TIMEOUT = 15
+
+
+class ForceTimeoutException(Exception):
+    pass
+
+
+def safe_scrape_html(url: str) -> str:
+    """
+    Scrapes the html from a url but will cancel the request
+    if the request takes longer than 15 seconds. This is used to mitigate
+    DoS attacks from users providing a url with arbitrarily large content.
+    """
+    resp = requests.get(url, timeout=SCRAPER_TIMEOUT, stream=True)
+
+    html_bytes = b""
+
+    start_time = time.time()
+
+    for chunk in resp.iter_content(chunk_size=1024):
+        html_bytes += chunk
+
+        if time.time() - start_time > SCRAPER_TIMEOUT:
+            raise ForceTimeoutException()
+
+    # =====================================
+    # Copied from the requests text property
+
+    # Try charset from content-type
+    content = None
+    encoding = resp.encoding
+
+    if not html_bytes:
+        return ""
+
+    # Fallback to auto-detected encoding.
+    if encoding is None:
+        encoding = resp.apparent_encoding
+
+    # Decode unicode from given encoding.
+    try:
+        content = str(html_bytes, encoding, errors="replace")
+    except (LookupError, TypeError):
+        # A LookupError is raised if the encoding was not found which could
+        # indicate a misspelling or similar mistake.
+        #
+        # A TypeError can be raised if encoding is None
+        #
+        # So we try blindly decoding.
+        content = str(html_bytes, errors="replace")
+
+    return content
+
+
 class ABCScraperStrategy(ABC):
     """
@@ -103,14 +157,13 @@ class RecipeScraperPackage(ABCScraperStrategy):
         return recipe, extras
 
     def scrape_url(self) -> SchemaScraperFactory.SchemaScraper | Any | None:
+        recipe_html = safe_scrape_html(self.url)
+
         try:
-            scraped_schema = scrape_me(self.url)
-        except (WebsiteNotImplementedError, AttributeError):
-            try:
-                scraped_schema = scrape_me(self.url, wild_mode=True)
-            except (NoSchemaFoundInWildMode, AttributeError):
-                self.logger.error("Recipe Scraper was unable to extract a recipe.")
-                return None
+            scraped_schema = scrape_html(recipe_html, org_url=self.url)
+        except (NoSchemaFoundInWildMode, AttributeError):
+            self.logger.error("Recipe Scraper was unable to extract a recipe.")
+            return None
         except ConnectionError as e:
             raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": "CONNECTION_ERROR"}) from e
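With the library no longer fetching the page itself, every scrape goes through the guarded downloader exactly once and recipe_scrapers only parses local HTML. Assuming a recipe_scrapers release that exposes scrape_html, the flow looks like this:

```python
# Usage sketch of the new flow; assumes a recipe_scrapers version with scrape_html.
from recipe_scrapers import scrape_html


def fetch_recipe_name(url: str) -> str:
    html = safe_scrape_html(url)  # single, time-bounded network fetch
    scraper = scrape_html(html, org_url=url)  # offline parse of the fetched HTML
    return scraper.title()
```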
@@ -150,7 +203,7 @@ class RecipeScraperOpenGraph(ABCScraperStrategy):
     """
 
     def get_html(self) -> str:
-        return requests.get(self.url).text
+        return safe_scrape_html(self.url)
 
     def get_recipe_fields(self, html) -> dict | None:
         """