security: multiple reported CVE fixes (#1515)

* update out-of-date license

* update typing / refactor

* fix arbitrary path injection

* use markdown sanitizer to prevent XSS (CWE-79); see the sketch after this list

* fix CWE-918 SSRF by validating URL and MIME type

* add security docs

* update recipe-scrapers

* resolve DoS from arbitrary URL

* update changelog

* bump version

* add ref to #1506

* add #1511 to changelog

* use requests decoder

* actually fix encoding issue
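
The markdown sanitizer change referenced above is not part of the hunks shown below. As a rough sketch of the CWE-79 mitigation, server-side sanitization might look like the following; it assumes the `markdown` and `bleach` packages, and `render_recipe_notes` is a hypothetical helper, not code from this commit:

import bleach
import markdown

ALLOWED_TAGS = ["p", "ul", "ol", "li", "strong", "em", "a", "code", "pre"]
ALLOWED_ATTRIBUTES = {"a": ["href", "title"]}

def render_recipe_notes(raw_markdown: str) -> str:
    # Hypothetical helper: render user-supplied markdown, then sanitize the
    # resulting HTML so embedded <script> tags or event handlers cannot run.
    html = markdown.markdown(raw_markdown)
    return bleach.clean(html, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, strip=True)
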
Author: Hayden
Date: 2022-07-31 13:10:20 -08:00
Committed by: GitHub
parent 483f789b8e
commit 13850cda1f
23 changed files with 401 additions and 118 deletions

View File

@@ -1,6 +1,6 @@
 from pathlib import Path

-APP_VERSION = "v1.0.0beta-3"
+APP_VERSION = "v1.0.0beta-4"

 CWD = Path(__file__).parent
 BASE_DIR = CWD.parent.parent.parent

View File

@@ -29,13 +29,13 @@ from mealie.schema.response.responses import ErrorResponse
 from mealie.services import urls
 from mealie.services.event_bus_service.event_bus_service import EventBusService, EventSource
 from mealie.services.event_bus_service.message_types import EventTypes
-from mealie.services.recipe.recipe_data_service import RecipeDataService
+from mealie.services.recipe.recipe_data_service import InvalidDomainError, NotAnImageError, RecipeDataService
 from mealie.services.recipe.recipe_service import RecipeService
 from mealie.services.recipe.template_service import TemplateService
 from mealie.services.scraper.recipe_bulk_scraper import RecipeBulkScraperService
 from mealie.services.scraper.scraped_extras import ScraperContext
 from mealie.services.scraper.scraper import create_from_url
-from mealie.services.scraper.scraper_strategies import RecipeScraperPackage
+from mealie.services.scraper.scraper_strategies import ForceTimeoutException, RecipeScraperPackage

 class BaseRecipeController(BaseUserController):
@@ -139,7 +139,12 @@ class RecipeController(BaseRecipeController):
     @router.post("/create-url", status_code=201, response_model=str)
     def parse_recipe_url(self, req: ScrapeRecipe):
         """Takes in a URL and attempts to scrape data and load it into the database"""
-        recipe, extras = create_from_url(req.url)
+        try:
+            recipe, extras = create_from_url(req.url)
+        except ForceTimeoutException as e:
+            raise HTTPException(
+                status_code=408, detail=ErrorResponse.respond(message="Recipe Scraping Timed Out")
+            ) from e

         if req.include_tags:
             ctx = ScraperContext(self.user.id, self.group_id, self.repos)
@@ -176,8 +181,13 @@ class RecipeController(BaseRecipeController):
     @router.post("/test-scrape-url")
     def test_parse_recipe_url(self, url: ScrapeRecipeTest):
         # Debugger should produce the same result as the scraper sees before cleaning
-        if scraped_data := RecipeScraperPackage(url.url).scrape_url():
-            return scraped_data.schema.data
+        try:
+            if scraped_data := RecipeScraperPackage(url.url).scrape_url():
+                return scraped_data.schema.data
+        except ForceTimeoutException as e:
+            raise HTTPException(
+                status_code=408, detail=ErrorResponse.respond(message="Recipe Scraping Timed Out")
+            ) from e

         return "recipe_scrapers was unable to scrape this URL"
@@ -314,7 +324,19 @@ class RecipeController(BaseRecipeController):
     def scrape_image_url(self, slug: str, url: ScrapeRecipe):
         recipe = self.mixins.get_one(slug)
         data_service = RecipeDataService(recipe.id)
-        data_service.scrape_image(url.url)
+        try:
+            data_service.scrape_image(url.url)
+        except NotAnImageError as e:
+            raise HTTPException(
+                status_code=400,
+                detail=ErrorResponse.respond("URL is not an image"),
+            ) from e
+        except InvalidDomainError as e:
+            raise HTTPException(
+                status_code=400,
+                detail=ErrorResponse.respond("URL is not from an allowed domain"),
+            ) from e

         recipe.image = cache.cache_key.new_key()
         self.service.update_one(recipe.slug, recipe)
@@ -338,13 +360,27 @@ class RecipeController(BaseRecipeController):
         file: UploadFile = File(...),
     ):
         """Upload a file to store as a recipe asset"""
-        file_name = f"{slugify(name)}.{extension}"
+        if "." in extension:
+            extension = extension.split(".")[-1]
+
+        file_slug = slugify(name)
+        if not extension or not file_slug:
+            raise HTTPException(status_code=400, detail="Missing required fields")
+
+        file_name = f"{file_slug}.{extension}"
         asset_in = RecipeAsset(name=name, icon=icon, file_name=file_name)
         recipe = self.mixins.get_one(slug)
         dest = recipe.asset_dir / file_name
+
+        # Ensure the destination path resolves inside the recipe's asset directory
+        if dest.absolute().parent != recipe.asset_dir:
+            raise HTTPException(
+                status_code=400,
+                detail=f"File name {file_name} or extension {extension} not valid",
+            )

         with dest.open("wb") as buffer:
             copyfileobj(file.file, buffer)
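
To see how the parent-directory guard above stops traversal, here is a small worked example; the paths and inputs are made up for illustration and are not part of the commit:

from pathlib import Path

from slugify import slugify

asset_dir = Path("/app/data/recipes/pasta/assets")  # hypothetical asset directory

name = "notes"
extension = "../../../etc/evil"       # attacker-controlled form field
extension = extension.split(".")[-1]  # -> "/etc/evil"

file_name = f"{slugify(name)}.{extension}"  # -> "notes./etc/evil"
dest = asset_dir / file_name

# The slash smuggled in through the extension moves dest out of the asset
# directory, so the parent comparison fails and the request is rejected.
assert dest.absolute().parent != asset_dir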

View File

@@ -11,6 +11,14 @@ from mealie.services._base_service import BaseService
 _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"

+class NotAnImageError(Exception):
+    pass
+
+
+class InvalidDomainError(Exception):
+    pass

 class RecipeDataService(BaseService):
     minifier: img.ABCMinifier
@@ -56,8 +64,26 @@ class RecipeDataService(BaseService):
         return image_path

+    @staticmethod
+    def _validate_image_url(url: str) -> bool:
+        # sourcery skip: invert-any-all, use-any
+        """
+        Validates that the URL comes from an allowed source and blocks certain
+        hosts to prevent malicious images from being downloaded.
+        """
+        invalid_domains = {"127.0.0.1", "localhost"}
+        for domain in invalid_domains:
+            if domain in url:
+                return False
+        return True
+
     def scrape_image(self, image_url) -> None:
-        self.logger.info(f"Image URL: {image_url}")
+        self.logger.debug(f"Image URL: {image_url}")
+
+        if not self._validate_image_url(image_url):
+            self.logger.error(f"Invalid image URL: {image_url}")
+            raise InvalidDomainError(f"Invalid domain: {image_url}")

         if isinstance(image_url, str):  # Handles String Types
             pass
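
The substring check in `_validate_image_url` is deliberately broad: it also catches forms like `http://127.0.0.1:8080/img.png`, though it will reject harmless URLs that merely contain the string, such as `https://example.com/localhost.png`. A stricter variant (an assumption for illustration, not part of this commit) would compare the parsed hostname instead:

from urllib.parse import urlparse

def validate_image_url_strict(url: str) -> bool:
    # Hypothetical stricter variant: compare the parsed hostname exactly
    # rather than substring-matching the whole URL.
    blocked_hosts = {"127.0.0.1", "localhost", "::1", "0.0.0.0"}
    hostname = urlparse(url).hostname or ""
    return hostname not in blocked_hosts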
@@ -74,7 +100,7 @@ class RecipeDataService(BaseService):
             try:
                 r = requests.get(url, stream=True, headers={"User-Agent": _FIREFOX_UA})
             except Exception:
-                self.logger.exception("Image {url} could not be requested")
+                self.logger.exception(f"Image {url} could not be requested")
                 continue

             if r.status_code == 200:
                 all_image_requests.append((url, r))
@@ -100,9 +126,19 @@ class RecipeDataService(BaseService):
             self.logger.exception("Fatal Image Request Exception")
             return None

-        if r.status_code == 200:
-            r.raw.decode_content = True
-            self.logger.info(f"File Name Suffix {file_path.suffix}")
-            self.write_image(r.raw, file_path.suffix)
-
-            file_path.unlink(missing_ok=True)
+        if r.status_code != 200:
+            # TODO: Probably should throw an exception in this case as well, but before
+            # these changes we were returning None on failure anyway.
+            return None
+
+        content_type = r.headers.get("content-type", "")
+
+        if "image" not in content_type:
+            self.logger.error(f"Content-Type: {content_type} is not an image")
+            raise NotAnImageError(f"Content-Type {content_type} is not an image")
+
+        r.raw.decode_content = True
+        self.logger.info(f"File Name Suffix {file_path.suffix}")
+        self.write_image(r.raw, file_path.suffix)
+
+        file_path.unlink(missing_ok=True)
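
One caveat on the Content-Type check above: the header is set by the remote server and can lie. A belt-and-braces variant (a sketch under that assumption, not code from this commit) would also sniff well-known image signatures from the payload itself:

def looks_like_image(first_bytes: bytes) -> bool:
    # Hypothetical extra check: sniff magic numbers instead of trusting
    # the server-supplied Content-Type header alone.
    signatures = (
        b"\xff\xd8\xff",        # JPEG
        b"\x89PNG\r\n\x1a\n",   # PNG
        b"GIF87a",              # GIF (1987 variant)
        b"GIF89a",              # GIF (1989 variant)
        b"RIFF",                # WEBP lives inside a RIFF container
    )
    return first_bytes.startswith(signatures)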

View File

@@ -1,10 +1,11 @@
+import time
 from abc import ABC, abstractmethod
 from typing import Any, Callable

 import extruct
 import requests
 from fastapi import HTTPException, status
-from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, WebsiteNotImplementedError, scrape_me
+from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, scrape_html
 from slugify import slugify
 from w3lib.html import get_base_url
@@ -14,6 +15,59 @@ from mealie.services.scraper.scraped_extras import ScrapedExtras
 from . import cleaner

+SCRAPER_TIMEOUT = 15
+
+
+class ForceTimeoutException(Exception):
+    pass
+
+
+def safe_scrape_html(url: str) -> str:
+    """
+    Fetches the HTML from a URL, but cancels the request if it takes longer
+    than SCRAPER_TIMEOUT seconds. This mitigates denial-of-service attacks
+    where a user supplies a URL whose response body is arbitrarily large.
+    """
+    resp = requests.get(url, timeout=SCRAPER_TIMEOUT, stream=True)
+
+    html_bytes = b""
+
+    start_time = time.time()
+
+    for chunk in resp.iter_content(chunk_size=1024):
+        html_bytes += chunk
+
+        if time.time() - start_time > SCRAPER_TIMEOUT:
+            raise ForceTimeoutException()
+
+    # =====================================
+    # Copied from requests' text property
+
+    # Try charset from content-type
+    content = None
+    encoding = resp.encoding
+
+    if not html_bytes:
+        return ""
+
+    # Fall back to auto-detected encoding.
+    if encoding is None:
+        encoding = resp.apparent_encoding
+
+    # Decode unicode from the given encoding.
+    try:
+        content = str(html_bytes, encoding, errors="replace")
+    except (LookupError, TypeError):
+        # A LookupError is raised if the encoding was not found, which could
+        # indicate a misspelling or similar mistake.
+        #
+        # A TypeError can be raised if encoding is None.
+        #
+        # So we try decoding blindly.
+        content = str(html_bytes, errors="replace")
+
+    return content

 class ABCScraperStrategy(ABC):
     """
@@ -103,14 +157,13 @@ class RecipeScraperPackage(ABCScraperStrategy):
         return recipe, extras

     def scrape_url(self) -> SchemaScraperFactory.SchemaScraper | Any | None:
+        recipe_html = safe_scrape_html(self.url)
+
         try:
-            scraped_schema = scrape_me(self.url)
-        except (WebsiteNotImplementedError, AttributeError):
-            try:
-                scraped_schema = scrape_me(self.url, wild_mode=True)
-            except (NoSchemaFoundInWildMode, AttributeError):
-                self.logger.error("Recipe Scraper was unable to extract a recipe.")
-                return None
+            scraped_schema = scrape_html(recipe_html, org_url=self.url)
+        except (NoSchemaFoundInWildMode, AttributeError):
+            self.logger.error("Recipe Scraper was unable to extract a recipe.")
+            return None
         except ConnectionError as e:
             raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": "CONNECTION_ERROR"}) from e
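
A note on why `safe_scrape_html` needs the manual wall-clock check: the `timeout` argument to `requests.get` bounds connection setup and each individual socket read, not the total download, so a server that trickles bytes indefinitely would never trip it. Callers handle the forced timeout the same way the controllers above do; a hypothetical usage sketch, not code from this commit:

from mealie.services.scraper.scraper_strategies import ForceTimeoutException, safe_scrape_html

def fetch_or_empty(url: str) -> str:
    # Hypothetical helper: degrade to an empty document instead of letting
    # an attacker-controlled URL hold the worker indefinitely.
    try:
        return safe_scrape_html(url)
    except ForceTimeoutException:
        return ""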
@@ -150,7 +203,7 @@ class RecipeScraperOpenGraph(ABCScraperStrategy):
     """

     def get_html(self) -> str:
-        return requests.get(self.url).text
+        return safe_scrape_html(self.url)

     def get_recipe_fields(self, html) -> dict | None:
         """