Convert scraper to use async (#1915)

* add httpx dependency for async http requests

* rework scraper strategies to download recipe html asynchronously

* rework recipe_data_service to download recipe images asynchronously

* fix recipe_parser test, so it can use async results

* fix bulk import so that it also works with async scraper

* fix broken recipe_parser tests

* Fix issues found by scanners

* Add additional checks for ingredient and instruction count in test_create_by_url

* Revert changes in test recipe_data

  Since we are now checking ingredients and instructions in test_create_by_url, these would fail with the stored HTML of the recipe data.

* Add explicit type annotation in recipe_data_service.largest_content_len

* Fix typo in annotation
This commit is contained in:
Sören
2023-01-29 01:43:27 +01:00
committed by GitHub
parent 7275dd2696
commit 3415a9c310
11 changed files with 129 additions and 115 deletions

View File

@@ -162,10 +162,10 @@ class RecipeController(BaseRecipeController):
# URL Scraping Operations
@router.post("/create-url", status_code=201, response_model=str)
def parse_recipe_url(self, req: ScrapeRecipe):
async def parse_recipe_url(self, req: ScrapeRecipe):
"""Takes in a URL and attempts to scrape data and load it into the database"""
try:
recipe, extras = create_from_url(req.url)
recipe, extras = await create_from_url(req.url)
except ForceTimeoutException as e:
raise HTTPException(
status_code=408, detail=ErrorResponse.respond(message="Recipe Scraping Timed Out")
@@ -206,10 +206,10 @@ class RecipeController(BaseRecipeController):
return {"reportId": report_id}
@router.post("/test-scrape-url")
def test_parse_recipe_url(self, url: ScrapeRecipeTest):
async def test_parse_recipe_url(self, url: ScrapeRecipeTest):
# Debugger should produce the same result as the scraper sees before cleaning
try:
if scraped_data := RecipeScraperPackage(url.url).scrape_url():
if scraped_data := await RecipeScraperPackage(url.url).scrape_url():
return scraped_data.schema.data
except ForceTimeoutException as e:
raise HTTPException(
@@ -381,12 +381,12 @@ class RecipeController(BaseRecipeController):
# Image and Assets
@router.post("/{slug}/image", tags=["Recipe: Images and Assets"])
def scrape_image_url(self, slug: str, url: ScrapeRecipe):
async def scrape_image_url(self, slug: str, url: ScrapeRecipe):
recipe = self.mixins.get_one(slug)
data_service = RecipeDataService(recipe.id)
try:
data_service.scrape_image(url.url)
await data_service.scrape_image(url.url)
except NotAnImageError as e:
raise HTTPException(
status_code=400,