Convert scraper to use async (#1915)

* add httpx dependency for async HTTP requests

* rework scraper strategies to download recipe html asynchronously

* rework recipe_data_service to download recipe images asynchronously

* fix recipe_parser test so it can use async results

* fix bulk import so that it also works with the async scraper (see the caller sketch after this list)

* fix broken recipe_parser tests

* Fix issues found by scanners

* Add additional checks for ingredient and instruction count in test_create_by_url

* Revert changes in the recipe_data test
Since we are now checking ingredients and instructions in test_create_by_url, these would fail with the stored HTML of the recipe data

* Add explicit type annotation in recipe_data_service.largest_content_len

* Fix typo in annotation
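
For orientation before the file diff: the bullets above mean that RecipeDataService.scrape_image and the scraper strategies are now coroutines, so every caller has to await them; async FastAPI routes can do that directly, while sync entry points such as a bulk importer need to drive them on an event loop. A minimal caller sketch, with illustrative constructor arguments and helper names rather than the project's actual wiring:

import asyncio

# import path as used in mealie; adjust if the module lives elsewhere
from mealie.services.recipe.recipe_data_service import RecipeDataService


async def scrape_one(recipe_id, image_url) -> None:
    # async callers (e.g. an async route handler) can await the coroutine directly
    service = RecipeDataService(recipe_id)  # constructor arguments abbreviated for illustration
    await service.scrape_image(image_url)


def scrape_many_from_sync(entries) -> None:
    # hypothetical sync entry point: drive all downloads on a single event loop
    async def _run() -> None:
        await asyncio.gather(*(scrape_one(recipe_id, url) for recipe_id, url in entries))

    asyncio.run(_run())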
Sören authored 2023-01-29 01:43:27 +01:00, committed by GitHub
parent 7275dd2696, commit 3415a9c310
11 changed files with 129 additions and 115 deletions

recipe_data_service.py

@@ -1,9 +1,8 @@
 import asyncio
 import shutil
-from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path

-import requests
+from httpx import AsyncClient, Response
 from pydantic import UUID4

 from mealie.pkgs import img
@@ -13,29 +12,32 @@ from mealie.services._base_service import BaseService
 _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"


+async def gather_with_concurrency(n, *coros):
+    semaphore = asyncio.Semaphore(n)
+
+    async def sem_coro(coro):
+        async with semaphore:
+            return await coro
+
+    return await asyncio.gather(*(sem_coro(c) for c in coros))
+
+
 async def largest_content_len(urls: list[str]) -> tuple[str, int]:
     largest_url = ""
     largest_len = 0

-    def do(session: requests.Session, url: str):
-        def _do() -> requests.Response:
-            return session.head(url, headers={"User-Agent": _FIREFOX_UA})
-
-        return _do
-
-    with ThreadPoolExecutor(max_workers=10) as executor:
-        with requests.Session() as session:
-            loop = asyncio.get_event_loop()
-            tasks = [loop.run_in_executor(executor, do(session, url)) for url in urls]
-
-            response: requests.Response  # required for type hinting within the loop
-            for response in await asyncio.gather(*tasks):
-                len_int = int(response.headers.get("Content-Length", 0))
-                if len_int > largest_len:
-                    largest_url = response.url
-                    largest_len = len_int
+    async def do(client: AsyncClient, url: str) -> Response:
+        return await client.head(url, headers={"User-Agent": _FIREFOX_UA})
+
+    async with AsyncClient() as client:
+        tasks = [do(client, url) for url in urls]
+        responses: list[Response] = await gather_with_concurrency(10, *tasks)
+        for response in responses:
+            len_int = int(response.headers.get("Content-Length", 0))
+            if len_int > largest_len:
+                largest_url = str(response.url)
+                largest_len = len_int

     return largest_url, largest_len
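
The gather_with_concurrency helper added above is a semaphore-bounded wrapper around asyncio.gather: at most n of the supplied coroutines are awaited at any one time. A small standalone usage sketch, with placeholder URLs and a placeholder cap of 10 (the import path for the helper is assumed):

import asyncio

from httpx import AsyncClient

from mealie.services.recipe.recipe_data_service import gather_with_concurrency  # path assumed


async def head_all(urls: list[str]):
    # issue HEAD requests for every URL, but keep at most 10 in flight at once
    async with AsyncClient() as client:
        coros = [client.head(url) for url in urls]
        return await gather_with_concurrency(10, *coros)


# responses = asyncio.run(head_all(["https://example.com/a.jpg", "https://example.com/b.jpg"]))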
@@ -107,8 +109,8 @@ class RecipeDataService(BaseService):

         return True

-    def scrape_image(self, image_url) -> None:
-        self.logger.debug(f"Image URL: {image_url}")
+    async def scrape_image(self, image_url) -> None:
+        self.logger.info(f"Image URL: {image_url}")

         if not self._validate_image_url(image_url):
             self.logger.error(f"Invalid image URL: {image_url}")
@@ -121,15 +123,7 @@ class RecipeDataService(BaseService):
             # Multiple images have been defined in the schema - usually different resolutions
             # Typically would be in smallest->biggest order, but can't be certain so test each.
             # 'Google will pick the best image to display in Search results based on the aspect ratio and resolution.'
-
-            # TODO: We should refactor the scraper to use a async session provided by FastAPI using a sync
-            # route instead of bootstrapping async behavior this far down the chain. Will require some work
-            # so leaving this improvement here for now.
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
-            future = asyncio.ensure_future(largest_content_len(image_url))
-            loop.run_until_complete(future)
-            image_url, _ = future.result()
+            image_url, _ = await largest_content_len(image_url)

         elif isinstance(image_url, dict):  # Handles Dictionary Types
             for key in image_url:
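
Design note on this hunk: run_until_complete on a freshly created loop refuses to run while another event loop is already active in the same thread (asyncio raises a RuntimeError), so the old bootstrapping only worked as long as scrape_image ran synchronously outside any running loop. Now that scrape_image is itself a coroutine, largest_content_len can simply be awaited on the caller's loop, which also lets its HEAD requests overlap with other async work.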
@@ -144,25 +138,25 @@ class RecipeDataService(BaseService):
         file_name = f"{str(self.recipe_id)}.{ext}"
         file_path = Recipe.directory_from_id(self.recipe_id).joinpath("images", file_name)

-        try:
-            r = requests.get(image_url, stream=True, headers={"User-Agent": _FIREFOX_UA})
-        except Exception:
-            self.logger.exception("Fatal Image Request Exception")
-            return None
+        async with AsyncClient() as client:
+            try:
+                r = await client.get(image_url, headers={"User-Agent": _FIREFOX_UA})
+            except Exception:
+                self.logger.exception("Fatal Image Request Exception")
+                return None

-        if r.status_code != 200:
-            # TODO: Probably should throw an exception in this case as well, but before these changes
-            # we were returning None if it failed anyways.
-            return None
+            if r.status_code != 200:
+                # TODO: Probably should throw an exception in this case as well, but before these changes
+                # we were returning None if it failed anyways.
+                return None

-        content_type = r.headers.get("content-type", "")
+            content_type = r.headers.get("content-type", "")

-        if "image" not in content_type:
-            self.logger.error(f"Content-Type: {content_type} is not an image")
-            raise NotAnImageError(f"Content-Type {content_type} is not an image")
+            if "image" not in content_type:
+                self.logger.error(f"Content-Type: {content_type} is not an image")
+                raise NotAnImageError(f"Content-Type {content_type} is not an image")

-        r.raw.decode_content = True
-        self.logger.info(f"File Name Suffix {file_path.suffix}")
-        self.write_image(r.raw, file_path.suffix)
+            self.logger.info(f"File Name Suffix {file_path.suffix}")
+            self.write_image(r.read(), file_path.suffix)

-        file_path.unlink(missing_ok=True)
+            file_path.unlink(missing_ok=True)
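
One behavioral change worth noting at the end of this hunk: the old requests path streamed the body (stream=True and r.raw), while the new path buffers the whole image in memory via r.read(). That is fine for typical recipe images; if very large files ever become a concern, httpx also supports chunked streaming. A minimal sketch under that assumption (the function name, destination argument, and user_agent parameter are illustrative, not part of the diff):

from pathlib import Path

from httpx import AsyncClient


async def download_streamed(url: str, dest: Path, user_agent: str) -> None:
    # stream the response body to disk in chunks instead of buffering it in memory
    async with AsyncClient() as client:
        async with client.stream("GET", url, headers={"User-Agent": user_agent}) as r:
            r.raise_for_status()
            with dest.open("wb") as f:
                async for chunk in r.aiter_bytes():
                    f.write(chunk)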