"""
Helper script to download raw recipe data from a URL and dump it to disk.

The resulting files can be used as test input data.
"""
import sys, json, pprint

import requests
import extruct
from scrape_schema_recipe import scrape_url
from w3lib.html import get_base_url


# For every URL given on the command line, try to scrape its schema.org
# Recipe data and dump it as JSON into the current directory.
for url in sys.argv[1:]:
    try:
        # scrape_url returns a list of recipe dicts; keep the first match.
        data = scrape_url(url)[0]
        # Derive a filename from the last non-empty path segment of the URL.
        slug = list(filter(None, url.split("/")))[-1]
        filename = f"{slug}.json"
        with open(filename, "w") as f:
            # default=str stringifies values json can't serialize natively
            # (e.g. dates/durations produced by the scraper).
            json.dump(data, f, indent=4, default=str)
        # Fixed: report the actual output file instead of the placeholder
        # f-string "Saved (unknown)" which had no interpolation at all.
        print(f"Saved {filename}")
    except Exception as e:
        # Best-effort fallback: the structured scrape failed, so fetch the
        # page ourselves and pretty-print whatever metadata extruct finds,
        # for manual inspection.
        print(f"Error for {url}: {e}")
        print("Trying extruct instead")
        pp = pprint.PrettyPrinter(indent=2)
        # NOTE(review): no timeout on this request — acceptable for a manual
        # helper script, but it can hang indefinitely on a dead host.
        r = requests.get(url)
        base_url = get_base_url(r.text, r.url)
        data = extruct.extract(r.text, base_url=base_url)
        pp.pprint(data)
|