pax_global_header00006660000000000000000000000064141027357170014521gustar00rootroot0000000000000052 comment=df456e9b1ff3e9d180706f928a0fe8893d57ab4b scrape-schema-recipe-0.2.0/000077500000000000000000000000001410273571700155005ustar00rootroot00000000000000scrape-schema-recipe-0.2.0/.gitattributes000066400000000000000000000000631410273571700203720ustar00rootroot00000000000000scrape_schema_recipe/test_data/* linguist-vendored scrape-schema-recipe-0.2.0/.github/000077500000000000000000000000001410273571700170405ustar00rootroot00000000000000scrape-schema-recipe-0.2.0/.github/workflows/000077500000000000000000000000001410273571700210755ustar00rootroot00000000000000scrape-schema-recipe-0.2.0/.github/workflows/python-package.yml000066400000000000000000000027331410273571700245370ustar00rootroot00000000000000# This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions name: Python package on: push: branches: [ master ] pull_request: branches: [ master ] jobs: build: runs-on: ubuntu-latest strategy: fail-fast: false matrix: python-version: [3.6, 3.7, 3.8, 3.9] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install flake8 mypy pytest types-requests if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide # TODO: Have a couple issues that need to be resolved before this could be enabled. # flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Check with mypy run: | mypy scrape_schema_recipe/ - name: Test with pytest run: | pytest scrape-schema-recipe-0.2.0/.gitignore000066400000000000000000000001531410273571700174670ustar00rootroot00000000000000.coverage .mypy_cache/ build/ dist/ scrape_schema_recipe.egg-info/ scraple_schema_recipe/__pycache__/ *.pycscrape-schema-recipe-0.2.0/.travis.yml000066400000000000000000000007721410273571700176170ustar00rootroot00000000000000language: python env: RUN_MYPY=true matrix: include: - python: 3.6 - python: 3.7 - python: 3.8 - python: 3.9 - python: "pypy3" env: RUN_MYPY=false install: - pip install -r requirements-dev.txt - pip install . script: # test the static typing - if [ "$RUN_MYPY" = "true" ]; then mypy scrape_schema_recipe/scrape.py; fi - python test_scrape.py -v # runs test and coverage # - python3 -m nose2 # nose2 isn't working for some reason scrape-schema-recipe-0.2.0/ATTRIBUTION.md000066400000000000000000000073441410273571700176360ustar00rootroot00000000000000# Test Data Attribution/Licenses Attribution for the test data (in the `scrape_schema_recipe/test_data/` folder) included in this python package. These are input for testing the software package. I've tried to use recipes under Creative Commons Licenses that do not restrict derivation because a unit test case could be considered a derivation. I've sparingly using recipes that are licensed non-commercial. I think unit testing could constitute as a use that would not conflict with that. These recipes and their HTML remain are licensed under their respective license, NOT the Apache 2.0 license that the software is licensed (except where noted). 
## Examples ### Irish Coffee Recipe * File: bevvy-irish-coffee.html * Author: Bevvy * Recipe URL: https://bevvy.co/cocktail/irish-coffee/smq * License: [Creative Commons Attribution-ShareAlike 3.0 License](https://creativecommons.org/licenses/by-sa/3.0/) * Website: [Bevvy](https://bevvy.co/) * Format: LD-JSON ### British Treacle Tart * File: foodista-british-treacle-tart.html * Creator: [Leah Rodrigues](http://foodista.com/profile/XYPGHGF4/leah-rodrigues) * Recipe URL: http://foodista.com/recipe/3LH62BBP/british-treacle-tart * License: [Creative Commons Attribution License](http://creativecommons.org/licenses/by/3.0/) * Website: [Foodista](http://foodista.com/) * Format: MicroData (note: this uses an older version of http://schema.org/Recipe) ### Rum & Tonka Bean Dark Chocolate Truffles * File: sweetestkitchen-truffles.html * Author: [Jamieanne](http://www.sweetestkitchen.com/about/) * Recipe URL: http://www.sweetestkitchen.com/2014/11/rum-tonka-bean-dark-chocolate-truffles/ * License: [Creative Commons Attribution-NonCommercial 3.0 Unported License](http://creativecommons.org/licenses/by-nc/3.0/) * Website: [Sweetest Kitchen](http://www.sweetestkitchen.com/) * Format: Microdata (note: this uses an older version of http://schema.org/Recipe) ### Meyer Lemon Poppyseed Tea Cakes * File: crumb-meyer-lemon-poppyseed-tea-cakes.html * Author: [Isabell Boucher](http://www.crumbblog.com/about-isabelle/) * Recipe URL: http://www.crumbblog.com/meyer-lemon-poppyseed-tea-cakes/ * License: [Creative Commons Attribution-NonCommercial-ShareAlike 2.5 Canada License](http://creativecommons.org/licenses/by-nc-sa/2.5/ca/) from http://www.crumbblog.com/about-the-recipes/ * Website: [Crumb](http://www.crumbblog.com/) * Format: LD-JSON ### Google Example Party Cake * File: google-recipe-example.html * Recipe URL: https://developers.google.com/search/docs/data-types/recipe * License: [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) * Format: LD-JSON ### Chicken Taco Salad with 
Chili Lime Chicken * File: mm-skinny-chicken-taco-salad.html * Author: [Lauren](https://www.midgetmomma.com/about/about-midgetmomma/) * Recipe URL: https://www.midgetmomma.com/skinny-chicken-taco-salad/ * License: [Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License](https://creativecommons.org/licenses/by-nc-sa/3.0/) * Website: [Midget Momma](https://www.midgetmomma.com/) * Format: LD-JSON ### Flavorful Green Beans * File: flavorful-green-beans.html * Author: National Heart, Lung, and Blood Institute * Recipe URL: https://medlineplus.gov/recipes/flavorful-green-beans/ * License: [Public Domain](https://medlineplus.gov/about/using/usingcontent/) * Website: [MedlinePlus](https://medlineplus.gov/) ## Unit Tests ### Simple Moscow Mule * File: allrecipes-moscow-mule.html * Author: [Lorem Ipsum](https://www.allrecipes.com/cook/loremipsum/) * Recipe URL: https://www.allrecipes.com/recipe/237874/simple-moscow-mule/ * License: Proprietary * Website: [All Recipes](https://www.allrecipes.com/) * Format: LD-JSON scrape-schema-recipe-0.2.0/LICENSE000066400000000000000000000261271410273571700165150ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright 2019-2021 Micah Cochran Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. scrape-schema-recipe-0.2.0/README.md000066400000000000000000000204451410273571700167640ustar00rootroot00000000000000# scrape-schema-recipe [![PyPI](https://img.shields.io/pypi/v/scrape-schema-recipe)](https://pypi.org/project/scrape-schema-recipe/) ![Build Status](https://github.com/micahcochran/scrape-schema-recipe/actions/workflows/python-package.yml/badge.svg) [![Downloads](https://pepy.tech/badge/scrape-schema-recipe)](https://pepy.tech/project/scrape-schema-recipe) Scrapes recipes from HTML https://schema.org/Recipe (Microdata/JSON-LD) into Python dictionaries. ## Install ``` pip install scrape-schema-recipe ``` ## Requirements Python version 3.6+ This library relies heavily upon [extruct](https://github.com/scrapinghub/extruct). 
Other requirements: * isodate (>=0.5.1) * requests ## Online Example ```python >>> import scrape_schema_recipe >>> url = 'https://www.foodnetwork.com/recipes/alton-brown/honey-mustard-dressing-recipe-1939031' >>> recipe_list = scrape_schema_recipe.scrape_url(url, python_objects=True) >>> len(recipe_list) 1 >>> recipe = recipe_list[0] # Name of the recipe >>> recipe['name'] 'Honey Mustard Dressing' # List of the Ingredients >>> recipe['recipeIngredient'] ['5 tablespoons medium body honey (sourwood is nice)', '3 tablespoons smooth Dijon mustard', '2 tablespoons rice wine vinegar'] # List of the Instructions >>> recipe['recipeInstructions'] ['Combine all ingredients in a bowl and whisk until smooth. Serve as a dressing or a dip.'] # Author >>> recipe['author'] [{'@type': 'Person', 'name': 'Alton Brown', 'url': 'https://www.foodnetwork.com/profiles/talent/alton-brown'}] ``` '@type': 'Person' is a [https://schema.org/Person](https://schema.org/Person) object ```python # Preparation Time >>> recipe['prepTime'] datetime.timedelta(0, 300) # The library pendulum can give you something a little easier to read. >>> import pendulum # for pendulum version 1.0 >>> pendulum.Interval.instanceof(recipe['prepTime']) # for version 2.0 of pendulum >>> pendulum.Duration(seconds=recipe['prepTime'].total_seconds()) ``` If `python_objects` is set to `False`, this would return the string ISO8611 representation of the duration, `'PT5M'` [pendulum's library website](https://pendulum.eustace.io/). 
```python # Publication date >>> recipe['datePublished'] datetime.datetime(2016, 11, 13, 21, 5, 50, 518000, tzinfo=) >>> str(recipe['datePublished']) '2016-11-13 21:05:50.518000-05:00' # Identifying this is http://schema.org/Recipe data (in LD-JSON format) >>> recipe['@context'], recipe['@type'] ('http://schema.org', 'Recipe') # Content's URL >>> recipe['url'] 'https://www.foodnetwork.com/recipes/alton-brown/honey-mustard-dressing-recipe-1939031' # all the keys in this dictionary >>> recipe.keys() dict_keys(['recipeYield', 'totalTime', 'dateModified', 'url', '@context', 'name', 'publisher', 'prepTime', 'datePublished', 'recipeIngredient', '@type', 'recipeInstructions', 'author', 'mainEntityOfPage', 'aggregateRating', 'recipeCategory', 'image', 'headline', 'review']) ``` ## Example from a File (alternative representations) Also works with locally saved [HTML example file](/test_data/google-recipe-example.html). ```python >>> filelocation = 'test_data/google-recipe-example.html' >>> recipe_list = scrape_schema_recipe.scrape(filelocation, python_objects=True) >>> recipe = recipe_list[0] >>> recipe['name'] 'Party Coffee Cake' >>> repcipe['datePublished'] datetime.date(2018, 3, 10) # Recipe Instructions using the HowToStep >>> recipe['recipeInstructions'] [{'@type': 'HowToStep', 'text': 'Preheat the oven to 350 degrees F. Grease and flour a 9x9 inch pan.'}, {'@type': 'HowToStep', 'text': 'In a large bowl, combine flour, sugar, baking powder, and salt.'}, {'@type': 'HowToStep', 'text': 'Mix in the butter, eggs, and milk.'}, {'@type': 'HowToStep', 'text': 'Spread into the prepared pan.'}, {'@type': 'HowToStep', 'text': 'Bake for 30 to 35 minutes, or until firm.'}, {'@type': 'HowToStep', 'text': 'Allow to cool.'}] ``` ## What Happens When Things Go Wrong If there aren't any http://schema.org/Recipe formatted recipes on the site. 
```python >>> url = 'https://www.google.com' >>> recipe_list = scrape_schema_recipe.scrape(url, python_objects=True) >>> len(recipe_list) 0 ``` Some websites will cause an `HTTPError`. You may get around a 403 - Forbidden Errror by putting in an alternative user-agent via the variable `user_agent_str`. ## Functions * `load()` - load HTML schema.org/Recipe structured data from a file or file-like object * `loads()` - loads HTML schema.org/Recipe structured data from a string * `scrape_url()` - scrape a URL for HTML schema.org/Recipe structured data * `scrape()` - load HTML schema.org/Recipe structured data from a file, file-like object, string, or URL ``` Parameters ---------- location : string or file-like object A url, filename, or text_string of HTML, or a file-like object. python_objects : bool, list, or tuple (optional) when True it translates certain data types into python objects dates into datetime.date, datetimes into datetime.datetimes, durations as dateime.timedelta. when set to a list or tuple only converts types specified to python objects: * when set to either [dateime.date] or [datetime.datetimes] either will convert dates. * when set to [datetime.timedelta] durations will be converted when False no conversion is performed (defaults to False) nonstandard_attrs : bool, optional when True it adds nonstandard (for schema.org/Recipe) attributes to the resulting dictionaries, that are outside the specification such as: '_format' is either 'json-ld' or 'microdata' (how schema.org/Recipe was encoded into HTML) '_source_url' is the source url, when 'url' has already been defined as another value (defaults to False) migrate_old_schema : bool, optional when True it migrates the schema from older version to current version (defaults to True) user_agent_str : string, optional ***only for scrape_url() and scrape()*** overide the user_agent_string with this value. 
(defaults to None) Returns ------- list a list of dictionaries in the style of schema.org/Recipe JSON-LD no results - an empty list will be returned ``` These are also available with `help()` in the python console. ## Example function The `example_output()` function gives quick access to data for prototyping and debugging. It accepts the same parameters as load(), but the first parameter, `name`, is different. ```python >>> from scrape_schema_recipe import example_names, example_output >>> example_names ('irish-coffee', 'google', 'taco-salad', 'tart', 'tea-cake', 'truffles') >>> recipes = example_output('truffles') >>> recipes[0]['name'] 'Rum & Tonka Bean Dark Chocolate Truffles' ``` ## Files License: Apache 2.0 see [LICENSE](LICENSE) Test data attribution and licensing: [ATTRIBUTION.md](ATTRIBUTION.md) ## Development The unit testing must be run from a copy of the repository folder. Unit testing can be run by: ``` schema-recipe-scraper$ python3 test_scrape.py ``` mypy is used for static type checking from the project directory: ``` schema-recipe-scraper$ mypy schema_recipe_scraper/scrape.py ``` If you run mypy from another directory the `--ignore-missing-imports` flag will need to be added, thus `$ mypy --ignore-missing-imports scrape.py` `--ignore-missing-imports` flag is used because most libraries don't have static typing information contained in their own code or typeshed. 
## Reference Documentation Here are some references for how schema.org/Recipe *should* be structured: * [https://schema.org/Recipe](https://schema.org/Recipe) - official specification * [Recipe Google Search Guide](https://developers.google.com/search/docs/data-types/recipe) - material teaching developers how to use the schema (with emphasis on how structured data impacts search results) ## Other Similar Python Libraries * [recipe_scrapers](https://github.com/hhursev/recipe-scrapers) - library scrapes recipes by using extruct to scrape the schema.org/Recipe format or HTML tags with BeautifulSoup. The library has drivers that support many different websites that further parse the information. This is a solid alternative to schema-recipe-scraper that is focused on a different kind of simplicity. scrape-schema-recipe-0.2.0/nose2.cfg000066400000000000000000000002331410273571700172050ustar00rootroot00000000000000[coverage] always-on = True coverage = scrape_schema_recipe/scrape.py coverage-report = term # coverage-report = term-missing # coverage-report = annotate scrape-schema-recipe-0.2.0/requirements-dev.txt000066400000000000000000000001271410273571700215400ustar00rootroot00000000000000mypy; implementation_name != 'pypy' nose2 nose2[coverage_plugin]>=0.6.5 types-requests scrape-schema-recipe-0.2.0/requirements.txt000066400000000000000000000002601410273571700207620ustar00rootroot00000000000000setuptools >= 39.2.0 dataclasses; python_version < '3.7' extruct importlib_resources; python_version < '3.9' isodate >= 0.5.1 requests types-dataclasses; python_version < '3.7'scrape-schema-recipe-0.2.0/scrape_schema_recipe/000077500000000000000000000000001410273571700216245ustar00rootroot00000000000000scrape-schema-recipe-0.2.0/scrape_schema_recipe/VERSION000066400000000000000000000000061410273571700226700ustar00rootroot000000000000000.2.0 scrape-schema-recipe-0.2.0/scrape_schema_recipe/__init__.py000066400000000000000000000013401410273571700237330ustar00rootroot00000000000000# 
# Copyright 2018 Micah Cochran # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from .scrape import __version__, load, loads, scrape, scrape_url, SSRTypeError from .example_output import example_names, example_output scrape-schema-recipe-0.2.0/scrape_schema_recipe/example_output.py000066400000000000000000000037711410273571700252610ustar00rootroot00000000000000from .scrape import load from sys import version_info if version_info < (3, 9): from importlib_resources import files else: from importlib.resources import files # type: ignore from pathlib import Path from typing import Dict, List, Tuple, Union _ex_name_filename = {'irish-coffee': 'bevvy-irish-coffee-2019.html', 'google': 'google-recipe-example.html', 'green-beans': 'flavorful-green-beans.html', 'taco-salad': 'mm-skinny-chicken-taco-salad.html', 'tart': 'foodista-british-treacle-tart.html', 'tea-cake': 'crumb-lemon-tea-cakes-2019.html', 'truffles': 'sweetestkitchen-truffles.html'} example_names = tuple(_ex_name_filename.keys()) def example_output(name: str, python_objects: Union[bool, List, Tuple] = False, nonstandard_attrs: bool = False, migrate_old_schema: bool = True) -> List[Dict]: """ Example data useful for prototyping and debugging. Calls the load() function. Note: the variable example_names is a list of the example names. 
Parameters ---------- name : string the name of the example python_objects : bool, list, tuple (optional) (defaults to False) nonstandard_attrs : bool, optional (defaults to False) migrate_old_schema : bool, optional (defaults to True) [Note: refer to load() function for documentation about the optional boolean variables] Returns ------- list a list of dictionaries in the style of schema.org/Recipe JSON-LD no results - an empty list will be returned """ if name not in example_names: raise(ValueError("no example named '{}'".format(name))) return load(files(__package__) / 'test_data' / _ex_name_filename[name], python_objects=python_objects, nonstandard_attrs=nonstandard_attrs, migrate_old_schema=migrate_old_schema) scrape-schema-recipe-0.2.0/scrape_schema_recipe/scrape.py000066400000000000000000000435411410273571700234620ustar00rootroot00000000000000# # Copyright 2019-2021 Micah Cochran # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # internal libraries from dataclasses import dataclass import datetime from pathlib import Path import sys # for mypy from typing import Callable, Dict, IO, List, Optional, Tuple, Union # external libraries import extruct import isodate import requests _PACKAGE_PATH = Path(__file__).resolve().parent # read version from VERSION file __version__ = (_PACKAGE_PATH / 'VERSION').read_text().strip() # Follow RFC 7231 sec. 
# 5.5.3 for the User-Agent header format.
USER_AGENT_STR = f'scrape-schema-recipe/{__version__} requests/{requests.__version__}'


@dataclass
class SSRTypeError(TypeError):
    """Error raised when an input argument is not of an acceptable type.

    Attributes
    ----------
    var_name : str
        Name of the offending parameter.
    object_type : type
        The actual type that was passed in.
    expected_types : str
        Human-readable description of the acceptable type(s).
    """
    var_name: str
    object_type: type
    expected_types: str

    def __str__(self) -> str:
        return (f'{self.var_name} is of type "{self.object_type.__name__}", '
                f'when expecting one of the following type(s): {self.expected_types}')


def scrape(location: Union[str, IO[str]],
           python_objects: Union[bool, List, Tuple] = False,
           nonstandard_attrs: bool = False,
           migrate_old_schema: bool = True,
           user_agent_str: Optional[str] = None) -> List[Dict]:
    """
    Parse data in https://schema.org/Recipe format into a list of
    dictionaries representing the recipe data.

    Parameters
    ----------
    location : string or file-like object
        A url, filename, or text_string of HTML, or a file-like object.

    python_objects : bool, list, tuple  (optional)
        when True it translates certain data types into python objects
          dates into datetime.date, datetimes into datetime.datetime,
          durations as datetime.timedelta.
        when set to a list or tuple only converts the types specified:
          [datetime.date] or [datetime.datetime] converts dates,
          [datetime.timedelta] converts durations.
        when False no conversion is performed
        (defaults to False)

    nonstandard_attrs : bool, optional
        when True it adds nonstandard (for schema.org/Recipe) attributes to
        the resulting dictionaries, that are outside the specification such
        as:
            '_format' is either 'json-ld' or 'microdata' (how
                schema.org/Recipe was encoded into HTML)
            '_source_url' is the source url, when 'url' has already been
                defined as another value
        (defaults to False)

    migrate_old_schema : bool, optional
        when True it migrates the schema from older version to current
        version
        (defaults to True)

    user_agent_str : string, optional
        override the User-Agent string with this value (only used when
        `location` is a url).
        (defaults to None)

    Returns
    -------
    list
        a list of dictionaries in the style of schema.org/Recipe JSON-LD
        no results - an empty list will be returned
    """
    data: Dict[str, List[Dict]] = {}

    if not user_agent_str:
        user_agent_str = USER_AGENT_STR

    # `url` is only ever set in scrape_url(); kept for symmetry with the
    # other entry points that pass it down to _convert_to_scrapings().
    url = None
    if isinstance(location, str):
        # Is this a url?
        if location.startswith(("http://", "https://")):
            # BUGFIX: previously migrate_old_schema was not forwarded, so
            # scrape(url, migrate_old_schema=False) migrated anyway.
            return scrape_url(location, python_objects=python_objects,
                              nonstandard_attrs=nonstandard_attrs,
                              migrate_old_schema=migrate_old_schema,
                              user_agent_str=user_agent_str)
        # Is this a very long string?  Perhaps it has HTML content.
        elif len(location) > 255:
            data = extruct.extract(location)
        # Maybe it is a filename?
        else:
            with open(location) as f:
                data = extruct.extract(f.read())
    elif hasattr(location, 'read'):
        # Assume this is some kind of file-like object that can be read.
        data = extruct.extract(location.read())
    else:
        raise SSRTypeError(
            var_name="location", object_type=type(location),
            expected_types="string for a url, filename, or text_string of the HTML, or a file-like object")

    scrapings = _convert_to_scrapings(data, nonstandard_attrs, url=url)

    if migrate_old_schema is True:
        scrapings = _migrate_old_schema(scrapings)

    if python_objects is not False:
        scrapings = _pythonize_objects(scrapings, python_objects)

    return scrapings


def load(fp: Union[str, IO[str], Path],
         python_objects: Union[bool, List, Tuple] = False,
         nonstandard_attrs: bool = False,
         migrate_old_schema: bool = True) -> List[Dict]:
    """Load a filename, pathlib.Path, or file object and scrape it.

    Parameters
    ----------
    fp : string, pathlib.Path, or file-like object
        A file name, Path, or a file-like object.

    python_objects : bool, list, tuple  (optional)
        see scrape() for details (defaults to False)

    nonstandard_attrs : bool, optional
        see scrape() for details (defaults to False)

    migrate_old_schema : bool, optional
        when True it migrates the schema from older version to current
        version (defaults to True)

    Returns
    -------
    list
        a list of dictionaries in the style of schema.org/Recipe JSON-LD
        no results - an empty list will be returned
    """
    data: Dict[str, List[Dict]] = {}

    if isinstance(fp, str):
        with open(fp) as f:
            data = extruct.extract(f.read())
    elif isinstance(fp, Path):
        data = extruct.extract(fp.read_text())
    elif hasattr(fp, 'read'):
        # Assume this is some kind of file-like object that can be read.
        data = extruct.extract(fp.read())
    else:
        raise SSRTypeError(
            var_name="fp", object_type=type(fp),
            expected_types="a filename, pathlib.Path object, or a file-like object")

    scrapings = _convert_to_scrapings(data, nonstandard_attrs)

    if migrate_old_schema is True:
        scrapings = _migrate_old_schema(scrapings)

    if python_objects is not False:
        scrapings = _pythonize_objects(scrapings, python_objects)

    return scrapings


def loads(string: str,
          python_objects: Union[bool, List, Tuple] = False,
          nonstandard_attrs: bool = False,
          migrate_old_schema: bool = True) -> List[Dict]:
    """Scrape recipes out of a string of HTML.

    Parameters
    ----------
    string : string
        A text string of HTML.

    python_objects : bool, list, tuple  (optional)
        see scrape() for details (defaults to False)

    nonstandard_attrs : bool, optional
        see scrape() for details (defaults to False)

    migrate_old_schema : bool, optional
        when True it migrates the schema from older version to current
        version (defaults to True)

    Returns
    -------
    list
        a list of dictionaries in the style of schema.org/Recipe JSON-LD
        no results - an empty list will be returned
    """
    if not isinstance(string, str):
        raise SSRTypeError(var_name="string", object_type=type(string),
                           expected_types="string")

    data: Dict[str, List[Dict]] = extruct.extract(string)

    scrapings = _convert_to_scrapings(data, nonstandard_attrs)

    if migrate_old_schema is True:
        scrapings = _migrate_old_schema(scrapings)

    if python_objects is not False:
        scrapings = _pythonize_objects(scrapings, python_objects)

    return scrapings


def scrape_url(url: str,
               python_objects: Union[bool, List, Tuple] = False,
               nonstandard_attrs: bool = False,
               migrate_old_schema: bool = True,
               user_agent_str: Optional[str] = None) -> List[Dict]:
    """Download a URL and scrape recipes from it.

    Parameters
    ----------
    url : string
        A url to download data from and scrape.

    python_objects : bool, list, tuple  (optional)
        see scrape() for details (defaults to False)

    nonstandard_attrs : bool, optional
        see scrape() for details (defaults to False)

    migrate_old_schema : bool, optional
        when True it migrates the schema from older version to current
        version (defaults to True)

    user_agent_str : string, optional
        override the User-Agent string with this value.
        (defaults to None)

    Returns
    -------
    list
        a list of dictionaries in the style of schema.org/Recipe JSON-LD
        no results - an empty list will be returned

    Raises
    ------
    requests.HTTPError
        when the server responds with an error status code.
    """
    if not isinstance(url, str):
        raise SSRTypeError(var_name="url", object_type=type(url),
                           expected_types="string")

    data: Dict[str, List[Dict]] = {}

    if not user_agent_str:
        user_agent_str = USER_AGENT_STR

    r = requests.get(url, headers={"User-Agent": user_agent_str}, timeout=5)
    r.raise_for_status()
    # r.url may differ from the requested url after redirects.
    data = extruct.extract(r.text, r.url)
    url = r.url

    scrapings = _convert_to_scrapings(data, nonstandard_attrs, url=url)

    if migrate_old_schema is True:
        scrapings = _migrate_old_schema(scrapings)

    if python_objects is not False:
        scrapings = _pythonize_objects(scrapings, python_objects)

    return scrapings


def _convert_json_ld_recipe(rec: Dict, nonstandard_attrs: bool = False,
                            url: Optional[str] = None) -> Dict:
    """Helper function for _convert_to_scrapings() for a json-ld record,
    adding extra tags ('_format', '_source_url'/'url')."""
    # copy so the caller's extruct output is not mutated
    d = rec.copy()
    if nonstandard_attrs is True:
        d['_format'] = 'json-ld'

    # store the url
    if url:
        if d.get('url') and d.get('url') != url and nonstandard_attrs is True:
            d['_source_url'] = url
        else:
            d['url'] = url

    return d


def _convert_to_scrapings(data: Dict[str, List[Dict]],
                          nonstandard_attrs: bool = False,
                          url: Optional[str] = None) -> List[Dict]:
    """Detect schema.org/Recipe content in extruct's output dictionary and
    extract it into a list of JSON-LD-style recipe dictionaries."""
    out = []

    if data['json-ld'] != []:
        for rec in data['json-ld']:
            if rec.get('@type') == 'Recipe':
                out.append(_convert_json_ld_recipe(rec, nonstandard_attrs, url))

            # accept both http and https schema.org contexts when walking
            # an @graph container
            if (rec.get('@context') in ('http://schema.org',
                                        'https://schema.org')
                    and '@graph' in rec):
                for subrec in rec['@graph']:
                    if subrec.get('@type') == 'Recipe':
                        out.append(_convert_json_ld_recipe(
                            subrec, nonstandard_attrs, url))

    if data['microdata'] != []:
        for rec in data['microdata']:
            if rec['type'] in ('http://schema.org/Recipe',
                               'https://schema.org/Recipe'):
                d = rec['properties'].copy()
                if nonstandard_attrs is True:
                    d['_format'] = 'microdata'
                # add @context and @type for conversion to the JSON-LD
                # style format
                if rec['type'][:6] == 'https:':
                    d['@context'] = 'https://schema.org'
                else:
                    d['@context'] = 'http://schema.org'
                d['@type'] = 'Recipe'

                # store the url (same rules as the json-ld branch)
                if url:
                    if (d.get('url') and d.get('url') != url
                            and nonstandard_attrs is True):
                        d['_source_url'] = url
                    else:
                        d['url'] = url

                # convert nested microdata 'type' entries, e.g.
                # 'http://schema.org/NutritionInformation' -> '@type':
                # 'NutritionInformation'
                for value in d.values():
                    if isinstance(value, dict) and 'type' in value:
                        type_ = value.pop('type')
                        value['@type'] = type_.rsplit('/', 1)[-1]

                out.append(d)

    return out


# properties that will be parsed into datetime.date/datetime.datetime objects
DATETIME_PROPERTIES = frozenset(['dateCreated', 'dateModified',
                                 'datePublished', 'expires'])

# properties that will be parsed into datetime.timedelta objects
DURATION_PROPERTIES = frozenset(['cookTime', 'performTime', 'prepTime',
                                 'totalTime', 'timeRequired'])


def _parse_determine_date_datetime(
        s: str) -> Union[datetime.datetime, datetime.date]:
    """Parse an ISO 8601 string as a date; if it includes a time component,
    parse it as a datetime instead."""
    if sys.version_info >= (3, 7):
        # datetime.fromisoformat() only exists on Python 3.7+
        if 'T' in s:
            return datetime.datetime.fromisoformat(s)
        else:
            return datetime.date.fromisoformat(s)
    else:
        if 'T' in s:
            return isodate.parse_datetime(s)
        else:
            return isodate.parse_date(s)


def _have_matching_items(lst1: Union[bool, List, Tuple],
                         lst2: Union[bool, List, Tuple]) -> bool:
    """Return True if the two lists/tuples share any item.  A bool argument
    short-circuits and is returned as-is (mirrors python_objects=True)."""
    if isinstance(lst1, bool):
        return lst1
    if isinstance(lst2, bool):
        return lst2

    return len(set(lst1).intersection(lst2)) > 0


def _pythonize_objects(scrapings: List[Dict],
                       python_objects: Union[bool, List, Tuple]) -> List[Dict]:
    """Convert ISO 8601 string properties into python date/time objects,
    according to the python_objects selection (see scrape())."""
    if python_objects is False:
        # this really should not be happening
        return scrapings

    # mypy cannot narrow the bool/list union here; the runtime check is safe
    if python_objects is True or datetime.timedelta in python_objects:  # type: ignore
        # convert ISO 8601 durations into timedelta
        scrapings = _convert_properties_scrape(scrapings, DURATION_PROPERTIES,
                                               isodate.parse_duration)

    if python_objects is True or _have_matching_items(
            (datetime.date, datetime.datetime), python_objects):
        # convert ISO 8601 dates/datetimes into date/datetime objects
        scrapings = _convert_properties_scrape(scrapings, DATETIME_PROPERTIES,
                                               _parse_determine_date_datetime)

    return scrapings


def _convert_properties_scrape(
        recipes: List[Dict],
        properties: frozenset,
        function: Callable[[str], Union[datetime.datetime, datetime.date,
                                        datetime.timedelta]]) -> List[Dict]:
    """Apply `function` to every property of each recipe named in
    `properties`, dropping None values and leaving unparseable values
    as-is."""
    for i in range(len(recipes)):
        key_set = set(recipes[i].keys())
        for p in key_set.intersection(properties):
            try:
                recipes[i][p] = function(recipes[i][p])
            except (isodate.ISO8601Error, ValueError, TypeError):
                if recipes[i][p] is None:  # TypeError
                    recipes[i].pop(p)
                # otherwise, it's a parse error, just leave the value as is

    return recipes


def _migrate_old_schema(recipes: List[Dict]) -> List[Dict]:
    """Migrate old schema.org/Recipe version to current schema version."""
    for i in range(len(recipes)):
        # rename 'ingredients' to 'recipeIngredient'
        if 'ingredients' in recipes[i]:
            recipes[i]['recipeIngredient'] = recipes[i].pop('ingredients')

    return recipes
Rating: 4.9 stars
48 Ratings
  • 5 star values: 43
  • 4 star values: 5
  • 3 star values: 0
  • 2 star values: 0
  • 1 star values: 0

This is a vodka version of a dark and stormy.

Advertisement

Ingredients

1
Original recipe yields 1 servings
The ingredient list now reflects the servings specified
Ingredient Checklist

Directions

Instructions Checklist
  • Fill a tall glass with ice. Squeeze 1/2 lime over ice. Pour vodka over ice and top with ginger beer. Garnish with lime slices.

    Advertisement

Nutrition Facts

192 calories; protein 0.2g; carbohydrates 17.4g; fat 0.1g; sodium 10.6mg. Full Nutrition
Advertisement

Reviews (35)

Most helpful positive review

Rating: 5 stars
07/04/2014
We loved it! New summer favorite drink! We ended up finding Ginger Beer at Total Wine & more, Fry's didn't have it. Don't try and substitute it with ginger ale; ginger ale and ginger beer are separate things. Read More
(18)
48 Ratings
  • 5 star values: 43
  • 4 star values: 5
  • 3 star values: 0
  • 2 star values: 0
  • 1 star values: 0
Rating: 5 stars
07/04/2014
We loved it! New summer favorite drink! We ended up finding Ginger Beer at Total Wine & more, Fry's didn't have it. Don't try and substitute it with ginger ale; ginger ale and ginger beer are separate things. Read More
(18)
Rating: 5 stars
04/27/2015
So refreshing! I used Jamaican ginger beer and domestic vodka instead of Russian, so what would you call that? I also threw a sprig of mint in there just for looks. Because presentation is everything. Thanks for the recipe! Read More
(8)
Rating: 5 stars
07/11/2014
My favorite al time cocktail!! This was great! Just need a copper mug! Read More
(7)
Advertisement
Rating: 5 stars
08/28/2016
I searched high and low for the perfect copper mug. So it only makes sense I would search for the perfect Moscow Mule recipe! I used Goslings Ginger Beer and vodka we got from a distillery we visited in Virginia this past summer. Fresh-squeezed lime juice is the only way to go. Yummy! So cool and refreshing on a hot summer day! Read More
(1)
Rating: 5 stars
05/26/2018
If you want a light easy drink that goes down smoothly you simply cannot go wrong with a Moscow mule. It is so delicious and takes seconds to make. I love this drink! Read More
(1)
Rating: 4 stars
06/29/2015
Delicious! Read More
(1)
Advertisement
Rating: 5 stars
08/10/2018
My new favorite drink that I first had on a cruise a few months back. So refreshing. Made this exactly as written with a slice of candied ginger on the glass. Yum. Read More
(1)
Rating: 5 stars
12/28/2015
perfect! I added a sprig of fresh mint as well. Used Bundaberg brand ginger beer--my favorite!:D Read More
(1)
Rating: 5 stars
02/03/2020
Make sure you use ginger beer, not ginger ale! A great summer drink, but also nice when you have a cold as the ginger is good on the throat. Read More
(1)
scrape-schema-recipe-0.2.0/scrape_schema_recipe/test_data/bevvy-irish-coffee-2018.html000066400000000000000000001176371410273571700305650ustar00rootroot00000000000000 Irish Coffee Recipe | Bevvy
 
Skip header links
scrape-schema-recipe-0.2.0/scrape_schema_recipe/test_data/bevvy-irish-coffee-2019.html000066400000000000000000005510341410273571700305570ustar00rootroot00000000000000 Irish Coffee Recipe | Bevvy
 
Skip header links
scrape-schema-recipe-0.2.0/scrape_schema_recipe/test_data/crumb-lemon-tea-cakes-2018.html000066400000000000000000002763401410273571700311410ustar00rootroot00000000000000 Meyer Lemon Poppyseed Tea Cakes - Crumb: A Food Blog

All images and content Copyright © 2018 Isabelle Boucher | Crumb: A Food Blog · WordPress

scrape-schema-recipe-0.2.0/scrape_schema_recipe/test_data/crumb-lemon-tea-cakes-2019.html000066400000000000000000003425661410273571700311460ustar00rootroot00000000000000 Meyer Lemon Poppyseed Tea Cakes - Crumb: A Food Blog

All images and content Copyright © 2019 Isabelle Boucher | Crumb: A Food Blog · WordPress

scrape-schema-recipe-0.2.0/scrape_schema_recipe/test_data/flavorful-green-beans.html000066400000000000000000000541031410273571700306510ustar00rootroot00000000000000 MedlinePlus: Flavorful Green Beans
URL of this page: https://medlineplus.gov/recipes/flavorful-green-beans/

Flavorful Green Beans

A Heart-Healthy Recipe from the National Heart, Lung, and Blood Institute

Prep time: 10 minutes
Cook time: 40 minutes
Total time: 50 minutes
Number of Servings: 7

The seasonings are perfect companions to the green beans.

Ingredients

  • 2 lb fresh green beans
  • 1/2 cup cold water
  • Nonstick cooking spray
  • 1/3 cup chopped onions
  • 4 cloves garlic, chopped
  • 1/2 tsp ground black pepper
  • 1/2 tsp dried basil
  • 1/2 tsp dried oregano

Directions

  1. Rinse green beans and snap off tips.
  2. Place green beans in a large pot and add 1/2 cup of cold water.
  3. Cook green beans on stovetop over medium heat for 10 minutes.
  4. Spray a sauté pan with cooking spray, and sauté the chopped onions and garlic for 5 minutes, or until they are tender and very lightly browned.
  5. Add onions, garlic, and black pepper to green beans. Spray the cooking spray over mixture, and cook on medium heat for another 20 minutes or until green beans are tender, but not soft. Stir occasionally.
  6. Sprinkle dried basil and oregano over green beans. Mix and serve.

Find more delicious heart-healthy recipes from

NIH National Heart, Lung, and Blood Institute logo

scrape-schema-recipe-0.2.0/scrape_schema_recipe/test_data/foodista-british-treacle-tart.html000066400000000000000000001372571410273571700323400ustar00rootroot00000000000000 Foodista | Recipes, Cooking Tips, and Food News | British Treacle Tart

British Treacle Tart

Ingredients

Pastry Crust:
1/2 teaspoon salt
5 ounces unsalted butter (11 tbs), cut into small cubes
1 large egg, lightly beaten
Filling:
10 ounces golden syrup
1 tablespoon molasses
1 Zest and juice of lemon

Preparation

1
Combine flour, salt and confectioner’s sugar in a large bowl. Using tips of fingers, rub butter into flour until it looks mostly like wet sand. Make a well in the middle, and pour in the egg. Gradually work the egg into the flour until a moist dough forms, using the heel of your hand to distribute any remaining large pieces of butter. Shape into a 5-inch disk and cover with plastic wrap. Refrigerate for 1 hour. This can be done in a food processor.
2
Roll out dough into a 13-inch circle. Lay dough into an 11-inch tart pan with a removable bottom, and press against the sides to secure dough. Refrigerate for one hour.
3
Preheat oven to 325° F. Dock the bottom of the crust with a fork, lay a large, crumpled piece of parchment on top, and pour in pie weights (or a pound of dried beans). Bake for 25 minutes, until crust is a light golden brown. Set aside.
4
Increase the temperature to 350 F
5
Mix together the golden syrup and molasses with the lemon juice. Beat the eggs in a bowl and add to the treacle mixture. Finally stir in the bread crumbs.
6
Pour the mixture into the prepared tart case.
7
Bake for 20 – 25 minutes until the crust and filling are golden brown and firm to the touch. You may have to cover the crust with aluminum foil to keep it from getting too brown or burning.
8
Serve warm with crème fraiche which balances really well with the sweetness of the tart or a good vanilla ice cream.
.

About

You can use pre-made pie dough if short on time. Golden syrup may be found online or in specialty food stores.

Yield:

1 servings

Added:

Monday, June 27, 2011 - 11:05am

Creator:

Related Cooking Videos

scrape-schema-recipe-0.2.0/scrape_schema_recipe/test_data/google-recipe-example.html000066400000000000000000000064341410273571700306430ustar00rootroot00000000000000 Party Coffee Cake

The best coffee cake you’ll ever try!

scrape-schema-recipe-0.2.0/scrape_schema_recipe/test_data/mm-skinny-chicken-taco-salad.html000066400000000000000000005760611410273571700320330ustar00rootroot00000000000000 Chicken Taco Salad with Chili Lime Chicken & Homemade Salad Dressing
Midget Momma is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License.
scrape-schema-recipe-0.2.0/scrape_schema_recipe/test_data/sweetestkitchen-truffles.html000066400000000000000000002604261410273571700315350ustar00rootroot00000000000000 Rum & Tonka Bean Dark Chocolate Truffles - sweetest kitchen
scrape-schema-recipe-0.2.0/setup.cfg000066400000000000000000000026521410273571700173260ustar00rootroot00000000000000[metadata] name = scrape-schema-recipe version = file:scrape_schema_recipe/VERSION author = Micah Cochran description = Extracts cooking recipe from HTML structured data in the https://schema.org/Recipe format. long_description = file: README.md long_description_content_type = text/markdown license = Apache-2 license_file = LICENSE home-page = https://github.com/micahcochran/scrape-schema-recipe keywords = recipe, cooking, food, schema.org, schema.org/Recipe Requires-Python = >=3.6 classifier = Intended Audience :: Developers Operating System :: OS Independent License :: OSI Approved :: Apache Software License Programming Language :: Python :: 3 Topic :: Internet :: WWW/HTTP Topic :: Text Processing :: Markup :: HTML [options] zip_safe = True include_package_data = True packages = find: # these should be the same as requirements.txt install_requires = setuptools >= 39.2.0 dataclasses; python_version < '3.7' extruct importlib_resources; python_version < '3.9' isodate >= 0.5.1 requests types-dataclasses; python_version < '3.7' [options.package_data] * = *.txt, *.md, *.html scrape-schema-recipe = VERSION [bdist_wheel] universal = 1 [mypy] # for mypy, many of the module do not have stubs, which declare # their static types, which will cause mypy to have many errors # requests is the only one that has stubs ignore_missing_imports = True scrape-schema-recipe-0.2.0/setup.py000066400000000000000000000001411410273571700172060ustar00rootroot00000000000000from setuptools import setup # all the setup configuration is in the setup.cfg file setup() scrape-schema-recipe-0.2.0/test_scrape.py000077500000000000000000000323071410273571700203760ustar00rootroot00000000000000#!/usr/bin/env python3 # # Copyright 2019-2020 Micah Cochran # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import datetime
import isodate
import unittest
from pathlib import Path
from typing import List

from scrape_schema_recipe import load, loads, scrape, scrape_url, SSRTypeError
from scrape_schema_recipe import example_output, __version__

DISABLE_NETWORK_TESTS = False

DATA_PATH = "scrape_schema_recipe/test_data"


def lists_are_equal(lst1: List, lst2: List) -> bool:
    """Order-insensitive list comparison; prints a diagnostic on mismatch.

    FIX: compares sorted copies instead of sorting the arguments in place,
    so callers' lists are no longer mutated as a side effect.
    """
    sorted1 = sorted(lst1)
    sorted2 = sorted(lst2)
    if sorted1 != sorted2:
        print("\n[lst1 != lst2]")
        print("lst1: {}".format(sorted1))
        print("lst2: {}".format(sorted2))

    return sorted1 == sorted2


class TestParsingFileMicroData(unittest.TestCase):
    """Microdata-format recipe (Foodista)."""

    @classmethod
    def setUpClass(cls):
        cls.recipes = load(f"{DATA_PATH}/foodista-british-treacle-tart.html")
        cls.recipe = cls.recipes[0]

    def test_recipe_keys(self):
        input_keys = list(self.recipe.keys())
        expected_output = [
            "@context",
            "recipeYield",
            "@type",
            "recipeInstructions",
            "recipeIngredient",
            "name",
        ]
        assert lists_are_equal(expected_output, input_keys)

    def test_name(self):
        assert self.recipe["name"] == "British Treacle Tart"

    def test_recipe_yield(self):
        assert self.recipe["recipeYield"] == "1 servings"

    def test_num_recipes(self):
        assert len(self.recipes) == 1


class TestUnsetTimeDate(unittest.TestCase):
    """Recipe that defines prepTime/totalTime but no cookTime."""

    @classmethod
    def setUpClass(cls):
        cls.recipes = scrape(
            f"{DATA_PATH}/allrecipes-moscow-mule.html", python_objects=True
        )
        cls.recipe = cls.recipes[0]

    def test_recipe_keys(self):
        input_keys = list(self.recipe.keys())
        expected_output = [
            "@context",
            "@type",
            "aggregateRating",
            "author",
            "datePublished",
            "description",
            "image",
            "mainEntityOfPage",
            "name",
            "nutrition",
            "prepTime",
            "recipeCategory",
            "recipeCuisine",
            "recipeIngredient",
            "recipeInstructions",
            "recipeYield",
            "review",
            "totalTime",
            "video",
        ]
        assert lists_are_equal(expected_output, input_keys)

    def test_name(self):
        assert self.recipe["name"] == "Simple Moscow Mule"

    def test_recipe_yield(self):
        assert self.recipe["recipeYield"] == "1 cocktail"

    def test_num_recipes(self):
        assert len(self.recipes) == 1

    def test_recipe_durations(self):
        assert str(self.recipe["prepTime"]) == "0:10:00"
        assert str(self.recipe["totalTime"]) == "0:10:00"
        assert "cookTime" not in self.recipe.keys()


# also uses the old 'ingredients' attribute name
class TestParsingFileMicroData2(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.recipes = scrape(
            f"{DATA_PATH}/sweetestkitchen-truffles.html", python_objects=True
        )
        cls.recipe = cls.recipes[0]

    def test_recipe_keys(self):
        input_keys = list(self.recipe.keys())
        expected_output = [
            "prepTime",
            "cookTime",
            "name",
            "recipeYield",
            "recipeCategory",
            "image",
            "description",
            "@type",
            "author",
            "aggregateRating",
            "recipeIngredient",
            "recipeInstructions",
            "totalTime",
            "@context",
        ]
        assert lists_are_equal(expected_output, input_keys)

    def test_name(self):
        assert self.recipe["name"] == "Rum & Tonka Bean Dark Chocolate Truffles"

    def test_recipe_yield(self):
        assert self.recipe["recipeYield"] == "15-18 3cm squares"

    def test_num_recipes(self):
        assert len(self.recipes) == 1

    def test_totalTime_sum(self):
        # prepTime + cookTime should add up to totalTime (timedeltas)
        r = self.recipe
        assert r["prepTime"] + r["cookTime"] == r["totalTime"]


class TestParsingFileLDJSON(unittest.TestCase):
    """JSON-LD-format recipe (Bevvy, 2018 snapshot)."""

    @classmethod
    def setUpClass(cls):
        cls.recipes = scrape(f"{DATA_PATH}/bevvy-irish-coffee-2018.html")
        cls.recipe = cls.recipes[0]

    def test_category(self):
        assert self.recipe["recipeCategory"] == "Cocktail"

    def test_duration(self):
        # python_objects=False, so this stays an ISO 8601 string
        assert self.recipe["totalTime"] == "PT5M"

    def test_ingredients(self):
        ingredients = [
            "1.5 oz Irish whiskey",
            "1 tsp brown sugar syrup",
            "Hot black coffee",
            "Unsweetened whipped cream",
        ]
        assert lists_are_equal(ingredients, self.recipe["recipeIngredient"])

    # in the 2019 version this was changed
    def test_instructions(self):
        expected_str = "Add Irish whiskey, brown sugar syrup, and hot coffee to an Irish coffee mug.\nTop with whipped cream."
        assert self.recipe["recipeInstructions"] == expected_str

    def test_recipe_keys(self):
        input_keys = list(self.recipe.keys())
        expected_output = [
            "author",
            "publisher",
            "recipeCategory",
            "@type",
            "recipeIngredient",
            "recipeInstructions",
            "image",
            "@context",
            "totalTime",
            "description",
            "name",
        ]
        assert lists_are_equal(expected_output, input_keys)

    def test_name(self):
        assert self.recipe["name"] == "Irish Coffee"

    def test_num_recipes(self):
        assert len(self.recipes) == 1


class TestTimeDelta(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.recipes = scrape(
            f"{DATA_PATH}/crumb-lemon-tea-cakes-2018.html", python_objects=True
        )
        cls.recipe = cls.recipes[0]

    def test_timedelta(self):
        td = datetime.timedelta(minutes=10)
        assert self.recipe["prepTime"] == td

    def test_totalTime_sum(self):
        r = self.recipe
        assert r["prepTime"] + r["cookTime"] == r["totalTime"]


class TestDateTime(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.recipes = load(
            f"{DATA_PATH}/google-recipe-example.html", python_objects=True
        )
        cls.recipe = cls.recipes[0]
        # input string (upload_date) is "2018-02-05T08:00:00+08:00"
        upload_date = cls.recipe["video"][0]["uploadDate"]
        cls.datetime_test = isodate.parse_datetime(upload_date)

    def test_publish_date_python_obj(self):
        assert self.recipe["datePublished"] == datetime.date(2018, 3, 10)

    def test_datetime_tz_python_obj_isodate(self):
        tz8 = isodate.FixedOffset(offset_hours=8)
        expected = datetime.datetime(2018, 2, 5, 8, 0, tzinfo=tz8)
        assert self.datetime_test == expected

    def test_datetime_tz_python_obj(self):
        tz8 = datetime.timezone(datetime.timedelta(hours=8))
        expected = datetime.datetime(2018, 2, 5, 8, 0, tzinfo=tz8)
        assert self.datetime_test == expected


# test loads()
class TestLoads(unittest.TestCase):
    def test_loads(self):
        with open(f"{DATA_PATH}/bevvy-irish-coffee-2019.html") as fp:
            s = fp.read()
        recipes = loads(s)
        recipe = recipes[0]
        assert recipe["name"] == "Irish Coffee"


# feed bad types into the functions
class BadTypes(unittest.TestCase):
    def test_load(self):
        with self.assertRaises(SSRTypeError):
            load(0xFEED)

    def test_loads(self):
        with self.assertRaises(SSRTypeError):
            loads(0xDEADBEEF)

    def test_scrape(self):
        with self.assertRaises(SSRTypeError):
            scrape(0xBEE)

    def test_scrape_url(self):
        with self.assertRaises(SSRTypeError):
            scrape_url(0xC0FFEE)


class TestURL(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.url = "https://raw.githubusercontent.com/micahcochran/scrape-schema-recipe/master/scrape_schema_recipe/test_data/bevvy-irish-coffee-2018.html"

    @unittest.skipIf(DISABLE_NETWORK_TESTS is True, "network tests disabled")
    def test_scrape_url(self):
        self.recipes = scrape_url(self.url)
        self.recipe = self.recipes[0]
        assert self.recipe["name"] == "Irish Coffee"

    @unittest.skipIf(DISABLE_NETWORK_TESTS is True, "network tests disabled")
    def test_scrape(self):
        self.recipes = scrape(self.url)
        self.recipe = self.recipes[0]
        assert self.recipe["name"] == "Irish Coffee"


# test that the schema still works when not migrated
class TestUnMigratedSchema(unittest.TestCase):
    # Some of these examples use 'ingredients', which was superceded by
    # 'recipeIngredients' in the http://schema.org/Recipe standard for a list
    # of ingredients in a recipe.
    def test_recipe1(self):
        recipes = load(
            f"{DATA_PATH}/foodista-british-treacle-tart.html", migrate_old_schema=False
        )
        recipe = recipes[0]
        input_keys = list(recipe.keys())
        # Note: 'ingredients' has been superceded by 'recipeIngredients' in
        # the http://schema.org/Recipe standard for a list of ingredients.
        expected_output = [
            "@context",
            "recipeYield",
            "@type",
            "recipeInstructions",
            "ingredients",
            "name",
        ]
        assert lists_are_equal(expected_output, input_keys)

    def test_recipe2(self):
        recipes = scrape(
            f"{DATA_PATH}/sweetestkitchen-truffles.html",
            python_objects=True,
            migrate_old_schema=False,
        )
        recipe = recipes[0]
        input_keys = list(recipe.keys())
        expected_output = [
            "prepTime",
            "cookTime",
            "name",
            "recipeYield",
            "recipeCategory",
            "image",
            "description",
            "@type",
            "author",
            "aggregateRating",
            "ingredients",
            "recipeInstructions",
            "totalTime",
            "@context",
        ]
        assert lists_are_equal(expected_output, input_keys)


class TestExampleOutput(unittest.TestCase):
    def test_example_output(self):
        name = example_output("tea-cake")[0]["name"]
        assert name == "Meyer Lemon Poppyseed Tea Cakes"


class TestVersion(unittest.TestCase):
    def test_version_not_null(self):
        assert __version__ is not None

    def test_version_is_type_string(self):
        assert isinstance(__version__, str)


class TestPythonObjects(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # these are named based on what is passed via python_objects
        cls.true = example_output("google", python_objects=True)[0]
        cls.false = example_output("google", python_objects=False)[0]
        cls.duration = example_output("google", python_objects=[datetime.timedelta])[0]
        cls.dates = example_output("google", python_objects=[datetime.date])[0]

    def testDurationTypes(self):
        assert isinstance(self.duration["cookTime"], datetime.timedelta)
        assert isinstance(self.duration["prepTime"], datetime.timedelta)
        assert isinstance(self.duration["totalTime"], datetime.timedelta)

    def testDurationEqual(self):
        assert self.duration["cookTime"] == self.true["cookTime"]
        assert self.duration["prepTime"] == self.true["prepTime"]
        assert self.duration["totalTime"] == self.true["totalTime"]

    def testDateTypes(self):
        assert isinstance(self.dates["datePublished"], datetime.date)
        # note that datePublished can also be of type datetime.datetime

    def testDatesEqual(self):
        assert self.dates["datePublished"] == self.true["datePublished"]


class TestGraph(unittest.TestCase):
    # tests @graph, also test Path
    def test_graph(self):
        recipes_old = load(
            f"{DATA_PATH}/crumb-lemon-tea-cakes-2018.html", python_objects=True
        )
        recipes_graph = load(
            Path(f"{DATA_PATH}/crumb-lemon-tea-cakes-2019.html"), python_objects=True
        )
        r_old = recipes_old[0]
        r_graph = recipes_graph[0]

        assert r_old["name"] == r_graph["name"]
        assert r_old["recipeCategory"] == r_graph["recipeCategory"]
        assert r_old["recipeCuisine"] == r_graph["recipeCuisine"]
        assert r_old["recipeIngredient"] == r_graph["recipeIngredient"]
        assert r_old["recipeYield"] == r_graph["recipeYield"]
        assert r_old["totalTime"] == r_graph["totalTime"]

        # ---- check differences ----
        # the recipeInstructions in 2019 version are HowToStep format,
        # 2018 version are in a list
        assert r_old["recipeInstructions"] != r_graph["recipeInstructions"]

        # 2019 has a datePublished, 2018 version does not
        # BUGFIX: this comparison was previously a bare expression (no-op);
        # the missing `assert` has been added so the check actually runs.
        assert r_graph["datePublished"] == datetime.date(2018, 3, 19)
        assert "datePublished" not in r_old.keys()


if __name__ == "__main__":
    unittest.main()