pax_global_header00006660000000000000000000000064137454271350014525gustar00rootroot0000000000000052 comment=25439f9fd25aa599a90234a150c86bfbe8718554 url-normalize-1.4.3/000077500000000000000000000000001374542713500143325ustar00rootroot00000000000000url-normalize-1.4.3/.github/000077500000000000000000000000001374542713500156725ustar00rootroot00000000000000url-normalize-1.4.3/.github/workflows/000077500000000000000000000000001374542713500177275ustar00rootroot00000000000000url-normalize-1.4.3/.github/workflows/linter.yml000066400000000000000000000017441374542713500217550ustar00rootroot00000000000000--- ########################### ########################### ## Linter GitHub Actions ## ########################### ########################### name: Lint Code Base # # Documentation: # https://help.github.com/en/articles/workflow-syntax-for-github-actions # ############################# # Start the job on all push # ############################# on: push: branches-ignore: - 'master' ############### # Set the Job # ############### jobs: build: # Name the Job name: Lint Code Base # Set the agent to run on runs-on: ubuntu-latest ################## # Load all steps # ################## steps: ########################## # Checkout the code base # ########################## - name: Checkout Code uses: actions/checkout@v2 ################################ # Run Linter against code base # ################################ - name: Lint Code Base uses: github/super-linter@v2.0.0 url-normalize-1.4.3/.gitignore000066400000000000000000000001101374542713500163120ustar00rootroot00000000000000.coverage .*cache .tox .vscode dist *.lock __pycache__ *.pyc *.egg-info url-normalize-1.4.3/.travis.yml000066400000000000000000000002671374542713500164500ustar00rootroot00000000000000language: python sudo: required dist: xenial python: - "2.7" - "3.7" install: - "pip install coveralls poetry" - "poetry install -v" script: "pytest" after_success: coveralls url-normalize-1.4.3/LICENSE000066400000000000000000000020561374542713500153420ustar00rootroot00000000000000MIT License Copyright (c) 2020 Nikolay Panov Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. url-normalize-1.4.3/Makefile000066400000000000000000000001241374542713500157670ustar00rootroot00000000000000tox: @tox test: @py.test build: @poetry build publish: build @poetry publish url-normalize-1.4.3/README.md000066400000000000000000000041301374542713500156070ustar00rootroot00000000000000url-normalize ============= [![Build Status](https://travis-ci.org/niksite/url-normalize.svg?branch=master)](https://travis-ci.org/niksite/url-normalize) [![Coverage Status](https://coveralls.io/repos/github/niksite/url-normalize/badge.svg?branch=master)](https://coveralls.io/github/niksite/url-normalize?branch=master) URI Normalization function: * Take care of IDN domains. * Always provide the URI scheme in lowercase characters. * Always provide the host, if any, in lowercase characters. * Only perform percent-encoding where it is essential. * Always use uppercase A-through-F characters when percent-encoding. * Prevent dot-segments appearing in non-relative URI paths. * For schemes that define a default authority, use an empty authority if the default is desired. * For schemes that define an empty path to be equivalent to a path of "/", use "/". * For schemes that define a port, use an empty port if the default is desired * All portions of the URI must be utf-8 encoded NFC from Unicode strings Inspired by Sam Ruby's [urlnorm.py](http://intertwingly.net/blog/2004/08/04/Urlnorm) Example: ```sh $ pip install url-normalize Collecting url-normalize ... Successfully installed future-0.16.0 url-normalize-1.3.3 $ python Python 3.6.1 (default, Jul 8 2017, 05:00:20) [GCC 4.9.2] on linux Type "help", "copyright", "credits" or "license" for more information. > from url_normalize import url_normalize > print(url_normalize('www.foo.com:80/foo')) > https://www.foo.com/foo ``` History: * 1.4.3: Added LICENSE file * 1.4.2: Added an optional param sort_query_params (True by default) * 1.4.1: Added an optional param default_scheme to the url_normalize ('https' by default) * 1.4.0: A bit of code refactoring and cleanup * 1.3.3: Support empty string and double slash urls (//domain.tld) * 1.3.2: Same code support both Python 3 and Python 2. * 1.3.1: Python 3 compatibility * 1.2.1: PEP8, setup.py * 1.1.2: support for shebang (#!) urls * 1.1.1: using 'http' schema by default when appropriate * 1.1.0: added handling of IDN domains * 1.0.0: code pep8 * 0.1.0: forked from Sam Ruby's urlnorm.py License: MIT License url-normalize-1.4.3/pyproject.toml000066400000000000000000000023221374542713500172450ustar00rootroot00000000000000[tool.poetry] authors = ["Nikolay Panov "] description = "URL normalization for Python" homepage = "https://github.com/niksite/url-normalize" keywords = ['url', 'normalization', 'normalize'] license = "MIT" name = "url-normalize" readme = "README.md" repository = "https://github.com/niksite/url-normalize" version = "1.4.3" [tool.poetry.urls] "Bug Tracker" = "https://github.com/niksite/url-normalize/issues" "Changelog" = "https://github.com/niksite/url-normalize#url-normalize" [tool.poetry.dependencies] python = "~2.7 || ^3.6" six = "*" [tool.poetry.dev-dependencies] bandit = [{version="*", python="^3.6"}] flake8 = [{version="*", python="^3.6"}] pluggy = [{version="0.13.1", python="~2.7"}] pydocstyle = [{version="*", python="^3.6"}] pylint = [{version="*", python="^3.6"}] pytest = [{version="3.10.1", python="~2.7"},{version="*", python="^3.6"}] pytest-cov = [{version="2.9.0", python="~2.7"},{version="*", python="^3.6"}] pytest-flakes = [{version="4.0.1", python="~2.7"},{version="*", python="^3.6"}] pytest-socket = [{version="0.3.3", python="~2.7"}, {version="*", python="^3.6"}] tox = [{version="*", python="^3.6"}] [build-system] build-backend = "poetry.masonry.api" requires = ["poetry>=0.12"] url-normalize-1.4.3/tests/000077500000000000000000000000001374542713500154745ustar00rootroot00000000000000url-normalize-1.4.3/tests/__init__.py000066400000000000000000000000001374542713500175730ustar00rootroot00000000000000url-normalize-1.4.3/tests/test_deconstruct_url.py000066400000000000000000000014521374542713500223260ustar00rootroot00000000000000"""Deconstruct url tests.""" from url_normalize.tools import deconstruct_url, URL EXPECTED_DATA = { "http://site.com": URL( fragment="", host="site.com", path="", port="", query="", scheme="http", userinfo="", ), "http://user@www.example.com:8080/path/index.html?param=val#fragment": URL( fragment="fragment", host="www.example.com", path="/path/index.html", port="8080", query="param=val", scheme="http", userinfo="user@", ), } def test_deconstruct_url_result_is_expected(): """Assert we got expected results from the deconstruct_url function.""" for url, expected in EXPECTED_DATA.items(): result = deconstruct_url(url) assert result == expected, url url-normalize-1.4.3/tests/test_generic_url_cleanup.py000066400000000000000000000013111374542713500231060ustar00rootroot00000000000000"""Tests for generic_url_cleanup function.""" from url_normalize.url_normalize import generic_url_cleanup EXPECTED_DATA = { "//site/#!fragment": "//site/?_escaped_fragment_=fragment", "//site/?utm_source=some source¶m=value": "//site/?param=value", "//site/?utm_source=some source": "//site/", "//site/?param=value&utm_source=some source": "//site/?param=value", "//site/page": "//site/page", "//site/?& ": "//site/", } def test_generic_url_cleanup_result_is_expected(): """Assert we got expected results from the generic_url_cleanup function.""" for url, expected in EXPECTED_DATA.items(): result = generic_url_cleanup(url) assert result == expected, url url-normalize-1.4.3/tests/test_normalize_fragment.py000066400000000000000000000010741374542713500227720ustar00rootroot00000000000000# -*- coding: utf-8 -*- """Tests for normalize_fragment function.""" from url_normalize.url_normalize import normalize_fragment EXPECTED_DATA = { "": "", "fragment": "fragment", "пример": "%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80", "!fragment": "%21fragment", "~fragment": "~fragment", } def test_normalize_fragment_result_is_expected(): """Assert we got expected results from the normalize_fragment function.""" for url, expected in EXPECTED_DATA.items(): result = normalize_fragment(url) assert result == expected, url url-normalize-1.4.3/tests/test_normalize_host.py000066400000000000000000000010431374542713500221400ustar00rootroot00000000000000# -*- coding: utf-8 -*- """Tests for normalize_host function.""" from url_normalize.url_normalize import normalize_host EXPECTED_DATA = { "site.com": "site.com", "SITE.COM": "site.com", "site.com.": "site.com", "пример.испытание": "xn--e1afmkfd.xn--80akhbyknj4f", } def test_normalize_host_result_is_expected(): """Assert we got expected results from the normalize_host function.""" for url, expected in EXPECTED_DATA.items(): result = normalize_host(url) assert result == expected, url url-normalize-1.4.3/tests/test_normalize_path.py000066400000000000000000000020411374542713500221160ustar00rootroot00000000000000"""Tests for normalize_path function.""" from url_normalize.url_normalize import normalize_path EXPECTED_DATA = { "..": "/", "": "/", "/../foo": "/foo", "/..foo": "/..foo", "/./../foo": "/foo", "/./foo": "/foo", "/./foo/.": "/foo/", "/.foo": "/.foo", "/": "/", "/foo..": "/foo..", "/foo.": "/foo.", "/FOO": "/FOO", "/foo/../bar": "/bar", "/foo/./bar": "/foo/bar", "/foo//": "/foo/", "/foo///bar//": "/foo/bar/", "/foo/bar/..": "/foo/", "/foo/bar/../..": "/", "/foo/bar/../../../../baz": "/baz", "/foo/bar/../../../baz": "/baz", "/foo/bar/../../": "/", "/foo/bar/../../baz": "/baz", "/foo/bar/../": "/foo/", "/foo/bar/../baz": "/foo/baz", "/foo/bar/.": "/foo/bar/", "/foo/bar/./": "/foo/bar/", } def test_normalize_host_result_is_expected(): """Assert we got expected results from the normalize_path function.""" for url, expected in EXPECTED_DATA.items(): result = normalize_path(url, "http") assert result == expected, url url-normalize-1.4.3/tests/test_normalize_port.py000066400000000000000000000006511374542713500221530ustar00rootroot00000000000000"""Tests for normalize_port function.""" from url_normalize.url_normalize import normalize_port EXPECTED_DATA = {"8080": "8080", "": "", "80": "", "string": "string"} def test_normalize_port_result_is_expected(): """Assert we got expected results from the normalize_port function.""" for url, expected in EXPECTED_DATA.items(): result = normalize_port(url, "http") assert result == expected, url url-normalize-1.4.3/tests/test_normalize_query.py000066400000000000000000000010631374542713500223320ustar00rootroot00000000000000# -*- coding: utf-8 -*- """Tests for normalize_query function.""" from url_normalize.url_normalize import normalize_query EXPECTED_DATA = { "": "", "param1=val1¶m2=val2": "param1=val1¶m2=val2", "Ç=Ç": "%C3%87=%C3%87", "%C3%87=%C3%87": "%C3%87=%C3%87", "q=C%CC%A7": "q=%C3%87", } def test_normalize_query_result_is_expected(): """Assert we got expected results from the normalize_query function.""" for url, expected in EXPECTED_DATA.items(): result = normalize_query(url) assert result == expected, url url-normalize-1.4.3/tests/test_normalize_scheme.py000066400000000000000000000006251374542713500224340ustar00rootroot00000000000000"""Tests for normalize_scheme function.""" from url_normalize.url_normalize import normalize_scheme EXPECTED_DATA = {"http": "http", "HTTP": "http"} def test_normalize_scheme_result_is_expected(): """Assert we got expected results from the normalize_scheme function.""" for url, expected in EXPECTED_DATA.items(): result = normalize_scheme(url) assert result == expected, url url-normalize-1.4.3/tests/test_normalize_userinfo.py000066400000000000000000000007471374542713500230270ustar00rootroot00000000000000"""Tests for normalize_userinfo function.""" from url_normalize.url_normalize import normalize_userinfo EXPECTED_DATA = { ":@": "", "": "", "@": "", "user:password@": "user:password@", "user@": "user@", } def test_normalize_userinfo_result_is_expected(): """Assert we got expected results from the normalize_userinfo function.""" for url, expected in EXPECTED_DATA.items(): result = normalize_userinfo(url) assert result == expected, url url-normalize-1.4.3/tests/test_provide_url_scheme.py000066400000000000000000000015101374542713500227600ustar00rootroot00000000000000"""Tests for provide_url_scheme function.""" from url_normalize.url_normalize import provide_url_scheme EXPECTED_DATA = { "": "", "-": "-", "/file/path": "/file/path", "//site/path": "https://site/path", "ftp://site/": "ftp://site/", "site/page": "https://site/page", } def test_provide_url_scheme_result_is_expected(): """Assert we got expected results from the provide_url_scheme function.""" for url, expected in EXPECTED_DATA.items(): result = provide_url_scheme(url) assert result == expected, url def test_provide_url_scheme_accept_default_scheme_param(): """Assert we could provide default_scheme param other than https.""" url = "//site/path" expected = "http://site/path" actual = provide_url_scheme(url, default_scheme="http") assert actual == expected url-normalize-1.4.3/tests/test_reconstruct_url.py000066400000000000000000000016241374542713500223450ustar00rootroot00000000000000"""Reconstruct url tests.""" from url_normalize.tools import reconstruct_url, URL EXPECTED_DATA = ( ( URL( fragment="", host="site.com", path="", port="", query="", scheme="http", userinfo="", ), "http://site.com", ), ( URL( fragment="fragment", host="www.example.com", path="/path/index.html", port="8080", query="param=val", scheme="http", userinfo="user@", ), "http://user@www.example.com:8080/path/index.html?param=val#fragment", ), ) def test_deconstruct_url_result_is_expected(): """Assert we got expected results from the deconstruct_url function.""" for url, expected in EXPECTED_DATA: result = reconstruct_url(url) assert result == expected, url url-normalize-1.4.3/tests/test_url_normalize.py000066400000000000000000000100151374542713500217640ustar00rootroot00000000000000# -*- coding: utf-8 -*- """Integrations tests.""" from url_normalize import url_normalize EXPECTED_RESULTS = { "/../foo": "/foo", "/./../foo": "/foo", "/./foo": "/foo", "/./foo/.": "/foo/", "//www.foo.com/": "https://www.foo.com/", "/foo/../bar": "/bar", "/foo/./bar": "/foo/bar", "/foo//": "/foo/", "/foo///bar//": "/foo/bar/", "/foo/bar/..": "/foo/", "/foo/bar/../..": "/", "/foo/bar/../../../../baz": "/baz", "/foo/bar/../../../baz": "/baz", "/foo/bar/../../": "/", "/foo/bar/../../baz": "/baz", "/foo/bar/../": "/foo/", "/foo/bar/../baz": "/foo/baz", "/foo/bar/.": "/foo/bar/", "/foo/bar/./": "/foo/bar/", "http://:@example.com/": "http://example.com/", "http://@example.com/": "http://example.com/", "http://127.0.0.1:80/": "http://127.0.0.1/", "http://example.com:081/": "http://example.com:81/", "http://example.com:80/": "http://example.com/", "http://example.com": "http://example.com/", "http://example.com/?b&a": "http://example.com/?a&b", "http://example.com/?q=%5c": "http://example.com/?q=%5C", "http://example.com/?q=%C7": "http://example.com/?q=%EF%BF%BD", "http://example.com/?q=C%CC%A7": "http://example.com/?q=%C3%87", "http://EXAMPLE.COM/": "http://example.com/", "http://example.com/%7Ejane": "http://example.com/~jane", "http://example.com/a/../a/b": "http://example.com/a/b", "http://example.com/a/./b": "http://example.com/a/b", "http://example.com/#!5753509/hello-world": "http://example.com/?_escaped_fragment_=5753509/hello-world", "http://USER:pass@www.Example.COM/foo/bar": "http://USER:pass@www.example.com/foo/bar", "http://www.example.com./": "http://www.example.com/", "http://www.foo.com:80/foo": "http://www.foo.com/foo", "http://www.foo.com.:81/foo": "http://www.foo.com:81/foo", "http://www.foo.com./foo/bar.html": "http://www.foo.com/foo/bar.html", "http://www.foo.com/%7Ebar": "http://www.foo.com/~bar", "http://www.foo.com/%7ebar": "http://www.foo.com/~bar", "пример.испытание/Служебная:Search/Test": "https://xn--e1afmkfd.xn--80akhbyknj4f/%D0%A1%D0%BB%D1%83%D0%B6%D0%B5%D0%B1%D0%BD%D0%B0%D1%8F:Search/Test", } NO_CHANGES_EXPECTED = ( "-", "", "/..foo", "/.foo", "/foo..", "/foo.", "ftp://user:pass@ftp.foo.net/foo/bar", "http://127.0.0.1/", "http://example.com:8080/", "http://example.com/?a&b", "http://example.com/?q=%5C", "http://example.com/?q=%C3%87", "http://example.com/?q=%E2%85%A0", "http://example.com/", "http://example.com/~jane", "http://example.com/a/b", "http://example.com/FOO", "http://user:password@example.com/", "http://www.foo.com:8000/foo", # from rfc2396bis "ftp://ftp.is.co.za/rfc/rfc1808.txt", "http://www.ietf.org/rfc/rfc2396.txt", "ldap://[2001:db8::7]/c=GB?objectClass?one", "mailto:John.Doe@example.com", "news:comp.infosystems.www.servers.unix", "tel:+1-816-555-1212", "telnet://192.0.2.16:80/", "urn:oasis:names:specification:docbook:dtd:xml:4.1.2", ) def test_url_normalize_changes(): """Assert url_normalize do not change URI if not required. http://www.intertwingly.net/wiki/pie/PaceCanonicalIds """ for value in NO_CHANGES_EXPECTED: assert url_normalize(value) == value def test_url_normalize_results(): """Assert url_normalize return expected results.""" for value, expected in EXPECTED_RESULTS.items(): assert expected == url_normalize(value), value def test_url_normalize_with_http_scheme(): """Assert we could use http scheme as default.""" url = "//www.foo.com/" expected = "http://www.foo.com/" actual = url_normalize(url, default_scheme="http") assert actual == expected def test_url_normalize_with_no_params_sorting(): """Assert we could use http scheme as default.""" url = "http://www.foo.com/?b=1&a=2" expected = "http://www.foo.com/?b=1&a=2" actual = url_normalize(url, sort_query_params=False) assert actual == expected url-normalize-1.4.3/tox.ini000066400000000000000000000006511374542713500156470ustar00rootroot00000000000000[tox] skipsdist = True envlist = py27, py39 [testenv] whitelist_externals = poetry skip_install = true commands = poetry install -v poetry run pytest [pytest] addopts = --cov-fail-under=99 --cov-report=term-missing:skip-covered --cov=url_normalize --disable-socket --flakes -v python_files = tests.py test_*.py *_tests.py [flake8] max-line-length = 80 select = C,E,F,W,B,B950 ignore = E501 url-normalize-1.4.3/url_normalize/000077500000000000000000000000001374542713500172145ustar00rootroot00000000000000url-normalize-1.4.3/url_normalize/__init__.py000066400000000000000000000020441374542713500213250ustar00rootroot00000000000000# -*- coding: utf-8 -*- """ URI normalizator. URI Normalization function: * Take care of IDN domains. * Always provide the URI scheme in lowercase characters. * Always provide the host, if any, in lowercase characters. * Only perform percent-encoding where it is essential. * Always use uppercase A-through-F characters when percent-encoding. * Prevent dot-segments appearing in non-relative URI paths. * For schemes that define a default authority, use an empty authority if the default is desired. * For schemes that define an empty path to be equivalent to a path of "/", use "/". * For schemes that define a port, use an empty port if the default is desired * All portions of the URI must be utf-8 encoded NFC from Unicode strings Inspired by Sam Ruby's urlnorm.py: http://intertwingly.net/blog/2004/08/04/Urlnorm This fork author: Nikolay Panov () """ from __future__ import absolute_import from .url_normalize import url_normalize __license__ = "Python" __version__ = "1.4.3" __all__ = ["url_normalize"] url-normalize-1.4.3/url_normalize/tools.py000066400000000000000000000044021374542713500207260ustar00rootroot00000000000000"""Url normalize tools (py27/py37 compatible).""" import re import unicodedata from collections import namedtuple import six from six.moves.urllib.parse import quote as quote_orig from six.moves.urllib.parse import unquote as unquote_orig from six.moves.urllib.parse import urlsplit, urlunsplit URL = namedtuple( "URL", ["scheme", "userinfo", "host", "port", "path", "query", "fragment"] ) def deconstruct_url(url): """Tranform the url into URL structure. Params: url : string : the URL Returns: URL """ scheme, auth, path, query, fragment = urlsplit(url.strip()) (userinfo, host, port) = re.search("([^@]*@)?([^:]*):?(.*)", auth).groups() return URL( fragment=fragment, host=host, path=path, port=port, query=query, scheme=scheme, userinfo=userinfo or "", ) def reconstruct_url(url): """Reconstruct string url from URL. Params: url : URL object instance Returns: string : reconstructed url string """ auth = (url.userinfo or "") + url.host if url.port: auth += ":" + url.port return urlunsplit((url.scheme, auth, url.path, url.query, url.fragment)) def force_unicode(string, charset="utf-8"): """Convert string to unicode if it is not yet unicode. Params: string : string/unicode : an input string charset : string : optional : output encoding Returns: unicode """ if isinstance(string, six.text_type): # Always True on Py3 return string return string.decode(charset, "replace") # Py2 only def unquote(string, charset="utf-8"): """Unquote and normalize unicode string. Params: string : string to be unquoted charset : string : optional : output encoding Returns: string : an unquoted and normalized string """ string = unquote_orig(string) string = force_unicode(string, charset) string = unicodedata.normalize("NFC", string).encode(charset) return string def quote(string, safe="/"): """Quote string. Params: string : string to be quoted safe : string of safe characters Returns: string : quoted string """ string = quote_orig(string, safe) return string url-normalize-1.4.3/url_normalize/url_normalize.py000066400000000000000000000144761374542713500224640ustar00rootroot00000000000000# -*- coding: utf-8 -*- """URL normalize main module.""" import re from .tools import deconstruct_url, force_unicode, quote, reconstruct_url, unquote DEFAULT_PORT = { "ftp": "21", "gopher": "70", "http": "80", "https": "443", "news": "119", "nntp": "119", "snews": "563", "snntp": "563", "telnet": "23", "ws": "80", "wss": "443", } DEFAULT_CHARSET = "utf-8" DEFAULT_SCHEME = "https" def provide_url_scheme(url, default_scheme=DEFAULT_SCHEME): """Make sure we have valid url scheme. Params: url : string : the URL default_scheme : string : default scheme to use, e.g. 'https' Returns: string : updated url with validated/attached scheme """ has_scheme = ":" in url[:7] is_universal_scheme = url.startswith("//") is_file_path = url == "-" or (url.startswith("/") and not is_universal_scheme) if not url or has_scheme or is_file_path: return url if is_universal_scheme: return default_scheme + ":" + url return default_scheme + "://" + url def generic_url_cleanup(url): """Cleanup the URL from unnecessary data and convert to final form. Converts shebang urls to final form, removed unnecessary data from the url. Params: url : string : the URL Returns: string : update url """ url = url.replace("#!", "?_escaped_fragment_=") url = re.sub(r"utm_source=[^&]+&?", "", url) url = url.rstrip("&? ") return url def normalize_scheme(scheme): """Normalize scheme part of the url. Params: scheme : string : url scheme, e.g., 'https' Returns: string : normalized scheme data. """ return scheme.lower() def normalize_userinfo(userinfo): """Normalize userinfo part of the url. Params: userinfo : string : url userinfo, e.g., 'user@' Returns: string : normalized userinfo data. """ if userinfo in ["@", ":@"]: return "" return userinfo def normalize_host(host, charset=DEFAULT_CHARSET): """Normalize host part of the url. Lowercase and strip of final dot. Also, take care about IDN domains. Params: host : string : url host, e.g., 'site.com' Returns: string : normalized host data. """ host = force_unicode(host, charset) host = host.lower() host = host.strip(".") host = host.encode("idna").decode(charset) return host def normalize_port(port, scheme): """Normalize port part of the url. Remove mention of default port number Params: port : string : url port, e.g., '8080' scheme : string : url scheme, e.g., 'http' Returns: string : normalized port data. """ if not port.isdigit(): return port port = str(int(port)) if DEFAULT_PORT[scheme] == port: return "" return port def normalize_path(path, scheme): """Normalize path part of the url. Remove mention of default path number Params: path : string : url path, e.g., '/section/page.html' scheme : string : url scheme, e.g., 'http' Returns: string : normalized path data. """ # Only perform percent-encoding where it is essential. # Always use uppercase A-through-F characters when percent-encoding. # All portions of the URI must be utf-8 encoded NFC from Unicode strings path = quote(unquote(path), "~:/?#[]@!$&'()*+,;=") # Prevent dot-segments appearing in non-relative URI paths. if scheme in ["", "http", "https", "ftp", "file"]: output, part = [], None for part in path.split("/"): if part == "": if not output: output.append(part) elif part == ".": pass elif part == "..": if len(output) > 1: output.pop() else: output.append(part) if part in ["", ".", ".."]: output.append("") path = "/".join(output) # For schemes that define an empty path to be equivalent to a path of "/", # use "/". if not path and scheme in ["http", "https", "ftp", "file"]: path = "/" return path def normalize_fragment(fragment): """Normalize fragment part of the url. Params: fragment : string : url fragment, e.g., 'fragment' Returns: string : normalized fragment data. """ return quote(unquote(fragment), "~") def normalize_query(query, sort_query_params=True): """Normalize query part of the url. Params: query : string : url query, e.g., 'param1=val1¶m2=val2' Returns: string : normalized query data. """ param_arr = [ "=".join([quote(unquote(t), "~:/?#[]@!$'()*+,;=") for t in q.split("=", 1)]) for q in query.split("&") ] if sort_query_params: param_arr = sorted(param_arr) query = "&".join(param_arr) return query def url_normalize( url, charset=DEFAULT_CHARSET, default_scheme=DEFAULT_SCHEME, sort_query_params=True ): """URI normalization routine. Sometimes you get an URL by a user that just isn't a real URL because it contains unsafe characters like ' ' and so on. This function can fix some of the problems in a similar way browsers handle data entered by the user: >>> url_normalize('http://de.wikipedia.org/wiki/Elf (Begriffsklärung)') 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29' Params: charset : string : optional The target charset for the URL if the url was given as unicode string. Returns: string : a normalized url """ if not url: return url url = provide_url_scheme(url, default_scheme) url = generic_url_cleanup(url) url_elements = deconstruct_url(url) url_elements = url_elements._replace( scheme=normalize_scheme(url_elements.scheme), userinfo=normalize_userinfo(url_elements.userinfo), host=normalize_host(url_elements.host, charset), query=normalize_query(url_elements.query, sort_query_params), fragment=normalize_fragment(url_elements.fragment), ) url_elements = url_elements._replace( port=normalize_port(url_elements.port, url_elements.scheme), path=normalize_path(url_elements.path, url_elements.scheme), ) url = reconstruct_url(url_elements) return url