Commit b310a9a4 by Bartosz Sokorski Committed by GitHub

Rewrite HTML parsers and drop html5lib (#7924)

parent fd70f7e1
......@@ -550,27 +550,6 @@ docs = ["furo (>=2023.3.27)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1
testing = ["covdefaults (>=2.3)", "coverage (>=7.2.3)", "diff-cover (>=7.5)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)", "pytest-timeout (>=2.1)"]
[[package]]
name = "html5lib"
version = "1.1"
description = "HTML parser based on the WHATWG HTML specification"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
files = [
{file = "html5lib-1.1-py2.py3-none-any.whl", hash = "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d"},
{file = "html5lib-1.1.tar.gz", hash = "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f"},
]
[package.dependencies]
six = ">=1.9"
webencodings = "*"
[package.extras]
all = ["chardet (>=2.2)", "genshi", "lxml"]
chardet = ["chardet (>=2.2)"]
genshi = ["genshi"]
lxml = ["lxml"]
[[package]]
name = "httpretty"
version = "1.1.4"
description = "HTTP client mock for Python"
......@@ -1492,17 +1471,6 @@ files = [
]
[[package]]
name = "six"
version = "1.16.0"
description = "Python 2 and 3 compatibility utilities"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
files = [
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
]
[[package]]
name = "tomli"
version = "2.0.1"
description = "A lil' TOML parser"
......@@ -1536,17 +1504,6 @@ files = [
]
[[package]]
name = "types-html5lib"
version = "1.1.11.14"
description = "Typing stubs for html5lib"
optional = false
python-versions = "*"
files = [
{file = "types-html5lib-1.1.11.14.tar.gz", hash = "sha256:091e9e74e0ee37c93fd789a164e99b2af80ecf5a314280450c6a763d027ea209"},
{file = "types_html5lib-1.1.11.14-py3-none-any.whl", hash = "sha256:758c1a27f3b63363a346f3646be9f8b1f25df4fc1f96f88af6d1d831f24ad675"},
]
[[package]]
name = "types-jsonschema"
version = "4.17.0.8"
description = "Typing stubs for jsonschema"
......@@ -1630,17 +1587,6 @@ docs = ["furo (>=2023.3.27)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-
test = ["covdefaults (>=2.3)", "coverage (>=7.2.3)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.3.1)", "pytest-env (>=0.8.1)", "pytest-freezegun (>=0.4.2)", "pytest-mock (>=3.10)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=67.7.1)", "time-machine (>=2.9)"]
[[package]]
name = "webencodings"
version = "0.5.1"
description = "Character encoding aliases for legacy web content"
optional = false
python-versions = "*"
files = [
{file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"},
{file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"},
]
[[package]]
name = "xattr"
version = "0.10.1"
description = "Python wrapper for extended filesystem attributes"
......@@ -1742,4 +1688,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
[metadata]
lock-version = "2.0"
python-versions = "^3.8"
content-hash = "0eccb38c0af361232f1dadebed6e98fdd7e4c9f7238accf0c634121af0afcb88"
content-hash = "2a028eba8da6043cb897458153fc28cadc9a7b9184fe8edaee48e3e5a8468f06"
......@@ -40,7 +40,6 @@ cleo = "^2.0.0"
crashtest = "^0.4.1"
dulwich = "^0.21.2"
filelock = "^3.8.0"
html5lib = "^1.0"
importlib-metadata = { version = ">=4.4", python = "<3.10" }
installer = "^0.7.0"
jsonschema = "^4.10.0"
......@@ -79,7 +78,6 @@ pytest-xdist = { version = "^3.1", extras = ["psutil"] }
[tool.poetry.group.typing.dependencies]
mypy = ">=1.0"
types-html5lib = ">=1.1.9"
types-jsonschema = ">=4.9.0"
types-requests = ">=2.28.8"
......
from __future__ import annotations
import urllib.parse
import warnings
from collections import defaultdict
from functools import cached_property
......@@ -11,30 +10,30 @@ from typing import TYPE_CHECKING
from poetry.core.packages.utils.link import Link
from poetry.repositories.link_sources.base import LinkSource
from poetry.repositories.parsers.html_page_parser import HTMLPageParser
if TYPE_CHECKING:
from poetry.repositories.link_sources.base import LinkCache
with warnings.catch_warnings():
warnings.simplefilter("ignore")
import html5lib
class HTMLPage(LinkSource):
def __init__(self, url: str, content: str) -> None:
super().__init__(url=url)
self._parsed = html5lib.parse(content, namespaceHTMLElements=False)
parser = HTMLPageParser()
parser.feed(content)
self._parsed = parser.anchors
self._base_url: str | None = parser.base_url
@cached_property
def _link_cache(self) -> LinkCache:
links: LinkCache = defaultdict(lambda: defaultdict(list))
for anchor in self._parsed.findall(".//a"):
if anchor.get("href"):
href = anchor.get("href")
url = self.clean_link(urllib.parse.urljoin(self._url, href))
for anchor in self._parsed:
if href := anchor.get("href"):
url = self.clean_link(
urllib.parse.urljoin(self._base_url or self._url, href)
)
pyrequire = anchor.get("data-requires-python")
pyrequire = unescape(pyrequire) if pyrequire else None
yanked_value = anchor.get("data-yanked")
......@@ -42,7 +41,7 @@ class HTMLPage(LinkSource):
if yanked_value:
yanked = unescape(yanked_value)
else:
yanked = "data-yanked" in anchor.attrib
yanked = "data-yanked" in anchor
link = Link(url, requires_python=pyrequire, yanked=yanked)
if link.ext not in self.SUPPORTED_FORMATS:
......
from __future__ import annotations
from html.parser import HTMLParser
class HTMLPageParser(HTMLParser):
    """Collect anchor tags and the first base URL from an HTML document."""

    def __init__(self) -> None:
        super().__init__()
        # href of the first <base> tag carrying one, if any; later tags are ignored.
        self.base_url: str | None = None
        # One attribute mapping per <a> tag, in document order.
        self.anchors: list[dict[str, str | None]] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        attributes = dict(attrs)
        if tag == "a":
            self.anchors.append(attributes)
        elif tag == "base" and self.base_url is None:
            # A <base> without an href does not count as seen.
            href = attributes.get("href")
            if href is not None:
                self.base_url = href
from __future__ import annotations
import functools
from dataclasses import dataclass
from html.parser import HTMLParser
from typing import Callable
# The following code was originally written for PDM project
# https://github.com/pdm-project/pdm/blob/1f4f48a35cdded064def85df117bebf713f7c17a/src/pdm/models/search.py
# and later changed to fit Poetry needs
@dataclass
class Result:
    """One package entry scraped from a pypi.org search results page."""

    name: str = ""
    version: str = ""
    description: str = ""


class SearchResultParser(HTMLParser):
    """A simple HTML parser for pypi.org search results."""

    # Elements whose text content fills a Result attribute, as
    # (tag, CSS class, attribute name) triples checked in order.
    _TEXT_TARGETS = (
        ("span", "package-snippet__name", "name"),
        ("span", "package-snippet__version", "version"),
        ("p", "package-snippet__description", "description"),
    )

    def __init__(self) -> None:
        super().__init__()
        # Completed results, in page order.
        self.results: list[Result] = []
        # The snippet currently being filled, or None when outside a snippet.
        self._current: Result | None = None
        # Depth of <a> nesting inside the current snippet's anchor.
        self._nest_anchors = 0
        # Receives the next text node, or None when no field is pending.
        self._data_callback: Callable[[str], None] | None = None

    @staticmethod
    def _match_class(attrs: list[tuple[str, str | None]], name: str) -> bool:
        """Return True if the element's ``class`` attribute contains *name*."""
        class_value: str | None = None
        for key, value in attrs:
            # Mirror dict(attrs) semantics: the last `class` attribute wins.
            if key == "class":
                class_value = value
        return name in (class_value or "").split()

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if self._current is None:
            # Outside a snippet only a `package-snippet` anchor is relevant:
            # it opens a new result.
            if tag == "a" and self._match_class(attrs, "package-snippet"):
                self._current = Result()
                self._nest_anchors = 1
            return
        for target_tag, target_class, attribute in self._TEXT_TARGETS:
            if tag == target_tag and self._match_class(attrs, target_class):
                self._data_callback = functools.partial(
                    setattr, self._current, attribute
                )
                return
        if tag == "a":
            # Track nested anchors so the snippet closes on the right </a>.
            self._nest_anchors += 1

    def handle_data(self, data: str) -> None:
        callback = self._data_callback
        if callback is None:
            return
        # Only the first text node after a marker element is captured.
        self._data_callback = None
        callback(data)

    def handle_endtag(self, tag: str) -> None:
        if tag != "a" or self._current is None:
            return
        self._nest_anchors -= 1
        if self._nest_anchors == 0:
            finished, self._current = self._current, None
            # Discard snippets missing either a name or a version.
            if finished.name and finished.version:
                self.results.append(finished)
......@@ -9,7 +9,6 @@ from typing import Any
import requests
from cachecontrol.controller import logger as cache_control_logger
from html5lib.html5parser import parse
from poetry.core.packages.package import Package
from poetry.core.packages.utils.link import Link
from poetry.core.version.exceptions import InvalidVersion
......@@ -17,7 +16,7 @@ from poetry.core.version.exceptions import InvalidVersion
from poetry.repositories.exceptions import PackageNotFound
from poetry.repositories.http_repository import HTTPRepository
from poetry.repositories.link_sources.json import SimpleJsonPage
from poetry.utils._compat import decode
from poetry.repositories.parsers.pypi_search_parser import SearchResultParser
from poetry.utils.constants import REQUESTS_TIMEOUT
......@@ -50,45 +49,22 @@ class PyPiRepository(HTTPRepository):
def search(self, query: str) -> list[Package]:
results = []
search = {"q": query}
response = requests.session().get(
self._base_url + "search", params=search, timeout=REQUESTS_TIMEOUT
self._base_url + "search", params={"q": query}, timeout=REQUESTS_TIMEOUT
)
content = parse(response.content, namespaceHTMLElements=False)
for result in content.findall(".//*[@class='package-snippet']"):
name_element = result.find("h3/*[@class='package-snippet__name']")
version_element = result.find("h3/*[@class='package-snippet__version']")
if (
name_element is None
or version_element is None
or not name_element.text
or not version_element.text
):
continue
name = name_element.text
version = version_element.text
description_element = result.find(
"p[@class='package-snippet__description']"
)
description = (
description_element.text
if description_element is not None and description_element.text
else ""
)
parser = SearchResultParser()
parser.feed(response.text)
for result in parser.results:
try:
package = Package(name, version)
package.description = decode(description.strip())
package = Package(result.name, result.version)
package.description = result.description.strip()
results.append(package)
except InvalidVersion:
self._log(
(
f'Unable to parse version "{version}" for the {name} package,'
" skipping"
f'Unable to parse version "{result.version}" for the'
f" {result.name} package, skipping"
),
level="debug",
)
......
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
if TYPE_CHECKING:
from tests.types import HTMLPageGetter
@pytest.fixture
def html_page_content() -> HTMLPageGetter:
    """Return a factory that wraps *content* in a minimal simple-index HTML page.

    When *base_url* is given, a ``<base href="...">`` tag is inserted into the
    page head so relative links resolve against it.
    """

    def _fixture(content: str, base_url: str | None = None) -> str:
        # Fix: close the tag. The previous f-string lacked the trailing ">",
        # emitting `<base href="..."` and leaving the tag open so that the
        # following <meta> element was swallowed into it by lenient parsers.
        base = f'<base href="{base_url}">' if base_url else ""
        return """
        <!DOCTYPE html>
        <html>
          <head>
            {base}
            <meta name="pypi:repository-version" content="1.0">
            <title>Links for demo</title>
          </head>
          <body>
            <h1>Links for demo</h1>
            {content}
          </body>
        </html>
        """.format(content=content, base=base)

    return _fixture
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
from packaging.utils import canonicalize_name
......@@ -9,19 +11,8 @@ from poetry.core.packages.utils.link import Link
from poetry.repositories.link_sources.html import HTMLPage
DEMO_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
<meta name="pypi:repository-version" content="1.0">
<title>Links for demo</title>
</head>
<body>
<h1>Links for demo</h1>
{}
</body>
</html>
"""
if TYPE_CHECKING:
from tests.types import HTMLPageGetter
@pytest.mark.parametrize(
......@@ -52,11 +43,13 @@ DEMO_TEMPLATE = """
),
],
)
def test_link_attributes(attributes: str, expected_link: Link) -> None:
def test_link_attributes(
html_page_content: HTMLPageGetter, attributes: str, expected_link: Link
) -> None:
anchor = (
f'<a href="https://example.org/demo-0.1.whl" {attributes}>demo-0.1.whl</a><br/>'
)
content = DEMO_TEMPLATE.format(anchor)
content = html_page_content(anchor)
page = HTMLPage("https://example.org", content)
assert len(list(page.links)) == 1
......@@ -82,13 +75,41 @@ def test_link_attributes(attributes: str, expected_link: Link) -> None:
(("data-yanked='reason 1'", "data-yanked='reason 2'"), "reason 1\nreason 2"),
],
)
def test_yanked(yanked_attrs: tuple[str, str], expected: bool | str) -> None:
def test_yanked(
html_page_content: HTMLPageGetter,
yanked_attrs: tuple[str, str],
expected: bool | str,
) -> None:
anchors = (
f'<a href="https://example.org/demo-0.1.tar.gz" {yanked_attrs[0]}>'
"demo-0.1.tar.gz</a>"
f'<a href="https://example.org/demo-0.1.whl" {yanked_attrs[1]}>demo-0.1.whl</a>'
)
content = DEMO_TEMPLATE.format(anchors)
content = html_page_content(anchors)
page = HTMLPage("https://example.org", content)
assert page.yanked(canonicalize_name("demo"), Version.parse("0.1")) == expected
@pytest.mark.parametrize(
    "anchor, base_url, expected",
    (
        (
            '<a href="https://example.org/demo-0.1.whl">demo-0.1.whl</a>',
            None,
            "https://example.org/demo-0.1.whl",
        ),
        (
            '<a href="demo-0.1.whl">demo-0.1.whl</a>',
            "https://example.org/",
            "https://example.org/demo-0.1.whl",
        ),
    ),
)
def test_base_url(
    html_page_content: HTMLPageGetter, anchor: str, base_url: str | None, expected: str
) -> None:
    # Relative hrefs resolve against the page's <base>; absolute ones pass through.
    page = HTMLPage("https://example.org", html_page_content(anchor, base_url))
    first_link = next(iter(page.links))
    assert first_link.url == expected
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
from poetry.repositories.parsers.html_page_parser import HTMLPageParser
if TYPE_CHECKING:
from tests.types import HTMLPageGetter
@pytest.fixture()
def html_page(html_page_content: HTMLPageGetter) -> str:
    """A demo page whose anchors cover the requires-python/yanked attribute matrix."""
    anchors = """
    <a href="https://example.org/demo-0.1.whl">demo-0.1.whl</a><br/>
    <a href="https://example.org/demo-0.1.whl"
       data-requires-python=">=3.7">demo-0.1.whl</a><br/>
    <a href="https://example.org/demo-0.1.whl" data-yanked>demo-0.1.whl</a><br/>
    <a href="https://example.org/demo-0.1.whl" data-yanked="">demo-0.1.whl</a><br/>
    <a href="https://example.org/demo-0.1.whl"
       data-yanked="&lt;reason&gt;"
    >demo-0.1.whl</a><br/>
    <a href="https://example.org/demo-0.1.whl"
       data-requires-python=">=3.7"
       data-yanked
    >demo-0.1.whl</a><br/>
    """
    return html_page_content(anchors)
def test_html_page_parser_anchors(html_page: str) -> None:
    """Every anchor's attributes are captured verbatim.

    Valueless attributes come back as None and empty values as "".
    """
    whl = "https://example.org/demo-0.1.whl"
    expected = [
        {"href": whl},
        {"data-requires-python": ">=3.7", "href": whl},
        {"data-yanked": None, "href": whl},
        {"data-yanked": "", "href": whl},
        {"data-yanked": "<reason>", "href": whl},
        {"data-requires-python": ">=3.7", "data-yanked": None, "href": whl},
    ]

    parser = HTMLPageParser()
    parser.feed(html_page)

    assert parser.anchors == expected
def test_html_page_parser_base_url() -> None:
    """The href of the <base> tag in <head> is exposed as ``base_url``."""
    document = """
    <!DOCTYPE html>
    <html>
      <head>
        <base href="https://example.org/">
        <meta name="pypi:repository-version" content="1.0">
        <title>Links for demo</title>
      </head>
      <body>
        <h1>Links for demo</h1>
        <a href="demo-0.1.whl">demo-0.1.whl</a><br/>
      </body>
    </html>
    """

    page_parser = HTMLPageParser()
    page_parser.feed(document)

    assert page_parser.base_url == "https://example.org/"
from __future__ import annotations
from pathlib import Path
import pytest
from poetry.repositories.parsers.pypi_search_parser import Result
from poetry.repositories.parsers.pypi_search_parser import SearchResultParser
FIXTURES_DIRECTORY = Path(__file__).parent.parent / "fixtures" / "pypi.org" / "search"
@pytest.fixture
def search_page_data() -> str:
    """Raw HTML of the recorded pypi.org search-results fixture page."""
    return (FIXTURES_DIRECTORY / "search.html").read_text(encoding="utf-8")
def test_search_parser(search_page_data: str) -> None:
    """The parser extracts every snippet of the recorded search page, in order."""
    # (name, version, description) triples as they appear on the page.
    expected = [
        ("SQLAlchemy", "1.3.10", "Database Abstraction Library"),
        ("SQLAlchemy-Dao", "1.3.1", "Simple wrapper for sqlalchemy."),
        ("graphene-sqlalchemy", "2.2.2", "Graphene SQLAlchemy integration"),
        (
            "SQLAlchemy-UTCDateTime",
            "1.0.4",
            "Convert to/from timezone aware datetimes when storing in a DBMS",
        ),
        (
            "paginate_sqlalchemy",
            "0.3.0",
            "Extension to paginate.Page that supports SQLAlchemy queries",
        ),
        (
            "sqlalchemy_audit",
            "0.1.0",
            "sqlalchemy-audit provides an easy way to set up revision tracking"
            " for your data.",
        ),
        (
            "transmogrify.sqlalchemy",
            "1.0.2",
            "Feed data from SQLAlchemy into a transmogrifier pipeline",
        ),
        (
            "sqlalchemy_schemadisplay",
            "1.3",
            "Turn SQLAlchemy DB Model into a graph",
        ),
        ("sqlalchemy_traversal", "0.5.2", "UNKNOWN"),
        (
            "sqlalchemy-filters",
            "0.10.0",
            "A library to filter SQLAlchemy queries.",
        ),
        ("SQLAlchemy-wrap", "2.1.7", "Python wrapper for the CircleCI API"),
        (
            "sqlalchemy-nav",
            "0.0.2",
            "SQLAlchemy-Nav provides SQLAlchemy Mixins for creating navigation"
            " bars compatible with Bootstrap",
        ),
        (
            "sqlalchemy-repr",
            "0.0.1",
            "Automatically generates pretty repr of a SQLAlchemy model.",
        ),
        (
            "sqlalchemy-diff",
            "0.1.3",
            "Compare two database schemas using sqlalchemy.",
        ),
        (
            "SQLAlchemy-Equivalence",
            "0.1.1",
            "Provides natural equivalence support for SQLAlchemy declarative"
            " models.",
        ),
        (
            "Broadway-SQLAlchemy",
            "0.0.1",
            "A broadway extension wrapping Flask-SQLAlchemy",
        ),
        (
            "jsonql-sqlalchemy",
            "1.0.1",
            "Simple JSON-Based CRUD Query Language for SQLAlchemy",
        ),
        (
            "sqlalchemy-plus",
            "0.2.0",
            "Create Views and Materialized Views with SqlAlchemy",
        ),
        ("CherryPy-SQLAlchemy", "0.5.3", "Use SQLAlchemy with CherryPy"),
        (
            "sqlalchemy_sqlany",
            "1.0.3",
            "SAP Sybase SQL Anywhere dialect for SQLAlchemy",
        ),
    ]

    parser = SearchResultParser()
    parser.feed(search_page_data)

    assert parser.results == [
        Result(name=name, version=version, description=description)
        for name, version, description in expected
    ]
......@@ -60,3 +60,8 @@ class FixtureDirGetter(Protocol):
class FixtureCopier(Protocol):
    # Callable signature for fixture-copying test helpers.
    # NOTE(review): presumably copies the fixture at *relative_path* to
    # *target* (or a default location) and returns the resulting path —
    # confirm against the conftest implementation.
    def __call__(self, relative_path: str, target: Path | None = None) -> Path:
        ...
class HTMLPageGetter(Protocol):
    # Callable signature of the `html_page_content` fixture: wraps *content*
    # in a demo HTML page, inserting a <base> tag when *base_url* is given,
    # and returns the page as a string.
    def __call__(self, content: str, base_url: str | None = None) -> str:
        ...
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment