Commit c4b22537 by Randy Döring Committed by GitHub

fix: performance regression when parsing links from legacy repositories (#6442)

Resolves: #6436

Measurements of `poetry lock` with warm cache with example
pyproject.toml from #6436:

|test case|time in s|peak memory usage in MB|
|---|---|---|
|legacy repository (before)|422|113|
|legacy repository (after)|3|118|
|pypi repository|1|92|

`backports.cached-property` is used in order to support
cached_property on Python 3.7. 

Co-authored-by: Jarrod Moore <jmo@jmo.name>
Co-authored-by: Bjorn Neergaard <bjorn@neersighted.com>
parent 692bdaa9
......@@ -21,6 +21,14 @@ tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900
tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins"]
[[package]]
name = "backports.cached-property"
version = "1.0.2"
description = "cached_property() - computed once per instance, cached as attribute"
category = "main"
optional = false
python-versions = ">=3.6.0"
[[package]]
name = "CacheControl"
version = "0.12.11"
description = "httplib2 caching for requests"
......@@ -931,7 +939,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
[metadata]
lock-version = "1.1"
python-versions = "^3.7"
content-hash = "3810aef128102dea7cd2797cfcee77f1345831ed057aac5a962916a76e29fb6a"
content-hash = "7e40cde2399843d53e715f657b703397ce236196663788ada896d7e2a53c869a"
[metadata.files]
atomicwrites = [
......@@ -941,6 +949,10 @@ attrs = [
{file = "attrs-22.1.0-py2.py3-none-any.whl", hash = "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c"},
{file = "attrs-22.1.0.tar.gz", hash = "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6"},
]
"backports.cached-property" = [
{file = "backports.cached-property-1.0.2.tar.gz", hash = "sha256:9306f9eed6ec55fd156ace6bc1094e2c86fae5fb2bf07b6a9c00745c656e75dd"},
{file = "backports.cached_property-1.0.2-py3-none-any.whl", hash = "sha256:baeb28e1cd619a3c9ab8941431fe34e8490861fb998c6c4590693d50171db0cc"},
]
CacheControl = [
{file = "CacheControl-0.12.11-py2.py3-none-any.whl", hash = "sha256:2c75d6a8938cb1933c75c50184549ad42728a27e9f6b92fd677c3151aa72555b"},
{file = "CacheControl-0.12.11.tar.gz", hash = "sha256:a5b9fcc986b184db101aa280b42ecdcdfc524892596f606858e0b7a8b4d9e144"},
......
......@@ -46,6 +46,7 @@ python = "^3.7"
poetry-core = "^1.1.0"
poetry-plugin-export = "^1.0.6"
"backports.cached-property" = { version = "^1.0.2", python = "<3.8" }
cachecontrol = { version = "^0.12.9", extras = ["filecache"] }
cachy = "^0.3.0"
cleo = "^1.0.0a5"
......
......@@ -3,13 +3,15 @@ from __future__ import annotations
import logging
import re
from abc import abstractmethod
from typing import TYPE_CHECKING
from typing import DefaultDict
from typing import List
from packaging.utils import canonicalize_name
from poetry.core.packages.package import Package
from poetry.core.semver.version import Version
from poetry.utils._compat import cached_property
from poetry.utils.patterns import sdist_file_re
from poetry.utils.patterns import wheel_file_re
......@@ -20,6 +22,8 @@ if TYPE_CHECKING:
from packaging.utils import NormalizedName
from poetry.core.packages.utils.link import Link
LinkCache = DefaultDict[NormalizedName, DefaultDict[Version, List[Link]]]
logger = logging.getLogger(__name__)
......@@ -44,16 +48,8 @@ class LinkSource:
def url(self) -> str:
return self._url
def versions(self, name: str) -> Iterator[Version]:
name = canonicalize_name(name)
seen: set[Version] = set()
for link in self.links:
pkg = self.link_package_data(link)
if pkg and pkg.name == name and pkg.version not in seen:
seen.add(pkg.version)
yield pkg.version
def versions(self, name: NormalizedName) -> Iterator[Version]:
yield from self._link_cache[name]
@property
def packages(self) -> Iterator[Package]:
......@@ -64,9 +60,10 @@ class LinkSource:
yield pkg
@property
@abstractmethod
def links(self) -> Iterator[Link]:
raise NotImplementedError()
for links_per_version in self._link_cache.values():
for links in links_per_version.values():
yield from links
@classmethod
def link_package_data(cls, link: Link) -> Package | None:
......@@ -102,11 +99,7 @@ class LinkSource:
def links_for_version(
self, name: NormalizedName, version: Version
) -> Iterator[Link]:
for link in self.links:
pkg = self.link_package_data(link)
if pkg and pkg.name == name and pkg.version == version:
yield link
yield from self._link_cache[name][version]
def clean_link(self, url: str) -> str:
"""Makes sure a link is fully encoded. That is, if a ' ' shows up in
......@@ -127,3 +120,7 @@ class LinkSource:
if reasons:
return "\n".join(sorted(reasons))
return True
@cached_property
def _link_cache(self) -> LinkCache:
raise NotImplementedError()
......@@ -3,16 +3,19 @@ from __future__ import annotations
import urllib.parse
import warnings
from collections import defaultdict
from html import unescape
from typing import TYPE_CHECKING
from poetry.core.packages.utils.link import Link
from poetry.repositories.link_sources.base import LinkSource
from poetry.utils._compat import cached_property
if TYPE_CHECKING:
from collections.abc import Iterator
from poetry.repositories.link_sources.base import LinkCache
with warnings.catch_warnings():
warnings.simplefilter("ignore")
......@@ -25,8 +28,9 @@ class HTMLPage(LinkSource):
self._parsed = html5lib.parse(content, namespaceHTMLElements=False)
@property
def links(self) -> Iterator[Link]:
@cached_property
def _link_cache(self) -> LinkCache:
links: LinkCache = defaultdict(lambda: defaultdict(list))
for anchor in self._parsed.findall(".//a"):
if anchor.get("href"):
href = anchor.get("href")
......@@ -44,7 +48,11 @@ class HTMLPage(LinkSource):
if link.ext not in self.SUPPORTED_FORMATS:
continue
yield link
pkg = self.link_package_data(link)
if pkg:
links[pkg.name][pkg.version].append(link)
return links
class SimpleRepositoryPage(HTMLPage):
......
......@@ -13,6 +13,12 @@ if sys.version_info < (3, 10):
else:
from importlib import metadata
if sys.version_info < (3, 8):
# compatibility for python <3.8
from backports.cached_property import cached_property
else:
from functools import cached_property
WINDOWS = sys.platform == "win32"
......@@ -53,4 +59,12 @@ def list_to_shell_command(cmd: list[str]) -> str:
)
__all__ = ["WINDOWS", "decode", "encode", "list_to_shell_command", "metadata", "to_str"]
__all__ = [
"WINDOWS",
"cached_property",
"decode",
"encode",
"list_to_shell_command",
"metadata",
"to_str",
]
from __future__ import annotations
from collections import defaultdict
from typing import TYPE_CHECKING
from unittest.mock import PropertyMock
......@@ -24,16 +25,22 @@ def link_source(mocker: MockerFixture) -> LinkSource:
url = "https://example.org"
link_source = LinkSource(url)
mocker.patch(
f"{LinkSource.__module__}.{LinkSource.__qualname__}.links",
f"{LinkSource.__module__}.{LinkSource.__qualname__}._link_cache",
new_callable=PropertyMock,
return_value=iter(
[
Link(f"{url}/demo-0.1.0.tar.gz"),
Link(f"{url}/demo-0.1.0_invalid.tar.gz"),
Link(f"{url}/invalid.tar.gz"),
Link(f"{url}/demo-0.1.0-py2.py3-none-any.whl"),
Link(f"{url}/demo-0.1.1.tar.gz"),
]
return_value=defaultdict(
lambda: defaultdict(list),
{
canonicalize_name("demo"): defaultdict(
list,
{
Version.parse("0.1.0"): [
Link(f"{url}/demo-0.1.0.tar.gz"),
Link(f"{url}/demo-0.1.0-py2.py3-none-any.whl"),
],
Version.parse("0.1.1"): [Link(f"{url}/demo-0.1.1.tar.gz")],
},
),
},
),
)
return link_source
......@@ -63,7 +70,7 @@ def test_link_package_data(filename: str, expected: Package | None) -> None:
],
)
def test_versions(name: str, expected: set[Version], link_source: LinkSource) -> None:
assert set(link_source.versions(name)) == expected
assert set(link_source.versions(canonicalize_name(name))) == expected
def test_packages(link_source: LinkSource) -> None:
......
......@@ -2,6 +2,7 @@ from __future__ import annotations
import pytest
from packaging.utils import canonicalize_name
from poetry.core.packages.utils.link import Link
from poetry.core.semver.version import Version
......@@ -90,4 +91,4 @@ def test_yanked(yanked_attrs: tuple[str, str], expected: bool | str) -> None:
content = DEMO_TEMPLATE.format(anchors)
page = HTMLPage("https://example.org", content)
assert page.yanked("demo", Version.parse("0.1")) == expected
assert page.yanked(canonicalize_name("demo"), Version.parse("0.1")) == expected
......@@ -102,25 +102,12 @@ def test_page_invalid_version_link() -> None:
assert page is not None
links = list(page.links)
assert len(links) == 2
assert len(links) == 1
versions = list(page.versions("poetry"))
versions = list(page.versions(canonicalize_name("poetry")))
assert len(versions) == 1
assert versions[0].to_string() == "0.1.0"
invalid_link = None
for link in links:
if link.filename.startswith("poetry-21"):
invalid_link = link
break
links_010 = list(page.links_for_version(canonicalize_name("poetry"), versions[0]))
assert invalid_link not in links_010
assert invalid_link
assert not page.link_package_data(invalid_link)
packages = list(page.packages)
assert len(packages) == 1
assert packages[0].name == "poetry"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment