Reorder requirements file decoding #12795

Open · wants to merge 1 commit into base: main
11 changes: 8 additions & 3 deletions docs/html/reference/requirements-file-format.md
@@ -56,9 +56,14 @@ examples of all these forms, see {ref}`pip install Examples`.

### Encoding

Requirements files are `utf-8` encoding by default and also support
{pep}`263` style comments to change the encoding (i.e.
`# -*- coding: <encoding name> -*-`).
The default encoding for requirement files is `UTF-8` unless a different
encoding is specified using a {pep}`263` style comment (e.g. `# -*- coding:
<encoding name> -*-`).

```{warning}
pip will fall back to the locale-defined encoding if `UTF-8` decoding fails. This is a quirk
of pip's parser. This behaviour is *deprecated* and should not be relied upon.
```

### Line continuations

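For illustration only (not part of this diff), a requirements file saved in Latin-1 could declare its encoding with a PEP 263 style comment on its first line; without such a comment, the file is decoded as UTF-8:

```
# -*- coding: latin-1 -*-
Django==1.4.2  # Pas trop de café
```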
2 changes: 2 additions & 0 deletions news/12771.feature.rst
@@ -0,0 +1,2 @@
Reorder the encoding detection when decoding a requirements file, relying on
UTF-8 over the locale encoding by default.
53 changes: 51 additions & 2 deletions src/pip/_internal/req/req_file.py
@@ -2,11 +2,14 @@
Requirements file parsing
"""

import codecs
import locale
import logging
import optparse
import os
import re
import shlex
import sys
import urllib.parse
from dataclasses import dataclass
from optparse import Values
@@ -26,7 +29,6 @@
from pip._internal.cli import cmdoptions
from pip._internal.exceptions import InstallationError, RequirementsFileParseError
from pip._internal.models.search_scope import SearchScope
from pip._internal.utils.encoding import auto_decode

if TYPE_CHECKING:
from pip._internal.index.package_finder import PackageFinder
@@ -82,6 +84,21 @@
str(o().dest) for o in SUPPORTED_OPTIONS_EDITABLE_REQ
]

# order of BOMS is important: codecs.BOM_UTF16_LE is a prefix of codecs.BOM_UTF32_LE
# so data.startswith(BOM_UTF16_LE) would be true for UTF32_LE data
BOMS: List[Tuple[bytes, str]] = [
(codecs.BOM_UTF8, "utf-8"),
(codecs.BOM_UTF32, "utf-32"),
(codecs.BOM_UTF32_BE, "utf-32-be"),
(codecs.BOM_UTF32_LE, "utf-32-le"),
(codecs.BOM_UTF16, "utf-16"),
(codecs.BOM_UTF16_BE, "utf-16-be"),
(codecs.BOM_UTF16_LE, "utf-16-le"),
]

PEP263_ENCODING_RE = re.compile(rb"coding[:=]\s*([-\w.]+)")
DEFAULT_ENCODING = "utf-8"

logger = logging.getLogger(__name__)


@@ -568,7 +585,39 @@ def get_file_content(url: str, session: "PipSession") -> Tuple[str, str]:
# Assume this is a bare path.
try:
with open(url, "rb") as f:
content = auto_decode(f.read())
raw_content = f.read()
except OSError as exc:
raise InstallationError(f"Could not open requirements file: {exc}")

content = _decode_req_file(raw_content, url)
Comment on lines -571 to +592

Member: Hmm, if the decoding with the locale encoding also fails, the exception won't be caught. Any objections to calling `_decode_req_file` within the try-except?

Contributor (Author): No objection to adding it there. My original assumption, and the reason I moved it out, was that catching `OSError` was only really relevant for reading the content, not the actual decoding.
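
For illustration only (not part of the PR), moving the decode into the existing try/except might look like the sketch below, assuming the `except` clause is also widened to catch `UnicodeDecodeError`:

```python
try:
    with open(url, "rb") as f:
        raw_content = f.read()
    content = _decode_req_file(raw_content, url)
except (OSError, UnicodeDecodeError) as exc:
    # Message wording is a placeholder; a decode failure is not strictly
    # a failure to open the file.
    raise InstallationError(f"Could not read requirements file {url}: {exc}")
```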


return url, content


def _decode_req_file(data: bytes, url: str) -> str:
for bom, encoding in BOMS:
if data.startswith(bom):
return data[len(bom) :].decode(encoding)

for line in data.split(b"\n")[:2]:
if line[0:1] == b"#":
result = PEP263_ENCODING_RE.search(line)
if result is not None:
encoding = result.groups()[0].decode("ascii")
return data.decode(encoding)

try:
return data.decode(DEFAULT_ENCODING)
except UnicodeDecodeError:
locale_encoding = locale.getpreferredencoding(False) or sys.getdefaultencoding()
logging.warning(
"unable to decode data from %s with default encoding %s, "
"falling back to encoding from locale: %s. "
"If this is intentional you should specify the encoding with a "
"PEP-263 style comment, e.g. '# -*- coding: %s -*-'",
url,
DEFAULT_ENCODING,
locale_encoding,
locale_encoding,
)
return data.decode(locale_encoding)
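
As an aside on the `BOMS` ordering above (editor's illustration, not part of the diff): the UTF-16-LE BOM is a prefix of the UTF-32-LE BOM, so the UTF-32 entries must be checked first.

```python
import codecs

# b"\xff\xfe" (UTF-16-LE BOM) is a prefix of b"\xff\xfe\x00\x00" (UTF-32-LE BOM).
assert codecs.BOM_UTF32_LE.startswith(codecs.BOM_UTF16_LE)

# A UTF-32-LE payload therefore also "starts with" the UTF-16-LE BOM;
# checking UTF-16 first would mis-detect it.
data = codecs.BOM_UTF32_LE + "Django==1.4.2".encode("utf-32-le")
assert data.startswith(codecs.BOM_UTF16_LE)
```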
36 changes: 0 additions & 36 deletions src/pip/_internal/utils/encoding.py

This file was deleted.

114 changes: 114 additions & 0 deletions tests/unit/test_req_file.py
@@ -1,3 +1,4 @@
import codecs
import collections
import logging
import os
@@ -955,3 +956,116 @@ def test_install_requirements_with_options(
)

assert req.global_options == [global_option]

@pytest.mark.parametrize(
"raw_req_file,expected_name,expected_spec",
[
pytest.param(
b"Django==1.4.2",
"Django",
"==1.4.2",
id="defaults to UTF-8",
),
pytest.param(
"# coding=latin1\nDjango==1.4.2 # Pas trop de café".encode("latin-1"),
"Django",
"==1.4.2",
id="decodes based on PEP-263 style headers",
),
],
)
def test_general_decoding(
self,
raw_req_file: bytes,
expected_name: str,
expected_spec: str,
tmpdir: Path,
session: PipSession,
) -> None:
req_file = tmpdir / "requirements.txt"
req_file.write_bytes(raw_req_file)

reqs = tuple(parse_reqfile(req_file.resolve(), session=session))

assert len(reqs) == 1
assert reqs[0].name == expected_name
assert reqs[0].specifier == expected_spec

@pytest.mark.parametrize(
"bom,encoding",
[
(codecs.BOM_UTF8, "utf-8"),
(codecs.BOM_UTF16_BE, "utf-16-be"),
(codecs.BOM_UTF16_LE, "utf-16-le"),
(codecs.BOM_UTF32_BE, "utf-32-be"),
(codecs.BOM_UTF32_LE, "utf-32-le"),
# BOM automatically added when encoding byte-order dependent encodings
(b"", "utf-16"),
(b"", "utf-32"),
],
)
def test_decoding_with_BOM(
self, bom: bytes, encoding: str, tmpdir: Path, session: PipSession
) -> None:
req_name = "Django"
req_specifier = "==1.4.2"
encoded_contents = bom + f"{req_name}{req_specifier}".encode(encoding)
req_file = tmpdir / "requirements.txt"
req_file.write_bytes(encoded_contents)

reqs = tuple(parse_reqfile(req_file.resolve(), session=session))

assert len(reqs) == 1
assert reqs[0].name == req_name
assert reqs[0].specifier == req_specifier

def test_warns_and_fallsback_to_locale_on_utf8_decode_fail(
self,
tmpdir: Path,
session: PipSession,
caplog: pytest.LogCaptureFixture,
) -> None:
# \xff is valid in latin-1 but not UTF-8
data = b"pip<=24.0 # some comment\xff\n"
locale_encoding = "latin-1"
req_file = tmpdir / "requirements.txt"
req_file.write_bytes(data)

# it's hard to rely on a locale definitely existing for testing
# so patch things out for simplicity
with caplog.at_level(logging.WARNING), mock.patch(
"locale.getpreferredencoding", return_value=locale_encoding
):
reqs = tuple(parse_reqfile(req_file.resolve(), session=session))

assert len(caplog.records) == 1
assert (
caplog.records[0].msg
== "unable to decode data from %s with default encoding %s, "
"falling back to encoding from locale: %s. "
"If this is intentional you should specify the encoding with a "
"PEP-263 style comment, e.g. '# -*- coding: %s -*-'"
)
assert caplog.records[0].args == (
str(req_file),
"utf-8",
locale_encoding,
locale_encoding,
)

assert len(reqs) == 1
assert reqs[0].name == "pip"
assert str(reqs[0].specifier) == "<=24.0"

@pytest.mark.parametrize("encoding", ["utf-8", "gbk"])
def test_errors_on_non_decodable_data(
self, encoding: str, tmpdir: Path, session: PipSession
) -> None:
data = b"\xff"
req_file = tmpdir / "requirements.txt"
req_file.write_bytes(data)

with pytest.raises(UnicodeDecodeError), mock.patch(
"locale.getpreferredencoding", return_value=encoding
):
next(parse_reqfile(req_file.resolve(), session=session))
46 changes: 1 addition & 45 deletions tests/unit/test_utils.py
@@ -3,7 +3,6 @@

"""

import codecs
import os
import shutil
import stat
@@ -12,7 +11,7 @@
from io import BytesIO
from pathlib import Path
from typing import Any, Callable, Iterator, List, NoReturn, Optional, Tuple, Type
from unittest.mock import Mock, patch
from unittest.mock import Mock

import pytest

@@ -21,7 +20,6 @@
from pip._internal.exceptions import HashMismatch, HashMissing, InstallationError
from pip._internal.utils.deprecation import PipDeprecationWarning, deprecated
from pip._internal.utils.egg_link import egg_link_path_from_location
from pip._internal.utils.encoding import BOMS, auto_decode
from pip._internal.utils.glibc import (
glibc_version_string,
glibc_version_string_confstr,
@@ -445,48 +443,6 @@ def test_has_one_of(self) -> None:
assert not empty_hashes.has_one_of({"sha256": "xyzt"})


class TestEncoding:
"""Tests for pip._internal.utils.encoding"""

def test_auto_decode_utf_16_le(self) -> None:
data = (
b"\xff\xfeD\x00j\x00a\x00n\x00g\x00o\x00=\x00"
b"=\x001\x00.\x004\x00.\x002\x00"
)
assert data.startswith(codecs.BOM_UTF16_LE)
assert auto_decode(data) == "Django==1.4.2"

def test_auto_decode_utf_16_be(self) -> None:
data = (
b"\xfe\xff\x00D\x00j\x00a\x00n\x00g\x00o\x00="
b"\x00=\x001\x00.\x004\x00.\x002"
)
assert data.startswith(codecs.BOM_UTF16_BE)
assert auto_decode(data) == "Django==1.4.2"

def test_auto_decode_no_bom(self) -> None:
assert auto_decode(b"foobar") == "foobar"

def test_auto_decode_pep263_headers(self) -> None:
latin1_req = "# coding=latin1\n# Pas trop de café"
assert auto_decode(latin1_req.encode("latin1")) == latin1_req

def test_auto_decode_no_preferred_encoding(self) -> None:
om, em = Mock(), Mock()
om.return_value = "ascii"
em.return_value = None
data = "data"
with patch("sys.getdefaultencoding", om):
with patch("locale.getpreferredencoding", em):
ret = auto_decode(data.encode(sys.getdefaultencoding()))
assert ret == data

@pytest.mark.parametrize("encoding", [encoding for bom, encoding in BOMS])
def test_all_encodings_are_valid(self, encoding: str) -> None:
# we really only care that there is no LookupError
assert "".encode(encoding).decode(encoding) == ""


def raises(error: Type[Exception]) -> NoReturn:
raise error
