Linux lhjmq-records 5.15.0-118-generic #128-Ubuntu SMP Fri Jul 5 09:28:59 UTC 2024 x86_64
Your IP : 3.145.105.85
from debian._deb822_repro.tokens import Deb822Token
# Consider these "opaque" enum-like values. The actual value was chosen to
# make repr easier to implement but they are subject to change.
_CONTENT_TYPE_VALUE = "is_value"
_CONTENT_TYPE_COMMENT = "is_comment"
_CONTENT_TYPE_SEPARATOR = "is_separator"
try:
from typing import Iterator, Union
from debian._deb822_repro.types import TokenOrElement, FormatterCallback
except ImportError:
pass
class FormatterContentToken(object):
"""Typed, tagged text for use with the formatting API
The FormatterContentToken is used by the formatting API and provides the
formatter callback with context about the textual tokens it is supposed
to format.
"""
__slots__ = ('_text', '_content_type')
def __init__(self, text, content_type):
# type: (str, object) -> None
self._text = text
self._content_type = content_type
@classmethod
def from_token_or_element(cls, token_or_element):
# type: (TokenOrElement) -> FormatterContentToken
if isinstance(token_or_element, Deb822Token):
if token_or_element.is_comment:
return cls.comment_token(token_or_element.text)
if token_or_element.is_whitespace:
raise ValueError("FormatterContentType cannot be whitespace")
return cls.value_token(token_or_element.text)
# Elements are assumed to be content (this is specialized for the
# interpretations where comments are always tokens).
return cls.value_token(token_or_element.convert_to_text())
@classmethod
def separator_token(cls, text):
# type: (str) -> FormatterContentToken
# Special-case separators as a minor memory optimization
if text == ' ':
return SPACE_SEPARATOR_FT
if text == ',':
return COMMA_SEPARATOR_FT
return cls(text, _CONTENT_TYPE_SEPARATOR)
@classmethod
def comment_token(cls, text):
# type: (str) -> FormatterContentToken
"""Generates a single comment token with the provided text
Mostly useful for creating test cases
"""
return cls(text, _CONTENT_TYPE_COMMENT)
@classmethod
def value_token(cls, text):
# type: (str) -> FormatterContentToken
"""Generates a single value token with the provided text
Mostly useful for creating test cases
"""
return cls(text, _CONTENT_TYPE_VALUE)
@property
def is_comment(self):
# type: () -> bool
"""True if this formatter token represent a comment
This should be used for determining whether the token is a comment
or not. It might be tempting to check whether the text in the token
starts with a "#" but that is insufficient because a value *can*
start with that as well. Whether it is a comment or a value is
based on the context (it is a comment if and only if the "#" was
at the start of a line) but the formatter often do not have the
context available to assert this.
The formatter *should* preserve the order of comments and interleave
between the value tokens in the same order as it see them. Failing
to preserve the order of comments and values can cause confusing
comments (such as associating the comment with a different value
than it was written for).
The formatter *may* discard comment tokens if it does not want to
preserve them. If so, they would be omitted in the output, which
may be acceptable in some cases. This is a lot better than
re-ordering comments.
Formatters must be aware of the following special cases for comments:
* Comments *MUST* be emitted after a newline. If the very first token
is a comment, the formatter is expected to emit a newline before it
as well (Fields cannot start immediately on a comment).
"""
return self._content_type is _CONTENT_TYPE_COMMENT
@property
def is_value(self):
# type: () -> bool
"""True if this formatter token represents a semantic value
The formatter *MUST* preserve values as-in in its output. It may
"unpack" it from the token (as in, return it as a part of a plain
str) but the value content must not be changed nor re-ordered relative
to other value tokens (as that could change the meaning of the field).
"""
return self._content_type is _CONTENT_TYPE_VALUE
@property
def is_separator(self):
# type: () -> bool
"""True if this formatter token represents a separator token
The formatter is not required to preserve the provided separators but it
is required to properly separate values. In fact, often is a lot easier
to discard existing separator tokens. As an example, in whitespace
separated list of values space, tab and newline all counts as separator.
However, formatting-wise, there is a world of difference between the
a space, tab and a newline. In particularly, newlines must be followed
by an additional space or tab (to act as a value continuation line) if
there is a value following it (otherwise, the generated output is
invalid).
"""
return self._content_type is _CONTENT_TYPE_SEPARATOR
@property
def is_whitespace(self):
# type: () -> bool
"""True if this formatter token represents a whitespace token"""
return self._content_type is _CONTENT_TYPE_SEPARATOR and self._text.isspace()
@property
def text(self):
# type: () -> str
"""The actual context of the token
This field *must not* be used to determine the type of token. The
formatter cannot reliably tell whether "#..." is a comment or a value
(it can be both). Use is_value and is_comment instead for discriminating
token types.
For value tokens, this the concrete value to be omitted.
For comment token, this is the full comment text.
This is the same as str(token).
"""
return self._text
def __str__(self):
# type: () -> str
return self._text
def __repr__(self):
# type: () -> str
return "{}({!r}, {}=True)".format(self.__class__.__name__, self._text, self._content_type)
SPACE_SEPARATOR_FT = FormatterContentToken(' ', _CONTENT_TYPE_SEPARATOR)
COMMA_SEPARATOR_FT = FormatterContentToken(',', _CONTENT_TYPE_SEPARATOR)
def one_value_per_line_trailing_separator(
name, # type: str
sep_token, # type: FormatterContentToken
formatter_tokens, # type: Iterator[FormatterContentToken]
):
# type: (...) -> Iterator[Union[FormatterContentToken, str]]
"""Use "one value per line, always trailing separator" formatting
"""
indent_len = len(name) + 2
indent = ' ' * indent_len
emitted_first = True
for t in formatter_tokens:
if t.is_comment:
if emitted_first:
yield "\n"
yield t
elif t.is_value:
if emitted_first:
yield ' '
else:
yield indent
yield t
if not sep_token.is_whitespace:
yield sep_token
yield "\n"
else:
# Skip existing separators (etc.)
continue
emitted_first = False
def format_field(formatter, # type: FormatterCallback
field_name, # type: str
separator_token, # type: FormatterContentToken
token_iter, # type: Iterator[FormatterContentToken]
):
# type: (...) -> str
"""Format a field using a provided formatter
This function formats a series of tokens using the provided formatter.
It can be used as a standalone formatter engine and can be used in test
suites to validate third-party formatters (enabling them to test for
corner cases without involving parsing logic).
The formatter receives series of FormatterContentTokens (via the
token_iter) and is expected to yield one or more str or
FormatterContentTokens. The calling function will combine all of
these into a single string, which will be used as the value.
The formatter is recommended to yield the provided value and comment
tokens interleaved with text segments of whitespace and separators
as part of its output. If it preserve comment and value tokens, the
calling function can provide some runtime checks to catch bugs
(like the formatter turning a comment into a value because it forgot
to ensure that the comment was emitted directly after a newline
character).
When writing a formatter, please keep the following in mind:
* The output of the formatter is appended directly after the ":" separator.
Most formatters will want to emit either a space or a newline as the very
first character for readability.
(compare "Depends:foo\\n" to "Depends: foo\\n")
* The formatter must always end its output on a newline. This is a design
choice of how the round-trip safe parser represent values that is imposed
on the formatter.
* It is often easier to discard/ignore all separator tokens from the
the provided token sequence and instead just yield separator tokens/str
where the formatter wants to place them.
- The formatter is strongly recommended to special-case formatting
for whitespace separators (check for `separator_token.is_whitespace`).
This is because space, tab and newline all counts as valid separators
and can all appear in the token sequence. If the original field uses
a mix of these separators it is likely to completely undermine the
desired result. Not to mention the additional complexity of handling
when a separator token happens to use the newline character which
affects how the formatter is supposed what comes after it
(see the rules for comments, empty lines and continuation line
markers).
* The formatter must remember to emit a "continuation line" marker
(typically a single space or tab) when emitting a value after
a newline or a comment. A `yield " "` is sufficient.
- The continuation line marker may be embedded inside a str
with other whitespace (such as the newline coming before it
or/and whitespace used for indentation purposes following
the marker).
* The formatter must not cause the output to contain completely
empty/whitespace lines as these cause syntax errors. The first
line never counts as an empty line (as it will be appended after
the field name).
* Tokens must be discriminated via the `token.is_value` (etc.)
properties. Assuming that `token.text.startswith("#")` implies a
comment and similar stunts are wrong. As an example, "#foo" is a
perfectly valid value in some contexts.
* Comment tokens *always* take up exactly one complete line including
the newline character at the end of the line. They must be emitted
directly after a newline character or another comment token.
* Special cases that are rare but can happen:
- Fields *can* start with comments and requires a formatter provided newline.
(Example: "Depends:\\n# Comment here\\n foo")
- Fields *can* start on a separator or have two separators in a row.
This is especially true for whitespace separated fields where every
whitespace counts as a separator, but it can also happen with other
separators (such as comma).
- Value tokens can contain whitespace (for non-whitespace separators).
When they do, the formatter must not attempt change nor "normalize"
the whitespace inside the value token as that might change how the
value is interpreted. (If you want to normalize such whitespace,
the formatter is at the wrong abstraction level. Instead, manipulate
the values directly in the value interpretation layer)
This function will provide *some* runtime checks of its input and the
output from the formatter to detect some errors early and provide
helpful diagnostics. If you use the function for testing, you are
recommended to rely on verifying the output of the function rather than
relying on the runtime checks (as these are subject to change).
:param formatter: A formatter (see FormatterCallback for an example)
:param field_name: The name of the field.
:param separator_token: One of SPACE_SEPARATOR and COMMA_SEPARATOR
:param token_iter: An iterable of tokens to be formatted.
The following example shows how to define a formatter_callback along with
a few verifications.
>>> fmt = one_value_per_line_trailing_separator
>>> # Omit separator tokens for in the token list for simplicity (the formatter does
>>> # not use them and it enables us to keep the example simple by reusing the list)
>>> tokens = [
... FormatterContentToken.value_token("foo"),
... FormatterContentToken.comment_token("# some comment about bar\\n"),
... FormatterContentToken.value_token("bar"),
... ]
>>> print(format_field(fmt, "Depends", COMMA_SEPARATOR_FT, tokens), end='')
Depends: foo,
# some comment about bar
bar,
>>> print(format_field(fmt, "Architecture", SPACE_SEPARATOR_FT, tokens), end='')
Architecture: foo
# some comment about bar
bar
>>> # Control check for the special case where the field starts with a comment
>>> print(format_field(fmt, "Depends", COMMA_SEPARATOR_FT, tokens[1:]), end='')
Depends:
# some comment about bar
bar,
>>> # Also, check single line values (to ensure it ends on a newline)
>>> print(format_field(fmt, "Depends", COMMA_SEPARATOR_FT, tokens[2:]), end='')
Depends: bar,
"""
formatted_tokens = [field_name, ':']
just_after_newline = False
last_was_value_token = False
if isinstance(token_iter, list):
# Some people from using this to test known "invalid" cases.
last_token = token_iter[-1]
if last_token.is_comment:
raise ValueError("Invalid token_iter: Field values cannot end with comments")
for token in formatter(field_name, separator_token, token_iter):
token_as_text = str(token)
# If we are given formatter tokens, then use them to verify the output.
if isinstance(token, FormatterContentToken):
if token.is_comment:
if not just_after_newline:
raise ValueError("Bad format: Comments must appear directly after a newline.")
# for the sake of ensuring people use proper test data.
if not token_as_text.startswith("#"):
raise ValueError("Invalid Comment token: Must start with #")
if not token_as_text.endswith("\n"):
raise ValueError("Invalid Comment token: Must end on a newline")
elif token.is_value:
if token_as_text[0].isspace() or token_as_text[-1].isspace():
raise ValueError("Invalid Value token: It cannot start nor end on whitespace")
if just_after_newline:
raise ValueError("Bad format: Missing continuation line marker")
if last_was_value_token:
raise ValueError("Bad format: Formatter omitted a separator")
last_was_value_token = token.is_value
else:
last_was_value_token = False
if just_after_newline:
if token_as_text[0] in ('\r', '\n'):
raise ValueError("Bad format: Saw completely empty line.")
if not token_as_text[0].isspace() and not token_as_text.startswith("#"):
raise ValueError("Bad format: Saw completely empty line.")
formatted_tokens.append(token_as_text)
just_after_newline = token_as_text.endswith("\n")
formatted_text = "".join(formatted_tokens)
if not formatted_text.endswith("\n"):
raise ValueError("Bad format: The field value must end on a newline")
return formatted_text
|