diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 03a6da6..c6b3c3f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -11,7 +11,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ '3.9', '3.10', '3.11', '3.12', '3.13' ] + python-version: [ '3.10', '3.11', '3.12', '3.13' ] steps: - uses: actions/checkout@v3 @@ -22,7 +22,8 @@ jobs: - name: Install build environment run: | python -m pip install --upgrade pip - python -m pip install tox setuptools pytest pytest-cov codecov - - name: Build and test with tox. + python -m pip install uv + - name: Build and test with uv. run: | - tox -vv -e flake8 + uv run ruff check + uv build diff --git a/benchmarking/run_benchmarking.py b/benchmarking/run_benchmarking.py index aee76e0..e9796fa 100755 --- a/benchmarking/run_benchmarking.py +++ b/benchmarking/run_benchmarking.py @@ -1,8 +1,5 @@ #!/usr/bin/env python3 -""" -Runs a benchmarking suite to compare speed -and output of different implementations. -""" +"""Run a benchmarking suite to compare speed and output of different implementations.""" import argparse import operator @@ -53,23 +50,16 @@ class AbstractHtmlConverter: - """ - An abstract HTML convert class. - """ + """An abstract HTML convert class.""" def get_text(self, html): - """ - Returns: - a text representation of the given HTML snippet. - """ + """Return a text representation of the given HTML snippet.""" raise NotImplementedError def benchmark(self, html): - """ - Benchmarks the classes HTML to text converter. + """Benchmarks the classes HTML to text converter. - Returns: - A tuple of the required time and the obtained text representation. + Return a tuple of the required time and the obtained text representation. """ start_time = time() for _ in range(TRIES): @@ -78,9 +68,7 @@ def benchmark(self, html): class BeautifulSoupHtmlConverter(AbstractHtmlConverter): - """ - Converts HTML to text using BeautifulSoup. 
- """ + """Converts HTML to text using BeautifulSoup.""" name = "BeautifulSoup" @@ -100,9 +88,7 @@ def get_text(self, html): class JustextConverter(AbstractHtmlConverter): - """ - Converts HTML to text using Justtext. - """ + """Converts HTML to text using Justtext.""" name = "Justtext" @@ -116,9 +102,7 @@ def get_text(self, html): class Html2TextConverter(AbstractHtmlConverter): - """ - Converts HTML to text using Html2Text. - """ + """Converts HTML to text using Html2Text.""" name = "Html2Text" @@ -133,9 +117,7 @@ def get_text(self, html): class LynxConverter(AbstractHtmlConverter): - """ - Converts HTML to text using lynx. - """ + """Converts HTML to text using lynx.""" name = "Lynx" @@ -166,9 +148,7 @@ def kill_lynx(pid): class LinksConverter(AbstractHtmlConverter): - """ - Converts HTML to text using links. - """ + """Converts HTML to text using links.""" name = "Links" @@ -199,9 +179,7 @@ def kill_links(pid): class InscriptisHtmlConverter(AbstractHtmlConverter): - """ - Converts HTML to text using Inscriptis. - """ + """Converts HTML to text using Inscriptis.""" name = "Inscriptis" @@ -217,18 +195,14 @@ def __init__(self): def save_to_file(algorithm, url, data, benchmarking_results_dir): - """ - Saves a benchmarking result to the given file. - """ + """Save the benchmarking result to the given file.""" result_file = os.path.join(benchmarking_results_dir, f"{algorithm}_{url}.txt") with open(result_file, "w") as output_file: output_file.write(data) def get_speed_table(times): - """ - Provides the table which compares the conversion speed. - """ + """Provide the table which compares the conversion speed.""" fastest = min((value for _, value in times.items())) longest_key = max(len(key) for key, _ in times.items()) longest_value = max(len(str(value)) for _, value in times.items()) @@ -251,9 +225,7 @@ def get_speed_table(times): def get_fname(url) -> str: - """ - Transforms a URL to a file name. 
- """ + """Transform a URL to a file name.""" trash = (("http://", ""), ("https://", ""), ("/", "-"), (":", "-"), ("%", "")) for key, value in trash: @@ -272,9 +244,7 @@ def get_fname(url) -> str: def parse_args(): - """ - Parse optional benchmarking arguments. - """ + """Parse optional benchmarking arguments.""" parser = argparse.ArgumentParser(description="Inscriptis benchmarking suite") parser.add_argument( "converter", @@ -306,11 +276,11 @@ def parse_args(): def _setup_benchmarking_directories(args): - """ - Setup the benchmarking result and caching directories. + """Set up the benchmarking result and caching directories. Args: args: command line arguments that provide the directory names. + """ if not os.path.exists(args.benchmarking_results): os.makedirs(args.benchmarking_results) @@ -319,16 +289,17 @@ def _setup_benchmarking_directories(args): def _fetch_url(url, cache_dir): - """ - Fetch the given URL either from the cache or from the Web. + """Fetch the given URL either from the cache or from the Web. URLs that are not yet cached are added to the cache. Args: url: the URL to fetch. + cache_dir: the cache directory. Returns: A tuple of the cache file name and the URLs content. + """ source_name = get_fname(url) source_cache_path = os.path.join(cache_dir, source_name) @@ -349,14 +320,13 @@ def _fetch_url(url, cache_dir): def benchmark(args, source_list): - """ - Run the benchmark. + """Run the benchmark. Args: args: command line arguments source_list: a list of URLs to benchmark. - """ + """ _setup_benchmarking_directories(args) output = [] diff --git a/examples/custom-html-handling.py b/examples/custom-html-handling.py index f768ded..490d724 100755 --- a/examples/custom-html-handling.py +++ b/examples/custom-html-handling.py @@ -1,13 +1,13 @@ #!/usr/bin/env python3 -""" -Custom HTML tag handling example. +"""Custom HTML tag handling example. Add a custom HTML handler for the bold tag which encloses bold text with "**". 
Example: "Welcome to Chur" is rendered as "Welcome to **Chur**". + """ from lxml.html import fromstring diff --git a/publish.sh b/publish.sh index 3898831..a93be4c 100755 --- a/publish.sh +++ b/publish.sh @@ -19,8 +19,9 @@ case "$1" in # cleanup dist rm -rf ./dist - # build and publish packages - poetry publish --build + # build with hatchling and publish to PyPI + uv build + uv publish ;; docker) echo "Publishing ${IMAGE_NAME} in version ${VERSION}" diff --git a/pyproject.toml b/pyproject.toml index da57ba7..18937b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ version = "2.7.0" description = "inscriptis - HTML to text converter." license = "Apache-2.0" readme = "README.rst" -requires-python = ">=3.9,<3.14" +requires-python = ">=3.10,<3.15" authors = [ { name = "Albert Weichselbraun", email = "albert.weichselbraun@fhgr.ch" }, @@ -20,12 +20,13 @@ classifiers = [ "Topic :: Text Processing :: Markup :: HTML", "Topic :: Utilities", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", ] + dependencies = [ "requests>=2.32.3,<3.0.0", @@ -49,14 +50,15 @@ web-service = [ [dependency-groups] dev = [ - "pytest>=8.3.5", + "pytest>=9.0.1", "fastapi>=0.115.11,<1.0.0", - "ruff>=0.11.12", + "ruff>=0.14.5", "httpx>=0.28.1", - "uvicorn>=0.34.2", - "ty>=0.0.1a7", - "pytest-cov>=6.1.1", - "safety>=3.5.1", + "uvicorn>=0.38.0", + "ty>=0.0.1a26", + "pytest-cov>=7.0.0", + "safety>=3.7.0", + "tox>=4.23.0", ] [build-system] requires = ["hatchling"] @@ -77,32 +79,56 @@ quote-style = "double" [tool.ruff.lint] select = [ - # pycodestyle - "E", - # Pyflakes - "F", - # pyupgrade - "UP", - # flake8-builtins - "A", - # flake8-bugbear - "B", - # flake8-comprehensions - "C4", - # flake8-errmsg - "EM", - # flake8-quotes - "Q", - # flake8-pyi - 
"PYI", - # flake8-simplify - "SIM", - # isort - "I", - "RSE", "RET", "SLOT", "TID", "TC", "C90", "N", "PERF", "E", "W", - "UP", "FURB", "RUF", "TRY", "YTT" + "A", # flake8-builtins + "B", # flake8-bugbear + "COM", # flake8-commas - trailing commas + "BLE", # flake8-blind-except - avoid bare except + "D", # flake8-docstrings + "C4", # flake8-comprehensions + "E", # pycodestyle + "EM", # flake8-errmsg + "F", # Pyflakes + "FA", # flake8-future-annotations - use modern annotations + "ICN", # flake8-import-conventions - standard import aliases + "PIE", # flake8-pie + "PLE", # pylint equivalents + "PLW", # pylint equivalents + "PTH", # flake8-use-pathlib - prefer pathlib over os.path + "PYI", # flake8-pyi + "Q", # flake8-quotes + "N", # flake8-naming + "SIM", # flake8-simplify + "I", # isort + "RET", # flake8-return + "RSE", "SLOT", "TID", "TC", "C90", "PERF", "E", "W", + "FURB", "RUF", "TRY", "YTT", + "TCH", # flake8-type-checking - optimize type checking imports + "S", # flake8-bandit (security) — replaces dlint/bandit + "UP", # pyupgrade ] +ignore = [ + "D102", # missing docstring in public method + "D105", # missing docstring in magic method + "D107", # missing docstring in __init__ + "D203", # incorrect-blank-line-before-class + "D213", # multi-line-summary-second-line +] + +[tool.ruff.lint.per-file-ignores] +"tests/**/*.py" = [ + "S101", # allow asserts + "D", # no docstring checks + "S310", # allow URLs + "PTH", # prefer pathlib +] +"benchmarking/*.py" = [ + "S310", # allow URLs + "S603", # call: check for execution of untrusted input + "PTH", # prefer pathlib +] + + [tool.ty.src] root="./src" diff --git a/src/inscriptis/__init__.py b/src/inscriptis/__init__.py index e5aa13a..52f2e6f 100644 --- a/src/inscriptis/__init__.py +++ b/src/inscriptis/__init__.py @@ -83,6 +83,7 @@ def _get_html_tree(html_content: str) -> HtmlElement | None: Returns: The corresponding HTML parse tree. 
+ """ html_content = html_content.strip() if not html_content: @@ -107,6 +108,7 @@ def get_text(html_content: str, config: ParserConfig | None = None) -> str: Returns: The text representation of the HTML content. + """ html_tree = _get_html_tree(html_content) return Inscriptis(html_tree, config).get_text() if html_tree is not None else "" @@ -128,6 +130,7 @@ def get_annotated_text(html_content: str, config: ParserConfig | None = None) -> Returns: A dictionary of text (key: 'text') and annotations (key: 'label') + """ html_tree = _get_html_tree(html_content) if html_tree is None: diff --git a/src/inscriptis/annotation/__init__.py b/src/inscriptis/annotation/__init__.py index be87836..eae39d7 100644 --- a/src/inscriptis/annotation/__init__.py +++ b/src/inscriptis/annotation/__init__.py @@ -51,6 +51,7 @@ def horizontal_shift( Returns: A list of :class:`Annotation`\s with the adjusted start and end positions. + """ if align == HorizontalAlignment.left: h_align = shift diff --git a/src/inscriptis/annotation/output/__init__.py b/src/inscriptis/annotation/output/__init__.py index 356d7b9..56d4907 100644 --- a/src/inscriptis/annotation/output/__init__.py +++ b/src/inscriptis/annotation/output/__init__.py @@ -42,5 +42,6 @@ def __call__(self, annotated_text: dict[str, str]) -> Any: Returns: An output representation that has been changed according to the AnnotationProcessor's design. + """ raise NotImplementedError diff --git a/src/inscriptis/annotation/output/html.py b/src/inscriptis/annotation/output/html.py index 38dbf02..0266e5c 100644 --- a/src/inscriptis/annotation/output/html.py +++ b/src/inscriptis/annotation/output/html.py @@ -49,9 +49,11 @@ def _get_label_colors(labels: list[str]) -> dict[str, str]: Args: labels: a list of the annotations classes (e.g., heading, etc.) that need to be color-coded. + Returns: A mapping between the available labels and the corresponding color from the COLOR_SCHEMA. 
+ """ return dict(zip({a[2] for a in sorted(labels)}, cycle(COLOR_SCHEMA))) @@ -81,6 +83,6 @@ def _get_css(self, labels: list[str]) -> str: f' content: "{label}";\n' " position: absolute;\n" f" background-color: {color};\n" - " font-size: 75%; }\n" + " font-size: 75%; }\n", ) return "\n".join(css) diff --git a/src/inscriptis/annotation/output/surface.py b/src/inscriptis/annotation/output/surface.py index 6a39271..b6ecfeb 100644 --- a/src/inscriptis/annotation/output/surface.py +++ b/src/inscriptis/annotation/output/surface.py @@ -11,8 +11,7 @@ class SurfaceExtractor(AnnotationProcessor): verbatim = False def __call__(self, annotated_text: dict[str, Any]) -> dict[str, Any]: - """ - Add information on the surface forms to the annotated_text dictionary. + """Add information on the surface forms to the annotated_text dictionary. Args: annotated_text: a dictionary containing the plain text and the @@ -21,6 +20,7 @@ def __call__(self, annotated_text: dict[str, Any]) -> dict[str, Any]: Returns: An extended dictionary which contains the extracted surface-forms of the annotations under the key 'surface'. + """ surface_forms = [(label, annotated_text["text"][start:end]) for start, end, label in annotated_text["label"]] annotated_text["surface"] = surface_forms diff --git a/src/inscriptis/annotation/parser.py b/src/inscriptis/annotation/parser.py index 36de25f..ba88667 100644 --- a/src/inscriptis/annotation/parser.py +++ b/src/inscriptis/annotation/parser.py @@ -33,6 +33,7 @@ class ApplyAnnotation: given match_tag. match_value: only apply annotations to attribute with the given match_value. + """ __slots__ = ("annotations", "attr", "match_tag", "match_value", "matcher") @@ -75,6 +76,7 @@ class AnnotationModel: css: the refined CSS class which contains annotations for HtmlElements which should be annotated. css_attr: information on CSS attributes that shall be annotated. 
+ """ def __init__(self, css_profile, model: dict): @@ -91,6 +93,7 @@ def _parse(model: dict) -> tuple[dict, list]: Returns: the AnnotationModel matching the input dictionary. + """ tags = defaultdict(list) attrs = [] diff --git a/src/inscriptis/cli/inscript.py b/src/inscriptis/cli/inscript.py index 758aa09..fb99f6f 100755 --- a/src/inscriptis/cli/inscript.py +++ b/src/inscriptis/cli/inscript.py @@ -25,6 +25,7 @@ def get_postprocessor(name): Returns: The matching postprocessing function + """ pp_class = name.capitalize() + "Extractor" mod = __import__("inscriptis.annotation.output." + name, fromlist=[pp_class]) @@ -36,6 +37,7 @@ def parse_command_line() -> argparse.Namespace: Returns: The parsed command line arguments. + """ parser = argparse.ArgumentParser(description="Convert the given HTML document to text.") parser.add_argument( @@ -127,8 +129,7 @@ def parse_command_line() -> argparse.Namespace: def get_html_content(url: str, timeout: int, encoding: str = "") -> str: - """ - Return the HTML content to convert. + """Return the HTML content to convert. Args: url: URL to the HTML content, or None if the content is obtained from stdin. @@ -144,7 +145,7 @@ def get_html_content(url: str, timeout: int, encoding: str = "") -> str: if (p := Path(url)).is_file(): with p.open(encoding=encoding or DEFAULT_ENCODING, errors="ignore") as f: return f.read() - elif url.startswith("http://") or url.startswith("https://"): + elif url.startswith(("http://", "https://")): req = requests.get(url, timeout=timeout) return req.content.decode(encoding or req.encoding) return "" diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py index 3637888..2e55924 100644 --- a/src/inscriptis/html_engine.py +++ b/src/inscriptis/html_engine.py @@ -56,6 +56,7 @@ class Inscriptis: # transform the HTML tree to text. 
parser = Inscriptis(html_tree) text = parser.get_text() + """ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None) -> None: @@ -95,7 +96,9 @@ def _parse_html_tree(self, state: HtmlDocumentState, tree) -> Canvas: """Parse the HTML tree. Args: + state: the current HTML document state. tree: the HTML tree to parse. + """ if isinstance(tree.tag, str): state.apply_starttag_layout(tree.tag, tree.attrib) diff --git a/src/inscriptis/model/attribute.py b/src/inscriptis/model/attribute.py index 3930bb0..5914661 100644 --- a/src/inscriptis/model/attribute.py +++ b/src/inscriptis/model/attribute.py @@ -24,6 +24,7 @@ def merge_function(func1, func2): Args: func1: the first function func2: the second function + """ def merged(*args): @@ -42,6 +43,7 @@ class Attribute: Attributes: attribute_mapping: a mapping of attributes to the corresponding handler functions. + """ def __init__(self): @@ -53,6 +55,7 @@ def apply_attributes(self, attributes: dict[str, str], html_element: HtmlElement Args: attributes: the list of attributes html_element: the HTML element for which the attributes are parsed + """ for attr_name, attr_value in attributes.items(): if attr_name in self.attribute_mapping: diff --git a/src/inscriptis/model/canvas/__init__.py b/src/inscriptis/model/canvas/__init__.py index 9fd2295..dff176f 100644 --- a/src/inscriptis/model/canvas/__init__.py +++ b/src/inscriptis/model/canvas/__init__.py @@ -34,6 +34,7 @@ class Canvas: annotations: the list of recorded :class:`~inscriptis.annotation.Annotation`\s. _open_annotations: a map of open tags that contain annotations. + """ __slots__ = ( @@ -56,6 +57,7 @@ def open_tag(self, tag: HtmlElement) -> None: Args: tag: the tag to open. + """ if tag.annotation: self._open_annotations[tag] = self.current_block.idx @@ -96,6 +98,7 @@ def close_tag(self, tag: HtmlElement) -> None: Args: tag: the tag to close. + """ if tag.display == Display.block: # write missing bullets, if no content has been written so far. 
@@ -118,6 +121,7 @@ def close_block(self, tag: HtmlElement) -> None: Args: tag: the HTML Block element to close + """ if tag.margin_after > self.margin: required_newlines = tag.margin_after - self.margin @@ -146,6 +150,7 @@ def flush_inline(self) -> bool: Returns: True if the attempt was successful, False otherwise. + """ if not self.current_block.is_empty(): self.blocks.append(self.current_block.content) diff --git a/src/inscriptis/model/canvas/block.py b/src/inscriptis/model/canvas/block.py index 4406195..59c783e 100644 --- a/src/inscriptis/model/canvas/block.py +++ b/src/inscriptis/model/canvas/block.py @@ -23,6 +23,7 @@ class Block: Args: idx: the current block's start index. prefix: prefix used within the current block. + """ __slots__ = ("_content", "collapsable_whitespace", "idx", "prefix") @@ -39,6 +40,7 @@ def merge(self, text: str, whitespace: WhiteSpace) -> None: Args: text: the text to merge. whitespace: whitespace handling. + """ if whitespace == WhiteSpace.pre: self.merge_pre_text(text) @@ -54,6 +56,7 @@ def merge_normal_text(self, text: str) -> None: Note: If the previous text ended with a whitespace and text starts with one, both will automatically collapse into a single whitespace. + """ normalized_text = [] @@ -76,6 +79,7 @@ def merge_pre_text(self, text: str) -> None: Args: text: the text to merge + """ text = "".join((self.prefix.first, text.replace("\n", "\n" + self.prefix.rest))) text = unescape(text) diff --git a/src/inscriptis/model/canvas/prefix.py b/src/inscriptis/model/canvas/prefix.py index 6ed8194..6e78bde 100644 --- a/src/inscriptis/model/canvas/prefix.py +++ b/src/inscriptis/model/canvas/prefix.py @@ -12,6 +12,7 @@ class Prefix: paddings: the list of paddings for the current and all previous tags. bullets: the list of bullets in the current and all previous tags. consumed: whether the current bullet has already been consumed. 
+ """ __slots__ = ("bullets", "consumed", "current_padding", "paddings") @@ -28,6 +29,7 @@ def register_prefix(self, padding_inline: int, bullet: str) -> None: Args: padding_inline: the number of characters used for padding_inline bullet: an optional bullet. + """ self.current_padding += padding_inline self.paddings.append(padding_inline) diff --git a/src/inscriptis/model/config.py b/src/inscriptis/model/config.py index 2633e8b..056f918 100644 --- a/src/inscriptis/model/config.py +++ b/src/inscriptis/model/config.py @@ -56,6 +56,7 @@ class ParserConfig: config = ParserConfig(css=css_profile, display_links=False) text = get_text('first link', config) print(text) + """ def __init__( @@ -84,6 +85,7 @@ def __init__( specify tags and attributes to annotation. table_cell_separator: separator to use between table cells. custom_html_tag_handler_mapping: an optional CustomHtmlTagHandler + """ self.display_images = display_images self.deduplicate_captions = deduplicate_captions @@ -108,5 +110,6 @@ def parse_a(self) -> bool: Returns: Whether we need to parse tags. + """ return self.display_links or self.display_anchors diff --git a/src/inscriptis/model/css.py b/src/inscriptis/model/css.py index 4c61948..6df8ea4 100644 --- a/src/inscriptis/model/css.py +++ b/src/inscriptis/model/css.py @@ -36,6 +36,7 @@ def attr_style(style_attribute: str, html_element: HtmlElement): style_attribute: The attribute value of the given style sheet. Example: display: none html_element: The HtmlElement to which the given style is applied. + """ for style_directive in style_attribute.lower().split(";"): if ":" not in style_directive: @@ -60,6 +61,7 @@ def _get_em(length: str) -> int: Returns: the length in em. 
+ """ _m = CssParse.RE_UNIT.search(length) value = float(_m.group(1)) diff --git a/src/inscriptis/model/html_document_state.py b/src/inscriptis/model/html_document_state.py index 07f9e1f..a97f6ec 100644 --- a/src/inscriptis/model/html_document_state.py +++ b/src/inscriptis/model/html_document_state.py @@ -45,12 +45,13 @@ def apply_starttag_layout(self, tag, attrs): Args: tag: the HTML start tag to process. attrs: a dictionary of HTML attributes and their respective values. + """ # use the css to handle tags known to it :) cur = self.tags[-1].get_refined_html_element( self.apply_attributes( attrs, html_element=self.css.get(tag, DEFAULT_HTML_ELEMENT).__copy__().set_tag(tag), - ) + ), ) self.tags.append(cur) diff --git a/src/inscriptis/model/html_element.py b/src/inscriptis/model/html_element.py index 28f5652..0af443f 100644 --- a/src/inscriptis/model/html_element.py +++ b/src/inscriptis/model/html_element.py @@ -1,5 +1,4 @@ -""" -The HtmlElement class controls how Inscriptis interprets HTML Elements. +"""The HtmlElement class controls how Inscriptis interprets HTML Elements. - The module :mod:`inscriptis.css_profiles` contain CSS profiles which assign to each standard HTML tag the corresponding :class:`HtmlElement`. @@ -36,6 +35,7 @@ class HtmlElement: valign: the element's vertical alignment. previous_margin_after: the margin after of the previous HtmlElement. annotation: annotations associated with the HtmlElement. + """ __slots__ = ( @@ -114,6 +114,7 @@ def write_verbatim_text(self, text: str): Args: text: the text to write + """ if not text: return @@ -138,6 +139,7 @@ def get_refined_html_element(self, new: "HtmlElement") -> "HtmlElement": Returns: The refined element with the context applied. 
+ """ new.canvas = self.canvas diff --git a/src/inscriptis/model/table.py b/src/inscriptis/model/table.py index 188d958..a093f7a 100644 --- a/src/inscriptis/model/table.py +++ b/src/inscriptis/model/table.py @@ -16,6 +16,7 @@ class TableCell(Canvas): annotations after a reformatting) vertical_padding: vertical padding that has been introduced due to vertical formatting rules. + """ __slots__ = ( @@ -45,6 +46,7 @@ def normalize_blocks(self) -> int: Returns: The height of the normalized cell. + """ self.flush_inline() self.blocks = list(chain(*(line.split("\n") for line in self.blocks))) @@ -58,6 +60,7 @@ def height(self) -> int: Returns: The cell's current height. + """ return max(1, len(self.blocks)) @@ -67,6 +70,7 @@ def width(self) -> int: Returns: The cell's current width. + """ if self._width: return self._width @@ -77,7 +81,8 @@ def width(self, width): """Set the table's width and applies the cell's horizontal formatting. Args: - The cell's expected width. + width: The cell's expected width. + """ # save the original line widths before reformatting self.line_width = [len(block) for block in self.blocks] @@ -94,6 +99,7 @@ def height(self, height: int): Notes: Depending on the height and the cell's vertical formatting this might require the introduction of empty lines. + """ rows = len(self.blocks) if rows < height: @@ -113,6 +119,7 @@ def get_annotations(self, idx: int, row_width: int) -> list[Annotation]: Returns: A list of annotations that have been adjusted to the cell's position. + """ self.current_block.idx = idx if not self.annotations: @@ -150,6 +157,7 @@ class TableRow: Attributes: columns: the table row's columns. cell_separator: string used for separating columns from each other. + """ __slots__ = ("cell_separator", "columns") @@ -184,6 +192,7 @@ class Table: rows: the table's rows. left_margin_len: length of the left margin before the table. cell_separator: string used for separating cells from each other. 
+ """ __slots__ = ("cell_separator", "left_margin_len", "rows") @@ -248,6 +257,7 @@ def get_annotations(self, idx: int, left_margin_len: int) -> list[Annotation]: Returns: A list of all :class:`~inscriptis.annotation.Annotation`\s present in the table. + """ if not self.rows: return [] diff --git a/src/inscriptis/model/tag/__init__.py b/src/inscriptis/model/tag/__init__.py index cd81172..a4120b6 100644 --- a/src/inscriptis/model/tag/__init__.py +++ b/src/inscriptis/model/tag/__init__.py @@ -16,6 +16,7 @@ class CustomHtmlTagHandlerMapping(NamedTuple): Attributes: start_tag_mapping: a dictionary of custom start tag handlers. end_tag_mapping: a dictionary of custom end tag handlers. + """ start_tag_mapping: dict[str, Callable[[HtmlDocumentState, dict], None]] diff --git a/src/inscriptis/model/tag/table_tag.py b/src/inscriptis/model/tag/table_tag.py index 179d914..d60586f 100644 --- a/src/inscriptis/model/tag/table_tag.py +++ b/src/inscriptis/model/tag/table_tag.py @@ -28,7 +28,7 @@ def table_start_handler(state: HtmlDocumentState, _: dict) -> None: Table( left_margin_len=state.tags[-1].canvas.left_margin, cell_separator=state.config.table_cell_separator, - ) + ), ) diff --git a/tests/test_annotation.py b/tests/test_annotation.py index 1ec8a89..786be48 100644 --- a/tests/test_annotation.py +++ b/tests/test_annotation.py @@ -14,12 +14,12 @@ def test_horizontal_shift(): # no shift assert horizontal_shift( - a, content_width=5, line_width=10, align=HorizontalAlignment.left, shift=0 + a, content_width=5, line_width=10, align=HorizontalAlignment.left, shift=0, ).pop() == Annotation(0, 4, "test") # shift assert horizontal_shift( - a, content_width=5, line_width=10, align=HorizontalAlignment.left, shift=3 + a, content_width=5, line_width=10, align=HorizontalAlignment.left, shift=3, ).pop() == Annotation(3, 7, "test") # realignment to the right diff --git a/tests/test_list_value.py b/tests/test_list_value.py index b7454c1..74a3dab 100644 --- a/tests/test_list_value.py +++ 
b/tests/test_list_value.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -"""Test list value in ordered and unordered lists. -""" +"""Test list value in ordered and unordered lists.""" from inscriptis import get_text from inscriptis.css_profiles import CSS_PROFILES @@ -31,5 +30,3 @@ def test_value_without_ol(): html = """Thomas
  • Maria
  • Ana
  • """ assert get_text(html, config) == "Thomas\n* Maria\n* Ana" - - diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 6a100b7..0000000 --- a/tox.ini +++ /dev/null @@ -1,61 +0,0 @@ -[tox] -envlist = pytest, pyroma, flake8 - -# standard unit tests -[testenv:pytest] -deps = pytest ~= 8.3.5 - pytest-cov ~= 6.1.1 - fastapi ~= 0.115.11 - httpx ~= 0.28.1 -commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests - -# python packaging best practices -[testenv:pyroma] -deps = pyroma -commands = pyroma . - -# flake8-warnings ~= 0.4.1 - -[testenv:flake8] -deps = flake8 ~= 7.2.0 - dlint ~= 0.16.0 - flake8-bandit ~= 4.1.1 - flake8-blind-except ~= 0.2.1 - flake8-bugbear ~= 24.12.12 - flake8-builtins ~= 2.2.0 - flake8-cognitive-complexity ~= 0.1.0 - flake8-colors ~= 0.1.9 - flake8-comprehensions ~= 3.16.0 - flake8-docstrings ~= 1.7.0 - flake8-eradicate ~= 1.5.0 - flake8-encodings ~= 0.5.1 - flake8-expression-complexity ~= 0.0.11 - flake8-logging-format ~= 2024.24.12 - flake8-mutable ~= 1.2.0 - flake8-pie ~= 0.16.0 - flake8-pytest ~= 1.4 - flake8-raise ~= 0.0.5 - flake8-simplify ~= 0.21.0 - flake8-string-format ~= 0.3.0 - flake8-tuple ~= 0.4.1 - flake8-use-pathlib ~= 0.3.0 - flake8-warnings ~= 0.4.1 - pep8-naming ~= 0.15.1 - -# S104 - do not cleanup XML data prior to processing -# S410 - bind to all IPs is okay in the case of the Web service, since it is -# aimed for use with docker. -# W503 - replaced with W504 -# D102 - missing docstring in public method -# D105 - missing docstring in magic method (e.g., __str__) -# D107 - missing docstring in __init__ -# E203, E704 black -commands = flake8 --exclude=".tox, setup.py, tests, venv, .venv docs, benchmarking, build" \ - --show-source \ - --max-line-length=120 \ - --ignore="DUO107, W503, D107, D105, D102, S104, S410, E203, E708" \ - --max-cognitive-complexity=13 - -# --ignore="S104, S410, W503, D107, D105, D102" \ -# --enable-extensions=G \ -# --max-cognitive-complexity=13